Ticket #490: Updated VAD with new algorithm.

git-svn-id: https://svn.pjsip.org/repos/pjproject/trunk@2222 74dad513-b988-da41-8d7b-12977e46ad98
diff --git a/pjmedia/include/pjmedia/silencedet.h b/pjmedia/include/pjmedia/silencedet.h
index af6f0e2..5f6a249 100644
--- a/pjmedia/include/pjmedia/silencedet.h
+++ b/pjmedia/include/pjmedia/silencedet.h
@@ -1,6 +1,6 @@
 /* $Id$ */
 /* 
- * Copyright (C) 2003-2008 Benny Prijono <benny@prijono.org>
+ * Copyright (C) 2003-2007 Benny Prijono <benny@prijono.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -108,26 +108,27 @@
  * Set other silence detector parameters.
  *
  * @param sd		    The silence detector
- * @param min_silence	    Minimum duration of silence (in msec) before 
+ * @param before_silence    Minimum duration of silence (in msec) before 
  *			    silence is reported. If -1 is specified, then
  *			    the default value will be used. The default is
  *			    400 msec.
- * @param min_signal	    Minimum duration of signal (in msec) before
- *			    signal is reported. If -1 is specified, then
- *			    the default value will be used. The default is
- *			    equal to one frame.
- * @param recalc_time	    The interval to recalculate signal and silence
- *			    proportion and to readjust the silence threshold
- *			    when adaptive silence detection is set. If -1
- *			    is specified, then the default value will be used.
- *			    The default value is 5000 (msec).
+ * @param recalc_time1	    The interval (in msec) to recalculate threshold
+ *			    in non-silence condition when adaptive silence 
+ *			    detection is set. If -1 is specified, then the 
+ *			    default value will be used. The default is 4000
+ *			    (msec).
+ * @param recalc_time2	    The interval (in msec) to recalculate threshold
+ *			    in silence condition when adaptive silence detection
+ *			    is set. If -1 is specified, then the default value 
+ *			    will be used. The default value is 2000 (msec).
  *
  * @return		    PJ_SUCCESS on success.
  */
 PJ_DECL(pj_status_t) pjmedia_silence_det_set_params( pjmedia_silence_det *sd,
-						     int min_silence,
-						     int min_signal,
-						     int recalc_time);
+						     int before_silence,
+						     int recalc_time1,
+						     int recalc_time2);
+
 
 /**
  * Disable the silence detector.
diff --git a/pjmedia/src/pjmedia/silencedet.c b/pjmedia/src/pjmedia/silencedet.c
index f5d29e6..59738f1 100644
--- a/pjmedia/src/pjmedia/silencedet.c
+++ b/pjmedia/src/pjmedia/silencedet.c
@@ -1,6 +1,6 @@
 /* $Id$ */
 /* 
- * Copyright (C) 2003-2008 Benny Prijono <benny@prijono.org>
+ * Copyright (C) 2003-2007 Benny Prijono <benny@prijono.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,16 +23,42 @@
 #include <pj/log.h>
 #include <pj/pool.h>
 
-
 #define THIS_FILE   "silencedet.c"
 
+#if 0
+#   define TRACE_(x)	PJ_LOG(3,x)
+#else
+#   define TRACE_(x)
+#endif
+
+/**
+ * This enumeration specifies the operation mode of the silence detector.
+ */
 typedef enum pjmedia_silence_det_mode {
     VAD_MODE_NONE,
     VAD_MODE_FIXED,
     VAD_MODE_ADAPTIVE
 } pjmedia_silence_det_mode;
 
+/**
+ * Default settings
+ */
+#define DEF_RECALC_ON_VOICED	    4000 /* Time to recalculate threshold
+					    in voiced condition, in ms	  */
+#define DEF_RECALC_ON_SILENCE	    2000 /* Time to recalculate threshold
+					    in silence condition, in ms.  */
+#define DEF_BEFORE_SILENCE	    400	 /* Silence time before really changing
+					    state into SILENCE, in ms.	  */
+#define DEF_THRESHOLD		    1000 /* Default threshold.		  */
 
+/**
+ * This enumeration specifies the states of the silence detector.
+ */
+enum pjmedia_silence_det_state {
+    STATE_SILENCE,
+    STATE_START_SILENCE,
+    STATE_VOICED
+};
 
 /**
  * This structure holds the silence detector state.
@@ -41,20 +67,23 @@
 {
     char      objname[PJ_MAX_OBJ_NAME]; /**< VAD name.			    */
 
-    int	      mode;		/**< VAD mode.				    */
-    unsigned  ptime;		/**< Frame time, in msec.		    */
+    int	      mode;			/**< VAD mode.			    */
+    unsigned  ptime;			/**< Frame time, in msec.	    */
 
-    unsigned  min_signal_cnt;	/**< # of signal frames.before talk burst   */
-    unsigned  min_silence_cnt;	/**< # of silence frames before silence.    */
-    unsigned  recalc_cnt;	/**< # of frames before adaptive recalc.    */
-
-    pj_bool_t in_talk;		/**< In talk burst?			    */
-    unsigned  cur_cnt;		/**< # of frames in current mode.	    */
-    unsigned  signal_cnt;	/**< # of signal frames received.	    */
-    unsigned  silence_cnt;	/**< # of silence frames received	    */
-    unsigned  cur_threshold;	/**< Current silence threshold.		    */
-    unsigned  weakest_signal;	/**< Weakest signal detected.		    */
-    unsigned  loudest_silence;	/**< Loudest silence detected.		    */
+    unsigned  threshold;		/**< Current threshold level.	    */
+    unsigned  sum_level;		/**< Sum of the recent levels.	    */
+    unsigned  sum_cnt;			/**< Number of levels summed.	    */
+    unsigned  silence_timer;		/**< Silence condition timer.	    */
+    unsigned  voiced_timer;		/**< Voiced condition timer.	    */
+    
+    enum pjmedia_silence_det_state state;/**< Silence detector state.	    */
+    unsigned  recalc_on_voiced;		/**< Setting of time to recalc 
+					     threshold in voiced condition. */
+    unsigned  recalc_on_silence;	/**< Setting of time to recalc 
+					     threshold in silence condition.*/
+    unsigned  before_silence;		/**< Setting of silence time before 
+					     really changing state into SILENCE,
+					     in ms.			    */
 };
 
 
@@ -70,20 +99,15 @@
 
     sd = PJ_POOL_ZALLOC_T(pool, pjmedia_silence_det);
 
-    pj_ansi_strncpy(sd->objname, THIS_FILE, PJ_MAX_OBJ_NAME);
+    pj_ansi_snprintf(sd->objname, PJ_MAX_OBJ_NAME, THIS_FILE, sd);
     sd->objname[PJ_MAX_OBJ_NAME-1] = '\0';
 
     sd->ptime = samples_per_frame * 1000 / clock_rate;
-    sd->signal_cnt = 0;
-    sd->silence_cnt = 0;
-    sd->weakest_signal = 0xFFFFFFFFUL;
-    sd->loudest_silence = 0;
      
     /* Default settings */
     pjmedia_silence_det_set_params(sd, -1, -1, -1);
 
-    /* Restart in fixed, silent mode */
-    sd->in_talk = PJ_FALSE;
+    /* Restart in adaptive, silent mode */
     pjmedia_silence_det_set_adaptive( sd, -1 );
 
     *p_sd = sd;
@@ -101,17 +125,16 @@
     return PJ_SUCCESS;
 }
 
-
 PJ_DEF(pj_status_t) pjmedia_silence_det_set_adaptive(pjmedia_silence_det *sd,
 						     int threshold)
 {
     PJ_ASSERT_RETURN(sd, PJ_EINVAL);
 
     if (threshold < 0)
-	threshold = PJMEDIA_SILENCE_DET_THRESHOLD;
+	threshold = DEF_THRESHOLD;
 
     sd->mode = VAD_MODE_ADAPTIVE;
-    sd->cur_threshold = threshold;
+    sd->threshold = threshold;
 
     return PJ_SUCCESS;
 }
@@ -122,31 +145,31 @@
     PJ_ASSERT_RETURN(sd, PJ_EINVAL);
 
     if (threshold < 0)
-	threshold = PJMEDIA_SILENCE_DET_THRESHOLD;
+	threshold = DEF_THRESHOLD;
 
     sd->mode = VAD_MODE_FIXED;
-    sd->cur_threshold = threshold;
+    sd->threshold = threshold;
 
     return PJ_SUCCESS;
 }
 
 PJ_DEF(pj_status_t) pjmedia_silence_det_set_params( pjmedia_silence_det *sd,
-						    int min_silence,
-						    int min_signal,
-						    int recalc_time)
+						    int before_silence,
+						    int recalc_time1,
+						    int recalc_time2)
 {
     PJ_ASSERT_RETURN(sd, PJ_EINVAL);
 
-    if (min_silence == -1)
-	min_silence = 500;
-    if (min_signal < 0)
-	min_signal = sd->ptime;
-    if (recalc_time < 0)
-	recalc_time = 2000;
+    if (recalc_time1 < 0)
+	recalc_time1 = DEF_RECALC_ON_VOICED;
+    if (recalc_time2 < 0)
+	recalc_time2 = DEF_RECALC_ON_SILENCE;
+    if (before_silence < 0)
+	before_silence = DEF_BEFORE_SILENCE;
 
-    sd->min_signal_cnt = min_signal / sd->ptime;
-    sd->min_silence_cnt = min_silence / sd->ptime;
-    sd->recalc_cnt = recalc_time / sd->ptime;
+    sd->recalc_on_voiced = recalc_time1;
+    sd->recalc_on_silence = recalc_time2;
+    sd->before_silence  = before_silence;
 
     return PJ_SUCCESS;
 }
@@ -186,109 +209,104 @@
 PJ_DEF(pj_bool_t) pjmedia_silence_det_apply( pjmedia_silence_det *sd,
 					     pj_uint32_t level)
 {
-    pj_bool_t have_signal;
+    int avg_recent_level;
 
-    /* Always return false if VAD is disabled */
     if (sd->mode == VAD_MODE_NONE)
 	return PJ_FALSE;
 
-    /* Convert PCM level to ulaw */
-    level = pjmedia_linear2ulaw(level) ^ 0xff;
-    
-    /* Do we have signal? */
-    have_signal = level > sd->cur_threshold;
-    
-    /* We we're in transition between silence and signel, increment the 
-     * current frame counter. We will only switch mode when we have enough
-     * frames.
-     */
-    if (sd->in_talk != have_signal) {
-	unsigned limit;
+    if (sd->mode == VAD_MODE_FIXED)
+	return (level < sd->threshold);
 
-	sd->cur_cnt++;
+    /* Calculating recent level */
+    sd->sum_level += level;
+    ++sd->sum_cnt;
+    avg_recent_level = (sd->sum_level / sd->sum_cnt);
 
-	limit = (sd->in_talk ? sd->min_silence_cnt : 
-				sd->min_signal_cnt);
+    if (level > sd->threshold) {
+	sd->silence_timer = 0;
+	sd->voiced_timer += sd->ptime;
 
-	if (sd->cur_cnt > limit) {
+	switch(sd->state) {
+	    case STATE_VOICED:
+		if (sd->voiced_timer > sd->recalc_on_voiced) {
+		    /* Voiced for a long time (> recalc_on_voiced), so the
+		     * current threshold seems to be too low.
+		     */
+		    sd->threshold = (avg_recent_level + sd->threshold) >> 1;
+		    TRACE_((THIS_FILE,"Re-adjust threshold (in talk burst)"
+			    "to %d", sd->threshold));
 
-	    /* Swap mode */
-	    sd->in_talk = !sd->in_talk;
-	    
-	    /* Restart adaptive cur_threshold measurements */
-	    sd->weakest_signal = 0xFFFFFFFFUL;
-	    sd->loudest_silence = 0;
-	    sd->signal_cnt = 0;
-	    sd->silence_cnt = 0;
-	    sd->cur_cnt = 0;
+		    sd->voiced_timer = 0;
+
+		    /* Restart averaging, seeded with the recent average */
+		    sd->sum_level = avg_recent_level;
+		    sd->sum_cnt = 1;
+		}
+		break;
+
+	    case STATE_SILENCE:
+		TRACE_((THIS_FILE,"Starting talk burst (level=%d threshold=%d)",
+			level, sd->threshold));
+
+	    case STATE_START_SILENCE:
+		sd->state = STATE_VOICED;
+
+		/* Restart averaging from the current level */
+		sd->sum_level = level;
+		sd->sum_cnt = 1;
+
+		break;
+
+	    default:
+		pj_assert(0);
+		break;
 	}
-
     } else {
-	/* Reset frame count */
-	sd->cur_cnt = 0;
-    }
-    
+	sd->voiced_timer = 0;
+	sd->silence_timer += sd->ptime;
 
-    /* Count the number of silent and signal frames and calculate min/max */
-    if (have_signal) {
-	if (level < sd->weakest_signal)
-	    sd->weakest_signal = level;
-	sd->signal_cnt++;
-    }
-    else {
-	if (level > sd->loudest_silence)
-	    sd->loudest_silence = level;
-	sd->silence_cnt++;
-    }
+	switch(sd->state) {
+	    case STATE_SILENCE:
+		if (sd->silence_timer >= sd->recalc_on_silence) {
+		    sd->threshold = avg_recent_level << 1;
+		    TRACE_((THIS_FILE,"Re-adjust threshold (in silence)"
+			    "to %d", sd->threshold));
 
-    /* See if we have had enough frames to look at proportions of 
-     * silence/signal frames.
-     */
-    if ((sd->signal_cnt + sd->silence_cnt) > sd->recalc_cnt) {
-	
-	if (sd->mode == VAD_MODE_ADAPTIVE) {
-	    pj_bool_t updated = PJ_TRUE;
-	    unsigned pct_signal, new_threshold = sd->cur_threshold;
+		    sd->silence_timer = 0;
 
-	    /* Get percentage of signal */
-	    pct_signal = sd->signal_cnt * 100 / 
-		        (sd->signal_cnt + sd->silence_cnt);
+		    /* Restart averaging, seeded with the recent average */
+		    sd->sum_level = avg_recent_level;
+		    sd->sum_cnt = 1;
+		}
+		break;
 
-	    /* Adjust according to signal/silence proportions. */
-	    if (pct_signal > 95) {
-		new_threshold += (sd->weakest_signal+1 - sd->cur_threshold)/2;
-	    } else if (pct_signal < 5) {
-		new_threshold = (sd->cur_threshold+sd->loudest_silence)/2+1;
-	    } else if (pct_signal > 80) {
-		new_threshold++;
-	    } else if (pct_signal < 10) {
-		new_threshold--;
-	    } else {
-		updated = PJ_FALSE;
-	    }
+	    case STATE_VOICED:
+		sd->state = STATE_START_SILENCE;
 
-	    if (new_threshold > PJMEDIA_SILENCE_DET_MAX_THRESHOLD)
-		new_threshold = PJMEDIA_SILENCE_DET_MAX_THRESHOLD;
+		/* Restart averaging from the current level */
+		sd->sum_level = level;
+		sd->sum_cnt = 1;
 
-	    if (updated && sd->cur_threshold != new_threshold) {
-		PJ_LOG(5,(sd->objname, 
-			  "Vad cur_threshold updated %d-->%d. "
-			  "Signal lo=%d",
-			  sd->cur_threshold, new_threshold,
-			  sd->weakest_signal));
-		sd->cur_threshold = new_threshold;
-	    }
+	    case STATE_START_SILENCE:
+		if (sd->silence_timer >= sd->before_silence) {
+		    sd->state = STATE_SILENCE;
+		    sd->threshold = avg_recent_level << 1;
+		    TRACE_((THIS_FILE,"Starting silence (level=%d "
+			    "threshold=%d)", level, sd->threshold));
+
+		    /* Restart averaging, seeded with the recent average */
+		    sd->sum_level = avg_recent_level;
+		    sd->sum_cnt = 1;
+		}
+		break;
+
+	    default:
+		pj_assert(0);
+		break;
 	}
-
-	/* Reset. */
-	sd->weakest_signal = 0xFFFFFFFFUL;
-	sd->loudest_silence = 0;
-	sd->signal_cnt = 0;
-	sd->silence_cnt = 0;
     }
-    
-    return !sd->in_talk;
 
+    return (sd->state == STATE_SILENCE);
 }