blob: a8090981434f950ecc21356b8f027f1560806a6d [file] [log] [blame]
Alexandre Lision744f7422013-09-25 11:39:37 -04001/***********************************************************************
2Copyright (c) 2006-2011, Skype Limited. All rights reserved.
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
Alexandre Lision85382382014-01-27 15:54:16 -050011- Neither the name of Internet Society, IETF or IETF Trust, nor the
Alexandre Lision744f7422013-09-25 11:39:37 -040012names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
Alexandre Lision85382382014-01-27 15:54:16 -050015THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
Alexandre Lision744f7422013-09-25 11:39:37 -040016AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include "main.h"
Alexandre Lision85382382014-01-27 15:54:16 -050033#include "stack_alloc.h"
Alexandre Lision744f7422013-09-25 11:39:37 -040034
35/* Silk VAD noise level estimation */
Alexandre Lision85382382014-01-27 15:54:16 -050036static OPUS_INLINE void silk_VAD_GetNoiseLevels(
Alexandre Lision744f7422013-09-25 11:39:37 -040037 const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */
38 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
39);
40
41/**********************************/
42/* Initialization of the Silk VAD */
43/**********************************/
44opus_int silk_VAD_Init( /* O Return value, 0 if success */
45 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
46)
47{
48 opus_int b, ret = 0;
49
50 /* reset state memory */
51 silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) );
52
53 /* init noise levels */
54 /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */
55 for( b = 0; b < VAD_N_BANDS; b++ ) {
56 psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 );
57 }
58
59 /* Initialize state */
60 for( b = 0; b < VAD_N_BANDS; b++ ) {
61 psSilk_VAD->NL[ b ] = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] );
62 psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] );
63 }
64 psSilk_VAD->counter = 15;
65
66 /* init smoothed energy-to-noise ratio*/
67 for( b = 0; b < VAD_N_BANDS; b++ ) {
68 psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256; /* 100 * 256 --> 20 dB SNR */
69 }
70
71 return( ret );
72}
73
74/* Weighting factors for tilt measure */
75static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
76
77/***************************************/
78/* Get the speech activity level in Q8 */
79/***************************************/
80opus_int silk_VAD_GetSA_Q8( /* O Return value, 0 if success */
81 silk_encoder_state *psEncC, /* I/O Encoder state */
82 const opus_int16 pIn[] /* I PCM input */
83)
84{
85 opus_int SA_Q15, pSNR_dB_Q7, input_tilt;
Alexandre Lision85382382014-01-27 15:54:16 -050086 opus_int decimated_framelength1, decimated_framelength2;
87 opus_int decimated_framelength;
88 opus_int dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
Alexandre Lision744f7422013-09-25 11:39:37 -040089 opus_int32 sumSquared, smooth_coef_Q16;
90 opus_int16 HPstateTmp;
Alexandre Lision85382382014-01-27 15:54:16 -050091 VARDECL( opus_int16, X );
Alexandre Lision744f7422013-09-25 11:39:37 -040092 opus_int32 Xnrg[ VAD_N_BANDS ];
93 opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
94 opus_int32 speech_nrg, x_tmp;
Alexandre Lision85382382014-01-27 15:54:16 -050095 opus_int X_offset[ VAD_N_BANDS ];
Alexandre Lision744f7422013-09-25 11:39:37 -040096 opus_int ret = 0;
97 silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
Alexandre Lision85382382014-01-27 15:54:16 -050098 SAVE_STACK;
Alexandre Lision744f7422013-09-25 11:39:37 -040099
100 /* Safety checks */
101 silk_assert( VAD_N_BANDS == 4 );
102 silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
103 silk_assert( psEncC->frame_length <= 512 );
104 silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
105
106 /***********************/
107 /* Filter and Decimate */
108 /***********************/
Alexandre Lision85382382014-01-27 15:54:16 -0500109 decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
110 decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
111 decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
112 /* Decimate into 4 bands:
113 0 L 3L L 3L 5L
114 - -- - -- --
115 8 8 2 4 4
116
117 [0-1 kHz| temp. |1-2 kHz| 2-4 kHz | 4-8 kHz |
118
119 They're arranged to allow the minimal ( frame_length / 4 ) extra
120 scratch space during the downsampling process */
121 X_offset[ 0 ] = 0;
122 X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
123 X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
124 X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
125 ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
126
Alexandre Lision744f7422013-09-25 11:39:37 -0400127 /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
Alexandre Lision85382382014-01-27 15:54:16 -0500128 silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[ 0 ],
129 X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
Alexandre Lision744f7422013-09-25 11:39:37 -0400130
131 /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
Alexandre Lision85382382014-01-27 15:54:16 -0500132 silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
133 X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
Alexandre Lision744f7422013-09-25 11:39:37 -0400134
135 /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
Alexandre Lision85382382014-01-27 15:54:16 -0500136 silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
137 X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
Alexandre Lision744f7422013-09-25 11:39:37 -0400138
139 /*********************************************/
140 /* HP filter on lowest band (differentiator) */
141 /*********************************************/
Alexandre Lision85382382014-01-27 15:54:16 -0500142 X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
143 HPstateTmp = X[ decimated_framelength - 1 ];
Alexandre Lision744f7422013-09-25 11:39:37 -0400144 for( i = decimated_framelength - 1; i > 0; i-- ) {
Alexandre Lision85382382014-01-27 15:54:16 -0500145 X[ i - 1 ] = silk_RSHIFT( X[ i - 1 ], 1 );
146 X[ i ] -= X[ i - 1 ];
Alexandre Lision744f7422013-09-25 11:39:37 -0400147 }
Alexandre Lision85382382014-01-27 15:54:16 -0500148 X[ 0 ] -= psSilk_VAD->HPstate;
Alexandre Lision744f7422013-09-25 11:39:37 -0400149 psSilk_VAD->HPstate = HPstateTmp;
150
151 /*************************************/
152 /* Calculate the energy in each band */
153 /*************************************/
154 for( b = 0; b < VAD_N_BANDS; b++ ) {
155 /* Find the decimated framelength in the non-uniformly divided bands */
156 decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
157
158 /* Split length into subframe lengths */
159 dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
160 dec_subframe_offset = 0;
161
162 /* Compute energy per sub-frame */
163 /* initialize with summed energy of last subframe */
164 Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
165 for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
166 sumSquared = 0;
167 for( i = 0; i < dec_subframe_length; i++ ) {
168 /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2. */
169 /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128) */
Alexandre Lision85382382014-01-27 15:54:16 -0500170 x_tmp = silk_RSHIFT(
171 X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
Alexandre Lision744f7422013-09-25 11:39:37 -0400172 sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
173
174 /* Safety check */
175 silk_assert( sumSquared >= 0 );
176 }
177
178 /* Add/saturate summed energy of current subframe */
179 if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
180 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
181 } else {
182 /* Look-ahead subframe */
183 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
184 }
185
186 dec_subframe_offset += dec_subframe_length;
187 }
188 psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
189 }
190
191 /********************/
192 /* Noise estimation */
193 /********************/
194 silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
195
196 /***********************************************/
197 /* Signal-plus-noise to noise ratio estimation */
198 /***********************************************/
199 sumSquared = 0;
200 input_tilt = 0;
201 for( b = 0; b < VAD_N_BANDS; b++ ) {
202 speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
203 if( speech_nrg > 0 ) {
204 /* Divide, with sufficient resolution */
205 if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
206 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
207 } else {
208 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
209 }
210
211 /* Convert to log domain */
212 SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
213
214 /* Sum-of-squares */
215 sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 ); /* Q14 */
216
217 /* Tilt measure */
218 if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
219 /* Scale down SNR value for small subband speech energies */
220 SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
221 }
222 input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
223 } else {
224 NrgToNoiseRatio_Q8[ b ] = 256;
225 }
226 }
227
228 /* Mean-of-squares */
229 sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
230
231 /* Root-mean-square approximation, scale to dBs, and write to output pointer */
232 pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
233
234 /*********************************/
235 /* Speech Probability Estimation */
236 /*********************************/
237 SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
238
239 /**************************/
240 /* Frequency Tilt Measure */
241 /**************************/
242 psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
243
244 /**************************************************/
245 /* Scale the sigmoid output based on power levels */
246 /**************************************************/
247 speech_nrg = 0;
248 for( b = 0; b < VAD_N_BANDS; b++ ) {
249 /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
250 speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
251 }
252
253 /* Power scaling */
254 if( speech_nrg <= 0 ) {
255 SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
256 } else if( speech_nrg < 32768 ) {
257 if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
258 speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
259 } else {
260 speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
261 }
262
263 /* square-root */
264 speech_nrg = silk_SQRT_APPROX( speech_nrg );
265 SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
266 }
267
268 /* Copy the resulting speech activity in Q8 */
269 psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
270
271 /***********************************/
272 /* Energy Level and SNR estimation */
273 /***********************************/
274 /* Smoothing coefficient */
275 smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
276
277 if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
278 smooth_coef_Q16 >>= 1;
279 }
280
281 for( b = 0; b < VAD_N_BANDS; b++ ) {
282 /* compute smoothed energy-to-noise ratio per band */
283 psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
284 NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
285
286 /* signal to noise ratio in dB per band */
287 SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
288 /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
289 psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
290 }
291
Alexandre Lision85382382014-01-27 15:54:16 -0500292 RESTORE_STACK;
Alexandre Lision744f7422013-09-25 11:39:37 -0400293 return( ret );
294}
295
296/**************************/
297/* Noise level estimation */
298/**************************/
Alexandre Lision85382382014-01-27 15:54:16 -0500299static OPUS_INLINE void silk_VAD_GetNoiseLevels(
Alexandre Lision744f7422013-09-25 11:39:37 -0400300 const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */
301 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
302)
303{
304 opus_int k;
305 opus_int32 nl, nrg, inv_nrg;
306 opus_int coef, min_coef;
307
308 /* Initially faster smoothing */
309 if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
310 min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
311 } else {
312 min_coef = 0;
313 }
314
315 for( k = 0; k < VAD_N_BANDS; k++ ) {
316 /* Get old noise level estimate for current band */
317 nl = psSilk_VAD->NL[ k ];
318 silk_assert( nl >= 0 );
319
320 /* Add bias */
321 nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
322 silk_assert( nrg > 0 );
323
324 /* Invert energies */
325 inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
326 silk_assert( inv_nrg >= 0 );
327
328 /* Less update when subband energy is high */
329 if( nrg > silk_LSHIFT( nl, 3 ) ) {
330 coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
331 } else if( nrg < nl ) {
332 coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
333 } else {
334 coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
335 }
336
337 /* Initially faster smoothing */
338 coef = silk_max_int( coef, min_coef );
339
340 /* Smooth inverse energies */
341 psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
342 silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
343
344 /* Compute noise level by inverting again */
345 nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
346 silk_assert( nl >= 0 );
347
348 /* Limit noise levels (guarantee 7 bits of head room) */
349 nl = silk_min( nl, 0x00FFFFFF );
350
351 /* Store as part of state */
352 psSilk_VAD->NL[ k ] = nl;
353 }
354
355 /* Increment frame counter */
356 psSilk_VAD->counter++;
357}