// blob: d4d9a45f1f7d68b40a3fd094bdfe8efc437db0ae [file] [log] [blame]
#include "stt_whisper.h"
#include "whisper.h"
#ifdef WIN32
// MSVC's <math.h> only exposes M_PI and friends when this is defined first.
#define _USE_MATH_DEFINES
#include <math.h>
#endif
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
/**
 * Debug helper: write at most the first 10 values of `data` to stdout on a
 * single line, each formatted with 8 decimal places.
 *
 * @param data Samples to preview; may be empty.
 */
void print_array(const std::vector<float>& data)
{
    // Clamp in size_t so a very large vector cannot overflow an int cast.
    const size_t n_shown = data.size() < 10 ? data.size() : 10;

    fprintf(stdout, "print array: [");
    for (size_t i = 0; i < n_shown; i++) {
        fprintf(stdout, " %.8f,", data[i]);
    }
    fprintf(stdout, " ]\n");
}
/**
 * Filter `data` in place using a single-pole RC coefficient derived from
 * `cutoff` and `sample_rate`.
 *
 * data[0] is left untouched; every later sample is replaced by the filter
 * output. An empty buffer is a no-op.
 *
 * NOTE(review): because the buffer is overwritten in place, data[i - 1]
 * already holds the previous output `y` when it is read, so each step reduces
 * to y = alpha * data[i] (a plain gain, up to float rounding). The arithmetic
 * is kept as-is so thresholds tuned against it keep working; confirm whether
 * a true high-pass (using the previous *input* sample) was intended.
 *
 * @param data        Samples to filter; modified in place.
 * @param cutoff      Cutoff frequency, same unit as `sample_rate`; expected > 0.
 * @param sample_rate Sampling rate of `data`; expected > 0.
 */
void high_pass_filter(std::vector<float>& data, float cutoff, float sample_rate)
{
    // data[0] seeds the filter state, so there is nothing to do without it.
    if (data.empty()) {
        return;
    }

    // Local constant instead of M_PI, which is not part of standard C++.
    constexpr double kPi = 3.14159265358979323846;

    const float rc = 1.0f / (2.0f * kPi * cutoff);
    const float dt = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);

    float y = data[0];
    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
}
/** Check if speech is ending. */
bool vad_simple(std::vector<float>& pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose)
const int n_samples = pcmf32.size();
const int n_samples_last = (sample_rate * last_ms) / 1000;
if (n_samples_last >= n_samples) {
// not enough samples - assume no speech
return false;
if (freq_thold > 0.0f) {
high_pass_filter(pcmf32, freq_thold, sample_rate);
float energy_all = 0.0f;
float energy_last = 0.0f;
for (int i = 0; i < n_samples; i++) {
energy_all += fabsf(pcmf32[i]);
if (i >= n_samples - n_samples_last) {
energy_last += fabsf(pcmf32[i]);
energy_all /= n_samples;
energy_last /= n_samples_last;
if (verbose) {
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
if ((energy_all < 0.0001f && energy_last < 0.0001f) || energy_last > vad_thold * energy_all) {
return false;
return true;
/**
 * Load the Whisper model and start the background worker thread.
 *
 * If the model cannot be loaded, an error is printed to stderr and no worker
 * is started, so the instance stays idle instead of running inference on a
 * null context.
 *
 * @param path_model Path of the model file handed to whisper_init_from_file().
 */
RealtimeSttWhisper::RealtimeSttWhisper(const std::string& path_model)
{
    ctx = whisper_init_from_file(path_model.c_str());
    if (ctx == nullptr) {
        fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model.c_str());
        is_running = false;
        return;
    }

    // Must be set before the worker exists: Run() reads t_last_iter, and
    // writing it after the thread has started would race with that read.
    t_last_iter = std::chrono::high_resolution_clock::now();

    is_running = true;
    worker = std::thread(&RealtimeSttWhisper::Run, this);
}

/**
 * Stop the worker thread, wait for it to finish, and release the Whisper
 * context.
 *
 * NOTE(review): the signature line and the join/free statements were missing
 * from this copy of the file and have been reconstructed; confirm against the
 * declaration in stt_whisper.h.
 */
RealtimeSttWhisper::~RealtimeSttWhisper()
{
    // Run() polls is_running and leaves its loop once this is false.
    is_running = false;
    if (worker.joinable()) {
        worker.join();
    }

    // Only free after the worker has exited; it uses ctx for inference.
    if (ctx != nullptr) {
        whisper_free(ctx);
        ctx = nullptr;
    }
}
/** Add audio data in PCM f32 format. */
void RealtimeSttWhisper::AddAudioData(const float* data, size_t n_samples)
std::lock_guard<std::mutex> lock(s_mutex);
// printf("AddAudioData: remaining: %d, new: %d\n", (int)s_queued_pcmf32.size(), (int)data.size());
s_queued_pcmf32.insert(s_queued_pcmf32.end(), data, data + n_samples);
/** Get newly transcribed text. */
std::vector<transcribed_msg> RealtimeSttWhisper::GetTranscribed()
std::vector<transcribed_msg> transcribed;
std::lock_guard<std::mutex> lock(s_mutex);
transcribed = std::move(s_transcribed_msgs);
return transcribed;
/** Run Whisper in its own thread to not block the main thread. */
void RealtimeSttWhisper::Run()
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
// See here for example
wparams.n_threads = 4;
wparams.no_context = true;
wparams.single_segment = true;
wparams.print_progress = false;
wparams.print_realtime = false;
wparams.print_special = false;
wparams.print_timestamps = false;
wparams.max_tokens = 64;
wparams.translate = false;
* Experimental optimization: Reduce audio_ctx to 15s (half of the chunk
* size whisper is designed for) to speed up 2x.
wparams.audio_ctx = 768;
/* When more than this amount of audio received, run an iteration. */
const int trigger_ms = 400;
const int n_samples_trigger = (trigger_ms / 1000.0) * WHISPER_SAMPLE_RATE;
* When more than this amount of audio accumulates in the audio buffer,
* force finalize current audio context and clear the buffer. Note that
* VAD may finalize an iteration earlier.
// This is recommended to be smaller than the time wparams.audio_ctx
// represents so an iteration can fit in one chunk.
const int iter_threshold_ms = trigger_ms * 35;
const int n_samples_iter_threshold = (iter_threshold_ms / 1000.0) * WHISPER_SAMPLE_RATE;
* ### Reminders
* - Note that whisper designed to process audio in 30-second chunks, and
* the execution time of processing smaller chunks may not be shorter.
* - The design of trigger and threshold allows inputing audio data at
* arbitrary rates with zero config. Inspired by's
* real-time transcription API
* (
/* VAD parameters */
// The most recent 3s.
const int vad_window_s = 3;
const int n_samples_vad_window = WHISPER_SAMPLE_RATE * vad_window_s;
// In VAD, compare the energy of the last 500ms to that of the total 3s.
const int vad_last_ms = 500;
// Keep the last 0.5s of an iteration to the next one for better
// transcription at begin/end.
const int n_samples_keep_iter = WHISPER_SAMPLE_RATE * 0.5;
const float vad_thold = 0.3f;
const float freq_thold = 200.0f;
/* Audio buffer */
std::vector<float> pcmf32;
/* Processing loop */
while (is_running) {
std::unique_lock<std::mutex> lock(s_mutex);
if (s_queued_pcmf32.size() < n_samples_trigger) {
std::lock_guard<std::mutex> lock(s_mutex);
if (s_queued_pcmf32.size() > 2 * n_samples_iter_threshold) {
fprintf(stderr, "\n\n%s: WARNING: too much audio is going to be processed, result may not come out in real time\n\n", __func__);
std::lock_guard<std::mutex> lock(s_mutex);
pcmf32.insert(pcmf32.end(), s_queued_pcmf32.begin(), s_queued_pcmf32.end());
// printf("existing: %d, new: %d, will process: %d, threshold: %d\n",
// n_samples_old, n_samples_new, (int)pcmf32.size(), n_samples_iter_threshold);
// print_array(pcmf32);
wparams.language = lang_.c_str();
int ret = whisper_full(ctx, wparams,, pcmf32.size());
if (ret != 0) {
fprintf(stderr, "Failed to process audio, returned %d\n", ret);
transcribed_msg msg;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char* text = whisper_full_get_segment_text(ctx, i);
msg.text += text;
* Simple VAD from the "stream" example in whisper.cpp
bool speech_has_end = false;
/* Need enough accumulated audio to do VAD. */
if ((int)pcmf32.size() >= n_samples_vad_window) {
std::vector<float> pcmf32_window(pcmf32.end() - n_samples_vad_window, pcmf32.end());
speech_has_end = vad_simple(pcmf32_window, WHISPER_SAMPLE_RATE, vad_last_ms,
vad_thold, freq_thold, false);
if (speech_has_end)
printf("speech end detected\n");
* Clear audio buffer when the size exceeds iteration threshold or
* speech end is detected.
if (pcmf32.size() > n_samples_iter_threshold || speech_has_end) {
const auto t_now = std::chrono::high_resolution_clock::now();
const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last_iter).count();
printf("iter took: %ldms\n", t_diff);
t_last_iter = t_now;
msg.is_partial = false;
* Keep the last few samples in the audio buffer, so the next
* iteration has a smoother start.
std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
pcmf32 = std::move(last);
} else {
msg.is_partial = true;
std::lock_guard<std::mutex> lock(s_mutex);
s_transcribed_msgs.insert(s_transcribed_msgs.end(), std::move(msg));