| #include "stt_whisper.h" |
| #include "whisper.h" |
| |
| #ifdef WIN32 |
| #define _USE_MATH_DEFINES |
| #include <math.h> |
| #endif |
| |
| #include <atomic> |
| #include <cmath> |
| #include <mutex> |
| #include <string> |
| #include <thread> |
| #include <vector> |
| |
/** Debug helper: dump at most the first ten samples of `data` to stdout. */
void print_array(const std::vector<float>& data)
{
    // Cap the output so large buffers do not flood the console.
    const int total = (int)data.size();
    const int shown = total < 10 ? total : 10;

    fprintf(stdout, "print array: [");
    int idx = 0;
    while (idx < shown) {
        fprintf(stdout, " %.8f,", data[idx]);
        ++idx;
    }
    fprintf(stdout, " ]\n");
}
| |
/**
 * Apply a first-order (single-pole RC) high-pass filter to `data` in place.
 *
 * Implements the discrete-time recurrence
 *     y[i] = alpha * (y[i-1] + x[i] - x[i-1]),   alpha = RC / (RC + dt)
 * with RC = 1 / (2 * pi * cutoff) and dt = 1 / sample_rate. The first sample
 * is left unchanged and seeds the recurrence.
 *
 * @param data        Samples to filter; overwritten with the filtered signal.
 * @param cutoff      Cutoff frequency, in the same unit as `sample_rate`.
 * @param sample_rate Sampling rate of `data`.
 *
 * Empty input, or a non-positive (or NaN) `cutoff` / `sample_rate`, leaves
 * `data` untouched.
 */
void high_pass_filter(std::vector<float>& data, float cutoff, float sample_rate)
{
    if (data.empty() || !(cutoff > 0.0f) || !(sample_rate > 0.0f)) {
        return;
    }

    // Spelled out locally: M_PI is a non-standard macro whose availability
    // depends on platform macros and header include order.
    constexpr double kPi = 3.14159265358979323846;

    const float rc = static_cast<float>(1.0 / (2.0 * kPi * cutoff));
    const float dt = 1.0f / sample_rate;
    const float alpha = rc / (rc + dt);

    // The recurrence needs the previous *unfiltered* sample. Because the
    // output is written back into `data`, that value must be remembered
    // before it is overwritten.
    float prev_in = data[0];
    float y = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        const float in = data[i];
        y = alpha * (y + in - prev_in);
        data[i] = y;
        prev_in = in;
    }
}
| |
| /** Check if speech is ending. */ |
| bool vad_simple(std::vector<float>& pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) |
| { |
| const int n_samples = pcmf32.size(); |
| const int n_samples_last = (sample_rate * last_ms) / 1000; |
| |
| if (n_samples_last >= n_samples) { |
| // not enough samples - assume no speech |
| return false; |
| } |
| |
| if (freq_thold > 0.0f) { |
| high_pass_filter(pcmf32, freq_thold, sample_rate); |
| } |
| |
| float energy_all = 0.0f; |
| float energy_last = 0.0f; |
| |
| for (int i = 0; i < n_samples; i++) { |
| energy_all += fabsf(pcmf32[i]); |
| |
| if (i >= n_samples - n_samples_last) { |
| energy_last += fabsf(pcmf32[i]); |
| } |
| } |
| |
| energy_all /= n_samples; |
| energy_last /= n_samples_last; |
| |
| if (verbose) { |
| fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); |
| } |
| |
| if ((energy_all < 0.0001f && energy_last < 0.0001f) || energy_last > vad_thold * energy_all) { |
| return false; |
| } |
| |
| return true; |
| } |
| |
| RealtimeSttWhisper::RealtimeSttWhisper(const std::string& path_model) |
| { |
| ctx = whisper_init_from_file(path_model.c_str()); |
| is_running = true; |
| worker = std::thread(&RealtimeSttWhisper::Run, this); |
| t_last_iter = std::chrono::high_resolution_clock::now(); |
| } |
| |
| RealtimeSttWhisper::~RealtimeSttWhisper() |
| { |
| is_running = false; |
| if (worker.joinable()) |
| worker.join(); |
| whisper_free(ctx); |
| } |
| |
| /** Add audio data in PCM f32 format. */ |
| void RealtimeSttWhisper::AddAudioData(const float* data, size_t n_samples) |
| { |
| std::lock_guard<std::mutex> lock(s_mutex); |
| // printf("AddAudioData: remaining: %d, new: %d\n", (int)s_queued_pcmf32.size(), (int)data.size()); |
| s_queued_pcmf32.insert(s_queued_pcmf32.end(), data, data + n_samples); |
| } |
| |
| /** Get newly transcribed text. */ |
| std::vector<transcribed_msg> RealtimeSttWhisper::GetTranscribed() |
| { |
| std::vector<transcribed_msg> transcribed; |
| std::lock_guard<std::mutex> lock(s_mutex); |
| transcribed = std::move(s_transcribed_msgs); |
| s_transcribed_msgs.clear(); |
| return transcribed; |
| } |
| |
| /** Run Whisper in its own thread to not block the main thread. */ |
| void RealtimeSttWhisper::Run() |
| { |
| struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); |
| |
| // See here for example https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp#L302 |
| wparams.n_threads = 4; |
| wparams.no_context = true; |
| wparams.single_segment = true; |
| wparams.print_progress = false; |
| wparams.print_realtime = false; |
| wparams.print_special = false; |
| wparams.print_timestamps = false; |
| wparams.max_tokens = 64; |
| wparams.translate = false; |
| |
| /** |
| * Experimental optimization: Reduce audio_ctx to 15s (half of the chunk |
| * size whisper is designed for) to speed up 2x. |
| * https://github.com/ggerganov/whisper.cpp/issues/137#issuecomment-1318412267 |
| */ |
| wparams.audio_ctx = 768; |
| |
| /* When more than this amount of audio received, run an iteration. */ |
| const int trigger_ms = 400; |
| const int n_samples_trigger = (trigger_ms / 1000.0) * WHISPER_SAMPLE_RATE; |
| /** |
| * When more than this amount of audio accumulates in the audio buffer, |
| * force finalize current audio context and clear the buffer. Note that |
| * VAD may finalize an iteration earlier. |
| */ |
| // This is recommended to be smaller than the time wparams.audio_ctx |
| // represents so an iteration can fit in one chunk. |
| const int iter_threshold_ms = trigger_ms * 35; |
| const int n_samples_iter_threshold = (iter_threshold_ms / 1000.0) * WHISPER_SAMPLE_RATE; |
| |
| /** |
| * ### Reminders |
| * |
| * - Note that whisper designed to process audio in 30-second chunks, and |
| * the execution time of processing smaller chunks may not be shorter. |
| * - The design of trigger and threshold allows inputing audio data at |
| * arbitrary rates with zero config. Inspired by Assembly.ai's |
| * real-time transcription API |
| * (https://github.com/misraturp/Real-time-transcription-from-microphone/blob/main/speech_recognition.py) |
| */ |
| |
| /* VAD parameters */ |
| // The most recent 3s. |
| const int vad_window_s = 3; |
| const int n_samples_vad_window = WHISPER_SAMPLE_RATE * vad_window_s; |
| // In VAD, compare the energy of the last 500ms to that of the total 3s. |
| const int vad_last_ms = 500; |
| // Keep the last 0.5s of an iteration to the next one for better |
| // transcription at begin/end. |
| const int n_samples_keep_iter = WHISPER_SAMPLE_RATE * 0.5; |
| const float vad_thold = 0.3f; |
| const float freq_thold = 200.0f; |
| |
| /* Audio buffer */ |
| std::vector<float> pcmf32; |
| |
| /* Processing loop */ |
| while (is_running) { |
| { |
| std::unique_lock<std::mutex> lock(s_mutex); |
| |
| if (s_queued_pcmf32.size() < n_samples_trigger) { |
| lock.unlock(); |
| std::this_thread::sleep_for(std::chrono::milliseconds(10)); |
| continue; |
| } |
| } |
| |
| { |
| std::lock_guard<std::mutex> lock(s_mutex); |
| |
| if (s_queued_pcmf32.size() > 2 * n_samples_iter_threshold) { |
| fprintf(stderr, "\n\n%s: WARNING: too much audio is going to be processed, result may not come out in real time\n\n", __func__); |
| } |
| } |
| |
| { |
| std::lock_guard<std::mutex> lock(s_mutex); |
| |
| pcmf32.insert(pcmf32.end(), s_queued_pcmf32.begin(), s_queued_pcmf32.end()); |
| |
| // printf("existing: %d, new: %d, will process: %d, threshold: %d\n", |
| // n_samples_old, n_samples_new, (int)pcmf32.size(), n_samples_iter_threshold); |
| |
| // print_array(pcmf32); |
| |
| s_queued_pcmf32.clear(); |
| wparams.language = lang_.c_str(); |
| } |
| |
| { |
| int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()); |
| if (ret != 0) { |
| fprintf(stderr, "Failed to process audio, returned %d\n", ret); |
| continue; |
| } |
| } |
| |
| { |
| transcribed_msg msg; |
| |
| const int n_segments = whisper_full_n_segments(ctx); |
| for (int i = 0; i < n_segments; ++i) { |
| const char* text = whisper_full_get_segment_text(ctx, i); |
| msg.text += text; |
| } |
| |
| /** |
| * Simple VAD from the "stream" example in whisper.cpp |
| * https://github.com/ggerganov/whisper.cpp/blob/231bebca7deaf32d268a8b207d15aa859e52dbbe/examples/stream/stream.cpp#L378 |
| */ |
| bool speech_has_end = false; |
| |
| /* Need enough accumulated audio to do VAD. */ |
| if ((int)pcmf32.size() >= n_samples_vad_window) { |
| std::vector<float> pcmf32_window(pcmf32.end() - n_samples_vad_window, pcmf32.end()); |
| speech_has_end = vad_simple(pcmf32_window, WHISPER_SAMPLE_RATE, vad_last_ms, |
| vad_thold, freq_thold, false); |
| if (speech_has_end) |
| printf("speech end detected\n"); |
| } |
| |
| /** |
| * Clear audio buffer when the size exceeds iteration threshold or |
| * speech end is detected. |
| */ |
| if (pcmf32.size() > n_samples_iter_threshold || speech_has_end) { |
| const auto t_now = std::chrono::high_resolution_clock::now(); |
| const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last_iter).count(); |
| printf("iter took: %lldms\n", t_diff); |
| t_last_iter = t_now; |
| |
| msg.is_partial = false; |
| /** |
| * Keep the last few samples in the audio buffer, so the next |
| * iteration has a smoother start. |
| */ |
| std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end()); |
| pcmf32 = std::move(last); |
| } else { |
| msg.is_partial = true; |
| } |
| |
| std::lock_guard<std::mutex> lock(s_mutex); |
| s_transcribed_msgs.insert(s_transcribed_msgs.end(), std::move(msg)); |
| } |
| } |
| } |