blob: 11a5adf54b18139ca57317168a7656f05e96ad11 [file] [log] [blame]
/**
 * Copyright (C) 2022 Savoir-faire Linux Inc.
 *
 * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
20
21#include "TranscriptAudioSubscriber.h"
22
23#include <pluglog.h>
24#include <frameUtils.h>
25#include <bitset>
26#include <iostream>
Adrien Beraud087d5592023-03-06 11:22:33 -050027#include <fmt/core.h>
28#include <fmt/format.h>
Aline Gondim Santos329f8622022-11-08 08:04:22 -030029
Adrien Beraud087d5592023-03-06 11:22:33 -050030#include "stt_whisper.h"
31
32const std::string TAG = "TranscriptAudio";
Aline Gondim Santos329f8622022-11-08 08:04:22 -030033const char sep = separator();
34
35namespace jami {
36
Adrien Beraud087d5592023-03-06 11:22:33 -050037TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath,
38 TranscriptVideoSubscriber* videoSubscriber)
Aline Gondim Santos329f8622022-11-08 08:04:22 -030039 : path_ {dataPath}
Aline Gondim Santos329f8622022-11-08 08:04:22 -030040 , mVS_ {videoSubscriber}
41{
Adrien Beraud087d5592023-03-06 11:22:33 -050042 Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("TranscriptAudioSubscriber {}", fmt::ptr(this)));
Aline Gondim Santos329f8622022-11-08 08:04:22 -030043}
44
45TranscriptAudioSubscriber::~TranscriptAudioSubscriber()
46{
Adrien Beraud087d5592023-03-06 11:22:33 -050047 Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("~TranscriptAudioSubscriber {}", fmt::ptr(this)));
Aline Gondim Santos329f8622022-11-08 08:04:22 -030048}
49
Aline Gondim Santos440e8122023-01-09 14:03:49 -030050/**
51 * Waits for audio samples and then process them
52 **/
53void
54TranscriptAudioSubscriber::processFrame()
55{
Adrien Beraud087d5592023-03-06 11:22:33 -050056 if (!whisper_) {
57 whisper_ = std::make_unique<RealtimeSttWhisper>(path_ + "/assets/ggml-base.bin");
58 whisper_->setLanguage(language_);
Aline Gondim Santos440e8122023-01-09 14:03:49 -030059 }
Adrien Beraud087d5592023-03-06 11:22:33 -050060
61 while (running) {
62 decltype(frames_) frames;
63 {
64 std::unique_lock<std::mutex> l(inputLock);
65 cv_.wait(l, [&]{
66 return !running || !frames_.empty();
67 });
68 if (!running)
69 return;
70 frames = std::move(frames_);
71 }
72
73 for (auto& f : frames) {
74 uniqueFramePtr filteredFrame = getUniqueFrame();
75 filteredFrame->sample_rate = WHISPER_SAMPLE_RATE;
76 filteredFrame->format = AV_SAMPLE_FMT_FLT;
77 av_channel_layout_from_mask(&filteredFrame->ch_layout , AV_CH_LAYOUT_MONO);
78 try {
79 if (resampler_.resample(f.get(), filteredFrame.get()) == 0) {
80 whisper_->AddAudioData((float*) filteredFrame->buf[0]->data,
81 filteredFrame->nb_samples);
82 }
83 } catch (...) {
84 }
85 }
86
87 auto result = whisper_->GetTranscribed();
88 if (not result.empty()) {
89 std::string txt;
90 for (const auto& t : result) {
91 if (not t.is_partial)
92 txt += t.text;
93 }
94 if (!txt.empty())
95 mVS_->setText(txt);
96 }
97 }
98 whisper_.reset();
Aline Gondim Santos440e8122023-01-09 14:03:49 -030099}
100
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300101void
102TranscriptAudioSubscriber::stop()
103{
Adrien Beraud087d5592023-03-06 11:22:33 -0500104 Plog::log(Plog::LogPriority::INFO, TAG, "stop()");
105 {
106 std::unique_lock<std::mutex> l(inputLock);
107 running = false;
108 cv_.notify_all();
109 }
Aline Gondim Santos440e8122023-01-09 14:03:49 -0300110 if (processFrameThread.joinable()) {
111 processFrameThread.join();
112 }
Adrien Beraud087d5592023-03-06 11:22:33 -0500113 mVS_->setText("");
Aline Gondim Santos440e8122023-01-09 14:03:49 -0300114}
115
116void
117TranscriptAudioSubscriber::start()
118{
Adrien Beraud087d5592023-03-06 11:22:33 -0500119 Plog::log(Plog::LogPriority::INFO, TAG, "start()");
Aline Gondim Santos440e8122023-01-09 14:03:49 -0300120 running = true;
Adrien Beraud087d5592023-03-06 11:22:33 -0500121 processFrameThread = std::thread([this](){ processFrame(); });
122 mVS_->setText("");
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300123}
124
125void
Adrien Beraud087d5592023-03-06 11:22:33 -0500126TranscriptAudioSubscriber::setParameter(const std::string& parameter, Parameter type)
Aline Gondim Santosbd032f82022-11-25 15:39:12 -0300127{
Aline Gondim Santos440e8122023-01-09 14:03:49 -0300128 std::unique_lock<std::mutex> l(inputLock);
Aline Gondim Santosbd032f82022-11-25 15:39:12 -0300129 switch (type) {
130 case (Parameter::LANGUAGE):
131 language_ = parameter;
Adrien Beraud087d5592023-03-06 11:22:33 -0500132 if (whisper_)
133 whisper_->setLanguage(parameter);
Aline Gondim Santosbd032f82022-11-25 15:39:12 -0300134 break;
135 default:
136 return;
137 }
138}
139
140void
Aline Gondim Santos440e8122023-01-09 14:03:49 -0300141TranscriptAudioSubscriber::update(jami::Observable<AVFrame*>* obs, AVFrame* const& pluginFrame)
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300142{
Aline Gondim Santos440e8122023-01-09 14:03:49 -0300143 std::unique_lock<std::mutex> l(inputLock);
Adrien Beraud087d5592023-03-06 11:22:33 -0500144 if (!pluginFrame || obs != observable_)
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300145 return;
146
Adrien Beraud087d5592023-03-06 11:22:33 -0500147 frames_.emplace_back(uniqueFramePtr(av_frame_clone(pluginFrame), frameFree));
148 cv_.notify_all();
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300149 // audio returns as is
150}
151
152void
153TranscriptAudioSubscriber::attached(jami::Observable<AVFrame*>* observable)
154{
Adrien Beraud087d5592023-03-06 11:22:33 -0500155 std::unique_lock<std::mutex> l(inputLock);
156 Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Attached ! {} for {}", fmt::ptr(this), fmt::ptr(observable)));
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300157 observable_ = observable;
Aline Gondim Santos440e8122023-01-09 14:03:49 -0300158 start();
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300159}
160
161void
Adrien Beraud087d5592023-03-06 11:22:33 -0500162TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>* observable)
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300163{
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300164 firstRun = true;
165 observable_ = nullptr;
Aline Gondim Santos440e8122023-01-09 14:03:49 -0300166 stop();
Adrien Beraud087d5592023-03-06 11:22:33 -0500167 Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Detached ! {} for {}", fmt::ptr(this), fmt::ptr(observable)));
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300168}
169
170void
171TranscriptAudioSubscriber::detach()
172{
173 if (observable_) {
174 firstRun = true;
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300175 Plog::log(Plog::LogPriority::INFO, TAG, "::Calling detach()");
176 observable_->detach(this);
177 }
178}
179} // namespace jami