Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 1 | /** |
| 2 | * Copyright (C) 2022 Savoir-faire Linux Inc. |
| 3 | * |
| 4 | * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> |
| 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License as published by |
| 8 | * the Free Software Foundation; either version 3 of the License, or |
| 9 | * (at your option) any later version. |
| 10 | * |
| 11 | * This program is distributed in the hope that it will be useful, |
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | * GNU General Public License for more details. |
| 15 | * |
| 16 | * You should have received a copy of the GNU General Public License |
| 17 | * along with this program; if not, write to the Free Software |
| 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| 19 | */ |
| 20 | |
| 21 | #include "TranscriptAudioSubscriber.h" |
| 22 | |
| 23 | #include <pluglog.h> |
| 24 | #include <frameUtils.h> |
| 25 | #include <bitset> |
| 26 | #include <iostream> |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 27 | #include <fmt/core.h> |
| 28 | #include <fmt/format.h> |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 29 | |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 30 | #include "stt_whisper.h" |
| 31 | |
| 32 | const std::string TAG = "TranscriptAudio"; |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 33 | const char sep = separator(); |
| 34 | |
| 35 | namespace jami { |
| 36 | |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 37 | TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath, |
| 38 | TranscriptVideoSubscriber* videoSubscriber) |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 39 | : path_ {dataPath} |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 40 | , mVS_ {videoSubscriber} |
| 41 | { |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 42 | Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("TranscriptAudioSubscriber {}", fmt::ptr(this))); |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 43 | } |
| 44 | |
| 45 | TranscriptAudioSubscriber::~TranscriptAudioSubscriber() |
| 46 | { |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 47 | Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("~TranscriptAudioSubscriber {}", fmt::ptr(this))); |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 48 | } |
| 49 | |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 50 | /** |
| 51 | * Waits for audio samples and then process them |
| 52 | **/ |
| 53 | void |
| 54 | TranscriptAudioSubscriber::processFrame() |
| 55 | { |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 56 | if (!whisper_) { |
| 57 | whisper_ = std::make_unique<RealtimeSttWhisper>(path_ + "/assets/ggml-base.bin"); |
| 58 | whisper_->setLanguage(language_); |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 59 | } |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 60 | |
| 61 | while (running) { |
| 62 | decltype(frames_) frames; |
| 63 | { |
| 64 | std::unique_lock<std::mutex> l(inputLock); |
| 65 | cv_.wait(l, [&]{ |
| 66 | return !running || !frames_.empty(); |
| 67 | }); |
| 68 | if (!running) |
| 69 | return; |
| 70 | frames = std::move(frames_); |
| 71 | } |
| 72 | |
| 73 | for (auto& f : frames) { |
| 74 | uniqueFramePtr filteredFrame = getUniqueFrame(); |
| 75 | filteredFrame->sample_rate = WHISPER_SAMPLE_RATE; |
| 76 | filteredFrame->format = AV_SAMPLE_FMT_FLT; |
| 77 | av_channel_layout_from_mask(&filteredFrame->ch_layout , AV_CH_LAYOUT_MONO); |
| 78 | try { |
| 79 | if (resampler_.resample(f.get(), filteredFrame.get()) == 0) { |
| 80 | whisper_->AddAudioData((float*) filteredFrame->buf[0]->data, |
| 81 | filteredFrame->nb_samples); |
| 82 | } |
| 83 | } catch (...) { |
| 84 | } |
| 85 | } |
| 86 | |
| 87 | auto result = whisper_->GetTranscribed(); |
| 88 | if (not result.empty()) { |
| 89 | std::string txt; |
| 90 | for (const auto& t : result) { |
| 91 | if (not t.is_partial) |
| 92 | txt += t.text; |
| 93 | } |
| 94 | if (!txt.empty()) |
| 95 | mVS_->setText(txt); |
| 96 | } |
| 97 | } |
| 98 | whisper_.reset(); |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 99 | } |
| 100 | |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 101 | void |
| 102 | TranscriptAudioSubscriber::stop() |
| 103 | { |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 104 | Plog::log(Plog::LogPriority::INFO, TAG, "stop()"); |
| 105 | { |
| 106 | std::unique_lock<std::mutex> l(inputLock); |
| 107 | running = false; |
| 108 | cv_.notify_all(); |
| 109 | } |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 110 | if (processFrameThread.joinable()) { |
| 111 | processFrameThread.join(); |
| 112 | } |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 113 | mVS_->setText(""); |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 114 | } |
| 115 | |
| 116 | void |
| 117 | TranscriptAudioSubscriber::start() |
| 118 | { |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 119 | Plog::log(Plog::LogPriority::INFO, TAG, "start()"); |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 120 | running = true; |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 121 | processFrameThread = std::thread([this](){ processFrame(); }); |
| 122 | mVS_->setText(""); |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 123 | } |
| 124 | |
| 125 | void |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 126 | TranscriptAudioSubscriber::setParameter(const std::string& parameter, Parameter type) |
Aline Gondim Santos | bd032f8 | 2022-11-25 15:39:12 -0300 | [diff] [blame] | 127 | { |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 128 | std::unique_lock<std::mutex> l(inputLock); |
Aline Gondim Santos | bd032f8 | 2022-11-25 15:39:12 -0300 | [diff] [blame] | 129 | switch (type) { |
| 130 | case (Parameter::LANGUAGE): |
| 131 | language_ = parameter; |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 132 | if (whisper_) |
| 133 | whisper_->setLanguage(parameter); |
Aline Gondim Santos | bd032f8 | 2022-11-25 15:39:12 -0300 | [diff] [blame] | 134 | break; |
| 135 | default: |
| 136 | return; |
| 137 | } |
| 138 | } |
| 139 | |
| 140 | void |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 141 | TranscriptAudioSubscriber::update(jami::Observable<AVFrame*>* obs, AVFrame* const& pluginFrame) |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 142 | { |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 143 | std::unique_lock<std::mutex> l(inputLock); |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 144 | if (!pluginFrame || obs != observable_) |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 145 | return; |
| 146 | |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 147 | frames_.emplace_back(uniqueFramePtr(av_frame_clone(pluginFrame), frameFree)); |
| 148 | cv_.notify_all(); |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 149 | // audio returns as is |
| 150 | } |
| 151 | |
| 152 | void |
| 153 | TranscriptAudioSubscriber::attached(jami::Observable<AVFrame*>* observable) |
| 154 | { |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 155 | std::unique_lock<std::mutex> l(inputLock); |
| 156 | Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Attached ! {} for {}", fmt::ptr(this), fmt::ptr(observable))); |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 157 | observable_ = observable; |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 158 | start(); |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 159 | } |
| 160 | |
| 161 | void |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 162 | TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>* observable) |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 163 | { |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 164 | firstRun = true; |
| 165 | observable_ = nullptr; |
Aline Gondim Santos | 440e812 | 2023-01-09 14:03:49 -0300 | [diff] [blame] | 166 | stop(); |
Adrien Beraud | 087d559 | 2023-03-06 11:22:33 -0500 | [diff] [blame] | 167 | Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Detached ! {} for {}", fmt::ptr(this), fmt::ptr(observable))); |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 168 | } |
| 169 | |
| 170 | void |
| 171 | TranscriptAudioSubscriber::detach() |
| 172 | { |
| 173 | if (observable_) { |
| 174 | firstRun = true; |
Aline Gondim Santos | 329f862 | 2022-11-08 08:04:22 -0300 | [diff] [blame] | 175 | Plog::log(Plog::LogPriority::INFO, TAG, "::Calling detach()"); |
| 176 | observable_->detach(this); |
| 177 | } |
| 178 | } |
| 179 | } // namespace jami |