// WhisperTranscript/TranscriptAudioSubscriber.cpp (jami-plugins)

 /**
  *  Copyright (C) 2022 Savoir-faire Linux Inc.
  *
  *  Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 3 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA.
  */

 #include "TranscriptAudioSubscriber.h"

 #include <pluglog.h>
 #include <frameUtils.h>
 #include <bitset>
 #include <iostream>
 #include <fmt/core.h>
 #include <fmt/format.h>

 #include "stt_whisper.h"

 // Tag passed to Plog::log for every message emitted by this file.
 const std::string TAG = "TranscriptAudio";
 // Value returned by separator(), which is declared elsewhere; presumably the
 // platform path separator (TODO confirm). Not referenced in the visible code.
 const char sep = separator();

 namespace jami {

 /**
  * Stores the construction arguments and logs the address of the new instance.
  *
  * @param dataPath        copied into path_; processFrame() appends
  *                        "/assets/ggml-base.bin" to it to locate the model
  * @param videoSubscriber copied into mVS_; its setText() receives the text
  */
 TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath,
                                                      TranscriptVideoSubscriber* videoSubscriber)
     : path_ {dataPath}
     , mVS_ {videoSubscriber}
 {
     const auto message = fmt::format("TranscriptAudioSubscriber {}", fmt::ptr(this));
     Plog::log(Plog::LogPriority::INFO, TAG, message);
 }

 /**
  * Logs the address of the instance being destroyed; performs no other work.
  */
 TranscriptAudioSubscriber::~TranscriptAudioSubscriber()
 {
     const auto message = fmt::format("~TranscriptAudioSubscriber {}", fmt::ptr(this));
     Plog::log(Plog::LogPriority::INFO, TAG, message);
 }

 /**
  * Waits for audio samples and then process them
  **/
 void
 TranscriptAudioSubscriber::processFrame()
 {
     if (!whisper_) {
         whisper_ = std::make_unique<RealtimeSttWhisper>(path_ + "/assets/ggml-base.bin");
         whisper_->setLanguage(language_);
     }

     while (running) {
         decltype(frames_) frames;
         {
             std::unique_lock<std::mutex> l(inputLock);
             cv_.wait(l, [&]{
                 return !running || !frames_.empty();
             });
             if (!running)
                 return;
             frames = std::move(frames_);
         }

         for (auto& f : frames) {
             uniqueFramePtr filteredFrame = getUniqueFrame();
             filteredFrame->sample_rate = WHISPER_SAMPLE_RATE;
             filteredFrame->format  = AV_SAMPLE_FMT_FLT;
             av_channel_layout_from_mask(&filteredFrame->ch_layout , AV_CH_LAYOUT_MONO);
             try {
                 if (resampler_.resample(f.get(), filteredFrame.get()) == 0) {
                     whisper_->AddAudioData((float*) filteredFrame->buf[0]->data,
                                            filteredFrame->nb_samples);
                 }
             } catch (...) {
             }
         }

         auto result = whisper_->GetTranscribed();
         if (not result.empty()) {
             std::string txt;
             for (const auto& t : result) {
                 if (not t.is_partial)
                     txt += t.text;
             }
             if (!txt.empty())
                 mVS_->setText(txt);
         }
     }
     whisper_.reset();
 }

 /**
  * Requests the worker thread to finish, waits for it and clears the text
  * held by the video subscriber.
  */
 void
 TranscriptAudioSubscriber::stop()
 {
     Plog::log(Plog::LogPriority::INFO, TAG, "stop()");

     {
         // The flag is cleared and the waiters are woken while inputLock is held.
         std::lock_guard<std::mutex> guard(inputLock);
         running = false;
         cv_.notify_all();
     }

     if (processFrameThread.joinable())
         processFrameThread.join();

     mVS_->setText("");
 }

 /**
  * Starts the worker thread running processFrame() and clears the text held
  * by the video subscriber.
  *
  * Calling start() while a worker already exists is a no-op: assigning a new
  * std::thread to a joinable one would call std::terminate. The existing
  * worker cannot be joined here because attached() calls this function while
  * holding inputLock, which the worker needs in order to wake up.
  */
 void
 TranscriptAudioSubscriber::start()
 {
     Plog::log(Plog::LogPriority::INFO, TAG, "start()");
     if (processFrameThread.joinable()) {
         Plog::log(Plog::LogPriority::INFO, TAG, "start(): worker already running");
         return;
     }
     running = true;
     processFrameThread = std::thread([this]() { processFrame(); });
     mVS_->setText("");
 }

 /**
  * Applies a runtime parameter while holding inputLock.
  *
  * Only Parameter::LANGUAGE is handled: the value is stored in language_ and,
  * when an engine exists, passed to its setLanguage(). Any other type is
  * ignored.
  *
  * @param parameter new value for the parameter
  * @param type      which parameter to change
  */
 void
 TranscriptAudioSubscriber::setParameter(const std::string& parameter, Parameter type)
 {
     std::lock_guard<std::mutex> guard(inputLock);

     if (type != Parameter::LANGUAGE)
         return;

     language_ = parameter;
     if (whisper_)
         whisper_->setLanguage(parameter);
 }

 /**
  * Queues a copy of an incoming audio frame for the worker thread.
  *
  * The frame is ignored when it is null or when it does not come from the
  * observable this subscriber is attached to. The incoming frame itself is
  * not modified.
  *
  * @param obs         observable that published the frame
  * @param pluginFrame frame to clone and queue
  */
 void
 TranscriptAudioSubscriber::update(jami::Observable<AVFrame*>* obs, AVFrame* const& pluginFrame)
 {
     std::unique_lock<std::mutex> l(inputLock);
     if (!pluginFrame || obs != observable_)
         return;

     // av_frame_clone() returns nullptr on failure; a null entry must not
     // reach the queue because the worker passes every entry to the resampler.
     AVFrame* clone = av_frame_clone(pluginFrame);
     if (!clone)
         return;

     frames_.emplace_back(uniqueFramePtr(clone, frameFree));
     cv_.notify_all();
     // audio returns as is
 }

 /**
  * Records the observable this subscriber now belongs to and starts the
  * worker; both happen while inputLock is held.
  *
  * @param observable observable that accepted this subscriber
  */
 void
 TranscriptAudioSubscriber::attached(jami::Observable<AVFrame*>* observable)
 {
     std::lock_guard<std::mutex> guard(inputLock);

     const auto message = fmt::format("::Attached ! {} for {}", fmt::ptr(this), fmt::ptr(observable));
     Plog::log(Plog::LogPriority::INFO, TAG, message);

     observable_ = observable;
     start();
 }

 /**
  * Forgets the observable, stops the worker thread and logs the event.
  *
  * @param observable observable that released this subscriber (only logged)
  */
 void
 TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>* observable)
 {
     {
         // update() and attached() access observable_ under inputLock, so it is
         // cleared under the same lock. The lock is released before stop(),
         // which acquires it again itself.
         std::lock_guard<std::mutex> guard(inputLock);
         firstRun = true;
         observable_ = nullptr;
     }
     stop();
     Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Detached ! {} for {}", fmt::ptr(this), fmt::ptr(observable)));
 }

 /**
  * Asks the current observable, if any, to release this subscriber.
  * Does nothing when no observable is recorded.
  */
 void
 TranscriptAudioSubscriber::detach()
 {
     if (!observable_)
         return;

     firstRun = true;
     Plog::log(Plog::LogPriority::INFO, TAG, "::Calling detach()");
     observable_->detach(this);
 }
 } // namespace jami
	/**
	* Copyright (C) 2022 Savoir-faire Linux Inc.
	*
	* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 3 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
	*/

	#include "TranscriptAudioSubscriber.h"

	#include <pluglog.h>
	#include <frameUtils.h>
	#include <bitset>
	#include <iostream>
	#include <fmt/core.h>
	#include <fmt/format.h>

	#include "stt_whisper.h"

	// Tag passed to Plog::log for every message emitted by this file.
	const std::string TAG = "TranscriptAudio";
	// Value returned by separator(), which is declared elsewhere; presumably the
	// platform path separator (TODO confirm). Not referenced in the visible code.
	const char sep = separator();

	namespace jami {

	/**
	 * Stores the construction arguments and logs the address of the new instance.
	 *
	 * @param dataPath        copied into path_; processFrame() appends
	 *                        "/assets/ggml-base.bin" to it to locate the model
	 * @param videoSubscriber copied into mVS_; its setText() receives the text
	 */
	TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath,
	                                                     TranscriptVideoSubscriber* videoSubscriber)
	    : path_ {dataPath}
	    , mVS_ {videoSubscriber}
	{
	    const auto message = fmt::format("TranscriptAudioSubscriber {}", fmt::ptr(this));
	    Plog::log(Plog::LogPriority::INFO, TAG, message);
	}

	/**
	 * Logs the address of the instance being destroyed; performs no other work.
	 */
	TranscriptAudioSubscriber::~TranscriptAudioSubscriber()
	{
	    const auto message = fmt::format("~TranscriptAudioSubscriber {}", fmt::ptr(this));
	    Plog::log(Plog::LogPriority::INFO, TAG, message);
	}

	/**
	* Waits for audio samples and then process them
	**/
	void
	TranscriptAudioSubscriber::processFrame()
	{
	if (!whisper_) {
	whisper_ = std::make_unique<RealtimeSttWhisper>(path_ + "/assets/ggml-base.bin");
	whisper_->setLanguage(language_);
	}

	while (running) {
	decltype(frames_) frames;
	{
	std::unique_lock<std::mutex> l(inputLock);
	cv_.wait(l, [&]{
	return !running \|\| !frames_.empty();
	});
	if (!running)
	return;
	frames = std::move(frames_);
	}

	for (auto& f : frames) {
	uniqueFramePtr filteredFrame = getUniqueFrame();
	filteredFrame->sample_rate = WHISPER_SAMPLE_RATE;
	filteredFrame->format = AV_SAMPLE_FMT_FLT;
	av_channel_layout_from_mask(&filteredFrame->ch_layout , AV_CH_LAYOUT_MONO);
	try {
	if (resampler_.resample(f.get(), filteredFrame.get()) == 0) {
	whisper_->AddAudioData((float*) filteredFrame->buf[0]->data,
	filteredFrame->nb_samples);
	}
	} catch (...) {
	}
	}

	auto result = whisper_->GetTranscribed();
	if (not result.empty()) {
	std::string txt;
	for (const auto& t : result) {
	if (not t.is_partial)
	txt += t.text;
	}
	if (!txt.empty())
	mVS_->setText(txt);
	}
	}
	whisper_.reset();
	}

	/**
	 * Requests the worker thread to finish, waits for it and clears the text
	 * held by the video subscriber.
	 */
	void
	TranscriptAudioSubscriber::stop()
	{
	    Plog::log(Plog::LogPriority::INFO, TAG, "stop()");

	    {
	        // The flag is cleared and the waiters are woken while inputLock is held.
	        std::lock_guard<std::mutex> guard(inputLock);
	        running = false;
	        cv_.notify_all();
	    }

	    if (processFrameThread.joinable())
	        processFrameThread.join();

	    mVS_->setText("");
	}

	/**
	 * Starts the worker thread running processFrame() and clears the text held
	 * by the video subscriber.
	 *
	 * Calling start() while a worker already exists is a no-op: assigning a new
	 * std::thread to a joinable one would call std::terminate. The existing
	 * worker cannot be joined here because attached() calls this function while
	 * holding inputLock, which the worker needs in order to wake up.
	 */
	void
	TranscriptAudioSubscriber::start()
	{
	    Plog::log(Plog::LogPriority::INFO, TAG, "start()");
	    if (processFrameThread.joinable()) {
	        Plog::log(Plog::LogPriority::INFO, TAG, "start(): worker already running");
	        return;
	    }
	    running = true;
	    processFrameThread = std::thread([this]() { processFrame(); });
	    mVS_->setText("");
	}

	/**
	 * Applies a runtime parameter while holding inputLock.
	 *
	 * Only Parameter::LANGUAGE is handled: the value is stored in language_ and,
	 * when an engine exists, passed to its setLanguage(). Any other type is
	 * ignored.
	 *
	 * @param parameter new value for the parameter
	 * @param type      which parameter to change
	 */
	void
	TranscriptAudioSubscriber::setParameter(const std::string& parameter, Parameter type)
	{
	    std::lock_guard<std::mutex> guard(inputLock);

	    if (type != Parameter::LANGUAGE)
	        return;

	    language_ = parameter;
	    if (whisper_)
	        whisper_->setLanguage(parameter);
	}

	void
	TranscriptAudioSubscriber::update(jami::Observable<AVFrame> obs, AVFrame* const& pluginFrame)
	{
	std::unique_lock<std::mutex> l(inputLock);
	if (!pluginFrame \|\| obs != observable_)
	return;

	frames_.emplace_back(uniqueFramePtr(av_frame_clone(pluginFrame), frameFree));
	cv_.notify_all();
	// audio returns as is
	}

	void
	TranscriptAudioSubscriber::attached(jami::Observable<AVFrame> observable)
	{
	std::unique_lock<std::mutex> l(inputLock);
	Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Attached ! {} for {}", fmt::ptr(this), fmt::ptr(observable)));
	observable_ = observable;
	start();
	}

	void
	TranscriptAudioSubscriber::detached(jami::Observable<AVFrame> observable)
	{
	firstRun = true;
	observable_ = nullptr;
	stop();
	Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Detached ! {} for {}", fmt::ptr(this), fmt::ptr(observable)));
	}

	/**
	 * Asks the current observable, if any, to release this subscriber.
	 * Does nothing when no observable is recorded.
	 */
	void
	TranscriptAudioSubscriber::detach()
	{
	    if (!observable_)
	        return;

	    firstRun = true;
	    Plog::log(Plog::LogPriority::INFO, TAG, "::Calling detach()");
	    observable_->detach(this);
	}
	} // namespace jami