| /** |
| * Copyright (C) 2022 Savoir-faire Linux Inc. |
| * |
| * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 3 of the License, or |
| * (at your option) any later version. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| */ |
| |
| #include "TranscriptAudioSubscriber.h" |
| |
| #include <pluglog.h> |
| #include <frameUtils.h> |
| #include <bitset> |
| #include <iostream> |
| |
// Log tag used with Plog for every message in this translation unit.
const std::string TAG = "Transcript";
// Platform path separator; separator() is presumably provided by the plugin
// SDK headers included above (pluglog/frameUtils) — confirm its origin.
const char sep = separator();
| |
| namespace jami { |
| |
/**
 * Builds the subscriber and starts the background transcription thread.
 *
 * @param dataPath        Plugin data directory; mel filterbank and models are loaded from here.
 * @param videoSubscriber Sink that receives the transcribed text for on-video rendering.
 * @param acc             Hardware-acceleration flag, forwarded to the model processor.
 */
TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath, TranscriptVideoSubscriber* videoSubscriber, bool acc)
    : path_ {dataPath}
    , modelProcessor_ {dataPath, acc}
    , mVS_ {videoSubscriber}
{
    // Load the precomputed mel filterbank consumed by logMelSpectrogram() below.
    loadMelFilters(path_ + "/assets/mel_filters.bin", modelFilters_);

    /**
     * Waits for audio samples and then process them
     **/
    processFrameThread = std::thread([this] {
        while (running) {
            std::unique_lock<std::mutex> l(inputLock);
            // Block until update() has accumulated a full chunk (newFrame)
            // or stop() requests shutdown.
            inputCv.wait(l, [this] { return not running or newFrame; });
            if (not running) {
                break;
            }

            // Convert the buffered PCM samples to a log-mel spectrogram
            // (8 worker threads), then pad/trim to the model's expected size.
            logMelSpectrogram(currentModelInput_.data(), currentModelInput_.size(), 8, modelFilters_, melSpectrogram_);
            inputPadTrim(melSpectrogram_);
            newFrame = false;

            // The spectrogram now owns a copy of the data; the input buffer
            // can be reused by the producer.
            currentModelInput_.clear();
#ifndef __DEBUG__
            /** Unlock the mutex, this way we let the other thread
             * copy new data while we are processing the old one
             **/
            l.unlock();
#endif
            // Inference runs outside the lock (release builds); the resulting
            // text is forwarded to the video subscriber for subtitle rendering.
            modelProcessor_.feedInput(melSpectrogram_.data);
            auto text = modelProcessor_.getText();
            mVS_->setText(text);
        }
    });
}
| |
/**
 * Tears down the subscriber: releases model resources, drops the filter
 * graph, then stops and joins the worker thread started in the constructor.
 *
 * NOTE(review): endModels()/clean() are invoked while processFrameThread may
 * still be inside feedInput()/getText(); confirm those calls are safe to run
 * concurrently with inference, or consider calling stop()+join() first.
 */
TranscriptAudioSubscriber::~TranscriptAudioSubscriber()
{
    modelProcessor_.endModels();
    formatFilter_.clean();
    stop();
    processFrameThread.join();
    Plog::log(Plog::LogPriority::INFO, TAG, "~TranscriptMediaProcessor");
}
| |
/**
 * Requests the processing thread to exit: clear the run flag so the
 * condition-variable predicate (`not running or newFrame`) becomes true,
 * then wake any waiter. The caller is responsible for joining the thread.
 *
 * NOTE(review): if `running` is not std::atomic, writing it here without
 * holding inputLock is a data race with the wait predicate — confirm its
 * declaration in the header.
 */
void
TranscriptAudioSubscriber::stop()
{
    running = false;
    inputCv.notify_all();
}
| |
| void |
| TranscriptAudioSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& pluginFrame) |
| { |
| if (!pluginFrame || modelFilters_.data.empty()) |
| return; |
| |
| if (firstRun) { |
| modelProcessor_.getText(); |
| count_ = 0; |
| pastModelInput_.clear(); |
| currentModelInput_.clear(); |
| futureModelInput_.clear(); |
| formatFilter_.clean(); |
| AudioFormat afmt = AudioFormat(pluginFrame->sample_rate, |
| pluginFrame->channels, |
| static_cast<AVSampleFormat>(pluginFrame->format)); |
| MediaStream ms = MediaStream("input", afmt); |
| formatFilter_.initialize(filterDescription_, {ms}); |
| firstRun = false; |
| } |
| |
| if (!formatFilter_.initialized_) |
| return; |
| |
| if (formatFilter_.feedInput(pluginFrame, "input") == 0) { |
| uniqueFramePtr filteredFrame = {formatFilter_.readOutput(), frameFree}; |
| if (filteredFrame) { |
| for (size_t i = 0; i < filteredFrame->buf[0]->size; i += 2) { |
| std::lock_guard<std::mutex> l(inputLock); |
| int16_t rawValue = (filteredFrame->buf[0]->data[i+1] << 8) | filteredFrame->buf[0]->data[i]; |
| |
| // If not a positive value, perform the 2's complement math on the value |
| if ((rawValue & 0x8000) != 0) { |
| rawValue = (~(rawValue - 0x0001)) * -1; |
| } |
| futureModelInput_.emplace_back(float(rawValue)/32768.0f); |
| if (count_++ > WHISPER_STREAM_SAMPLES_CHUNK_STEP) |
| overlapInput_.emplace_back(float(rawValue)/32768.0f); |
| count_++; |
| |
| // Trigger transcription when we have enough samples |
| if (futureModelInput_.size() == WHISPER_STREAM_SAMPLES_CHUNK && !newFrame) { |
| pastModelInput_.clear(); |
| std::swap(pastModelInput_, currentModelInput_); |
| std::swap(currentModelInput_, futureModelInput_); |
| std::swap(futureModelInput_, overlapInput_); |
| count_ = 0; |
| overlapInput_.clear(); |
| newFrame = true; |
| inputCv.notify_all(); |
| } |
| } |
| } |
| } |
| // audio returns as is |
| } |
| |
| void |
| TranscriptAudioSubscriber::attached(jami::Observable<AVFrame*>* observable) |
| { |
| Plog::log(Plog::LogPriority::INFO, TAG, "::Attached ! "); |
| observable_ = observable; |
| } |
| |
| void |
| TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>*) |
| { |
| modelProcessor_.getText(); |
| firstRun = true; |
| observable_ = nullptr; |
| Plog::log(Plog::LogPriority::INFO, TAG, "::Detached()"); |
| } |
| |
| void |
| TranscriptAudioSubscriber::detach() |
| { |
| if (observable_) { |
| firstRun = true; |
| std::ostringstream oss; |
| Plog::log(Plog::LogPriority::INFO, TAG, "::Calling detach()"); |
| observable_->detach(this); |
| } |
| } |
| } // namespace jami |