// blob: 51f45e559c59f6fc2421ec279563db03b193ae5f [file] [log] [blame]
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
* USA.
*/
#pragma once
#include <map>
#include <vector>
#include <algorithm>
#include <set>
#include <mutex>
#include <onnxruntime_cxx_api.h>
#ifdef __ANDROID__
#include <nnapi_provider_factory.h>
#endif
#include <functional>
#include "Preprocess.h"
namespace jami {
// Use script getonnxio.py to grab model input and output
// names and shapes.
// Note: "None" is an open (dynamic) shape. If it appears on an input, the
// shape is defined by the data fed in. For an open output, it is
// recommended not to pre-allocate the tensor and to use the value
// returned by Session::Run instead.

// Encoder I/O names: exactly one input and one output, so let the
// initializer size the arrays (they were over-declared as [4], which
// left three dangling null entries after the real name).
static const char* encoderInputNames[] = {"mel"};
static const char* encoderOutputNames[] = {"617"};

// Model dimensions (typed constants instead of macros; same names, so all
// existing non-preprocessor uses keep compiling).
static constexpr int MODELFEATURESHAPE = 384;   // encoder feature width
static constexpr int MODELKVCACHESHAPE = 8;     // decoder kv-cache dimension
static constexpr int MODELLOGITSHAPE = 51865;   // vocabulary size (51864 for English-only models)

static const std::vector<const char*> decoderInputNames = {"audio_features", "tokens", "kv_cache", "offset"};
static const std::vector<const char*> decoderOutputNames = {"logits", "output_kv_cache"};
static const std::vector<const char*> logSoftMaxInputNames = {"logits"};
static const std::vector<const char*> logSoftMaxOutputNames = {"token_ids", "probs"};
/**
 * Per-token result produced while sampling the decoder output.
 *
 * All members carry default initializers so a freshly declared instance
 * never holds indeterminate values (the original plain POD left every
 * field uninitialized). Still an aggregate, so brace-init keeps working.
 * (The C-style `typedef struct` wrapper was dropped; in C++ the struct
 * name is directly usable, so callers are unaffected.)
 */
struct whisperTokenData
{
    int64_t id {0};    // token id
    int64_t tid {0};   // forced timestamp token id

    float p {0.f};     // probability of the token
    float pt {0.f};    // probability of the timestamp token
    float ptsum {0.f}; // sum of probabilities of all timestamp tokens

    // Token-level timestamp data.
    // Do not use if token-level timestamps have not been computed.
    int64_t t0 {0};    // start time of the token
    int64_t t1 {0};    // end time of the token

    float vlen {0.f};  // voice length of the token
};
/**
 * Runs Whisper speech-to-text inference with ONNX Runtime, holding one
 * session per model stage: encoder, decoder and log-softmax (see the
 * session members below). Implementation lives in the matching .cpp.
 */
class ModelProcessor
{
public:
    ModelProcessor(const std::string& path, bool acc);
    ~ModelProcessor();

    /**
     * @brief Create the ONNX sessions for the three model stages.
     * @param encoderModelPath    path to the encoder model file
     * @param decoderModelPath    path to the decoder model file
     * @param logSoftMaxModelPath path to the log-softmax model file
     * @param activateAcc         enable hardware acceleration
     *                            (presumably NNAPI on Android, per the include above)
     */
    void initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc);

    /// Tear down the sessions created by initModels().
    void endModels();

    /// Select the best token from a probability buffer.
    /// NOTE(review): assumes `probs` points at MODELLOGITSHAPE entries — confirm in .cpp.
    whisperTokenData whisper_sample_best(const float * probs);

    /**
     * @brief feedInput
     * Takes a input and feeds it to the model storage for predictions
     * @param input mel spectrogram data; the encoder expects shape
     *              [1, 80, 3000] (see melInputShape_)
     * @param preferenceLanguage language hint; "auto" presumably triggers
     *                           language detection — confirm against .cpp
     * @return the transcription result
     */
    std::string feedInput(std::vector<float>& input, const std::string& preferenceLanguage = "auto");

    /// Whether the sessions have been allocated. Pure accessor, so it is
    /// const-qualified (was non-const, preventing use on const instances).
    bool isAllocated() const { return isAllocated_; }

private:
    // Tokens
    whisperVocab vocab_;
    whisperTokenData getToken(std::vector<float>& logits);     // sample next token from logits (impl in .cpp)
    void filterLogits(std::vector<float>& logits, int offset); // presumably masks disallowed tokens at a decode offset
    void filterLanguageLogits(std::vector<float>& logits);     // presumably restricts logits to language tokens

    // onnx related
    Ort::MemoryInfo allocatorInfo_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    bool isAllocated_ {false};
    Ort::Env env_ {ORT_LOGGING_LEVEL_WARNING, "whisperTest"};
    // NOTE(review): raw owning session pointers, presumably new'd in
    // initModels() and deleted in endModels(); std::unique_ptr<Ort::Session>
    // would be safer but requires coordinated changes in the .cpp.
    Ort::Session* encoderSession_ {};
    Ort::Session* decoderSession_ {};
    Ort::Session* logSoftMaxSession_ {};
    Ort::SessionOptions sessOpt_;

    // Encoder tensors. 1 input and 1 output
    std::vector<int64_t> melInputShape_ {1, 80, 3000}; // Input Data Type: 1 (float), Input Shape: [1, 80, 3000]
    Ort::Value audioFeaturesTensor_ {nullptr};
    std::vector<int64_t> audioFeaturesShape_ {1, 1500, MODELFEATURESHAPE}; // Output Data Type: 1 (float), Output Shape: [1, 1500, MODELFEATURESHAPE]
    std::array<float, 1500 * MODELFEATURESHAPE> audioFeatures_ {};
    std::vector<float> output_;

    // Decoder tensors. 4 inputs and 2 outputs
    std::vector<int64_t> tokensOutput_ { };

    // LogProb check
    std::array<int64_t, 3> logitsShape_ {1, 1, MODELLOGITSHAPE};
    int sampleLen = 100; // decode-step budget, presumably — confirm usage in .cpp
    std::mutex mtx_;     // NOTE(review): likely guards inference state across threads — confirm locking in .cpp
};
} // namespace jami