Blame - WhisperTranscript/ModelProcessor.h - jami-plugins

blob: 51f45e559c59f6fc2421ec279563db03b193ae5f [file] [log] [blame]

Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	1	/**
				2	* Copyright (C) 2022 Savoir-faire Linux Inc.
				3	*
				4	* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
				5	*
				6	* This program is free software; you can redistribute it and/or modify
				7	* it under the terms of the GNU General Public License as published by
				8	* the Free Software Foundation; either version 3 of the License, or
				9	* (at your option) any later version.
				10	*
				11	* This program is distributed in the hope that it will be useful,
				12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				14	* GNU General Public License for more details.
				15	*
				16	* You should have received a copy of the GNU General Public License
				17	* along with this program; if not, write to the Free Software
				18	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
				19	* USA.
				20	*/
				21
				22	#pragma once
				23
				24	#include <map>
				25	#include <vector>
				26	#include <algorithm>
Aline Gondim Santos	bd032f8	2022-11-25 15:39:12 -0300	[diff] [blame]	27	#include <set>
				28	#include <mutex>
Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	29
				30	#include <onnxruntime_cxx_api.h>
				31	#ifdef __ANDROID__
				32	#include <nnapi_provider_factory.h>
				33	#endif
				34
				35	#include <functional>
				36
				37	#include "Preprocess.h"
				38
				39	namespace jami {
				40
// Model input/output names and shapes. Use the script getonnxio.py to
// extract them from the ONNX files.
// Note: "None" denotes an open (dynamic) dimension. For an input, it is
// fixed by the data that is fed in. For an open output, it is recommended
// not to pre-allocate the tensor and to use the value returned by the
// model run instead.

// Encoder I/O names. Both arrays have four slots but only the first one
// carries a name; the remaining slots are null pointers.
static const char* encoderInputNames[4] = {"mel", nullptr, nullptr, nullptr};
static const char* encoderOutputNames[4] = {"617", nullptr, nullptr, nullptr};

// Model-dependent dimensions.
#define MODELFEATURESHAPE 384
#define MODELKVCACHESHAPE 8

#define MODELLOGITSHAPE 51865 // 51864 for english models

// Decoder I/O names: 4 inputs, 2 outputs.
static const std::vector<const char*> decoderInputNames = {
    "audio_features",
    "tokens",
    "kv_cache",
    "offset",
};
static const std::vector<const char*> decoderOutputNames = {
    "logits",
    "output_kv_cache",
};

// Log-softmax model I/O names: 1 input, 2 outputs.
static const std::vector<const char*> logSoftMaxInputNames = {
    "logits",
};
static const std::vector<const char*> logSoftMaxOutputNames = {
    "token_ids",
    "probs",
};
				60
				61	typedef struct whisperTokenData {
				62	int64_t id; // token id
				63	int64_t tid; // forced timestamp token id
				64
				65	float p; // probability of the token
				66	float pt; // probability of the timestamp token
				67	float ptsum; // sum of probabilities of all timestamp tokens
				68
				69	// token-level timestamp data
				70	// do not use if you haven't computed token-level timestamps
				71	int64_t t0; // start time of the token
				72	int64_t t1; // end time of the token
				73
				74	float vlen; // voice length of the token
				75	} whisperTokenData;
				76
				77	class ModelProcessor
				78	{
				79	public:
				80	ModelProcessor(const std::string& path, bool acc);
				81	~ModelProcessor();
				82
				83	void initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc);
				84	void endModels();
				85
				86	whisperTokenData whisper_sample_best(const float * probs);
				87
				88	/**
				89	* @brief feedInput
Aline Gondim Santos	bd032f8	2022-11-25 15:39:12 -0300	[diff] [blame]	90	* Takes a input and feeds it to the model storage for predictions
				91	* @param input
				92	* @param preferenceLanguage
Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	93	*/
Aline Gondim Santos	bd032f8	2022-11-25 15:39:12 -0300	[diff] [blame]	94	std::string feedInput(std::vector<float>& input, const std::string& preferenceLanguage = "auto");
Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	95
				96	bool isAllocated() { return isAllocated_; }
				97
Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	98	private:
Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	99	// Tokens
				100	whisperVocab vocab_;
				101
				102	whisperTokenData getToken(std::vector<float>& logits);
Aline Gondim Santos	bd032f8	2022-11-25 15:39:12 -0300	[diff] [blame]	103	void filterLogits(std::vector<float>& logits, int offset);
				104	void filterLanguageLogits(std::vector<float>& logits);
Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	105
				106	// onnx related
				107	Ort::MemoryInfo allocatorInfo_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
				108	bool isAllocated_ {false};
				109	Ort::Env env_ {ORT_LOGGING_LEVEL_WARNING, "whisperTest"};
				110	Ort::Session* encoderSession_ {};
				111	Ort::Session* decoderSession_ {};
				112	Ort::Session* logSoftMaxSession_ {};
				113	Ort::SessionOptions sessOpt_;
				114
				115	// Encoder tensors. 1 input and 1 output
				116	std::vector<int64_t> melInputShape_ {1, 80, 3000}; // Input Data Type: 1 (float), Input Shape: [1, 80, 3000]
				117	Ort::Value audioFeaturesTensor_ {nullptr};
Aline Gondim Santos	bd032f8	2022-11-25 15:39:12 -0300	[diff] [blame]	118	std::vector<int64_t> audioFeaturesShape_ {1, 1500, MODELFEATURESHAPE}; // Output Data Type: 1 (float), Output Shape: [1, 1500, MODELFEATURESHAPE]
				119	std::array<float, 1500 * MODELFEATURESHAPE> audioFeatures_ {};
Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	120
				121	std::vector<float> output_;
				122
				123	// Decoder tensors. 4 inputs and 2 outputs
				124	std::vector<int64_t> tokensOutput_ { };
				125
				126	// LogProb check
Aline Gondim Santos	bd032f8	2022-11-25 15:39:12 -0300	[diff] [blame]	127	std::array<int64_t, 3> logitsShape_ {1, 1, MODELLOGITSHAPE};
Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	128
Aline Gondim Santos	676d1c3	2022-12-05 21:13:25 -0300	[diff] [blame^]	129	int sampleLen = 100;
Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	130
Aline Gondim Santos	bd032f8	2022-11-25 15:39:12 -0300	[diff] [blame]	131	std::mutex mtx_;
				132
Aline Gondim Santos	329f862	2022-11-08 08:04:22 -0300	[diff] [blame]	133	};
				134	} // namespace jami