// blob: 51f45e559c59f6fc2421ec279563db03b193ae5f [file] [log] [blame]
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
* USA.
*/
#pragma once
#include <map>
#include <vector>
#include <algorithm>
#include <set>
#include <mutex>
#include <onnxruntime_cxx_api.h>
#ifdef __ANDROID__
#include <nnapi_provider_factory.h>
#endif
#include <functional>
#include "Preprocess.h"
namespace jami {
// Use script getonnxio.py to grab model input and output
// names and shapes.
// Note: "None" is an open (dynamic) shape. If it appears on an input, the
// shape is defined by the data fed in. For an open output, it is
// recommended not to pre-allocate the tensor and to use the value
// returned by Session::Run instead.

// Encoder I/O names: exactly one input and one output, so let the
// initializer size the arrays (they were over-declared as [4], which
// left three dangling null entries after the real name).
static const char* encoderInputNames[] = {"mel"};
static const char* encoderOutputNames[] = {"617"};

// Model dimensions (typed constants instead of macros; same names, so all
// existing non-preprocessor uses keep compiling).
static constexpr int MODELFEATURESHAPE = 384;   // encoder feature width
static constexpr int MODELKVCACHESHAPE = 8;     // decoder kv-cache dimension
static constexpr int MODELLOGITSHAPE = 51865;   // vocabulary size (51864 for English-only models)

static const std::vector<const char*> decoderInputNames = {"audio_features", "tokens", "kv_cache", "offset"};
static const std::vector<const char*> decoderOutputNames = {"logits", "output_kv_cache"};
static const std::vector<const char*> logSoftMaxInputNames = {"logits"};
static const std::vector<const char*> logSoftMaxOutputNames = {"token_ids", "probs"};
/**
 * Per-token result produced while sampling the decoder output.
 *
 * All members carry default initializers so a freshly declared instance
 * never holds indeterminate values (the original plain POD left every
 * field uninitialized). Still an aggregate, so brace-init keeps working.
 * (The C-style `typedef struct` wrapper was dropped; in C++ the struct
 * name is directly usable, so callers are unaffected.)
 */
struct whisperTokenData
{
    int64_t id {0};    // token id
    int64_t tid {0};   // forced timestamp token id

    float p {0.f};     // probability of the token
    float pt {0.f};    // probability of the timestamp token
    float ptsum {0.f}; // sum of probabilities of all timestamp tokens

    // Token-level timestamp data.
    // Do not use if token-level timestamps have not been computed.
    int64_t t0 {0};    // start time of the token
    int64_t t1 {0};    // end time of the token

    float vlen {0.f};  // voice length of the token
};
/**
 * Runs Whisper speech-to-text inference with ONNX Runtime, holding one
 * session per model stage: encoder, decoder and log-softmax (see the
 * session members below). Implementation lives in the matching .cpp.
 */
class ModelProcessor
{
public:
    ModelProcessor(const std::string& path, bool acc);
    ~ModelProcessor();

    /**
     * @brief Create the ONNX sessions for the three model stages.
     * @param encoderModelPath    path to the encoder model file
     * @param decoderModelPath    path to the decoder model file
     * @param logSoftMaxModelPath path to the log-softmax model file
     * @param activateAcc         enable hardware acceleration
     *                            (presumably NNAPI on Android, per the include above)
     */
    void initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc);

    /// Tear down the sessions created by initModels().
    void endModels();

    /// Select the best token from a probability buffer.
    /// NOTE(review): assumes `probs` points at MODELLOGITSHAPE entries — confirm in .cpp.
    whisperTokenData whisper_sample_best(const float * probs);

    /**
     * @brief feedInput
     * Takes a input and feeds it to the model storage for predictions
     * @param input mel spectrogram data; the encoder expects shape
     *              [1, 80, 3000] (see melInputShape_)
     * @param preferenceLanguage language hint; "auto" presumably triggers
     *                           language detection — confirm against .cpp
     * @return the transcription result
     */
    std::string feedInput(std::vector<float>& input, const std::string& preferenceLanguage = "auto");

    /// Whether the sessions have been allocated. Pure accessor, so it is
    /// const-qualified (was non-const, preventing use on const instances).
    bool isAllocated() const { return isAllocated_; }

private:
    // Tokens
    whisperVocab vocab_;
    whisperTokenData getToken(std::vector<float>& logits);     // sample next token from logits (impl in .cpp)
    void filterLogits(std::vector<float>& logits, int offset); // presumably masks disallowed tokens at a decode offset
    void filterLanguageLogits(std::vector<float>& logits);     // presumably restricts logits to language tokens

    // onnx related
    Ort::MemoryInfo allocatorInfo_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    bool isAllocated_ {false};
    Ort::Env env_ {ORT_LOGGING_LEVEL_WARNING, "whisperTest"};
    // NOTE(review): raw owning session pointers, presumably new'd in
    // initModels() and deleted in endModels(); std::unique_ptr<Ort::Session>
    // would be safer but requires coordinated changes in the .cpp.
    Ort::Session* encoderSession_ {};
    Ort::Session* decoderSession_ {};
    Ort::Session* logSoftMaxSession_ {};
    Ort::SessionOptions sessOpt_;

    // Encoder tensors. 1 input and 1 output
    std::vector<int64_t> melInputShape_ {1, 80, 3000}; // Input Data Type: 1 (float), Input Shape: [1, 80, 3000]
    Ort::Value audioFeaturesTensor_ {nullptr};
    std::vector<int64_t> audioFeaturesShape_ {1, 1500, MODELFEATURESHAPE}; // Output Data Type: 1 (float), Output Shape: [1, 1500, MODELFEATURESHAPE]
    std::array<float, 1500 * MODELFEATURESHAPE> audioFeatures_ {};
    std::vector<float> output_;

    // Decoder tensors. 4 inputs and 2 outputs
    std::vector<int64_t> tokensOutput_ { };

    // LogProb check
    std::array<int64_t, 3> logitsShape_ {1, 1, MODELLOGITSHAPE};
    int sampleLen = 100; // decode-step budget, presumably — confirm usage in .cpp
    std::mutex mtx_;     // NOTE(review): likely guards inference state across threads — confirm locking in .cpp
};
} // namespace jami