| /** |
| * Copyright (C) 2022 Savoir-faire Linux Inc. |
| * |
| * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 3 of the License, or |
| * (at your option) any later version. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 |
| * USA. |
| */ |
| |
| #pragma once |
| |
#include <algorithm>
#include <array>
#include <cstdint>
#include <map>
#include <mutex>
#include <set>
#include <string>
#include <vector>
| |
| #include <onnxruntime_cxx_api.h> |
| #ifdef __ANDROID__ |
| #include <nnapi_provider_factory.h> |
| #endif |
| |
| #include <functional> |
| |
| #include "Preprocess.h" |
| |
| namespace jami { |
| |
// Use the getonnxio.py script to grab the models' input and output
// names and shapes.
// Note: "None" denotes an open (dynamic) shape. For an input, it is
// determined by the data we feed to the model. For an open output, it is
// recommended not to pre-allocate the tensor but to use the value
// returned by model.run instead.
| |
// Encoder I/O tensor names: one input (the "mel" spectrogram) and one
// output (the audio features, exported by ONNX under node name "617").
// Size is deduced from the initializer list; a fixed size larger than the
// initializer count would silently leave trailing null pointers.
static const char* encoderInputNames[] = {"mel"};
static const char* encoderOutputNames[] = {"617"};
| |
// Model dimensions, taken from the exported ONNX graphs (see getonnxio.py).
// Typed constants instead of macros: they obey namespace scoping, are
// visible to the debugger, and participate in type checking.
constexpr int64_t MODELFEATURESHAPE = 384;   // encoder feature (d_model) size
constexpr int64_t MODELKVCACHESHAPE = 8;     // kv-cache leading dimension

constexpr int64_t MODELLOGITSHAPE = 51865;   // vocabulary size; 51864 for English-only models
| |
// Decoder I/O tensor names: 4 inputs, 2 outputs.
static const std::vector<const char*> decoderInputNames {"audio_features", "tokens", "kv_cache", "offset"};
static const std::vector<const char*> decoderOutputNames {"logits", "output_kv_cache"};

// LogSoftMax (token sampling) I/O tensor names: 1 input, 2 outputs.
static const std::vector<const char*> logSoftMaxInputNames {"logits"};
static const std::vector<const char*> logSoftMaxOutputNames {"token_ids", "probs"};
| |
/**
 * Per-token sampling result produced by the whisper decoding loop.
 * Plain `struct` declaration: the C-style `typedef struct { ... } name;`
 * is redundant in C++.
 */
struct whisperTokenData {
    int64_t id;  ///< token id
    int64_t tid; ///< forced timestamp token id

    float p;     ///< probability of the token
    float pt;    ///< probability of the timestamp token
    float ptsum; ///< sum of probabilities of all timestamp tokens

    // Token-level timestamp data.
    // Do not use if you haven't computed token-level timestamps.
    int64_t t0;  ///< start time of the token
    int64_t t1;  ///< end time of the token

    float vlen;  ///< voice length of the token
};
| |
| class ModelProcessor |
| { |
| public: |
| ModelProcessor(const std::string& path, bool acc); |
| ~ModelProcessor(); |
| |
| void initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc); |
| void endModels(); |
| |
| whisperTokenData whisper_sample_best(const float * probs); |
| |
| /** |
| * @brief feedInput |
| * Takes a input and feeds it to the model storage for predictions |
| * @param input |
| * @param preferenceLanguage |
| */ |
| std::string feedInput(std::vector<float>& input, const std::string& preferenceLanguage = "auto"); |
| |
| bool isAllocated() { return isAllocated_; } |
| |
| private: |
| // Tokens |
| whisperVocab vocab_; |
| |
| whisperTokenData getToken(std::vector<float>& logits); |
| void filterLogits(std::vector<float>& logits, int offset); |
| void filterLanguageLogits(std::vector<float>& logits); |
| |
| // onnx related |
| Ort::MemoryInfo allocatorInfo_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); |
| bool isAllocated_ {false}; |
| Ort::Env env_ {ORT_LOGGING_LEVEL_WARNING, "whisperTest"}; |
| Ort::Session* encoderSession_ {}; |
| Ort::Session* decoderSession_ {}; |
| Ort::Session* logSoftMaxSession_ {}; |
| Ort::SessionOptions sessOpt_; |
| |
| // Encoder tensors. 1 input and 1 output |
| std::vector<int64_t> melInputShape_ {1, 80, 3000}; // Input Data Type: 1 (float), Input Shape: [1, 80, 3000] |
| Ort::Value audioFeaturesTensor_ {nullptr}; |
| std::vector<int64_t> audioFeaturesShape_ {1, 1500, MODELFEATURESHAPE}; // Output Data Type: 1 (float), Output Shape: [1, 1500, MODELFEATURESHAPE] |
| std::array<float, 1500 * MODELFEATURESHAPE> audioFeatures_ {}; |
| |
| std::vector<float> output_; |
| |
| // Decoder tensors. 4 inputs and 2 outputs |
| std::vector<int64_t> tokensOutput_ { }; |
| |
| // LogProb check |
| std::array<int64_t, 3> logitsShape_ {1, 1, MODELLOGITSHAPE}; |
| |
| int sampleLen = 50; |
| |
| std::mutex mtx_; |
| |
| }; |
| } // namespace jami |