blob: 51f45e559c59f6fc2421ec279563db03b193ae5f [file] [log] [blame]
/**
 * Copyright (C) 2022 Savoir-faire Linux Inc.
 *
 * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
 * USA.
 */
21
22#pragma once
23
#include <algorithm>
#include <array>
#include <cstdint>
#include <functional>
#include <map>
#include <mutex>
#include <set>
#include <string>
#include <vector>

#include <onnxruntime_cxx_api.h>
#ifdef __ANDROID__
#include <nnapi_provider_factory.h>
#endif

#include "Preprocess.h"
38
39namespace jami {
40
// Use the script getonnxio.py to retrieve the names and shapes of the
// model inputs and outputs.
// Note: None denotes an open (dynamic) shape. For an input, it is defined by
// the data we want to use as input. For an open output, it is recommended
// not to preallocate the tensor and to use the value returned by model.run.
46
// Names of the encoder graph's single input and single output.
// The array bounds are deduced from the initializers so that the array length
// always equals the number of names; a larger fixed bound would pad the arrays
// with null pointers that must never be handed to the runtime as tensor names.
static const char* encoderInputNames[] = {"mel"};
static const char* encoderOutputNames[] = {"617"};
49
// Size of the last dimension of the encoder output (audio features) tensor,
// whose shape is [1, 1500, MODELFEATURESHAPE].
#define MODELFEATURESHAPE 384
// NOTE(review): presumably a dimension of the decoder "kv_cache" tensor; it is
// not used in this header, so confirm against the decoder model.
#define MODELKVCACHESHAPE 8

// Size of the last dimension of the logits tensor, whose shape is
// [1, 1, MODELLOGITSHAPE].
#define MODELLOGITSHAPE 51865 // 51864 for english models
Aline Gondim Santos329f8622022-11-08 08:04:22 -030054
// Decoder graph: names of its four inputs and two outputs, in the order listed.
static const std::vector<const char*> decoderInputNames {
    "audio_features",
    "tokens",
    "kv_cache",
    "offset",
};
static const std::vector<const char*> decoderOutputNames {
    "logits",
    "output_kv_cache",
};

// Log-softmax graph: name of its single input and names of its two outputs.
static const std::vector<const char*> logSoftMaxInputNames {
    "logits",
};
static const std::vector<const char*> logSoftMaxOutputNames {
    "token_ids",
    "probs",
};
60
/**
 * Data describing one sampled token.
 *
 * Every field has a zero default, so a default-constructed instance never
 * exposes indeterminate values when only some of the fields are filled in.
 * The type remains an aggregate, so brace initialization in member order
 * keeps working.
 */
struct whisperTokenData
{
    int64_t id {0};  // token id
    int64_t tid {0}; // forced timestamp token id

    float p {0.0f};     // probability of the token
    float pt {0.0f};    // probability of the timestamp token
    float ptsum {0.0f}; // sum of probabilities of all timestamp tokens

    // token-level timestamp data
    // do not use if you haven't computed token-level timestamps
    int64_t t0 {0}; // start time of the token
    int64_t t1 {0}; // end time of the token

    float vlen {0.0f}; // voice length of the token
};
76
77class ModelProcessor
78{
79public:
80 ModelProcessor(const std::string& path, bool acc);
81 ~ModelProcessor();
82
83 void initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc);
84 void endModels();
85
86 whisperTokenData whisper_sample_best(const float * probs);
87
88 /**
89 * @brief feedInput
Aline Gondim Santosbd032f82022-11-25 15:39:12 -030090 * Takes a input and feeds it to the model storage for predictions
91 * @param input
92 * @param preferenceLanguage
Aline Gondim Santos329f8622022-11-08 08:04:22 -030093 */
Aline Gondim Santosbd032f82022-11-25 15:39:12 -030094 std::string feedInput(std::vector<float>& input, const std::string& preferenceLanguage = "auto");
Aline Gondim Santos329f8622022-11-08 08:04:22 -030095
96 bool isAllocated() { return isAllocated_; }
97
Aline Gondim Santos329f8622022-11-08 08:04:22 -030098private:
Aline Gondim Santos329f8622022-11-08 08:04:22 -030099 // Tokens
100 whisperVocab vocab_;
101
102 whisperTokenData getToken(std::vector<float>& logits);
Aline Gondim Santosbd032f82022-11-25 15:39:12 -0300103 void filterLogits(std::vector<float>& logits, int offset);
104 void filterLanguageLogits(std::vector<float>& logits);
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300105
106 // onnx related
107 Ort::MemoryInfo allocatorInfo_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
108 bool isAllocated_ {false};
109 Ort::Env env_ {ORT_LOGGING_LEVEL_WARNING, "whisperTest"};
110 Ort::Session* encoderSession_ {};
111 Ort::Session* decoderSession_ {};
112 Ort::Session* logSoftMaxSession_ {};
113 Ort::SessionOptions sessOpt_;
114
115 // Encoder tensors. 1 input and 1 output
116 std::vector<int64_t> melInputShape_ {1, 80, 3000}; // Input Data Type: 1 (float), Input Shape: [1, 80, 3000]
117 Ort::Value audioFeaturesTensor_ {nullptr};
Aline Gondim Santosbd032f82022-11-25 15:39:12 -0300118 std::vector<int64_t> audioFeaturesShape_ {1, 1500, MODELFEATURESHAPE}; // Output Data Type: 1 (float), Output Shape: [1, 1500, MODELFEATURESHAPE]
119 std::array<float, 1500 * MODELFEATURESHAPE> audioFeatures_ {};
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300120
121 std::vector<float> output_;
122
123 // Decoder tensors. 4 inputs and 2 outputs
124 std::vector<int64_t> tokensOutput_ { };
125
126 // LogProb check
Aline Gondim Santosbd032f82022-11-25 15:39:12 -0300127 std::array<int64_t, 3> logitsShape_ {1, 1, MODELLOGITSHAPE};
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300128
Aline Gondim Santos676d1c32022-12-05 21:13:25 -0300129 int sampleLen = 100;
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300130
Aline Gondim Santosbd032f82022-11-25 15:39:12 -0300131 std::mutex mtx_;
132
Aline Gondim Santos329f8622022-11-08 08:04:22 -0300133};
134} // namespace jami