| /** |
| * Copyright (C) 2022 Savoir-faire Linux Inc. |
| * |
| * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 3 of the License, or |
| * (at your option) any later version. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| */ |
| |
| #pragma once |
| |
| #include <vector> |
| #include <cstdint> |
| #include <string> |
| #include <map> |
| #include <set> |
| |
| |
// These values are defined by the model.
// See page 3 of the Whisper paper (https://cdn.openai.com/papers/whisper.pdf).
#define WHISPER_SAMPLE_RATE 16000 // audio sample rate, in Hz
#define WHISPER_N_FFT 400         // analysis window, in samples (25 ms at 16 kHz)
#define WHISPER_N_MEL 80          // number of Mel channels
#define WHISPER_HOP_LENGTH 160    // stride between windows, in samples (10 ms at 16 kHz)
#define WHISPER_CHUNK_SIZE 30     // length of one audio chunk, in seconds
// Number of spectrogram frames in one chunk (chunk samples / hop length).
// Derived from the constants above so it cannot drift from them; it
// evaluates to 3000. Parenthesised so it is safe inside larger expressions.
#define ENCODER_INPUT_LEN (WHISPER_CHUNK_SIZE * WHISPER_SAMPLE_RATE / WHISPER_HOP_LENGTH)
| |
/**
 * Mel spectrogram container (it is the output parameter of
 * logMelSpectrogram()).
 *
 * The size fields are zero-initialised so that a default-constructed
 * instance is a well-defined empty value instead of carrying indeterminate
 * integers.
 */
struct whisperMel {
    int n_len = 0; // presumably the number of frames held in `data` -- confirm against the implementation
    int n_mel = 0; // presumably the number of Mel channels per frame -- confirm

    std::vector<float> data; // spectrogram values; element layout is decided by the implementation (not visible here)
};
| |
/**
 * Mel filter bank container (filled by loadMelFilters() and consumed by
 * logMelSpectrogram()).
 *
 * The size fields are zero-initialised so that a default-constructed
 * instance is a well-defined empty value instead of carrying indeterminate
 * integers.
 */
struct whisperFilters {
    int32_t n_mel = 0; // presumably the number of Mel filters stored in `data` -- confirm
    int32_t n_fft = 0; // presumably the number of frequency bins per filter -- confirm

    std::vector<float> data; // filter coefficients; element layout is decided by the loader (not visible here)
};
| |
/**
 * Vocabulary tables and special token ids.
 *
 * Every scalar member has a default, so a default-constructed instance is
 * fully defined. The task tokens are ordinary (non-const) members: a `const`
 * non-static data member would delete the implicit copy/move assignment
 * operators and make the whole struct non-assignable. Treat them as
 * read-only by convention.
 */
struct whisperVocab {
    size_t n_vocab = 51864; // vocabulary size; 51865 is treated as multilingual, see is_multilingual()

    // Token text <-> token id lookup tables (empty by default).
    std::map<std::string, int32_t> token_to_id;
    std::map<int32_t, std::string> id_to_token;

    // Special token ids.
    int32_t token_eot = 50256;
    int32_t token_sot = 50257;
    int32_t token_prev = 50360;
    int32_t token_solm = 50361; // no speech
    int32_t token_not = 50362;  // no timestamps
    int32_t token_beg = 50363;  // timestamp begin

    // available tasks
    int32_t token_translate = 50358;
    int32_t token_transcribe = 50359;

    /// @return true when the vocabulary size equals 51865.
    bool is_multilingual() const {
        return n_vocab == 51865;
    }

    // Language token text <-> id lookup tables and the set of no-speech
    // token ids (all empty by default).
    std::map<std::string, int32_t> languageTokens2Id;
    std::map<int32_t, std::string> languageId2Tokens;
    std::set<int32_t> noSpeechTokens;
};
| |
| bool logMelSpectrogram( |
| const float * samples, |
| const int n_samples, |
| const int n_threads, |
| const whisperFilters & filters, |
| whisperMel &mel); |
| |
/**
 * Declaration only; the definition is not part of this header.
 * Presumably a fast Fourier transform that reads `in` and writes the result
 * to `out`; size constraints on `in` and the layout of `out` are not visible
 * here -- confirm against the implementation.
 *
 * @param in  input values, taken by const reference
 * @param out vector taken by mutable reference (presumably the result)
 */
void fft(const std::vector<float>& in,
         std::vector<float>& out);
| |
/**
 * Declaration only; the definition is not part of this header.
 * Presumably a direct (naive) discrete Fourier transform that reads `in` and
 * writes the result to `out`; the layout of `out` is not visible here --
 * confirm against the implementation.
 *
 * @param in  input values, taken by const reference
 * @param out vector taken by mutable reference (presumably the result)
 */
void dft(const std::vector<float>& in,
         std::vector<float>& out);
| |
| void loadMelFilters(const std::string& fileName, whisperFilters& filters); |
| |
| void loadTokens(const std::string& fileName, whisperVocab& vocab); |
| |
| void inputPadTrim(whisperMel &mel); |