blob: 5138321a12210605d14d971e3a26d29fa44f2adb [file] [log] [blame]
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#pragma once
#include <vector>
#include <cstdint>
#include <string>
#include <map>
#include <set>
// These values are fixed by the Whisper model.
// See page 3 of the paper (https://cdn.openai.com/papers/whisper.pdf).
// Audio sample rate -- presumably in Hz; confirm against the paper.
#define WHISPER_SAMPLE_RATE 16000
// FFT window length -- presumably in samples (400 / 16000 = 25 ms); confirm.
#define WHISPER_N_FFT 400
// Number of Mel bands (going by the name; confirm against the paper).
#define WHISPER_N_MEL 80
// Step between consecutive frames -- presumably in samples (160 / 16000 = 10 ms); confirm.
#define WHISPER_HOP_LENGTH 160
// Chunk length -- presumably in seconds; confirm.
#define WHISPER_CHUNK_SIZE 30
// Numerically equal to WHISPER_CHUNK_SIZE * WHISPER_SAMPLE_RATE / WHISPER_HOP_LENGTH
// (30 * 16000 / 160 = 3000) -- presumably the number of frames per chunk fed to
// the encoder; confirm against the implementation.
#define ENCODER_INPUT_LEN 3000
/**
 * Container for a Mel spectrogram.
 *
 * The two counters are zero-initialised so that a default-constructed object
 * never carries indeterminate values. The struct remains an aggregate, so
 * brace initialisation such as `whisperMel{n_len, n_mel, data}` keeps working.
 *
 * NOTE(review): the layout of `data` is not defined in this header --
 * presumably n_mel * n_len values; confirm against logMelSpectrogram().
 */
struct whisperMel {
    int n_len = 0;           // number of frames (going by the name)
    int n_mel = 0;           // number of Mel bands (going by the name)
    std::vector<float> data; // spectrogram values; empty by default
};
/**
 * Container for the Mel filter bank filled in by loadMelFilters().
 *
 * The two dimensions are zero-initialised so that a default-constructed object
 * never carries indeterminate values. The struct remains an aggregate, so
 * brace initialisation keeps working.
 *
 * NOTE(review): the layout of `data` is not defined in this header --
 * presumably an n_mel x n_fft matrix; confirm against the loader.
 */
struct whisperFilters {
    int32_t n_mel = 0;       // number of Mel bands (going by the name)
    int32_t n_fft = 0;       // number of FFT bins per filter (going by the name)
    std::vector<float> data; // filter coefficients; empty by default
};
/**
 * Vocabulary tables and special-token ids for the Whisper tokenizer.
 *
 * The defaults below are the values a freshly constructed object starts with;
 * the non-constant ones may be overwritten by the code that loads a vocabulary.
 *
 * The two task tokens are class-level constants rather than per-object `const`
 * members: a non-static `const` data member implicitly deletes copy and move
 * assignment for the whole struct. They remain reachable through an instance
 * (`vocab.token_translate`) as well as through the type
 * (`whisperVocab::token_translate`).
 */
struct whisperVocab {
    size_t n_vocab = 51864;

    // Bidirectional token <-> id lookup tables.
    std::map<std::string, int32_t> token_to_id;
    std::map<int32_t, std::string> id_to_token;

    // Special tokens. The expansions of eot/sot/prev are not stated in this
    // header -- presumably end-of-text, start-of-transcript and previous-context;
    // confirm against the tokenizer.
    int32_t token_eot = 50256;
    int32_t token_sot = 50257;
    int32_t token_prev = 50360;
    int32_t token_solm = 50361; // no speech
    int32_t token_not = 50362; // no timestamps
    int32_t token_beg = 50363; // timestamp begin

    // available tasks
    static constexpr int32_t token_translate = 50358;
    static constexpr int32_t token_transcribe = 50359;

    /** @return true when the vocabulary size is 51865 (one more than the default). */
    bool is_multilingual() const {
        return n_vocab == 51865;
    }

    // Language token <-> id lookup tables.
    std::map<std::string, int32_t> languageTokens2Id;
    std::map<int32_t, std::string> languageId2Tokens;

    // Token ids collected as "no speech" tokens (going by the name; the code
    // that fills and reads this set is not in this header).
    std::set<int32_t> noSpeechTokens;
};
/**
 * Computes a log-Mel spectrogram (going by the name) from raw audio samples.
 * Only the declaration lives in this header; the notes below are derived from
 * the signature and should be confirmed against the definition.
 *
 * @param samples   read-only pointer to the input audio samples
 * @param n_samples presumably the number of elements behind `samples`
 * @param n_threads presumably the number of worker threads to use
 * @param filters   Mel filter bank, passed by const reference
 * @param mel       output spectrogram, passed by non-const reference
 * @return a bool -- presumably true on success; confirm against the definition
 */
bool logMelSpectrogram(
const float * samples,
const int n_samples,
const int n_threads,
const whisperFilters & filters,
whisperMel &mel);
/**
 * Fast Fourier transform (going by the name) of `in`, with the result written to `out`.
 * NOTE(review): the expected input length and the layout of `out` (e.g. interleaved
 * real/imaginary parts) are not visible in this header -- confirm against the definition.
 */
void fft(const std::vector<float> & in, std::vector<float> & out);
/**
 * Discrete Fourier transform (going by the name) of `in`, with the result written to `out`.
 * NOTE(review): the layout of `out` is not visible in this header -- confirm against
 * the definition.
 */
void dft(const std::vector<float> & in, std::vector<float> & out);
/**
 * Loads the Mel filter bank (going by the name) from `fileName` into `filters`.
 * Returns nothing, so failures are not reported through the return value.
 * NOTE(review): the file format and the error behaviour are not visible in this header.
 */
void loadMelFilters(const std::string& fileName, whisperFilters& filters);
/**
 * Loads tokenizer data (going by the name) from `fileName` into `vocab`.
 * Returns nothing, so failures are not reported through the return value.
 * NOTE(review): which fields of `vocab` are filled, and the file format, are not
 * visible in this header.
 */
void loadTokens(const std::string& fileName, whisperVocab& vocab);
/**
 * Adjusts `mel` in place (it is taken by non-const reference).
 * NOTE(review): going by the name, this presumably pads or trims the spectrogram
 * to a fixed length, possibly ENCODER_INPUT_LEN frames -- confirm against the definition.
 */
void inputPadTrim(whisperMel &mel);