blob: 70a29aaf6904e3476c0bc62aee70e9b10ae94238 [file] [log] [blame]
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "Preprocess.h"
#ifdef WIN32
#define _USE_MATH_DEFINES
#endif
#include <thread>
#include <math.h>
#include <fstream>
#include <iostream>
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
/**
 * Compute the log-mel spectrogram of a PCM buffer, split across n_threads.
 *
 * @param samples   raw audio samples (mono float PCM)
 * @param n_samples number of samples in `samples`
 * @param n_threads number of worker threads to spread the frames over
 * @param filters   precomputed mel filterbank (n_mel x n_fft coefficients)
 * @param mel       output spectrogram; n_mel/n_len/data are (re)set here
 * @return always true (no failure path in this implementation)
 */
bool logMelSpectrogram(
    const float *samples,
    const int n_samples,
    const int n_threads,
    const whisperFilters &filters,
    whisperMel &mel) {
    // const int sample_rate = WHISPER_SAMPLE_RATE;
    const int fft_size = WHISPER_N_FFT;
    const int fft_step = WHISPER_HOP_LENGTH;
    const int n_mel = WHISPER_N_MEL;

    // Hanning window (periodic form: denominator is fft_size, not fft_size-1).
    std::vector<float> hann;
    hann.resize(fft_size);
    for (int i = 0; i < fft_size; i++) {
        hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
    }

    mel.n_mel = n_mel;
    mel.n_len = (n_samples)/fft_step;
    mel.data.resize(mel.n_mel*mel.n_len);

    // Number of distinct frequency bins kept from a real-input FFT.
    const int n_fft = 1 + fft_size/2;

    std::vector<std::thread> workers(n_threads);
    for (int iw = 0; iw < n_threads; ++iw) {
        workers[iw] = std::thread([&](int ith) {
            // Per-thread scratch buffers; fft_out holds interleaved re/im pairs.
            std::vector<float> fft_in;
            fft_in.resize(fft_size);
            for (int i = 0; i < fft_size; i++) {
                fft_in[i] = 0.0;
            }
            std::vector<float> fft_out;
            fft_out.resize(2*fft_size);
            // Frames are strided across threads: thread `ith` handles frames
            // ith, ith+n_threads, ... Each thread writes a disjoint set of
            // columns of mel.data, so no locking is needed.
            for (int i = ith; i < mel.n_len; i += n_threads) {
                const int offset = i*fft_step;
                // apply Hanning window (zero-pad past the end of the signal)
                for (int j = 0; j < fft_size; j++) {
                    if (offset + j < n_samples) {
                        fft_in[j] = hann[j]*samples[offset + j];
                    } else {
                        fft_in[j] = 0.0;
                    }
                }
                // FFT -> mag^2: collapse each complex bin to its power.
                fft(fft_in, fft_out);
                for (int j = 0; j < fft_size; j++) {
                    fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]);
                }
                // Real input => conjugate-symmetric spectrum; fold the upper
                // half of the bins onto the lower half.
                for (int j = 1; j < fft_size/2; j++) {
                    fft_out[j] += fft_out[fft_size - j];
                }
                // mel spectrogram: project the power spectrum through the
                // mel filterbank, then take log10 with a floor.
                for (int j = 0; j < mel.n_mel; j++) {
                    double sum = 0.0;
                    for (int k = 0; k < n_fft; k++) {
                        sum += fft_out[k]*filters.data[j*n_fft + k];
                    }
                    if (sum < 1e-10) {
                        sum = 1e-10; // avoid log10(0)
                    }
                    sum = log10(sum);
                    mel.data[j*mel.n_len + i] = sum;
                }
            }
        }, iw);
    }
    for (int iw = 0; iw < n_threads; ++iw) {
        workers[iw].join();
    }

    // clamping and normalization: clamp to (max - 8), then map into the
    // range the encoder expects via (x + 4) / 4 (matches the Whisper recipe).
    double mmax = -1e20;
    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
        if (mel.data[i] > mmax) {
            mmax = mel.data[i];
        }
    }
    mmax -= 8.0;
    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
        if (mel.data[i] < mmax) {
            mel.data[i] = mmax;
        }
        mel.data[i] = (mel.data[i] + 4.0)/4.0;
    }
    return true;
}
/**
 * Radix-2 Cooley-Tukey FFT of a real-valued signal (simple recursive form).
 *
 * @param in  N real samples
 * @param out 2*N floats: interleaved {re, im} pairs for each of the N bins
 *
 * Odd-length inputs fall back to the naive DFT.
 */
void fft(const std::vector<float> & in, std::vector<float> & out) {
    out.resize(in.size()*2);
    const int n = in.size();

    // A single sample is its own spectrum.
    if (n == 1) {
        out[0] = in[0];
        out[1] = 0;
        return;
    }

    // Cannot split an odd-length signal in half: use the O(N^2) DFT instead.
    if (n % 2 != 0) {
        dft(in, out);
        return;
    }

    // De-interleave into even- and odd-indexed subsequences.
    std::vector<float> evens;
    std::vector<float> odds;
    evens.reserve(n / 2);
    odds.reserve(n / 2);
    for (int i = 0; i + 1 < n; i += 2) {
        evens.emplace_back(in[i]);
        odds.emplace_back(in[i + 1]);
    }

    // Transform each half recursively.
    std::vector<float> evens_fft;
    std::vector<float> odds_fft;
    fft(evens, evens_fft);
    fft(odds, odds_fft);

    // Butterfly step: combine the two half-spectra with the twiddle factors
    // w = e^{-2*pi*i*k/N} into bins k and k + N/2.
    const int half = n / 2;
    for (int k = 0; k < half; k++) {
        const float theta = 2*M_PI*k/n;
        const float w_re = cos(theta);
        const float w_im = -sin(theta);
        const float o_re = odds_fft[2*k + 0];
        const float o_im = odds_fft[2*k + 1];
        out[2*k + 0] = evens_fft[2*k + 0] + w_re*o_re - w_im*o_im;
        out[2*k + 1] = evens_fft[2*k + 1] + w_re*o_im + w_im*o_re;
        out[2*(k + half) + 0] = evens_fft[2*k + 0] - w_re*o_re + w_im*o_im;
        out[2*(k + half) + 1] = evens_fft[2*k + 1] - w_re*o_im - w_im*o_re;
    }
}
/**
 * Naive O(N^2) discrete Fourier transform of a real-valued signal.
 *
 * @param in  N real samples
 * @param out 2*N floats: interleaved {re, im} pairs for each of the N bins
 */
void dft(const std::vector<float> & in, std::vector<float> & out) {
    const int n = in.size();
    out.resize(n*2);
    // Bin k is the correlation of the input with e^{-2*pi*i*k*t/N}.
    for (int k = 0; k < n; k++) {
        float acc_re = 0;
        float acc_im = 0;
        for (int t = 0; t < n; t++) {
            const float angle = 2*M_PI*k*t/n;
            acc_re += in[t]*cos(angle);
            acc_im -= in[t]*sin(angle);
        }
        out[k*2 + 0] = acc_re;
        out[k*2 + 1] = acc_im;
    }
}
/**
 * Load precomputed mel filterbank coefficients from a binary file.
 *
 * File layout: int32 n_mel, int32 n_fft, then n_mel*n_fft float coefficients.
 *
 * @param fileName path to the binary filter file
 * @param filters  destination; on failure it is left partially filled and an
 *                 error is logged to stderr (matches the original best-effort
 *                 contract, which also returned silently after logging).
 */
void loadMelFilters(const std::string& fileName, whisperFilters& filters) {
    auto fin = std::ifstream(fileName, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str());
        return;
    }
    fin.read((char *) &filters.n_mel, sizeof(filters.n_mel));
    fin.read((char *) &filters.n_fft, sizeof(filters.n_fft));
    // Guard against a truncated or corrupt header: without this check a short
    // read leaves n_mel/n_fft as garbage and the resize below can attempt a
    // huge (or negative) allocation.
    if (!fin || filters.n_mel <= 0 || filters.n_fft <= 0) {
        fprintf(stderr, "%s: invalid filter header in '%s'\n", __func__, fileName.c_str());
        return;
    }
    filters.data.resize(filters.n_mel * filters.n_fft);
    fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float));
    if (!fin) {
        // Fewer coefficients than the header promised.
        fprintf(stderr, "%s: truncated filter data in '%s'\n", __func__, fileName.c_str());
    }
}
/**
 * Load the tokenizer vocabulary from a binary token file.
 *
 * File layout: int32 model vocab size, int32 stored token count, then one
 * record per token (uint32 byte length + raw bytes). If the model vocab is
 * larger than the stored set, the file continues with language tokens
 * (int32 id + uint32 length + bytes) and the remaining ids are synthesized;
 * a trailing section lists "no speech" token ids.
 *
 * @param fileName path to the binary token file
 * @param vocab    destination vocabulary; on open failure an error is logged
 *                 and vocab is left untouched
 */
void loadTokens(const std::string& fileName, whisperVocab& vocab) {
    auto fin = std::ifstream(fileName, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str());
        return;
    }
    // Header: total vocab size the model expects, then the number of token
    // strings actually stored in this file.
    int32_t modelNVocab = 0;
    fin.read((char *) &modelNVocab, sizeof(modelNVocab));
    int32_t tokensNVocab = 0;
    fin.read((char *) &tokensNVocab, sizeof(tokensNVocab));
    // Base tokens: id is implicit (the record index).
    std::string word;
    for (int i = 0; i < tokensNVocab; i++) {
        uint32_t len;
        fin.read((char *) &len, sizeof(len));
        word.resize(len);
        fin.read((char *) word.data(), len);
        vocab.token_to_id[word] = i;
        vocab.id_to_token[i] = word;
    }
    vocab.n_vocab = modelNVocab;
    // Multilingual models shift all special-token ids up by one
    // (presumably to make room for extra language tokens - TODO confirm
    // against the model export script).
    if (vocab.is_multilingual()) {
        vocab.token_eot++;
        vocab.token_sot++;
        vocab.token_prev++;
        vocab.token_solm++;
        vocab.token_not++;
        vocab.token_beg++;
    }
    if (tokensNVocab < modelNVocab) {
        // The model knows more ids than the file stores as plain tokens.
        // Read language tokens
        {
            int32_t languageTokensLen = 0;
            fin.read((char *) &languageTokensLen, sizeof(languageTokensLen));
            std::string word;
            for (int i = 0; i < languageTokensLen; i++) {
                // Language records carry an explicit id, unlike base tokens.
                int32_t id = 0;
                fin.read((char *) &id, sizeof(id));
                uint32_t len;
                fin.read((char *) &len, sizeof(len));
                word.resize(len);
                fin.read((char *) word.data(), len);
                vocab.token_to_id[word] = id;
                vocab.id_to_token[id] = word;
                vocab.languageId2Tokens.insert({id, word});
                vocab.languageTokens2Id.insert({word, id});
            }
        }
        fprintf(stderr, "%s: adding %d extra tokens\n", __func__, modelNVocab - tokensNVocab);
        // Synthesize placeholder names for the remaining special and
        // timestamp ids so every id in [0, modelNVocab) resolves to a string.
        for (int i = tokensNVocab; i < modelNVocab; i++) {
            if (!vocab.id_to_token[i].empty())
                continue; // already filled by the language-token section above
            if (i > vocab.token_beg) {
                // Timestamp tokens, numbered relative to token_beg.
                word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
            } else if (i == vocab.token_eot) {
                word = "[_EOT_]";
            } else if (i == vocab.token_sot) {
                word = "[_SOT_]";
            } else if (i == vocab.token_prev) {
                word = "[_PREV_]";
            } else if (i == vocab.token_not) {
                word = "[_NOT_]";
            } else if (i == vocab.token_beg) {
                word = "[_BEG_]";
            } else {
                word = "[_extra_token_" + std::to_string(i) + "]";
            }
            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
        }
    }
    // Read no speech tokens
    {
        int32_t noSpeechTokensLen = 0;
        fin.read((char *) &noSpeechTokensLen, sizeof(noSpeechTokensLen));
        for (int i = 0; i < noSpeechTokensLen; i++) {
            uint32_t id;
            fin.read((char *) &id, sizeof(id));
            vocab.noSpeechTokens.insert(id);
        }
    }
}
/**
 * Pad or trim every mel row so the encoder sees exactly ENCODER_INPUT_LEN
 * frames per mel bin. Rows shorter than the target are zero-padded on the
 * right; longer rows are truncated.
 *
 * Fix: the original inner loop ran `for (i = j*n_len; i < (j+1)*dataLimit)`.
 * In the trim case (n_len > ENCODER_INPUT_LEN) that bound shrinks by
 * (n_len - dataLimit) per row, so each successive mel bin copied fewer
 * frames (eventually zero) and the output buffer was mangled and undersized.
 * Each row must copy exactly dataLimit frames starting at its own offset.
 *
 * @param mel spectrogram to reshape in place; mel.data becomes
 *            n_mel * ENCODER_INPUT_LEN floats, row-major.
 *            NOTE(review): mel.n_len is left at its pre-call value, as
 *            before — callers indexing by n_len afterwards should be audited.
 */
void
inputPadTrim(whisperMel &mel)
{
    // Already the exact encoder length: nothing to do.
    if (mel.n_len == ENCODER_INPUT_LEN)
        return;
    std::vector<float> data;
    data.reserve(mel.n_mel * ENCODER_INPUT_LEN);
    // Frames kept from each source row.
    const auto dataLimit = std::min(mel.n_len, ENCODER_INPUT_LEN);
    for (auto j = 0; j < mel.n_mel; j++) {
        // Start of row j in the flat row-major source buffer.
        const auto rowStart = j * mel.n_len;
        // Copy the kept portion of the row (bug fix: bound is rowStart +
        // dataLimit, independent of j).
        for (auto i = rowStart; i < rowStart + dataLimit; i++) {
            data.emplace_back(mel.data[i]);
        }
        // Zero-pad short rows up to the encoder length (no-op when trimming,
        // since then dataLimit == ENCODER_INPUT_LEN).
        for (auto i = dataLimit; i < ENCODER_INPUT_LEN; i++) {
            data.emplace_back(0.0f);
        }
    }
    std::swap(mel.data, data);
}