| /** |
| * Copyright (C) 2022 Savoir-faire Linux Inc. |
| * |
| * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 3 of the License, or |
| * (at your option) any later version. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| */ |
| |
| #include "Preprocess.h" |
| |
| #ifdef WIN32 |
| #define _USE_MATH_DEFINES |
| #endif |
| |
| #include <thread> |
| #include <math.h> |
| #include <fstream> |
| #include <iostream> |
| |
| // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124 |
// Converts raw PCM samples into a log-mel spectrogram (Whisper front end).
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
//
// samples   - pointer to n_samples mono float samples
//             (presumably sampled at WHISPER_SAMPLE_RATE — confirm with caller)
// n_samples - number of input samples
// n_threads - number of worker threads; frames are distributed round-robin
// filters   - precomputed mel filterbank (see loadMelFilters)
// mel       - output; n_mel x n_len values stored row-major as
//             mel.data[j*n_len + i] for mel bin j, frame i
//
// Always returns true.
bool logMelSpectrogram(
    const float *samples,
    const int n_samples,
    const int n_threads,
    const whisperFilters &filters,
    whisperMel &mel) {

    // const int sample_rate = WHISPER_SAMPLE_RATE;
    const int fft_size = WHISPER_N_FFT;
    const int fft_step = WHISPER_HOP_LENGTH;
    const int n_mel = WHISPER_N_MEL;

    // Hanning window
    std::vector<float> hann;
    hann.resize(fft_size);
    for (int i = 0; i < fft_size; i++) {
        hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
    }

    // one frame per hop; samples past the end of the input are
    // zero-padded inside the worker loop below
    mel.n_mel = n_mel;
    mel.n_len = (n_samples)/fft_step;
    mel.data.resize(mel.n_mel*mel.n_len);

    // number of useful FFT bins for a real-valued input (DC .. Nyquist)
    const int n_fft = 1 + fft_size/2;

    // worker ith handles frames ith, ith+n_threads, ... so no two threads
    // ever touch the same mel.data element — no locking needed
    std::vector<std::thread> workers(n_threads);
    for (int iw = 0; iw < n_threads; ++iw) {
        workers[iw] = std::thread([&](int ith) {
            std::vector<float> fft_in;
            fft_in.resize(fft_size);
            for (int i = 0; i < fft_size; i++) {
                fft_in[i] = 0.0;
            }

            // interleaved (re, im) pairs, hence 2*fft_size floats
            std::vector<float> fft_out;
            fft_out.resize(2*fft_size);

            for (int i = ith; i < mel.n_len; i += n_threads) {
                const int offset = i*fft_step;

                // apply Hanning window (zero-pad past the end of the input)
                for (int j = 0; j < fft_size; j++) {
                    if (offset + j < n_samples) {
                        fft_in[j] = hann[j]*samples[offset + j];
                    } else {
                        fft_in[j] = 0.0;
                    }
                }

                // FFT -> mag^2: squash the complex output into a power
                // spectrum, reusing the front of fft_out in place
                fft(fft_in, fft_out);

                for (int j = 0; j < fft_size; j++) {
                    fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]);
                }
                // fold the mirrored upper half of the spectrum onto the
                // lower half (real input => bins j and N-j carry the
                // same energy)
                for (int j = 1; j < fft_size/2; j++) {
                    fft_out[j] += fft_out[fft_size - j];
                }

                // mel spectrogram: project the power spectrum through the
                // filterbank, floor at 1e-10, then take log10
                for (int j = 0; j < mel.n_mel; j++) {
                    double sum = 0.0;

                    for (int k = 0; k < n_fft; k++) {
                        sum += fft_out[k]*filters.data[j*n_fft + k];
                    }
                    if (sum < 1e-10) {
                        sum = 1e-10;
                    }

                    sum = log10(sum);

                    mel.data[j*mel.n_len + i] = sum;
                }
            }
        }, iw);
    }

    for (int iw = 0; iw < n_threads; ++iw) {
        workers[iw].join();
    }

    // clamping and normalization: clamp to (global max - 8) in log10
    // space, then map into roughly [0, 1] via (x + 4)/4 — this matches
    // the OpenAI reference implementation
    double mmax = -1e20;
    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
        if (mel.data[i] > mmax) {
            mmax = mel.data[i];
        }
    }

    mmax -= 8.0;

    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
        if (mel.data[i] < mmax) {
            mel.data[i] = mmax;
        }

        mel.data[i] = (mel.data[i] + 4.0)/4.0;
    }

    return true;
}
| |
| // Cooley-Tukey FFT |
| // poor man's implementation - use something better |
| // input is real-valued |
| // output is complex-valued |
| void fft(const std::vector<float> & in, std::vector<float> & out) { |
| out.resize(in.size()*2); |
| |
| int N = in.size(); |
| |
| if (N == 1) { |
| out[0] = in[0]; |
| out[1] = 0; |
| return; |
| } |
| |
| if (N%2 == 1) { |
| dft(in, out); |
| return; |
| } |
| |
| std::vector<float> even; |
| std::vector<float> odd; |
| |
| for (int i = 0; i < N; i++) { |
| if (i % 2 == 0) { |
| even.emplace_back(in[i]); |
| } else { |
| odd.emplace_back(in[i]); |
| } |
| } |
| |
| std::vector<float> even_fft; |
| std::vector<float> odd_fft; |
| |
| fft(even, even_fft); |
| fft(odd, odd_fft); |
| |
| for (int k = 0; k < N/2; k++) { |
| float theta = 2*M_PI*k/N; |
| |
| float re = cos(theta); |
| float im = -sin(theta); |
| |
| float re_odd = odd_fft[2*k + 0]; |
| float im_odd = odd_fft[2*k + 1]; |
| |
| out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd; |
| out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd; |
| |
| out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; |
| out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; |
| } |
| } |
| |
// naive Discrete Fourier Transform, O(N^2)
// input is real-valued
// output is interleaved complex (re, im) pairs: out[2k], out[2k+1]
void dft(const std::vector<float> & in, std::vector<float> & out) {
    const int N = in.size();

    out.resize(N*2);

    for (int k = 0; k < N; k++) {
        float re = 0;
        float im = 0;

        // hoisted loop invariant: the per-sample angle step for bin k
        // (was recomputed as 2*M_PI*k*n/N on every inner iteration)
        const double theta = 2.0*M_PI*k/N;

        for (int n = 0; n < N; n++) {
            const float angle = theta*n;
            re += in[n]*cos(angle);
            im -= in[n]*sin(angle);
        }

        out[k*2 + 0] = re;
        out[k*2 + 1] = im;
    }
}
| |
| |
| void loadMelFilters(const std::string& fileName, whisperFilters& filters) { |
| auto fin = std::ifstream(fileName, std::ios::binary); |
| if (!fin) { |
| fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str()); |
| return; |
| } |
| |
| fin.read((char *) &filters.n_mel, sizeof(filters.n_mel)); |
| fin.read((char *) &filters.n_fft, sizeof(filters.n_fft)); |
| |
| filters.data.resize(filters.n_mel * filters.n_fft); |
| fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float)); |
| } |
| |
// Loads the Whisper vocabulary from a binary token file.
//
// File layout (as read here — byte order is whatever the exporter wrote,
// presumably little-endian; TODO confirm against the export script):
//   int32  modelNVocab   vocabulary size the model expects
//   int32  tokensNVocab  number of token strings actually in the file
//   tokensNVocab x { uint32 len; char text[len]; }        regular tokens
//   if tokensNVocab < modelNVocab:
//     int32 count; count x { int32 id; uint32 len; char text[len]; }
//                                                          language tokens
//   int32 count; count x { uint32 id; }                    no-speech tokens
//
// Ids in [tokensNVocab, modelNVocab) not covered above are filled with
// synthetic names like "[_TT_n]" so every id maps to some string.
void loadTokens(const std::string& fileName, whisperVocab& vocab) {
    auto fin = std::ifstream(fileName, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str());
        return;
    }

    int32_t modelNVocab = 0;
    fin.read((char *) &modelNVocab, sizeof(modelNVocab));

    int32_t tokensNVocab = 0;
    fin.read((char *) &tokensNVocab, sizeof(tokensNVocab));

    // regular tokens: sequential ids 0..tokensNVocab-1
    std::string word;
    for (int i = 0; i < tokensNVocab; i++) {
        uint32_t len;
        fin.read((char *) &len, sizeof(len));

        word.resize(len);
        fin.read((char *) word.data(), len);

        vocab.token_to_id[word] = i;
        vocab.id_to_token[i] = word;
    }

    vocab.n_vocab = modelNVocab;
    // multilingual models insert one extra token, which shifts every
    // special-token id up by one (same convention as whisper.cpp)
    if (vocab.is_multilingual()) {
        vocab.token_eot++;
        vocab.token_sot++;
        vocab.token_prev++;
        vocab.token_solm++;
        vocab.token_not++;
        vocab.token_beg++;
    }

    if (tokensNVocab < modelNVocab) {
        // Read language tokens
        {
            int32_t languageTokensLen = 0;
            fin.read((char *) &languageTokensLen, sizeof(languageTokensLen));

            // NOTE: this inner `word` shadows the outer one on purpose
            std::string word;
            for (int i = 0; i < languageTokensLen; i++) {
                int32_t id = 0;
                fin.read((char *) &id, sizeof(id));
                uint32_t len;
                fin.read((char *) &len, sizeof(len));

                word.resize(len);
                fin.read((char *) word.data(), len);

                vocab.token_to_id[word] = id;
                vocab.id_to_token[id] = word;
                vocab.languageId2Tokens.insert({id, word});
                vocab.languageTokens2Id.insert({word, id});
            }
        }

        // synthesize names for the remaining special/timestamp ids that
        // were not provided by the file
        fprintf(stderr, "%s: adding %d extra tokens\n", __func__, modelNVocab - tokensNVocab);
        for (int i = tokensNVocab; i < modelNVocab; i++) {
            // skip ids already filled in by the language-token section
            if (!vocab.id_to_token[i].empty())
                continue;
            if (i > vocab.token_beg) {
                // timestamp tokens, numbered relative to token_beg
                word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
            } else if (i == vocab.token_eot) {
                word = "[_EOT_]";
            } else if (i == vocab.token_sot) {
                word = "[_SOT_]";
            } else if (i == vocab.token_prev) {
                word = "[_PREV_]";
            } else if (i == vocab.token_not) {
                word = "[_NOT_]";
            } else if (i == vocab.token_beg) {
                word = "[_BEG_]";
            } else {
                word = "[_extra_token_" + std::to_string(i) + "]";
            }
            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
        }
    }

    // Read no speech tokens
    {
        int32_t noSpeechTokensLen = 0;
        fin.read((char *) &noSpeechTokensLen, sizeof(noSpeechTokensLen));

        for (int i = 0; i < noSpeechTokensLen; i++) {
            uint32_t id;
            fin.read((char *) &id, sizeof(id));

            vocab.noSpeechTokens.insert(id);
        }
    }
}
| |
| void |
| inputPadTrim(whisperMel &mel) |
| { |
| if (mel.n_len == ENCODER_INPUT_LEN) |
| return; |
| std::vector<float> data; |
| std::vector<float> partialData; |
| int seek = 0; |
| auto dataLimit = std::min(mel.n_len, ENCODER_INPUT_LEN); |
| for (auto j = 0; j < mel.n_mel; j++) { |
| seek = j * mel.n_len; |
| for (auto i = seek; i < (j + 1) * dataLimit; i++) { |
| partialData.emplace_back(mel.data[i]); |
| } |
| if (mel.n_len < ENCODER_INPUT_LEN) { |
| for (auto i = mel.n_len; i < ENCODER_INPUT_LEN; i++) { |
| partialData.emplace_back(0.0f); |
| } |
| } |
| data.insert(data.end(), partialData.begin(), partialData.end()); |
| partialData.clear(); |
| } |
| std::swap(mel.data, data); |
| } |