blob: 70a29aaf6904e3476c0bc62aee70e9b10ae94238 [file] [log] [blame]
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "Preprocess.h"
#ifdef WIN32
#define _USE_MATH_DEFINES
#endif
#include <thread>
#include <math.h>
#include <fstream>
#include <iostream>
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
/**
 * Compute the log-mel spectrogram of a PCM buffer, split across n_threads.
 *
 * @param samples   raw audio samples (mono float PCM)
 * @param n_samples number of samples in `samples`
 * @param n_threads number of worker threads to spread the frames over
 * @param filters   precomputed mel filterbank (n_mel x n_fft coefficients)
 * @param mel       output spectrogram; n_mel/n_len/data are (re)set here
 * @return always true (no failure path in this implementation)
 */
bool logMelSpectrogram(
    const float *samples,
    const int n_samples,
    const int n_threads,
    const whisperFilters &filters,
    whisperMel &mel) {
    // const int sample_rate = WHISPER_SAMPLE_RATE;
    const int fft_size = WHISPER_N_FFT;
    const int fft_step = WHISPER_HOP_LENGTH;
    const int n_mel = WHISPER_N_MEL;

    // Hanning window (periodic form: denominator is fft_size, not fft_size-1).
    std::vector<float> hann;
    hann.resize(fft_size);
    for (int i = 0; i < fft_size; i++) {
        hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
    }

    mel.n_mel = n_mel;
    mel.n_len = (n_samples)/fft_step;
    mel.data.resize(mel.n_mel*mel.n_len);

    // Number of distinct frequency bins kept from a real-input FFT.
    const int n_fft = 1 + fft_size/2;

    std::vector<std::thread> workers(n_threads);
    for (int iw = 0; iw < n_threads; ++iw) {
        workers[iw] = std::thread([&](int ith) {
            // Per-thread scratch buffers; fft_out holds interleaved re/im pairs.
            std::vector<float> fft_in;
            fft_in.resize(fft_size);
            for (int i = 0; i < fft_size; i++) {
                fft_in[i] = 0.0;
            }
            std::vector<float> fft_out;
            fft_out.resize(2*fft_size);
            // Frames are strided across threads: thread `ith` handles frames
            // ith, ith+n_threads, ... Each thread writes a disjoint set of
            // columns of mel.data, so no locking is needed.
            for (int i = ith; i < mel.n_len; i += n_threads) {
                const int offset = i*fft_step;
                // apply Hanning window (zero-pad past the end of the signal)
                for (int j = 0; j < fft_size; j++) {
                    if (offset + j < n_samples) {
                        fft_in[j] = hann[j]*samples[offset + j];
                    } else {
                        fft_in[j] = 0.0;
                    }
                }
                // FFT -> mag^2: collapse each complex bin to its power.
                fft(fft_in, fft_out);
                for (int j = 0; j < fft_size; j++) {
                    fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]);
                }
                // Real input => conjugate-symmetric spectrum; fold the upper
                // half of the bins onto the lower half.
                for (int j = 1; j < fft_size/2; j++) {
                    fft_out[j] += fft_out[fft_size - j];
                }
                // mel spectrogram: project the power spectrum through the
                // mel filterbank, then take log10 with a floor.
                for (int j = 0; j < mel.n_mel; j++) {
                    double sum = 0.0;
                    for (int k = 0; k < n_fft; k++) {
                        sum += fft_out[k]*filters.data[j*n_fft + k];
                    }
                    if (sum < 1e-10) {
                        sum = 1e-10; // avoid log10(0)
                    }
                    sum = log10(sum);
                    mel.data[j*mel.n_len + i] = sum;
                }
            }
        }, iw);
    }
    for (int iw = 0; iw < n_threads; ++iw) {
        workers[iw].join();
    }

    // clamping and normalization: clamp to (max - 8), then map into the
    // range the encoder expects via (x + 4) / 4 (matches the Whisper recipe).
    double mmax = -1e20;
    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
        if (mel.data[i] > mmax) {
            mmax = mel.data[i];
        }
    }
    mmax -= 8.0;
    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
        if (mel.data[i] < mmax) {
            mel.data[i] = mmax;
        }
        mel.data[i] = (mel.data[i] + 4.0)/4.0;
    }
    return true;
}
/**
 * Radix-2 Cooley-Tukey FFT of a real-valued signal (simple recursive form).
 *
 * @param in  N real samples
 * @param out 2*N floats: interleaved {re, im} pairs for each of the N bins
 *
 * Odd-length inputs fall back to the naive DFT.
 */
void fft(const std::vector<float> & in, std::vector<float> & out) {
    out.resize(in.size()*2);
    const int n = in.size();

    // A single sample is its own spectrum.
    if (n == 1) {
        out[0] = in[0];
        out[1] = 0;
        return;
    }

    // Cannot split an odd-length signal in half: use the O(N^2) DFT instead.
    if (n % 2 != 0) {
        dft(in, out);
        return;
    }

    // De-interleave into even- and odd-indexed subsequences.
    std::vector<float> evens;
    std::vector<float> odds;
    evens.reserve(n / 2);
    odds.reserve(n / 2);
    for (int i = 0; i + 1 < n; i += 2) {
        evens.emplace_back(in[i]);
        odds.emplace_back(in[i + 1]);
    }

    // Transform each half recursively.
    std::vector<float> evens_fft;
    std::vector<float> odds_fft;
    fft(evens, evens_fft);
    fft(odds, odds_fft);

    // Butterfly step: combine the two half-spectra with the twiddle factors
    // w = e^{-2*pi*i*k/N} into bins k and k + N/2.
    const int half = n / 2;
    for (int k = 0; k < half; k++) {
        const float theta = 2*M_PI*k/n;
        const float w_re = cos(theta);
        const float w_im = -sin(theta);
        const float o_re = odds_fft[2*k + 0];
        const float o_im = odds_fft[2*k + 1];
        out[2*k + 0] = evens_fft[2*k + 0] + w_re*o_re - w_im*o_im;
        out[2*k + 1] = evens_fft[2*k + 1] + w_re*o_im + w_im*o_re;
        out[2*(k + half) + 0] = evens_fft[2*k + 0] - w_re*o_re + w_im*o_im;
        out[2*(k + half) + 1] = evens_fft[2*k + 1] - w_re*o_im - w_im*o_re;
    }
}
/**
 * Naive O(N^2) discrete Fourier transform of a real-valued signal.
 *
 * @param in  N real samples
 * @param out 2*N floats: interleaved {re, im} pairs for each of the N bins
 */
void dft(const std::vector<float> & in, std::vector<float> & out) {
    const int n = in.size();
    out.resize(n*2);
    // Bin k is the correlation of the input with e^{-2*pi*i*k*t/N}.
    for (int k = 0; k < n; k++) {
        float acc_re = 0;
        float acc_im = 0;
        for (int t = 0; t < n; t++) {
            const float angle = 2*M_PI*k*t/n;
            acc_re += in[t]*cos(angle);
            acc_im -= in[t]*sin(angle);
        }
        out[k*2 + 0] = acc_re;
        out[k*2 + 1] = acc_im;
    }
}
/**
 * Load precomputed mel filterbank coefficients from a binary file.
 *
 * File layout: int32 n_mel, int32 n_fft, then n_mel*n_fft float coefficients.
 *
 * @param fileName path to the binary filter file
 * @param filters  destination; on failure it is left partially filled and an
 *                 error is logged to stderr (matches the original best-effort
 *                 contract, which also returned silently after logging).
 */
void loadMelFilters(const std::string& fileName, whisperFilters& filters) {
    auto fin = std::ifstream(fileName, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str());
        return;
    }
    fin.read((char *) &filters.n_mel, sizeof(filters.n_mel));
    fin.read((char *) &filters.n_fft, sizeof(filters.n_fft));
    // Guard against a truncated or corrupt header: without this check a short
    // read leaves n_mel/n_fft as garbage and the resize below can attempt a
    // huge (or negative) allocation.
    if (!fin || filters.n_mel <= 0 || filters.n_fft <= 0) {
        fprintf(stderr, "%s: invalid filter header in '%s'\n", __func__, fileName.c_str());
        return;
    }
    filters.data.resize(filters.n_mel * filters.n_fft);
    fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float));
    if (!fin) {
        // Fewer coefficients than the header promised.
        fprintf(stderr, "%s: truncated filter data in '%s'\n", __func__, fileName.c_str());
    }
}
/**
 * Load the tokenizer vocabulary from a binary token file.
 *
 * File layout: int32 model vocab size, int32 stored token count, then one
 * record per token (uint32 byte length + raw bytes). If the model vocab is
 * larger than the stored set, the file continues with language tokens
 * (int32 id + uint32 length + bytes) and the remaining ids are synthesized;
 * a trailing section lists "no speech" token ids.
 *
 * @param fileName path to the binary token file
 * @param vocab    destination vocabulary; on open failure an error is logged
 *                 and vocab is left untouched
 */
void loadTokens(const std::string& fileName, whisperVocab& vocab) {
    auto fin = std::ifstream(fileName, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str());
        return;
    }
    // Header: total vocab size the model expects, then the number of token
    // strings actually stored in this file.
    int32_t modelNVocab = 0;
    fin.read((char *) &modelNVocab, sizeof(modelNVocab));
    int32_t tokensNVocab = 0;
    fin.read((char *) &tokensNVocab, sizeof(tokensNVocab));
    // Base tokens: id is implicit (the record index).
    std::string word;
    for (int i = 0; i < tokensNVocab; i++) {
        uint32_t len;
        fin.read((char *) &len, sizeof(len));
        word.resize(len);
        fin.read((char *) word.data(), len);
        vocab.token_to_id[word] = i;
        vocab.id_to_token[i] = word;
    }
    vocab.n_vocab = modelNVocab;
    // Multilingual models shift all special-token ids up by one
    // (presumably to make room for extra language tokens - TODO confirm
    // against the model export script).
    if (vocab.is_multilingual()) {
        vocab.token_eot++;
        vocab.token_sot++;
        vocab.token_prev++;
        vocab.token_solm++;
        vocab.token_not++;
        vocab.token_beg++;
    }
    if (tokensNVocab < modelNVocab) {
        // The model knows more ids than the file stores as plain tokens.
        // Read language tokens
        {
            int32_t languageTokensLen = 0;
            fin.read((char *) &languageTokensLen, sizeof(languageTokensLen));
            std::string word;
            for (int i = 0; i < languageTokensLen; i++) {
                // Language records carry an explicit id, unlike base tokens.
                int32_t id = 0;
                fin.read((char *) &id, sizeof(id));
                uint32_t len;
                fin.read((char *) &len, sizeof(len));
                word.resize(len);
                fin.read((char *) word.data(), len);
                vocab.token_to_id[word] = id;
                vocab.id_to_token[id] = word;
                vocab.languageId2Tokens.insert({id, word});
                vocab.languageTokens2Id.insert({word, id});
            }
        }
        fprintf(stderr, "%s: adding %d extra tokens\n", __func__, modelNVocab - tokensNVocab);
        // Synthesize placeholder names for the remaining special and
        // timestamp ids so every id in [0, modelNVocab) resolves to a string.
        for (int i = tokensNVocab; i < modelNVocab; i++) {
            if (!vocab.id_to_token[i].empty())
                continue; // already filled by the language-token section above
            if (i > vocab.token_beg) {
                // Timestamp tokens, numbered relative to token_beg.
                word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
            } else if (i == vocab.token_eot) {
                word = "[_EOT_]";
            } else if (i == vocab.token_sot) {
                word = "[_SOT_]";
            } else if (i == vocab.token_prev) {
                word = "[_PREV_]";
            } else if (i == vocab.token_not) {
                word = "[_NOT_]";
            } else if (i == vocab.token_beg) {
                word = "[_BEG_]";
            } else {
                word = "[_extra_token_" + std::to_string(i) + "]";
            }
            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
        }
    }
    // Read no speech tokens
    {
        int32_t noSpeechTokensLen = 0;
        fin.read((char *) &noSpeechTokensLen, sizeof(noSpeechTokensLen));
        for (int i = 0; i < noSpeechTokensLen; i++) {
            uint32_t id;
            fin.read((char *) &id, sizeof(id));
            vocab.noSpeechTokens.insert(id);
        }
    }
}
/**
 * Pad or trim every mel row so the encoder sees exactly ENCODER_INPUT_LEN
 * frames per mel bin. Rows shorter than the target are zero-padded on the
 * right; longer rows are truncated.
 *
 * Fix: the original inner loop ran `for (i = j*n_len; i < (j+1)*dataLimit)`.
 * In the trim case (n_len > ENCODER_INPUT_LEN) that bound shrinks by
 * (n_len - dataLimit) per row, so each successive mel bin copied fewer
 * frames (eventually zero) and the output buffer was mangled and undersized.
 * Each row must copy exactly dataLimit frames starting at its own offset.
 *
 * @param mel spectrogram to reshape in place; mel.data becomes
 *            n_mel * ENCODER_INPUT_LEN floats, row-major.
 *            NOTE(review): mel.n_len is left at its pre-call value, as
 *            before — callers indexing by n_len afterwards should be audited.
 */
void
inputPadTrim(whisperMel &mel)
{
    // Already the exact encoder length: nothing to do.
    if (mel.n_len == ENCODER_INPUT_LEN)
        return;
    std::vector<float> data;
    data.reserve(mel.n_mel * ENCODER_INPUT_LEN);
    // Frames kept from each source row.
    const auto dataLimit = std::min(mel.n_len, ENCODER_INPUT_LEN);
    for (auto j = 0; j < mel.n_mel; j++) {
        // Start of row j in the flat row-major source buffer.
        const auto rowStart = j * mel.n_len;
        // Copy the kept portion of the row (bug fix: bound is rowStart +
        // dataLimit, independent of j).
        for (auto i = rowStart; i < rowStart + dataLimit; i++) {
            data.emplace_back(mel.data[i]);
        }
        // Zero-pad short rows up to the encoder length (no-op when trimming,
        // since then dataLimit == ENCODER_INPUT_LEN).
        for (auto i = dataLimit; i < ENCODER_INPUT_LEN; i++) {
            data.emplace_back(0.0f);
        }
    }
    std::swap(mel.data, data);
}