// WhisperTranscript/Preprocess.h - jami-plugins - Gitiles

 /**
  *  Copyright (C) 2022 Savoir-faire Linux Inc.
  *
  *  Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 3 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA.
  */

 #pragma once

 #include <vector>
 #include <cstdint>
 #include <string>
 #include <map>
 #include <set>


 // Preprocessing constants fixed by the Whisper model.
 // See page 3 of the paper (https://cdn.openai.com/papers/whisper.pdf).
 // NOTE(review): the units mentioned below are inferred from the macro names
 // and the referenced paper, not from code in this header -- confirm them
 // against the implementation before relying on them.

 // Presumably the audio sample rate, in samples per second.
 #define WHISPER_SAMPLE_RATE 16000
 // Presumably the analysis window length, in samples.
 #define WHISPER_N_FFT       400
 // Presumably the number of Mel bands.
 #define WHISPER_N_MEL       80
 // Presumably the stride between consecutive frames, in samples.
 #define WHISPER_HOP_LENGTH  160
 // Presumably the chunk duration, in seconds.
 #define WHISPER_CHUNK_SIZE  30
 // Numerically equal to WHISPER_CHUNK_SIZE * WHISPER_SAMPLE_RATE / WHISPER_HOP_LENGTH.
 #define ENCODER_INPUT_LEN   3000

 /**
  * Mel spectrogram buffer.
  *
  * Both dimensions are zero-initialised so that a default-constructed object
  * is in a well-defined, empty state instead of holding indeterminate ints.
  */
 struct whisperMel {
     int n_len = 0; // NOTE(review): presumably the number of frames -- confirm
     int n_mel = 0; // NOTE(review): presumably the number of Mel bands -- confirm

     // Spectrogram values. The layout (presumably n_mel x n_len) is decided by
     // the code that fills this struct, which is not part of this header.
     std::vector<float> data;
 };

 /**
  * Mel filter bank.
  *
  * Both dimensions are zero-initialised so that a default-constructed object
  * is in a well-defined, empty state instead of holding indeterminate ints.
  */
 struct whisperFilters {
     int32_t n_mel = 0; // NOTE(review): presumably the number of Mel bands -- confirm
     int32_t n_fft = 0; // NOTE(review): presumably the number of FFT bins -- confirm

     // Filter coefficients. The layout (presumably n_mel x n_fft) is decided by
     // the code that fills this struct, which is not part of this header.
     std::vector<float> data;
 };

 /**
  * Vocabulary tables and special token ids used by the Whisper tokenizer.
  *
  * The task tokens are class-level constants rather than per-instance `const`
  * members: a non-static `const` data member implicitly deletes the copy and
  * move assignment operators, which made the whole struct non-assignable.
  * They remain readable through an instance (`vocab.token_translate`).
  */
 struct whisperVocab {
     // Vocabulary size; is_multilingual() compares it against 51865.
     size_t n_vocab = 51864;

     // Lookup tables in both directions between token text and token id.
     std::map<std::string, int32_t> token_to_id;
     std::map<int32_t, std::string> id_to_token;

     // Special token ids. These are deliberately mutable.
     // NOTE(review): the meaning of eot/sot/prev is inferred from the names
     // (end of text / start of transcript / previous context) -- confirm.
     int32_t token_eot  = 50256;
     int32_t token_sot  = 50257;
     int32_t token_prev = 50360;
     int32_t token_solm = 50361; // no speech
     int32_t token_not  = 50362; // no timestamps
     int32_t token_beg  = 50363; // timestamp begin

     // available tasks
     static constexpr int32_t token_translate  = 50358;
     static constexpr int32_t token_transcribe = 50359;

     /** @return true when n_vocab equals 51865. */
     bool is_multilingual() const {
         return n_vocab == 51865;
     }

     // Language token tables (text <-> id) and the set of "no speech" token ids.
     // NOTE(review): these are not filled in by this header -- check who populates them.
     std::map<std::string, int32_t> languageTokens2Id;
     std::map<int32_t, std::string> languageId2Tokens;
     std::set<int32_t> noSpeechTokens;
 };

 /**
  * Declaration only; the definition is not part of this header.
  * Judging by the name and signature, this builds a log-Mel spectrogram of
  * `samples` into `mel` -- NOTE(review): confirm against the implementation.
  *
  * @param samples    Input audio buffer (read-only).
  * @param n_samples  Presumably the number of floats readable through `samples`.
  * @param n_threads  Presumably the number of worker threads to use -- confirm.
  * @param filters    Mel filter bank (read-only).
  * @param mel        Passed by non-const reference; presumably the output.
  * @return bool; presumably true on success -- verify in the definition.
  */
 bool logMelSpectrogram(
     const float * samples,
     const int n_samples,
     const int n_threads,
     const whisperFilters & filters,
     whisperMel &mel);

 /**
  * Declaration only. Presumably a fast Fourier transform of `in` written to
  * `out` -- NOTE(review): the layout of `out` (e.g. how real and imaginary
  * parts are stored) and any size requirement on `in` are not visible here.
  *
  * @param in   Input values (read-only).
  * @param out  Passed by non-const reference; presumably receives the result.
  */
 void fft(const std::vector<float> & in, std::vector<float> & out);

 /**
  * Declaration only. Presumably a direct (naive) discrete Fourier transform of
  * `in` written to `out` -- NOTE(review): the layout of `out` is not visible
  * here; confirm against the definition.
  *
  * @param in   Input values (read-only).
  * @param out  Passed by non-const reference; presumably receives the result.
  */
 void dft(const std::vector<float> & in, std::vector<float> & out);

 /**
  * Declaration only. Presumably reads a Mel filter bank from `fileName` into
  * `filters` -- NOTE(review): the file format and the behaviour on a missing
  * or malformed file are not visible here (the function returns void).
  *
  * @param fileName  Path of the file to read (read-only).
  * @param filters   Passed by non-const reference; presumably filled in.
  */
 void loadMelFilters(const std::string& fileName, whisperFilters& filters);

 /**
  * Declaration only. Presumably reads the tokenizer vocabulary from `fileName`
  * into `vocab` -- NOTE(review): which fields of whisperVocab are populated and
  * how read errors are reported are not visible here (the function returns void).
  *
  * @param fileName  Path of the file to read (read-only).
  * @param vocab     Passed by non-const reference; presumably filled in.
  */
 void loadTokens(const std::string& fileName, whisperVocab& vocab);

 /**
  * Declaration only. Presumably pads or trims `mel` in place to a fixed length
  * (likely ENCODER_INPUT_LEN frames) -- NOTE(review): confirm the target
  * length and the padding value against the definition.
  *
  * @param mel  Passed by non-const reference; presumably modified in place.
  */
 void inputPadTrim(whisperMel &mel);
	/**
	* Copyright (C) 2022 Savoir-faire Linux Inc.
	*
	* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 3 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
	*/

	#pragma once

	#include <vector>
	#include <cstdint>
	#include <string>
	#include <map>
	#include <set>


	// Preprocessing constants fixed by the Whisper model.
	// See page 3 of the paper (https://cdn.openai.com/papers/whisper.pdf).
	// NOTE(review): the units mentioned below are inferred from the macro names
	// and the referenced paper, not from code in this header -- confirm them
	// against the implementation before relying on them.

	// Presumably the audio sample rate, in samples per second.
	#define WHISPER_SAMPLE_RATE 16000
	// Presumably the analysis window length, in samples.
	#define WHISPER_N_FFT 400
	// Presumably the number of Mel bands.
	#define WHISPER_N_MEL 80
	// Presumably the stride between consecutive frames, in samples.
	#define WHISPER_HOP_LENGTH 160
	// Presumably the chunk duration, in seconds.
	#define WHISPER_CHUNK_SIZE 30
	// Numerically equal to WHISPER_CHUNK_SIZE * WHISPER_SAMPLE_RATE / WHISPER_HOP_LENGTH.
	#define ENCODER_INPUT_LEN 3000

	/**
	 * Mel spectrogram buffer.
	 *
	 * Both dimensions are zero-initialised so that a default-constructed object
	 * is in a well-defined, empty state instead of holding indeterminate ints.
	 */
	struct whisperMel {
	    int n_len = 0; // NOTE(review): presumably the number of frames -- confirm
	    int n_mel = 0; // NOTE(review): presumably the number of Mel bands -- confirm

	    // Spectrogram values. The layout (presumably n_mel x n_len) is decided by
	    // the code that fills this struct, which is not part of this header.
	    std::vector<float> data;
	};

	/**
	 * Mel filter bank.
	 *
	 * Both dimensions are zero-initialised so that a default-constructed object
	 * is in a well-defined, empty state instead of holding indeterminate ints.
	 */
	struct whisperFilters {
	    int32_t n_mel = 0; // NOTE(review): presumably the number of Mel bands -- confirm
	    int32_t n_fft = 0; // NOTE(review): presumably the number of FFT bins -- confirm

	    // Filter coefficients. The layout (presumably n_mel x n_fft) is decided by
	    // the code that fills this struct, which is not part of this header.
	    std::vector<float> data;
	};

	/**
	 * Vocabulary tables and special token ids used by the Whisper tokenizer.
	 *
	 * The task tokens are class-level constants rather than per-instance `const`
	 * members: a non-static `const` data member implicitly deletes the copy and
	 * move assignment operators, which made the whole struct non-assignable.
	 * They remain readable through an instance (`vocab.token_translate`).
	 */
	struct whisperVocab {
	    // Vocabulary size; is_multilingual() compares it against 51865.
	    size_t n_vocab = 51864;

	    // Lookup tables in both directions between token text and token id.
	    std::map<std::string, int32_t> token_to_id;
	    std::map<int32_t, std::string> id_to_token;

	    // Special token ids. These are deliberately mutable.
	    // NOTE(review): the meaning of eot/sot/prev is inferred from the names
	    // (end of text / start of transcript / previous context) -- confirm.
	    int32_t token_eot = 50256;
	    int32_t token_sot = 50257;
	    int32_t token_prev = 50360;
	    int32_t token_solm = 50361; // no speech
	    int32_t token_not = 50362; // no timestamps
	    int32_t token_beg = 50363; // timestamp begin

	    // available tasks
	    static constexpr int32_t token_translate = 50358;
	    static constexpr int32_t token_transcribe = 50359;

	    /** @return true when n_vocab equals 51865. */
	    bool is_multilingual() const {
	        return n_vocab == 51865;
	    }

	    // Language token tables (text <-> id) and the set of "no speech" token ids.
	    // NOTE(review): these are not filled in by this header -- check who populates them.
	    std::map<std::string, int32_t> languageTokens2Id;
	    std::map<int32_t, std::string> languageId2Tokens;
	    std::set<int32_t> noSpeechTokens;
	};

	/**
	 * Declaration only; the definition is not part of this header.
	 * Judging by the name and signature, this builds a log-Mel spectrogram of
	 * `samples` into `mel` -- NOTE(review): confirm against the implementation.
	 *
	 * @param samples    Input audio buffer (read-only).
	 * @param n_samples  Presumably the number of floats readable through `samples`.
	 * @param n_threads  Presumably the number of worker threads to use -- confirm.
	 * @param filters    Mel filter bank (read-only).
	 * @param mel        Passed by non-const reference; presumably the output.
	 * @return bool; presumably true on success -- verify in the definition.
	 */
	bool logMelSpectrogram(
	const float * samples,
	const int n_samples,
	const int n_threads,
	const whisperFilters & filters,
	whisperMel &mel);

	/**
	 * Declaration only. Presumably a fast Fourier transform of `in` written to
	 * `out` -- NOTE(review): the layout of `out` (e.g. how real and imaginary
	 * parts are stored) and any size requirement on `in` are not visible here.
	 *
	 * @param in   Input values (read-only).
	 * @param out  Passed by non-const reference; presumably receives the result.
	 */
	void fft(const std::vector<float> & in, std::vector<float> & out);

	/**
	 * Declaration only. Presumably a direct (naive) discrete Fourier transform of
	 * `in` written to `out` -- NOTE(review): the layout of `out` is not visible
	 * here; confirm against the definition.
	 *
	 * @param in   Input values (read-only).
	 * @param out  Passed by non-const reference; presumably receives the result.
	 */
	void dft(const std::vector<float> & in, std::vector<float> & out);

	/**
	 * Declaration only. Presumably reads a Mel filter bank from `fileName` into
	 * `filters` -- NOTE(review): the file format and the behaviour on a missing
	 * or malformed file are not visible here (the function returns void).
	 *
	 * @param fileName  Path of the file to read (read-only).
	 * @param filters   Passed by non-const reference; presumably filled in.
	 */
	void loadMelFilters(const std::string& fileName, whisperFilters& filters);

	/**
	 * Declaration only. Presumably reads the tokenizer vocabulary from `fileName`
	 * into `vocab` -- NOTE(review): which fields of whisperVocab are populated and
	 * how read errors are reported are not visible here (the function returns void).
	 *
	 * @param fileName  Path of the file to read (read-only).
	 * @param vocab     Passed by non-const reference; presumably filled in.
	 */
	void loadTokens(const std::string& fileName, whisperVocab& vocab);

	/**
	 * Declaration only. Presumably pads or trims `mel` in place to a fixed length
	 * (likely ENCODER_INPUT_LEN frames) -- NOTE(review): confirm the target
	 * length and the padding value against the definition.
	 *
	 * @param mel  Passed by non-const reference; presumably modified in place.
	 */
	void inputPadTrim(whisperMel &mel);