From 15ee7da6140c7955c1e59a759f341c17ebd459c7 Mon Sep 17 00:00:00 2001 From: hkwon Date: Tue, 24 Oct 2023 11:46:25 -0700 Subject: [PATCH 01/20] Support Transformers in the Wav2Vec2 Encoder for the ASR Inference --- CMakeLists.txt | 2 + include/ctranslate2/layers/wav2vec2.h | 47 +++++++ include/ctranslate2/models/wav2vec2.h | 72 +++++++++++ python/cpp/module.cc | 1 + python/cpp/module.h | 1 + python/cpp/wav2vec2.cc | 93 ++++++++++++++ python/ctranslate2/converters/transformers.py | 46 +++++++ python/ctranslate2/models/__init__.py | 1 + python/ctranslate2/specs/__init__.py | 1 + python/ctranslate2/specs/wav2vec2_spec.py | 41 ++++++ python/tests/test_wav2vec2.py | 94 ++++++++++++++ src/layers/wav2vec2.cc | 58 +++++++++ src/models/model_factory.cc | 3 + src/models/wav2vec2.cc | 119 ++++++++++++++++++ 14 files changed, 579 insertions(+) create mode 100644 include/ctranslate2/layers/wav2vec2.h create mode 100644 include/ctranslate2/models/wav2vec2.h create mode 100644 python/cpp/wav2vec2.cc create mode 100644 python/ctranslate2/specs/wav2vec2_spec.py create mode 100644 python/tests/test_wav2vec2.py create mode 100644 src/layers/wav2vec2.cc create mode 100644 src/models/wav2vec2.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 79717045b..ce8b3d31f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,6 +116,7 @@ set(SOURCES src/layers/common.cc src/layers/decoder.cc src/layers/transformer.cc + src/layers/wav2vec2.cc src/layers/whisper.cc src/logging.cc src/models/language_model.cc @@ -124,6 +125,7 @@ set(SOURCES src/models/model_reader.cc src/models/sequence_to_sequence.cc src/models/transformer.cc + src/models/wav2vec2.cc src/models/whisper.cc src/ops/activation.cc src/ops/add.cc diff --git a/include/ctranslate2/layers/wav2vec2.h b/include/ctranslate2/layers/wav2vec2.h new file mode 100644 index 000000000..4c25c941a --- /dev/null +++ b/include/ctranslate2/layers/wav2vec2.h @@ -0,0 +1,47 @@ +#pragma once + +#include "ctranslate2/layers/transformer.h" + +namespace ctranslate2 { + namespace layers { + + class Wav2Vec2Encoder : public Layer { + public: + Wav2Vec2Encoder(const models::Model& model, const std::string& scope); + + void operator()(const StorageView& features, StorageView& output); + + DataType output_type() const override { + return _output_norm.output_type(); + } + + dim_t output_size() const override { + return _output_norm.output_size(); + } + + dim_t input_size() const { + return 1024; + } + + bool is_encoded(const StorageView& features) const { + // Input features shape: [batch_size, input_size, input_time] + // Encoder output shape: [batch_size, input_time // 2, output_size] + // + // input_time is variable so we check that dimension 1 is different than its original value. 
+ + return (features.rank() == 3 + && features.dim(2) == output_size() + && features.dim(1) != input_size()); + } + + private: + const ops::GELU _gelu; + // wav2vec2.encoder modules except pos_conv_embed due to groups=16 being not supported + //const ops::Transpose _transpose; + const dim_t _num_heads; + const std::vector> _layers; + const LayerNorm _output_norm; + }; + + } +} diff --git a/include/ctranslate2/models/wav2vec2.h b/include/ctranslate2/models/wav2vec2.h new file mode 100644 index 000000000..d1034ef88 --- /dev/null +++ b/include/ctranslate2/models/wav2vec2.h @@ -0,0 +1,72 @@ +#pragma once + +//#include "ctranslate2/generation.h" +#include "ctranslate2/layers/wav2vec2.h" +#include "ctranslate2/models/model.h" +#include "ctranslate2/replica_pool.h" + +namespace ctranslate2 { + namespace models { + + struct Wav2Vec2Options { + // Maximum generation length. + size_t max_length = 448; + + // Randomly sample from the top K candidates (set 0 to sample from the full distribution). + size_t sampling_topk = 1; + + // Maximum index of the first predicted timestamp. + size_t max_initial_timestamp_index = 50; + + // Suppress blank outputs at the beginning of the sampling. + bool suppress_blank = true; + + // List of token IDs to suppress. + // -1 will suppress a default set of symbols as defined in the model config.json file. + std::vector suppress_tokens = {-1}; + }; + + + class Wav2Vec2Model : public Model { + public: + const Vocabulary& get_vocabulary() const; + size_t current_spec_revision() const override; + bool is_quantizable(const std::string& variable_name) const override; + bool is_linear_weight(const std::string& variable_name) const override; + std::unique_ptr clone() const override; + + bool use_global_int16_scale() const override { + return false; + } + + protected: + void initialize(ModelReader& model_reader) override; + private: + std::shared_ptr _vocabulary; + }; + + class Wav2Vec2Replica : public ModelReplica { + public: + static std::unique_ptr create_from_model(const Model& model); + + Wav2Vec2Replica(const std::shared_ptr& model); + + StorageView encode(StorageView features, const bool to_cpu); + + private: + const std::shared_ptr _model; + const std::unique_ptr _encoder; + + StorageView maybe_encode(StorageView features); + }; + + class Wav2Vec2 : public ReplicaPool { + public: + using ReplicaPool::ReplicaPool; + + std::future encode(const StorageView& features, const bool to_cpu); + + }; + + } +} diff --git a/python/cpp/module.cc b/python/cpp/module.cc index 997414989..4a9e47561 100644 --- a/python/cpp/module.cc +++ b/python/cpp/module.cc @@ -86,4 +86,5 @@ PYBIND11_MODULE(_ext, m) ctranslate2::python::register_generator(m); ctranslate2::python::register_encoder(m); ctranslate2::python::register_whisper(m); + ctranslate2::python::register_wav2vec2(m); } diff --git a/python/cpp/module.h b/python/cpp/module.h index b314969c4..01fdbdf59 100644 --- a/python/cpp/module.h +++ b/python/cpp/module.h @@ -17,6 +17,7 @@ namespace ctranslate2 { void register_translation_stats(py::module& m); void register_translator(py::module& m); void register_whisper(py::module& m); + void register_wav2vec2(py::module& m); } } diff --git a/python/cpp/wav2vec2.cc b/python/cpp/wav2vec2.cc new file mode 100644 index 000000000..ced116cb4 --- /dev/null +++ b/python/cpp/wav2vec2.cc @@ -0,0 +1,93 @@ +#include "module.h" + +#include + +#include "replica_pool.h" + +namespace ctranslate2 { + namespace python { + + class Wav2Vec2Wrapper : public ReplicaPoolHelper { + public: + using 
ReplicaPoolHelper::ReplicaPoolHelper; + + StorageView encode(const StorageView& features, const bool to_cpu) { + return _pool->encode(features, to_cpu).get(); + } + }; + + + void register_wav2vec2(py::module& m) { + py::class_( + m, "Wav2Vec2", + R"pbdoc( + Implements the Wav2Vec2 speech recognition model published by Facebook. + + See Also: + https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec + )pbdoc") + + .def(py::init>&, const StringOrMap&, size_t, size_t, long, py::object>(), + py::arg("model_path"), + py::arg("device")="cpu", + py::kw_only(), + py::arg("device_index")=0, + py::arg("compute_type")="default", + py::arg("inter_threads")=1, + py::arg("intra_threads")=0, + py::arg("max_queued_batches")=0, + py::arg("files")=py::none(), + R"pbdoc( + Initializes a Wav2Vec2 model from a converted model. + + Arguments: + model_path: Path to the CTranslate2 model directory. + device: Device to use (possible values are: cpu, cuda, auto). + device_index: Device IDs where to place this model on. + compute_type: Model computation type or a dictionary mapping a device name + to the computation type (possible values are: default, auto, int8, int8_float32, + int8_float16, int8_bfloat16, int16, float16, bfloat16, float32). + inter_threads: Number of workers to allow executing multiple batches in parallel. + intra_threads: Number of OpenMP threads per worker (0 to use a default value). + max_queued_batches: Maximum numbers of batches in the worker queue (-1 for unlimited, + 0 for an automatic value). When the queue is full, future requests will block + until a free slot is available. + files: Load model files from the memory. This argument is a dictionary mapping + file names to file contents as file-like or bytes objects. If this is set, + :obj:`model_path` acts as an identifier for this model. + )pbdoc") + + .def_property_readonly("device", &Wav2Vec2Wrapper::device, + "Device this model is running on.") + .def_property_readonly("device_index", &Wav2Vec2Wrapper::device_index, + "List of device IDs where this model is running on.") + .def_property_readonly("compute_type", &Wav2Vec2Wrapper::compute_type, + "Computation type used by the model.") + .def_property_readonly("num_workers", &Wav2Vec2Wrapper::num_replicas, + "Number of model workers backing this instance.") + .def_property_readonly("num_queued_batches", &Wav2Vec2Wrapper::num_queued_batches, + "Number of batches waiting to be processed.") + .def_property_readonly("num_active_batches", &Wav2Vec2Wrapper::num_active_batches, + "Number of batches waiting to be processed or currently processed.") + + .def("encode", &Wav2Vec2Wrapper::encode, + py::arg("features"), + py::arg("to_cpu")=false, + py::call_guard(), + R"pbdoc( + Encodes the input features. + + Arguments: + features: Mel spectogram of the audio, as a float array with shape + ``[batch_size, 80, 3000]``. + to_cpu: Copy the encoder output to the CPU before returning the value. + + Returns: + The encoder output. 
+ )pbdoc") + + ; + } + + } +} diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 2c8138da0..609ed1774 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -23,6 +23,7 @@ model_spec, transformer_spec, whisper_spec, + wav2vec2_spec, ) _SUPPORTED_ACTIVATIONS = { @@ -935,6 +936,51 @@ def set_conv1d(self, spec, module): spec.bias = module.bias +@register_loader("Wav2Vec2Config") +class Wav2Vec2Loader(BartLoader): + @property + def architecture_name(self): + return "Wav2Vec2ForCTC" + + def get_model_spec(self, model): + # Wav2Vec2 encoder Wav2Vec2PositionalConvEmbedding conv1d has groups 16 + # that doesn't look available here so we make Wav2Vec2 encoder layers only + spec = wav2vec2_spec.Wav2Vec2Spec( + model.wav2vec2.encoder.config.num_hidden_layers, + model.wav2vec2.encoder.config.num_attention_heads, + ) + + # layer component name matching (no duplications saving) + for layer in model.wav2vec2.encoder.layers: + layer.self_attn = layer.attention + layer.self_attn_layer_norm = layer.layer_norm + layer.activation_fn = layer.feed_forward.intermediate_act_fn + layer.fc1 = layer.feed_forward.intermediate_dense + layer.fc2 = layer.feed_forward.output_dense + + self.set_encoder(spec.encoder, model.wav2vec2.encoder) + self.set_linear(spec.lm_head, model.lm_head) + # only for Wav2Vec2Spec.get_vocabulary_size() + return spec + + def set_config(self, config, model, tokenizer): + num_layers = model.wav2vec2.encoder.config.num_hidden_layers + num_heads = model.wav2vec2.encoder.config.num_attention_heads + return + + def get_vocabulary(self, model, tokenizer): + return tokenizer.vocab + + def set_vocabulary(self, spec, tokens): + spec.register_vocabulary(tokens) + + def set_encoder(self, spec, encoder): + super().set_encoder(spec, encoder) + + def set_common_layers(self, spec, module): + self.set_layer_norm(spec.layer_norm, module.layer_norm) + + @register_loader("T5Config") class T5Loader(ModelLoader): @property diff --git a/python/ctranslate2/models/__init__.py b/python/ctranslate2/models/__init__.py index 067a32d8c..009c56787 100644 --- a/python/ctranslate2/models/__init__.py +++ b/python/ctranslate2/models/__init__.py @@ -7,6 +7,7 @@ Whisper, WhisperGenerationResult, WhisperGenerationResultAsync, + Wav2Vec2, ) except ImportError as e: # Allow using the Python package without the compiled extension. 
diff --git a/python/ctranslate2/specs/__init__.py b/python/ctranslate2/specs/__init__.py index 4a2bf41a1..647696969 100644 --- a/python/ctranslate2/specs/__init__.py +++ b/python/ctranslate2/specs/__init__.py @@ -14,3 +14,4 @@ TransformerSpec, ) from ctranslate2.specs.whisper_spec import WhisperSpec +from ctranslate2.specs.wav2vec2_spec import Wav2Vec2Spec diff --git a/python/ctranslate2/specs/wav2vec2_spec.py b/python/ctranslate2/specs/wav2vec2_spec.py new file mode 100644 index 000000000..b89578d35 --- /dev/null +++ b/python/ctranslate2/specs/wav2vec2_spec.py @@ -0,0 +1,41 @@ +from typing import List, Optional, Tuple + +import numpy as np + +from ctranslate2.specs import common_spec, model_spec, transformer_spec + +class Wav2Vec2Config(model_spec.ModelConfig): + """Configuration for the Wav2Vec2 model.""" + def __init__(self): + return + + +class Wav2Vec2Spec(model_spec.LanguageModelSpec): + def __init__(self, num_layers, num_heads): + super().__init__() + self.encoder = Wav2Vec2EncoderSpec(num_layers, num_heads) + self.lm_head = common_spec.LinearSpec() + + @property + def name(self): + return "Wav2Vec2Spec" + + @property + def revision(self): + return 3 + + def get_default_config(self): + return Wav2Vec2Config() + + def get_vocabulary_size(self): + return self.lm_head.weight.shape[0] + + +class Wav2Vec2EncoderSpec(model_spec.LayerSpec): + def __init__(self, num_layers, num_heads): + self.num_heads = np.dtype("int16").type(num_heads) + # wav2vec2.encoder modules except pos_conv_embed due to groups=16 being not supported + self.layer_norm = common_spec.LayerNormSpec() + self.layer = [ + transformer_spec.TransformerEncoderLayerSpec() for _ in range(num_layers) + ] diff --git a/python/tests/test_wav2vec2.py b/python/tests/test_wav2vec2.py new file mode 100644 index 000000000..542cbf9f8 --- /dev/null +++ b/python/tests/test_wav2vec2.py @@ -0,0 +1,94 @@ +import os, os.path, torch, torchaudio, librosa +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor +from transformers.modeling_outputs import CausalLMOutput, Wav2Vec2BaseModelOutput, BaseModelOutput +import ctranslate2 +from torchaudio.utils import download_asset +import numpy as np + +# Models Conversion & Preparation +compute_type='int8' +if not os.path.isfile('ctranslate2_model/model.bin'): + model_name='facebook/wav2vec2-large-robust-ft-swbd-300h' + converter = ctranslate2.converters.TransformersConverter(model_name,load_as_float16=compute_type) + output_dir = converter.convert('ctranslate2_model') +else: + output_dir = 'ctranslate2_model' + +if not os.path.isfile('ctranslate2_model/wav2vec2_partial.bin'): + w2v2_model = Wav2Vec2ForCTC.from_pretrained(model_name) + del w2v2_model.wav2vec2.encoder.layers + del w2v2_model.wav2vec2.encoder.layer_norm + torch.save(w2v2_model,'ctranslate2_model/wav2vec2_partial.bin') + w2v2_processor = Wav2Vec2Processor.from_pretrained(model_name) + torch.save(w2v2_processor,'ctranslate2_model/wav2vec2_processor.bin') + + +# ASR inference +try: + if os.environ['CUDA_VISIBLE_DEVICES']: + device = 'cuda' +except: + device = 'cpu' + +try: + if os.environ["OMP_NUM_THREADS"]: + cpu_threads = int(os.environ["OMP_NUM_THREADS"]) +except: + cpu_threads = 0 + + +w2v2_model = torch.load('ctranslate2_model/wav2vec2_partial.bin').to(device) +w2v2_processor = torch.load('ctranslate2_model/wav2vec2_processor.bin') + +SAMPLE_WAV = download_asset('tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav') +resample_rate = 16000 +waveform, sampling_rate = torchaudio.load(SAMPLE_WAV) +if sampling_rate != 
resample_rate: + speech_array = librosa.resample(waveform[0].numpy(), orig_sr=sampling_rate, target_sr=resample_rate) +else: + speech_array = waveform[0].numpy() +input_values = w2v2_processor(speech_array.astype(np.float32), padding=True, return_tensors="pt", sampling_rate=resample_rate).input_values + +with torch.no_grad(): + extract_features = w2v2_model.wav2vec2.feature_extractor(input_values.to(w2v2_model.device)).transpose(1, 2) + hidden_states, extract_features = w2v2_model.wav2vec2.feature_projection(extract_features) + position_embeddings = w2v2_model.wav2vec2.encoder.pos_conv_embed(hidden_states) + hidden_states = position_embeddings + hidden_states + #hidden_states = w2v2_model.encoder.dropout(hidden_states) # Dropout(p=0.0, inplace=False) bypassed + +ct2_w2v2_model = ctranslate2.models.Wav2Vec2( + output_dir, + device=device, + device_index=[0], + compute_type=compute_type, + intra_threads=cpu_threads, + inter_threads=1, + ) +hidden_states = hidden_states.cpu() if ct2_w2v2_model.device == "cuda" else hidden_states.numpy() +hidden_states = np.ascontiguousarray(hidden_states) +hidden_states = ctranslate2.StorageView.from_array(hidden_states) +to_cpu = ct2_w2v2_model.device == "cuda" and len(ct2_w2v2_model.device_index) > 1 +ct2_output = ct2_w2v2_model.encode(hidden_states, to_cpu=to_cpu) +# 24 x Wav2Vec2EncoderLayerStableLayerNorm processed +if ct2_w2v2_model.device == "cuda": + hidden_states = torch.as_tensor(ct2_output, device=ct2_w2v2_model.device) +else: + hidden_states = torch.as_tensor(np.array(ct2_output), dtype=torch.float32, device=ct2_w2v2_model.device) + +encoder_outputs = BaseModelOutput(last_hidden_state=hidden_states, hidden_states=None, attentions=None) +hidden_states = encoder_outputs[0] +outputs = Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) +hidden_states = outputs[0] +#hidden_states = w2v2_model.dropout(hidden_states) # Dropout(p=0.0, inplace=False) bypassed +with torch.no_grad(): + logits = w2v2_model.lm_head(hidden_states.to(torch.float32))[0] + +predicted_ids = torch.argmax(logits, dim=-1) +output = w2v2_processor.decode(predicted_ids, output_word_offsets=True) +print(output["text"]) # should be: I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT + diff --git a/src/layers/wav2vec2.cc b/src/layers/wav2vec2.cc new file mode 100644 index 000000000..237c77fad --- /dev/null +++ b/src/layers/wav2vec2.cc @@ -0,0 +1,58 @@ +#include "ctranslate2/layers/wav2vec2.h" + + +namespace ctranslate2 { + namespace layers { + Wav2Vec2Encoder::Wav2Vec2Encoder(const models::Model& model, const std::string& scope) + : _num_heads(model.get_attribute_with_default(scope + "/num_heads", 8)) + , _layers(build_layers_list(model, + scope + "/layer", + _num_heads, + /*pre_norm=*/true, + ops::ActivationType::GELU)) + , _output_norm(model, scope + "/layer_norm") + { + } + + void Wav2Vec2Encoder::operator()(const StorageView& features, StorageView& output) { + PROFILE("Wav2Vec2Encoder"); + + // SAD in front-end handles the input length + //const dim_t expected_depth = 1024; + //const dim_t expected_time = 406; + + if (features.rank() != 3) + throw std::invalid_argument("Expected input features to have 3 dimensions, but got " + + std::to_string(features.rank()) + + " dimension(s) instead"); + /* //may need to limit the input lenght + if (features.dim(1) != expected_depth || features.dim(2) != expected_time) + throw std::invalid_argument("Invalid input 
features shape: expected an input with shape (" + + std::to_string(features.dim(0)) + + ", " + + std::to_string(expected_depth) + + ", " + + std::to_string(expected_time) + + "), but got an input with shape (" + + std::to_string(features.dim(0)) + + ", " + + std::to_string(features.dim(1)) + + ", " + + std::to_string(features.dim(2)) + + ") instead;; _conv1.output_size() " + + std::to_string(_conv1.output_size())); + //+ ") instead"); + */ + + StorageView input(output_type(), features.device()); + input = features; + for (const auto& layer : _layers) { + (*layer)(input, nullptr, output); + input = std::move(output); + } + + _output_norm(input, output); + } + + } +} diff --git a/src/models/model_factory.cc b/src/models/model_factory.cc index e5a904aff..488e0b8b2 100644 --- a/src/models/model_factory.cc +++ b/src/models/model_factory.cc @@ -3,6 +3,7 @@ #include #include "ctranslate2/models/whisper.h" +#include "ctranslate2/models/wav2vec2.h" #include "ctranslate2/models/transformer.h" namespace ctranslate2 { @@ -20,6 +21,8 @@ namespace ctranslate2 { register_model("TransformerEncoderSpec"); register_model("WhisperSpec"); + + register_model("Wav2Vec2Spec"); } std::shared_ptr create_model(const std::string& name) { diff --git a/src/models/wav2vec2.cc b/src/models/wav2vec2.cc new file mode 100644 index 000000000..79a7a40d4 --- /dev/null +++ b/src/models/wav2vec2.cc @@ -0,0 +1,119 @@ +#include "ctranslate2/models/wav2vec2.h" + +#include + +#include "ctranslate2/decoding.h" + +#include "dispatch.h" +#include "dtw.h" + +#ifdef CT2_WITH_CUDA +# include "cuda/utils.h" +#endif + + +namespace ctranslate2 { + namespace models { + + const Vocabulary& Wav2Vec2Model::get_vocabulary() const { + return *_vocabulary; + } + + size_t Wav2Vec2Model::current_spec_revision() const { + return 3; + } + + void Wav2Vec2Model::initialize(ModelReader& model_reader) { + VocabularyInfo vocab_info; + vocab_info.unk_token = "[UNK]"; + vocab_info.bos_token = ""; + vocab_info.eos_token = ""; + + _vocabulary = load_vocabulary(model_reader, "vocabulary", std::move(vocab_info)); + if (!_vocabulary) + throw std::runtime_error("Cannot load the vocabulary from the model directory"); + } + + bool Wav2Vec2Model::is_quantizable(const std::string& variable_name) const { + return (Model::is_quantizable(variable_name) + && variable_name.find("conv") == std::string::npos); + } + + bool Wav2Vec2Model::is_linear_weight(const std::string& variable_name) const { + return is_quantizable(variable_name) && variable_name.find("embeddings") == std::string::npos; + } + + std::unique_ptr Wav2Vec2Model::clone() const { + return std::make_unique(*this); + } + + + std::unique_ptr Wav2Vec2Replica::create_from_model(const Model& model) { + if (!dynamic_cast(&model)) + throw std::invalid_argument("The model is not a Wav2Vec2 model"); + + const auto scoped_device_setter = model.get_scoped_device_setter(); + const auto model_ptr = model.shared_from_this(); + const auto concrete_model = std::static_pointer_cast(model_ptr); + return std::make_unique(concrete_model); + } + + Wav2Vec2Replica::Wav2Vec2Replica(const std::shared_ptr& model) + : ModelReplica(model) + , _model(model) + , _encoder(std::make_unique(*model, "encoder")) + { + } + + + StorageView Wav2Vec2Replica::encode(StorageView features, const bool to_cpu) { + PROFILE("Wav2Vec2Replica::encode"); + +#ifdef CT2_WITH_CUDA + const cuda::UseTrueFp16GemmInScope use_true_fp16_gemm(false); +#endif + + const auto scoped_device_setter = _model->get_scoped_device_setter(); + const Device device = 
_model->device(); + const DataType dtype = _encoder->output_type(); + features.move_to(device, dtype); + + StorageView encoder_output(dtype, device); + (*_encoder)(features, encoder_output); + + if (to_cpu) { + if (device != Device::CPU) + encoder_output = encoder_output.to(Device::CPU); + + return encoder_output; + } + + // Ensure all operations are finished before returning the output. + synchronize_stream(device); + + return encoder_output; + } + + StorageView Wav2Vec2Replica::maybe_encode(StorageView features) { + const Device device = _model->device(); + const DataType dtype = _encoder->output_type(); + + features.move_to(device, dtype); + + if (_encoder->is_encoded(features)) + return features; + + StorageView encoder_output(dtype, device); + (*_encoder)(features, encoder_output); + return encoder_output; + } + + std::future Wav2Vec2::encode(const StorageView& features, const bool to_cpu) { + return post( + [features = features.sync_copy(), to_cpu](Wav2Vec2Replica& replica) mutable { + return replica.encode(std::move(features), to_cpu); + }); + } + + } +} From 0901484a14b648be5ed7666d432bff995847f3f1 Mon Sep 17 00:00:00 2001 From: hkwon Date: Tue, 24 Oct 2023 14:22:19 -0700 Subject: [PATCH 02/20] code style/format check with flask8 & black --- python/ctranslate2/converters/transformers.py | 2 - python/ctranslate2/specs/wav2vec2_spec.py | 2 + python/tests/test_wav2vec2.py | 142 ++++++++++++------ 3 files changed, 94 insertions(+), 52 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 609ed1774..ba7750739 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -964,8 +964,6 @@ def get_model_spec(self, model): return spec def set_config(self, config, model, tokenizer): - num_layers = model.wav2vec2.encoder.config.num_hidden_layers - num_heads = model.wav2vec2.encoder.config.num_attention_heads return def get_vocabulary(self, model, tokenizer): diff --git a/python/ctranslate2/specs/wav2vec2_spec.py b/python/ctranslate2/specs/wav2vec2_spec.py index b89578d35..78b2ffa84 100644 --- a/python/ctranslate2/specs/wav2vec2_spec.py +++ b/python/ctranslate2/specs/wav2vec2_spec.py @@ -4,8 +4,10 @@ from ctranslate2.specs import common_spec, model_spec, transformer_spec + class Wav2Vec2Config(model_spec.ModelConfig): """Configuration for the Wav2Vec2 model.""" + def __init__(self): return diff --git a/python/tests/test_wav2vec2.py b/python/tests/test_wav2vec2.py index 542cbf9f8..f357748d7 100644 --- a/python/tests/test_wav2vec2.py +++ b/python/tests/test_wav2vec2.py @@ -1,60 +1,81 @@ -import os, os.path, torch, torchaudio, librosa -from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor -from transformers.modeling_outputs import CausalLMOutput, Wav2Vec2BaseModelOutput, BaseModelOutput +import os +import os.path +import torch +import torchaudio +import librosa +from transformers import ( + Wav2Vec2ForCTC, + Wav2Vec2Processor, +) +from transformers.modeling_outputs import ( + CausalLMOutput, + Wav2Vec2BaseModelOutput, + BaseModelOutput, +) import ctranslate2 from torchaudio.utils import download_asset import numpy as np # Models Conversion & Preparation -compute_type='int8' -if not os.path.isfile('ctranslate2_model/model.bin'): - model_name='facebook/wav2vec2-large-robust-ft-swbd-300h' - converter = ctranslate2.converters.TransformersConverter(model_name,load_as_float16=compute_type) - output_dir = converter.convert('ctranslate2_model') +compute_type = "int8" +if not 
os.path.isfile("ctranslate2_model/model.bin"): + model_name = "facebook/wav2vec2-large-robust-ft-swbd-300h" + converter = ctranslate2.converters.TransformersConverter( + model_name, + load_as_float16=compute_type, + ) + output_dir = converter.convert("ctranslate2_model") else: - output_dir = 'ctranslate2_model' + output_dir = "ctranslate2_model" -if not os.path.isfile('ctranslate2_model/wav2vec2_partial.bin'): - w2v2_model = Wav2Vec2ForCTC.from_pretrained(model_name) - del w2v2_model.wav2vec2.encoder.layers - del w2v2_model.wav2vec2.encoder.layer_norm - torch.save(w2v2_model,'ctranslate2_model/wav2vec2_partial.bin') - w2v2_processor = Wav2Vec2Processor.from_pretrained(model_name) - torch.save(w2v2_processor,'ctranslate2_model/wav2vec2_processor.bin') +if not os.path.isfile("ctranslate2_model/wav2vec2_partial.bin"): + w2v2_model = Wav2Vec2ForCTC.from_pretrained(model_name) + del w2v2_model.wav2vec2.encoder.layers + del w2v2_model.wav2vec2.encoder.layer_norm + torch.save(w2v2_model, "ctranslate2_model/wav2vec2_partial.bin") + w2v2_processor = Wav2Vec2Processor.from_pretrained(model_name) + torch.save(w2v2_processor, "ctranslate2_model/wav2vec2_processor.bin") # ASR inference -try: - if os.environ['CUDA_VISIBLE_DEVICES']: - device = 'cuda' -except: - device = 'cpu' +device = "cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu" +cpu_threads = int(os.environ.get("OMP_NUM_THREADS", 0)) -try: - if os.environ["OMP_NUM_THREADS"]: - cpu_threads = int(os.environ["OMP_NUM_THREADS"]) -except: - cpu_threads = 0 - +w2v2_model = torch.load("ctranslate2_model/wav2vec2_partial.bin").to(device) +w2v2_processor = torch.load("ctranslate2_model/wav2vec2_processor.bin") -w2v2_model = torch.load('ctranslate2_model/wav2vec2_partial.bin').to(device) -w2v2_processor = torch.load('ctranslate2_model/wav2vec2_processor.bin') - -SAMPLE_WAV = download_asset('tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav') +SAMPLE_WAV = download_asset( + "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" +) resample_rate = 16000 waveform, sampling_rate = torchaudio.load(SAMPLE_WAV) if sampling_rate != resample_rate: - speech_array = librosa.resample(waveform[0].numpy(), orig_sr=sampling_rate, target_sr=resample_rate) + speech_array = librosa.resample( + waveform[0].numpy(), + orig_sr=sampling_rate, + target_sr=resample_rate, + ) else: speech_array = waveform[0].numpy() -input_values = w2v2_processor(speech_array.astype(np.float32), padding=True, return_tensors="pt", sampling_rate=resample_rate).input_values + +input_values = w2v2_processor( + speech_array.astype(np.float32), + padding=True, + return_tensors="pt", + sampling_rate=resample_rate, +).input_values with torch.no_grad(): - extract_features = w2v2_model.wav2vec2.feature_extractor(input_values.to(w2v2_model.device)).transpose(1, 2) - hidden_states, extract_features = w2v2_model.wav2vec2.feature_projection(extract_features) + extract_features = w2v2_model.wav2vec2.feature_extractor( + input_values.to(w2v2_model.device) + ).transpose(1, 2) + hidden_states, extract_features = w2v2_model.wav2vec2.feature_projection( + extract_features + ) position_embeddings = w2v2_model.wav2vec2.encoder.pos_conv_embed(hidden_states) - hidden_states = position_embeddings + hidden_states - #hidden_states = w2v2_model.encoder.dropout(hidden_states) # Dropout(p=0.0, inplace=False) bypassed + hidden_states = position_embeddings + hidden_states + # hidden_states = w2v2_model.encoder.dropout(hidden_states) + # Dropout(p=0.0, inplace=False) bypassed ct2_w2v2_model = 
ctranslate2.models.Wav2Vec2( output_dir, @@ -63,32 +84,53 @@ compute_type=compute_type, intra_threads=cpu_threads, inter_threads=1, - ) -hidden_states = hidden_states.cpu() if ct2_w2v2_model.device == "cuda" else hidden_states.numpy() +) + +if ct2_w2v2_model.device == "cuda": + hidden_states = hidden_states.cpu() +else: + hidden_states.numpy() + hidden_states = np.ascontiguousarray(hidden_states) hidden_states = ctranslate2.StorageView.from_array(hidden_states) to_cpu = ct2_w2v2_model.device == "cuda" and len(ct2_w2v2_model.device_index) > 1 -ct2_output = ct2_w2v2_model.encode(hidden_states, to_cpu=to_cpu) +ct2_output = ct2_w2v2_model.encode( + hidden_states, + to_cpu=to_cpu, +) # 24 x Wav2Vec2EncoderLayerStableLayerNorm processed if ct2_w2v2_model.device == "cuda": - hidden_states = torch.as_tensor(ct2_output, device=ct2_w2v2_model.device) + hidden_states = torch.as_tensor( + ct2_output, + device=ct2_w2v2_model.device, + ) else: - hidden_states = torch.as_tensor(np.array(ct2_output), dtype=torch.float32, device=ct2_w2v2_model.device) + hidden_states = torch.as_tensor( + np.array(ct2_output), + dtype=torch.float32, + device=ct2_w2v2_model.device, + ) -encoder_outputs = BaseModelOutput(last_hidden_state=hidden_states, hidden_states=None, attentions=None) +encoder_outputs = BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=None, + attentions=None, +) hidden_states = encoder_outputs[0] outputs = Wav2Vec2BaseModelOutput( - last_hidden_state=hidden_states, - extract_features=extract_features, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, +) hidden_states = outputs[0] -#hidden_states = w2v2_model.dropout(hidden_states) # Dropout(p=0.0, inplace=False) bypassed +# hidden_states = w2v2_model.dropout(hidden_states) +# Dropout(p=0.0, inplace=False) bypassed with torch.no_grad(): logits = w2v2_model.lm_head(hidden_states.to(torch.float32))[0] predicted_ids = torch.argmax(logits, dim=-1) output = w2v2_processor.decode(predicted_ids, output_word_offsets=True) -print(output["text"]) # should be: I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT +print(output["text"]) +# should be: I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT From 4b28cc6232bdaad0b066720bfc83e732ba1d2545 Mon Sep 17 00:00:00 2001 From: hkwon Date: Tue, 24 Oct 2023 14:48:14 -0700 Subject: [PATCH 03/20] check isort and update --- python/ctranslate2/converters/transformers.py | 2 +- python/ctranslate2/models/__init__.py | 2 +- python/ctranslate2/specs/__init__.py | 2 +- python/tests/test_wav2vec2.py | 16 ++++++++-------- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index ba7750739..11aec45f8 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -22,8 +22,8 @@ common_spec, model_spec, transformer_spec, - whisper_spec, wav2vec2_spec, + whisper_spec, ) _SUPPORTED_ACTIVATIONS = { diff --git a/python/ctranslate2/models/__init__.py b/python/ctranslate2/models/__init__.py index 009c56787..aba612a5c 100644 --- a/python/ctranslate2/models/__init__.py +++ b/python/ctranslate2/models/__init__.py @@ -4,10 +4,10 @@ try: from ctranslate2._ext import ( + Wav2Vec2, Whisper, WhisperGenerationResult, WhisperGenerationResultAsync, - Wav2Vec2, ) except ImportError as e: # 
Allow using the Python package without the compiled extension. diff --git a/python/ctranslate2/specs/__init__.py b/python/ctranslate2/specs/__init__.py index 647696969..22552f5c9 100644 --- a/python/ctranslate2/specs/__init__.py +++ b/python/ctranslate2/specs/__init__.py @@ -13,5 +13,5 @@ TransformerEncoderSpec, TransformerSpec, ) -from ctranslate2.specs.whisper_spec import WhisperSpec from ctranslate2.specs.wav2vec2_spec import Wav2Vec2Spec +from ctranslate2.specs.whisper_spec import WhisperSpec diff --git a/python/tests/test_wav2vec2.py b/python/tests/test_wav2vec2.py index f357748d7..0553966d8 100644 --- a/python/tests/test_wav2vec2.py +++ b/python/tests/test_wav2vec2.py @@ -1,20 +1,20 @@ import os import os.path + +import librosa +import numpy as np import torch import torchaudio -import librosa -from transformers import ( - Wav2Vec2ForCTC, - Wav2Vec2Processor, -) + +from torchaudio.utils import download_asset +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor from transformers.modeling_outputs import ( + BaseModelOutput, CausalLMOutput, Wav2Vec2BaseModelOutput, - BaseModelOutput, ) + import ctranslate2 -from torchaudio.utils import download_asset -import numpy as np # Models Conversion & Preparation compute_type = "int8" From 525881693f580918e423b285d820a7a1658dbbb0 Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 25 Oct 2023 08:41:04 -0700 Subject: [PATCH 04/20] change ONEAPI_VERSION to 2023.2.0 --- python/tools/prepare_build_environment_linux.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tools/prepare_build_environment_linux.sh b/python/tools/prepare_build_environment_linux.sh index f1416295b..a4009e566 100755 --- a/python/tools/prepare_build_environment_linux.sh +++ b/python/tools/prepare_build_environment_linux.sh @@ -32,7 +32,7 @@ else libcublas-devel-11-2-11.4.1.1043-1 ln -s cuda-11.2 /usr/local/cuda - ONEAPI_VERSION=2023.0.0 + ONEAPI_VERSION=2023.2.0 yum-config-manager --add-repo https://yum.repos.intel.com/oneapi rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB yum install -y intel-oneapi-mkl-devel-$ONEAPI_VERSION From f9bfa1610967bf188b742e6482c04b00d62aed14 Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 25 Oct 2023 13:24:41 -0700 Subject: [PATCH 05/20] add missing package (librosa) for test_wav2vec2.py --- python/tests/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tests/requirements.txt b/python/tests/requirements.txt index 71c3382a6..a3e558414 100644 --- a/python/tests/requirements.txt +++ b/python/tests/requirements.txt @@ -5,3 +5,4 @@ OpenNMT-tf==2.30.* tensorflow-cpu==2.11.* pytest wurlitzer==3.0.*;platform_system=='Linux' +librosa From d2ff992142d8f591726fce9053b820b9de83a408 Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 25 Oct 2023 15:42:56 -0700 Subject: [PATCH 06/20] import package path update --- python/tests/test_wav2vec2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tests/test_wav2vec2.py b/python/tests/test_wav2vec2.py index 0553966d8..ee828fb03 100644 --- a/python/tests/test_wav2vec2.py +++ b/python/tests/test_wav2vec2.py @@ -7,7 +7,10 @@ import torchaudio from torchaudio.utils import download_asset -from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor +from transformers.models.wav2vec2 import ( + Wav2Vec2ForCTC, + Wav2Vec2Processor, +) from transformers.modeling_outputs import ( BaseModelOutput, CausalLMOutput, From 4ffaab95aa8e56cd59f16274bf846bbb43443341 Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 25 Oct 
2023 15:46:20 -0700 Subject: [PATCH 07/20] isort library update --- python/tests/test_wav2vec2.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/tests/test_wav2vec2.py b/python/tests/test_wav2vec2.py index ee828fb03..04642f854 100644 --- a/python/tests/test_wav2vec2.py +++ b/python/tests/test_wav2vec2.py @@ -7,15 +7,12 @@ import torchaudio from torchaudio.utils import download_asset -from transformers.models.wav2vec2 import ( - Wav2Vec2ForCTC, - Wav2Vec2Processor, -) from transformers.modeling_outputs import ( BaseModelOutput, CausalLMOutput, Wav2Vec2BaseModelOutput, ) +from transformers.models.wav2vec2 import Wav2Vec2ForCTC, Wav2Vec2Processor import ctranslate2 From 7078fd52bc9a68fdece6741b3c400197e567e56f Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 25 Oct 2023 19:59:44 -0700 Subject: [PATCH 08/20] update vocab return --- python/ctranslate2/converters/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 11aec45f8..0acc661c4 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -967,7 +967,7 @@ def set_config(self, config, model, tokenizer): return def get_vocabulary(self, model, tokenizer): - return tokenizer.vocab + return tokenizer.get_vocab() def set_vocabulary(self, spec, tokens): spec.register_vocabulary(tokens) From eeb92efc92f88bed9f7824efc996f8043e4b411e Mon Sep 17 00:00:00 2001 From: hkwon Date: Wed, 25 Oct 2023 22:11:15 -0700 Subject: [PATCH 09/20] add packages requirement for test_wav2vec2.py --- python/tests/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tests/requirements.txt b/python/tests/requirements.txt index a3e558414..0ec343fa3 100644 --- a/python/tests/requirements.txt +++ b/python/tests/requirements.txt @@ -6,3 +6,5 @@ tensorflow-cpu==2.11.* pytest wurlitzer==3.0.*;platform_system=='Linux' librosa +torch +torchaudio From 9af8b898f49d421be888848be459b18e929599c1 Mon Sep 17 00:00:00 2001 From: hkwon Date: Thu, 26 Oct 2023 10:03:36 -0700 Subject: [PATCH 10/20] merge test_wav2vec2.py to test_transformers.py for the compatibility --- python/tests/requirements.txt | 1 - python/tests/test_transformers.py | 125 +++++++++++++++++++++++++++ python/tests/test_wav2vec2.py | 136 ------------------------------ 3 files changed, 125 insertions(+), 137 deletions(-) delete mode 100644 python/tests/test_wav2vec2.py diff --git a/python/tests/requirements.txt b/python/tests/requirements.txt index 0ec343fa3..a05d4d39c 100644 --- a/python/tests/requirements.txt +++ b/python/tests/requirements.txt @@ -5,6 +5,5 @@ OpenNMT-tf==2.30.* tensorflow-cpu==2.11.* pytest wurlitzer==3.0.*;platform_system=='Linux' -librosa torch torchaudio diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index a34b752a8..318b1ea61 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -943,3 +943,128 @@ def test_transformers_whisper_include_tokenizer_json(self, tmp_dir): output_dir = str(tmp_dir.join("ctranslate2_model")) output_dir = converter.convert(output_dir) assert os.path.isfile(os.path.join(output_dir, "tokenizer.json")) + + +class TestWav2Vec2: + @classmethod + def teardown_class(cls): + clear_transformers_cache_in_ci() + + @test_utils.only_on_linux + @test_utils.on_available_devices + @pytest.mark.parametrize( + "model_name,audio_name,expected_transcription", + [ + ( + 
"facebook/wav2vec2-large-robust-ft-swbd-300h", + "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav", + "I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT" + ), + ], + ) + def test_transformers_wav2vec2( + self, + tmp_dir, + device, + model_name, + audio_name, + expected_transcription, + ): + import transformers + import torch + import torchaudio + from torchaudio.utils import download_asset + + converter = ctranslate2.converters.TransformersConverter(model_name,load_as_float16="int8") + output_dir = str(tmp_dir.join("ctranslate2_model")) + output_dir = converter.convert(output_dir) + # 24 x Wav2Vec2EncoderLayerStableLayerNorm converted & saved + + w2v2_model = transformers.Wav2Vec2ForCTC.from_pretrained(model_name) + del w2v2_model.wav2vec2.encoder.layers + del w2v2_model.wav2vec2.encoder.layer_norm + torch.save(w2v2_model, output_dir+"/wav2vec2_partial.bin") + w2v2_processor = transformers.Wav2Vec2Processor.from_pretrained(model_name) + torch.save(w2v2_processor, output_dir+"/wav2vec2_processor.bin") + + device = "cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu" + cpu_threads = int(os.environ.get("OMP_NUM_THREADS", 0)) + w2v2_model = torch.load(output_dir+"/wav2vec2_partial.bin").to(device) + w2v2_processor = torch.load(output_dir+"/wav2vec2_processor.bin") + ct2_w2v2_model = ctranslate2.models.Wav2Vec2( + output_dir, + device=device, + device_index=[0], + compute_type="int8", + intra_threads=cpu_threads, + inter_threads=1, + ) + + SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") + waveform, sampling_rate = torchaudio.load(SAMPLE_WAV) + speech_array = waveform[0].numpy() + input_values = w2v2_processor( + speech_array.astype(np.float32), + padding=True, + return_tensors="pt", + sampling_rate=resample_rate, + ).input_values + + with torch.no_grad(): + extract_features = w2v2_model.wav2vec2.feature_extractor( + input_values.to(w2v2_model.device) + ).transpose(1, 2) + hidden_states, extract_features = w2v2_model.wav2vec2.feature_projection( + extract_features + ) + position_embeddings = w2v2_model.wav2vec2.encoder.pos_conv_embed(hidden_states) + hidden_states = position_embeddings + hidden_states + # hidden_states = w2v2_model.encoder.dropout(hidden_states) + # Dropout(p=0.0, inplace=False) bypassed + + if ct2_w2v2_model.device == "cuda": + hidden_states = hidden_states.cpu() + else: + hidden_states.numpy() + + hidden_states = np.ascontiguousarray(hidden_states) + hidden_states = ctranslate2.StorageView.from_array(hidden_states) + to_cpu = ct2_w2v2_model.device == "cuda" and len(ct2_w2v2_model.device_index) > 1 + ct2_output = ct2_w2v2_model.encode( + hidden_states, + to_cpu=to_cpu, + ) # 24 x Wav2Vec2EncoderLayerStableLayerNorm processed + if ct2_w2v2_model.device == "cuda": + hidden_states = torch.as_tensor( + ct2_output, + device=ct2_w2v2_model.device, + ) + else: + hidden_states = torch.as_tensor( + np.array(ct2_output), + dtype=torch.float32, + device=ct2_w2v2_model.device, + ) + + encoder_outputs = transformers.modeling_outputs.BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=None, + attentions=None, + ) + hidden_states = encoder_outputs[0] + outputs = transformers.modeling_outputs.Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + hidden_states = outputs[0] + # hidden_states = w2v2_model.dropout(hidden_states) + # Dropout(p=0.0, inplace=False) bypassed + + with 
torch.no_grad(): + logits = w2v2_model.lm_head(hidden_states.to(torch.float32))[0] + + predicted_ids = torch.argmax(logits, dim=-1) + transcription = w2v2_processor.decode(predicted_ids, output_word_offsets=True)[0] + assert transcription == expected_transcription diff --git a/python/tests/test_wav2vec2.py b/python/tests/test_wav2vec2.py deleted file mode 100644 index 04642f854..000000000 --- a/python/tests/test_wav2vec2.py +++ /dev/null @@ -1,136 +0,0 @@ -import os -import os.path - -import librosa -import numpy as np -import torch -import torchaudio - -from torchaudio.utils import download_asset -from transformers.modeling_outputs import ( - BaseModelOutput, - CausalLMOutput, - Wav2Vec2BaseModelOutput, -) -from transformers.models.wav2vec2 import Wav2Vec2ForCTC, Wav2Vec2Processor - -import ctranslate2 - -# Models Conversion & Preparation -compute_type = "int8" -if not os.path.isfile("ctranslate2_model/model.bin"): - model_name = "facebook/wav2vec2-large-robust-ft-swbd-300h" - converter = ctranslate2.converters.TransformersConverter( - model_name, - load_as_float16=compute_type, - ) - output_dir = converter.convert("ctranslate2_model") -else: - output_dir = "ctranslate2_model" - -if not os.path.isfile("ctranslate2_model/wav2vec2_partial.bin"): - w2v2_model = Wav2Vec2ForCTC.from_pretrained(model_name) - del w2v2_model.wav2vec2.encoder.layers - del w2v2_model.wav2vec2.encoder.layer_norm - torch.save(w2v2_model, "ctranslate2_model/wav2vec2_partial.bin") - w2v2_processor = Wav2Vec2Processor.from_pretrained(model_name) - torch.save(w2v2_processor, "ctranslate2_model/wav2vec2_processor.bin") - - -# ASR inference -device = "cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu" -cpu_threads = int(os.environ.get("OMP_NUM_THREADS", 0)) - -w2v2_model = torch.load("ctranslate2_model/wav2vec2_partial.bin").to(device) -w2v2_processor = torch.load("ctranslate2_model/wav2vec2_processor.bin") - -SAMPLE_WAV = download_asset( - "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -) -resample_rate = 16000 -waveform, sampling_rate = torchaudio.load(SAMPLE_WAV) -if sampling_rate != resample_rate: - speech_array = librosa.resample( - waveform[0].numpy(), - orig_sr=sampling_rate, - target_sr=resample_rate, - ) -else: - speech_array = waveform[0].numpy() - -input_values = w2v2_processor( - speech_array.astype(np.float32), - padding=True, - return_tensors="pt", - sampling_rate=resample_rate, -).input_values - -with torch.no_grad(): - extract_features = w2v2_model.wav2vec2.feature_extractor( - input_values.to(w2v2_model.device) - ).transpose(1, 2) - hidden_states, extract_features = w2v2_model.wav2vec2.feature_projection( - extract_features - ) - position_embeddings = w2v2_model.wav2vec2.encoder.pos_conv_embed(hidden_states) - hidden_states = position_embeddings + hidden_states - # hidden_states = w2v2_model.encoder.dropout(hidden_states) - # Dropout(p=0.0, inplace=False) bypassed - -ct2_w2v2_model = ctranslate2.models.Wav2Vec2( - output_dir, - device=device, - device_index=[0], - compute_type=compute_type, - intra_threads=cpu_threads, - inter_threads=1, -) - -if ct2_w2v2_model.device == "cuda": - hidden_states = hidden_states.cpu() -else: - hidden_states.numpy() - -hidden_states = np.ascontiguousarray(hidden_states) -hidden_states = ctranslate2.StorageView.from_array(hidden_states) -to_cpu = ct2_w2v2_model.device == "cuda" and len(ct2_w2v2_model.device_index) > 1 -ct2_output = ct2_w2v2_model.encode( - hidden_states, - to_cpu=to_cpu, -) -# 24 x Wav2Vec2EncoderLayerStableLayerNorm processed -if 
ct2_w2v2_model.device == "cuda": - hidden_states = torch.as_tensor( - ct2_output, - device=ct2_w2v2_model.device, - ) -else: - hidden_states = torch.as_tensor( - np.array(ct2_output), - dtype=torch.float32, - device=ct2_w2v2_model.device, - ) - -encoder_outputs = BaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=None, - attentions=None, -) -hidden_states = encoder_outputs[0] -outputs = Wav2Vec2BaseModelOutput( - last_hidden_state=hidden_states, - extract_features=extract_features, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, -) -hidden_states = outputs[0] -# hidden_states = w2v2_model.dropout(hidden_states) -# Dropout(p=0.0, inplace=False) bypassed -with torch.no_grad(): - logits = w2v2_model.lm_head(hidden_states.to(torch.float32))[0] - -predicted_ids = torch.argmax(logits, dim=-1) -output = w2v2_processor.decode(predicted_ids, output_word_offsets=True) - -print(output["text"]) -# should be: I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT From d2b01ede756acb9f175817267e346a0bd51a51e3 Mon Sep 17 00:00:00 2001 From: hkwon Date: Thu, 26 Oct 2023 10:07:39 -0700 Subject: [PATCH 11/20] fix python style format --- python/tests/test_transformers.py | 39 ++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 318b1ea61..191916714 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -958,7 +958,7 @@ def teardown_class(cls): ( "facebook/wav2vec2-large-robust-ft-swbd-300h", "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav", - "I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT" + "I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT", ), ], ) @@ -970,12 +970,15 @@ def test_transformers_wav2vec2( audio_name, expected_transcription, ): - import transformers import torch import torchaudio + import transformers + from torchaudio.utils import download_asset - converter = ctranslate2.converters.TransformersConverter(model_name,load_as_float16="int8") + converter = ctranslate2.converters.TransformersConverter( + model_name, load_as_float16="int8" + ) output_dir = str(tmp_dir.join("ctranslate2_model")) output_dir = converter.convert(output_dir) # 24 x Wav2Vec2EncoderLayerStableLayerNorm converted & saved @@ -983,14 +986,14 @@ def test_transformers_wav2vec2( w2v2_model = transformers.Wav2Vec2ForCTC.from_pretrained(model_name) del w2v2_model.wav2vec2.encoder.layers del w2v2_model.wav2vec2.encoder.layer_norm - torch.save(w2v2_model, output_dir+"/wav2vec2_partial.bin") + torch.save(w2v2_model, output_dir + "/wav2vec2_partial.bin") w2v2_processor = transformers.Wav2Vec2Processor.from_pretrained(model_name) - torch.save(w2v2_processor, output_dir+"/wav2vec2_processor.bin") + torch.save(w2v2_processor, output_dir + "/wav2vec2_processor.bin") device = "cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu" cpu_threads = int(os.environ.get("OMP_NUM_THREADS", 0)) - w2v2_model = torch.load(output_dir+"/wav2vec2_partial.bin").to(device) - w2v2_processor = torch.load(output_dir+"/wav2vec2_processor.bin") + w2v2_model = torch.load(output_dir + "/wav2vec2_partial.bin").to(device) + w2v2_processor = torch.load(output_dir + "/wav2vec2_processor.bin") ct2_w2v2_model = ctranslate2.models.Wav2Vec2( output_dir, device=device, @@ -1000,24 +1003,28 @@ def test_transformers_wav2vec2( inter_threads=1, ) - SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") + SAMPLE_WAV = 
download_asset( + "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" + ) waveform, sampling_rate = torchaudio.load(SAMPLE_WAV) speech_array = waveform[0].numpy() input_values = w2v2_processor( speech_array.astype(np.float32), padding=True, return_tensors="pt", - sampling_rate=resample_rate, + sampling_rate=16000, ).input_values with torch.no_grad(): - extract_features = w2v2_model.wav2vec2.feature_extractor( + extract_features = w2v2_model.wav2vec2.feature_extractor( input_values.to(w2v2_model.device) ).transpose(1, 2) hidden_states, extract_features = w2v2_model.wav2vec2.feature_projection( extract_features ) - position_embeddings = w2v2_model.wav2vec2.encoder.pos_conv_embed(hidden_states) + position_embeddings = w2v2_model.wav2vec2.encoder.pos_conv_embed( + hidden_states + ) hidden_states = position_embeddings + hidden_states # hidden_states = w2v2_model.encoder.dropout(hidden_states) # Dropout(p=0.0, inplace=False) bypassed @@ -1029,11 +1036,13 @@ def test_transformers_wav2vec2( hidden_states = np.ascontiguousarray(hidden_states) hidden_states = ctranslate2.StorageView.from_array(hidden_states) - to_cpu = ct2_w2v2_model.device == "cuda" and len(ct2_w2v2_model.device_index) > 1 + to_cpu = ( + ct2_w2v2_model.device == "cuda" and len(ct2_w2v2_model.device_index) > 1 + ) ct2_output = ct2_w2v2_model.encode( hidden_states, to_cpu=to_cpu, - ) # 24 x Wav2Vec2EncoderLayerStableLayerNorm processed + ) # 24 x Wav2Vec2EncoderLayerStableLayerNorm processed if ct2_w2v2_model.device == "cuda": hidden_states = torch.as_tensor( ct2_output, @@ -1066,5 +1075,7 @@ def test_transformers_wav2vec2( logits = w2v2_model.lm_head(hidden_states.to(torch.float32))[0] predicted_ids = torch.argmax(logits, dim=-1) - transcription = w2v2_processor.decode(predicted_ids, output_word_offsets=True)[0] + transcription = w2v2_processor.decode(predicted_ids, output_word_offsets=True)[ + 0 + ] assert transcription == expected_transcription From 7e0dcdce3044e7eff28b2518b1b372030799f70f Mon Sep 17 00:00:00 2001 From: hkwon Date: Thu, 26 Oct 2023 13:19:29 -0700 Subject: [PATCH 12/20] update audio_name for TestWav2Vec2 --- python/tests/test_transformers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 191916714..ee753b8af 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -1003,10 +1003,7 @@ def test_transformers_wav2vec2( inter_threads=1, ) - SAMPLE_WAV = download_asset( - "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" - ) - waveform, sampling_rate = torchaudio.load(SAMPLE_WAV) + waveform, sampling_rate = torchaudio.load(download_asset(audio_name)) speech_array = waveform[0].numpy() input_values = w2v2_processor( speech_array.astype(np.float32), From 84039208c1aa46dba6aefc5aaa5b91a37ba2aff3 Mon Sep 17 00:00:00 2001 From: hkwon Date: Thu, 26 Oct 2023 15:28:08 -0700 Subject: [PATCH 13/20] change the audio downloading --- python/tests/test_transformers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index ee753b8af..01aea9a01 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -957,7 +957,7 @@ def teardown_class(cls): [ ( "facebook/wav2vec2-large-robust-ft-swbd-300h", - "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav", + 
"https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav", "I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT", ), ], @@ -1003,7 +1003,9 @@ def test_transformers_wav2vec2( inter_threads=1, ) - waveform, sampling_rate = torchaudio.load(download_asset(audio_name)) + #waveform, sampling_rate = torchaudio.load(download_asset(audio_name)) + with requests.get(audio_name, stream=True) as response: + waveform, sampling_rate = torchaudio.load(_hide_seek(response.raw)) speech_array = waveform[0].numpy() input_values = w2v2_processor( speech_array.astype(np.float32), From bf63c95e4c224df71eb6346855e20708438923dd Mon Sep 17 00:00:00 2001 From: hkwon Date: Thu, 26 Oct 2023 15:39:34 -0700 Subject: [PATCH 14/20] change the audio downloading --- python/tests/test_transformers.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 01aea9a01..c56fcf4f5 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -972,10 +972,9 @@ def test_transformers_wav2vec2( ): import torch import torchaudio + import requests import transformers - from torchaudio.utils import download_asset - converter = ctranslate2.converters.TransformersConverter( model_name, load_as_float16="int8" ) @@ -1003,9 +1002,9 @@ def test_transformers_wav2vec2( inter_threads=1, ) - #waveform, sampling_rate = torchaudio.load(download_asset(audio_name)) - with requests.get(audio_name, stream=True) as response: - waveform, sampling_rate = torchaudio.load(_hide_seek(response.raw)) + r = requests.get(audio_name, allow_redirects=True) + open(output_dir + "/test.wav", "wb").write(r.content) + waveform, sampling_rate = torchaudio.load(output_dir + "/test.wav") speech_array = waveform[0].numpy() input_values = w2v2_processor( speech_array.astype(np.float32), From dad4b2cf83f65164b0edbc85f934a9a5e866ae51 Mon Sep 17 00:00:00 2001 From: hkwon Date: Thu, 26 Oct 2023 15:45:01 -0700 Subject: [PATCH 15/20] change the audio downloading --- python/tests/test_transformers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index c56fcf4f5..417c1109b 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -957,7 +957,8 @@ def teardown_class(cls): [ ( "facebook/wav2vec2-large-robust-ft-swbd-300h", - "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav", + "https://download.pytorch.org/torchaudio/tutorial-assets\ +/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav", "I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT", ), ], @@ -970,9 +971,9 @@ def test_transformers_wav2vec2( audio_name, expected_transcription, ): + import requests import torch import torchaudio - import requests import transformers converter = ctranslate2.converters.TransformersConverter( From a9674ba704c754b5f875e48ecaff169cb9324f6e Mon Sep 17 00:00:00 2001 From: hkwon Date: Thu, 26 Oct 2023 16:00:33 -0700 Subject: [PATCH 16/20] add requests for test requirement --- python/tests/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tests/requirements.txt b/python/tests/requirements.txt index a05d4d39c..6ae7b719e 100644 --- a/python/tests/requirements.txt +++ b/python/tests/requirements.txt @@ -7,3 +7,4 @@ pytest wurlitzer==3.0.*;platform_system=='Linux' torch torchaudio +requests From 1e6aa47f756b5056339b60205f9107f2bdd56771 Mon Sep 17 
00:00:00 2001 From: hkwon Date: Thu, 26 Oct 2023 19:22:54 -0700 Subject: [PATCH 17/20] update audio file downloading --- python/tests/test_transformers.py | 12 +++++------- python/tools/prepare_test_environment.sh | 2 ++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 417c1109b..e57e6f2e7 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -953,12 +953,10 @@ def teardown_class(cls): @test_utils.only_on_linux @test_utils.on_available_devices @pytest.mark.parametrize( - "model_name,audio_name,expected_transcription", + "model_name,expected_transcription", [ ( "facebook/wav2vec2-large-robust-ft-swbd-300h", - "https://download.pytorch.org/torchaudio/tutorial-assets\ -/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav", "I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT", ), ], @@ -968,7 +966,6 @@ def test_transformers_wav2vec2( tmp_dir, device, model_name, - audio_name, expected_transcription, ): import requests @@ -1003,9 +1000,10 @@ def test_transformers_wav2vec2( inter_threads=1, ) - r = requests.get(audio_name, allow_redirects=True) - open(output_dir + "/test.wav", "wb").write(r.content) - waveform, sampling_rate = torchaudio.load(output_dir + "/test.wav") + data_dir = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "..", "..", "tests", "data" + ) + waveform, sampling_rate = torchaudio.load(data_dir + "/test.wav") speech_array = waveform[0].numpy() input_values = w2v2_processor( speech_array.astype(np.float32), diff --git a/python/tools/prepare_test_environment.sh b/python/tools/prepare_test_environment.sh index c2f516cd8..2d909bc5e 100755 --- a/python/tools/prepare_test_environment.sh +++ b/python/tools/prepare_test_environment.sh @@ -15,3 +15,5 @@ rm transliteration-aren-all.tar.gz curl -O https://object.pouta.csc.fi/OPUS-MT-models/en-de/opus-2020-02-26.zip unzip opus-2020-02-26.zip -d tests/data/models/opus-mt-ende rm opus-2020-02-26.zip + +curl https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav --output tests/data/test.wav From 478bde5ab7f00504100347070aba477ac1a7ab05 Mon Sep 17 00:00:00 2001 From: hkwon Date: Thu, 26 Oct 2023 22:07:29 -0700 Subject: [PATCH 18/20] update audio file downloading path --- python/tests/test_transformers.py | 6 ++---- python/tools/prepare_test_environment.sh | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index e57e6f2e7..132aefcd5 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -968,7 +968,6 @@ def test_transformers_wav2vec2( model_name, expected_transcription, ): - import requests import torch import torchaudio import transformers @@ -1000,10 +999,9 @@ def test_transformers_wav2vec2( inter_threads=1, ) - data_dir = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "..", "..", "tests", "data" + waveform, sampling_rate = torchaudio.load( + os.path.join(test_utils.get_data_dir(), "audio", "test.wav") ) - waveform, sampling_rate = torchaudio.load(data_dir + "/test.wav") speech_array = waveform[0].numpy() input_values = w2v2_processor( speech_array.astype(np.float32), diff --git a/python/tools/prepare_test_environment.sh b/python/tools/prepare_test_environment.sh index 2d909bc5e..5d9bd9210 100755 --- a/python/tools/prepare_test_environment.sh +++ b/python/tools/prepare_test_environment.sh @@ -16,4 +16,4 @@ curl -O 
https://object.pouta.csc.fi/OPUS-MT-models/en-de/opus-2020-02-26.zip unzip opus-2020-02-26.zip -d tests/data/models/opus-mt-ende rm opus-2020-02-26.zip -curl https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav --output tests/data/test.wav +curl https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav --output tests/data/audio/test.wav From 11f5ff17a5ac3cd7ae104f87d7622bdb3baff0be Mon Sep 17 00:00:00 2001 From: hkwon Date: Fri, 27 Oct 2023 09:48:44 -0700 Subject: [PATCH 19/20] switch audio to the existing one --- python/tests/requirements.txt | 2 -- python/tests/test_transformers.py | 20 ++++++++++---------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/python/tests/requirements.txt b/python/tests/requirements.txt index 6ae7b719e..f9cc04edf 100644 --- a/python/tests/requirements.txt +++ b/python/tests/requirements.txt @@ -6,5 +6,3 @@ tensorflow-cpu==2.11.* pytest wurlitzer==3.0.*;platform_system=='Linux' torch -torchaudio -requests diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 132aefcd5..d85299838 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -957,7 +957,10 @@ def teardown_class(cls): [ ( "facebook/wav2vec2-large-robust-ft-swbd-300h", - "I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT", + [ + "MISTER QUILTER IS THE APOSSEL OF THE MIDDLE CLASSES AND" + " WE ARE GLAD TO WELCOME HIS GOSPEL", + ], ), ], ) @@ -969,7 +972,6 @@ def test_transformers_wav2vec2( expected_transcription, ): import torch - import torchaudio import transformers converter = ctranslate2.converters.TransformersConverter( @@ -999,12 +1001,11 @@ def test_transformers_wav2vec2( inter_threads=1, ) - waveform, sampling_rate = torchaudio.load( - os.path.join(test_utils.get_data_dir(), "audio", "test.wav") + speech_array = np.load( + os.path.join(test_utils.get_data_dir(), "audio", "mr_quilter.npy") ) - speech_array = waveform[0].numpy() input_values = w2v2_processor( - speech_array.astype(np.float32), + speech_array, padding=True, return_tensors="pt", sampling_rate=16000, @@ -1070,7 +1071,6 @@ def test_transformers_wav2vec2( logits = w2v2_model.lm_head(hidden_states.to(torch.float32))[0] predicted_ids = torch.argmax(logits, dim=-1) - transcription = w2v2_processor.decode(predicted_ids, output_word_offsets=True)[ - 0 - ] - assert transcription == expected_transcription + transcription = w2v2_processor.decode(predicted_ids, output_word_offsets=True) + + assert transcription[0] == expected_transcription[0] From 437086983733dec9d58f32674cec85d06bb535b3 Mon Sep 17 00:00:00 2001 From: hkwon Date: Fri, 27 Oct 2023 12:04:06 -0700 Subject: [PATCH 20/20] remove unnecessary audio downloading --- python/tools/prepare_test_environment.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/tools/prepare_test_environment.sh b/python/tools/prepare_test_environment.sh index 5d9bd9210..c2f516cd8 100755 --- a/python/tools/prepare_test_environment.sh +++ b/python/tools/prepare_test_environment.sh @@ -15,5 +15,3 @@ rm transliteration-aren-all.tar.gz curl -O https://object.pouta.csc.fi/OPUS-MT-models/en-de/opus-2020-02-26.zip unzip opus-2020-02-26.zip -d tests/data/models/opus-mt-ende rm opus-2020-02-26.zip - -curl https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav --output tests/data/audio/test.wav
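
After the last patch, the test reads a pre-extracted waveform array (tests/data/audio/mr_quilter.npy) instead of downloading audio, and the curl step added earlier in the series is dropped again. For reference, the inference path exercised by test_transformers_wav2vec2 can also be reproduced as a standalone script. The sketch below is a minimal, approximate version of that flow and is not part of the patches: the Python class name ctranslate2.models.Wav2Vec2, the converted model directory "ct2_model", and the silent placeholder waveform are assumptions, and details such as device transfers and quantization options are omitted.

# Hypothetical end-to-end sketch: Transformers front-end + CTranslate2 encoder + CTC decode.
# Assumes the Hugging Face checkpoint was already converted with
# ctranslate2.converters.TransformersConverter into "ct2_model" (directory name is an assumption).
import ctranslate2
import numpy as np
import torch
import transformers

model_name = "facebook/wav2vec2-large-robust-ft-swbd-300h"
processor = transformers.Wav2Vec2Processor.from_pretrained(model_name)
w2v2 = transformers.Wav2Vec2ForCTC.from_pretrained(model_name)
encoder = ctranslate2.models.Wav2Vec2("ct2_model", device="cpu")  # assumed class name

speech = np.zeros(16000, dtype=np.float32)  # placeholder: 1 s of silence at 16 kHz
input_values = processor(
    speech, padding=True, sampling_rate=16000, return_tensors="pt"
).input_values

with torch.no_grad():
    # The convolutional front-end, feature projection, and positional convolution
    # stay in PyTorch; only the Transformer layer stack runs in CTranslate2.
    features = w2v2.wav2vec2.feature_extractor(input_values).transpose(1, 2)
    hidden, _ = w2v2.wav2vec2.feature_projection(features)
    hidden = hidden + w2v2.wav2vec2.encoder.pos_conv_embed(hidden)

storage = ctranslate2.StorageView.from_array(np.ascontiguousarray(hidden.numpy()))
encoded = encoder.encode(storage, to_cpu=False)
hidden = torch.as_tensor(np.array(encoded))

with torch.no_grad():
    logits = w2v2.lm_head(hidden.to(torch.float32))[0]

predicted_ids = torch.argmax(logits, dim=-1)
print(processor.decode(predicted_ids))

As in the test, the CTranslate2 encoder replaces the 24 Wav2Vec2EncoderLayerStableLayerNorm layers, while tokenization-free CTC decoding is still handled by the Transformers processor.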