Add 'run_batch' mode for GPU encoding and decoding with batch_size >= 1 #1534

Open · wants to merge 75 commits into base: main

Commits (75)
33910f8
add concurrent performance testing for websocket_server_main (non-str…
veelion Jul 20, 2022
90c1d75
Merge branch 'main' of https://github.com/wenet-e2e/wenet
veelion Jul 20, 2022
4d52c27
Merge branch 'main' of https://github.com/wenet-e2e/wenet
veelion Jul 22, 2022
4f573b5
Merge branch 'main' of https://github.com/wenet-e2e/wenet
veelion Jul 28, 2022
fda37b7
Merge branch 'main' of https://github.com/wenet-e2e/wenet
veelion Aug 2, 2022
3f199f5
add batch processing to decoder
veelion Aug 5, 2022
455c870
add api_batch_main
veelion Aug 5, 2022
a67bc93
add BatchRecognizer to api
veelion Aug 5, 2022
44501ae
add batch_model pointer to DecodeResource
veelion Aug 5, 2022
7aabd7b
add batch processing source to decoder_srcs
veelion Aug 5, 2022
7a28701
jit export forward_encoder_batch()
veelion Aug 5, 2022
19114ee
add batch processing to Python binding
veelion Aug 5, 2022
7fe373a
Merge branch 'main' of https://github.com/wenet-e2e/wenet into batch
veelion Aug 10, 2022
0fd6c89
before change attention-scoring
veelion Aug 17, 2022
44392e6
add multi-threads for computing fbank, ctc searching
veelion Aug 18, 2022
e1e597e
to call jit script which supports batch_forward_attention_decoder()
veelion Aug 18, 2022
c25afd8
add run_batch flag to support BatchTorchAsrModel
veelion Aug 18, 2022
63c42f4
replace UpdateResult with decoder's get_batch_result()
veelion Aug 18, 2022
238fc8e
add FLAGS_enable_timestamp
veelion Aug 18, 2022
3576cb4
add FLAGS_run_batch for running batch decoding
veelion Aug 18, 2022
c3a17b1
fix: https://github.com/nbsdx/SimpleJSON/issues/4
veelion Aug 18, 2022
7adcd74
support run_batch
veelion Aug 18, 2022
a3045fc
add batch_connection_handler.h
veelion Aug 19, 2022
7bc634a
jit export batch_forward_attention_decoder()
veelion Aug 19, 2022
35e8d1a
add to decoder_srcs with batch_torch_asr_model.cc, batch_onnx_asr_mod…
veelion Aug 25, 2022
d172dd3
remove log msg
veelion Aug 25, 2022
d710a54
add is_fp16_
veelion Aug 25, 2022
1329090
add is_fp16 to Read()
veelion Aug 25, 2022
3aa76c7
add BatchOnnxAsrModel on GPU
veelion Aug 25, 2022
816a13a
add Yaml reader
veelion Aug 25, 2022
72d979b
Merge branch 'wenet-e2e:main' into main
veelion Aug 26, 2022
85e5ac5
export onnx gpu model for c++ runtime
veelion Aug 29, 2022
e9ffa53
improve memory managing if is_fp16
veelion Aug 30, 2022
5cb3c07
replace Eigen::half with <immintrin.h>
veelion Aug 30, 2022
48602b7
let model calculate attention score
veelion Aug 31, 2022
a71815d
let decoder return score
veelion Aug 31, 2022
2bfc65c
let attention decoder return score
veelion Aug 31, 2022
0544b95
remove lock if only one wav
veelion Sep 1, 2022
ed673b0
add gpu_id flag for BatchOnnxAsrModel
veelion Sep 1, 2022
d9d676c
add gpu_id flag for BatchOnnxAsrModel
veelion Sep 1, 2022
6afce87
fix memory issue of CreateTensor()
veelion Sep 1, 2022
8ea5a3e
add decoder_main_batch
veelion Sep 1, 2022
b3d82aa
re-link
veelion Sep 1, 2022
b6d5cdf
config cudnn_conv1d_pad_to_nc1d
veelion Sep 1, 2022
38e983d
use encoder's out : beam_log_probs/index (topk) to ctc_search, which …
veelion Sep 1, 2022
fcacb20
make ForwardEncoder() output topk
veelion Sep 2, 2022
5b28502
make batch_forward_encoder() return topk ctc_log_probs
veelion Sep 2, 2022
2ba32c3
only emptyCache() if USE_GPU
veelion Oct 21, 2022
e2259db
support GPU
veelion Oct 25, 2022
457d9b0
add more pytorch version
veelion Oct 25, 2022
14ffae6
save eos, sos to onnx_config for onnxruntime of C++
veelion Oct 25, 2022
1e9faaf
transformer decoder has no 'reverse_weight' in config
veelion Oct 25, 2022
a73b792
fix rescore_inputs
veelion Oct 25, 2022
65eb608
release GPU memory
veelion Oct 26, 2022
7d1700a
add onnx_version 1.13.1
veelion Nov 3, 2022
14d6cd5
replace GetInputName() with GetInputNameAllocated(), because GetInputN…
veelion Nov 3, 2022
65a88f9
Merge branch 'main' of https://github.com/wenet-e2e/wenet
veelion Nov 3, 2022
cf50ad0
Merge branch 'wenet-e2e:main' into main
veelion Nov 3, 2022
c64906a
merge
veelion Nov 3, 2022
4ae1c65
add description of 'run_batch' mode
veelion Nov 3, 2022
a0fb171
Merge run_batch mode to main branch
veelion Nov 3, 2022
068e4a7
fix batch_size
veelion Nov 3, 2022
aa1ac47
notes for a little bigger CER
veelion Nov 3, 2022
ba93bd9
remove trailing whitespace
veelion Nov 3, 2022
bbc7c15
fix flake8 error
veelion Nov 3, 2022
fb9e436
fix cpplint error
veelion Nov 4, 2022
cd85c84
fix flake8 error
veelion Nov 4, 2022
e8a0a24
pytorch version back to 1.10.0
veelion Nov 4, 2022
1e2af87
change reference to pointer of non-const object
veelion Nov 4, 2022
acea6da
fix github action build error
veelion Nov 4, 2022
8a7ac0a
Merge branch 'main' into main
veelion Nov 4, 2022
b61fae1
Merge branch 'main' of https://github.com/wenet-e2e/wenet
veelion Nov 7, 2022
8fda52a
supported GPU-compute feature(fbank) by kaldifeat
veelion Nov 30, 2022
e0b4e42
add fbank_cuda.h
veelion Nov 30, 2022
f3e2aee
Merge branch 'main' into vee-main
veelion Nov 30, 2022
11 changes: 11 additions & 0 deletions runtime/binding/python/cpp/binding.cc
@@ -1,4 +1,5 @@
// Copyright (c) 2022 Binbin Zhang([email protected])
// 2022 SoundDataConverge Co.LTD (Weiliang Chong)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -13,8 +14,10 @@
// limitations under the License.

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include "api/wenet_api.h"
#include "api/batch_recognizer.h"

namespace py = pybind11;

@@ -37,4 +40,12 @@ PYBIND11_MODULE(_wenet, m) {
m.def("wenet_set_language", &wenet_set_language, "set language");
m.def("wenet_set_continuous_decoding", &wenet_set_continuous_decoding,
"enable continuous decoding or not");
py::class_<BatchRecognizer>(m, "BatchRecognizer")
.def(py::init<const char*>())
.def("set_enable_timestamp", &BatchRecognizer::set_enable_timestamp)
.def("AddContext", &BatchRecognizer::AddContext)
.def("set_context_score", &BatchRecognizer::set_context_score)
.def("set_language", &BatchRecognizer::set_language)
.def("DecodeData", &BatchRecognizer::DecodeData)
.def("Decode", &BatchRecognizer::Decode);
}
1 change: 1 addition & 0 deletions runtime/binding/python/py/__init__.py
@@ -1,2 +1,3 @@
from .decoder import Decoder # noqa
from .batch_decoder import BatchDecoder # noqa
from _wenet import wenet_set_log_level as set_log_level # noqa
79 changes: 79 additions & 0 deletions runtime/binding/python/py/batch_decoder.py
@@ -0,0 +1,79 @@
# Copyright (c) 2022 Binbin Zhang([email protected])
# 2022 SoundDataConverge Co.LTD (Weiliang Chong)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional

import _wenet

from .hub import Hub


class BatchDecoder:

def __init__(self,
model_dir: Optional[str] = None,
lang: str = 'chs',
nbest: int = 1,
enable_timestamp: bool = False,
context: Optional[List[str]] = None,
context_score: float = 3.0):
""" Init WeNet decoder
Args:
lang: language type of the model
nbest: nbest number for the final result
enable_timestamp: whether to enable word level timestamp
for the final result
context: context words
context_score: bonus score when the context is matched
"""
if model_dir is None:
model_dir = Hub.get_model_by_lang(lang)

self.d = _wenet.BatchRecognizer(model_dir)

self.set_language(lang)
self.enable_timestamp(enable_timestamp)
if context is not None:
self.add_context(context)
self.set_context_score(context_score)

def __del__(self):
del self.d

def enable_timestamp(self, flag: bool):
tag = 1 if flag else 0
self.d.set_enable_timestamp(tag)

def add_context(self, contexts: List[str]):
for c in contexts:
assert isinstance(c, str)
self.d.AddContext(c)

def set_context_score(self, score: float):
self.d.set_context_score(score)

def set_language(self, lang: str):
assert lang in ['chs', 'en']
self.d.set_language(lang)

def decode(self, pcms: List[bytes]) -> str:
""" Decode the input data

Args:
pcms: a list of wav pcm
"""
assert isinstance(pcms[0], bytes)
result = self.d.Decode(pcms)
return result
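
For illustration, a minimal usage sketch of the new BatchDecoder wrapper. The package import name, model directory, and wav file names below are assumptions, not part of this diff; decode() expects one 16 kHz, 16-bit mono PCM byte string per utterance.

# Hypothetical usage of BatchDecoder; import name and paths are assumptions.
import wave

import wenetruntime as wenet  # assumed package name for the runtime binding

def read_pcm(path: str) -> bytes:
    # Return raw 16 kHz / 16-bit mono PCM frames (no WAV header).
    with wave.open(path, 'rb') as w:
        assert w.getframerate() == 16000 and w.getsampwidth() == 2
        return w.readframes(w.getnframes())

decoder = wenet.BatchDecoder(model_dir='/path/to/model_dir',
                             enable_timestamp=True)
pcms = [read_pcm('utt1.wav'), read_pcm('utt2.wav')]
print(decoder.decode(pcms))  # result string for the whole batch
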
148 changes: 148 additions & 0 deletions runtime/core/api/batch_recognizer.h
@@ -0,0 +1,148 @@
// Copyright (c) 2022 Binbin Zhang ([email protected])
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef API_BATCH_RECOGNIZER_H_
#define API_BATCH_RECOGNIZER_H_

#include <memory>
#include <string>
#include <vector>
#include <utility>

#include "decoder/asr_decoder.h"
#include "decoder/batch_asr_decoder.h"
#include "decoder/batch_torch_asr_model.h"
#include "post_processor/post_processor.h"
#include "utils/file.h"
#include "utils/json.h"
#include "utils/string.h"

class BatchRecognizer {
public:
explicit BatchRecognizer(const std::string& model_dir, int num_threads = 1) {
// FeaturePipeline init
feature_config_ = std::make_shared<wenet::FeaturePipelineConfig>(80, 16000);
// Resource init
resource_ = std::make_shared<wenet::DecodeResource>();
wenet::BatchTorchAsrModel::InitEngineThreads(num_threads);
std::string model_path = wenet::JoinPath(model_dir, "final.zip");
CHECK(wenet::FileExists(model_path));

auto model = std::make_shared<wenet::BatchTorchAsrModel>();
model->Read(model_path);
resource_->batch_model = model;

// units.txt: E2E model unit
std::string unit_path = wenet::JoinPath(model_dir, "units.txt");
CHECK(wenet::FileExists(unit_path));
resource_->unit_table = std::shared_ptr<fst::SymbolTable>(
fst::SymbolTable::ReadText(unit_path));

std::string fst_path = wenet::JoinPath(model_dir, "TLG.fst");
if (wenet::FileExists(fst_path)) { // With LM
resource_->fst = std::shared_ptr<fst::Fst<fst::StdArc>>(
fst::Fst<fst::StdArc>::Read(fst_path));

std::string symbol_path = wenet::JoinPath(model_dir, "words.txt");
CHECK(wenet::FileExists(symbol_path));
resource_->symbol_table = std::shared_ptr<fst::SymbolTable>(
fst::SymbolTable::ReadText(symbol_path));
} else { // Without LM, symbol_table is the same as unit_table
resource_->symbol_table = resource_->unit_table;
}

// Context config init
context_config_ = std::make_shared<wenet::ContextConfig>();
decode_options_ = std::make_shared<wenet::DecodeOptions>();
post_process_opts_ = std::make_shared<wenet::PostProcessOptions>();
}

void InitDecoder() {
CHECK(decoder_ == nullptr);
// Optional init context graph
if (context_.size() > 0) {
context_config_->context_score = context_score_;
auto context_graph =
std::make_shared<wenet::ContextGraph>(*context_config_);
context_graph->BuildContextGraph(context_, resource_->symbol_table);
resource_->context_graph = context_graph;
}
// PostProcessor
if (language_ == "chs") { // TODO(Binbin Zhang): CJK(chs, jp, kr)
post_process_opts_->language_type = wenet::kMandarinEnglish;
} else {
post_process_opts_->language_type = wenet::kIndoEuropean;
}
resource_->post_processor =
std::make_shared<wenet::PostProcessor>(*post_process_opts_);
// Init decoder
decoder_ = std::make_shared<wenet::BatchAsrDecoder>(
feature_config_, resource_,
*decode_options_);
}

std::string Decode(const std::vector<std::string>& wavs) {
// Init decoder when it is called first time
if (decoder_ == nullptr) {
InitDecoder();
}
std::vector<std::vector<float>> wavs_float;
for (auto& wav : wavs) {
const int16_t* pcm = reinterpret_cast<const int16_t*>(wav.data());
int pcm_len = wav.size() / sizeof(int16_t);
std::vector<float> wav_float(pcm_len);
for (int i = 0; i < pcm_len; i++) {
wav_float[i] = static_cast<float>(*(pcm + i));
}
wavs_float.push_back(std::move(wav_float));
}
decoder_->Reset();
decoder_->Decode(wavs_float);
return decoder_->get_batch_result(nbest_, enable_timestamp_);
}

std::string DecodeData(const std::vector<std::vector<float>>& wavs) {
// Init decoder when it is called first time
if (decoder_ == nullptr) {
InitDecoder();
}
decoder_->Reset();
decoder_->Decode(wavs);
return decoder_->get_batch_result(nbest_, enable_timestamp_);
}

void set_nbest(int n) { nbest_ = n; }
void set_enable_timestamp(bool flag) { enable_timestamp_ = flag; }
void AddContext(const char* word) { context_.emplace_back(word); }
void set_context_score(float score) { context_score_ = score; }
void set_language(const char* lang) { language_ = lang; }

private:
std::shared_ptr<wenet::FeaturePipelineConfig> feature_config_ = nullptr;
std::shared_ptr<wenet::DecodeResource> resource_ = nullptr;
std::shared_ptr<wenet::DecodeOptions> decode_options_ = nullptr;
std::shared_ptr<wenet::BatchAsrDecoder> decoder_ = nullptr;
std::shared_ptr<wenet::ContextConfig> context_config_ = nullptr;
std::shared_ptr<wenet::PostProcessOptions> post_process_opts_ = nullptr;

int nbest_ = 1;
bool enable_timestamp_ = false;
std::vector<std::string> context_;
float context_score_ = 3.0;
std::string language_ = "chs";
};

#endif // API_BATCH_RECOGNIZER_H_
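
The header offers two entry points: Decode() takes raw 16-bit PCM byte strings and casts each sample to float internally (no 1/32768 scaling), while DecodeData() accepts float sample vectors directly. A hedged sketch of the float path through the Python binding; the model path is a placeholder and the samples are silence.

# Hypothetical use of DecodeData() via the pybind11 binding above.
import _wenet

r = _wenet.BatchRecognizer('/path/to/model_dir')  # placeholder path
r.set_language('chs')
# Samples are plain int16 values cast to float, matching Decode() above.
utt = [0.0] * 16000  # one second of 16 kHz silence
print(r.DecodeData([utt, utt]))  # batch of two utterances
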
3 changes: 3 additions & 0 deletions runtime/core/bin/CMakeLists.txt
@@ -1,6 +1,9 @@
add_executable(decoder_main decoder_main.cc)
target_link_libraries(decoder_main PUBLIC decoder)

add_executable(decoder_main_batch decoder_main_batch.cc)
target_link_libraries(decoder_main_batch PUBLIC decoder)

add_executable(label_checker_main label_checker_main.cc)
target_link_libraries(label_checker_main PUBLIC decoder)

51 changes: 51 additions & 0 deletions runtime/core/bin/api_batch_main.cc
@@ -0,0 +1,51 @@
// Copyright (c) 2022 Binbin Zhang ([email protected])
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "api/batch_recognizer.h"
#include "api/wenet_api.h"
#include "frontend/wav.h"
#include "utils/flags.h"
#include "utils/timer.h"

DEFINE_string(model_dir, "", "model dir path");
DEFINE_string(wav_path, "", "single wave path");
DEFINE_int32(batch_size, 1, "batch size of input");
DEFINE_int32(num_threads, 1, "number threads of intraop");
DEFINE_bool(enable_timestamp, false, "enable timestamps");

int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);

wenet_set_log_level(2);

BatchRecognizer br(FLAGS_model_dir, FLAGS_num_threads);
if (FLAGS_enable_timestamp) br.set_enable_timestamp(true);
wenet::WavReader wav_reader(FLAGS_wav_path);
std::vector<float> data;
data.insert(
data.end(), wav_reader.data(),
wav_reader.data() + wav_reader.num_samples());
std::vector<std::vector<float>> wavs;
for (int i = 0; i < FLAGS_batch_size - 1; i++) {
wavs.push_back(data);
}
wavs.push_back(std::move(data));
wenet::Timer timer;
std::string result = br.DecodeData(wavs);
int forward_time = timer.Elapsed();
VLOG(1) << "Decode() takes " << forward_time << " ms";
LOG(INFO) << result;
return 0;
}
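
A hedged invocation sketch for this tool; the binary location and input paths are assumptions, while the flag names match the DEFINE_* declarations above.

# Hypothetical driver script; binary and file locations are assumptions.
import subprocess

subprocess.run([
    './build/bin/api_batch_main',      # assumed build output path
    '--model_dir=/path/to/model_dir',
    '--wav_path=/path/to/test.wav',
    '--batch_size=8',                  # the wav is duplicated to fill the batch
    '--num_threads=1',
    '--enable_timestamp=true',
], check=True)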