forked from mozilla/DeepSpeech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeepspeech.h
133 lines (120 loc) · 5.36 KB
/
deepspeech.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/* Identifiers beginning with a double underscore are reserved for the
 * implementation, so the guard macro must not be named __DEEPSPEECH_H__. */
#ifndef DEEPSPEECH_H
#define DEEPSPEECH_H

#include <cstddef>

namespace DeepSpeech
{

// Opaque implementation type (pimpl idiom); defined in the .cc file.
class Private;

class Model {
  private:
    Private* mPriv;

    /**
     * @brief Perform decoding of the logits, using basic CTC decoder or
     *        CTC decoder with KenLM enabled
     *
     * @param aNFrames Number of timesteps to deal with
     * @param aLogits Matrix of logits, of dimensions:
     *                [timesteps][batch_size][num_classes]
     *
     * @return String representing the decoded text.
     */
    char* decode(int aNFrames, float*** aLogits);

  public:
    /**
     * @brief An object providing an interface to a trained DeepSpeech model.
     *
     * @param aModelPath The path to the frozen model graph.
     * @param aNCep The number of cepstrum the model was trained with.
     * @param aNContext The context window the model was trained with.
     * @param aAlphabetConfigPath The path to the configuration file specifying
     *                            the alphabet used by the network. See alphabet.h.
     * @param aBeamWidth The beam width used by the decoder. A larger beam
     *                   width generates better results at the cost of decoding
     *                   time.
     */
    Model(const char* aModelPath, int aNCep, int aNContext,
          const char* aAlphabetConfigPath, int aBeamWidth);

    /**
     * @brief Frees associated resources and destroys model object.
     */
    ~Model();

    /**
     * @brief Enable decoding using beam scoring with a KenLM language model.
     *
     * @param aAlphabetConfigPath The path to the configuration file specifying
     *                            the alphabet used by the network. See alphabet.h.
     * @param aLMPath The path to the language model binary file.
     * @param aTriePath The path to the trie file build from the same vocabu-
     *                  lary as the language model binary.
     * @param aLMWeight The weight to give to language model results when sco-
     *                  ring.
     * @param aWordCountWeight The weight (penalty) to give to beams when in-
     *                         creasing the word count of the decoding.
     * @param aValidWordCountWeight The weight (bonus) to give to beams when
     *                              adding a new valid word to the decoding.
     */
    void enableDecoderWithLM(const char* aAlphabetConfigPath,
                             const char* aLMPath, const char* aTriePath,
                             float aLMWeight,
                             float aWordCountWeight,
                             float aValidWordCountWeight);

    /**
     * @brief Given audio, return a vector suitable for input to the
     *        DeepSpeech model.
     *
     * Extracts MFCC features from a given audio signal and adds the
     * appropriate amount of context to run inference on the DeepSpeech model.
     * This is equivalent to calling audioToInputVector() with the model's
     * cepstrum and context window.
     *
     * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
     *                sample rate.
     * @param aBufferSize The sample-length of the audio signal.
     * @param aSampleRate The sample-rate of the audio signal.
     * @param[out] aMfcc An array containing features, of shape
     *                   (@p aNFrames, ncep * ncontext). The user is
     *                   responsible for freeing the array.
     * @param[out] aNFrames (optional) The number of frames in @p aMfcc.
     * @param[out] aFrameLen (optional) The length of each frame
     *                       (ncep * ncontext) in @p aMfcc.
     */
    void getInputVector(const short* aBuffer,
                        unsigned int aBufferSize,
                        int aSampleRate,
                        float** aMfcc,
                        int* aNFrames = NULL,
                        int* aFrameLen = NULL);

    /**
     * @brief Run inference on the given audio.
     *
     * Runs inference on the given input vector with the model.
     * See getInputVector().
     *
     * @param aMfcc MFCC features with the appropriate amount of context per
     *              frame.
     * @param aNFrames The number of frames in @p aMfcc.
     * @param aFrameLen (optional) The length of each frame in @p aMfcc. If
     *                  specified, this will be used to verify the array is
     *                  large enough.
     *
     * @return The resulting string after running inference. The user is
     *         responsible for freeing this string.
     */
    char* infer(float* aMfcc,
                int aNFrames,
                int aFrameLen = 0);

    /**
     * @brief Use the DeepSpeech model to perform Speech-To-Text.
     *
     * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
     *                sample rate.
     * @param aBufferSize The number of samples in the audio signal.
     * @param aSampleRate The sample-rate of the audio signal.
     *
     * @return The STT result. The user is responsible for freeing the string.
     */
    char* stt(const short* aBuffer,
              unsigned int aBufferSize,
              int aSampleRate);
};

}

#endif /* DEEPSPEECH_H */