import time
import azure.cognitiveservices.speech as speechsdk
import keyboard
import os


class SpeechToTextManager:
    azure_speechconfig = None
    azure_audioconfig = None
    azure_speechrecognizer = None

    def __init__(self):
        # Creates an instance of a speech config with the specified subscription key and service region.
        # Replace with your own subscription key and service region (e.g., "westus").
        try:
            self.azure_speechconfig = speechsdk.SpeechConfig(subscription=os.getenv('AZURE_TTS_KEY'), region=os.getenv('AZURE_TTS_REGION'))
        except TypeError:
            exit("Ooops! You forgot to set AZURE_TTS_KEY or AZURE_TTS_REGION in your environment!")
        self.azure_speechconfig.speech_recognition_language = "en-US"
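
    # The two environment variables above are assumed to hold an Azure Speech resource key and its
    # region. A minimal (hypothetical) setup before running this module might look like:
    #   Windows (PowerShell):  $env:AZURE_TTS_KEY = "<your-key>"; $env:AZURE_TTS_REGION = "eastus"
    #   Linux/macOS (bash):    export AZURE_TTS_KEY="<your-key>" AZURE_TTS_REGION="eastus"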

    def speechtotext_from_mic(self):
        self.azure_audioconfig = speechsdk.audio.AudioConfig(use_default_microphone=True)
        self.azure_speechrecognizer = speechsdk.SpeechRecognizer(speech_config=self.azure_speechconfig, audio_config=self.azure_audioconfig)

        print("Speak into your microphone.")
        speech_recognition_result = self.azure_speechrecognizer.recognize_once_async().get()
        text_result = speech_recognition_result.text

        if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print("Recognized: {}".format(speech_recognition_result.text))
        elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
        elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = speech_recognition_result.cancellation_details
            print("Speech Recognition canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
                print("Did you set the speech resource key and region values?")

        print(f"We got the following text: {text_result}")
        return text_result
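
    # Note: recognize_once_async() captures a single utterance and stops at the first stretch of
    # silence, so this method is only suited to short prompts. Longer dictation should use the
    # continuous-recognition methods below.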

    def speechtotext_from_file(self, filename):
        self.azure_audioconfig = speechsdk.AudioConfig(filename=filename)
        self.azure_speechrecognizer = speechsdk.SpeechRecognizer(speech_config=self.azure_speechconfig, audio_config=self.azure_audioconfig)

        print("Listening to the file \n")
        speech_recognition_result = self.azure_speechrecognizer.recognize_once_async().get()

        if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print("Recognized: \n {}".format(speech_recognition_result.text))
        elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
        elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = speech_recognition_result.cancellation_details
            print("Speech Recognition canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
                print("Did you set the speech resource key and region values?")

        return speech_recognition_result.text
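
    # Note: AudioConfig(filename=...) is assumed here to point at a WAV file; the Speech SDK reads
    # WAV/PCM input by default, and compressed formats generally need extra stream configuration.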

    def speechtotext_from_file_continuous(self, filename):
        self.azure_audioconfig = speechsdk.audio.AudioConfig(filename=filename)
        self.azure_speechrecognizer = speechsdk.SpeechRecognizer(speech_config=self.azure_speechconfig, audio_config=self.azure_audioconfig)

        done = False

        def stop_cb(evt):
            print('CLOSING on {}'.format(evt))
            nonlocal done
            done = True

        # These are optional event callbacks that just print out when an event happens.
        # Recognized is useful as an update when a full chunk of speech has finished processing.
        #self.azure_speechrecognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
        self.azure_speechrecognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
        self.azure_speechrecognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
        self.azure_speechrecognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
        self.azure_speechrecognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))

        # These callbacks stop the wait loop below by flipping the "done" flag when the session is either stopped or canceled.
        self.azure_speechrecognizer.session_stopped.connect(stop_cb)
        self.azure_speechrecognizer.canceled.connect(stop_cb)

        # This is where we compile the results we receive from the ongoing "Recognized" events.
        all_results = []

        def handle_final_result(evt):
            all_results.append(evt.result.text)

        self.azure_speechrecognizer.recognized.connect(handle_final_result)

        # Start processing the file.
        print("Now processing the audio file...")
        self.azure_speechrecognizer.start_continuous_recognition()

        # Wait until stop_cb() has been called above, meaning the session either stopped or was canceled.
        while not done:
            time.sleep(.5)

        # Now that we're done, tell the recognizer to end the session.
        # NOTE: THIS NEEDS TO BE OUTSIDE OF THE stop_cb FUNCTION. If it's inside that function the program just freezes. Not sure why.
        self.azure_speechrecognizer.stop_continuous_recognition()

        final_result = " ".join(all_results).strip()
        print(f"\n\nHere's the result we got from the continuous file read!\n\n{final_result}\n\n")
        return final_result
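
    # A minimal usage sketch (hypothetical file path), assuming the environment variables above are set:
    #   manager = SpeechToTextManager()
    #   transcript = manager.speechtotext_from_file_continuous("C:/recordings/meeting.wav")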

    def speechtotext_from_mic_continuous(self, stop_key='p'):
        self.azure_speechrecognizer = speechsdk.SpeechRecognizer(speech_config=self.azure_speechconfig)

        done = False

        # Optional callback that prints whenever a chunk of speech is being recognized. This fires roughly every word.
        def recognizing_cb(evt: speechsdk.SpeechRecognitionEventArgs):
            print('RECOGNIZING: {}'.format(evt))
        self.azure_speechrecognizer.recognizing.connect(recognizing_cb)

        # Optional callback that prints whenever a chunk of speech has finished being recognized.
        # Make sure to let this finish before ending the speech recognition.
        def recognized_cb(evt: speechsdk.SpeechRecognitionEventArgs):
            print('RECOGNIZED: {}'.format(evt))
        self.azure_speechrecognizer.recognized.connect(recognized_cb)

        # We register this to fire if we get a session_stopped or canceled event.
        def stop_cb(evt: speechsdk.SessionEventArgs):
            print('CLOSING speech recognition on {}'.format(evt))
            nonlocal done
            done = True

        # Connect callbacks to the events fired by the speech recognizer.
        self.azure_speechrecognizer.session_stopped.connect(stop_cb)
        self.azure_speechrecognizer.canceled.connect(stop_cb)

        # This is where we compile the results we receive from the ongoing "Recognized" events.
        all_results = []

        def handle_final_result(evt):
            all_results.append(evt.result.text)
        self.azure_speechrecognizer.recognized.connect(handle_final_result)

        # Perform recognition. start_continuous_recognition_async() asynchronously initiates the continuous recognition operation.
        # Other tasks can be performed on this thread while recognition starts;
        # wait on result_future.get() to know when initialization is done.
        # Call stop_continuous_recognition_async() to stop recognition.
        result_future = self.azure_speechrecognizer.start_continuous_recognition_async()
        result_future.get()  # Wait for the void future, so we know engine initialization is done.
        print('Continuous Speech Recognition is now running, say something.')

        while not done:
            # METHOD 1 - Press the stop key. This is 'p' by default, but the caller can provide a different key.
            # (The keyboard module hooks global key events; on Linux it typically needs root privileges.)
            if keyboard.read_key() == stop_key:
                print("\nEnding azure speech recognition\n")
                self.azure_speechrecognizer.stop_continuous_recognition_async()
                break

            # METHOD 2 - User must type "stop" into the cmd window.
            #print('type "stop" then enter when done')
            #stop = input()
            #if (stop.lower() == "stop"):
            #    print('Stopping async recognition.')
            #    self.azure_speechrecognizer.stop_continuous_recognition_async()
            #    break

            # Other methods: https://stackoverflow.com/a/57644349
            # No real parallel work to do on this thread, so just wait for the user to give the signal to stop.
            # Can't exit this function early or speech_recognizer will go out of scope and be destroyed while running.

        final_result = " ".join(all_results).strip()
        print(f"\n\nHere's the result we got!\n\n{final_result}\n\n")
        return final_result


# Tests
if __name__ == '__main__':
    TEST_FILE = r"D:\Video Editing\Misc - Ai teaches me to pass History Exam\Audio\Misc - Ai teaches me to pass History Exam - VO 1.wav"

    speechtotext_manager = SpeechToTextManager()

    while True:
        #speechtotext_manager.speechtotext_from_mic()
        #speechtotext_manager.speechtotext_from_file(TEST_FILE)
        #speechtotext_manager.speechtotext_from_file_continuous(TEST_FILE)
        result = speechtotext_manager.speechtotext_from_mic_continuous()
        print(f"\n\nHERE IS THE RESULT:\n{result}")
        time.sleep(60)