-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranscriber.py
118 lines (91 loc) · 3.99 KB
/
transcriber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import multiprocessing
import os
import threading
from concurrent.futures import ThreadPoolExecutor
from shutil import which
from typing import Optional

import whisper
from dotenv import load_dotenv

from source.git_root_finder import GitRootFinder
from source.logger import LoggerMixin
class Transcriber(LoggerMixin):
    """
    Transcribe the audio files of the project's ``data/audio`` directory with an OpenAI Whisper model.

    On construction the instance calls ``load_dotenv()``, verifies that ``ffmpeg`` is on the PATH and indexes the
    files found in ``data/audio`` below the directory returned by ``GitRootFinder.get()``. Calling :meth:`start`
    then writes one ``.txt`` file per audio file into the sibling ``data/transcriptions`` directory.

    :param transcriber_model_name: The model name to use for transcription (default is "base"). Choose from OpenAI's
        Whisper models.
    :param max_cores: The maximum number of worker threads used by :meth:`start`. When ``None``, three quarters of
        the CPU count is used, but never less than one.
    :param overwrite: A flag indicating whether existing transcriptions should be overwritten.
    :param device: The device handed to ``whisper.load_model`` (e.g. ``"cuda"`` or ``"cpu"``). When ``None``,
        Whisper chooses the device itself (CUDA when available, otherwise CPU).
    """

    def __init__(
        self,
        transcriber_model_name: str = "base",
        max_cores: Optional[int] = None,
        overwrite: bool = False,
        device: Optional[str] = None,
    ):
        super().__init__()
        load_dotenv()
        self.transcriber_model_name = transcriber_model_name
        self.device = device
        # Per-thread storage for loaded models, see _get_model().
        self._thread_state = threading.local()
        self.log.debug(f'Using model "{self.transcriber_model_name}" for transcription')
        self._check_for_ffmpeg()
        self._find_audio_files()
        if max_cores is None:
            # The integer division yields 0 on a single-core machine, and
            # ThreadPoolExecutor rejects max_workers=0, so clamp to one worker.
            self.max_cores = max(1, multiprocessing.cpu_count() * 3 // 4)
        else:
            self.max_cores = max_cores
        self.overwrite = overwrite

    @staticmethod
    def _check_for_ffmpeg() -> None:
        """
        Check for the presence of ffmpeg on the PATH.

        :raises SystemError: If no ``ffmpeg`` executable can be found.
        :return: None
        """
        if not which("ffmpeg"):
            raise SystemError("ffmpeg is not installed!")

    def _find_audio_files(self) -> None:
        """
        Set the input and output directories and index the files waiting in the input directory.

        Sets ``audio_input_directory``, ``transcription_output_directory``, ``all_audio_files`` (sorted file names)
        and ``number_of_audio_files``. The output directory is created if it is missing.

        :raises FileNotFoundError: If the audio input directory does not exist.
        :return: None
        """
        data_directory = os.path.join(GitRootFinder.get(), "data")
        self.audio_input_directory = os.path.join(data_directory, "audio")
        self.transcription_output_directory = os.path.join(
            data_directory, "transcriptions"
        )
        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(self.transcription_output_directory, exist_ok=True)
        # Only regular files are candidates; a sub-directory cannot be transcribed.
        self.all_audio_files = sorted(
            entry
            for entry in os.listdir(self.audio_input_directory)
            if os.path.isfile(os.path.join(self.audio_input_directory, entry))
        )
        self.number_of_audio_files = len(self.all_audio_files)
        self.log.debug(
            f"Found audio files ({self.number_of_audio_files}): {self.all_audio_files}"
        )

    def _get_model(self):
        """
        Return the Whisper model of the calling thread, loading it on first use.

        Every thread keeps its own model instance, so no model object is shared between concurrently running
        transcriptions, while a thread that processes several files loads the model only once. The cached model
        is reloaded if ``transcriber_model_name`` or ``device`` changed since it was loaded.

        :return: The loaded Whisper model.
        """
        wanted = (self.transcriber_model_name, self.device)
        cached = getattr(self._thread_state, "model", None)
        if cached is None or cached[0] != wanted:
            cached = (
                wanted,
                whisper.load_model(self.transcriber_model_name, device=self.device),
            )
            self._thread_state.model = cached
        return cached[1]

    def transcribe_file(self, filename: str) -> None:
        """
        Transcribes an audio file and saves the transcription to a text file.

        The text file carries the name of the audio file with its extension replaced by ``.txt``. If it already
        exists and ``overwrite`` is false, the file is skipped before any model is loaded.

        :param filename: The name of the audio file to transcribe, relative to the audio input directory.
        :return: None
        """
        input_file_path = os.path.join(self.audio_input_directory, filename)
        # Swap only the real extension: str.replace would also rewrite ".mp3"
        # inside the stem and would leave any other audio extension in place.
        output_file_name = os.path.splitext(filename)[0] + ".txt"
        output_file_path = os.path.join(
            self.transcription_output_directory, output_file_name
        )
        if os.path.exists(output_file_path) and not self.overwrite:
            self.log.debug("Transcription already exists, skipping file...")
            return
        model = self._get_model()
        self.log.info(f'Starting transcribing "{filename}"')
        transcription = model.transcribe(input_file_path)["text"]
        with open(output_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(transcription)
        self.log.info(f'Finished transcribing "{filename}"')

    def start(self) -> None:
        """
        Starts the transcription of all indexed audio files on a pool of worker threads. This may take a while.

        An exception raised while transcribing a file is re-raised here when its result is collected.

        :return: None
        """
        self.log.info(
            f"Starting to transcribe the {self.number_of_audio_files} audio files using {self.max_cores} workers, this may take a while..."
        )
        with ThreadPoolExecutor(max_workers=self.max_cores) as executor:
            # Consuming the iterator surfaces exceptions raised in the workers.
            list(executor.map(self.transcribe_file, self.all_audio_files))
if __name__ == "__main__":
    # Script entry point: transcribe every audio file, replacing existing transcriptions.
    Transcriber(overwrite=True).start()