From 0089aafcfd59d9b38dd3146b33a95f2f72c123e7 Mon Sep 17 00:00:00 2001
From: agentmarketbot
Date: Sun, 26 Jan 2025 15:23:29 +0000
Subject: [PATCH] Add OpenAI Whisper API as transcription alternative

Implement dual transcription service support allowing users to choose
between AWS Transcribe and OpenAI Whisper API for voice message
transcription. Changes include:

- Add OpenAITranscriber class for Whisper API integration
- Refactor AudioTranscriber to support multiple services
- Update configuration to include OpenAI API key and service selection
- Add new environment variable TRANSCRIPTION_SERVICE
- Update documentation with setup instructions for both services
- Add openai package dependency
- Update API documentation references

The implementation maintains backward compatibility while providing more
flexibility in choosing transcription services based on user needs and
preferences.
---
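A quick usage sketch of the new AudioTranscriber facade (illustrative only, not part of the commit): it mirrors the wiring added to bot_handlers.py below and assumes the environment variables documented in the README are set; the voice-message URL is a placeholder.

```
from config import Config
from services import AWSServices, AudioTranscriber

# Build the transcriber the same way bot_handlers.py does: AWS resources are
# only created when the AWS backend is selected.
aws_services = AWSServices() if Config.TRANSCRIPTION_SERVICE == 'aws' else None
transcriber = AudioTranscriber(
    aws_services=aws_services,
    openai_api_key=Config.OPENAI_API_KEY,
    service=Config.TRANSCRIPTION_SERVICE,
)

# Placeholder URL; the bot supplies the actual voice-message file URL.
text = transcriber.transcribe_audio("https://example.com/voice-message.ogg")
print(text)
```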
 README.md       | 35 +++++++++++++++++++++++++--------
 bot_handlers.py | 12 +++++++++---
 config.py       |  2 ++
 pyproject.toml  |  1 +
 services.py     | 52 +++++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 89 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 97e441a..f7c7d0c 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,8 @@ GroupLang-secretary-bot is a Telegram bot that transcribes voice messages, summa
 
 ## Features
 
-- Transcribes voice messages using AWS Transcribe
+- Transcribes voice messages using either AWS Transcribe or OpenAI Whisper API
+- Flexible choice of transcription service (AWS or OpenAI)
 - Summarizes transcribed text using a custom API
 - Allows users to tip for the service
 - Secures handling of API keys and tokens
@@ -25,7 +26,9 @@ GroupLang-secretary-bot is a Telegram bot that transcribes voice messages, summa
 
 ## Prerequisites
 
 - Poetry for dependency management
-- AWS account with Transcribe access
+- Either:
+  - AWS account with Transcribe access, OR
+  - OpenAI API key for Whisper API
 - Telegram Bot Token
 - MarketRouter API Key
@@ -70,13 +73,24 @@ To quickly get started with the GroupLang-secretary-bot, follow these steps:
 
 1. Set up environment variables:
    - `TELEGRAM_BOT_TOKEN`: Your Telegram Bot Token
+   - `MARKETROUTER_API_KEY`: Your MarketRouter API Key
+   - `TRANSCRIPTION_SERVICE`: Choose between 'aws' or 'openai' (default: 'aws')
+
+   For AWS Transcribe:
    - `AWS_ACCESS_KEY_ID`: Your AWS Access Key ID
    - `AWS_SECRET_ACCESS_KEY`: Your AWS Secret Access Key
-   - `MARKETROUTER_API_KEY`: Your MarketRouter API Key
+
+   For OpenAI Whisper:
+   - `OPENAI_API_KEY`: Your OpenAI API Key
+
+2. Configure credentials based on your chosen transcription service:
-2. Configure AWS credentials:
-   - Either set up the AWS CLI with `aws configure` or use environment variables as mentioned above.
-   - Ensure that your AWS IAM user has the necessary permissions for AWS Transcribe.
+   For AWS Transcribe:
+   - Either set up the AWS CLI with `aws configure` or use environment variables as mentioned above
+   - Ensure that your AWS IAM user has the necessary permissions for AWS Transcribe
+
+   For OpenAI Whisper:
+   - Ensure you have a valid OpenAI API key with access to the Whisper API
 
 1. Activate the Poetry virtual environment:
 
    ```
@@ -139,7 +153,12 @@ poetry update package_name
 
 The bot uses the following external APIs:
 
-- AWS Transcribe: For audio transcription
+- For audio transcription (configurable):
+  - AWS Transcribe: Amazon's speech-to-text service
+  - OpenAI Whisper API: OpenAI's speech recognition model
 - MarketRouter API: For text summarization and reward submission
 
-Refer to the respective documentation for more details on these APIs.
+Refer to the respective documentation for more details:
+- [AWS Transcribe Documentation](https://docs.aws.amazon.com/transcribe/)
+- [OpenAI Whisper API Documentation](https://platform.openai.com/docs/guides/speech-to-text)
+- MarketRouter API Documentation
diff --git a/bot_handlers.py b/bot_handlers.py
index 74b6986..0149074 100644
--- a/bot_handlers.py
+++ b/bot_handlers.py
@@ -8,9 +8,15 @@
 logger = logging.getLogger(__name__)
 
 # Initialize services
-aws_services = AWSServices()
-audio_transcriber = AudioTranscriber(aws_services)
-text_summarizer = TextSummarizer(os.environ.get('MARKETROUTER_API_KEY'))
+from config import Config
+
+aws_services = AWSServices() if Config.TRANSCRIPTION_SERVICE == 'aws' else None
+audio_transcriber = AudioTranscriber(
+    aws_services=aws_services,
+    openai_api_key=Config.OPENAI_API_KEY,
+    service=Config.TRANSCRIPTION_SERVICE
+)
+text_summarizer = TextSummarizer(Config.MARKETROUTER_API_KEY)
 
 def handle_update(update: Dict[str, Any]) -> None:
     if 'message' in update:
diff --git a/config.py b/config.py
index 4b02d69..38526c3 100644
--- a/config.py
+++ b/config.py
@@ -6,3 +6,5 @@ class Config:
     MARKETROUTER_API_KEY = os.environ.get('MARKETROUTER_API_KEY')
     AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
     AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
+    OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
+    TRANSCRIPTION_SERVICE = os.environ.get('TRANSCRIPTION_SERVICE', 'aws') # 'aws' or 'openai'
diff --git a/pyproject.toml b/pyproject.toml
index 39d0ada..690d7ed 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ requests = "^2.32.3"
 nltk = "^3.9.1"
 langdetect = "^1.0.9"
 mangum = "^0.18.0"
+openai = "^1.12.0"
 
 [tool.poetry.dev-dependencies]
 # Add any development dependencies here
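Because TRANSCRIPTION_SERVICE and OPENAI_API_KEY are read straight from the environment, a start-up check can catch misconfiguration before the first voice message arrives. The sketch below is illustrative only and not part of this patch; the helper name validate_transcription_config is hypothetical and assumes only the Config attributes added above.

```
# Hypothetical start-up check, not included in this patch.
from config import Config

def validate_transcription_config() -> None:
    service = (Config.TRANSCRIPTION_SERVICE or 'aws').lower()
    if service not in ('aws', 'openai'):
        raise ValueError(f"Unsupported TRANSCRIPTION_SERVICE: {service}")
    if service == 'openai' and not Config.OPENAI_API_KEY:
        raise ValueError("OPENAI_API_KEY is required when TRANSCRIPTION_SERVICE='openai'")
    # AWS credentials may come from the environment or from `aws configure`,
    # so they are deliberately not checked here.
```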
diff --git a/services.py b/services.py
index 8d81328..bb57d6e 100644
--- a/services.py
+++ b/services.py
@@ -50,7 +50,38 @@ def start_transcription_job(self, job_name, media_uri, media_format='ogg', langu
     def get_transcription_job_status(self, job_name):
         return self.transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
 
-class AudioTranscriber:
+class OpenAITranscriber:
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.api_url = "https://api.openai.com/v1/audio/transcriptions"
+
+    def transcribe_audio(self, file_url: str) -> str:
+        try:
+            audio_content = self._download_audio(file_url)
+
+            headers = {
+                "Authorization": f"Bearer {self.api_key}"
+            }
+
+            files = {
+                'file': ('audio.ogg', audio_content, 'audio/ogg'),
+                'model': (None, 'whisper-1'),
+            }
+
+            response = requests.post(self.api_url, headers=headers, files=files)
+            response.raise_for_status()
+
+            return response.json()['text']
+        except Exception as e:
+            logger.error(f"An error occurred with OpenAI transcription: {e}")
+            raise
+
+    def _download_audio(self, file_url: str) -> bytes:
+        response = requests.get(file_url)
+        response.raise_for_status()
+        return response.content
+
+class AWSTranscriber:
     def __init__(self, aws_services: AWSServices):
         self.aws_services = aws_services
         self.bucket_name = 'audio-transcribe-temp'
@@ -74,7 +105,7 @@ def transcribe_audio(self, file_url: str) -> str:
 
             return transcription
         except Exception as e:
-            logger.error(f"An error occurred: {e}")
+            logger.error(f"An error occurred with AWS transcription: {e}")
             raise
 
     def _download_audio(self, file_url: str) -> bytes:
@@ -95,6 +126,23 @@ def _wait_for_transcription(self, job_name: str) -> str:
         else:
             raise Exception("Transcription failed")
 
+class AudioTranscriber:
+    def __init__(self, aws_services: Optional[AWSServices] = None, openai_api_key: Optional[str] = None, service: str = 'aws'):
+        self.service = service.lower()
+        if self.service == 'aws':
+            if not aws_services:
+                raise ValueError("AWS services required for AWS transcription")
+            self.transcriber = AWSTranscriber(aws_services)
+        elif self.service == 'openai':
+            if not openai_api_key:
+                raise ValueError("OpenAI API key required for OpenAI transcription")
+            self.transcriber = OpenAITranscriber(openai_api_key)
+        else:
+            raise ValueError(f"Unsupported transcription service: {service}")
+
+    def transcribe_audio(self, file_url: str) -> str:
+        return self.transcriber.transcribe_audio(file_url)
+
 class TextSummarizer:
     def __init__(self, api_key: str):
         self.api_key = api_key
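The request shape used by OpenAITranscriber can also be exercised directly against the Whisper endpoint, for example to verify an API key before wiring it into the bot. The snippet below is a minimal sketch mirroring the multipart form built in services.py; sample.ogg is a placeholder path.

```
import os
import requests

# Mirrors the multipart form built by OpenAITranscriber: the audio file plus a
# plain 'model' field selecting whisper-1. 'sample.ogg' is a placeholder path.
with open("sample.ogg", "rb") as audio_file:
    response = requests.post(
        "https://api.openai.com/v1/audio/transcriptions",
        headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"},
        files={
            "file": ("sample.ogg", audio_file, "audio/ogg"),
            "model": (None, "whisper-1"),
        },
    )
response.raise_for_status()
print(response.json()["text"])
```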