From 502ef0ac1049d2a6c458da8603aeeceb226f1f32 Mon Sep 17 00:00:00 2001 From: agentmarketbot Date: Sun, 26 Jan 2025 16:15:54 +0000 Subject: [PATCH] Add OpenAI Whisper API transcription support Implement alternative transcription service using OpenAI Whisper API: - Create abstract TranscriptionService base class - Add AWSTranscriptionService and OpenAITranscriptionService implementations - Update configuration to support service selection via TRANSCRIPTION_SERVICE env var - Add OpenAI dependencies and configuration requirements - Update documentation with new service options and setup instructions - Enhance logging to include transcription service type The change allows users to choose between AWS Transcribe for enterprise-grade transcription or OpenAI Whisper API for high-accuracy multilingual support. --- README.md | 41 ++++++++++++++++++++++++++++++----------- bot_handlers.py | 30 +++++++++++++++++++++++++----- pyproject.toml | 1 + services.py | 41 ++++++++++++++++++++++++++++++++++------- 4 files changed, 90 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 97e441a..1f3f623 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,9 @@ GroupLang-secretary-bot is a Telegram bot that transcribes voice messages, summa ## Features -- Transcribes voice messages using AWS Transcribe +- Supports multiple transcription services: + - AWS Transcribe for reliable, enterprise-grade transcription + - OpenAI Whisper API for high-accuracy, multilingual transcription - Summarizes transcribed text using a custom API - Allows users to tip for the service - Secures handling of API keys and tokens @@ -25,9 +27,11 @@ GroupLang-secretary-bot is a Telegram bot that transcribes voice messages, summa ## Prerequisites - Poetry for dependency management -- AWS account with Transcribe access - Telegram Bot Token - MarketRouter API Key +- One of the following transcription service configurations: + - AWS account with Transcribe access (for AWS transcription) + - OpenAI API key (for Whisper API transcription) ## Installation @@ -69,14 +73,24 @@ To quickly get started with the GroupLang-secretary-bot, follow these steps: ## Configuration 1. Set up environment variables: - - `TELEGRAM_BOT_TOKEN`: Your Telegram Bot Token - - `AWS_ACCESS_KEY_ID`: Your AWS Access Key ID - - `AWS_SECRET_ACCESS_KEY`: Your AWS Secret Access Key - - `MARKETROUTER_API_KEY`: Your MarketRouter API Key + - Required for all configurations: + - `TELEGRAM_BOT_TOKEN`: Your Telegram Bot Token + - `MARKETROUTER_API_KEY`: Your MarketRouter API Key + - `TRANSCRIPTION_SERVICE`: Choose the transcription service ('aws' or 'openai', defaults to 'aws') + + - Required for AWS Transcribe: + - `AWS_ACCESS_KEY_ID`: Your AWS Access Key ID + - `AWS_SECRET_ACCESS_KEY`: Your AWS Secret Access Key + + - Required for OpenAI Whisper: + - `OPENAI_API_KEY`: Your OpenAI API key -2. Configure AWS credentials: - - Either set up the AWS CLI with `aws configure` or use environment variables as mentioned above. - - Ensure that your AWS IAM user has the necessary permissions for AWS Transcribe. +2. Configure transcription service: + - For AWS Transcribe: + - Set up the AWS CLI with `aws configure` or use environment variables as mentioned above + - Ensure your AWS IAM user has the necessary permissions for AWS Transcribe + - For OpenAI Whisper: + - Ensure you have a valid OpenAI API key with access to the Whisper API 1. Activate the Poetry virtual environment: ``` @@ -139,7 +153,12 @@ poetry update package_name The bot uses the following external APIs: -- AWS Transcribe: For audio transcription +- Transcription Services: + - AWS Transcribe: Enterprise-grade audio transcription service + - OpenAI Whisper API: High-accuracy, multilingual transcription service - MarketRouter API: For text summarization and reward submission -Refer to the respective documentation for more details on these APIs. +For more details, refer to: +- [AWS Transcribe Documentation](https://docs.aws.amazon.com/transcribe/) +- [OpenAI Whisper API Documentation](https://platform.openai.com/docs/guides/speech-to-text) +- MarketRouter API Documentation (contact provider) diff --git a/bot_handlers.py b/bot_handlers.py index 74b6986..f69e2ac 100644 --- a/bot_handlers.py +++ b/bot_handlers.py @@ -1,15 +1,34 @@ import logging import os from typing import Dict, Any -from services import AWSServices, AudioTranscriber, TextSummarizer +from services import ( + AWSServices, + AWSTranscriptionService, + OpenAITranscriptionService, + TextSummarizer, + TranscriptionService +) from utils.telegram_utils import send_message, get_telegram_file_url from utils.message_utils import format_response, create_tip_button logger = logging.getLogger(__name__) +def get_transcription_service() -> TranscriptionService: + service_type = os.environ.get('TRANSCRIPTION_SERVICE', 'aws').lower() + + if service_type == 'aws': + aws_services = AWSServices() + return AWSTranscriptionService(aws_services) + elif service_type == 'openai': + openai_api_key = os.environ.get('OPENAI_API_KEY') + if not openai_api_key: + raise ValueError("OPENAI_API_KEY environment variable is required for OpenAI transcription service") + return OpenAITranscriptionService(openai_api_key) + else: + raise ValueError(f"Unsupported transcription service: {service_type}") + # Initialize services -aws_services = AWSServices() -audio_transcriber = AudioTranscriber(aws_services) +transcription_service = get_transcription_service() text_summarizer = TextSummarizer(os.environ.get('MARKETROUTER_API_KEY')) def handle_update(update: Dict[str, Any]) -> None: @@ -30,12 +49,13 @@ def handle_voice_message(message: Dict[str, Any], chat_id: int) -> None: file_id = message['voice']['file_id'] file_url = get_telegram_file_url(file_id) - transcription = audio_transcriber.transcribe_audio(file_url) + transcription = transcription_service.transcribe_audio(file_url) summary, conversation_id = text_summarizer.summarize_text(transcription) logger.info(f"Processed voice message: file_id={file_id}, " f"transcription_length={len(transcription)}, " - f"summary_length={len(summary)}") + f"summary_length={len(summary)}, " + f"service_type={type(transcription_service).__name__}") response = format_response(transcription, summary) reply_markup = create_tip_button(conversation_id) diff --git a/pyproject.toml b/pyproject.toml index 39d0ada..690d7ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ requests = "^2.32.3" nltk = "^3.9.1" langdetect = "^1.0.9" mangum = "^0.18.0" +openai = "^1.12.0" [tool.poetry.dev-dependencies] # Add any development dependencies here diff --git a/services.py b/services.py index 8d81328..fbe6c6f 100644 --- a/services.py +++ b/services.py @@ -1,11 +1,13 @@ import boto3 -from typing import Optional, Tuple, Dict +from typing import Optional, Tuple, Dict, Protocol, Union +from abc import ABC, abstractmethod import requests import time import uuid import logging from io import BytesIO from botocore.exceptions import ClientError +import openai logger = logging.getLogger(__name__) @@ -50,7 +52,17 @@ def start_transcription_job(self, job_name, media_uri, media_format='ogg', langu def get_transcription_job_status(self, job_name): return self.transcribe_client.get_transcription_job(TranscriptionJobName=job_name) -class AudioTranscriber: +class TranscriptionService(ABC): + @abstractmethod + def transcribe_audio(self, file_url: str) -> str: + pass + + def _download_audio(self, file_url: str) -> bytes: + response = requests.get(file_url) + response.raise_for_status() + return response.content + +class AWSTranscriptionService(TranscriptionService): def __init__(self, aws_services: AWSServices): self.aws_services = aws_services self.bucket_name = 'audio-transcribe-temp' @@ -77,11 +89,6 @@ def transcribe_audio(self, file_url: str) -> str: logger.error(f"An error occurred: {e}") raise - def _download_audio(self, file_url: str) -> bytes: - response = requests.get(file_url) - response.raise_for_status() - return response.content - def _wait_for_transcription(self, job_name: str) -> str: while True: status = self.aws_services.get_transcription_job_status(job_name) @@ -95,6 +102,26 @@ def _wait_for_transcription(self, job_name: str) -> str: else: raise Exception("Transcription failed") +class OpenAITranscriptionService(TranscriptionService): + def __init__(self, api_key: str): + self.client = openai.OpenAI(api_key=api_key) + + def transcribe_audio(self, file_url: str) -> str: + try: + audio_content = self._download_audio(file_url) + with BytesIO(audio_content) as audio_file: + audio_file.name = "audio.ogg" # OpenAI needs a filename + response = self.client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="text" + ) + return response + + except Exception as e: + logger.error(f"OpenAI transcription error: {e}") + raise + class TextSummarizer: def __init__(self, api_key: str): self.api_key = api_key