From 502ef0ac1049d2a6c458da8603aeeceb226f1f32 Mon Sep 17 00:00:00 2001
From: agentmarketbot <agentmarketbot@gmail.com>
Date: Sun, 26 Jan 2025 16:15:54 +0000
Subject: [PATCH] Add OpenAI Whisper API transcription support

Implement alternative transcription service using OpenAI Whisper API:
- Create abstract TranscriptionService base class
- Add AWSTranscriptionService and OpenAITranscriptionService implementations
- Update configuration to support service selection via TRANSCRIPTION_SERVICE env var
- Add OpenAI dependencies and configuration requirements
- Update documentation with new service options and setup instructions
- Enhance logging to include transcription service type

The change allows users to choose between AWS Transcribe for enterprise-grade
transcription and the OpenAI Whisper API for high-accuracy multilingual support.
---
 README.md       | 41 ++++++++++++++++++++++++++++++-----------
 bot_handlers.py | 30 +++++++++++++++++++++++++-----
 pyproject.toml  |  1 +
 services.py     | 41 ++++++++++++++++++++++++++++++++++-------
 4 files changed, 90 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 97e441a..1f3f623 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,9 @@ GroupLang-secretary-bot is a Telegram bot that transcribes voice messages, summa
 
 ## Features
 
-- Transcribes voice messages using AWS Transcribe
+- Supports multiple transcription services:
+  - AWS Transcribe for reliable, enterprise-grade transcription
+  - OpenAI Whisper API for high-accuracy, multilingual transcription
 - Summarizes transcribed text using a custom API
 - Allows users to tip for the service
 - Secures handling of API keys and tokens
@@ -25,9 +27,11 @@ GroupLang-secretary-bot is a Telegram bot that transcribes voice messages, summa
 ## Prerequisites
 
 - Poetry for dependency management
-- AWS account with Transcribe access
 - Telegram Bot Token
 - MarketRouter API Key
+- One of the following transcription service configurations:
+  - AWS account with Transcribe access (for AWS transcription)
+  - OpenAI API key (for Whisper API transcription)
 
 ## Installation
 
@@ -69,14 +73,24 @@ To quickly get started with the GroupLang-secretary-bot, follow these steps:
 ## Configuration
 
 1. Set up environment variables:
-   - `TELEGRAM_BOT_TOKEN`: Your Telegram Bot Token
-   - `AWS_ACCESS_KEY_ID`: Your AWS Access Key ID
-   - `AWS_SECRET_ACCESS_KEY`: Your AWS Secret Access Key
-   - `MARKETROUTER_API_KEY`: Your MarketRouter API Key
+   - Required for all configurations:
+     - `TELEGRAM_BOT_TOKEN`: Your Telegram Bot Token
+     - `MARKETROUTER_API_KEY`: Your MarketRouter API Key
+     - `TRANSCRIPTION_SERVICE` (optional): Transcription service to use, 'aws' or 'openai'; defaults to 'aws' when unset
+
+   - Required for AWS Transcribe:
+     - `AWS_ACCESS_KEY_ID`: Your AWS Access Key ID
+     - `AWS_SECRET_ACCESS_KEY`: Your AWS Secret Access Key
+
+   - Required for OpenAI Whisper:
+     - `OPENAI_API_KEY`: Your OpenAI API key
 
-2. Configure AWS credentials:
-   - Either set up the AWS CLI with `aws configure` or use environment variables as mentioned above.
-   - Ensure that your AWS IAM user has the necessary permissions for AWS Transcribe.
+2. Configure transcription service:
+   - For AWS Transcribe:
+     - Set up the AWS CLI with `aws configure` or use environment variables as mentioned above
+     - Ensure your AWS IAM user has the necessary permissions for AWS Transcribe
+   - For OpenAI Whisper:
+     - Ensure you have a valid OpenAI API key with access to the Whisper API
 
 1. Activate the Poetry virtual environment:
    ```
@@ -139,7 +153,12 @@ poetry update package_name
 
 The bot uses the following external APIs:
 
-- AWS Transcribe: For audio transcription
+- Transcription Services:
+  - AWS Transcribe: Enterprise-grade audio transcription service
+  - OpenAI Whisper API: High-accuracy, multilingual transcription service
 - MarketRouter API: For text summarization and reward submission
 
-Refer to the respective documentation for more details on these APIs.
+For more details, refer to:
+- [AWS Transcribe Documentation](https://docs.aws.amazon.com/transcribe/)
+- [OpenAI Whisper API Documentation](https://platform.openai.com/docs/guides/speech-to-text)
+- MarketRouter API Documentation (contact provider)
diff --git a/bot_handlers.py b/bot_handlers.py
index 74b6986..f69e2ac 100644
--- a/bot_handlers.py
+++ b/bot_handlers.py
@@ -1,15 +1,34 @@
 import logging
 import os
 from typing import Dict, Any
-from services import AWSServices, AudioTranscriber, TextSummarizer
+from services import (
+    AWSServices,
+    AWSTranscriptionService,
+    OpenAITranscriptionService,
+    TextSummarizer,
+    TranscriptionService
+)
 from utils.telegram_utils import send_message, get_telegram_file_url
 from utils.message_utils import format_response, create_tip_button
 
 logger = logging.getLogger(__name__)
 
+def get_transcription_service() -> TranscriptionService:
+    """Return the backend named by TRANSCRIPTION_SERVICE ('aws' when unset or blank)."""
+    # Raises ValueError for an unknown name, or for 'openai' without OPENAI_API_KEY.
+    # A set-but-empty value (e.g. `TRANSCRIPTION_SERVICE=` in an env file) means "default".
+    service_type = os.environ.get('TRANSCRIPTION_SERVICE', '').strip().lower() or 'aws'
+    if service_type == 'aws':
+        return AWSTranscriptionService(AWSServices())
+    if service_type == 'openai':
+        openai_api_key = os.environ.get('OPENAI_API_KEY')
+        if not openai_api_key:
+            raise ValueError("OPENAI_API_KEY environment variable is required for OpenAI transcription service")
+        return OpenAITranscriptionService(openai_api_key)
+    raise ValueError(f"Unsupported transcription service: {service_type}")
+
 # Initialize services
-aws_services = AWSServices()
-audio_transcriber = AudioTranscriber(aws_services)
+transcription_service = get_transcription_service()
 text_summarizer = TextSummarizer(os.environ.get('MARKETROUTER_API_KEY'))
 
 def handle_update(update: Dict[str, Any]) -> None:
@@ -30,12 +49,13 @@ def handle_voice_message(message: Dict[str, Any], chat_id: int) -> None:
         file_id = message['voice']['file_id']
         file_url = get_telegram_file_url(file_id)
         
-        transcription = audio_transcriber.transcribe_audio(file_url)
+        transcription = transcription_service.transcribe_audio(file_url)
         summary, conversation_id = text_summarizer.summarize_text(transcription)
         
         logger.info(f"Processed voice message: file_id={file_id}, "
                     f"transcription_length={len(transcription)}, "
-                    f"summary_length={len(summary)}")
+                    f"summary_length={len(summary)}, "
+                    f"service_type={type(transcription_service).__name__}")
         
         response = format_response(transcription, summary)
         reply_markup = create_tip_button(conversation_id)
diff --git a/pyproject.toml b/pyproject.toml
index 39d0ada..690d7ed 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ requests = "^2.32.3"
 nltk = "^3.9.1"
 langdetect = "^1.0.9"
 mangum = "^0.18.0"
+openai = "^1.12.0"
 
 [tool.poetry.dev-dependencies]
 # Add any development dependencies here
diff --git a/services.py b/services.py
index 8d81328..fbe6c6f 100644
--- a/services.py
+++ b/services.py
@@ -1,11 +1,13 @@
 import boto3
-from typing import Optional, Tuple, Dict
+from typing import Optional, Tuple, Dict, Protocol, Union
+from abc import ABC, abstractmethod
 import requests
 import time
 import uuid
 import logging
 from io import BytesIO
 from botocore.exceptions import ClientError
+import openai
 
 logger = logging.getLogger(__name__)
 
@@ -50,7 +52,17 @@ def start_transcription_job(self, job_name, media_uri, media_format='ogg', langu
     def get_transcription_job_status(self, job_name):
         return self.transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
 
-class AudioTranscriber:
+class TranscriptionService(ABC):  # interface shared by every transcription backend
+    @abstractmethod
+    def transcribe_audio(self, file_url: str) -> str:
+        """Return the transcript of the audio file located at *file_url*."""
+
+    def _download_audio(self, file_url: str) -> bytes:  # raw bytes; HTTPError on 4xx/5xx
+        response = requests.get(file_url, timeout=30)  # bounded wait: never hang on a stalled host
+        response.raise_for_status()
+        return response.content
+
+class AWSTranscriptionService(TranscriptionService):
     def __init__(self, aws_services: AWSServices):
         self.aws_services = aws_services
         self.bucket_name = 'audio-transcribe-temp'
@@ -77,11 +89,6 @@ def transcribe_audio(self, file_url: str) -> str:
             logger.error(f"An error occurred: {e}")
             raise
 
-    def _download_audio(self, file_url: str) -> bytes:
-        response = requests.get(file_url)
-        response.raise_for_status()
-        return response.content
-
     def _wait_for_transcription(self, job_name: str) -> str:
         while True:
             status = self.aws_services.get_transcription_job_status(job_name)
@@ -95,6 +102,26 @@ def _wait_for_transcription(self, job_name: str) -> str:
         else:
             raise Exception("Transcription failed")
 
+class OpenAITranscriptionService(TranscriptionService):
+    """Transcription backend that sends audio to OpenAI's ``whisper-1`` model."""
+
+    def __init__(self, api_key: str):
+        self.client = openai.OpenAI(api_key=api_key)
+
+    def transcribe_audio(self, file_url: str) -> str:
+        """Download the audio at *file_url* and return the text the API produces."""
+        try:
+            payload = BytesIO(self._download_audio(file_url))
+            payload.name = "audio.ogg"  # OpenAI needs a filename
+            with payload:
+                transcript = self.client.audio.transcriptions.create(
+                    file=payload, model="whisper-1", response_format="text"
+                )
+            return transcript
+        except Exception as e:
+            logger.error(f"OpenAI transcription error: {e}")  # log, then let the caller decide
+            raise
+
 class TextSummarizer:
     def __init__(self, api_key: str):
         self.api_key = api_key