Commit 5edb115

feat: removes FastAPI server and improves logging

micheleriva committed Jan 11, 2025
1 parent 42fd4b2 commit 5edb115
Showing 13 changed files with 94 additions and 239 deletions.
3 changes: 2 additions & 1 deletion src/ai_server/.gitignore

@@ -1,4 +1,5 @@
 .venv
 __pycache__
 loader.py
-output.txt
+output.txt
+.embeddins_models_cache
4 changes: 1 addition & 3 deletions src/ai_server/requirements.txt

@@ -1,13 +1,11 @@
-# Puython 3.10 suggested
+# Puython 3.11 suggested
 
 numpy
-fastapi
 grpcio-tools
 grpcio
 python-dotenv
 fastembed-gpu
 nvidia-cudnn-cu12
-uvicorn
 grpcio-reflection
 argparse
 --extra-index-url https://download.pytorch.org/whl/cu124
17 changes: 15 additions & 2 deletions src/ai_server/server.py

@@ -1,28 +1,41 @@
 import sys
 import signal
+import logging
 
 from src.grpc.server import serve
 from src.utils import OramaAIConfig
 from src.models.main import ModelsManager
 from src.service.embedding import EmbeddingService
 
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+logger = logging.getLogger(__name__)
 
 
 def handle_shutdown(signum, frame):
-    print("\nShutting down gracefully...")
+    logger.info("\nShutting down gracefully...")
     sys.exit(0)
 
 
 if __name__ == "__main__":
     signal.signal(signal.SIGINT, handle_shutdown)
     signal.signal(signal.SIGTERM, handle_shutdown)
 
+    logger.info("Initializing config...")
     config = OramaAIConfig()
+
+    logger.info("Initializing embedding service...")
     embeddings_service = EmbeddingService(config)
+
+    logger.info("Initializing models manager...")
     models_manager = ModelsManager(config)
 
     try:
+        logger.info(f"Starting gRPC server on port {config.grpc_port}...")
         serve(config, embeddings_service.embeddings_service, models_manager)
     except KeyboardInterrupt:
-        print("\nShutting down gracefully...")
+        logger.info("\nShutting down gracefully...")
         sys.exit(0)
+    except Exception as e:
+        logger.error(f"Error starting server: {e}", exc_info=True)
+        sys.exit(1)
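
The rewritten entrypoint swaps bare print() calls for a configured logger and adds a catch-all except branch around serve(). A minimal, self-contained sketch of that pattern, with a placeholder serve() loop standing in for the repo's actual gRPC bootstrap:

# Sketch of the logging + graceful-shutdown pattern above; serve() here
# is a stand-in loop, not Orama's gRPC server.
import sys
import time
import signal
import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def handle_shutdown(signum, frame):
    # SIGINT and SIGTERM both land here, so Ctrl+C and `kill` exit cleanly.
    logger.info("Shutting down gracefully...")
    sys.exit(0)


def serve():
    # Placeholder for a long-running server loop.
    while True:
        time.sleep(1)


if __name__ == "__main__":
    signal.signal(signal.SIGINT, handle_shutdown)
    signal.signal(signal.SIGTERM, handle_shutdown)
    try:
        logger.info("Starting server...")
        serve()
    except Exception as e:
        # exc_info=True attaches the full traceback to the log record.
        logger.error(f"Error starting server: {e}", exc_info=True)
        sys.exit(1)

Pressing Ctrl+C (or sending SIGTERM) exits through handle_shutdown with a log line instead of a traceback, and unexpected startup failures leave a logged traceback plus a nonzero exit code.
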
Empty file removed: src/ai_server/src/api/__init__.py
55 changes: 0 additions & 55 deletions src/ai_server/src/api/app.py

This file was deleted.

47 changes: 0 additions & 47 deletions src/ai_server/src/api/middleware.py

This file was deleted.

19 changes: 0 additions & 19 deletions src/ai_server/src/api/models.py

This file was deleted.

94 changes: 16 additions & 78 deletions src/ai_server/src/embeddings/models.py

@@ -1,18 +1,32 @@
+import os
+import logging
 import threading
 from typing import List
 from fastembed import TextEmbedding
 
 from src.utils import OramaAIConfig
-from fastembed.text.onnx_embedding import supported_onnx_models
-from fastembed.text.e5_onnx_embedding import supported_multilingual_e5_models
 from src.embeddings.embeddings import embed_alternative, ModelGroups, OramaModelInfo
 
+logger = logging.getLogger(__name__)
+
 
 class EmbeddingsModels:
     def __init__(self, config: OramaAIConfig, selected_models: List[OramaModelInfo]):
+        logger.info("Initializing EmbeddingsModels...")
         self.config = config
         self.selected_models = selected_models
         self.selected_model_names = [item.name for item in selected_models]
 
+        logger.info(f"Creating cache directory: {config.models_cache_dir}")
+        os.makedirs(config.models_cache_dir, exist_ok=True)
+
+        logger.info("Setting FastEmbed cache directory...")
+        os.environ["FASTEMBED_CACHE_DIR"] = os.path.abspath(config.models_cache_dir)
+
+        logger.info("Loading models...")
         self.loaded_models = self.load_models()
+        logger.info("Models loaded successfully")
+
         self.model_loading_lock = threading.RLock()
+        self.model_last_used = {}
@@ -60,79 +74,3 @@ def calculate_embeddings(self, input, intent, model_name) -> List[float]:
             return embed_alternative(self.loaded_models[model_name], input_strings)
         else:
             raise ValueError(f"Model {model_name} is not loaded")
-
-
-def extend_fastembed_supported_models():
-    supported_onnx_models.extend(
-        [
-            {
-                "model": "intfloat/multilingual-e5-small",
-                "dim": 384,
-                "description": "Text embeddings, Unimodal (text), Multilingual (~100 languages), 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year.",
-                "license": "mit",
-                "size_in_GB": 0.4,
-                "sources": {
-                    "hf": "intfloat/multilingual-e5-small",
-                },
-                "model_file": "onnx/model.onnx",
-            },
-            {
-                "model": "intfloat/multilingual-e5-base",
-                "dim": 768,
-                "description": "Text embeddings, Unimodal (text), Multilingual (~100 languages), 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year.",
-                "license": "mit",
-                "size_in_GB": 1.11,
-                "sources": {
-                    "hf": "intfloat/multilingual-e5-base",
-                },
-                "model_file": "onnx/model.onnx",
-            },
-            {
-                "model": "BAAI/bge-small-en-v1.5-raw",
-                "dim": 384,
-                "description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.",
-                "license": "mit",
-                "size_in_GB": 0.4,
-                "sources": {
-                    "hf": "BAAI/bge-small-en-v1.5",
-                },
-                "model_file": "onnx/model.onnx",
-            },
-            {
-                "model": "BAAI/bge-base-en-v1.5-raw",
-                "dim": 768,
-                "description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.",
-                "license": "mit",
-                "size_in_GB": 1.11,
-                "sources": {
-                    "hf": "BAAI/bge-base-en-v1.5",
-                },
-                "model_file": "onnx/model.onnx",
-            },
-            {
-                "model": "BAAI/bge-large-en-v1.5-raw",
-                "dim": 1024,
-                "description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.",
-                "license": "mit",
-                "size_in_GB": 1.20,
-                "sources": {
-                    "hf": "BAAI/bge-large-en-v1.5",
-                },
-                "model_file": "onnx/model.onnx",
-            },
-        ]
-    )
-    supported_multilingual_e5_models.append(
-        {
-            "model": "intfloat/multilingual-e5-large-raw",
-            "dim": 1024,
-            "description": "Text embeddings, Unimodal (text), Multilingual (~100 languages), 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year.",
-            "license": "mit",
-            "size_in_GB": 2.24,
-            "sources": {
-                "hf": "intfloat/multilingual-e5-large",
-            },
-            "model_file": "onnx/model.onnx",
-            "additional_files": ["onnx/model.onnx_data"],
-        }
-    )
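
The constructor above now pins FastEmbed's model cache to a known directory before any model loads. A hedged sketch of the underlying fastembed usage with an illustrative model name and cache path; note that TextEmbedding also accepts an explicit cache_dir argument, which avoids relying on the environment-variable route taken in the diff:

# Sketch, not Orama's code: load one FastEmbed model into a local cache
# directory and embed a small batch. Model name and path are illustrative.
import os
from fastembed import TextEmbedding

cache_dir = os.path.abspath(".embeddings_models_cache")  # hypothetical path
os.makedirs(cache_dir, exist_ok=True)

# Passing cache_dir explicitly keeps the cache location version-independent.
model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir=cache_dir)

# embed() yields one numpy vector per input string.
vectors = list(model.embed(["hello world", "another document"]))
print(len(vectors), len(vectors[0]))  # batch size, embedding dimension (384)
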
5 changes: 4 additions & 1 deletion src/ai_server/src/grpc/server.py

@@ -1,4 +1,5 @@
 import grpc
+import logging
 from grpc_reflection.v1alpha import reflection
 from concurrent.futures import ThreadPoolExecutor
 
@@ -97,8 +98,10 @@ def CallVision(self, request, context):
 
 
 def serve(config, embeddings_service, models_manager):
-    print(f"Starting gRPC server on port {config.grpc_port}")
+    logger = logging.getLogger(__name__)
+    logger.info(f"Starting gRPC server on port {config.grpc_port}")
     server = grpc.server(ThreadPoolExecutor(max_workers=10))
+    logger.info("gRPC server created")
 
     embedding_service = CalculateEmbeddingService(embeddings_service)
     llm_service = LLMService(models_manager)
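
For readers without the rest of the file, a hedged sketch of the shape serve() has here: a grpcio server running on a thread pool with reflection enabled. Service registration is elided, and the commented stub names are illustrative, not the repo's generated code:

# Sketch of a grpcio server with a thread pool and reflection; the
# servicer registration step is elided and the names are hypothetical.
import logging
from concurrent.futures import ThreadPoolExecutor

import grpc
from grpc_reflection.v1alpha import reflection

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def serve(grpc_port: int = 50051):
    server = grpc.server(ThreadPoolExecutor(max_workers=10))
    logger.info("gRPC server created")

    # Real code registers generated servicers here, e.g.:
    # service_pb2_grpc.add_LLMServiceServicer_to_server(LLMService(...), server)

    # Reflection lets tools like grpcurl list services without local protos;
    # each registered service's full name would be added to this tuple.
    service_names = (reflection.SERVICE_NAME,)
    reflection.enable_server_reflection(service_names, server)

    server.add_insecure_port(f"[::]:{grpc_port}")
    server.start()
    logger.info(f"gRPC server listening on port {grpc_port}")
    server.wait_for_termination()


if __name__ == "__main__":
    serve()
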