Merge pull request #1 from frapercan/dev

Dev
CBBIO · Jan 9, 2025 · ca36449 · ca36449
2 parents d49fdf1 + 444d258
commit ca36449
Show file tree

Hide file tree

Showing 20 changed files with 1,020 additions and 234 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,30 @@
+# Base image: NVIDIA CUDA 12.6.1 (base flavour) on Ubuntu 24.04
+FROM nvidia/cuda:12.6.1-base-ubuntu24.04
+
+# Install the required system packages (Python 3 with pip and venv, CD-HIT,
+# PostgreSQL client and contrib), then clean the apt cache and package lists
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    python3-venv \
+    cd-hit \
+    postgresql-client-16 \
+    postgresql-contrib \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Create a virtual environment for Python under /opt/venv
+RUN python3 -m venv /opt/venv
+
+# Upgrade pip and install the Python package using the virtual environment's
+# own pip (the environment is not activated; its pip is called by full path)
+RUN /opt/venv/bin/pip install --upgrade pip \
+    && /opt/venv/bin/pip install protein-metamorphisms-is --no-cache-dir
+
+
+# Put the virtual environment first on PATH so its executables are found first
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Copy the build context into /app and make it the working directory
+COPY . /app
+WORKDIR /app
+
+# Entry point: the FANTASIA command handler, run as a module. With no arguments
+# it idles and keeps the container running; pass "initialize" or "run" to
+# execute a command.
+ENTRYPOINT ["python3", "-m", "FANTASIA.main"]
+
diff --git a/FANTASIA/README.md b/FANTASIA/README.md
@@ -0,0 +1,78 @@
+
+---
+
+# FANTASIA
+
+![FANTASIA Logo](img/FANTASIA_logo.png)
+
+FANTASIA (Functional ANnoTAtion based on embedding space SImilArity) is a pipeline for annotating Gene Ontology (GO) terms for protein sequences using advanced protein language models like **ProtT5**, **ProstT5**, and **ESM2**. This system automates complex workflows, from sequence processing to functional annotation, providing a scalable and efficient solution for protein structure and functionality analysis.
+
+---
+
+## Key Features
+
+- **Redundancy Filtering**: Removes identical sequences with **CD-HIT** and optionally excludes sequences based on length constraints.
+- **Embedding Generation**: Utilizes state-of-the-art models for protein sequence embeddings.
+- **GO Term Lookup**: Matches embeddings with a vector database to retrieve associated GO terms.
+- **Results**: Outputs annotations in timestamped CSV files for reproducibility.
+
+---
+
+## Installation
+
+To install FANTASIA, ensure you have Python 3.8+ installed and run the following command:
+
+```bash
+pip install protein-metamorphisms-is
+```
+
+---
+
+## Quick Start
+
+### Prerequisites
+
+Ensure the **Information System** is properly configured before running FANTASIA. Detailed instructions are available in the [project documentation](../../../README.md).
+
+### Running the Pipeline
+
+Execute the following command, specifying the path to the configuration file:
+
+```bash
+python -m FANTASIA.main run --config <path_to_config.yaml>
+```
+
+### Pipeline Overview
+
+1. **Redundancy Filtering**: Removes identical sequences and optionally filters sequences based on length.
+2. **Embedding Generation**: Computes embeddings for sequences using supported models and stores them in HDF5 format.
+3. **GO Term Lookup**: Queries a vector database to find and annotate similar proteins.
+4. **Output**: Saves annotations in a structured CSV file.
+
+---
+
+## Documentation
+
+For complete details on pipeline configuration, parameters, and deployment, visit the [FANTASIA Documentation](https://protein-metamorphisms-is.readthedocs.io/en/latest/pipelines/fantasia.html).
+
+---
+
+## Citation
+
+If you use FANTASIA in your work, please cite the following:
+
+1. Martínez-Redondo, G. I., Barrios, I., Vázquez-Valls, M., Rojas, A. M., & Fernández, R. (2024). Illuminating the functional landscape of the dark proteome across the Animal Tree of Life.  
+   https://doi.org/10.1101/2024.02.28.582465.
+
+2. Barrios-Núñez, I., Martínez-Redondo, G. I., Medina-Burgos, P., Cases, I., Fernández, R. & Rojas, A.M. (2024). Decoding proteome functional information in model organisms using protein language models.  
+   https://doi.org/10.1101/2024.02.14.580341.
+
+---
+
+## Contact Information
+
+- Francisco Miguel Pérez Canales: [email protected]  
+- Gemma I. Martínez-Redondo: [email protected]  
+- Ana M. Rojas: [email protected]  
+- Rosa Fernández: [email protected]  
+
diff --git a/python_poetry_template/__init__.py → FANTASIA/__init__.py b/python_poetry_template/__init__.py → FANTASIA/__init__.py
diff --git a/FANTASIA/config.yaml b/FANTASIA/config.yaml
@@ -0,0 +1,46 @@
+# System
+max_workers: 1
+
+# DB CONFIGURATION
+DB_USERNAME: usuario
+DB_PASSWORD: clave
+DB_HOST: pgvectorsql
+DB_PORT: 5432
+DB_NAME: BioData
+
+rabbitmq_host: localhost
+rabbitmq_user: guest
+rabbitmq_password: guest
+
+# Execution
+embeddings_url: "https://zenodo.org/records/14546346/files/embeddings.tar?download=1"
+embeddings_path: ~/fantasia/dumps/
+
+
+# FANTASIA
+fantasia_input_fasta: ~/fantasia/input/zinc.fasta
+fantasia_output_h5: ~/fantasia/embeddings/
+fantasia_output_csv: ~/fantasia/results/
+redundancy_file: ~/fantasia/redundancy/output.fasta
+
+fantasia_prefix: finger_zinc
+max_distance: 1.1
+length_filter: 5000
+redundancy_filter: 0.65
+
+embedding:
+  types:
+    - 1 # ESM
+    - 2 # Prost-T5
+    - 3 # Prot-T5
+  batch_size: 50 # Queue
+
+  embedding_batch_size: 10 # Model input size
+
+topgo?: True
+
+
+
+constants: "./FANTASIA/constants.yaml"
+
+
diff --git a/FANTASIA/constants.yaml b/FANTASIA/constants.yaml
@@ -0,0 +1,28 @@
+
+sequence_embedding_types:
+  - name: "ESM"
+    description: "Evolutionary Scale Modeling (ESM) embeddings are designed to capture the evolutionary information of protein sequences, utilizing deep learning to generate representations that enhance sequence analysis and prediction tasks."
+    task_name: "esm"
+    model_name: facebook/esm2_t6_8M_UR50D
+  - name: "Prost-T5"
+    description: "Prost-T5 embeddings leverage the capabilities of the T5 (Text-to-Text Transfer Transformer) model adapted for protein sequences, offering advanced sequence representation by considering both local and global sequence features."
+    task_name:  prost_t5
+    model_name: Rostlab/ProstT5
+  - name: "Prot-T5"
+    description: "Prot-T5 embeddings leverage the capabilities of the T5 (Text-to-Text Transfer Transformer) model adapted for protein sequences, offering advanced sequence representation by considering both local and global sequence features."
+    task_name: prot_t5
+    model_name: Rostlab/prot_t5_xl_uniref50
+
+
+structural_alignment_types:
+  - name: "CE-align"
+    description: "CE-align, or Combinatorial Extension, is a method for pairwise protein structure alignment. It focuses on aligning backbone atoms by identifying and extending aligned fragment pairs, offering insights into protein function and evolutionary history."
+    task_name: combinatorial_extension
+
+  - name: "US-align"
+    description: "US-align (Universal Structural alignment) is a unified protocol to compare 3D structures of different macromolecules (proteins, RNAs and DNAs) in different forms (monomers, oligomers and heterocomplexes) for both pairwise and multiple structure alignments. The core algorithm of US-align is extended from TM-align and generates optimal structural alignments by maximizing TM-score of compared structures through heuristic dynamic programming iterations. Large-scale benchmark tests showed that US-align can generate more accurate structural alignments with significantly reduced CPU time, compared to the state-of-the-art methods developed for specific structural alignment tasks. TM-score has values in (0,1] with 1 indicating an identical structure match, where a TM-score ≥0.5 (or 0.45) means the structures share the same global topology for proteins (or RNAs)."
+    task_name: universal
+
+  - name: "FATCAT"
+    description: "FATCAT (Flexible structure AlignmenT by Chaining Aligned fragment pairs allowing Twists) is an approach for flexible protein structure comparison. Protein structures are flexible and undergo structural rearrangements as part of their function. FATCAT simultaneously addresses the two major goals of flexible structure alignment: optimizing the alignment and minimizing the number of rigid-body movements (twists) around pivot points (hinges) introduced in the reference structure."
+    task_name: fatcat
diff --git a/FANTASIA/main.py b/FANTASIA/main.py
@@ -0,0 +1,76 @@
+import os
+import sys
+import time
+import yaml
+import argparse
+from datetime import datetime
+import protein_metamorphisms_is.sql.model.model  # noqa: F401
+from protein_metamorphisms_is.helpers.config.yaml import read_yaml_config
+from FANTASIA.src.helpers import download_embeddings, load_dump_to_db
+from FANTASIA.src.embedder import SequenceEmbedder
+from FANTASIA.src.lookup import EmbeddingLookUp
+
+
+def initialize(config_path):
+    # Leer la configuración
+    with open(config_path, "r") as config_file:
+        conf = yaml.safe_load(config_file)
+
+    embeddings_dir = os.path.expanduser(conf["embeddings_path"])
+    os.makedirs(embeddings_dir, exist_ok=True)
+    tar_path = os.path.join(embeddings_dir, "embeddings.tar")
+
+    # Descargar embeddings
+    print("Downloading embeddings...")
+    download_embeddings(conf["embeddings_url"], tar_path)
+
+    # Cargar el dump en la base de datos
+    print("Loading dump into the database...")
+    load_dump_to_db(tar_path, conf)
+
+
+def run_pipeline(config_path, fasta_path=None):
+    # Leer la configuración
+    conf = read_yaml_config(config_path)
+
+    # Actualizar la ruta del FASTA si se proporciona
+    if fasta_path:
+        conf["fantasia_input_fasta"] = fasta_path
+
+    # Ejecutar el pipeline de FANTASIA
+    current_date = datetime.now().strftime("%Y%m%d%H%M%S")
+    embedder = SequenceEmbedder(conf, current_date)
+    embedder.start()
+
+    lookup = EmbeddingLookUp(conf, current_date)
+    lookup.start()
+
+
+def wait_forever():
+    # Modo de espera
+    print("Container is running and waiting for commands...")
+    try:
+        while True:
+            time.sleep(3600)  # Espera indefinida
+    except KeyboardInterrupt:
+        print("Stopping container.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="FANTASIA: Command Handler")
+    parser.add_argument("command", type=str, nargs="?", default=None, help="Command to execute: initialize or run")
+    parser.add_argument("--config", type=str, default="./FANTASIA/config.yaml", help="Path to the configuration YAML file.")
+    parser.add_argument("--fasta", type=str, help="Path to the input FASTA file.")
+    args = parser.parse_args()
+
+    if args.command == "initialize":
+        print("Initializing embeddings and database...")
+        initialize(args.config)
+    elif args.command == "run":
+        print("Running the FANTASIA pipeline...")
+        run_pipeline(config_path=args.config, fasta_path=args.fasta)
+    elif args.command is None:
+        wait_forever()
+    else:
+        print(f"Unknown command: {args.command}")
+        sys.exit(1)
diff --git a/python_poetry_template/maths/__init__.py → FANTASIA/src/__init__.py b/python_poetry_template/maths/__init__.py → FANTASIA/src/__init__.py