[MERGE] v1.4.0

phupe committed Jan 13, 2025
2 parents f75498b + ba618ee commit 1339005
Showing 223 changed files with 62,871 additions and 1,073 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG
@@ -1,3 +1,18 @@
v1.4.0
01/13/2025

NEW FEATURES
- AlphaFold3
- Bootstrap using the Docker containers available on the 4geniac Docker Hub

BUGFIXES
- Remove double quotes around boolean parameters in JSON files
- NumPy pinned to version 1.26.4 in the nanoBERT container
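  (for instance, "onlyMsas": true rather than "onlyMsas": "true" in params.json)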

DOCUMENTATION
- Detail the outputs and explain how the different annotations must be installed
- Examples of different channel patterns explained in the Nextflow code

v1.3.0
11/20/2024

43 changes: 24 additions & 19 deletions README.md
@@ -19,6 +19,7 @@ This pipeline allows:
* the prediction of protein 3D structure using various tools:
- [AFMassive](https://github.com/GBLille/AFMassive),
- [AlphaFold](https://github.com/google-deepmind/alphafold/),
- [AlphaFold3](https://github.com/google-deepmind/alphafold3/),
- [ColabFold](https://github.com/sokrypton/ColabFold),
* molecular docking of protein/ligand using:
- [DiffDock](https://github.com/gcorso/DiffDock)
@@ -42,8 +43,6 @@ Launching `main.nf` [curious_perlman] DSL2 - revision: 986ad6e9f0
( P | r | o | t | e | i | n ) ( F | o | l | d )
\_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/

------------------------------------------------------------------------

Usage:
@@ -56,51 +55,55 @@ MANDATORY ARGUMENTS, NEXTFLOW:
-profile STRING [test, singularity, cluster] Configuration profile to use. Can use multiple (comma separated).

OTHER OPTIONS, NEXTFLOW:
-params-file PATH Set the parameters of the pipeline using a JSON configuration file (i.e. 'params.json'). All parameters defined as JSON
type must be set this way. For example, the JSON can contain: "alphaFoldOptions": "--max_template=2024-01-01 --multimer". WARNING:
passing the option '--alphaFoldOptions' on the command line will throw an error when the option contains '-' or '--' characters which
are not appreciated by nextflow.

OTHER OPTIONS:
--afMassiveDatabase PATH Path to the database required by AFMassive.
--afMassiveHelp Display all the options available to run AFMassive. Use this option in combination with -profile singularity.
afMassiveOptions JSON Specific options for AFMassive. As AFMassive is an AlphaFold-like tool, standard AlphaFold options are passed
using the --alphaFoldOptions option.
--alphaFillHelp Display all the options available to run AlphaFill. Use this option in combination with -profile singularity.
--alphaFold3Database PATH Path to the database required by AlphaFold3.
--alphaFold3Help Display all the options available to run AlphaFold3. Use this option in combination with -profile singularity.
alphaFold3Options JSON Prediction model options passed to AlphaFold3.
--alphaFoldDatabase PATH Path to the database required by AlphaFold.
--alphaFoldHelp Display all the options available to run AlphaFold. Use this option in combination with -profile singularity.
alphaFoldOptions JSON Prediction model options passed to AlphaFold or AFMassive.
--colabFoldDatabase PATH Path to the database required by ColabFold.
--colabFoldHelp Display all the options available to run ColabFold. Use this option in combination with -profile singularity.
colabFoldOptions JSON Prediction model options passed to ColabFold.
--diffDockArgsYamlFile YAML Path to the YAML file with the DiffDock options.
--diffDockDatabase PATH Path to the database required by DiffDock.
--dynamicBindDatabase PATH Path to the database required by DynamicBind.
--dynamicBindHelp Display all the options available to run DynamicBind. Use this option in combination with -profile
singularity.
dynamicBindOptions JSON Prediction model options passed to DynamicBind.
--fastaPath PATH Path to the input directory which contains the fasta files.
--fromMsas PATH Path to existing multiple sequence alignments (msas) to use for the 3D protein structure prediction.
Typically the path could be the results of the pipeline launched with the --onlyMsas option.
--launchAfMassive Launch AFMassive.
--launchAlphaFill Launch AlphaFill.
--launchAlphaFold Launch AlphaFold.
--launchAlphaFold3 Launch AlphaFold3.
--launchColabFold Launch ColabFold.
--launchDiffDock Launch DiffDock.
--launchDynamicBind Launch DynamicBind.
--multimerVersions INT AlphaFold multimer model versions (v1, v2, v3) which will be evaluated by AFMassive. This parameter is taken
into account when --launchAfMassive is true. The list of the versions to be evaluated must be provided as a
comma-separated string, e.g. 'v1,v2'. Default is 'v1,v2,v3'.
--numberOfModels INT Number of models that will be evaluated by AFMassive. This parameter is taken into account when
--launchAfMassive is true.
--onlyMsas When true, the pipeline will only generate the multiple sequence alignments (msas).
--outDir PATH The output directory where the results will be saved.
--predictionsPerModel INT Number of predictions per model which will be evaluated by AFMassive. This parameter is taken into account
when --launchAfMassive is true.
--proteinLigandFile PATH Path to the input file for molecular docking. The file must be in CSV format, without spaces. One column named
'protein' contains the path to the 'pdb' file and one column named 'ligand' must contain the path to the
'sdf' file.
--useGpu Run the prediction model on GPU. AlphaFold and AFMassive can run either on CPU or GPU. ColabFold and
DynamicBind require GPU only.

REFERENCES:
@@ -111,12 +114,14 @@ Available Profiles
-profile test Run the test dataset
-profile singularity Use the Singularity images for each process. Use `--singularityPath` to define the installation path
-profile cluster Run the workflow on the cluster, instead of locally
------------------------------------------------------------------------


```
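As an illustration of the `--proteinLigandFile` format described in the help text above, the CSV might look like the following sketch (the paths are hypothetical):

```
protein,ligand
data/receptor1.pdb,data/ligand1.sdf
data/receptor2.pdb,data/ligand2.sdf
```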
## Quick run
The pipeline can be run on any infrastructure. The use of GPU is preferred (and even required for some tools) to speed up computation.
### Run the pipeline on a test dataset
@@ -149,7 +154,7 @@ For example, to launch the pipeline on a computing cluster with SLURM:
```bash
echo "#! /bin/bash" > launcher.sh
echo "set -oue pipefail" >> launcher.sh
echo "nextflow run main.nf --fastaPath=\"test/data\" --alphaFoldOptions \"max_template_date=2024-01-01|random_seed=654321\" --outDir MY_OUTPUT_DIR -profile singularity,cluster" >> launcher.sh
echo nextflow run main.nf --fastaPath="test/data" -params-file params.json --outDir MY_OUTPUT_DIR -profile singularity,cluster >> launcher.sh
sbatch launcher.sh
```
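For reference, a minimal `params.json` for the launcher above might look like the following sketch (the values are illustrative, reusing the option format given in the help text; note that booleans must be written as bare JSON booleans, not quoted strings):

```json
{
  "alphaFoldOptions": "--max_template=2024-01-01 --multimer",
  "useGpu": true
}
```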
77 changes: 77 additions & 0 deletions bin/ap_format_ranking_alphafold3.py
@@ -0,0 +1,77 @@
#!/usr/bin/env python

import csv
import os
import shutil

from absl import app
from absl import flags


FLAGS = flags.FLAGS

flags.DEFINE_string(
    'input_file', None,
    'Path to the input ranking file.')

flags.DEFINE_string(
    'output_file', None,
    'Path to the output ordered ranking file.')

flags.DEFINE_string(
    'cif_dir', None,
    'Path where to copy the cif files.')

flags.mark_flag_as_required('input_file')
flags.mark_flag_as_required('output_file')
flags.mark_flag_as_required('cif_dir')


def order_ranking_scores_af3(ranking_scores):
    with open(ranking_scores, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)  # use DictReader to work with named columns
        # Sort rows by 'ranking_score' in descending order
        sorted_rows = sorted(csv_reader, key=lambda row: float(row['ranking_score']), reverse=True)
    # Record the rank and the model name (seed/sample pair) of each prediction
    rank = 0
    for row in sorted_rows:
        row['model'] = f"seed-{row['seed']}_sample-{row['sample']}"
        row['rank'] = rank
        rank = rank + 1

    return sorted_rows


def write_order_ranking_scores_af3(ordered_ranking_scores, ordered_ranking_scores_file):
    new_column_order = ['rank', 'model', 'seed', 'sample', 'ranking_score']
    with open(ordered_ranking_scores_file, mode='w', newline='', encoding='utf-8') as outfile:
        # Create a DictWriter object with tab as the delimiter
        writer = csv.DictWriter(outfile, fieldnames=new_column_order, delimiter='\t')

        # Write the header row to the output file
        writer.writeheader()

        # Write rows from the DictReader to the output file
        for row in ordered_ranking_scores:
            writer.writerow({key: row[key] for key in new_column_order})


def main(argv):
    """
    Order the AlphaFold3 ranking scores and copy the cif files renamed by rank.

    These functions are a combination of the MassiveFold team's work and the following
    scripts from the ColabFold repository and the DeepMind colab notebook:
    https://github.com/sokrypton/ColabFold/blob/main/colabfold/plot.py
    https://github.com/sokrypton/ColabFold/blob/main/colabfold/colabfold.py
    https://colab.research.google.com/github/deepmind/alphafold/blob/main/notebooks/AlphaFold.ipynb
    """
    del argv  # Unused.
    FLAGS.input_file = os.path.realpath(FLAGS.input_file)
    FLAGS.output_file = os.path.realpath(FLAGS.output_file)
    FLAGS.cif_dir = os.path.realpath(FLAGS.cif_dir)

    sorted_scores = order_ranking_scores_af3(FLAGS.input_file)
    write_order_ranking_scores_af3(sorted_scores, FLAGS.output_file)

    # Copy each model.cif next to the ordered ranking, renamed after its rank
    for row in sorted_scores:
        cif_src = f"{os.path.dirname(FLAGS.input_file)}/{row['model']}/model.cif"
        cif_dest = f"{FLAGS.cif_dir}/ranked_{row['rank']}.cif"
        shutil.copy(cif_src, cif_dest)


if __name__ == "__main__":
    app.run(main)
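A hypothetical invocation of this script, assuming an AlphaFold3 output layout where `ranking_scores.csv` sits next to the per-prediction `seed-*_sample-*` directories (all paths are illustrative):

```bash
python bin/ap_format_ranking_alphafold3.py \
  --input_file results/jobname/ranking_scores.csv \
  --output_file results/jobname/ranking_ordered.tsv \
  --cif_dir results/jobname
```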
79 changes: 79 additions & 0 deletions bin/ap_json_checker.py
@@ -0,0 +1,79 @@
#! /usr/bin/env python

"""Check the format of the JSON file required by AlphaFold3."""
import json
import re
import sys

from absl import app, flags

FLAGS = flags.FLAGS

flags.DEFINE_string('json', None, 'Path to the JSON file')

flags.mark_flag_as_required('json')


def is_valid_file(file_path):
    """
    This function checks that:
    - the file exists
    - it has the extension .json
    - the JSON is correctly formatted
    - the file name contains only allowed characters
    """
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
            file_basename = file.name.split('/')[-1]
    except json.JSONDecodeError:
        print("ERROR: JSON file is not correctly formatted.")
        sys.exit(1)
    except FileNotFoundError:
        print("ERROR: File not found.")
        sys.exit(1)

    # Check if the file has a .json extension
    if file_basename.endswith('.json'):
        print("OK: The file has a .json extension.")
    else:
        print("ERROR: The file does not have a .json extension.")
        sys.exit(1)

    # Find all characters outside the allowed set [a-zA-Z0-9_.-]
    invalid_chars = re.findall(r'[^a-zA-Z0-9_.-]', file_basename)

    if invalid_chars:
        print(f"ERROR: Invalid characters found: \"{', '.join(invalid_chars)}\"")
        sys.exit(1)

    return [data, file_basename]


def check_and_update_json(data, file_basename):
    """
    This function checks that the 'name' field is present and that its value
    matches the file name without its extension.
    """
    new_name = file_basename.rsplit('.', 1)[0]
    if 'name' not in data:
        print("ERROR: 'name' field is missing in the JSON file.")
        sys.exit(1)
    if data['name'] != new_name:
        print(f"ERROR: The content of the 'name' field in the JSON file must be set to '{new_name}' which is the name of the JSON file without its extension.")
        sys.exit(1)


def main(argv):
    del argv  # Unused.
    [data, file_basename] = is_valid_file(FLAGS.json)
    check_and_update_json(data, file_basename)


if __name__ == "__main__":
    app.run(main)
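A hypothetical run of this checker (the file name and content are illustrative; a real AlphaFold3 input requires more fields, such as the sequences):

```bash
cat > my_target.json <<'EOF'
{"name": "my_target"}
EOF
python bin/ap_json_checker.py --json my_target.json
```

Since the file is named `my_target.json`, the checker requires the 'name' field to be exactly 'my_target'.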
