From 8629ba0904ba2a58e9d93d774b94586776333fc3 Mon Sep 17 00:00:00 2001
From: "juan.ledesma"
Date: Thu, 6 Jun 2024 11:16:33 +0200
Subject: [PATCH 001/321] updated changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 10c775bbe..8c94a1423 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ Code contributions to the new version:
 - [Sarai Varona](https://github.com/svarona)
 - [Daniel Valle](https://github.com/Daniel-VM)
 - [Víctor López](https://github.com/victor5lm)
+- [Juan Ledesma](https://github.com/juanledesma78)
 
 ### Template fixes and updates
 
@@ -43,6 +44,7 @@ Code contributions to the new version:
 - Included annotated tab description in exome-trios markdowns [#273](https://github.com/BU-ISCIII/buisciii-tools/pull/273)
 - Installed all necessary singularity images and modified all templates so that, instead of using conda environments or loaded modules, the corresponding singularity images are used [#272](https://github.com/BU-ISCIII/buisciii-tools/pull/272)
 - Updated sarek version in exomeeb, exometrio and wgstrio templates [#277](https://github.com/BU-ISCIII/buisciii-tools/pull/277)
+- Change of extension file of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278)
 
 ### Modules

From f939f321b9430faee16bf469fe5fd970235ad256 Mon Sep 17 00:00:00 2001
From: "juan.ledesma"
Date: Thu, 6 Jun 2024 10:48:07 +0200
Subject: [PATCH 002/321] extension file of all_samples_virus_table_filtered
 changed from csv to tsv to match pikavirus result table

---
 .../templates/viralrecon/RESULTS/lablog_viralrecon_results | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results
index e71f4294f..d229644b9 100755
--- a/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results
+++ b/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results
@@ -23,7 +23,7 @@ cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/i
 ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html
 ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv
 ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv
-ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv
+ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.tsv ./pikavirus_table.tsv
 
 #conda activate viralrecon_report
 echo "python ./excel_generator.py -r ./references.tmp --merge_lineage_files" > _01_generate_excel_files.sh

From cc9aa85dc317e8660ded92cc16f2fe14ac8d981c Mon Sep 17 00:00:00 2001
From: "juan.ledesma"
Date: Thu, 6 Jun 2024 12:10:21 +0200
Subject: [PATCH 003/321] updated CHANGELOG

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8c94a1423..d30c41c22 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,7 +44,7 @@ Code contributions to the new version:
 - Included annotated tab description in exome-trios markdowns [#273](https://github.com/BU-ISCIII/buisciii-tools/pull/273)
 - Installed all necessary singularity images and modified all templates so that, instead of using conda environments or loaded modules, the corresponding singularity images are used [#272](https://github.com/BU-ISCIII/buisciii-tools/pull/272)
 - Updated sarek version in exomeeb, exometrio and wgstrio templates [#277](https://github.com/BU-ISCIII/buisciii-tools/pull/277)
-- Change of extension file of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278)
+- Extension file of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results changed [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278)
 
 ### Modules

From 9d00f74649173c4897224ed1d368298ebf379b9f Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 12:49:00 +0200
Subject: [PATCH 004/321] Python lint now only runs if .py files in PR

---
 .github/workflows/python_lint.yml | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
index d121b7f32..503f3fe1b 100644
--- a/.github/workflows/python_lint.yml
+++ b/.github/workflows/python_lint.yml
@@ -3,10 +3,10 @@ name: python_lint
 on:
   push:
     paths:
-      - '**/*.py'
+      - '**.py'
   pull_request:
     paths:
-      - '**/*.py'
+      - '**.py'
 
 jobs:
   flake8_py3:
@@ -13,23 +13,43 @@ jobs:
     runs-on: ubuntu-latest
     steps:
      - name: Setup python
        uses: actions/setup-python@v2
        with:
          python-version: 3.9.x
          architecture: x64
      - name: Checkout PyTorch
        uses: actions/checkout@v2
      - name: Install flake8
        run: pip install flake8
+     - name: Check for Python file changes
+       id: file_check
+       run: |
+         git fetch origin ${{ github.base_ref }}
+         diff_pyfiles=$(git diff --name-only origin/${{ github.base_ref }} ${{ github.head_ref }} -- '*.py')
+         echo "::set-output name=diff_pyfiles::$diff_pyfiles"
      - name: Run flake8
+       if: steps.file_check.outputs.diff_pyfiles != ''
        run: flake8 --ignore E501,W503,E203,W605
+     - name: No Python files changed
+       if: steps.file_check.outputs.diff_pyfiles == ''
+       run: echo "No Python files have been changed."
 
   black_lint:
     runs-on: ubuntu-latest
    steps:
      - name: Setup
        uses: actions/checkout@v2
      - name: Install black in jupyter
        run: pip install black[jupyter]
+     - name: Check for Python file changes
+       id: file_check
+       run: |
+         git fetch origin ${{ github.base_ref }}
+         diff_pyfiles=$(git diff --name-only origin/${{ github.base_ref }} ${{ github.head_ref }} -- '*.py')
+         echo "::set-output name=diff_pyfiles::$diff_pyfiles"
      - name: Check code lints with Black
+       if: steps.file_check.outputs.diff_pyfiles != ''
       uses: psf/black@stable
+     - name: No Python files changed
+       if: steps.file_check.outputs.diff_pyfiles == ''
+       run: echo "No Python files have been changed."
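The detection step in PATCH 004 can be dry-run locally before pushing. A minimal sketch, assuming `develop` as the base branch (substitute the PR's real base); note also that GitHub Actions has since deprecated the `::set-output` syntax used above in favour of writing to `$GITHUB_OUTPUT`:

```bash
#!/usr/bin/env bash
# Local dry run of the changed-.py-files check from the workflow above.
git fetch origin develop
diff_pyfiles=$(git diff --name-only origin/develop...HEAD -- '*.py')

if [ -n "$diff_pyfiles" ]; then
    echo "flake8 would run on: $diff_pyfiles"
else
    echo "No Python files have been changed."
fi

# On current runners the deprecated ::set-output call would instead be:
# echo "diff_pyfiles=${diff_pyfiles}" >> "$GITHUB_OUTPUT"
```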
From 9a7b8dd018c738155c9f92ff6618cf3f9d741a6d Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 12:53:38 +0200
Subject: [PATCH 005/321] Updated CHANGELOG

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d30c41c22..baf416343 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -58,6 +58,8 @@ Code contributions to the new version:
 
 #### Changed
 
+- Forcing python lint to success if no .py files are in PR [#279](https://github.com/BU-ISCIII/buisciii-tools/pull/279)
+
 #### Removed
 
 ### Requirements

From 11892bb8accd3c3b866b4e9063fb5dd552230318 Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 12:59:13 +0200
Subject: [PATCH 006/321] Removed restriction to pyfiles in lint workflow

---
 .github/workflows/python_lint.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
index 503f3fe1b..34ce3efa8 100644
--- a/.github/workflows/python_lint.yml
+++ b/.github/workflows/python_lint.yml
@@ -2,11 +2,10 @@ name: python_lint
 
 on:
   push:
-    paths:
-      - '**.py'
+    branches: "**"
   pull_request:
-    paths:
-      - '**.py'
+    types: [opened, reopened, synchronize, closed]
+    branches: "**"
 
 jobs:
   flake8_py3:

From 9117bf167fc4f3ce6245e2b287ba3070127318fb Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 13:10:12 +0200
Subject: [PATCH 007/321] Updated solution to stalled lint

---
 .github/workflows/python_lint.yml | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
index 34ce3efa8..df75b052a 100644
--- a/.github/workflows/python_lint.yml
+++ b/.github/workflows/python_lint.yml
@@ -17,38 +17,34 @@ jobs:
          python-version: 3.9.x
          architecture: x64
      - name: Checkout PyTorch
-       uses: actions/checkout@v2
+       uses: actions/checkout@v3
+       fetch-depth: 0
      - name: Install flake8
        run: pip install flake8
      - name: Check for Python file changes
        id: file_check
-       run: |
-         git fetch origin ${{ github.base_ref }}
-         diff_pyfiles=$(git diff --name-only origin/${{ github.base_ref }} ${{ github.head_ref }} -- '*.py')
-         echo "::set-output name=diff_pyfiles::$diff_pyfiles"
+       uses: tj-actions/changed-files@v33
      - name: Run flake8
-       if: steps.file_check.outputs.diff_pyfiles != ''
+       if: steps.file_check.outputs.any_changed == 'true'
        run: flake8 --ignore E501,W503,E203,W605
      - name: No Python files changed
-       if: steps.file_check.outputs.diff_pyfiles == ''
+       if: steps.file_check.outputs.any_changed != 'true'
        run: echo "No Python files have been changed."
 
   black_lint:
     runs-on: ubuntu-latest
    steps:
      - name: Setup
-       uses: actions/checkout@v2
+       uses: actions/checkout@v3
+       fetch-depth: 0
      - name: Install black in jupyter
        run: pip install black[jupyter]
      - name: Check for Python file changes
        id: file_check
-       run: |
-         git fetch origin ${{ github.base_ref }}
-         diff_pyfiles=$(git diff --name-only origin/${{ github.base_ref }} ${{ github.head_ref }} -- '*.py')
-         echo "::set-output name=diff_pyfiles::$diff_pyfiles"
+       uses: tj-actions/changed-files@v34
      - name: Check code lints with Black
-       if: steps.file_check.outputs.diff_pyfiles != ''
+       if: steps.file_check.outputs.any_changed == 'true'
       uses: psf/black@stable
      - name: No Python files changed
-       if: steps.file_check.outputs.diff_pyfiles == ''
+       if: steps.file_check.outputs.any_changed != 'true'
       run: echo "No Python files have been changed."
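The "stalled lint" that the next several commits chase is the classic shallow-clone problem: `actions/checkout` fetches a single commit by default, so any history-based diff has no base to compare against, which is why `fetch-depth: 0` keeps reappearing. A rough local illustration of the failure mode (the clone URL is the real repository; everything else is just a demo):

```bash
#!/usr/bin/env bash
# A depth-1 clone has no parent commits, so history-based diffs fail.
git clone --depth 1 https://github.com/BU-ISCIII/buisciii-tools.git demo && cd demo
git diff --name-only HEAD~1 -- '*.py'   # fatal: ambiguous argument 'HEAD~1': unknown revision

git fetch --unshallow                   # same effect as fetch-depth: 0 in actions/checkout
git diff --name-only HEAD~1 -- '*.py'   # now succeeds
```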
From 617ff6ad4aeb72f4932f54bbbbeba2b78342a4e3 Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 13:11:53 +0200
Subject: [PATCH 008/321] Updated solution to stalled lint. test2

---
 .github/workflows/python_lint.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
index df75b052a..64645e0b6 100644
--- a/.github/workflows/python_lint.yml
+++ b/.github/workflows/python_lint.yml
@@ -18,7 +18,6 @@ jobs:
          architecture: x64
      - name: Checkout PyTorch
        uses: actions/checkout@v3
-       fetch-depth: 0
      - name: Install flake8
        run: pip install flake8
      - name: Check for Python file changes
@@ -36,7 +35,6 @@ jobs:
    steps:
      - name: Setup
        uses: actions/checkout@v3
-       fetch-depth: 0
      - name: Install black in jupyter
        run: pip install black[jupyter]
      - name: Check for Python file changes

From 8e8d1c8a313ded5d761ef15e428ede7f162b99e9 Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 13:14:30 +0200
Subject: [PATCH 009/321] Updated solution to stalled lint. fetch-depth 0

---
 .github/workflows/python_lint.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
index 64645e0b6..5aa19c41b 100644
--- a/.github/workflows/python_lint.yml
+++ b/.github/workflows/python_lint.yml
@@ -17,7 +17,8 @@ jobs:
          python-version: 3.9.x
          architecture: x64
      - name: Checkout PyTorch
-       uses: actions/checkout@v3
+       uses: actions/checkout@v2
+       fetch-depth: 0
      - name: Install flake8
        run: pip install flake8
      - name: Check for Python file changes
@@ -34,7 +35,8 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Setup
-       uses: actions/checkout@v3
+       uses: actions/checkout@v2
+       fetch-depth: 0
      - name: Install black in jupyter
        run: pip install black[jupyter]
      - name: Check for Python file changes

From 0c7f8a9708e2ed32915805049ac713221582cf34 Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 13:16:26 +0200
Subject: [PATCH 010/321] Updated solution to stalled lint. fixed1

---
 .github/workflows/python_lint.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
index 5aa19c41b..eb7625289 100644
--- a/.github/workflows/python_lint.yml
+++ b/.github/workflows/python_lint.yml
@@ -18,6 +18,7 @@ jobs:
          architecture: x64
      - name: Checkout PyTorch
        uses: actions/checkout@v2
+       with:
        fetch-depth: 0
      - name: Install flake8
        run: pip install flake8
@@ -37,6 +37,7 @@ jobs:
    steps:
      - name: Setup
        uses: actions/checkout@v2
+       with:
        fetch-depth: 0
      - name: Install black in jupyter
        run: pip install black[jupyter]

From 08e1fcaaac44b480a40d069f5ed6cbed7dc4afef Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 13:19:46 +0200
Subject: [PATCH 011/321] Python linting test

---
 bu_isciii/scratch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bu_isciii/scratch.py b/bu_isciii/scratch.py
index 5f04ccd93..5a0bbeae2 100755
--- a/bu_isciii/scratch.py
+++ b/bu_isciii/scratch.py
@@ -251,7 +251,7 @@ def remove_scratch(self):
                 stderr.print(
                     "[red]ERROR: Directory "
                     + scratch_folder
-                    + " not the same as "
+                    + " is not the same as "
                     + self.scratch_tmp_path,
                     highlight=False,
                 )

From f5f029d6d6b2cdf792183895ef3edf1b33509870 Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 13:26:54 +0200
Subject: [PATCH 012/321] Updated solution to stalled lint. fixed2

---
 .github/workflows/python_lint.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
index eb7625289..aba4945fc 100644
--- a/.github/workflows/python_lint.yml
+++ b/.github/workflows/python_lint.yml
@@ -25,6 +25,10 @@ jobs:
      - name: Check for Python file changes
        id: file_check
        uses: tj-actions/changed-files@v33
+       with:
+         since_last_remote_commit: true
+         files: |
+           **.py
      - name: Run flake8
        if: steps.file_check.outputs.any_changed == 'true'
        run: flake8 --ignore E501,W503,E203,W605
@@ -44,6 +48,10 @@ jobs:
      - name: Check for Python file changes
        id: file_check
        uses: tj-actions/changed-files@v34
+       with:
+         since_last_remote_commit: true
+         files: |
+           **.py
      - name: Check code lints with Black
        if: steps.file_check.outputs.any_changed == 'true'
       uses: psf/black@stable

From 23b173c6aa0b50de88ee2c7196fbe7f2cde351e2 Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 13:28:24 +0200
Subject: [PATCH 013/321] python linting test2

---
 bu_isciii/scratch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bu_isciii/scratch.py b/bu_isciii/scratch.py
index 5a0bbeae2..d36dd8655 100755
--- a/bu_isciii/scratch.py
+++ b/bu_isciii/scratch.py
@@ -246,7 +246,7 @@ def remove_scratch(self):
             )
         else:
             log.error(
-                f"Directory path not the same as service resolution. Skip folder copy '{scratch_folder}'"
+                f"Directory path is not the same as service resolution. Skip folder copy '{scratch_folder}'"
             )
             stderr.print(
                 "[red]ERROR: Directory "

From 38c06c06aa65aea3d826ffadc49bdca935cf6bb2 Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 13:42:54 +0200
Subject: [PATCH 014/321] python linting test3

---
 .github/workflows/python_lint.yml | 4 ++--
 bu_isciii/scratch.py              | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
index aba4945fc..f3bdcd225 100644
--- a/.github/workflows/python_lint.yml
+++ b/.github/workflows/python_lint.yml
@@ -26,7 +26,7 @@ jobs:
        id: file_check
        uses: tj-actions/changed-files@v33
        with:
-         since_last_remote_commit: true
+         sha: ${{ github.event.pull_request.head.sha }}
         files: |
           **.py
      - name: Run flake8
@@ -49,7 +49,7 @@ jobs:
        id: file_check
        uses: tj-actions/changed-files@v34
        with:
-         since_last_remote_commit: true
+         sha: ${{ github.event.pull_request.head.sha }}
         files: |
           **.py
      - name: Check code lints with Black
diff --git a/bu_isciii/scratch.py b/bu_isciii/scratch.py
index d36dd8655..d6a420e1b 100755
--- a/bu_isciii/scratch.py
+++ b/bu_isciii/scratch.py
@@ -241,7 +241,7 @@ def remove_scratch(self):
             if self.service_folder in scratch_folder:
                 shutil.rmtree(scratch_folder)
                 stderr.print(
-                    "[green]Successfully removed the directory %s" % scratch_folder,
+                    "[green]Successfully removed directory %s" % scratch_folder,
                     highlight=False,
                 )
             else:

From b74549495a4215f45cbbc096698cadc1a3f37880 Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 13:51:42 +0200
Subject: [PATCH 015/321] Updated diff-file checker version

---
 .github/workflows/python_lint.yml | 7 +++----
 bu_isciii/scratch.py              | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
index f3bdcd225..b373db7a6 100644
--- a/.github/workflows/python_lint.yml
+++ b/.github/workflows/python_lint.yml
@@ -24,7 +24,7 @@ jobs:
      - name: Check for Python file changes
        id: file_check
-       uses: tj-actions/changed-files@v33
+       uses: tj-actions/changed-files@v44
       with:
         sha: ${{ github.event.pull_request.head.sha }}
         files: |
           **.py
      - name: Run flake8
@@ -47,10 +47,9 @@ jobs:
      - name: Check for Python file changes
        id: file_check
-       uses: tj-actions/changed-files@v34
+       uses: tj-actions/changed-files@v44
        with:
          sha: ${{ github.event.pull_request.head.sha }}
-         files: |
-           **.py
+         files: '**.py'
      - name: Check code lints with Black
        if: steps.file_check.outputs.any_changed == 'true'
       uses: psf/black@stable
diff --git a/bu_isciii/scratch.py b/bu_isciii/scratch.py
index d6a420e1b..9f2a9984f 100755
--- a/bu_isciii/scratch.py
+++ b/bu_isciii/scratch.py
@@ -201,7 +201,7 @@ def revert_copy_scratch(self):
         except Exception as e:
             stderr.print(e)
             stderr.print(
-                "[red]ERROR: Copy of the directory %s failed"
+                "[red]ERROR: Copy of directory %s failed"
                 % self.scratch_tmp_path,
                 highlight=False,
             )

From b632b0225b5efbe095b4c2f19540e7d023519226 Mon Sep 17 00:00:00 2001
From: Shettland
Date: Thu, 6 Jun 2024 13:54:37 +0200
Subject: [PATCH 016/321] Test commit without py files

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9d597a7cc..122bb0708 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,7 @@ Options:
   -u, --api_user TEXT      User for the API logging
   -p, --api_password TEXT  Password for the API logging
   -c, --cred_file TEXT     Config file with API logging credentials
-  --help                   Show this message and exit.
+  --help                   Show this message and exit
 
 Commands:
   list        List available bu-isciii services.

From 342067a95c9e334c08e9e9dc0a4bf0dc91db77cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sara=20Monz=C3=B3n?=
Date: Tue, 4 Jun 2024 16:33:13 +0200
Subject: [PATCH 017/321] added versions dep to environment.yml file

---
 environment.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/environment.yml b/environment.yml
index f21d9fe78..cdad4196c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,5 +1,7 @@
 channels:
 - conda-forge
 dependencies:
-- wkhtmltopdf
-- tree
+- wkhtmltopdf>=0.12.4
+- tree>=2.0.2
+- pip>22.0.2
+- python>3.9

From 2c1be53f9975ecfb9eb82b85a347581ab592a38d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sara=20Monz=C3=B3n?=
Date: Tue, 4 Jun 2024 16:33:59 +0200
Subject: [PATCH 018/321] removed path for loading wkhtmlpdf, and caught error
 when executable does not exist

---
 bu_isciii/bioinfo_doc.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py
index b4d0ab4ea..e863fce4e 100755
--- a/bu_isciii/bioinfo_doc.py
+++ b/bu_isciii/bioinfo_doc.py
@@ -185,8 +185,13 @@ def __init__(
         )
         self.samples = self.resolution_info.get("samples", None)
         self.handled_services = None
-        path_to_wkhtmltopdf = os.path.normpath(self.conf["wkhtmltopdf_path"])
-        self.config_pdfkit = pdfkit.configuration(wkhtmltopdf=path_to_wkhtmltopdf)
+        try:
+            self.config_pdfkit = pdfkit.configuration()
+        except OSError as e:
+            stderr.print("[red] wkhtmlpdf executable was not found. Install it using conda environment.")
+            stderr.print(f"[red] Error: {e}")
+            sys.exit()
+
         if self.type == "service_info":
             self.template_file = self.conf["service_info_template_path_file"]
         else:
@@ -416,6 +421,7 @@ def convert_to_pdf(self, html_file):
             )
         except OSError as e:
             stderr.print("[red] Unable to convert to PDF")
+            stderr.print(f"[red] Error: {e}")
             log.exception("Unable to create pdf.", exc_info=e)
         return

From 3be8b0b0d96fe1f2f1be97fd288f299f7fcf55e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sara=20Monz=C3=B3n?=
Date: Tue, 4 Jun 2024 16:34:34 +0200
Subject: [PATCH 019/321] removed wkhtmlpdf path from configs

---
 bu_isciii/conf/configuration.json     | 1 -
 bu_isciii/conf/configuration_dev.json | 1 -
 2 files changed, 2 deletions(-)

diff --git a/bu_isciii/conf/configuration.json b/bu_isciii/conf/configuration.json
index d133d3a50..d4e4f86ea 100755
--- a/bu_isciii/conf/configuration.json
+++ b/bu_isciii/conf/configuration.json
@@ -36,7 +36,6 @@
     "delivery_template_path_file": "templates/jinja_template_delivery.j2",
     "html_template_path_file": "templates/html_service_template.html",
     "path_to_css": "assets/css",
-    "wkhtmltopdf_path": "/data/bi/pipelines/miniconda3/envs/buisciii-tools/bin/wkhtmltopdf",
     "email_host": "mx2.isciii.es",
     "email_port": "587",
     "email_host_user": "bioinformatica@isciii.es",
diff --git a/bu_isciii/conf/configuration_dev.json b/bu_isciii/conf/configuration_dev.json
index 08d1fd262..68e948cb5 100755
--- a/bu_isciii/conf/configuration_dev.json
+++ b/bu_isciii/conf/configuration_dev.json
@@ -36,7 +36,6 @@
     "delivery_template_path_file": "templates/jinja_template_delivery.j2",
     "html_template_path_file": "templates/html_service_template.html",
     "path_to_css": "assets/css",
-    "wkhtmltopdf_path": "/data/bi/pipelines/miniconda3/envs/buisciii-tools/bin/wkhtmltopdf",
     "email_host": "mx2.isciii.es",
     "email_port": "587",
     "email_host_user": "bioinformatica@isciii.es",

From 662c30aead3b176eaf93792f744a253d1472cfe4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sara=20Monz=C3=B3n?=
Date: Tue, 4 Jun 2024 16:40:15 +0200
Subject: [PATCH 020/321] clarified readme install instructions

---
 README.md | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 122bb0708..138380a1a 100644
--- a/README.md
+++ b/README.md
@@ -7,9 +7,8 @@
 - [buisciii-tools](#buisciii-tools)
   - [Installation](#installation)
-    - [Bioconda](#bioconda)
-    - [Pip](#pip)
-    - [Development version](#development-version)
+    - [Micromamba and pip](#micromamba-and-pip)
+    - [Dev version](#dev-version)
   - [Usage](#usage)
     - [Command-line](#command-line)
       - [list](#list)
@@ -26,28 +25,39 @@
 
 ## Installation
 
-### Bioconda
+### Micromamba and pip
 
 ```bash
-conda create -n buisciii-tools pip
-conda activate
-conda env update --file environment.yml
+micromamba create -n buisciii -f environment.yml
+micromamba activate buisciii
+pip install --force-reinstall --upgrade git+https://github.com/bu-isciii/buisciii-tools.git@main
 ```
 
-### Pip
+or
 
 ```bash
+git checkout main
+conda create -n buisciii -f environment.yml
 conda activate
 pip install .
 ```
 
-### Development version
+### Dev version
 
 If you want to install the latest code in the repository:
 
 ```bash
-conda create -n buisciii_dev pip
+micromamba create -n buisciii_dev -f environment.yml
+micromamba activate buisciii_dev
 pip install --force-reinstall --upgrade git+https://github.com/bu-isciii/buisciii-tools.git@develop
 ```
 
+or locally:
+
+```bash
+git checkout develop
+micromamba create -n buisciii_dev -f environment.yml
+micromamba activate buisciii_dev
+pip install .
+```
+
 ## Usage
 
 ### Command-line

From dac62fa0bed78ce21c87f25090314b172641a32f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sara=20Monz=C3=B3n?=
Date: Tue, 4 Jun 2024 16:45:05 +0200
Subject: [PATCH 021/321] updated changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index baf416343..95f6bbda5 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -51,6 +51,7 @@ Code contributions to the new version:
 #### Added enhancements
 
 - PR [#274](https://github.com/BU-ISCIII/buisciii-tools/pull/274): added `--dev` option, configuration dev and test folder structure.
+- PR [#276](https://github.com/BU-ISCIII/buisciii-tools/pull/276): wkhtmlpdf does not need absolute path to executable. Added better error handling when executable does not exist.
 
 #### Fixes

From ba47621fc5aa1774b831655e26f3984a1e7f0fd2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sara=20Monz=C3=B3n?=
Date: Tue, 4 Jun 2024 16:45:31 +0200
Subject: [PATCH 022/321] linting

---
 bu_isciii/bioinfo_doc.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py
index e863fce4e..b598f344b 100755
--- a/bu_isciii/bioinfo_doc.py
+++ b/bu_isciii/bioinfo_doc.py
@@ -188,7 +188,9 @@ def __init__(
         try:
             self.config_pdfkit = pdfkit.configuration()
         except OSError as e:
-            stderr.print("[red] wkhtmlpdf executable was not found. Install it using conda environment.")
+            stderr.print(
+                "[red] wkhtmlpdf executable was not found. Install it using conda environment."
+            )
             stderr.print(f"[red] Error: {e}")
             sys.exit()

From f67600e8f0806409074b3b49376ef3638d7ae940 Mon Sep 17 00:00:00 2001
From: jaimeozaez <135366362+jaimeozaez@users.noreply.github.com>
Date: Fri, 7 Jun 2024 11:50:34 +0200
Subject: [PATCH 023/321] Fixed clean module (#280)

* services.json modified in order to properly delete trimmed fastq files in assembly annotation
* self.service_samples modified in order to properly get list of service samples
* assembly files to delete modified in services.json
* services.json updated with mtbseq service info
* Modified purge_files function for avoiding repeated files in files_to_delete list
* Modified purge_files function for avoiding repeated files in files_to_delete list
* Relocated self.rename() method usage for _DEL renaming of purged folders. Now it is called when self.option = rename_nocopy
* delete_rename() function renamed to just delete()
* self.option rename_nocopy renamed to rename
* Reverted last commits
* Reverted some weird changes...
* Relocated self.rename() method usage for _DEL renaming of purged folders
* Renamed self.delete_rename() method. Now it is self.delete()
* rename_nocopy option modified to just rename
* files to delete added to services.json for rnaseq service
* Updated CHANGELOG
* fixed linting
* Fixed linting
---
 CHANGELOG.md                      |  1 +
 bu_isciii/__main__.py             |  6 +++---
 bu_isciii/clean.py                | 30 ++++++++++++++++--------------
 bu_isciii/templates/services.json | 14 +++++++-------
 4 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95f6bbda5..03484899d 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -56,6 +56,7 @@ Code contributions to the new version:
 #### Fixes
 
 - Fixed archive module. Updated correct header for scout tsv [#258](https://github.com/BU-ISCIII/buisciii-tools/pull/258).
+- Fixed clean module. Corrected purge_files function. Renaming stage moved from clean to rename_nocopy option. Updated services.json file with correct paths for some services. [#280](https://github.com/BU-ISCIII/buisciii-tools/pull/280)
 
 #### Changed

diff --git a/bu_isciii/__main__.py b/bu_isciii/__main__.py
index 3c3747612..012b3bb4a 100755
--- a/bu_isciii/__main__.py
+++ b/bu_isciii/__main__.py
@@ -307,7 +307,7 @@ def scratch(ctx, resolution, path, tmp_dir, direction, ask_path):
     type=click.Choice(
         [
             "full_clean",
-            "rename_nocopy",
+            "rename",
             "clean",
             "revert_renaming",
             "show_removable",
@@ -317,7 +317,7 @@ def scratch(ctx, resolution, path, tmp_dir, direction, ask_path):
     multiple=False,
     help=(
         "Select what to do inside the cleanning step: full_clean: delete files and folders to clean,"
-        " rename no copy and deleted folders, rename_nocopy: just rename no copy folders, clean: "
+        " rename no copy and deleted folders, rename: just rename folders, clean: "
        "delete files and folders to clean,"
@@ -447,7 +447,7 @@ def finish(ctx, resolution, path, ask_path, sftp_folder, tmp_dir):
         resolution,
         path,
         ask_path,
-        "rename_nocopy",
+        "rename",
         ctx.obj["api_user"],
         ctx.obj["api_password"],
         ctx.obj["conf"],

diff --git a/bu_isciii/clean.py b/bu_isciii/clean.py
index aeabff8db..3dba081c0 100644
--- a/bu_isciii/clean.py
+++ b/bu_isciii/clean.py
@@ -55,7 +55,9 @@ def __init__(
         self.services_requested = self.resolution_info["resolutions"][0][
             "available_services"
         ]
-        self.service_samples = self.resolution_info["samples"]
+        self.service_samples = [
+            sample_id["sample_name"] for sample_id in self.resolution_info["samples"]
+        ]
 
         if ask_path and path is None:
             stderr.print(
@@ -94,14 +96,13 @@ def __init__(
         self.delete_files = self.get_clean_items(self.services_to_clean, type="files")
         # self.delete_list = [item for item in self.delete_list if item]
         self.nocopy = self.get_clean_items(self.services_to_clean, type="no_copy")
-        self.service_samples = self.resolution_info.get("Samples", None)
 
         if option is None:
             self.option = bu_isciii.utils.prompt_selection(
                 "Options",
                 [
                     "full_clean",
-                    "rename_nocopy",
+                    "rename",
                     "clean",
                     "revert_renaming",
                     "show_removable",
@@ -312,10 +313,9 @@ def purge_files(self):
         files_to_delete = []
         for sample_info in self.service_samples:
             for file in self.delete_files:
-                file_to_delete = file.replace(
-                    "sample_name", sample_info["sample_name"]
-                )
-                files_to_delete.append(file_to_delete)
+                file_to_delete = file.replace("sample_name", sample_info)
+                if file_to_delete not in files_to_delete:
+                    files_to_delete.append(file_to_delete)
         path_content = self.scan_dirs(to_find=files_to_delete)
         for file in path_content:
             os.remove(file)
@@ -371,7 +371,7 @@ def delete_work(self):
         else:
             stderr.print("There is no work folder here")
 
-    def delete_rename(self, verbose=True, sacredtexts=["lablog", "logs"], add="_DEL"):
+    def delete(self, verbose=True, sacredtexts=["lablog", "logs"], add="_DEL"):
         """
         Description:
             Remove both files and purge folders defined for the service, and rename to tag.
@@ -392,10 +392,8 @@ def delete(self, verbose=True, sacredtexts=["lablog", "logs"], add="_DEL"):
         # Purge folders
         if self.delete_folders != "":
             self.purge_folders(sacredtexts=sacredtexts, add=add, verbose=verbose)
-            # Rename to tag.
-            self.rename(add=add, to_find=self.delete_folders, verbose=verbose)
         else:
-            stderr.print("No folders to remove or rename")
+            stderr.print("No folders to remove")
         # Purge work
         self.delete_work()
         # Delete files
@@ -432,8 +430,10 @@ def full_clean(self):
             Perform and handle the whole cleaning of the service
         """
-        self.delete_rename()
+        self.delete()
         self.rename(to_find=self.nocopy, add="_NC", verbose=True)
+        if self.delete_folders != "":
+            self.rename(add="_DEL", to_find=self.delete_folders, verbose=True)
 
     def handle_clean(self):
         """
@@ -445,9 +445,11 @@ def handle_clean(self):
             self.show_nocopy()
         if self.option == "full_clean":
             self.full_clean()
-        if self.option == "rename_nocopy":
+        if self.option == "rename":
             self.rename(to_find=self.nocopy, add="_NC", verbose=True)
+            if self.delete_folders != "":
+                self.rename(add="_DEL", to_find=self.delete_folders, verbose=True)
         if self.option == "clean":
-            self.delete_rename()
+            self.delete()
         if self.option == "revert_renaming":
             self.revert_renaming()

diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json
index 0e8e853b8..bb88dfafb 100755
--- a/bu_isciii/templates/services.json
+++ b/bu_isciii/templates/services.json
@@ -8,8 +8,8 @@
         "end": "",
         "description": "nf-core/bacass: Simple bacterial assembly and annotation pipeline",
         "clean": {
-            "folders":["01-preprocessing/trimmed_sequences"],
-            "files":[]
+            "folders":[],
+            "files":["01-processing/fastp/sample_name_1.fastp.fastq.gz", "01-processing/fastp/sample_name_2.fastp.fastq.gz"]
         },
         "no_copy": ["RAW", "TMP", "latest"],
         "last_folder":"REFERENCES",
@@ -25,8 +25,8 @@
         "url": "https://github.com/ngs-fzb/MTBseq_source",
         "description": "Mycobacterium tuberculosis mapping, variant calling and detection of resistance using MTBseq",
         "clean": {
-            "folders":["01-preprocessing/trimmed_sequences", "Bam", "Mpileup"],
-            "files":[]
+            "folders":["Bam", "Mpileup"],
+            "files":["01-processing/fastp/sample_name_1.fastp.fastq.gz", "01-processing/fastp/sample_name_2.fastp.fastq.gz"]
         },
         "no_copy": ["RAW", "TMP"],
         "last_folder":"REFERENCES",
@@ -42,8 +42,8 @@
         "url": "https://github.com/ngs-fzb/MTBseq_source",
         "description": "Mycobacterium tuberculosis mapping, variant calling and detection of resistance using MTBseq",
         "clean": {
-            "folders":["01-preprocessing", "Bam", "Mpileup"],
-            "files":[]
+            "folders":["Bam", "Mpileup"],
+            "files":["01-processing/fastp/sample_name_1.fastp.fastq.gz", "01-processing/fastp/sample_name_2.fastp.fastq.gz"]
         },
         "no_copy": ["RAW", "TMP"],
         "last_folder":"REFERENCES",
@@ -146,7 +146,7 @@
         "description": "RNA-seq analysis",
         "clean": {
             "folders":[],
-            "files":[]
+            "files":["star_salmon/sample_name.Aligned.out.bam", "star_salmon/sample_name.Aligned.toTranscriptome.out.bam"]
         },
         "no_copy": ["RAW", "TMP"],
         "last_folder":"RESULTS",

From 5af33bb192d8f432c610cce778557a6524c60a35 Mon Sep 17 00:00:00 2001
From: jaimeozaez <135366362+jaimeozaez@users.noreply.github.com>
Date: Mon, 10 Jun 2024 20:56:45 +0200
Subject: [PATCH 024/321] Fixed autoclean-sftp function (#281)

* Added @click.pass_context before autoclean_sftp function
* Updated CHANGELOG.md
* removed tatus file
---
 CHANGELOG.md          | 1 +
 bu_isciii/__main__.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 03484899d..ecfd10de8 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -57,6 +57,7 @@ Code contributions to the new version:
 - Fixed archive module. Updated correct header for scout tsv [#258](https://github.com/BU-ISCIII/buisciii-tools/pull/258).
 - Fixed clean module. Corrected purge_files function. Renaming stage moved from clean to rename_nocopy option. Updated services.json file with correct paths for some services. [#280](https://github.com/BU-ISCIII/buisciii-tools/pull/280)
+- Fixed autoclean-sftp function. [#281](https://github.com/BU-ISCIII/buisciii-tools/pull/281)
 
 #### Changed

diff --git a/bu_isciii/__main__.py b/bu_isciii/__main__.py
index 012b3bb4a..321bb172c 100755
--- a/bu_isciii/__main__.py
+++ b/bu_isciii/__main__.py
@@ -640,6 +640,7 @@ def archive(
     default=14,
     help="Integer, remove files older than a window of `-d [int]` days. Default 14 days.",
 )
+@click.pass_context
 def autoclean_sftp(ctx, sftp_folder, days):
     """Clean old sftp services"""
     sftp_clean = bu_isciii.autoclean_sftp.AutoremoveSftpService(

From ff90493b8a20c9848280d43553646f83e5253864 Mon Sep 17 00:00:00 2001
From: jaimeozaez <135366362+jaimeozaez@users.noreply.github.com>
Date: Wed, 12 Jun 2024 16:49:25 +0200
Subject: [PATCH 025/321] Fixed singularity-images path when updating pangolin
 database in lablog_viralrecon. (#282)

* Updated CHANGELOG.md
* removed tatus file
* Corrected singularity-images path when updating pangolin database
* Updated Changelog
* Added line break after prompted input
* Updated CHANGELOG
---
 CHANGELOG.md                                  |  3 ++-
 .../viralrecon/ANALYSIS/lablog_viralrecon    | 16 ++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ecfd10de8..77b146ea7 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,7 +44,8 @@ Code contributions to the new version:
 - Installed all necessary singularity images and modified all templates so that, instead of using conda environments or loaded modules, the corresponding singularity images are used [#272](https://github.com/BU-ISCIII/buisciii-tools/pull/272)
 - Updated sarek version in exomeeb, exometrio and wgstrio templates [#277](https://github.com/BU-ISCIII/buisciii-tools/pull/277)
-- Extension file of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results changed [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278)
+- Extension file of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results changed [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278)
+- Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
 
 ### Modules

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index 58fa21704..916f3d7fb 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -65,10 +65,10 @@ update_pangolin() {
         echo -e "Pangolin database is UP TO DATE. \xE2\x9C\x85"
     else
         mkdir "$(date '+%Y%m%d')"
-        srun --partition short_idx singularity run -B ${PWD} /scratch/bi/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/
+        srun --partition short_idx singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/
         # log file creation
         echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tmkdir $(date '+%Y%m%d')" >> $(date '+%Y%m%d')/log
-        echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tsrun --partition short_idx singularity run -B ${PWD} /scratch/bi/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/)" >> $(date '+%Y%m%d')/log
+        echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tsrun --partition short_idx singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/)" >> $(date '+%Y%m%d')/log
         echo_green "Pangolin database UPDATED."
     fi
     cd -
@@ -211,7 +211,7 @@ echo_bold "\nPlease specify the type of analysis."
 echo_bold "1. METAGENOMICS"
 echo_bold "2. AMPLICONS"
 while true; do
-    echo -ne "\e[1;38;5;220m"; read -n 1 ANALYSIS_TYPE; tput sgr0
+    echo -ne "\e[1;38;5;220m"; read -n 1 ANALYSIS_TYPE; tput sgr0; echo
     if [ "$ANALYSIS_TYPE" == "1" ]; then
         ANALYSIS_TYPE="METAGENOMIC"
         echo_green "$ANALYSIS_TYPE analysis selected."
@@ -230,7 +230,7 @@ echo_bold "\nPlease specify the method to be performed."
 echo_bold "2. De novo assemby"
 echo_bold "3. Both"
 while true; do
-    echo -ne "\e[1;38;5;220m"; read -n 1 method; tput sgr0
+    echo -ne "\e[1;38;5;220m"; read -n 1 method; tput sgr0; echo
     if [ "$method" == "1" ]; then
         echo_green "Mapping method selected."
         break
@@ -251,7 +251,7 @@ echo_bold "\nPlease specify the method to be performed."
 # Setting samples_ref.txt file
 echo
-read -p $'\e[1;37mIs samples_ref.txt file already prepared? [y/N]: \e[1;38;5;220m' -n 1 samples_ref_prepared; tput sgr0
+read -p $'\e[1;37mIs samples_ref.txt file already prepared? [y/N]: \e[1;38;5;220m' -n 1 samples_ref_prepared; tput sgr0; echo
 if [ "$samples_ref_prepared" == "y" ]; then
     echo -e "File samples_ref.txt READY. \xE2\x9C\x85"
 else
@@ -260,7 +260,7 @@ else
     while [ -z "$host" ] || [ -z "$reference" ] || [ "$answer" = "n" ]; do
         read -p $'\e[1;37mPlease specify the host: \e[1;38;5;220m' host
         read -p $'\e[1;37mPlease specify the reference: \e[1;38;5;220m' reference
-        read -p $'\e[1;37mAre host [\e[1;38;5;220m'"${host^^}"$'\e[1;37m] and reference [\e[1;38;5;220m'"${reference}"$'\e[1;37m] correct? [Y/n]: \e[1;38;5;220m' -n 1 answer; tput sgr0
+        read -p $'\e[1;37mAre host [\e[1;38;5;220m'"${host^^}"$'\e[1;37m] and reference [\e[1;38;5;220m'"${reference}"$'\e[1;37m] correct? [Y/n]: \e[1;38;5;220m' -n 1 answer; tput sgr0; echo
     done
     while read in; do echo -e "${in}\t${reference}\t${host^^}" >> samples_ref.txt; done < samples_id.txt
     echo -e "File samples_ref.txt READY. \xE2\x9C\x85"
@@ -272,7 +272,7 @@ if [ "$ANALYSIS_TYPE" = "METAGENOMIC" ]; then
 
     # Nextclade is able to analyze monkeypox virus
     echo
-    read -p $'\e[1;37mDo the sequences correspond to monkeypox virus (MPV)? [y/N]: \e[1;38;5;220m' -n 1 monkeypox; tput sgr0
+    read -p $'\e[1;37mDo the sequences correspond to monkeypox virus (MPV)? [y/N]: \e[1;38;5;220m' -n 1 monkeypox; tput sgr0; echo
 
     if [ "$monkeypox" == "y" ]; then
         virus_tag='mpox'
@@ -291,7 +291,7 @@ else
     echo_bold "2. RSV"
     echo_bold "3. Other"
     while true; do
-        echo -ne "\e[1;38;5;220m"; read -n 1 virus_tag; tput sgr0
+        echo -ne "\e[1;38;5;220m"; read -n 1 virus_tag; tput sgr0; echo
         if [ "$virus_tag" == "1" ]; then
             virus_tag="sars-cov-2"
             echo_green "${virus_tag^^} virus selected."

From 9e9ce3af63b5c0f9184540a0ef9ce651a8498694 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Mon, 10 Jun 2024 16:13:26 +0200
Subject: [PATCH 026/321] Updated CHANGELOG.md

---
 tatus | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 tatus

diff --git a/tatus b/tatus
new file mode 100644
index 000000000..6f1a1d8db
--- /dev/null
+++ b/tatus
@@ -0,0 +1,2 @@
+* develop
+  main

From 7771433ce8489976b80c0d8e20ba794f368620f7 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Mon, 10 Jun 2024 16:25:51 +0200
Subject: [PATCH 027/321] removed tatus file

---
 tatus | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 tatus

diff --git a/tatus b/tatus
deleted file mode 100644
index 6f1a1d8db..000000000
--- a/tatus
+++ /dev/null
@@ -1,2 +0,0 @@
-* develop
-  main

From 9173ce5788a31a888cbc13e8e9b1b46bfc65f5fc Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 13 Jun 2024 15:42:02 +0200
Subject: [PATCH 028/321] Fixed conflict

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 77b146ea7..084e38884 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -45,7 +45,11 @@ Code contributions to the new version:
 - Installed all necessary singularity images and modified all templates so that, instead of using conda environments or loaded modules, the corresponding singularity images are used [#272](https://github.com/BU-ISCIII/buisciii-tools/pull/272)
 - Updated sarek version in exomeeb, exometrio and wgstrio templates [#277](https://github.com/BU-ISCIII/buisciii-tools/pull/277)
 - Extension file of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results changed [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278)
+<<<<<<< HEAD
 - Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
+=======
+- Fixed singularity-images path when updating pangolin database in lablog_viralrecon. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
+>>>>>>> c4bd14e (Updated Changelog)
 
 ### Modules

From dfaf61719b3c9cbd2ae863f59085765b69f7276c Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 13 Jun 2024 15:42:43 +0200
Subject: [PATCH 029/321] Fixed conflict

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 084e38884..cb1647768 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -46,10 +46,14 @@ Code contributions to the new version:
 - Updated sarek version in exomeeb, exometrio and wgstrio templates [#277](https://github.com/BU-ISCIII/buisciii-tools/pull/277)
 - Extension file of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results changed [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278)
 <<<<<<< HEAD
+<<<<<<< HEAD
 - Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
 =======
 - Fixed singularity-images path when updating pangolin database in lablog_viralrecon. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
 >>>>>>> c4bd14e (Updated Changelog)
+=======
+- Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
+>>>>>>> a7da8ac (Updated CHANGELOG)
 
 ### Modules

From fb05f50ce1e4c590289931cada29a62705320e9a Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 13 Jun 2024 15:44:46 +0200
Subject: [PATCH 030/321] Fixed conflict

---
 CHANGELOG.md | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb1647768..41f02fae5 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -46,14 +46,8 @@ Code contributions to the new version:
 - Updated sarek version in exomeeb, exometrio and wgstrio templates [#277](https://github.com/BU-ISCIII/buisciii-tools/pull/277)
 - Extension file of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results changed [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278)
-<<<<<<< HEAD
-<<<<<<< HEAD
 - Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
-=======
-- Fixed singularity-images path when updating pangolin database in lablog_viralrecon. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
->>>>>>> c4bd14e (Updated Changelog)
-=======
+- Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
->>>>>>> a7da8ac (Updated CHANGELOG)
 
 ### Modules

From d02b8fc41c9ae9f942d3e51f84c997a71514ff82 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 13 Jun 2024 15:56:14 +0200
Subject: [PATCH 031/321] Modified 02-preprocessing/lablog in snippy template
 in order to properly find trimmed reads from bacass pipeline

---
 .../ANALYSIS01_SNIPPY/02-preprocessing/lablog | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/02-preprocessing/lablog b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/02-preprocessing/lablog
index 70183be4e..23216d1af 100644
--- a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/02-preprocessing/lablog
+++ b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/02-preprocessing/lablog
@@ -1,11 +1,26 @@
 # module load singularity
 
 # if assembly pipeline was performed first and the trimmed sequences were saved, this should work:
-# cat ../samples_id | xargs -I mkdir @@; cd $_; ln -s ../../*/01-preprocessing/trimmed_sequences/@@*.gz @@; cd -
-# else:
+read -p $'\e[1;37mDid you save the trimmed reads from previous assembly pipeline? [y/N]: \e[1;38;5;220m' -n 1 answer; tput sgr0; echo
+  if [ "$answer" == "y" ]; then
+    echo "Creating links to trimmed reads..."
 
-mkdir logs
+    while read in; do
+      mkdir ${in}
+      cd ${in}
+      ln -s ../../../*/01-processing/fastp/${in}_1.fastp.fastq.gz ${in}_R1_filtered.fastq.gz
+      ln -s ../../../*/01-processing/fastp/${in}_2.fastp.fastq.gz ${in}_R2_filtered.fastq.gz
+      cd -
+    done < ../samples_id.txt
 
-scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g')
+    echo -e "\e[32mLinks for $(cat ../samples_id.txt | wc -l) samples successfully created.\e[0m"
 
-cat ../samples_id.txt | xargs -I @@ echo "mkdir @@; srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name FP.@@ --output logs/FP.@@.%j.log --partition short_idx --cpus-per-task 5 singularity exec -B ${scratch_dir}/../../../ -B /srv/fastq_repo/ /data/bi/pipelines/singularity-images/fastp:0.20.0--hdbcaa40_0 fastp --in1 ${scratch_dir}/../00-reads/@@_R1.fastq.gz --in2 ${scratch_dir}/../00-reads/@@_R2.fastq.gz --thread 5 --cut_front --cut_tail --cut_mean_quality 15 --qualified_quality_phred 15 --trim_poly_x --detect_adapter_for_pe --json ${scratch_dir}/@@/@@_fastp.json --html ${scratch_dir}/@@/@@_fastp.html --out1 ${scratch_dir}/@@/@@_R1_filtered.fastq.gz --out2 ${scratch_dir}/@@/@@_R2_filtered.fastq.gz &" > _01_fastp.sh
+  else
+
+    mkdir logs
+
+    scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g')
+
+    cat ../samples_id.txt | xargs -I @@ echo "mkdir @@; srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name FP.@@ --output logs/FP.@@.%j.log --partition short_idx --cpus-per-task 5 singularity exec -B ${scratch_dir}/../../../ -B /srv/fastq_repo/ /data/bi/pipelines/singularity-images/fastp:0.20.0--hdbcaa40_0 fastp --in1 ${scratch_dir}/../00-reads/@@_R1.fastq.gz --in2 ${scratch_dir}/../00-reads/@@_R2.fastq.gz --thread 5 --cut_front --cut_tail --cut_mean_quality 15 --qualified_quality_phred 15 --trim_poly_x --detect_adapter_for_pe --json ${scratch_dir}/@@/@@_fastp.json --html ${scratch_dir}/@@/@@_fastp.html --out1 ${scratch_dir}/@@/@@_R1_filtered.fastq.gz --out2 ${scratch_dir}/@@/@@_R2_filtered.fastq.gz &" > _01_fastp.sh
+
+    echo -e "\e[32mFile _01_fastp.sh ready.\e[0m"
+
+  fi
\ No newline at end of file

From 11e4f0e9de4900c1dc23d440b14d25d2b709662f Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 13 Jun 2024 16:01:44 +0200
Subject: [PATCH 032/321] Modified 01-preprocessing/lablog in characterization
 template in order to properly find trimmed reads from bacass pipeline

---
 .../ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog
index b7532e2bf..d1d3eb738 100644
--- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog
+++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog
@@ -1,7 +1,27 @@
-# module load singularity
+# module load singularity
+
 # if assembly pipeline was performed first and the trimmed sequences were saved, this should work:
-# cat ../samples_id.txt | xargs -I @@ mkdir @@; cd @@; ln -s ../../../*/01-processing/fastp/@@_1.fastp.fastq.gz ./@@_R1_filtered.fastq.gz; ln -s ../../../*/01-processing/fastp/@@_2.fastp.fastq.gz ./@@_R2_filtered.fastq.gz ; cd -
-# else:
-mkdir logs
-scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g')
-cat ../samples_id.txt | xargs -I @@ echo "mkdir @@; srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name FP.@@ --output logs/FP.@@.%j.log --partition short_idx --cpus-per-task 5 singularity exec -B ${scratch_dir}/../../../ -B /srv/fastq_repo/ /data/bi/pipelines/singularity-images/fastp:0.20.0--hdbcaa40_0 fastp --in1 ${scratch_dir}/../00-reads/@@_R1.fastq.gz --in2 ${scratch_dir}/../00-reads/@@_R2.fastq.gz --thread 5 --cut_front --cut_tail --cut_mean_quality 15 --qualified_quality_phred 15 --trim_poly_x --detect_adapter_for_pe --json ${scratch_dir}/@@/@@_fastp.json --html ${scratch_dir}/@@/@@_fastp.html --out1 ${scratch_dir}/@@/@@_R1_filtered.fastq.gz --out2 ${scratch_dir}/@@/@@_R2_filtered.fastq.gz &" > _01_fastp.sh
+read -p $'\e[1;37mDid you save the trimmed reads from previous assembly pipeline? [y/N]: \e[1;38;5;220m' -n 1 answer; tput sgr0; echo
+  if [ "$answer" == "y" ]; then
+    echo "Creating links to trimmed reads..."
+
+    while read in; do
+      mkdir ${in}
+      cd ${in}
+      ln -s ../../../*/01-processing/fastp/${in}_1.fastp.fastq.gz ${in}_R1_filtered.fastq.gz
+      ln -s ../../../*/01-processing/fastp/${in}_2.fastp.fastq.gz ${in}_R2_filtered.fastq.gz
+      cd -
+    done < ../samples_id.txt
+
+    echo -e "\e[32mLinks for $(cat ../samples_id.txt | wc -l) samples successfully created.\e[0m"
+
+  else
+    echo "Preparing _01_fastp.sh file for trimming..."
+
+    mkdir logs
+    scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g')
+    cat ../samples_id.txt | xargs -I @@ echo "mkdir @@; srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name FP.@@ --output logs/FP.@@.%j.log --partition short_idx --cpus-per-task 5 singularity exec -B ${scratch_dir}/../../../ -B /srv/fastq_repo/ /data/bi/pipelines/singularity-images/fastp:0.20.0--hdbcaa40_0 fastp --in1 ${scratch_dir}/../00-reads/@@_R1.fastq.gz --in2 ${scratch_dir}/../00-reads/@@_R2.fastq.gz --thread 5 --cut_front --cut_tail --cut_mean_quality 15 --qualified_quality_phred 15 --trim_poly_x --length_required 50 --detect_adapter_for_pe --json ${scratch_dir}/@@/@@_fastp.json --html ${scratch_dir}/@@/@@_fastp.html --out1 ${scratch_dir}/@@/@@_R1_filtered.fastq.gz --out2 ${scratch_dir}/@@/@@_R2_filtered.fastq.gz --unpaired1 ${scratch_dir}/@@/@@_R1_unpaired.fastq.gz --unpaired2 ${scratch_dir}/@@/@@_R2_unpaired.fastq.gz &" > _01_fastp.sh
+
+    echo -e "\e[32mFile _01_fastp.sh ready.\e[0m"
+
+  fi
\ No newline at end of file

From 92ce0c085883d021969224d37bab36669fe6e53b Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 13 Jun 2024 16:04:32 +0200
Subject: [PATCH 033/321] Removed not needed lines

---
 .../templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/lablog | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/lablog b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/lablog
index 3c03fe884..1b53edc41 100644
--- a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/lablog
+++ b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/lablog
@@ -1,10 +1,3 @@
-mkdir 01-fastqc
-mkdir 02-preprocessing
-mkdir 03-preprocQC
-mkdir 04-snippy
-mkdir 05-iqtree
-mkdir 99-stats
-
 ln -s ../samples_id.txt .
 ln -s ../00-reads .
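Because the reworked lablogs above build their links from globs, a path mismatch with the bacass output produces dangling symlinks rather than an error. A small sanity check one could run afterwards from inside the 02-preprocessing/ (or 01-preprocessing/) folder (a sketch under that assumption, not part of the templates):

```bash
#!/usr/bin/env bash
# Flag any sample link whose fastp target does not actually exist.
# [ -e ] follows symlinks, so it is false for a dangling link.
while read -r sample; do
  for link in "${sample}/${sample}_R1_filtered.fastq.gz" "${sample}/${sample}_R2_filtered.fastq.gz"; do
    [ -e "$link" ] || echo "WARNING: broken link: $link"
  done
done < ../samples_id.txt
```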
From 3e373ac35c254915e928fadb9b33863aade2bd3a Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 13 Jun 2024 16:28:04 +0200
Subject: [PATCH 034/321] Corrected path to phylo.aln file in iqtree lablog

---
 .../snippy/ANALYSIS/ANALYSIS01_SNIPPY/05-iqtree/lablog | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/05-iqtree/lablog b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/05-iqtree/lablog
index 95a2c4830..e351131c1 100644
--- a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/05-iqtree/lablog
+++ b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/05-iqtree/lablog
@@ -2,5 +2,5 @@ scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g')
 
 mkdir logs
 
-#echo "srun --chdir \${scratch_dir} --output logs/IQTREEMFP.%j.log --job-name IQTREEMFP --cpus-per-task 20 --mem 5G --partition short_idx --time 00:30:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/../07-snphylo/snphylo.output.fasta -m MFP &" > _00_iqtreemfp.sh
-echo "srun --chdir ${scratch_dir} --output logs/IQTREEFULLALIGN.%j.log --job-name IQTREEFULLALIGN --cpus-per-task 20 --mem 15G --partition short_idx --time 08:00:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/../07-snphylo/snphylo.output.fasta -m PMB+F+R2 -T 20 -B 1000 -pre phylo.iqtree.bootstrap &" > _01_iqtreeall.sh
+#echo "srun --chdir \${scratch_dir} --output logs/IQTREEMFP.%j.log --job-name IQTREEMFP --cpus-per-task 20 --mem 5G --partition short_idx --time 00:30:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/../04-snippy/phylo.aln -m MFP &" > _00_iqtreemfp.sh
+echo "srun --chdir ${scratch_dir} --output logs/IQTREEFULLALIGN.%j.log --job-name IQTREEFULLALIGN --cpus-per-task 20 --mem 15G --partition short_idx --time 08:00:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/../04-snippy/phylo.aln -m PMB+F+R2 -T 20 -B 1000 -pre phylo.iqtree.bootstrap &" > _01_iqtreeall.sh

From d1611ccf0d8ca039b0fb2a30f320b5f164e74353 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 13 Jun 2024 16:34:36 +0200
Subject: [PATCH 035/321] Updated Changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 41f02fae5..f956ef2e7 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -46,7 +46,7 @@ Code contributions to the new version:
 - Updated sarek version in exomeeb, exometrio and wgstrio templates [#277](https://github.com/BU-ISCIII/buisciii-tools/pull/277)
 - Extension file of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results changed [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278)
 - Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
-- Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
+- Updated characterization and snippy templates to fit bacass pipeline. Corrected path in 05-iqtree in snippy template. [#283](https://github.com/BU-ISCIII/buisciii-tools/pull/283)
 
 ### Modules

From 730489fce7e4c3a488565b0bb25514bc267a4a85 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Fri, 17 May 2024 15:39:37 +0200
Subject: [PATCH 036/321] Created RESULTS/lablog files where needed so that
 multiqc_report.html is included in the RESULTS folder

---
 .../lowfreq_panel/RESULTS/lablog_lowfreq_panel_results     | 9 +++++++++
 .../mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results | 9 +++++++++
 bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results   | 9 +++++++++
 3 files changed, 27 insertions(+)
 create mode 100644 bu_isciii/templates/lowfreq_panel/RESULTS/lablog_lowfreq_panel_results
 create mode 100644 bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results
 create mode 100644 bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results

diff --git a/bu_isciii/templates/lowfreq_panel/RESULTS/lablog_lowfreq_panel_results b/bu_isciii/templates/lowfreq_panel/RESULTS/lablog_lowfreq_panel_results
new file mode 100644
index 000000000..cf16f5308
--- /dev/null
+++ b/bu_isciii/templates/lowfreq_panel/RESULTS/lablog_lowfreq_panel_results
@@ -0,0 +1,9 @@
+DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega"
+
+mkdir $DELIVERY_FOLDER
+
+# Lowfreq_panel service
+cd $DELIVERY_FOLDER
+
+# Links to reports
+ln -s ../../../ANALYSIS/*RBPANEL/99-stats/multiqc_report.html .
diff --git a/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results b/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results
new file mode 100644
index 000000000..3e60aec88
--- /dev/null
+++ b/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results
@@ -0,0 +1,9 @@
+DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega"
+
+mkdir $DELIVERY_FOLDER
+
+# Assembly service
+cd $DELIVERY_FOLDER
+
+# Links to reports
+ln -s ../../../ANALYSIS/*ASSEMBLY/99-stats/MultiQC/multiqc_report.html .
diff --git a/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results b/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results
new file mode 100644
index 000000000..55199d71e
--- /dev/null
+++ b/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results
@@ -0,0 +1,9 @@
+DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega"
+
+mkdir $DELIVERY_FOLDER
+
+# Assembly service
+cd $DELIVERY_FOLDER
+
+# Links to reports
+ln -s ../../../ANALYSIS/*RNASEQ/*rnaseq/multiqc/star_salmon/multiqc_report.html .
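The `../../../ANALYSIS/...` targets in these new lablogs are trimmed to `../../ANALYSIS/...` in a follow-up patch below; the underlying rule is that a relative symlink target is resolved from the directory containing the link, not from wherever `ln -s` was invoked. A toy reproduction, with hypothetical folder names:

```bash
#!/usr/bin/env bash
# From inside RESULTS/<date>_entrega/, the service root is two levels up.
mkdir -p service/ANALYSIS/DEMO service/RESULTS/20240614_entrega
cd service/RESULTS/20240614_entrega
ln -s ../../ANALYSIS/DEMO ok_link      # two levels up: resolves
ln -s ../../../ANALYSIS/DEMO bad_link  # three levels up: escapes the service root
ls -L ok_link > /dev/null && echo "ok_link resolves"
ls -L bad_link 2> /dev/null || echo "bad_link is dangling"
```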
From c0043ae975ab63a2549b75913d6b6e7d2670c172 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Fri, 17 May 2024 15:44:10 +0200 Subject: [PATCH 037/321] Updated RESULTS/lablog files --- .../mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results | 2 +- bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results b/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results index 3e60aec88..07babd5a3 100644 --- a/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results +++ b/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results @@ -2,7 +2,7 @@ DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega" mkdir $DELIVERY_FOLDER -# Assembly service +# MTBSEQ-ASSEMBLY service cd $DELIVERY_FOLDER # Links to reports diff --git a/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results b/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results index 55199d71e..e05201ead 100644 --- a/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results +++ b/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results @@ -2,7 +2,7 @@ DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega" mkdir $DELIVERY_FOLDER -# Assembly service +# RNASEQ service cd $DELIVERY_FOLDER # Links to reports From 9cf3c173434b63d085b923cb48a5af08c70f4896 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Fri, 14 Jun 2024 15:46:51 +0200 Subject: [PATCH 038/321] Updated CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f956ef2e7..7a728ca18 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,7 +47,7 @@ Code contributions to the new version: - Extension file of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results changed [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278) - Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282) - Updated characterization and snippy templates to fit bacass pipeline. Corrected path in 05-iqtree in snippy template. [#283](https://github.com/BU-ISCIII/buisciii-tools/pull/283) - +- Included multiqc_report.html in RESULTS folder in every service, where necessary [#265](https://github.com/BU-ISCIII/buisciii-tools/pull/265) ### Modules From b1dc18fcb5b6f2749b0a7bd10b6f8e344c49ce27 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Mon, 17 Jun 2024 11:14:38 +0200 Subject: [PATCH 039/321] Fixed paths for symlink creation --- .../lowfreq_panel/RESULTS/lablog_lowfreq_panel_results | 2 +- .../mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results | 2 +- bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bu_isciii/templates/lowfreq_panel/RESULTS/lablog_lowfreq_panel_results b/bu_isciii/templates/lowfreq_panel/RESULTS/lablog_lowfreq_panel_results index cf16f5308..6f41d3987 100644 --- a/bu_isciii/templates/lowfreq_panel/RESULTS/lablog_lowfreq_panel_results +++ b/bu_isciii/templates/lowfreq_panel/RESULTS/lablog_lowfreq_panel_results @@ -6,4 +6,4 @@ mkdir $DELIVERY_FOLDER cd $DELIVERY_FOLDER # Links to reports -ln -s ../../../ANALYSIS/*RBPANEL/99-stats/multiqc_report.html . +ln -s ../../ANALYSIS/*RBPANEL/99-stats/multiqc_report.html .
diff --git a/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results b/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results index 07babd5a3..d2bf377f2 100644 --- a/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results +++ b/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results @@ -6,4 +6,4 @@ mkdir $DELIVERY_FOLDER cd $DELIVERY_FOLDER # Links to reports -ln -s ../../../ANALYSIS/*ASSEMBLY/99-stats/MultiQC/multiqc_report.html . +ln -s ../../ANALYSIS/*ASSEMBLY/99-stats/MultiQC/multiqc_report.html . diff --git a/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results b/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results index e05201ead..caa8f8684 100644 --- a/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results +++ b/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results @@ -6,4 +6,4 @@ mkdir $DELIVERY_FOLDER cd $DELIVERY_FOLDER # Links to reports -ln -s ../../../ANALYSIS/*RNASEQ/*rnaseq/multiqc/star_salmon/multiqc_report.html . +ln -s ../../ANALYSIS/*RNASEQ/*rnaseq/multiqc/star_salmon/multiqc_report.html . From 819d3d2b6fdf2650faa78bb1f656627d65543314 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Tue, 18 Jun 2024 10:20:13 +0200 Subject: [PATCH 040/321] Modified viralrecon results lablog to make symlinks to the /*_mapping/multiqc/ html reports --- bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results | 1 + 1 file changed, 1 insertion(+) diff --git a/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results index d229644b9..4e691b91a 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results @@ -20,6 +20,7 @@ cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANAL #Create symbolic links to files that are going to be converted to excel cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table.csv ${in}_variants_long_table.csv; done +for ref in $(cut -f2 ../../ANALYSIS/samples_ref.txt | sort | uniq); do for organism in $(cut -f3 ../../ANALYSIS/samples_ref.txt | tr '[:lower:]' '[:upper:]' | sort | uniq); do report="../../ANALYSIS/*_${organism}/${ref}_*_viralrecon_mapping/multiqc/multiqc_report.html"; if [ -e $report ]; then ln -s ${report} ./multiqc_report_${ref}_${organism}.html; fi; done; done ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv From 4a10dca3674084dce97d0d3867e638a0f93b589d Mon Sep 17 00:00:00 2001 From: victor5lm Date: Tue, 18 Jun 2024 13:02:46 +0200 Subject: [PATCH 041/321] Completed rnaseq results lablog --- .../templates/rnaseq/RESULTS/lablog_rnaseq_results | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results b/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results index caa8f8684..c6c075df8 100644 --- a/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results +++ b/bu_isciii/templates/rnaseq/RESULTS/lablog_rnaseq_results @@ -5,5 +5,15 @@ mkdir $DELIVERY_FOLDER # RNASEQ service cd $DELIVERY_FOLDER -# Links to reports +# Links to multiqc reports ln -s ../../ANALYSIS/*RNASEQ/*rnaseq/multiqc/star_salmon/multiqc_report.html . 
+ +# Links to differential expression folders +# REMINDER: please make sure that subfolders within /*_DIFFERENTIAL_EXPRESSION/ follow the structure [number]_*. +# Otherwise, change the following command so that symlinks are created correctly. +# Check this path: /ANALYSIS/[DATE]_ANALYSIS0X_DIFFERENTIAL_EXPRESSION/ to see the names of the folders +ln -s ../../ANALYSIS/*_DIFFERENTIAL_EXPRESSION/[0-9]*_* . + +# Links to counts files +ln -s ../../ANALYSIS/*_RNASEQ/*_rnaseq/star_salmon/salmon.merged.gene_counts.tsv . +ln -s ../../ANALYSIS/*_RNASEQ/02-differential_expression/99-stats/normalized_expression.csv . From 450fadbd1f9472441253574584a97dc2c842ff20 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Tue, 18 Jun 2024 13:08:06 +0200 Subject: [PATCH 042/321] Modified viralrecon results lablog --- .../templates/viralrecon/RESULTS/lablog_viralrecon_results | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results index 4e691b91a..d7363bd2d 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results @@ -20,7 +20,7 @@ cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANAL #Create symbolic links to files that are going to be converted to excel cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table.csv ${in}_variants_long_table.csv; done -for ref in $(cut -f2 ../../ANALYSIS/samples_ref.txt | sort | uniq); do for organism in $(cut -f3 ../../ANALYSIS/samples_ref.txt | tr '[:lower:]' '[:upper:]' | sort | uniq); do report="../../ANALYSIS/*_${organism}/${ref}_*_viralrecon_mapping/multiqc/multiqc_report.html"; if [ -e $report ]; then ln -s ${report} ./multiqc_report_${ref}_${organism}.html; fi; done; done +for ref in $(cut -f2 ../../ANALYSIS/samples_ref.txt | sort | uniq); do for organism in $(cut -f3 ../../ANALYSIS/samples_ref.txt | tr '[:lower:]' '[:upper:]' | sort | uniq); do report="../../ANALYSIS/*_${organism}/${ref}_*_viralrecon_mapping/multiqc/multiqc_report.html"; if [ -e $report ]; then ln -s ${report} ./multiqc_report_${ref}_${organism}.html; else echo "Multiqc report not found for reference $reference and organism $organism"; fi; done; done ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv From 942adad65341e2e3913bfde9556277484b31d5a5 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Tue, 18 Jun 2024 13:12:13 +0200 Subject: [PATCH 043/321] Fixed minor mistake in viralrecon results lablog --- .../templates/viralrecon/RESULTS/lablog_viralrecon_results | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results index d7363bd2d..a46faba6b 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results @@ -20,7 +20,7 @@ cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANAL #Create symbolic links to files that are going to be converted to excel cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table.csv ${in}_variants_long_table.csv; done -for ref in $(cut -f2 
../../ANALYSIS/samples_ref.txt | sort | uniq); do for organism in $(cut -f3 ../../ANALYSIS/samples_ref.txt | tr '[:lower:]' '[:upper:]' | sort | uniq); do report="../../ANALYSIS/*_${organism}/${ref}_*_viralrecon_mapping/multiqc/multiqc_report.html"; if [ -e $report ]; then ln -s ${report} ./multiqc_report_${ref}_${organism}.html; else echo "Multiqc report not found for reference $reference and organism $organism"; fi; done; done +for ref in $(cut -f2 ../../ANALYSIS/samples_ref.txt | sort | uniq); do for organism in $(cut -f3 ../../ANALYSIS/samples_ref.txt | tr '[:lower:]' '[:upper:]' | sort | uniq); do report="../../ANALYSIS/*_${organism}/${ref}_*_viralrecon_mapping/multiqc/multiqc_report.html"; if [ -e $report ]; then ln -s ${report} ./multiqc_report_${ref}_${organism}.html; else echo "Multiqc report not found for reference $ref and organism $organism"; fi; done; done ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv From d9077ca96aa531f16d4775ebf82e3398b2a50ba1 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Mon, 29 Apr 2024 12:58:11 +0200 Subject: [PATCH 044/321] Modified bioinfo_doc.py so that new lines are applied when creating the .pdf file --- bu_isciii/bioinfo_doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py index b598f344b..4ef4f80f2 100755 --- a/bu_isciii/bioinfo_doc.py +++ b/bu_isciii/bioinfo_doc.py @@ -290,7 +290,7 @@ def post_delivery_info(self): if self.provided_txt: with open(os.path.expanduser(self.provided_txt)) as f: - self.delivery_notes = " ".join([x.strip() for x in f.readlines()]) + self.delivery_notes = "\n".join([x.strip() for x in f.readlines()]) else: self.delivery_notes = bu_isciii.utils.ask_for_some_text( msg="Write some delivery notes:" From 951daa9e8f340fbee05df6a087253f880ae0a803 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Mon, 29 Apr 2024 13:16:28 +0200 Subject: [PATCH 045/321] Updated CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a728ca18..3d378aa4b 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,7 @@ Code contributions to the new version: - Fixed archive module. Updated correct header for scout tsv [#258](https://github.com/BU-ISCIII/buisciii-tools/pull/258). - Fixed clean module. Corrected purge_files function. Renaming stage moved from clean to rename_nocopy option. Updated services.json file with correct paths for some services. [#280](https://github.com/BU-ISCIII/buisciii-tools/pull/280) - Fixed autoclean-sftp function. [#281](https://github.com/BU-ISCIII/buisciii-tools/pull/281) +- Fixed bioinfo_doc.py. Modified it so that this module creates a .pdf file including new-line characters, without merging lines into one single line [#259](https://github.com/BU-ISCIII/buisciii-tools/pull/259). 
#### Changed From d864debb3046593b2a5b44fbaa3aae032d8abf25 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Tue, 25 Jun 2024 10:37:19 +0200 Subject: [PATCH 046/321] New attempt: added the nl2br markdown extension --- bu_isciii/bioinfo_doc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py index 4ef4f80f2..37c95eec1 100755 --- a/bu_isciii/bioinfo_doc.py +++ b/bu_isciii/bioinfo_doc.py @@ -290,7 +290,7 @@ def post_delivery_info(self): if self.provided_txt: with open(os.path.expanduser(self.provided_txt)) as f: - self.delivery_notes = "\n".join([x.strip() for x in f.readlines()]) + self.delivery_notes = f.read() else: self.delivery_notes = bu_isciii.utils.ask_for_some_text( msg="Write some delivery notes:" @@ -388,6 +388,7 @@ def convert_markdown_to_html(self, mk_text): "pymdownx.highlight", "pymdownx.emoji", "pymdownx.tilde", + "nl2br", ], extension_configs={ "pymdownx.b64": { From c8497ef5437934d46323f2d13c40cad894f4504e Mon Sep 17 00:00:00 2001 From: victor5lm Date: Tue, 25 Jun 2024 11:01:37 +0200 Subject: [PATCH 047/321] Fixed linting mistake --- bu_isciii/bioinfo_doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py index 37c95eec1..a66ecd282 100755 --- a/bu_isciii/bioinfo_doc.py +++ b/bu_isciii/bioinfo_doc.py @@ -388,7 +388,7 @@ def convert_markdown_to_html(self, mk_text): "pymdownx.highlight", "pymdownx.emoji", "pymdownx.tilde", - "nl2br", + "nl2br", ], extension_configs={ "pymdownx.b64": { From 08ad531cc3808fa9a7f6830ad82dd806db7f7f5d Mon Sep 17 00:00:00 2001 From: svarona Date: Fri, 21 Jun 2024 14:07:21 +0200 Subject: [PATCH 048/321] Created mag template --- .../ANALYSIS/ANALYSIS02_MAG/99-stats/lablog | 25 ++++++++++++++++ .../99-stats/multiqc_config.yaml | 13 ++++++++ .../mag/ANALYSIS/ANALYSIS02_MAG/lablog | 30 +++++++++++++++++++ bu_isciii/templates/mag/ANALYSIS/lablog_mag | 1 + bu_isciii/templates/mag/DOC/mag.config | 19 ++++++++++++ bu_isciii/templates/mag/RAW/README | 1 + bu_isciii/templates/mag/REFERENCES/README | 1 + .../templates/mag/RESULTS/lablog_mag_results | 6 ++++ bu_isciii/templates/mag/TMP/README | 1 + bu_isciii/templates/services.json | 2 +- 10 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog create mode 100644 bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/multiqc_config.yaml create mode 100644 bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog create mode 100644 bu_isciii/templates/mag/ANALYSIS/lablog_mag create mode 100644 bu_isciii/templates/mag/DOC/mag.config create mode 100644 bu_isciii/templates/mag/RAW/README create mode 100644 bu_isciii/templates/mag/REFERENCES/README create mode 100755 bu_isciii/templates/mag/RESULTS/lablog_mag_results create mode 100644 bu_isciii/templates/mag/TMP/README diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog new file mode 100644 index 000000000..246dae5d4 --- /dev/null +++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog @@ -0,0 +1,25 @@ +#module load singularity + +cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/kraken2_report.txt ./${in}_kraken2_report.txt; done + +scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") + +cat < multiqc.sbatch +#!/bin/sh +#SBATCH --ntasks 1 +#SBATCH --cpus-per-task 2 +#SBATCH --mem 4G +#SBATCH --time 
00:30:00 +#SBATCH --partition short_idx +#SBATCH --output $(date '+%Y%m%d')_multiqc.log +#SBATCH --chdir $scratch_dir + +export NXF_OPTS="-Xms500M -Xmx4G" + +singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/multiqc:1.9--py_1 multiqc -d . --config multiqc_config.yaml + +EOF + +echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh + +echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/multiqc_config.yaml b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/multiqc_config.yaml new file mode 100644 index 000000000..96b7e6136 --- /dev/null +++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/multiqc_config.yaml @@ -0,0 +1,13 @@ +extra_fn_clean_exts: + - _R1 + - _R2 + - .R1 + - .R2 + - .sort + - _sort + - .stats + - _bamstat + - _align + - .txt +report_comment: > + This report has been generated by BU-ISCIII diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog new file mode 100644 index 000000000..39408a717 --- /dev/null +++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog @@ -0,0 +1,30 @@ +ln -s ../00-reads . +ln -s ../samples_id.txt . + +#module load Nextflow +#module load singularity + +scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") + +cat < mag.sbatch +#!/bin/sh +#SBATCH --ntasks 1 +#SBATCH --cpus-per-task 2 +#SBATCH --mem 4G +#SBATCH --time 2:00:00 +#SBATCH --partition middle_idx +#SBATCH --output $(date '+%Y%m%d')_mag.log +#SBATCH --chdir $scratch_dir + +export NXF_OPTS="-Xms500M -Xmx4G" + +nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.1.1/workflow/main.nf \\ + -c ../../DOC/mag.config \\ + --input '00-reads/*_R{1,2}.fastq.gz' \\ + --outdir $(date '+%Y%m%d')_mag \\ + --kraken2_db /data/bi/references/kraken/minikraken_8GB_20200312.tgz \\ + --skip_busco --skip_spades --skip_spadeshybrid --skip_megahit --skip_prodigal --skip_binning \\ + -resume +EOF + +echo "sbatch mag.sbatch" > _01_run_mag.sh diff --git a/bu_isciii/templates/mag/ANALYSIS/lablog_mag b/bu_isciii/templates/mag/ANALYSIS/lablog_mag new file mode 100644 index 000000000..5076f4425 --- /dev/null +++ b/bu_isciii/templates/mag/ANALYSIS/lablog_mag @@ -0,0 +1 @@ +mv ANALYSIS02_MAG $(date '+%Y%m%d')_ANALYSIS02_MAG diff --git a/bu_isciii/templates/mag/DOC/mag.config b/bu_isciii/templates/mag/DOC/mag.config new file mode 100644 index 000000000..732980bf1 --- /dev/null +++ b/bu_isciii/templates/mag/DOC/mag.config @@ -0,0 +1,19 @@ +singularity { + enabled = true + autoMounts = true +} + +process { + executor = 'slurm' + queue = 'middle_idx' + queue = 'middle_idx' + errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 
'ignore' : 'finish' } + maxRetries = 1 + maxErrors = '-1' +} + +params { + max_memory = 376.GB + max_cpus = 32 + max_time = '48.h' +} diff --git a/bu_isciii/templates/mag/RAW/README b/bu_isciii/templates/mag/RAW/README new file mode 100644 index 000000000..a774e7bb8 --- /dev/null +++ b/bu_isciii/templates/mag/RAW/README @@ -0,0 +1 @@ +##Folder to hold raw reads to analyze in the service diff --git a/bu_isciii/templates/mag/REFERENCES/README b/bu_isciii/templates/mag/REFERENCES/README new file mode 100644 index 000000000..3ce2a2815 --- /dev/null +++ b/bu_isciii/templates/mag/REFERENCES/README @@ -0,0 +1 @@ +##Folder to hold references of the service diff --git a/bu_isciii/templates/mag/RESULTS/lablog_mag_results b/bu_isciii/templates/mag/RESULTS/lablog_mag_results new file mode 100755 index 000000000..55bc4c0e5 --- /dev/null +++ b/bu_isciii/templates/mag/RESULTS/lablog_mag_results @@ -0,0 +1,6 @@ +mkdir $(date '+%Y%m%d')_entrega01 +cd $(date '+%Y%m%d')_entrega01 + +#Create symbolic links depending on the analysis +#Individual files +ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html diff --git a/bu_isciii/templates/mag/TMP/README b/bu_isciii/templates/mag/TMP/README new file mode 100644 index 000000000..ba3229456 --- /dev/null +++ b/bu_isciii/templates/mag/TMP/README @@ -0,0 +1 @@ +##Folder to hold temporary files diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json index bb88dfafb..7463cd5c4 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -222,7 +222,7 @@ }, "mag_met": { "label": "", - "template": "mag_met", + "template": "mag", "order": 2, "begin": "base", "end": "", From 550609d7375d75f5333dcfd01a012eb20660fe2a Mon Sep 17 00:00:00 2001 From: svarona Date: Fri, 21 Jun 2024 15:53:09 +0200 Subject: [PATCH 049/321] removed mag from other templates --- .../ANALYSIS/ANALYSIS02_MET/99-stats/lablog | 25 ---------------- .../99-stats/multiqc_config.yaml | 13 -------- .../IRMA/ANALYSIS/ANALYSIS02_MET/lablog | 30 ------------------- bu_isciii/templates/IRMA/ANALYSIS/lablog_irma | 1 - bu_isciii/templates/IRMA/DOC/mag.config | 19 ------------ .../IRMA/RESULTS/lablog_irma_results | 1 - .../DATE_ANALYSIS0X_MAG/99-stats/lablog | 27 ----------------- .../ANALYSIS/DATE_ANALYSIS0X_MAG/lablog | 30 ------------------- .../viralrecon/ANALYSIS/lablog_viralrecon | 1 - bu_isciii/templates/viralrecon/DOC/mag.config | 19 ------------ .../viralrecon/DOC/multiqc_config.yml | 13 -------- .../RESULTS/lablog_viralrecon_results | 1 - 12 files changed, 180 deletions(-) delete mode 100644 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/lablog delete mode 100644 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/multiqc_config.yaml delete mode 100644 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/lablog delete mode 100644 bu_isciii/templates/IRMA/DOC/mag.config delete mode 100644 bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog delete mode 100644 bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/lablog delete mode 100644 bu_isciii/templates/viralrecon/DOC/mag.config delete mode 100644 bu_isciii/templates/viralrecon/DOC/multiqc_config.yml diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/lablog deleted file mode 100644 index 246dae5d4..000000000 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/lablog +++ /dev/null @@ -1,25 +0,0 @@ -#module load singularity - -cat 
../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/kraken2_report.txt ./${in}_kraken2_report.txt; done - -scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") - -cat < multiqc.sbatch -#!/bin/sh -#SBATCH --ntasks 1 -#SBATCH --cpus-per-task 2 -#SBATCH --mem 4G -#SBATCH --time 00:30:00 -#SBATCH --partition short_idx -#SBATCH --output $(date '+%Y%m%d')_multiqc.log -#SBATCH --chdir $scratch_dir - -export NXF_OPTS="-Xms500M -Xmx4G" - -singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/multiqc:1.9--py_1 multiqc -d . --config multiqc_config.yaml - -EOF - -echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh - -echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/multiqc_config.yaml b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/multiqc_config.yaml deleted file mode 100644 index 96b7e6136..000000000 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/multiqc_config.yaml +++ /dev/null @@ -1,13 +0,0 @@ -extra_fn_clean_exts: - - _R1 - - _R2 - - .R1 - - .R2 - - .sort - - _sort - - .stats - - _bamstat - - _align - - .txt -report_comment: > - This report has been generated by BU-ISCIII diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/lablog deleted file mode 100644 index 21e0456ac..000000000 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/lablog +++ /dev/null @@ -1,30 +0,0 @@ -ln -s ../00-reads . -ln -s ../samples_id.txt . - -#module load Nextflow -#module load singularity - -scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") - -cat < mag.sbatch -#!/bin/sh -#SBATCH --ntasks 1 -#SBATCH --cpus-per-task 2 -#SBATCH --mem 4G -#SBATCH --time 2:00:00 -#SBATCH --partition middle_idx -#SBATCH --output $(date '+%Y%m%d')_mag.log -#SBATCH --chdir $scratch_dir - -export NXF_OPTS="-Xms500M -Xmx4G" - -nextflow run /scratch/bi/pipelines/nf-core-mag-2.1.1/workflow/main.nf \\ - -c ../../DOC/mag.config \\ - --input '00-reads/*_R{1,2}.fastq.gz' \\ - --outdir $(date '+%Y%m%d')_mag \\ - --kraken2_db /data/bi/references/kraken/minikraken_8GB_20200312.tgz \\ - --skip_busco --skip_spades --skip_spadeshybrid --skip_megahit --skip_prodigal --skip_binning \\ - -resume -EOF - -echo "sbatch mag.sbatch" > _01_run_mag.sh diff --git a/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma b/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma index 3f99b6b0d..798ee5497 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma +++ b/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma @@ -1,5 +1,4 @@ #ls ../RAW/* | tr '\/' '\t' | cut -f3 | cut -d "_" -f 1 | sort -u | grep -v "md5" > samples_id.txt mkdir -p 00-reads mv ANALYSIS01_FLU_IRMA $(date '+%Y%m%d')_ANALYSIS01_FLU_IRMA -mv ANALYSIS02_MET $(date '+%Y%m%d')_ANALYSIS02_MET cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - \ No newline at end of file diff --git a/bu_isciii/templates/IRMA/DOC/mag.config b/bu_isciii/templates/IRMA/DOC/mag.config deleted file mode 100644 index 732980bf1..000000000 --- a/bu_isciii/templates/IRMA/DOC/mag.config +++ /dev/null @@ -1,19 +0,0 @@ -singularity { - enabled = true - autoMounts = true -} - -process { - executor = 'slurm' - queue = 'middle_idx' - queue = 'middle_idx' - errorStrategy = { task.exitStatus in 
[140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 'ignore' : 'finish' } - maxRetries = 1 - maxErrors = '-1' -} - -params { - max_memory = 376.GB - max_cpus = 32 - max_time = '48.h' -} diff --git a/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results b/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results index eee33aa6a..5cb7c418d 100755 --- a/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results +++ b/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results @@ -3,7 +3,6 @@ cd $(date '+%Y%m%d')_entrega01 #Create symbolic links depending on the analysis #Individual files -ln -s ../../ANALYSIS/*_MET/99-stats/multiqc_report.html ./krona_results.html ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/all_samples_completo.txt . ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/A_H* . ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/B . diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog b/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog deleted file mode 100644 index 6685af52f..000000000 --- a/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog +++ /dev/null @@ -1,27 +0,0 @@ -#module load singularity - -cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/kraken2_report.txt ./${in}_kraken2_report.txt; done - -scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") - -cat < multiqc.sbatch -#!/bin/sh -#SBATCH --ntasks 1 -#SBATCH --cpus-per-task 2 -#SBATCH --mem 4G -#SBATCH --time 00:30:00 -#SBATCH --partition short_idx -#SBATCH --output $(date '+%Y%m%d')_multiqc.log -#SBATCH --chdir $scratch_dir - -export NXF_OPTS="-Xms500M -Xmx4G" - -singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/multiqc:1.9--py_1 multiqc -d . --config multiqc_config.yaml - -EOF - -echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh - -ln -s ../../../DOC/multiqc_config.yml . - -echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/lablog b/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/lablog deleted file mode 100644 index 83e293d6f..000000000 --- a/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/lablog +++ /dev/null @@ -1,30 +0,0 @@ -ln -s ../00-reads . -ln -s ../samples_id.txt . 
- -#module load Nextflow -#module load singularity - -scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") - -cat < mag.sbatch -#!/bin/sh -#SBATCH --ntasks 1 -#SBATCH --cpus-per-task 2 -#SBATCH --mem 4G -#SBATCH --time 2:00:00 -#SBATCH --partition middle_idx -#SBATCH --output $(date '+%Y%m%d')_mag.log -#SBATCH --chdir $scratch_dir - -export NXF_OPTS="-Xms500M -Xmx4G" - -nextflow run /data/bi/pipelines/nf-core-mag-2.1.1/workflow/main.nf \\ - -c ../../DOC/mag.config \\ - --input '00-reads/*_R{1,2}.fastq.gz' \\ - --outdir $(date '+%Y%m%d')_mag \\ - --kraken2_db /data/bi/references/kraken/minikraken_8GB_20200312.tgz \\ - --skip_busco --skip_spades --skip_spadeshybrid --skip_megahit --skip_prodigal --skip_binning \\ - -resume -EOF - -echo "sbatch mag.sbatch" > _01_run_mag.sh diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 916f3d7fb..f63925f21 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -401,7 +401,6 @@ rm create_summary_report.sh rm deduplicate_long_table.sh rm percentajeNs.py rm _02_create_run_percentage_Ns.sh -mv DATE_ANALYSIS0X_MAG $(date '+%Y%m%d')_ANALYSIS0X_MAG cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd .. echo_green "\nLablog_viralrecon execution has been completed successfully!" \ No newline at end of file diff --git a/bu_isciii/templates/viralrecon/DOC/mag.config b/bu_isciii/templates/viralrecon/DOC/mag.config deleted file mode 100644 index 732980bf1..000000000 --- a/bu_isciii/templates/viralrecon/DOC/mag.config +++ /dev/null @@ -1,19 +0,0 @@ -singularity { - enabled = true - autoMounts = true -} - -process { - executor = 'slurm' - queue = 'middle_idx' - queue = 'middle_idx' - errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 
'ignore' : 'finish' } - maxRetries = 1 - maxErrors = '-1' -} - -params { - max_memory = 376.GB - max_cpus = 32 - max_time = '48.h' -} diff --git a/bu_isciii/templates/viralrecon/DOC/multiqc_config.yml b/bu_isciii/templates/viralrecon/DOC/multiqc_config.yml deleted file mode 100644 index 96b7e6136..000000000 --- a/bu_isciii/templates/viralrecon/DOC/multiqc_config.yml +++ /dev/null @@ -1,13 +0,0 @@ -extra_fn_clean_exts: - - _R1 - - _R2 - - .R1 - - .R2 - - .sort - - _sort - - .stats - - _bamstat - - _align - - .txt -report_comment: > - This report has been generated by BU-ISCIII diff --git a/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results index a46faba6b..d05b2ab77 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/lablog_viralrecon_results @@ -21,7 +21,6 @@ cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANAL cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table.csv ${in}_variants_long_table.csv; done for ref in $(cut -f2 ../../ANALYSIS/samples_ref.txt | sort | uniq); do for organism in $(cut -f3 ../../ANALYSIS/samples_ref.txt | tr '[:lower:]' '[:upper:]' | sort | uniq); do report="../../ANALYSIS/*_${organism}/${ref}_*_viralrecon_mapping/multiqc/multiqc_report.html"; if [ -e $report ]; then ln -s ${report} ./multiqc_report_${ref}_${organism}.html; else echo "Multiqc report not found for reference $ref and organism $organism"; fi; done; done -ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.tsv ./pikavirus_table.tsv From 9d8b08d26a853dc0cca4a21e16ed9be61a14cc9c Mon Sep 17 00:00:00 2001 From: svarona Date: Fri, 21 Jun 2024 15:54:13 +0200 Subject: [PATCH 050/321] Fixed new service to handle more than one service_id --- bu_isciii/new_service.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/bu_isciii/new_service.py b/bu_isciii/new_service.py index cbbfe81ee..f3ab412bc 100755 --- a/bu_isciii/new_service.py +++ b/bu_isciii/new_service.py @@ -125,16 +125,19 @@ def copy_template(self): ) services_ids = bu_isciii.utils.get_service_ids(self.services_requested) services_json = bu_isciii.service_json.ServiceJson() - if len(services_ids) == 1: + for service_id in services_ids: try: - service_template = services_json.get_find(services_ids[0], "template") + service_template = services_json.get_find(service_id, "template") + service_end = services_json.get_find(service_id, "end") except KeyError as e: stderr.print( "[red]ERROR: Service id %s not found in services json file." - % services_ids[0] + % service_id ) stderr.print("traceback error %s" % e) sys.exit() + if service_end not in services_ids and service_end != '': + services_ids.append(service_end) try: shutil.copytree( os.path.join( @@ -153,13 +156,6 @@ def copy_template(self): stderr.print("[red]ERROR: Copying template failed.") stderr.print("traceback error %s" % e) sys.exit() - else: - stderr.print( - "[red] ERROR: I'm not already prepared for handling more than one error at the same time, sorry!" - "Please re-run and select one of the service ids." 
- ) - sys.exit(1) - return False return True def create_samples_id(self): From 681bbe64a6a93577fd8abe32802d865f357c78c9 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 24 Jun 2024 09:03:11 +0200 Subject: [PATCH 051/321] moved the code to utils --- bu_isciii/new_service.py | 3 --- bu_isciii/utils.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/bu_isciii/new_service.py b/bu_isciii/new_service.py index f3ab412bc..c775b928a 100755 --- a/bu_isciii/new_service.py +++ b/bu_isciii/new_service.py @@ -128,7 +128,6 @@ def copy_template(self): for service_id in services_ids: try: service_template = services_json.get_find(service_id, "template") - service_end = services_json.get_find(service_id, "end") except KeyError as e: stderr.print( "[red]ERROR: Service id %s not found in services json file." @@ -136,8 +135,6 @@ def copy_template(self): ) stderr.print("traceback error %s" % e) sys.exit() - if service_end not in services_ids and service_end != '': - services_ids.append(service_end) try: shutil.copytree( os.path.join( diff --git a/bu_isciii/utils.py b/bu_isciii/utils.py index 821210b3c..5408e7c72 100755 --- a/bu_isciii/utils.py +++ b/bu_isciii/utils.py @@ -6,6 +6,7 @@ import json import os import tarfile +import sys import questionary import rich @@ -167,6 +168,19 @@ def get_service_ids(services_requested): if services["service_id"] is not None: service_id_list.append(services["service_id"]) service_id_list_all.append(services["service_id"]) + services_json = bu_isciii.service_json.ServiceJson() + try: + service_end = services_json.get_find(services["service_id"], "end") + except KeyError as e: + stderr.print( + "[red]ERROR: Service id %s not found in services json file." + % services["service_id"] + ) + stderr.print("traceback error %s" % e) + sys.exit() + if service_end not in service_id_list and service_end != '': + service_id_list.append(service_end) + service_id_list_all.append(service_end) service_id_list_all.append("all") stderr.print("Which selected service do you want to manage?") services_sel = [prompt_selection("Service label:", service_id_list_all)] From f422a4728ee7b54d38d0030d807b3a3899347870 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 25 Jun 2024 14:53:08 +0200 Subject: [PATCH 052/321] Fixed multiple service in clean module --- bu_isciii/clean.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bu_isciii/clean.py b/bu_isciii/clean.py index 3dba081c0..63a42b8bc 100644 --- a/bu_isciii/clean.py +++ b/bu_isciii/clean.py @@ -129,10 +129,9 @@ def get_clean_items(self, services_ids, type="files"): for service in services_ids: try: items = service_conf.get_find_deep(service, type) - if len(clean_items_list) == 0 and len(items) > 0: - clean_items_list = items - elif len(items) > 0: - clean_items_list.append(items) + for item in items: + if item not in clean_items_list: + clean_items_list.append(item) except KeyError as e: stderr.print( "[red]ERROR: Service id %s not found in services json file." 
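Patches 050 and 051 change how a request is expanded: get_service_ids now looks up the requested service in services.json and, when its "end" key names a companion service, appends that companion to the list as well, so the companion's template is handled in the same run. A minimal sketch for inspecting that chain by hand — jq is not part of the tools, and the top-level layout of services.json (an object keyed by service id, as suggested by the snippet in patch 048) is an assumption:

service="mag_met"
# read the template and the chained "end" service for one service id
template=$(jq -r --arg s "$service" '.[$s].template' bu_isciii/templates/services.json)
end=$(jq -r --arg s "$service" '.[$s].end' bu_isciii/templates/services.json)
echo "service=${service} template=${template} end=${end:-none}"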
From 90df9e91fde9c87c47614e78ae153861bba2cd63 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 25 Jun 2024 14:53:50 +0200 Subject: [PATCH 053/321] fixed multiple services in copy_sftp module --- bu_isciii/copy_sftp.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bu_isciii/copy_sftp.py b/bu_isciii/copy_sftp.py index 0c1851424..cfd2b3890 100644 --- a/bu_isciii/copy_sftp.py +++ b/bu_isciii/copy_sftp.py @@ -112,8 +112,9 @@ def get_last_folders(self, services_ids, type="last_folder"): last_folders_list = [] for service in services_ids: try: - items = service_conf.get_find_deep(service, type) - last_folders_list.append(items) + item = service_conf.get_find_deep(service, type) + if item not in last_folders_list: + last_folders_list.append(item) except KeyError as e: stderr.print( "[red]ERROR: Service id %s not found in services json file." From b19358fb57e4507525e84f74755d47cc7ca7a802 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 25 Jun 2024 16:51:18 +0200 Subject: [PATCH 054/321] Fixed variable name for stderr --- bu_isciii/new_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/new_service.py b/bu_isciii/new_service.py index c775b928a..eca0f8cd1 100755 --- a/bu_isciii/new_service.py +++ b/bu_isciii/new_service.py @@ -207,8 +207,8 @@ def create_symbolic_links(self): ) except OSError as e: stderr.print( - "[red]ERROR: Symbolic links creation failed for sample %s." - % sample["sampleName"] + "[red]ERROR: Symbolic links creation failed for file %s." + % file ) stderr.print("Traceback: %s" % e) sys.exit() From 21332741d3968155e31c1314b471e12cc73ba07d Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 25 Jun 2024 16:51:50 +0200 Subject: [PATCH 055/321] Allow to continue after symb link error --- bu_isciii/new_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/new_service.py b/bu_isciii/new_service.py index eca0f8cd1..280061738 100755 --- a/bu_isciii/new_service.py +++ b/bu_isciii/new_service.py @@ -211,7 +211,7 @@ def create_symbolic_links(self): % file ) stderr.print("Traceback: %s" % e) - sys.exit() + continue def samples_json(self): json_samples = json.dumps(self.service_samples, indent=4) From 39de6de948e81651eacaf384488cd7716a196488 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 25 Jun 2024 16:52:29 +0200 Subject: [PATCH 056/321] Avoid updating to in_progress when already in_progress --- bu_isciii/new_service.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/bu_isciii/new_service.py b/bu_isciii/new_service.py index 280061738..e0faaa5e8 100755 --- a/bu_isciii/new_service.py +++ b/bu_isciii/new_service.py @@ -230,9 +230,11 @@ def create_new_service(self): self.create_samples_id() self.create_symbolic_links() self.samples_json() - self.rest_api.put_request( - "update-state", "resolution", self.resolution_id, "state", "in_progress" - ) + if self.resolution_info["service_state"] != "in_progress": + self.rest_api.put_request( + "update-state", "resolution", self.resolution_id, "state", "in_progress" + ) + else: stderr.print( "[yellow]WARN: No samples recorded in service: " + self.resolution_id @@ -240,13 +242,14 @@ def create_new_service(self): if bu_isciii.utils.prompt_yn_question("Do you want to proceed?: "): self.create_folder() self.copy_template() - self.rest_api.put_request( - "update-state", - "resolution", - self.resolution_id, - "state", - "in_progress", - ) + if self.resolution_info["service_state"] != "in_progress": + 
self.rest_api.put_request( + "update-state", + "resolution", + self.resolution_id, + "state", + "in_progress", + ) else: stderr.print("Directory not created. Bye!") sys.exit(1) From 80f5df8b38a216ef348c558ccd9e0f366fd2413e Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 25 Jun 2024 16:58:21 +0200 Subject: [PATCH 057/321] fixed black --- bu_isciii/new_service.py | 9 ++++++--- bu_isciii/utils.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/bu_isciii/new_service.py b/bu_isciii/new_service.py index e0faaa5e8..f57e301da 100755 --- a/bu_isciii/new_service.py +++ b/bu_isciii/new_service.py @@ -207,8 +207,7 @@ def create_symbolic_links(self): ) except OSError as e: stderr.print( - "[red]ERROR: Symbolic links creation failed for file %s." - % file + "[red]ERROR: Symbolic links creation failed for file %s." % file ) stderr.print("Traceback: %s" % e) @@ -232,7 +231,11 @@ def create_new_service(self): self.samples_json() if self.resolution_info["service_state"] != "in_progress": self.rest_api.put_request( - "update-state", "resolution", self.resolution_id, "state", "in_progress" + "update-state", + "resolution", + self.resolution_id, + "state", + "in_progress", ) else: diff --git a/bu_isciii/utils.py b/bu_isciii/utils.py index 5408e7c72..07ba82939 100755 --- a/bu_isciii/utils.py +++ b/bu_isciii/utils.py @@ -178,7 +178,7 @@ def get_service_ids(services_requested): ) stderr.print("traceback error %s" % e) sys.exit() - if service_end not in service_id_list and service_end != '': + if service_end not in service_id_list and service_end != "": service_id_list.append(service_end) service_id_list_all.append(service_end) From 9db234bdd2fe36858afbfd4faaabfcbffab79ca5 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 25 Jun 2024 17:44:34 +0200 Subject: [PATCH 058/321] Updated changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d378aa4b..f9a97e31a 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,6 +48,7 @@ Code contributions to the new version: - Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282) - Updated characterization and snippy templates to fit bacass pipeline. Corrected path in 05-iqtree in snippy template. [#283](https://github.com/BU-ISCIII/buisciii-tools/pull/283) - Included multiqc_report.html in RESULTS folder in every service, where necessary [#265](https://github.com/BU-ISCIII/buisciii-tools/pull/265) +- Added MAG template and removed MAG from other templates [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288) ### Modules - PR [#274](https://github.com/BU-ISCIII/buisciii-tools/pull/274): added `--dev` option, configuration dev and test folder structure. - PR [#276](https://github.com/BU-ISCIII/buisciii-tools/pull/276): wkhtmlpdf does not need absolute path to executable. Added better error handling when executable does not exist. +- PR [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288) Allowed to handle more than one service at a time, related to issue [#217](https://github.com/BU-ISCIII/buisciii-tools/issues/217) #### Fixes - Fixed archive module. Updated correct header for scout tsv [#258](https://github.com/BU-ISCIII/buisciii-tools/pull/258). - Fixed clean module. Corrected purge_files function. Renaming stage moved from clean to rename_nocopy option. Updated services.json file with correct paths for some services.
[#280](https://github.com/BU-ISCIII/buisciii-tools/pull/280) - Fixed autoclean-sftp function. [#281](https://github.com/BU-ISCIII/buisciii-tools/pull/281) - Fixed bioinfo_doc.py. Modified it so that this module creates a .pdf file including new-line characters, without merging lines into one single line [#259](https://github.com/BU-ISCIII/buisciii-tools/pull/259). +- PR [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288) Fixed updating service's state to in_progress multiple times, related with issue [#285](https://github.com/BU-ISCIII/buisciii-tools/issues/285) #### Changed From 0c2fd23d99bbcd70f564137e27ba6085e9a3e89f Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 27 Jun 2024 09:10:40 +0200 Subject: [PATCH 059/321] Removed owner Pablo --- bu_isciii/config_json.py | 1 - bu_isciii/service_json.py | 1 - 2 files changed, 2 deletions(-) diff --git a/bu_isciii/config_json.py b/bu_isciii/config_json.py index cafad194b..5035ec938 100644 --- a/bu_isciii/config_json.py +++ b/bu_isciii/config_json.py @@ -24,7 +24,6 @@ def get_configuration(self, topic): def get_find(self, topic, found): """ - Owner: Pablo Description: Obtain from topic any forward items from json data """ diff --git a/bu_isciii/service_json.py b/bu_isciii/service_json.py index d9f2a280b..bbe8a1112 100644 --- a/bu_isciii/service_json.py +++ b/bu_isciii/service_json.py @@ -38,7 +38,6 @@ def get_service_configuration(self, service): def get_find(self, service, found): """ - Owner: Pablo Description: Obtain from service any forward items from json data """ From 236bec09108ee1989d70fad9f5adef127d1646ca Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 27 Jun 2024 10:28:23 +0200 Subject: [PATCH 060/321] Changed safe to True --- bu_isciii/archive.py | 4 ++-- bu_isciii/bioinfo_doc.py | 4 ++-- bu_isciii/clean.py | 2 +- bu_isciii/copy_sftp.py | 2 +- bu_isciii/new_service.py | 3 +-- bu_isciii/scratch.py | 2 +- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/bu_isciii/archive.py b/bu_isciii/archive.py index 6d84f1610..d89f36e4b 100644 --- a/bu_isciii/archive.py +++ b/bu_isciii/archive.py @@ -224,7 +224,7 @@ def __init__( try: for service in rest_api.get_request( request_info="services", - safe=False, + safe=True, state="delivered", date_from=str(self.date_from), date_until=str(self.date_until), @@ -259,7 +259,7 @@ def __init__( if isinstance( ( service_data := rest_api.get_request( - request_info="service-data", safe=False, service=service + request_info="service-data", safe=True, service=service ) ), int, diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py index a66ecd282..5583ab8a8 100755 --- a/bu_isciii/bioinfo_doc.py +++ b/bu_isciii/bioinfo_doc.py @@ -77,7 +77,7 @@ def __init__( conf_api["server"], conf_api["api_url"], api_user, api_password ) self.resolution_info = self.rest_api.get_request( - request_info="service-data", safe=False, resolution=self.resolution_id + request_info="service-data", safe=True, resolution=self.resolution_id ) if self.resolution_info == 404: print("Received Error 404 from Iskylims API. 
Aborting") @@ -92,7 +92,7 @@ def __init__( else: self.post_delivery_info() self.resolution_info = self.rest_api.get_request( - request_info="service-data", safe=False, resolution=self.resolution_id + request_info="service-data", safe=True, resolution=self.resolution_id ) self.services_requested = self.resolution_info["resolutions"][0][ "available_services" diff --git a/bu_isciii/clean.py b/bu_isciii/clean.py index 63a42b8bc..6536c306a 100644 --- a/bu_isciii/clean.py +++ b/bu_isciii/clean.py @@ -47,7 +47,7 @@ def __init__( conf_api["server"], conf_api["api_url"], api_user, api_password ) self.resolution_info = rest_api.get_request( - request_info="service-data", safe=False, resolution=self.resolution_id + request_info="service-data", safe=True, resolution=self.resolution_id ) self.service_folder = self.resolution_info["resolutions"][0][ "resolution_full_number" diff --git a/bu_isciii/copy_sftp.py b/bu_isciii/copy_sftp.py index cfd2b3890..9474b59be 100644 --- a/bu_isciii/copy_sftp.py +++ b/bu_isciii/copy_sftp.py @@ -50,7 +50,7 @@ def __init__( ) self.resolution_info = rest_api.get_request( - request_info="service-data", safe=False, resolution=self.resolution_id + request_info="service-data", safe=True, resolution=self.resolution_id ) if sftp_folder is None: self.sftp_folder = bu_isciii.utils.get_sftp_folder( diff --git a/bu_isciii/new_service.py b/bu_isciii/new_service.py index f57e301da..0bcf65561 100755 --- a/bu_isciii/new_service.py +++ b/bu_isciii/new_service.py @@ -54,7 +54,7 @@ def __init__( conf_api["server"], conf_api["api_url"], api_user, api_password ) self.resolution_info = self.rest_api.get_request( - request_info="service-data", safe=False, resolution=self.resolution_id + request_info="service-data", safe=True, resolution=self.resolution_id ) self.service_folder = self.resolution_info["resolutions"][0][ "resolution_full_number" @@ -210,7 +210,6 @@ def create_symbolic_links(self): "[red]ERROR: Symbolic links creation failed for file %s." % file ) stderr.print("Traceback: %s" % e) - continue def samples_json(self): json_samples = json.dumps(self.service_samples, indent=4) diff --git a/bu_isciii/scratch.py b/bu_isciii/scratch.py index 9f2a9984f..d165c8e92 100755 --- a/bu_isciii/scratch.py +++ b/bu_isciii/scratch.py @@ -64,7 +64,7 @@ def __init__( self.conf = conf.get_configuration("scratch_copy") self.resolution_info = rest_api.get_request( - request_info="service-data", safe=False, resolution=self.resolution_id + request_info="service-data", safe=True, resolution=self.resolution_id ) self.service_folder = self.resolution_info["resolutions"][0][ "resolution_full_number" From bb50952036e477e66e257fe707c2e91639b041e8 Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 27 Jun 2024 10:29:18 +0200 Subject: [PATCH 061/321] Fixed possible error when type does not exist in json --- bu_isciii/clean.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/bu_isciii/clean.py b/bu_isciii/clean.py index 6536c306a..dbc6625c5 100644 --- a/bu_isciii/clean.py +++ b/bu_isciii/clean.py @@ -129,9 +129,16 @@ def get_clean_items(self, services_ids, type="files"): for service in services_ids: try: items = service_conf.get_find_deep(service, type) - for item in items: - if item not in clean_items_list: - clean_items_list.append(item) + if items: + for item in items: + if item not in clean_items_list: + clean_items_list.append(item) + else: + stderr.print( + "[red]ERROR: Service type %s not found in services json file for service %s." 
+                        % (type,service)
+                    )
+                    sys.exit()
             except KeyError as e:
                 stderr.print(
                     "[red]ERROR: Service id %s not found in services json file."

From 387e4f48c3f3d40559de988f04923f8df25b3e32 Mon Sep 17 00:00:00 2001
From: svarona
Date: Thu, 27 Jun 2024 10:30:04 +0200
Subject: [PATCH 062/321] Fixed black

---
 bu_isciii/clean.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bu_isciii/clean.py b/bu_isciii/clean.py
index dbc6625c5..cc514628a 100644
--- a/bu_isciii/clean.py
+++ b/bu_isciii/clean.py
@@ -136,7 +136,7 @@ def get_clean_items(self, services_ids, type="files"):
                 else:
                     stderr.print(
                         "[red]ERROR: Service type %s not found in services json file for service %s."
-                        % (type,service)
+                        % (type, service)
                     )
                     sys.exit()
             except KeyError as e:

From d01ac11edb191940d439c0ea439bb30da634d3ec Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Wed, 26 Jun 2024 15:44:16 +0200
Subject: [PATCH 063/321] Created 03-armfinderplus and lablog

---
 .../ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog

diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog
new file mode 100644
index 000000000..7ef828b00
--- /dev/null
+++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog
@@ -0,0 +1,7 @@
+# conda activate amrfinder
+
+# Only works with assemblies; prior to this you must have executed the assembly pipeline
+
+scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g')
+
+cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --organism Clostridioides_difficile --name ${in} --plus -o ${in}_out &" >> _01_run_amrfinder.sh; done
\ No newline at end of file

From 4ac57c34cdafdea65f20cef5877b41d6207d5f12 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Wed, 26 Jun 2024 16:36:07 +0200
Subject: [PATCH 064/321] Added organism selection when lablog executed

---
 .../03-armfinderplus/lablog | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog
index 7ef828b00..19920152a 100644
--- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog
+++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog
@@ -4,4 +4,20 @@

 scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g')

-cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --organism Clostridioides_difficile --name ${in} --plus -o ${in}_out &" >> _01_run_amrfinder.sh; done
\ No newline at end of file
+possible_organisms=("Acinetobacter_baumannii" "Burkholderia_cepacia" "Burkholderia_pseudomallei" "Campylobacter" "Citrobacter_freundii" "Clostridioides_difficile" "Enterobacter_asburiae" "Enterobacter_cloacae" "Enterococcus_faecalis" "Enterococcus_faecium" "Escherichia" "Klebsiella_oxytoca" "Klebsiella_pneumoniae" "Neisseria_gonorrhoeae" "Neisseria_meningitidis" "Pseudomonas_aeruginosa" "Salmonella" "Serratia_marcescens" "Staphylococcus_aureus" "Staphylococcus_pseudintermedius" "Streptococcus_agalactiae" "Streptococcus_pneumoniae" "Streptococcus_pyogenes" "Vibrio_cholerae" "Vibrio_parahaemolyticus" "Vibrio_vulnificus" "OTHER")
+
+echo "Please select your bacteria from the following list:"
+PS3=$(echo -e "\n\033[1;37mSelect number:\033[0m ")
+select bacteria in "${possible_organisms[@]}"; do
+    if [[ -n "$bacteria" ]]; then
+        echo -e "\033[0;32mOrganism selected: ${bacteria}\033[0m"
+        if [ $bacteria = "OTHER" ]; then
+            cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --name ${in} --plus -o ${in}_out &" >> _01_run_amrfinder.sh; done
+        else
+            cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --organism ${bacteria} --name ${in} --plus -o ${in}_out &" >> _01_run_amrfinder.sh; done
+        fi
+        break
+    else
+        echo -e "\n\033[0;31mInvalid input.\033[0m"
+    fi
+done
\ No newline at end of file

From 30cf68f8ebd53382c3e9ec24465a8b4a311fb9e6 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Wed, 26 Jun 2024 16:44:12 +0200
Subject: [PATCH 065/321] Added some format

---
 .../ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog
index 19920152a..981b66fa6 100644
--- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog
+++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog
@@ -5,8 +5,8 @@
 scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g')

 possible_organisms=("Acinetobacter_baumannii" "Burkholderia_cepacia" "Burkholderia_pseudomallei" "Campylobacter" "Citrobacter_freundii" "Clostridioides_difficile" "Enterobacter_asburiae" "Enterobacter_cloacae" "Enterococcus_faecalis" "Enterococcus_faecium" "Escherichia" "Klebsiella_oxytoca" "Klebsiella_pneumoniae" "Neisseria_gonorrhoeae" "Neisseria_meningitidis" "Pseudomonas_aeruginosa" "Salmonella" "Serratia_marcescens" "Staphylococcus_aureus" "Staphylococcus_pseudintermedius" "Streptococcus_agalactiae" "Streptococcus_pneumoniae" "Streptococcus_pyogenes" "Vibrio_cholerae" "Vibrio_parahaemolyticus" "Vibrio_vulnificus" "OTHER")
-
-echo "Please select your bacteria from the following list:"
+echo
+echo -e "\n\033[1;37mPlease select your bacteria from the following list:\033[0m"
 PS3=$(echo -e "\n\033[1;37mSelect number:\033[0m ")
 select bacteria in "${possible_organisms[@]}"; do
     if [[ -n "$bacteria" ]]; then

From a838fb8f577d98fce9c3f643a7000176ee68f977 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Wed, 26 Jun 2024 17:13:23 +0200
Subject: [PATCH 066/321] Modified lablog_characterization_results including amrfinderplus files

---
 .../RESULTS/lablog_characterization_results | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results
index 8879a843b..6a41c5703 100644
--- a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results
+++ b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results
@@ -1,12 +1,14 @@
-DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega"
+DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01"

-mkdir -p $DELIVERY_FOLDER
-mkdir "${DELIVERY_FOLDER}/characterization"
+mkdir -p "${DELIVERY_FOLDER}/characterization/amrfinderplus"

 # ARIBA characterization service
 cd $DELIVERY_FOLDER/characterization
 ln -s ../../../ANALYSIS/*CHARACTERIZATION/99-stats/ariba_*.tsv .
 ln -s ../../../ANALYSIS/*CHARACTERIZATION/99-stats/ariba_*.csv .
-find . -xtype l -delete
-cd -
+cd amrfinderplus
+find ../../../../ANALYSIS/*CHARACTERIZATION/*amrfinderplus -name '*_out' -exec bash -c 'ln -s "$1" "$(basename "${1%_out}.tsv")"' _ {} \;
+find .. -xtype l -delete
+
+cd ../..

From 03d3e3ce275cd214baf54d74e3551a2363a93af8 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Wed, 26 Jun 2024 17:15:38 +0200
Subject: [PATCH 067/321] Renamed 03-amrfinderplus folder

---
 .../{03-armfinderplus => 03-amrfinderplus}/lablog | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/{03-armfinderplus => 03-amrfinderplus}/lablog (100%)

diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog
similarity index 100%
rename from bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-armfinderplus/lablog
rename to bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog

From ebd448ede3b956c569d3ead94ebb7a84b79c9504 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 27 Jun 2024 13:28:49 +0200
Subject: [PATCH 068/321] Fixed changelog conflict

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f9a97e31a..5aae27a46 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -49,6 +49,7 @@ Code contributions to the new version:
 - Updated characterization and snippy templates to fit bacass pipeline. Corrected path in 05-iqtree in snippy template. [#283](https://github.com/BU-ISCIII/buisciii-tools/pull/283)
 - Included multiqc_report.html in RESULTS folder in every service, where necessary [#265](https://github.com/BU-ISCIII/buisciii-tools/pull/265)
 - Added MAG template and removed MAG from other templates [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288)
+- Added amrfinderplus to characterization template. [#289](https://github.com/BU-ISCIII/buisciii-tools/pull/289)

 ### Modules

From d65b97727a46bbdfa688de5fe1d0d04f3af1d783 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 27 Jun 2024 09:12:53 +0200
Subject: [PATCH 069/321] Added tsv extension to output files for amrfinderplus

---
 .../ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog
index 981b66fa6..a75150202 100644
--- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog
+++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog
@@ -12,9 +12,9 @@ select bacteria in "${possible_organisms[@]}"; do
     if [[ -n "$bacteria" ]]; then
         echo -e "\033[0;32mOrganism selected: ${bacteria}\033[0m"
         if [ $bacteria = "OTHER" ]; then
-            cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --name ${in} --plus -o ${in}_out &" >> _01_run_amrfinder.sh; done
+            cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --name ${in} --plus -o ${in}_out.tsv &" >> _01_run_amrfinder.sh; done
         else
-            cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --organism ${bacteria} --name ${in} --plus -o ${in}_out &" >> _01_run_amrfinder.sh; done
+            cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --organism ${bacteria} --name ${in} --plus -o ${in}_out.tsv &" >> _01_run_amrfinder.sh; done
         fi
         break
     else

From 4d44862110b5c45be2bfafd0a47c84f37ffb1ce2 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 27 Jun 2024 09:16:23 +0200
Subject: [PATCH 070/321] Modified lablog_characterization_results in order to create symlinks for tsv files in amrfinderplus analysis

---
 .../characterization/RESULTS/lablog_characterization_results | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results
index 6a41c5703..a12cfbd96 100644
--- a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results
+++ b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results
@@ -8,7 +8,7 @@
 ln -s ../../../ANALYSIS/*CHARACTERIZATION/99-stats/ariba_*.tsv .
 ln -s ../../../ANALYSIS/*CHARACTERIZATION/99-stats/ariba_*.csv .

 cd amrfinderplus
-find ../../../../ANALYSIS/*CHARACTERIZATION/*amrfinderplus -name '*_out' -exec bash -c 'ln -s "$1" "$(basename "${1%_out}.tsv")"' _ {} \;
+ln -s ../../../../ANALYSIS/*CHARACTERIZATION/*amrfinderplus/*tsv .
 find .. -xtype l -delete

 cd ../..
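The two lablog patches above settle on a single delivery idiom: create the delivery folder, symlink whatever the analysis actually produced, then prune any link whose target never materialized. A minimal bash sketch of the same pattern, using hypothetical analysis/ and delivery/ paths rather than the template's real ones:

    # create the per-tool delivery folder and move into it
    mkdir -p delivery/characterization/amrfinderplus
    cd delivery/characterization/amrfinderplus
    # symlink every AMRFinderPlus report; an unmatched glob simply leaves
    # one dangling link named after the literal pattern
    ln -s ../../../analysis/*amrfinderplus/*_out.tsv . 2>/dev/null
    # -xtype l matches only symlinks whose target does not resolve, so
    # valid links survive and dangling ones are deleted
    find .. -xtype l -delete
    cd ../../..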
From e63981b38f891d11a9b042bd4219d98e82bdbbfc Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Tue, 18 Jun 2024 14:13:30 +0200
Subject: [PATCH 071/321] Updated plasmidid lablog

---
 .../templates/plasmidid/ANALYSIS/ANALYSIS02_PLASMIDID/lablog | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bu_isciii/templates/plasmidid/ANALYSIS/ANALYSIS02_PLASMIDID/lablog b/bu_isciii/templates/plasmidid/ANALYSIS/ANALYSIS02_PLASMIDID/lablog
index bc11143f8..b6d0b61ec 100644
--- a/bu_isciii/templates/plasmidid/ANALYSIS/ANALYSIS02_PLASMIDID/lablog
+++ b/bu_isciii/templates/plasmidid/ANALYSIS/ANALYSIS02_PLASMIDID/lablog
@@ -4,6 +4,6 @@ mkdir logs
 scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g')
 ln -s ../samples_id.txt .

-cat ../samples_id.txt | xargs -I @@ echo "srun --chdir $scratch_dir --output logs/PLASMIDID.@@.%j.log --job-name PLASMIDID_@@ --partition short_idx --time 05:00:00 env - PATH="$PATH" singularity exec --bind /data/bi/references/ --bind ${scratch_dir} --bind ${scratch_dir}/../../ /scratch/bi/singularity-images/depot.galaxyproject.org-singularity-plasmidid-1.6.5--hdfd78af_0.img plasmidID -1 ${scratch_dir}/../*ASSEMBLY/01-preprocessing/trimmed_sequences/@@_1.trim.fastq.gz -2 ${scratch_dir}/../*ASSEMBLY/01-preprocessing/trimmed_sequences/@@_2.trim.fastq.gz -d /data/bi/references/plasmidID/plasmid_ddbb/20200203/20200203_plasmids.fasta -s @@ -c ${scratch_dir}/../*ASSEMBLY/03-assembly/unicycler/@@.fasta -a ${scratch_dir}/plasmidID_annotation_config_file.txt --no-trim -o ${scratch_dir} &" > _01_plasmidID.sh
+cat ../samples_id.txt | xargs -I @@ echo "srun --chdir $scratch_dir --output logs/PLASMIDID.@@.%j.log --job-name PLASMIDID_@@ --partition short_idx --time 05:00:00 env - PATH="$PATH" singularity exec --bind /data/bi/references/ --bind ${scratch_dir} --bind ${scratch_dir}/../../ /data/bi/pipelines/singularity-images/depot.galaxyproject.org-singularity-plasmidid-1.6.5--hdfd78af_0.img plasmidID -1 ${scratch_dir}/../*ASSEMBLY/01-processing/fastp/@@_1.fastp.fastq.gz -2 ${scratch_dir}/../*ASSEMBLY/01-processing/fastp/@@_2.fastp.fastq.gz -d /data/bi/references/plasmidID/plasmid_ddbb/20200203/20200203_plasmids.fasta -s @@ -c ${scratch_dir}/../*ASSEMBLY/03-assembly/unicycler/@@.fasta -a ${scratch_dir}/plasmidID_annotation_config_file.txt --no-trim -o ${scratch_dir} &" > _01_plasmidID.sh

-echo "srun --chdir $scratch_dir --partition short_idx --time 1:00:00 --output logs/SUMMARY.%j.log /data/bi/pipelines/plasmidID/bin/summary_report_pid.py -i NO_GROUP -g &" > _02_summary_table.sh
+echo "srun --chdir $scratch_dir --partition short_idx --time 1:00:00 --output logs/SUMMARY.%j.log env - PATH="$PATH" singularity exec --bind /data/bi/references/ --bind ${scratch_dir} --bind ${scratch_dir}/../../ /data/bi/pipelines/singularity-images/depot.galaxyproject.org-singularity-plasmidid-1.6.5--hdfd78af_0.img summary_report_pid.py -i NO_GROUP -g &" > _02_summary_table.sh

From 84607b44d6b31249ccfcdfae4e18bc0a9beaf63c Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Tue, 18 Jun 2024 14:17:05 +0200
Subject: [PATCH 072/321] Modified pipelines paths where necessary

---
 .../IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog | 4 ++--
 .../templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog | 2 +-
 .../freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog | 2 +-
 .../ANALYSIS/ANALYSIS01_RBPANEL/07-annotation/lablog | 2 +-
 .../mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/02-kmerfinder/lablog | 2 +-
 .../templates/pikavirus/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog | 2 +-
 .../templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/lablog | 2 +-
 .../seek_and_destroy/ANALYSIS/ANALYSIS01_SEEK_DESTROY/lablog | 2 +-
 bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 4 ++--
 bu_isciii/templates/viralrecon/DOC/viralrecon.config | 2 +-
 .../templates/viralrecon/DOC/viralrecon_sars_nanopore.config | 4 ++--
 .../viralrecon/DOC/viralrecon_sars_nanopore_params.yml | 4 ++--
 12 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog
index 01c499462..c694a5b91 100755
--- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog
+++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog
@@ -5,7 +5,7 @@ mkdir logs

 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

-cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/bi/pipelines/flu-amd-202402/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} --external-config ../../../DOC/irma_config.sh &"; done > _01_irma.sh
+cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/bi/pipelines/flu-amd/flu-amd-1.1.4/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} --external-config ../../../DOC/irma_config.sh &"; done > _01_irma.sh

 echo 'bash create_irma_stats.sh' > _02_create_stats.sh

@@ -32,4 +32,4 @@ echo 'grep -w 'C__' irma_stats.txt | cut -f1 | while read sample; do cat C_fragm
 echo 'cat ../samples_id.txt | while read in; do cat ${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed 's@-@/@g' | sed 's/_A_/_/g' | sed 's/_B_/_/g' | sed 's/_C_/_/g' >> all_samples_completo.txt; done' >> _03_post_processing.sh
 echo 'sed "s/__//g" irma_stats.txt > clean_irma_stats.txt' >> _03_post_processing.sh
-echo 'sed "s/_\t/\t/g" irma_stats.txt > clean_irma_stats.txt' >> _03_post_processing.sh
\ No newline at end of file
+echo 'sed "s/_\t/\t/g" irma_stats.txt > clean_irma_stats.txt' >> _03_post_processing.sh
diff --git a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
index bd8f8549a..25e84ab36 100644
--- a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
+++ b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
@@ -107,7 +107,7 @@ cat <<EOF > assembly.sbatch
 # module load Nextflow/23.10.0 singularity
 export NXF_OPTS="-Xms500M -Xmx8G"

-nextflow run /data/bi/pipelines/nf-core-bacass/main.nf \\
+nextflow run /data/bi/pipelines/nf-core-bacass/nf-core-bacass-2.2.0dev/main.nf \\
     -c ../../DOC/hpc_slurm_assembly.config \\
     -profile singularity \\
     --input samplesheet.csv \\
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog
index 84c6c6dfa..77a8c99b7 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog
@@ -17,7 +17,7 @@ cat <<EOF > mag.sbatch

 export NXF_OPTS="-Xms500M -Xmx4G"

-nextflow run /scratch/bi/pipelines/nf-core-mag-2.1.1/workflow/main.nf \\
+nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.1.1/workflow/main.nf \\
     -c ../../DOC/mag.config \\
     --input '00-reads/*_R{1,2}.fastq.gz' \\
     --outdir $(date '+%Y%m%d')_mag \\
diff --git a/bu_isciii/templates/lowfreq_panel/ANALYSIS/ANALYSIS01_RBPANEL/07-annotation/lablog b/bu_isciii/templates/lowfreq_panel/ANALYSIS/ANALYSIS01_RBPANEL/07-annotation/lablog
index 29a8a8cbd..584fe9b36 100644
--- a/bu_isciii/templates/lowfreq_panel/ANALYSIS/ANALYSIS01_RBPANEL/07-annotation/lablog
+++ b/bu_isciii/templates/lowfreq_panel/ANALYSIS/ANALYSIS01_RBPANEL/07-annotation/lablog
@@ -27,7 +27,7 @@ echo "sbatch bcftools_query.sbatch" > _01_bcftools_query.sh

 # module load Java/1.8.0_281 R/4.1.0-foss-2021a

-cat ../samples_id.txt | xargs -I @@ echo "mkdir @@;srun --chdir ${scratch_dir} --output logs/KGGSEQ.@@.%j.log --job-name KGGSEQ --cpus-per-task 1 --mem 8192 --partition short_idx --time 02:00:00 java -jar -Xmx8g /data/bi/pipelines/kggseqhg19/kggseq.jar --no-web --buildver hg19 --vcf-file ../06-VarScan/@@/@@.vcf.gz --db-gene refgene --db-score dbnsfp --genome-annot --db-filter ESP5400,dbsnp141,1kg201305 --rare-allele-freq 1 --mendel-causing-predict best --omim-annot --out @@/@@_annot.txt --no-qc &" > _02_kggseq.sh
+cat ../samples_id.txt | xargs -I @@ echo "mkdir @@;srun --chdir ${scratch_dir} --output logs/KGGSEQ.@@.%j.log --job-name KGGSEQ --cpus-per-task 1 --mem 8192 --partition short_idx --time 02:00:00 java -jar -Xmx8g /data/bi/pipelines/kggseq/kggseqhg19/kggseq.jar --no-web --buildver hg19 --vcf-file ../06-VarScan/@@/@@.vcf.gz --db-gene refgene --db-score dbnsfp --genome-annot --db-filter ESP5400,dbsnp141,1kg201305 --rare-allele-freq 1 --mendel-causing-predict best --omim-annot --out @@/@@_annot.txt --no-qc &" > _02_kggseq.sh
 cat ../samples_id.txt | xargs -I % echo "gunzip %/%_annot.txt.flt.txt.gz" > _03_final_table.sh
 cat ../samples_id.txt | xargs -I % echo 'cp header %/%_header.table && tail -n +2 %/%.table >> %/%_header.table' >> _03_final_table.sh
 cat ../samples_id.txt | xargs -I % echo "Rscript merge_parse.R %" >> _03_final_table.sh
diff --git a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/02-kmerfinder/lablog b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/02-kmerfinder/lablog
index ea7174d24..eff310bb1 100644
--- a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/02-kmerfinder/lablog
+++ b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/02-kmerfinder/lablog
@@ -4,7 +4,7 @@ scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

 mkdir logs

-cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir ${scratch_dir} --mem 50G --output logs/KMERFINDER.${in}.%j.log singularity run -B /scratch/bi/ -B /data/bi/references/kmerfinder/20190108_stable_dirs/bacteria /scratch/bi/pipelines/kmerfinder_v3.0.simg -i ../01-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../01-preprocessing/${in}/${in}_R2_filtered.fastq.gz -o ${scratch_dir}/${in} -db /data/bi/references/kmerfinder/20190108_stable_dirs/bacteria/bacteria.ATG -tax /data/bi/references/kmerfinder/20190108_stable_dirs/bacteria/bacteria.name -x &";done > _01_kmerfinder.sh
+cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir ${scratch_dir} --mem 50G --output logs/KMERFINDER.${in}.%j.log singularity run -B /scratch/bi/ -B /data/bi/references/kmerfinder/20190108_stable_dirs/bacteria /data/bi/pipelines/singularity-images/kmerfinder_v3.0.simg -i ../01-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../01-preprocessing/${in}/${in}_R2_filtered.fastq.gz -o ${scratch_dir}/${in} -db /data/bi/references/kmerfinder/20190108_stable_dirs/bacteria/bacteria.ATG -tax /data/bi/references/kmerfinder/20190108_stable_dirs/bacteria/bacteria.name -x &";done > _01_kmerfinder.sh

 echo "cat ../samples_id.txt | xargs -I % awk '{FS=\"\t\"} NR==2 {print \$1}' %/results.txt | awk '{count[\$0]++} END{for (i in count) {print count[i], i}}' | sort -nr" > _02_find_common_reference.sh
diff --git a/bu_isciii/templates/pikavirus/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog b/bu_isciii/templates/pikavirus/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
index c91b4010f..ef1be3ced 100644
--- a/bu_isciii/templates/pikavirus/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
+++ b/bu_isciii/templates/pikavirus/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
@@ -20,7 +20,7 @@ cat <<EOF > pikavirus.sbatch

 export NXF_OPTS="-Xms500M -Xmx4G"

-nextflow run /scratch/bi/pipelines/PikaVirus/main.nf \\
+nextflow run /data/bi/pipelines/pikavirus/PikaVirus/main.nf \\
     -c ../../DOC/hpc_slurm_pikavirus.config \\
     --input samplesheet.csv \\
     --kraken_scouting false \\
diff --git a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/lablog b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/lablog
index d0674cda9..a39b66986 100644
--- a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/lablog
+++ b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/lablog
@@ -19,7 +19,7 @@ cat <<EOF > rnaseq.sbatch

 export NXF_OPTS="-Xms500M -Xmx4G"

-/data/bi/pipelines/nf-core-rnaseq-3.10.1/workflow/main.nf \\
+/data/bi/pipelines/nf-core-rnaseq/nf-core-rnaseq-3.10.1/workflow/main.nf \\
     -c ../../DOC/hpc_slurm_rnaseq.config \\
     -params-file ../../DOC/hg38_ensmbl_rnaseq.yml \\
     --input samplesheet.csv \\
diff --git a/bu_isciii/templates/seek_and_destroy/ANALYSIS/ANALYSIS01_SEEK_DESTROY/lablog b/bu_isciii/templates/seek_and_destroy/ANALYSIS/ANALYSIS01_SEEK_DESTROY/lablog
index a960ff2e3..87cdb1f10 100644
--- a/bu_isciii/templates/seek_and_destroy/ANALYSIS/ANALYSIS01_SEEK_DESTROY/lablog
+++ b/bu_isciii/templates/seek_and_destroy/ANALYSIS/ANALYSIS01_SEEK_DESTROY/lablog
@@ -21,7 +21,7 @@ cat <<EOF > seek_destroy.sbatch

 export NXF_OPTS="-Xms500M -Xmx4G"

-nextflow run /scratch/bi/pipelines/Seek-Destroy/main.nf \\
+nextflow run /data/bi/pipelines/seek-destroy-nf/main.nf \\
     -c ../../DOC/seek_destroy.config \\
     --input samplesheet.csv \\
     --scout_database /data/bi/references/kraken/minikraken_8GB_20200312.tgz \\
diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index f63925f21..3a81fc18a 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -363,7 +363,7 @@ do
     echo "#SBATCH --output ${ref}_$(date '+%Y%m%d')_viralrecon.log" >> ${FOLDER_NAME}/lablog
     printf "#SBATCH --chdir \$scratch_dir\n\n" >> ${FOLDER_NAME}/lablog
     printf 'export NXF_OPTS="-Xms500M -Xmx4G"\n\n' >> ${FOLDER_NAME}/lablog
-    echo "nextflow run /data/bi/pipelines/nf-core-viralrecon-2.6.0/workflow/main.nf \\\\" >> ${FOLDER_NAME}/lablog
+    echo "nextflow run /data/bi/pipelines/nf-core-viralrecon/nf-core-viralrecon-2.6.0/workflow/main.nf \\\\" >> ${FOLDER_NAME}/lablog
     echo " -c ../${CONFIG_FILE} \\\\" >> ${FOLDER_NAME}/lablog
     echo " -params-file ../${PARAMS_FILE} \\\\" >> ${FOLDER_NAME}/lablog
     echo " --input samplesheet_${ref}.csv \\\\" >> ${FOLDER_NAME}/lablog
@@ -403,4 +403,4 @@ rm percentajeNs.py
 rm _02_create_run_percentage_Ns.sh
 cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd ..

-echo_green "\nLablog_viralrecon execution has been completed successfully!"
\ No newline at end of file
+echo_green "\nLablog_viralrecon execution has been completed successfully!"
diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon.config b/bu_isciii/templates/viralrecon/DOC/viralrecon.config
index 564f23d33..ea6317c97 100644
--- a/bu_isciii/templates/viralrecon/DOC/viralrecon.config
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon.config
@@ -1,7 +1,7 @@
 singularity {
     enabled = true
     autoMounts = true
-    runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/ -B "$HOME"'
+    runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/artic-ncov2019/ -B "$HOME"'
 }

 process {
diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config
index 31ca69413..c844e28dd 100644
--- a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config
@@ -1,7 +1,7 @@
 singularity {
     enabled = true
     autoMounts = true
-    runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/'
+    runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/artic-ncov2019/'
 }

 process {
@@ -10,7 +10,7 @@ process {
     withName: 'ARTIC_MINION' {
         ext.args = [
             '--normalise 500',
-            '--scheme-directory /data/bi/pipelines/artic-ncov2019/primer_schemes/',
+            '--scheme-directory /data/bi/pipelines/artic-ncov2019/artic-ncov2019/primer_schemes/',
             '--medaka'
         ].join(' ').trim()
     }
diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml
index 2ef67955d..c342c8e4e 100644
--- a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml
@@ -2,8 +2,8 @@ platform: 'nanopore'
 protocol: 'amplicon'
 artic_scheme: 'nCoV-2019'
 primer_set_version: 5
-fasta: '/data/bi/pipelines/artic-ncov2019/primer_schemes/nCoV-2019/V5/ESIB-EQA.reference.fasta'
-primer_bed: '/data/bi/pipelines/artic-ncov2019/primer_schemes/nCoV-2019/V5/ESIB-EQA.primer.bed'
+fasta: '/data/bi/pipelines/artic-ncov2019/artic-ncov2019/primer_schemes/nCoV-2019/V5/ESIB-EQA.reference.fasta'
+primer_bed: '/data/bi/pipelines/artic-ncov2019/artic-ncov2019/primer_schemes/nCoV-2019/V5/ESIB-EQA.primer.bed'
 primer_fasta: '../../RAW/ESIB_EQA_2023.SARS1/ESIB_EQA_2023.SARS1.primers.fasta'
 kraken2_db: '/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz'
 nextclade_dataset_name: 'sars-cov-2'

From 40b82c3adcc7e03e44d40fbd979621e3e62673ff Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Tue, 18 Jun 2024 14:35:55 +0200
Subject: [PATCH 073/321] Updated CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5aae27a46..abaf2f21a 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,7 @@ Code contributions to the new version:
 - Included multiqc_report.html in RESULTS folder in every service, where necessary [#265](https://github.com/BU-ISCIII/buisciii-tools/pull/265)
 - Added MAG template and removed MAG from other templates [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288)
 - Added amrfinderplus to characterization template. [#289](https://github.com/BU-ISCIII/buisciii-tools/pull/289)
+- Updated all files so that paths referring to /pipelines/ are updated according to the new structure [#287](https://github.com/BU-ISCIII/buisciii-tools/pull/287)

 ### Modules

From 0211853aff1c25bad81a279ae821dec0cd3c03c6 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Tue, 18 Jun 2024 15:55:58 +0200
Subject: [PATCH 074/321] Modified some auxiliary viralrecon files

---
 bu_isciii/templates/viralrecon/DOC/viralrecon.config | 2 +-
 .../templates/viralrecon/DOC/viralrecon_sars_nanopore.config | 4 ++--
 .../viralrecon/DOC/viralrecon_sars_nanopore_params.yml | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon.config b/bu_isciii/templates/viralrecon/DOC/viralrecon.config
index ea6317c97..bc862c2e5 100644
--- a/bu_isciii/templates/viralrecon/DOC/viralrecon.config
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon.config
@@ -1,7 +1,7 @@
 singularity {
     enabled = true
     autoMounts = true
-    runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/artic-ncov2019/ -B "$HOME"'
+    runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/artic-ncov2019_v0/ -B "$HOME"'
 }

 process {
diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config
index c844e28dd..03ef609e4 100644
--- a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config
@@ -1,7 +1,7 @@
 singularity {
     enabled = true
     autoMounts = true
-    runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/artic-ncov2019/'
+    runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/artic-ncov2019_v0/'
 }

 process {
@@ -10,7 +10,7 @@ process {
     withName: 'ARTIC_MINION' {
         ext.args = [
             '--normalise 500',
-            '--scheme-directory /data/bi/pipelines/artic-ncov2019/artic-ncov2019/primer_schemes/',
+            '--scheme-directory /data/bi/pipelines/artic-ncov2019/artic-ncov2019_v0/primer_schemes/',
             '--medaka'
         ].join(' ').trim()
     }
diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml
index c342c8e4e..56c95cbf4 100644
--- a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml
@@ -2,8 +2,8 @@ platform: 'nanopore'
 protocol: 'amplicon'
 artic_scheme: 'nCoV-2019'
 primer_set_version: 5
-fasta: '/data/bi/pipelines/artic-ncov2019/artic-ncov2019/primer_schemes/nCoV-2019/V5/ESIB-EQA.reference.fasta'
-primer_bed: '/data/bi/pipelines/artic-ncov2019/artic-ncov2019/primer_schemes/nCoV-2019/V5/ESIB-EQA.primer.bed'
+fasta: '/data/bi/pipelines/artic-ncov2019/artic-ncov2019_v0/primer_schemes/nCoV-2019/V5/ESIB-EQA.reference.fasta'
+primer_bed: '/data/bi/pipelines/artic-ncov2019/artic-ncov2019_v0/primer_schemes/nCoV-2019/V5/ESIB-EQA.primer.bed'
 primer_fasta: '../../RAW/ESIB_EQA_2023.SARS1/ESIB_EQA_2023.SARS1.primers.fasta'
 kraken2_db: '/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz'
 nextclade_dataset_name: 'sars-cov-2'

From 8ab181283966c64a1c936e8c9dcf079e5e8cc65d Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 27 Jun 2024 13:26:29 +0200
Subject: [PATCH 075/321] Fixed some paths in a few templates and auxiliary files

---
 bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog | 2 +-
 .../templates/pikavirus/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog | 2 +-
 .../seek_and_destroy/ANALYSIS/ANALYSIS01_SEEK_DESTROY/lablog | 2 +-
 bu_isciii/templates/viralrecon/DOC/viralrecon.config | 2 +-
 .../templates/viralrecon/DOC/viralrecon_sars_nanopore.config | 4 ++--
 .../viralrecon/DOC/viralrecon_sars_nanopore_params.yml | 4 ++--
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog
index 39408a717..d68708d31 100644
--- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog
+++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog
@@ -18,7 +18,7 @@ cat <<EOF > mag.sbatch

 export NXF_OPTS="-Xms500M -Xmx4G"

-nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.1.1/workflow/main.nf \\
+nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.5.3/workflow/main.nf \\
     -c ../../DOC/mag.config \\
     --input '00-reads/*_R{1,2}.fastq.gz' \\
     --outdir $(date '+%Y%m%d')_mag \\
diff --git a/bu_isciii/templates/pikavirus/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog b/bu_isciii/templates/pikavirus/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
index ef1be3ced..83da38684 100644
--- a/bu_isciii/templates/pikavirus/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
+++ b/bu_isciii/templates/pikavirus/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
@@ -20,7 +20,7 @@ cat <<EOF > pikavirus.sbatch

 export NXF_OPTS="-Xms500M -Xmx4G"

-nextflow run /data/bi/pipelines/pikavirus/PikaVirus/main.nf \\
+nextflow run /data/bi/pipelines/pikavirus/pikavirus-1.0dev/main.nf \\
     -c ../../DOC/hpc_slurm_pikavirus.config \\
     --input samplesheet.csv \\
     --kraken_scouting false \\
diff --git a/bu_isciii/templates/seek_and_destroy/ANALYSIS/ANALYSIS01_SEEK_DESTROY/lablog b/bu_isciii/templates/seek_and_destroy/ANALYSIS/ANALYSIS01_SEEK_DESTROY/lablog
index 87cdb1f10..381cbe632 100644
--- a/bu_isciii/templates/seek_and_destroy/ANALYSIS/ANALYSIS01_SEEK_DESTROY/lablog
+++ b/bu_isciii/templates/seek_and_destroy/ANALYSIS/ANALYSIS01_SEEK_DESTROY/lablog
@@ -21,7 +21,7 @@ cat <<EOF > seek_destroy.sbatch

 export NXF_OPTS="-Xms500M -Xmx4G"

-nextflow run /data/bi/pipelines/seek-destroy-nf/main.nf \\
+nextflow run /data/bi/pipelines/seek-destroy-nf/seek-destroy-nf-1.0dev/main.nf \\
     -c ../../DOC/seek_destroy.config \\
     --input samplesheet.csv \\
     --scout_database /data/bi/references/kraken/minikraken_8GB_20200312.tgz \\
diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon.config b/bu_isciii/templates/viralrecon/DOC/viralrecon.config
index bc862c2e5..4b5f4e2a5 100644
--- a/bu_isciii/templates/viralrecon/DOC/viralrecon.config
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon.config
@@ -1,7 +1,7 @@
 singularity {
     enabled = true
     autoMounts = true
-    runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/artic-ncov2019_v0/ -B "$HOME"'
+    runOptions = '-B /data/bi/references/ -B "$HOME"'
 }

 process {
diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config
index 03ef609e4..865d1a0d2 100644
--- a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore.config
@@ -1,7 +1,7 @@
 singularity {
     enabled = true
     autoMounts = true
-    runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/artic-ncov2019_v0/'
+    runOptions = '-B /data/bi/references/'
 }

 process {
@@ -10,7 +10,7 @@ process {
     withName: 'ARTIC_MINION' {
         ext.args = [
             '--normalise 500',
-            '--scheme-directory /data/bi/pipelines/artic-ncov2019/artic-ncov2019_v0/primer_schemes/',
+            '--scheme-directory /data/bi/references/virus/2019-nCoV/primer_schemes/',
             '--medaka'
         ].join(' ').trim()
     }
diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml
index 56c95cbf4..7134065a0 100644
--- a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars_nanopore_params.yml
@@ -2,8 +2,8 @@ platform: 'nanopore'
 protocol: 'amplicon'
 artic_scheme: 'nCoV-2019'
 primer_set_version: 5
-fasta: '/data/bi/pipelines/artic-ncov2019/artic-ncov2019_v0/primer_schemes/nCoV-2019/V5/ESIB-EQA.reference.fasta'
-primer_bed: '/data/bi/pipelines/artic-ncov2019/artic-ncov2019_v0/primer_schemes/nCoV-2019/V5/ESIB-EQA.primer.bed'
+fasta: '/data/bi/references/virus/2019-nCoV/primer_schemes/nCoV-2019/V5/ESIB-EQA.reference.fasta'
+primer_bed: '/data/bi/references/virus/2019-nCoV/primer_schemes/nCoV-2019/V5/ESIB-EQA.primer.bed'
 primer_fasta: '../../RAW/ESIB_EQA_2023.SARS1/ESIB_EQA_2023.SARS1.primers.fasta'
 kraken2_db: '/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz'
 nextclade_dataset_name: 'sars-cov-2'

From 0cbb21d76ae7aac6b58a37678d2247f304038e94 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 27 Jun 2024 13:35:57 +0200
Subject: [PATCH 076/321] Updated mag template

---
 .../templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog
index 77a8c99b7..0086364db 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog
@@ -17,7 +17,7 @@ cat <<EOF > mag.sbatch

 export NXF_OPTS="-Xms500M -Xmx4G"

-nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.1.1/workflow/main.nf \\
+nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.5.3/workflow/main.nf \\
     -c ../../DOC/mag.config \\
     --input '00-reads/*_R{1,2}.fastq.gz' \\
     --outdir $(date '+%Y%m%d')_mag \\

From 841bedd88e3b45ab58bf4dee8ad9f3df66dbb765 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 1 Feb 2024 13:58:18 +0100
Subject: [PATCH 077/321] Merge changes from main into hotfix (#195)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sara Monzón

From 18a1fe758a3868ff113f57206806db534aa18e1c Mon Sep 17 00:00:00 2001
From: svarona
Date: Mon, 1 Jul 2024 10:59:24 +0200
Subject: [PATCH 078/321] Fixed error because items can be None, an empty list, or a list with content

---
 bu_isciii/clean.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bu_isciii/clean.py b/bu_isciii/clean.py
index cc514628a..f1f14fc67 100644
--- a/bu_isciii/clean.py
+++ b/bu_isciii/clean.py
@@ -129,16 +129,16 @@ def get_clean_items(self, services_ids, type="files"):
         for service in services_ids:
             try:
                 items = service_conf.get_find_deep(service, type)
-                if items:
-                    for item in items:
-                        if item not in clean_items_list:
-                            clean_items_list.append(item)
-                else:
+                if items is None:
                     stderr.print(
                         "[red]ERROR: Service type %s not found in services json file for service %s."
                         % (type, service)
                     )
                     sys.exit()
+                else:
+                    for item in items:
+                        if item not in clean_items_list:
+                            clean_items_list.append(item)
             except KeyError as e:
                 stderr.print(
                     "[red]ERROR: Service id %s not found in services json file."

From fdcc10cf8739f9d9a673936eaac790b444ba55fe Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 4 Jul 2024 09:18:01 +0200
Subject: [PATCH 079/321] Updated assembly's lablog and config files

---
 .../assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog | 16 +++++++---------
 .../assembly/DOC/hpc_slurm_assembly.config | 16 ++++++++++++++++
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
index 25e84ab36..6124ca630 100644
--- a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
+++ b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
@@ -107,25 +107,23 @@ cat <<EOF > assembly.sbatch
 # module load Nextflow/23.10.0 singularity
 export NXF_OPTS="-Xms500M -Xmx8G"

-nextflow run /data/bi/pipelines/nf-core-bacass/nf-core-bacass-2.2.0dev/main.nf \\
+nextflow run /data/bi/pipelines/nf-core-bacass/nf-core-bacass-2.3.1/main.nf \\
     -c ../../DOC/hpc_slurm_assembly.config \\
     -profile singularity \\
     --input samplesheet.csv \\
     --outdir ./ \\
-    --assembly_type ${ASSEMBLY_MODE} \\
-    --assembler ${ASSEMBLER} \\
+    --assembly_type short \\
+    --assembler unicycler \\
     --skip_polish true \\
-    --save_trimmed ${SAVETRIMMED} \\
+    --save_trimmed false \\
     --fastp_args '--qualified_quality_phred 20 --cut_mean_quality 20' \\
     --skip_kraken2 true \\
     --skip_kmerfinder false \\
-    --kmerfinderdb /data/bi/references/kmerfinder/20190108_stable_dirs/bacteria \\
-    --ncbi_assembly_metadata /data/bi/references/bacteria/20191212/assembly_summary_bacteria.txt \\
-    ${PROKKA_ARGS} \\
+    --kmerfinderdb /data/bi/references/kmerfinder/20241004/bacteria \\
+    --ncbi_assembly_metadata /data/bi/references/bacteria/20240626/assembly_summary_refseq.txt \\
+    --prokka_args '--gram +' \\
     -resume
 EOF

 echo "sbatch assembly.sbatch" > _01_nf_assembly.sh
-
-
diff --git a/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config b/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config
index 04dddf4db..480d57f4a 100644
--- a/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config
+++ b/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config
@@ -228,4 +228,20 @@ process {
         ]
     ]
 }
+    withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTQC_RAW' {
+        maxRetries = 2
+        memory = {12.GB * task.attempt}
+    }
+    withName:KMERFINDER{
+        maxRetries = 2
+        memory = {12.GB * task.attempt}
+    }
+    withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTQC_TRIM' {
+        maxRetries = 2
+        memory = {12.GB * task.attempt}
+    }
+    withName: '.*:.*:UNICYCLER' {
+        maxRetries = 2
+        memory = {32.GB * task.attempt}
+    }
 }

From c3ec911c95de03135518eb1bbe1e470942425ba3 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 4 Jul 2024 09:24:33 +0200
Subject: [PATCH 080/321] Fixed minor mistakes in the lablog file

---
 .../assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
index 6124ca630..a196cc46d 100644
--- a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
+++ b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
@@ -112,16 +112,16 @@ nextflow run /data/bi/pipelines/nf-core-bacass/nf-core-bacass-2.3.1/main.nf \\
     -profile singularity \\
     --input samplesheet.csv \\
     --outdir ./ \\
-    --assembly_type short \\
-    --assembler unicycler \\
+    --assembly_type ${ASSEMBLY_MODE} \\
+    --assembler ${ASSEMBLER} \\
     --skip_polish true \\
-    --save_trimmed false \\
+    --save_trimmed ${SAVETRIMMED} \\
     --fastp_args '--qualified_quality_phred 20 --cut_mean_quality 20' \\
     --skip_kraken2 true \\
     --skip_kmerfinder false \\
     --kmerfinderdb /data/bi/references/kmerfinder/20241004/bacteria \\
     --ncbi_assembly_metadata /data/bi/references/bacteria/20240626/assembly_summary_refseq.txt \\
-    --prokka_args '--gram +' \\
+    ${PROKKA_ARGS} \\
     -resume
 EOF

From aa9296574256fdfa2acd703b3deb0f930426881c Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 4 Jul 2024 09:27:21 +0200
Subject: [PATCH 081/321] Updated CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index abaf2f21a..253b5add6 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -51,6 +51,7 @@ Code contributions to the new version:
 - Added MAG template and removed MAG from other templates [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288)
 - Added amrfinderplus to characterization template. [#289](https://github.com/BU-ISCIII/buisciii-tools/pull/289)
 - Updated all files so that paths referring to /pipelines/ are updated according to the new structure [#287](https://github.com/BU-ISCIII/buisciii-tools/pull/287)
+- Updated assembly's template (lablog and config files) [#295](https://github.com/BU-ISCIII/buisciii-tools/pull/295)

 ### Modules

From 9242e86019d3f7d523e9cfaa48e5ada231d32d5b Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 4 Jul 2024 11:05:58 +0200
Subject: [PATCH 082/321] Updated ariba's lablog

---
 .../ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog
index 9ddf07267..1bfc20c02 100644
--- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog
+++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog
@@ -9,8 +9,8 @@ join -j 2 ../../samples_id.txt ../databases.txt | sed 's/^ //g' > sample_databas
 # col 1 (arr[0]): sample
 # col 2 (arr[1]): database

-cat sample_database.txt | grep -v 'pubmlst' | while read in; do arr=($in); echo "mkdir -p ${arr[0]}; srun --chdir $scratch_dir --output logs/ARIBA_${arr[0]}_${arr[1]}.%j.log --job-name ARIBA_${arr[0]}_${arr[1]} --cpus-per-task 5 --mem 5G --partition short_idx --time 02:00:00 singularity exec -B ${scratch_dir}/../../../../ -B /data/bi/references/ariba/20211216/${arr[1]}/ /data/bi/pipelines/singularity-images/ariba:2.14.6--py36h4aaaa08_3 ariba run /data/bi/references/ariba/20211216/${arr[1]}/out.${arr[1]}.prepareref ${scratch_dir}/../../01-preprocessing/${arr[0]}/${arr[0]}_R1_filtered.fastq.gz ${scratch_dir}/../../01-preprocessing/${arr[0]}/${arr[0]}_R2_filtered.fastq.gz ${scratch_dir}/${arr[0]}/out_${arr[1]}_${arr[0]}_run &"; done > _01_ariba.sh
+cat sample_database.txt | grep -v 'pubmlst' | while read in; do arr=($in); echo "mkdir -p ${arr[0]}; srun --chdir $scratch_dir --output logs/ARIBA_${arr[0]}_${arr[1]}.%j.log --job-name ARIBA_${arr[0]}_${arr[1]} --cpus-per-task 5 --mem 5G --partition short_idx --time 02:00:00 singularity exec -B ${scratch_dir}/../../../../ -B /data/bi/references/ariba/ /data/bi/pipelines/singularity-images/ariba:2.14.6--py39heaaa4ec_6 ariba run /data/bi/references/ariba/latest/${arr[1]}/out.${arr[1]}.prepareref ${scratch_dir}/../../01-preprocessing/${arr[0]}/${arr[0]}_R1_filtered.fastq.gz ${scratch_dir}/../../01-preprocessing/${arr[0]}/${arr[0]}_R2_filtered.fastq.gz ${scratch_dir}/${arr[0]}/out_${arr[1]}_${arr[0]}_run &"; done > _01_ariba.sh

-cat ../../../samples_id.txt | while read in; do echo "mkdir -p $in; srun --chdir $scratch_dir --output logs/ARIBA_${in}_pubmlst.%j.log --job-name ARIBA_${in}_pubmlst --cpus-per-task 5 --mem 5G --partition short_idx --time 02:00:00 singularity exec -B ${scratch_dir}/../../../../ /data/bi/pipelines/singularity-images/ariba:2.14.6--py36h4aaaa08_3 ariba run ${scratch_dir}/${downloaded_ref} ${scratch_dir}/../../01-preprocessing/${in}/${in}_R1_filtered.fastq.gz ${scratch_dir}/../../01-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${scratch_dir}/${in}/out_pubmlst_${in}_run &"; done > _01_ariba.sh
+cat ../../../samples_id.txt | while read in; do echo "mkdir -p $in; srun --chdir $scratch_dir --output logs/ARIBA_${in}_pubmlst.%j.log --job-name ARIBA_${in}_pubmlst --cpus-per-task 5 --mem 5G --partition short_idx --time 02:00:00 singularity exec -B ${scratch_dir}/../../../../ /data/bi/pipelines/singularity-images/ariba:2.14.6--py39heaaa4ec_6 ariba run ${scratch_dir}/${downloaded_ref} ${scratch_dir}/../../01-preprocessing/${in}/${in}_R1_filtered.fastq.gz ${scratch_dir}/../../01-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${scratch_dir}/${in}/out_pubmlst_${in}_run &"; done >> _01_ariba.sh

 cat sample_database.txt | while read in; do arr=($in); echo "mv ${arr[0]}/out_${arr[1]}_${arr[0]}_run/report.tsv ${arr[0]}/out_${arr[1]}_${arr[0]}_run/${arr[0]}_${arr[1]}_report.tsv"; done > _02_fix_tsvreport.sh

From ac13addae41444c75ea132bb75284520118c7a71 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 4 Jul 2024 11:39:57 +0200
Subject: [PATCH 083/321] Fixed minor mistakes in the config file

---
 .../assembly/DOC/hpc_slurm_assembly.config | 27 +++++++------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config b/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config
index 480d57f4a..284208fbb 100644
--- a/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config
+++ b/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config
@@ -38,6 +38,8 @@ params { publish_dir_mode = 'copy' }

 process {
     withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTQC_RAW' {
+        maxRetries = 2
+        memory = {12.GB * task.attempt}
         publishDir = [
             [
                 path: { "${params.outdir}/01-processing/fastqc/raw" },
@@ -71,6 +73,8 @@ process {
         ]
     }
     withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTQC_TRIM' {
+        maxRetries = 2
+        memory = {12.GB * task.attempt}
         publishDir = [
             [
                 path: { "${params.outdir}/01-processing/fastqc/trim" },
@@ -127,6 +131,9 @@ process {
         ]
     }
     withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' {
+        maxRetries = 2
+        memory = {12.GB * task.attempt}
+        errorStrategy = { task.exitStatus in [1] ? 'retry' : 'finish'}
         publishDir = [
             path: { "${params.outdir}/02-taxonomy_contamination/kmerfinder/${meta.id}" },
             mode: params.publish_dir_mode
@@ -145,7 +152,9 @@ process {
         ]
     }
     withName: 'UNICYCLER|CANU|MINIASM|DRAGONFLYE' {
-        publishDir = [
+        maxRetries = 2
+        memory = {64.GB * task.attempt}
+        publishDir = [
             path: { "${params.outdir}/03-assembly/${params.assembler}" },
             mode: params.publish_dir_mode,
             saveAs: { filename ->
@@ -228,20 +237,4 @@ process {
         ]
     ]
 }
-    withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTQC_RAW' {
-        maxRetries = 2
-        memory = {12.GB * task.attempt}
-    }
-    withName:KMERFINDER{
-        maxRetries = 2
-        memory = {12.GB * task.attempt}
-    }
-    withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTQC_TRIM' {
-        maxRetries = 2
-        memory = {12.GB * task.attempt}
-    }
-    withName: '.*:.*:UNICYCLER' {
-        maxRetries = 2
-        memory = {32.GB * task.attempt}
-    }
 }

From e0d1c30ddbf6db2ecedf48875a30ac497d25dca8 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 4 Jul 2024 11:41:01 +0200
Subject: [PATCH 084/321] Replaced kmerfinder database with the latest one

---
 .../templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
index a196cc46d..38f5edd7a 100644
--- a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
+++ b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
@@ -119,7 +119,7 @@ nextflow run /data/bi/pipelines/nf-core-bacass/nf-core-bacass-2.3.1/main.nf \\
     --fastp_args '--qualified_quality_phred 20 --cut_mean_quality 20' \\
     --skip_kraken2 true \\
     --skip_kmerfinder false \\
-    --kmerfinderdb /data/bi/references/kmerfinder/20241004/bacteria \\
+    --kmerfinderdb /data/bi/references/kmerfinder/latest/bacteria \\
     --ncbi_assembly_metadata /data/bi/references/bacteria/20240626/assembly_summary_refseq.txt \\
     ${PROKKA_ARGS} \\
     -resume

From a0d054a3e26500523a8be4eacd00d9cbea50db31 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 4 Jul 2024 11:43:43 +0200
Subject: [PATCH 085/321] Updated amrfinderplus lablog

---
 .../ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog
index a75150202..a8694b065 100644
--- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog
+++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog
@@ -4,6 +4,8 @@ scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g')

+mkdir logs
+
 possible_organisms=("Acinetobacter_baumannii" "Burkholderia_cepacia" "Burkholderia_pseudomallei" "Campylobacter" "Citrobacter_freundii" "Clostridioides_difficile" "Enterobacter_asburiae" "Enterobacter_cloacae" "Enterococcus_faecalis" "Enterococcus_faecium" "Escherichia" "Klebsiella_oxytoca" "Klebsiella_pneumoniae" "Neisseria_gonorrhoeae" "Neisseria_meningitidis" "Pseudomonas_aeruginosa" "Salmonella" "Serratia_marcescens" "Staphylococcus_aureus" "Staphylococcus_pseudintermedius" "Streptococcus_agalactiae" "Streptococcus_pneumoniae" "Streptococcus_pyogenes" "Vibrio_cholerae" "Vibrio_parahaemolyticus" "Vibrio_vulnificus" "OTHER")
 echo
 echo -e "\n\033[1;37mPlease select your bacteria from the following list:\033[0m"
 PS3=$(echo -e "\n\033[1;37mSelect number:\033[0m ")
@@ -12,12 +12,12 @@ select bacteria in "${possible_organisms[@]}"; do
     if [[ -n "$bacteria" ]]; then
         echo -e "\033[0;32mOrganism selected: ${bacteria}\033[0m"
         if [ $bacteria = "OTHER" ]; then
-            cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --name ${in} --plus -o ${in}_out.tsv &" >> _01_run_amrfinder.sh; done
+            cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx --output logs/AMRFINDER_${in}.%j.log --job-name AMRFINDER_${in} amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --name ${in} --plus -o ${in}_out.tsv &" >> _01_run_amrfinder.sh; done
         else
-            cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --organism ${bacteria} --name ${in} --plus -o ${in}_out.tsv &" >> _01_run_amrfinder.sh; done
+            cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx --output logs/AMRFINDER_${in}.%j.log --job-name AMRFINDER_${in} amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --organism ${bacteria} --name ${in} --plus -o ${in}_out.tsv &" >> _01_run_amrfinder.sh; done
         fi
         break
     else
         echo -e "\n\033[0;31mInvalid input.\033[0m"
     fi
-done
\ No newline at end of file
+done

From 10141026c71dc13ebf605b66ad47f5d51ede3711 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 4 Jul 2024 12:42:21 +0200
Subject: [PATCH 086/321] Updated snippy lablog

---
 .../snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog
index 95826e287..c67c2a06d 100644
--- a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog
+++ b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog
@@ -1,4 +1,4 @@
-rm # module load singularity
+# conda activate snippy

 scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g')

@@ -6,10 +6,10 @@ mkdir logs

 cat ../samples_id.txt | while read in; do echo -e "${in}\t${scratch_dir}/../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz\t${scratch_dir}/../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz"; done >> input.tab

-ls ${scratch_dir}/../../../REFERENCES | xargs -I %% singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_2 snippy-multi ${scratch_dir}/input.tab --mincov 9 --mapqual 10 --basequal 5 --minqual 30 --ref ${scratch_dir}/../../../REFERENCES/%% --cpus 5 > commands.out
+singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snippy-multi ${scratch_dir}/input.tab --mincov 9 --mapqual 10 --basequal 5 --minqual 30 --ref ${scratch_dir}/../../../REFERENCES/GCF_015326295.1/GCF_015326295.1_ASM1532629v1_genomic.fna --cpus 5 > commands.out

-head -n -1 commands.out | sed -e "s@^@srun --chdir ${scratch_dir} --output logs/SNIPPY.%j.log --job-name SNIPPY --cpus-per-task 5 --mem 49152 --partition short_idx --time 02:00:00 @" | awk '{print $0" &"}' > _00_snippy.sh
-tail -n 1 commands.out | sed -e "s@^@srun --chdir ${scratch_dir} --output logs/SNIPPY_CORE.%j.log --job-name SNIPPY --cpus-per-task 5 --mem 49152 --partition short_idx --time 02:00:00 @" | awk '{print $0" &"}' > _01_snippy_core.sh
+head -n -1 commands.out | sed -e "s@^@srun --chdir ${scratch_dir} --output logs/SNIPPY.%j.log --job-name SNIPPY --cpus-per-task 5 --mem 49152 --partition short_idx --time 02:00:00 env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 @" | awk '{print $0" &"}' > _00_snippy.sh
+tail -n 1 commands.out | sed -e "s@^@srun --chdir ${scratch_dir} --output logs/SNIPPY_CORE.%j.log --job-name SNIPPY --cpus-per-task 5 --mem 49152 --partition short_idx --time 02:00:00 env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 @" | awk '{print $0" &"}' > _01_snippy_core.sh

@@ -19,7 +19,7 @@ tail -n 1 commands.out | sed -e "s@^@srun --chdir ${scratch_dir} --output logs/S
 # echo "grep \"complex\" ./*/snps.vcf | cut -f 1,2,4,5 | cut -d \":\" -f 2 | sort -u | awk '{pos1=\$2; len_ref=length(\$3); printf \"%s\t%s\t%s\n\", \$1, pos1-1, pos1+len_ref+1}' | grep -v \"^#\" > mask_complex_variants.bed" > _01_snippy_core.sh
 # ls ${scratch_dir}/../../../REFERENCES | xargs -I %% echo "snippy-core --debug --mask ./mask_complex_variants.bed --mask-char 'N' --ref '../../../REFERENCES/%%' $(cat ../samples_id.txt | xargs)" >> _01_snippy_core.sh

-echo "snp-sites -b -c -o phylo.aln core.full.aln" > _02_phylo_aln.sh
+echo "srun --chdir ${scratch_dir} --output logs/SNIP-SITES.%j.log --job-name SNIP-SITES --cpus-per-task 5 --mem 49152 --partition short_idx --time 02:00:00 env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snp-sites -b -c -o phylo.aln core.full.aln &" > _02_phylo_aln.sh

 # awk 'BEGIN{FS="[> ]"} /^>/{val=$2;next} {print val,length($0)}' phylo.aln

From 338997b011560611446ea965899d3e8de5441655 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 4 Jul 2024 12:49:17 +0200
Subject: [PATCH 087/321] Updated services.json

---
 bu_isciii/templates/services.json | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json
index 7463cd5c4..3107c913f 100755
--- a/bu_isciii/templates/services.json
+++ b/bu_isciii/templates/services.json
@@ -16,23 +16,6 @@
         "delivery_md": "assets/reports/md/assembly.md",
         "results_md": "assets/reports/results/assembly.md"
     },
-    "mtbseq_assembly": {
-        "label": "",
-        "template": "mtbseq",
-        "order": 1,
-        "begin": "base",
-        "end": "mag_met",
-        "url": "https://github.com/ngs-fzb/MTBseq_source",
-        "description": "Mycobacterium tuberculosis mapping, variant calling and detection of resistance using MTBseq",
-        "clean": {
-            "folders":["Bam", "Mpileup"],
-            "files":["01-processing/fastp/sample_name_1.fastp.fastq.gz", "01-processing/fastp/sample_name_2.fastp.fastq.gz"]
-        },
-        "no_copy": ["RAW", "TMP"],
-        "last_folder":"REFERENCES",
-        "delivery_md": "",
-        "results_md": ""
-    },
     "mtbseq": {
         "label": "",
         "template": "mtbseq",

From 8abcc989a32427e7248e6ac1fc7ce9aedda62687 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Thu, 4 Jul 2024 12:50:06 +0200
Subject: [PATCH 088/321] Removed genomeev and mtbseq_assembly templates

---
 .../ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog | 38 -----
 .../ANALYSIS/ANALYSIS03_MAG/99-stats/lablog | 25 ----
 .../99-stats/multiqc_config.yaml | 13 --
 .../genomeev/ANALYSIS/ANALYSIS03_MAG/lablog | 30 ----
 .../genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog | 75 ----------
 bu_isciii/templates/genomeev/ANALYSIS/README | 25 ----
 .../ANALYSIS/_02_create_run_percentage_Ns.sh | 1 -
 .../genomeev/ANALYSIS/create_assembly_stats.R | 130 ------------------
 .../ANALYSIS/create_summary_report.sh | 51 -------
 .../ANALYSIS/deduplicate_long_table.sh | 6 -
 .../genomeev/ANALYSIS/lablog_pikavirus | 4 -
 .../genomeev/ANALYSIS/lablog_viralrecon | 68 ---------
 .../genomeev/ANALYSIS/percentajeNs.py | 21 ---
 .../genomeev/ANALYSIS/samples_ref.txt | 4 -
 .../genomeev/DOC/hpc_slurm_pikavirus.config | 32 -----
 bu_isciii/templates/genomeev/DOC/mag.config | 19 ---
 .../DOC/viralrecon_metagenomic.config | 19 ---
 ...con_metagenomic_ignore_merge_codons.config | 42 ------
 .../DOC/viralrecon_metagenomic_params.yml | 11 --
 .../viralrecon_metagenomic_save_nohost.config | 40 ------
 bu_isciii/templates/genomeev/RAW/README | 1 -
 .../genomeev/RESULTS/lablog_genomeev_results | 17 ---
 bu_isciii/templates/genomeev/TMP/README | 1 -
 .../ANALYSIS/ANALYSIS01_ASSEMBLY/lablog | 37 -----
 .../ANALYSIS/ANALYSIS02_MTBSEQ/lablog | 21 ---
 .../ANALYSIS02_MTBSEQ/samples_all/lablog | 22 ---
 .../templates/mtbseq_assembly/ANALYSIS/lablog | 6 -
 .../templates/mtbseq_assembly/DOC/README | 1 -
 .../DOC/hpc_slurm_assembly.config | 27 ----
 .../templates/mtbseq_assembly/RAW/README | 1 -
 .../mtbseq_assembly/REFERENCES/README | 1 -
 .../templates/mtbseq_assembly/RESULTS/README | 1 -
 .../RESULTS/lablog_mtbseq_assembly_results | 9 --
 .../templates/mtbseq_assembly/TMP/README | 1 -
 34 files changed, 800 deletions(-)
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/lablog
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/multiqc_config.yaml
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/lablog
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/README
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/_02_create_run_percentage_Ns.sh
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/create_assembly_stats.R
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/create_summary_report.sh
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/deduplicate_long_table.sh
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/lablog_pikavirus
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/lablog_viralrecon
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/percentajeNs.py
 delete mode 100644 bu_isciii/templates/genomeev/ANALYSIS/samples_ref.txt
 delete mode 100644 bu_isciii/templates/genomeev/DOC/hpc_slurm_pikavirus.config
 delete mode 100644 bu_isciii/templates/genomeev/DOC/mag.config
 delete mode 100644 bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic.config
 delete mode 100644 bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_ignore_merge_codons.config
 delete mode 100644 bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_params.yml
 delete mode 100644 bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_save_nohost.config
 delete mode 100644 bu_isciii/templates/genomeev/RAW/README
 delete mode 100644 bu_isciii/templates/genomeev/RESULTS/lablog_genomeev_results
 delete mode 100644 bu_isciii/templates/genomeev/TMP/README
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS02_MTBSEQ/lablog
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS02_MTBSEQ/samples_all/lablog
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/ANALYSIS/lablog
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/DOC/README
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/DOC/hpc_slurm_assembly.config
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/RAW/README
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/REFERENCES/README
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/RESULTS/README
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results
 delete mode 100644 bu_isciii/templates/mtbseq_assembly/TMP/README

diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
deleted file mode 100644
index 7a8120a75..000000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
+++ /dev/null
@@ -1,38 +0,0 @@
-# module load Nextflow/21.10.6 singularity
-
-ln -s ../00-reads .
-ln -s ../samples_id.txt .
-echo "sample,fastq_1,fastq_2" > samplesheet.csv
-cat samples_id.txt | while read in; do echo "${in},00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz"; done >> samplesheet.csv
-
-
-scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
-
-cat <<EOF > pikavirus.sbatch
-#!/bin/sh
-#SBATCH --ntasks 1
-#SBATCH --cpus-per-task 2
-#SBATCH --mem 4G
-#SBATCH --time 4:00:00
-#SBATCH --partition middle_idx
-#SBATCH --output $(date '+%Y%m%d')_pikavirus01.log
-#SBATCH --chdir $scratch_dir
-
-export NXF_OPTS="-Xms500M -Xmx4G"
-
-nextflow run /scratch/bi/pipelines/PikaVirus/main.nf \\
-    -c ../../DOC/hpc_slurm_pikavirus.config \\
-    --input samplesheet.csv \\
-    --kraken_scouting false \\
-    --virus true \\
-    --bacteria false \\
-    --fungi false \\
-    --kaiju false \\
-    --mash_winner_strategy true \\
-    --mash_identitity_threshold 0.9 \\
-    --mash_shared_hashes_threshold 0.01 \\
-    --mash_pvalue_threshold 0.05 \\
-    -resume
-EOF
-
-echo "sbatch pikavirus.sbatch" > _01_nf_pikavirus.sh
\ No newline at end of file
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/lablog b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/lablog
deleted file mode 100644
index 88edff156..000000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/lablog
+++ /dev/null
@@ -1,25 +0,0 @@
-# module load singularity
-
-cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/kraken2_report.txt ./${in}_kraken2_report.txt; done
-
-scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
-
-cat <<EOF > multiqc.sbatch
-#!/bin/sh
-#SBATCH --ntasks 1
-#SBATCH --cpus-per-task 2
-#SBATCH --mem 4G
-#SBATCH --time 00:30:00
-#SBATCH --partition short_idx
-#SBATCH --output $(date '+%Y%m%d')_multiqc.log
-#SBATCH --chdir $scratch_dir
-
-export NXF_OPTS="-Xms500M -Xmx4G"
-
-singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/multiqc:1.9--py_1 multiqc -d . --config multiqc_config.yaml
-
-EOF
-
-echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh
-
-echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/multiqc_config.yaml b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/multiqc_config.yaml
deleted file mode 100644
index 96b7e6136..000000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/multiqc_config.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-extra_fn_clean_exts:
-  - _R1
-  - _R2
-  - .R1
-  - .R2
-  - .sort
-  - _sort
-  - .stats
-  - _bamstat
-  - _align
-  - .txt
-report_comment: >
-    This report has been generated by BU-ISCIII
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/lablog b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/lablog
deleted file mode 100644
index 83e293d6f..000000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/lablog
+++ /dev/null
@@ -1,30 +0,0 @@
-ln -s ../00-reads .
-ln -s ../samples_id.txt .
-
-#module load Nextflow
-#module load singularity
-
-scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
-
-cat <<EOF > mag.sbatch
-#!/bin/sh
-#SBATCH --ntasks 1
-#SBATCH --cpus-per-task 2
-#SBATCH --mem 4G
-#SBATCH --time 2:00:00
-#SBATCH --partition middle_idx
-#SBATCH --output $(date '+%Y%m%d')_mag.log
-#SBATCH --chdir $scratch_dir
-
-export NXF_OPTS="-Xms500M -Xmx4G"
-
-nextflow run /data/bi/pipelines/nf-core-mag-2.1.1/workflow/main.nf \\
-    -c ../../DOC/mag.config \\
-    --input '00-reads/*_R{1,2}.fastq.gz' \\
-    --outdir $(date '+%Y%m%d')_mag \\
-    --kraken2_db /data/bi/references/kraken/minikraken_8GB_20200312.tgz \\
-    --skip_busco --skip_spades --skip_spadeshybrid --skip_megahit --skip_prodigal --skip_binning \\
-    -resume
-EOF
-
-echo "sbatch mag.sbatch" > _01_run_mag.sh
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog
deleted file mode 100644
index 2986a343d..000000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog
+++ /dev/null
@@ -1,75 +0,0 @@
-# module load singularity
-
-scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
-mkdir logs
-
-# Location of assemblies to a variable so it only has to be changed here
-LOCATION=../*/*/assembly/*/*
-# Other databases:
-# /data/bi/references/BLAST_dbs/nt_20211025/nt
-BLAST_DATABASE="/data/bi/references/virus/BLAST/all_virus.fasta"
-
-# if there are scaffolds, uncompress the scaffolds in its dir (zcat for decompression)
-# if there contigs and no scaffolds, uncompress the contigs as scaffolds in its dir
-echo "Samples that did not generate scaffolds:" > noscaffold.txt
-cat ../samples_id.txt | while read in; do
-    mkdir ${in}
-    # ls will return 0 if there are no scaffolds file
-    # NOTE: change extension and location at will
-    # NOTE2: zcat is only used in case of gzipped files, use a cp or ln -s if needed
-    if [ $(ls ${LOCATION}/${in}.scaffolds.fa.gz | wc -l) != 0 ]; then
-        zcat ${LOCATION}/${in}.scaffolds.fa.gz > ${in}/${in}.scaffolds.fa
-    else
-        # Note assemblies that did not make a scaffold
-        zcat ${LOCATION}/${in}.contigs.fa.gz > ${in}/${in}.scaffolds.fa
-        echo ${in} >> noscaffold.txt
-    fi
-done
-
-# NOTE3: change the -query flag to meet your requirements
-cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% singularity exec -B ${scratch_dir}/../../ -B
/data/bi/references/virus/BLAST /data/bi/pipelines/singularity-images/blast:2.11.0--pl5262h3289130_1 blastn -num_threads 10 -db ${BLAST_DATABASE} -query ${scratch_dir}/%%/%%.scaffolds.fa -out ${scratch_dir}/%%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh - -# Filtering criteria: - # %refCovered > 0.7 - # ref not a phage (stitle ~! /phage/) - # ref longer than 200 bp (slen > 200) - -# First awk: create the full table; second awk: filter it -cat ../samples_id.txt | xargs -I %% echo "awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print samplename,\$0,(\$6-\$8)/\$16,\$6/\$15}' %%/%%_blast.tsv | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print \$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh -echo -e "echo \"samplename\tqseqid\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgap\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tref_len\tquery_len\tqcovs\t%queryAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh -echo "cat header */*blast_filt.tsv > all_samples_filtered_BLAST_results.tsv" >> _03_gather_results_add_header.sh -cat ../samples_id.txt | xargs -I %% echo "cat header %%/%%_blast_filt.tsv > tmp; rm %%/%%_blast_filt.tsv; mv tmp %%/%%_blast_filt.tsv" >> _03_gather_results_add_header.sh -echo "rm header" >> _03_gather_results_add_header.sh - -# NOTES FOR FILTERING -# -# subject = reference -# -# COLS GENERATED BY US: -# 1: samplename -# GENERATED BY BLAST -# 2: contigname - qseqid -# 3: stitle -# 4: qaccver -# 5: saccver -# 6: pident -# 7: length (of alignment) -# 8: mismatch -# 9: gaps -# 10: qstart -# 11: qend -# 12: sstart -# 13: send -# 14: evalue -# 15: bitscore -# 16: ref len - slen -# 17: query len - qlen -# 18: qcovs -# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6 -# GENERATED BY US: -# 19: %queryAligned: (length-gaps)/qlen (if gaps are not deleted, then this would be bigger than 1 sometimes) -# 20: %refCovered: length/slen - -# conda activate 2excel -cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /data/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename %%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh -echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /data/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh diff --git a/bu_isciii/templates/genomeev/ANALYSIS/README b/bu_isciii/templates/genomeev/ANALYSIS/README deleted file mode 100644 index 0ecbe695b..000000000 --- a/bu_isciii/templates/genomeev/ANALYSIS/README +++ /dev/null @@ -1,25 +0,0 @@ -This document should be read as INSTRUCTIONS to perform the "genomeev" service, as created on 25 Sep 2023. 
-The steps to follow to perform this service (which, by the way, can be done fairly quickly computationally speaking) are the following: - -- Load the samples into the RAW directory (manually or automatically using the BU-ISCIII tools) - -- Copy all files from this template (manually or automatically, make sure all files are there) - -- Copy the whole service folder to scratch_tmp (at least, we had to do that when this template was created) - -- First part is PikaVirus. Run PikaVirus by executing lablog_pikavirus, then enter the PikaVirus folder, execute the lablog (note that you need a samples_id.txt file, if you did not create it automatically, it has to be done manually), load the modules and do the thing. Feel free to change anything in PikaVirus through command or through the config (config is recommended so that any changes can be tracked). NOTE: wait for PikaVirus to end before you continue. Do something else in the meantime. read a paper or something dunno. - -- Once PikaVirus has ended, we have to dive into the results, particularly the "all_samples_virus_table.tsv" in the results dir. Here, we have to find the most abundant virus. I personally recommend opening this file in excel or similar, and find the virus that repeats the most in the samples using some formula such as "COUNTIF(range, value)". Make sure you are working with a genome and not with just a fragment of it. - -- Download said assembly locally, both its fna and its gff file. Make sure you store both files with the same name and different extension. The name SHOULD include the virus name, and the GCA/GCF code so its easier to identify (example: RotavirusG8_GCA_002669555_1.fasta; RotavirusG8_GCA_002669555_1.gff). Then, place it in the corresponding directory inside "/data/bi/references/virus". - -- Once the files have been placed, we have to modify the samples_ref.txt file. - First column will be the exact same as the samples_id.txt file. - Second column will be the name of the assemblies we downloaded in the previous step (example: RotavirusG8_GCA_002669555_1 ). Make sure that all the rows are the exact same. - Third column will be the name of the host (typically "human", but can be changed depending on the situation) - -- Execute the lablog_viralrecon. The ANALYSIS02 directory will be created and filled with the corresponding scripts. Load the modules and launch viralrecon. - -- Once it has ended, its time for MAG. Go to the ANALYSIS03 directory, execute the lablog, load the modules and run MAG with the specified params. - -- Last, but not least, go to the ANALYSIS04 directory and run the lablog, the lablog will check the assembly step in viralrecon, and will store the names of the samples that didnt assembly to the scaffold level in the noscaffold.txt file. Run normally the three scripts after loading the corresponding module, and that should be about everything there is to this service! 
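For illustration only, a filled-in samples_ref.txt following the three-column layout described above might look like the sketch below (SAMPLE01-SAMPLE03 are hypothetical placeholders, not part of the original template; the reference column reuses the RotavirusG8_GCA_002669555_1 example and the host column is "human", with columns separated by tabs):

SAMPLE01	RotavirusG8_GCA_002669555_1	human
SAMPLE02	RotavirusG8_GCA_002669555_1	human
SAMPLE03	RotavirusG8_GCA_002669555_1	human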
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/_02_create_run_percentage_Ns.sh b/bu_isciii/templates/genomeev/ANALYSIS/_02_create_run_percentage_Ns.sh deleted file mode 100644 index 3e5e10128..000000000 --- a/bu_isciii/templates/genomeev/ANALYSIS/_02_create_run_percentage_Ns.sh +++ /dev/null @@ -1 +0,0 @@ -i=1; find */variants/ivar/consensus/ -type d -name 'bcftools' | while read in; do echo "python ./percentajeNs.py ${in} %Ns_${i}.tab"; i=$((i+1)); done > _03_run_percentage_Ns.sh; echo "cat %Ns_* > %Ns.tab" >> _03_run_percentage_Ns.sh; echo "rm %Ns_*" >> _03_run_percentage_Ns.sh diff --git a/bu_isciii/templates/genomeev/ANALYSIS/create_assembly_stats.R b/bu_isciii/templates/genomeev/ANALYSIS/create_assembly_stats.R deleted file mode 100644 index d174e208f..000000000 --- a/bu_isciii/templates/genomeev/ANALYSIS/create_assembly_stats.R +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env Rscript - -################################################ -################################################ -## LOAD LIBRARIES ## -################################################ -################################################ - -library(plyr, quietly = TRUE, warn.conflicts = FALSE) -library(dplyr, quietly = TRUE, warn.conflicts = FALSE) -library(tidyr, quietly = TRUE, warn.conflicts = FALSE) -library(stringr, quietly = TRUE, warn.conflicts = FALSE) -library(jsonlite, quietly = TRUE, warn.conflicts = FALSE) -library(writexl, quietly = TRUE, warn.conflicts = FALSE) - -################################################ -################################################ -## DATA ############################### -################################################ -################################################ - -# PATHS -path <- getwd() -samples_ref <- read.table(paste0(path, "/samples_ref.txt"), header = F) - -if (ncol(samples_ref) == 2) { - colnames(samples_ref) <- c("id", "ref") -} else { - colnames(samples_ref) <- c("id", "ref", "host") -} - -# Fastq path - -fastq_names <- list.files("../../RAW/") -path_run <- Sys.readlink(paste0("../../RAW/", fastq_names[1])) - -# columnas -columnas <- "run\tuser\thost\tVirussequence\tsample\ttotalreads\treadshostR1\treadshost\t%readshost\tNon-host-reads\t%Non-host-reads\tContigs\tLargest_contig\t%Genome_fraction" -name_columns <- as.vector(str_split(columnas, "\t", simplify = T)) - -list_assembly <- list(0) -for (i in 1:nrow(samples_ref)) { - - # Run, user, host and sequence - name_run <- str_split(path_run, "/", simplify = T)[, 4] - name_user <- str_split(path, "_", simplify = T)[, 5] - name_host <- tolower(str_split(path, "_", simplify = T)[, 9]) - date_service <- str_split(str_split(path, "_", simplify = T)[, 6], "/", simplify = T)[, 3] - - name_sequence <- as.character(samples_ref$ref[i]) - name_id <- as.character(samples_ref$id[i]) - - # path outputfolder - directorios <- list.dirs(recursive = FALSE) - patron_workdir <- paste0(name_sequence, "_", date_service) - workdir <- directorios[grepl(patron_workdir, directorios)][1] - - # totalreads - json_fastp <- fromJSON(paste0(workdir, "/fastp/", name_id, ".fastp.json")) - value_totalreads <- json_fastp$summary[["after_filtering"]]$total_reads - - # readshostR1 - table_kraken <- read.table(paste0(workdir, "/kraken2/", name_id, ".kraken2.report.txt"), sep = "\t") - unclassified_reads <- as.numeric(subset(x = table_kraken, subset = V6 == "unclassified")[2]) - value_readhostr1 <- sum(table_kraken$V3)-unclassified_reads - - # readshosh - value_readhost <- value_readhostr1 * 2 - - # readshost - value_percreadhost <- 
round((value_readhost * 100) / value_totalreads, 2) - - # non host reads - value_nonhostreads <- value_totalreads - value_readhost - - # % non host - value_percnonhostreads <- round((value_nonhostreads * 100) / value_totalreads, 2) - - # Contigs - assembly_workdir <- paste(workdir, "/assembly", sep = "") - quast_report_path <- paste("/",list.files(pattern = "transposed_report.tsv", recursive = TRUE, path = assembly_workdir), sep = "") - table_quast <- read.delim(paste0(assembly_workdir, quast_report_path), skip = 0, header = T, sep = "\t") - - # no quast error - if (exists("table_quast") == FALSE) { - value_contigs <- NA - value_lcontig <- NA - value_genomef <- NA - } else { - - sample_data <- subset(table_quast, Assembly == paste(name_id, "scaffolds", sep = ".")) - value_contigs <- as.numeric(sample_data$X..contigs) - value_lcontig <- as.numeric(sample_data$Largest.contig) - value_genomef <- as.numeric(as.character(sample_data$Genome.fraction....)) - - # empty values - # empty values - if (length(value_contigs) == 0) { - value_contigs <- NA - } - - if (length(value_lcontig) == 0) { - value_lcontig <- NA - } - - if (length(value_genomef) == 0) { - value_genomef <- NA - } - } - - # Create table - list_assembly[[i]] <- c(name_run, name_user, name_host, name_sequence, name_id, value_totalreads, value_readhostr1, value_readhost, value_percreadhost, value_nonhostreads, value_percnonhostreads, value_contigs, value_lcontig, value_genomef) -} - -df_final <- as.data.frame(do.call("rbind", list_assembly)) -colnames(df_final) <- name_columns - -# characters -columnas_ch <- as.vector(1:5) -df_final[, columnas_ch] <- apply(df_final[, columnas_ch], 2, function(x) as.character(x)) - -# numeric -columnas_nu <- as.vector(6:length(colnames(df_final))) -df_final[, columnas_nu] <- apply(df_final[, columnas_nu], 2, function(x) as.numeric(as.character(x))) - -# Write table csv -write.table(df_final, "assembly_stats.csv", row.names = F, col.names = T, sep = "\t", quote = F) - -# Write table xlsx -write_xlsx(df_final, "assembly_stats.xlsx", format_headers = F) diff --git a/bu_isciii/templates/genomeev/ANALYSIS/create_summary_report.sh b/bu_isciii/templates/genomeev/ANALYSIS/create_summary_report.sh deleted file mode 100644 index 4ed9b1929..000000000 --- a/bu_isciii/templates/genomeev/ANALYSIS/create_summary_report.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash - -# Define fixed data variables -RUN=$(ls -l ../../RAW/ | cut -d'/' -f4 | sort -u | grep -v 'total' | head -n1 | rev | cut -d " " -f 2- | rev) -USER=$(pwd | cut -d '/' -f6 | cut -d '_' -f4) -HOST=$(pwd | cut -d '/' -f8 | cut -d '_' -f4 | tr '[:upper:]' '[:lower:]' | sed 's/.*/\u&/') - -# Define header for output file -HEADER="run\tuser\thost\tVirussequence\tsample\ttotalreads\treadshostR1\treadshost\t%readshost\treadsvirus\t%readsvirus\tunmappedreads\t%unmapedreads\tmedianDPcoveragevirus\tCoverage>10x(%)\tVariantsinconsensusx10\tMissenseVariants\t%Ns10x\tLineage\tread_length\tanalysis_date" - -# Print header to output file -echo -e $HEADER > mapping_illumina_$(date '+%Y%m%d').tab - -# Loop through sample list and extract relevant data -cat samples_ref.txt | while read in -do - # Sample and virus reference names - arr=($in); - - # Extract data for each column - total_reads=$(grep 'total_reads' ${arr[1]}*/fastp/${arr[0]}.fastp.json | head -n2 | tail -n1 | cut -d ':' -f2 | sed 's/,//g') - - reads_hostR1=$(cat ${arr[1]}*/kraken2/${arr[0]}.kraken2.report.txt | grep -v 'unclassified' | cut -f3 | awk '{s+=$1}END{print s}') - reads_host_x2=$(echo $((reads_hostR1 * 
2)) ) - perc_mapped=$(echo $(awk -v v1=$total_reads -v v2=$reads_host_x2 'BEGIN {print (v2*100)/v1}') ) - - reads_virus=$(cat ${arr[1]}*/variants/bowtie2/samtools_stats/${arr[0]}.sorted.bam.flagstat | grep '+ 0 mapped' | cut -d ' ' -f1) - - unmapped_reads=$(echo $((total_reads - (reads_host_x2+reads_virus))) ) - perc_unmapped=$(echo $(awk -v v1=$total_reads -v v2=$unmapped_reads 'BEGIN {print (v2/v1)*100}') ) - - n_count=$(cat %Ns.tab | grep -w ${arr[0]} | grep ${arr[1]} | cut -f2) - - missense=$(LC_ALL=C awk -F, '{if($10 >= 0.75)print $0}' ${arr[1]}*/variants/ivar/variants_long_table.csv | grep ^${arr[0]}, | grep 'missense' | wc -l) - - Ns_10x_perc=$(zcat ${arr[1]}*/variants/ivar/consensus/bcftools/${arr[0]}.filtered.vcf.gz | grep -v '^#' | wc -l) - - lineage=$(cat ${arr[1]}*/variants/ivar/consensus/bcftools/pangolin/${arr[0]}.pangolin.csv | tail -n1 | cut -d ',' -f2) - - metrics=$(cat ${arr[1]}*/multiqc/summary_variants_metrics_mqc.csv | grep ^${arr[0]},) - reads_virus_perc=$(echo "$metrics" | cut -d ',' -f5) - medianDPcov=$(echo "$metrics" | cut -d ',' -f8) - cov10x=$(echo "$metrics" | cut -d ',' -f10) - - read_length=$(cat ${arr[1]}*/multiqc/multiqc_data/multiqc_fastqc.yaml | grep -A5 "${arr[0]}_1:$" | grep "Sequence length:" | tr "-" " " | rev | cut -d " " -f1 | rev) - - analysis_date=$(date '+%Y%m%d') - - # Introduce data row into output file - echo -e "${RUN}\t${USER}\t${HOST}\t${arr[1]}\t${arr[0]}\t$total_reads\t$reads_hostR1\t$reads_host_x2\t$perc_mapped\t$reads_virus\t$reads_virus_perc\t$unmapped_reads\t$perc_unmapped\t$medianDPcov\t$cov10x\t$Ns_10x_perc\t$missense\t$n_count\t$lineage\t$read_length\t$analysis_date" >> mapping_illumina_$(date '+%Y%m%d').tab -done diff --git a/bu_isciii/templates/genomeev/ANALYSIS/deduplicate_long_table.sh b/bu_isciii/templates/genomeev/ANALYSIS/deduplicate_long_table.sh deleted file mode 100644 index 146f23efb..000000000 --- a/bu_isciii/templates/genomeev/ANALYSIS/deduplicate_long_table.sh +++ /dev/null @@ -1,6 +0,0 @@ -find . 
-type f -name "variants_long_table.csv" | cut -d '/' -f1,2,3,4 | while read in -do - mv ${in}/variants_long_table.csv ${in}/variants_long_table_dups.csv - head -n1 ${in}/variants_long_table_dups.csv > ${in}/variants_long_table.csv - grep -v 'SAMPLE' ${in}/variants_long_table_dups.csv | sort -u >> ${in}/variants_long_table.csv -done diff --git a/bu_isciii/templates/genomeev/ANALYSIS/lablog_pikavirus b/bu_isciii/templates/genomeev/ANALYSIS/lablog_pikavirus deleted file mode 100644 index 04924688f..000000000 --- a/bu_isciii/templates/genomeev/ANALYSIS/lablog_pikavirus +++ /dev/null @@ -1,4 +0,0 @@ -mkdir -p 00-reads -mv ANALYSIS01_PIKAVIRUS $(date '+%Y%m%d')_ANALYSIS01_PIKAVIRUS -cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd - -cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - \ No newline at end of file diff --git a/bu_isciii/templates/genomeev/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/genomeev/ANALYSIS/lablog_viralrecon deleted file mode 100644 index fc2a53206..000000000 --- a/bu_isciii/templates/genomeev/ANALYSIS/lablog_viralrecon +++ /dev/null @@ -1,68 +0,0 @@ -ANALYSIS_TYPE=METAGENOMIC -CONFIG_FILE="../../DOC/viralrecon_metagenomic.config" -PARAMS_FILE="../../DOC/viralrecon_metagenomic_params.yml" - -cat samples_ref.txt | cut -f3 | sort -u | while read in; do echo ${in^^}; done > host_list.tmp -i=2; cat host_list.tmp | while read in -do - FOLDER_NAME=$(echo $(date '+%Y%m%d')_ANALYSIS0${i}_${ANALYSIS_TYPE}_${in}) - mkdir ${FOLDER_NAME} - cp create_summary_report.sh ${FOLDER_NAME}/ - cp deduplicate_long_table.sh ${FOLDER_NAME}/ - cp percentajeNs.py ${FOLDER_NAME}/ - grep -i ${in} samples_ref.txt | cut -f1,2 > ${FOLDER_NAME}/samples_ref.txt - echo "ln -s ../00-reads ." 
> ${FOLDER_NAME}/lablog - printf "ln -s ../samples_id.txt .\n\n" >> ${FOLDER_NAME}/lablog - echo "# module load Nextflow/22.10.1 singularity" >> ${FOLDER_NAME}/lablog - echo "" >> ${FOLDER_NAME}/lablog - printf 'scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")\n\n' >> ${FOLDER_NAME}/lablog - cut -f2 ${FOLDER_NAME}/samples_ref.txt | sort -u | while read ref - do - echo "sample,fastq_1,fastq_2" > ${FOLDER_NAME}/samplesheet_${ref}.csv - grep -i ${ref} ${FOLDER_NAME}/samples_ref.txt | while read samples - do - arr=($samples); echo "${arr[0]},00-reads/${arr[0]}_R1.fastq.gz,00-reads/${arr[0]}_R2.fastq.gz" >> ${FOLDER_NAME}/samplesheet_${ref}.csv - done - REF_FASTA=$(find /data/bi/references/virus/ -name ${ref}.fasta) - REF_GFF=$(find /data/bi/references/virus/ -name ${ref}.gff) - echo "cat < ${ref}_viralrecon.sbatch" >> ${FOLDER_NAME}/lablog - echo "#!/bin/sh" >> ${FOLDER_NAME}/lablog - echo "#SBATCH --ntasks 1" >> ${FOLDER_NAME}/lablog - echo "#SBATCH --cpus-per-task 2" >> ${FOLDER_NAME}/lablog - echo "#SBATCH --mem 4G" >> ${FOLDER_NAME}/lablog - echo "#SBATCH --time 2:00:00" >> ${FOLDER_NAME}/lablog - echo "#SBATCH --partition middle_idx" >> ${FOLDER_NAME}/lablog - echo "#SBATCH --output ${ref}_$(date '+%Y%m%d')_viralrecon.log" >> ${FOLDER_NAME}/lablog - printf "#SBATCH --chdir \$scratch_dir\n\n" >> ${FOLDER_NAME}/lablog - printf 'export NXF_OPTS="-Xms500M -Xmx4G"\n\n' >> ${FOLDER_NAME}/lablog - echo "nextflow run /data/bi/pipelines/nf-core-viralrecon-2.6.0/workflow/main.nf \\\\" >> ${FOLDER_NAME}/lablog - echo " -c ${CONFIG_FILE} \\\\" >> ${FOLDER_NAME}/lablog - echo " -params-file ${PARAMS_FILE} \\\\" >> ${FOLDER_NAME}/lablog - echo " --input samplesheet_${ref}.csv \\\\" >> ${FOLDER_NAME}/lablog - echo " --outdir ${ref}_$(date '+%Y%m%d')_viralrecon_mapping \\\\" >> ${FOLDER_NAME}/lablog - echo " --fasta ${REF_FASTA} \\\\" >> ${FOLDER_NAME}/lablog - echo " --gff ${REF_GFF} \\\\" >> ${FOLDER_NAME}/lablog - echo " -resume" >> ${FOLDER_NAME}/lablog - printf "EOF\n\n" >> ${FOLDER_NAME}/lablog - printf "echo 'sbatch ${ref}_viralrecon.sbatch' > _01_run_${ref}_viralrecon.sh\n\n" >> ${FOLDER_NAME}/lablog - done - echo "# conda activate python3" >> ${FOLDER_NAME}/lablog - - cp _02_create_run_percentage_Ns.sh ${FOLDER_NAME}/ - printf 'echo "bash create_summary_report.sh" > _04_create_stats_table.sh\n\n' >> ${FOLDER_NAME}/lablog - cp create_assembly_stats.R ${FOLDER_NAME}/ - echo "# module load R/4.2.1" >> ${FOLDER_NAME}/lablog - printf 'echo "Rscript create_assembly_stats.R" > _05_create_stats_assembly.sh\n\n' >> ${FOLDER_NAME}/lablog - printf 'echo "bash deduplicate_long_table.sh" > _06_deduplicate_long_table.sh\n\n' >> ${FOLDER_NAME}/lablog - - i=$((i+1)) -done -rm host_list.tmp -rm create_summary_report.sh -rm deduplicate_long_table.sh -rm percentajeNs.py -rm _02_create_run_percentage_Ns.sh - -# Exclusive of genomeev and viral_discovery service, if you see this anywhere else, somebody has copied without reading (https://youtu.be/AgGtGORPHcM?t=4) -mv ANALYSIS03_MAG $(date '+%Y%m%d')_ANALYSIS03_MAG -mv ANALYSIS04_BLAST $(date '+%Y%m%d')_ANALYSIS04_BLAST diff --git a/bu_isciii/templates/genomeev/ANALYSIS/percentajeNs.py b/bu_isciii/templates/genomeev/ANALYSIS/percentajeNs.py deleted file mode 100644 index c2552969c..000000000 --- a/bu_isciii/templates/genomeev/ANALYSIS/percentajeNs.py +++ /dev/null @@ -1,21 +0,0 @@ -from Bio import SeqIO -import os -import argparse - -parser = argparse.ArgumentParser(description="Count %Ns") -parser.add_argument("input_dir", type=str, help="Input 
dir masked files") -parser.add_argument("output_file", type=str, help="Output file for Ns count") -args = parser.parse_args() - -out_handle = open(args.output_file, "w") - -for f in os.listdir(args.input_dir): - if f.endswith(".consensus.fa"): - ffpath = os.path.join(args.input_dir, f) - for record in SeqIO.parse(ffpath, "fasta"): - n_count = record.seq.count("N") + record.seq.count("n") - out_handle.write( - "%s\t%0.2f\n" % (record.description, n_count * 100.0 / len(record)) - ) - -out_handle.close() diff --git a/bu_isciii/templates/genomeev/ANALYSIS/samples_ref.txt b/bu_isciii/templates/genomeev/ANALYSIS/samples_ref.txt deleted file mode 100644 index 5e3528b12..000000000 --- a/bu_isciii/templates/genomeev/ANALYSIS/samples_ref.txt +++ /dev/null @@ -1,4 +0,0 @@ -SampleID Reference Host -SampleID Reference Host -SampleID Reference Host - diff --git a/bu_isciii/templates/genomeev/DOC/hpc_slurm_pikavirus.config b/bu_isciii/templates/genomeev/DOC/hpc_slurm_pikavirus.config deleted file mode 100644 index 2517ff51d..000000000 --- a/bu_isciii/templates/genomeev/DOC/hpc_slurm_pikavirus.config +++ /dev/null @@ -1,32 +0,0 @@ -/* - * --------------------------------------------------------------- - * Nextflow config file for the ISCIII High Performance Computer - * --------------------------------------------------------------- - * - * nextflow run PikaVirus/manin.nf -profile HPC_ISCIII, - */ - -process{ - executor = 'slurm' - queue = 'middle_idx' - conda = '/data/bi/pipelines/miniconda3/envs/PikaVirus' - errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 'ignore' : 'finish' } - maxRetries = 5 - maxErrors = '-1' -} - -params { - config_profile_name = 'ISCIII HPC profile' - config_profile_description = 'Profile designed for the High Performance Computer in the ISCIII' - kraken2_db = "/data/bi/references/kraken/minikraken_8GB_20200312" - vir_ref_dir = "/data/bi/references/PikaVirus/viral_assemblies_for_pikavirus" - vir_dir_repo = "/data/bi/references/PikaVirus/viral_assemblies.tsv" - bact_ref_dir = "/data/bi/references/PikaVirus/bacteria_assemblies_for_pikavirus" - bact_dir_repo = "/data/bi/references/PikaVirus/bacteria_assemblies.tsv" - fungi_ref_dir = "/data/bi/references/PikaVirus/fungi_assemblies_for_pikavirus" - fungi_dir_repo = "/data/bi/references/PikaVirus/fungi_assemblies.tsv" - outdir = "01-PikaVirus-results" - max_memory = 376.GB - max_cpus = 32 - max_time = '48.h' -} diff --git a/bu_isciii/templates/genomeev/DOC/mag.config b/bu_isciii/templates/genomeev/DOC/mag.config deleted file mode 100644 index 732980bf1..000000000 --- a/bu_isciii/templates/genomeev/DOC/mag.config +++ /dev/null @@ -1,19 +0,0 @@ -singularity { - enabled = true - autoMounts = true -} - -process { - executor = 'slurm' - queue = 'middle_idx' - queue = 'middle_idx' - errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 
'ignore' : 'finish' } - maxRetries = 1 - maxErrors = '-1' -} - -params { - max_memory = 376.GB - max_cpus = 32 - max_time = '48.h' -} diff --git a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic.config b/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic.config deleted file mode 100644 index 66e8911f5..000000000 --- a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic.config +++ /dev/null @@ -1,19 +0,0 @@ -singularity { - enabled = true - autoMounts = true -} - -process { - executor = 'slurm' - queue = 'middle_idx' - withName: 'FASTP' { - ext.args = '--cut_front --cut_tail --trim_poly_x --cut_mean_quality 20 --qualified_quality_phred 20 --unqualified_percent_limit 10 --length_required 50' - } -} - -params { - // Max resource options - max_memory = 376.GB - max_cpus = 32 - max_time = '48.h' -} diff --git a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_ignore_merge_codons.config b/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_ignore_merge_codons.config deleted file mode 100644 index bb575ee33..000000000 --- a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_ignore_merge_codons.config +++ /dev/null @@ -1,42 +0,0 @@ -singularity { - enabled = true - autoMounts = true -} - -process { - executor = 'slurm' - queue = 'middle_idx' - withName: 'FASTP' { - ext.args = '--cut_front --cut_tail --trim_poly_x --cut_mean_quality 20 --qualified_quality_phred 20 --unqualified_percent_limit 10 --length_required 50' - } - withName: 'PANGOLIN' { - ext.args = '--datadir /scratch/bi/references/pangolin/20220322' - container = 'https://depot.galaxyproject.org/singularity/pangolin:3.1.20--pyhdfd78af_0' - } - withName: 'IVAR_VARIANTS_TO_VCF' { - ext.args = params.protocol == 'amplicon' ? '--ignore_strand_bias --ignore_merge_codons' : '--ignore_merge_codons' - } -} - -params { - // Input options - platform=illumina - protocol=metagenomic - - // Illumina QC, read trimming and filtering options - kraken2_db="/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz" - - // Illumina variant calling options - variant_caller=ivar - consensus_caller=bcftools - skip_pangolin=true - skip_nextclade=true - - // Illumina de novo assembly options - skip_assembly=true - - // Max resource options - max_memory = 376.GB - max_cpus = 32 - max_time = '48.h' -} diff --git a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_params.yml b/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_params.yml deleted file mode 100644 index 7ef761860..000000000 --- a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_params.yml +++ /dev/null @@ -1,11 +0,0 @@ -platform: 'illumina' -protocol: 'metagenomic' -kraken2_db: '/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz' -variant_caller: 'ivar' -consensus_caller: 'bcftools' -skip_pangolin: true -skip_nextclade: true -skip_variants: true -skip_assembly: false -skip_abacas: true -skip_plasmidid: true diff --git a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_save_nohost.config b/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_save_nohost.config deleted file mode 100644 index a62b5ac42..000000000 --- a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_save_nohost.config +++ /dev/null @@ -1,40 +0,0 @@ -singularity { - enabled = true - autoMounts = true -} - -process { - executor = 'slurm' - queue = 'middle_idx' - withName: 'FASTP' { - ext.args = '--cut_front --cut_tail --trim_poly_x --cut_mean_quality 20 --qualified_quality_phred 20 --unqualified_percent_limit 10 
--length_required 50' - } - withName: 'KRAKEN2_KRAKEN2' { - publishDir = [ - pattern: "*.{unclassified_1.fastq.gz,unclassified_2.fastq.gz,txt}" - ] - } -} - -params { - // Input options - platform=illumina - protocol=metagenomic - - // Illumina QC, read trimming and filtering options - kraken2_db="/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz" - - // Illumina variant calling options - variant_caller=ivar - consensus_caller=bcftools - skip_pangolin=true - skip_nextclade=true - - // Illumina de novo assembly options - skip_assembly=true - - // Max resource options - max_memory = 376.GB - max_cpus = 32 - max_time = '48.h' -} diff --git a/bu_isciii/templates/genomeev/RAW/README b/bu_isciii/templates/genomeev/RAW/README deleted file mode 100644 index deb5220b4..000000000 --- a/bu_isciii/templates/genomeev/RAW/README +++ /dev/null @@ -1 +0,0 @@ -RAW template diff --git a/bu_isciii/templates/genomeev/RESULTS/lablog_genomeev_results b/bu_isciii/templates/genomeev/RESULTS/lablog_genomeev_results deleted file mode 100644 index c593c7acb..000000000 --- a/bu_isciii/templates/genomeev/RESULTS/lablog_genomeev_results +++ /dev/null @@ -1,17 +0,0 @@ -# conda activate 2excel -mkdir $(date '+%Y%m%d')_entrega01 -cd $(date '+%Y%m%d')_entrega01 - -#Create directories depending on the analysis -mkdir assembly_spades -mkdir blast - -#Create symbolic links depending on the analysis -#Individual files -ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html -ln -s ../../ANALYSIS/*/assembly_stats.xlsx ./assembly_stats.xlsx -python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file ../../ANALYSIS/*PIKAVIRUS*/*/all_samples_virus_table_filtered.tsv --delimiter '\t' --output_filename filtered_all_samples_virus_table --it_has_index --it_has_header - -#Folders -cd assembly_spades; ln -s ../../../ANALYSIS/*BLAST*/*/*scaffolds.fa .; cd - -cd blast; ln -s ../../../ANALYSIS/*BLAST*/all_samples_filtered_BLAST_results.xlsx .; ln -s ../../../ANALYSIS/*BLAST*/*/*.xlsx .; cd - diff --git a/bu_isciii/templates/genomeev/TMP/README b/bu_isciii/templates/genomeev/TMP/README deleted file mode 100644 index 36ecd8ddc..000000000 --- a/bu_isciii/templates/genomeev/TMP/README +++ /dev/null @@ -1 +0,0 @@ -TMP templates diff --git a/bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog b/bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog deleted file mode 100644 index 2124059d3..000000000 --- a/bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog +++ /dev/null @@ -1,37 +0,0 @@ -# module load Nextflow/21.10.6 singularity - -ln -s ../00-reads . -ln -s ../samples_id.txt . 
- -echo "sample,fastq_1,fastq_2" > samplesheet.csv -cat samples_id.txt | while read in; do echo "${in},00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz"; done >> samplesheet.csv - -#module load Nextflow singularity -scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") - -cat < assembly.sbatch -#!/bin/sh -#SBATCH --ntasks 1 -#SBATCH --cpus-per-task 2 -#SBATCH --mem 8G -#SBATCH --time 8:00:00 -#SBATCH --partition middle_idx -#SBATCH --output $(date '+%Y%m%d')_assembly01.log -#SBATCH --chdir $scratch_dir - -export NXF_OPTS="-Xms500M -Xmx4G" - -nextflow run /scratch/bi/pipelines/BU_ISCIII-bacterial-assembly/main.nf \\ - -c ../../DOC/hpc_slurm_assembly.config \\ - --input samplesheet.csv \\ - --outdir ./ \\ - --cut_mean_quality 20 \\ - --qualified_quality_phred 20 \\ - --gram + \\ - --save_trimmed true \\ - --kmerfinder_bacteria_database '/data/bi/references/kmerfinder/20190108_stable_dirs/bacteria' \\ - --reference_ncbi_bacteria '/data/bi/references/bacteria/latest_db/assembly_summary_bacteria.txt' \\ - -resume -EOF - -echo "sbatch assembly.sbatch" > _01_nf_assembly.sh diff --git a/bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS02_MTBSEQ/lablog b/bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS02_MTBSEQ/lablog deleted file mode 100644 index 73d719fe1..000000000 --- a/bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS02_MTBSEQ/lablog +++ /dev/null @@ -1,21 +0,0 @@ -# module load singularity -# NOTE: error Java version is fixed by using module purge -mkdir logs -scratch_dir=$(echo $(pwd) | sed "s@/data/bi/scratch_tmp/@/scratch/@g") - -cat ../samples_id.txt | xargs -I % echo "mkdir %; cd %; ln -s ../../*ANALYSIS01*/01-preprocessing/trimmed_sequences/%_1.trim.fastq.gz %_lib1_R1.fastq.gz; cd -" | bash -cat ../samples_id.txt | xargs -I % echo "cd %; ln -s ../../*ANALYSIS01*/01-preprocessing/trimmed_sequences/%_2.trim.fastq.gz %_lib1_R2.fastq.gz; cd -" | bash -cat ../samples_id.txt | xargs -I % echo "cd %; ls *.fastq.gz | tr '_' '\t' | cut -f 1,2 | sort -u > samples.txt; cd -" | bash - -cat ../samples_id.txt | xargs -I @@ echo -e "srun --job-name MTBSEQ.@@ --output logs/MTBSEQ.@@.%j.log --partition middle_idx --mem 100G --chdir ${scratch_dir}/@@ --cpus-per-task 10 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/mtbseq:1.0.4--hdfd78af_2 MTBseq --step TBfull --threads 10 --samples samples.txt &" > _01_mtbseq.sh - -# classification -YEAR=$(date +%Y) -echo "mkdir classification_all" > _02_gather_results.sh -echo "FIRST_SAMPLE=$( head -n1 ../samples_id.txt ); head -n 1 \${FIRST_SAMPLE}/Classification/Strain_Classification.tab > classification_all/strain_classification_all.tab; grep \"^'${YEAR}\" */Classification/Strain_Classification.tab | cut -d \":\" -f 2 >> classification_all/strain_classification_all.tab" >> _02_gather_results.sh -# resistances -echo "mkdir resistances_all" >> _02_gather_results.sh -cat ../samples_id.txt | xargs -I % echo "cp %/Amend/NONE_joint_cf4_cr4_fr75_ph4_samples1_amended.tab resistances_all/%_var_res.tab" >> _02_gather_results.sh -# stats -echo "mkdir stats_all" >> _02_gather_results.sh -echo "FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 \$FIRST_SAMPLE/Statistics/Mapping_and_Variant_Statistics.tab > stats_all/statistics_all.tab; grep \"^'${YEAR}\" */Statistics/Mapping_and_Variant_Statistics.tab | cut -d \":\" -f 2 >> stats_all/statistics_all.tab" >> _02_gather_results.sh diff --git a/bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS02_MTBSEQ/samples_all/lablog 
b/bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS02_MTBSEQ/samples_all/lablog deleted file mode 100644 index 3fd8c8dfc..000000000 --- a/bu_isciii/templates/mtbseq_assembly/ANALYSIS/ANALYSIS02_MTBSEQ/samples_all/lablog +++ /dev/null @@ -1,22 +0,0 @@ -# module load singularity -# this will be performed in case a tree is requested -mkdir logs -scratch_dir=$(echo $(pwd) | sed "s@/data/bi/scratch_tmp/@/scratch/@g") - -mkdir Amend Bam Called Classification GATK_Bam Groups Joint Mpileup Position_Tables Statistics -cat ../*/samples.txt > samples.txt - -#### Create symbolic links to the folders already created -cd Bam; ln -s ../../*/Bam/* . ; cd - -cd Called; ln -s ../../*/Called/* . ; cd - -cd GATK_Bam; ln -s ../../*/GATK_Bam/* . ; cd - -cd Mpileup; ln -s ../../*/Mpileup/* . ; cd - -cd Position_Tables; ln -s ../../*/Position_Tables/* . ; cd - - -##### Create join scripts -echo "srun --job-name MTBSEQ_JOIN --output logs/MTBSEQ_JOIN.%j.log --partition middle_obx --mem 48G --chdir ${scratch_dir} --cpus-per-task 10 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/mtbseq:1.0.4--hdfd78af_2 MTBseq --step TBjoin --threads 5 --samples ${scratch_dir}/samples.txt &" > _01_tb_join.sh -echo "srun --job-name MTBSEQ_AMEND --output logs/MTBSEQ_AMEND.%j.log --partition middle_obx --mem 48G --chdir ${scratch_dir} --cpus-per-task 10 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/mtbseq:1.0.4--hdfd78af_2 MTBseq --step TBamend --threads 5 --samples ${scratch_dir}/samples.txt &" > _02_tb_amend.sh -echo "srun --job-name MTBSEQ_GROUPS --output logs/MTBSEQ_GROUPS.%j.log --partition middle_obx --mem 48G --chdir ${scratch_dir} --cpus-per-task 10 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/mtbseq:1.0.4--hdfd78af_2 MTBseq --step TBgroups --threads 5 --samples ${scratch_dir}/samples.txt &" > _03_tb_groups.sh - -##### Execute iqtree -echo "srun --chdir ${scratch_dir} --output logs/IQTREEFULLALIGN.%j.log --job-name IQTREEFULLALIGN --cpus-per-task 20 --mem 15G --partition short_idx --time 08:00:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/Amend/*amended_u95_phylo_w12.plainIDs.fasta -m K3Pu+F+I -T 20 -B 1000 -pre phylo.iqtree.bootstrap &" > _04_iqtreeall.sh diff --git a/bu_isciii/templates/mtbseq_assembly/ANALYSIS/lablog b/bu_isciii/templates/mtbseq_assembly/ANALYSIS/lablog deleted file mode 100644 index 05094a174..000000000 --- a/bu_isciii/templates/mtbseq_assembly/ANALYSIS/lablog +++ /dev/null @@ -1,6 +0,0 @@ -mkdir -p 00-reads -mkdir -p $(date '+%Y%m%d')_ANALYSIS01_ASSEMBLY -mkdir -p $(date '+%Y%m%d')_ANALYSIS02_MTBSEQ -ls ../RAW/*.gz | cut -d "/" -f3 | cut -d "_" -f1 | sort -u > samples_id.txt -cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd - -cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - \ No newline at end of file diff --git a/bu_isciii/templates/mtbseq_assembly/DOC/README b/bu_isciii/templates/mtbseq_assembly/DOC/README deleted file mode 100644 index b9ad751ba..000000000 --- a/bu_isciii/templates/mtbseq_assembly/DOC/README +++ /dev/null @@ -1 +0,0 @@ -DOC template diff --git a/bu_isciii/templates/mtbseq_assembly/DOC/hpc_slurm_assembly.config b/bu_isciii/templates/mtbseq_assembly/DOC/hpc_slurm_assembly.config deleted file mode 100644 index 9af022258..000000000 --- 
a/bu_isciii/templates/mtbseq_assembly/DOC/hpc_slurm_assembly.config +++ /dev/null @@ -1,27 +0,0 @@ -conda { - enabled = true - autoMounts = true -} - -singularity { - enabled = true - autoMounts = true -} - -process { - executor = 'slurm' - queue = 'middle_idx' - conda = '/data/bi/pipelines/miniconda3/envs/assembly' - errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 'ignore' : 'finish' } - maxRetries = 1 - maxErrors = '-1' - withName:KMERFINDER { - container = '/scratch/bi/singularity-images/kmerfinder_v3.0.2.sif' - } -} - -params { - max_memory = 376.GB - max_cpus = 32 - max_time = '48.h' -} diff --git a/bu_isciii/templates/mtbseq_assembly/RAW/README b/bu_isciii/templates/mtbseq_assembly/RAW/README deleted file mode 100644 index deb5220b4..000000000 --- a/bu_isciii/templates/mtbseq_assembly/RAW/README +++ /dev/null @@ -1 +0,0 @@ -RAW template diff --git a/bu_isciii/templates/mtbseq_assembly/REFERENCES/README b/bu_isciii/templates/mtbseq_assembly/REFERENCES/README deleted file mode 100644 index 834c3390f..000000000 --- a/bu_isciii/templates/mtbseq_assembly/REFERENCES/README +++ /dev/null @@ -1 +0,0 @@ -REFERENCES template diff --git a/bu_isciii/templates/mtbseq_assembly/RESULTS/README b/bu_isciii/templates/mtbseq_assembly/RESULTS/README deleted file mode 100644 index 5f9902779..000000000 --- a/bu_isciii/templates/mtbseq_assembly/RESULTS/README +++ /dev/null @@ -1 +0,0 @@ -RESULTS templates diff --git a/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results b/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results deleted file mode 100644 index d2bf377f2..000000000 --- a/bu_isciii/templates/mtbseq_assembly/RESULTS/lablog_mtbseq_assembly_results +++ /dev/null @@ -1,9 +0,0 @@ -DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega" - -mkdir $DELIVERY_FOLDER - -# MTBSEQ-ASSEMBLY service -cd $DELIVERY_FOLDER - -# Links to reports -ln -s ../../ANALYSIS/*ASSEMBLY/99-stats/MultiQC/multiqc_report.html . diff --git a/bu_isciii/templates/mtbseq_assembly/TMP/README b/bu_isciii/templates/mtbseq_assembly/TMP/README deleted file mode 100644 index 36ecd8ddc..000000000 --- a/bu_isciii/templates/mtbseq_assembly/TMP/README +++ /dev/null @@ -1 +0,0 @@ -TMP templates From 02379547864fc72b4b7f8ef96a9631ab60d7109b Mon Sep 17 00:00:00 2001 From: victor5lm Date: Thu, 4 Jul 2024 12:53:02 +0200 Subject: [PATCH 089/321] Updated CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 253b5add6..37427a03c 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,7 +51,7 @@ Code contributions to the new version: - Added MAG tempalte and removed MAG from other templates [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288) - Added amrfinderplus to characterization template. 
[#289] (https://github.com/BU-ISCIII/buisciii-tools/pull/289) - Updated all files so that paths referring to /pipelines/ are updated according to the new structure [#287](https://github.com/BU-ISCIII/buisciii-tools/pull/287) -- Updated assembly's template (lablog and config files) [#295](https://github.com/BU-ISCIII/buisciii-tools/pull/295) +- Updated assembly, ariba, snippy and amrfinderplus templates, removed genomeev and mtbseq_assembly templates and updated services.json [#295](https://github.com/BU-ISCIII/buisciii-tools/pull/295) ### Modules From 31535cf704361fc95b227199352042cdfdcaa7e6 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Thu, 4 Jul 2024 13:31:00 +0200 Subject: [PATCH 090/321] Updated iqtree lablog --- .../snippy/ANALYSIS/ANALYSIS01_SNIPPY/05-iqtree/lablog | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/05-iqtree/lablog b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/05-iqtree/lablog index e351131c1..f40bd67d9 100644 --- a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/05-iqtree/lablog +++ b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/05-iqtree/lablog @@ -2,5 +2,5 @@ scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g') mkdir logs -#echo "srun --chdir \${scratch_dir} --output logs/IQTREEMFP.%j.log --job-name IQTREEMFP --cpus-per-task 20 --mem 5G --partition short_idx --time 00:30:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/../04-snippy/phylo.aln -m MFP &" > _00_iqtreemfp.sh -echo "srun --chdir ${scratch_dir} --output logs/IQTREEFULLALIGN.%j.log --job-name IQTREEFULLALIGN --cpus-per-task 20 --mem 15G --partition short_idx --time 08:00:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/../04-snippy/phylo.aln -m PMB+F+R2 -T 20 -B 1000 -pre phylo.iqtree.bootstrap &" > _01_iqtreeall.sh +#echo "srun --chdir ${scratch_dir} --output logs/IQTREEMFP.%j.log --job-name IQTREEMFP --cpus-per-task 20 --mem 5G --partition short_idx --time 00:30:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/../04-snippy/phylo.aln -m MFP &" > _00_iqtreemfp.sh +echo "srun --chdir ${scratch_dir} --output logs/IQTREEFULLALIGN.%j.log --job-name IQTREEFULLALIGN --cpus-per-task 20 --mem 15G --partition short_idx --time 08:00:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/../04-snippy/phylo.aln -m HKY+F+I -T 20 -B 1000 -pre phylo.iqtree.bootstrap &" > _01_iqtreeall.sh From f8fb17ebb38eed15cffb80a1d112237e7fb29e63 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Thu, 4 Jul 2024 13:35:15 +0200 Subject: [PATCH 091/321] Updated snippy lablog --- .../snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog index c67c2a06d..2db8ad881 100644 --- a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog +++ b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog @@ -23,9 +23,12 @@ echo "srun --chdir ${scratch_dir} --output logs/SNIP-SITES.%j.log --job-name SNI # awk 'BEGIN{FS="[> ]"} 
/^>/{val=$2;next} {print val,length($0)}' phylo.aln +# code to compare samples in pairs +# awk '$4 != $5 || $4 != $6 || $5 != $6' core.tab > differences.txt + ## GUBBINS commands -echo "snippy-clean_full_aln core.full.aln > clean.full.aln" > _03_gubbins.sh -echo "run_gubbins.py --threads 20 -p gubbins clean.full.aln" >> _03_gubbins.sh -echo "snp-sites -c gubbins.filtered_polymorphic_sites.fasta > clean.core.aln" >> _03_gubbins.sh +echo "env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snippy-clean_full_aln core.full.aln > clean.full.aln" > _03_gubbins.sh +echo "singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/gubbins:3.3.5--py39pl5321he4a0461_0 run_gubbins.py --threads 20 -p gubbins clean.full.aln" >> _03_gubbins.sh +echo "env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snp-sites -c gubbins.filtered_polymorphic_sites.fasta > clean.core.aln" >> _03_gubbins.sh # Run gubbins echo "srun --chdir ${scratch_dir} --output logs/GUBBINS.%j.log --job-name GUBBINS --cpus-per-task 20 --mem 49152 --partition short_idx --time 02:00:00 bash _03_gubbins.sh &" > _03_run_gubbins.sh From 318543bce3f5fe3e5620a20db1f94341a9755b4d Mon Sep 17 00:00:00 2001 From: victor5lm Date: Thu, 4 Jul 2024 16:20:26 +0200 Subject: [PATCH 092/321] Fixed minor mistakes in snippy's lablog --- .../snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog index 2db8ad881..6154d78bf 100644 --- a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog +++ b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog @@ -1,4 +1,4 @@ -# conda activate snippy +# module load singularity scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g') @@ -6,7 +6,7 @@ mkdir logs cat ../samples_id.txt | while read in; do echo -e "${in}\t${scratch_dir}/../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz\t${scratch_dir}/../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz"; done >> input.tab -singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snippy-multi ${scratch_dir}/input.tab --mincov 9 --mapqual 10 --basequal 5 --minqual 30 --ref ${scratch_dir}/../../../REFERENCES/GCF_015326295.1/GCF_015326295.1_ASM1532629v1_genomic.fna --cpus 5 > commands.out +ls ${scratch_dir}/../../../REFERENCES | xargs -I %% singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snippy-multi ${scratch_dir}/input.tab --mincov 9 --mapqual 10 --basequal 5 --minqual 30 --ref ${scratch_dir}/../../../REFERENCES/%% --cpus 5 > commands.out head -n -1 commands.out | sed -e "s@^@srun --chdir ${scratch_dir} --output logs/SNIPPY.%j.log --job-name SNIPPY --cpus-per-task 5 --mem 49152 --partition short_idx --time 02:00:00 env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 @" | awk '{print $0" &"}' > _00_snippy.sh tail -n 1 commands.out | sed -e "s@^@srun --chdir ${scratch_dir} --output logs/SNIPPY_CORE.%j.log --job-name SNIPPY --cpus-per-task 5 --mem 49152 --partition short_idx --time 02:00:00 env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../
/data/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 @" | awk '{print $0" &"}' > _01_snippy_core.sh From 81125321f63d395ae11a344d083daae309a9d244 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Thu, 4 Jul 2024 16:23:34 +0200 Subject: [PATCH 093/321] Updated CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37427a03c..b97d8b2e3 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,7 +51,7 @@ Code contributions to the new version: - Added MAG tempalte and removed MAG from other templates [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288) - Added amrfinderplus to characterization template. [#289] (https://github.com/BU-ISCIII/buisciii-tools/pull/289) - Updated all files so that paths referring to /pipelines/ are updated according to the new structure [#287](https://github.com/BU-ISCIII/buisciii-tools/pull/287) -- Updated assembly, ariba, snippy and amrfinderplus templates, removed genomeev and mtbseq_assembly templates and updated services.json [#295](https://github.com/BU-ISCIII/buisciii-tools/pull/295) +- Updated assembly, ariba, snippy, amrfinderplus and iqtree templates, removed genomeev and mtbseq_assembly templates and updated services.json [#295](https://github.com/BU-ISCIII/buisciii-tools/pull/295) ### Modules From 130c402013078104fa2a3e6988766fd90798c0c5 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Tue, 9 Jul 2024 16:32:09 +0200 Subject: [PATCH 094/321] Updated lablog so that references are directly available in refgenie --- .../viralrecon/ANALYSIS/lablog_viralrecon | 93 ++++++++++++------- 1 file changed, 57 insertions(+), 36 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 3a81fc18a..824278ca3 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -133,7 +133,7 @@ update_nextclade() { echo } -# Checks if fasta and gff references are downloaded. If not, it downloades them (and create family folder if neccesary) +# Checks if fasta and gff references are downloaded. If not, it downloads them (and creates family folder if necessary) check_references() { echo echo_bold "Processing reference: ${ref}." @@ -149,53 +149,74 @@ check_references() { if [ -z $family ]; then family=$(curl -s "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=${organism_id}" | grep -o 'ALT="family">.*<' | awk -F 'ALT="family">' '{print $2}' | cut -d '<' -f 1 | tr '[:upper:]' '[:lower:]') fi - echo "Reference $ref organism belongs to $family family." + echo "Reference $ref belongs to $family family." } # Check if FASTA sequence is already downloaded - REF_FASTA=$(find /data/bi/references/virus/ -maxdepth 2 -type f -name "${ref}.fa*" ! -name "*.fai") - if [ -z $REF_FASTA ]; then + obtain_family; + REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml 2>&1) + if echo "$REF_FASTA" | grep -q "Traceback"; then echo "File ${ref}.fasta is not yet downloaded." - obtain_family; if [ -z $family ]; then return; fi - if [ ! -e "/data/bi/references/virus/$family" ]; then # Check if directory doesn't exists - echo "Creating new directory: /data/bi/references/virus/${family}/" - mkdir /data/bi/references/virus/${family}/; chgrp bi /data/bi/references/virus/${family}/ - else - echo "Directory /data/bi/references/virus/${family}/ ALREADY EXISTS." - fi - echo "Downloading ${ref}.fasta file..."
- wget -q -O "/data/bi/references/virus/${family}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" - if [ $? -eq 0 ]; then - REF_FASTA="/data/bi/references/virus/${family}/${ref}.fasta" - chgrp bi $REF_FASTA - echo_green "File ${ref}.fasta downloaded in $REF_FASTA." + if [ -z ${family} ]; then return; fi + if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exist + echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}." + digest=$(openssl rand -hex 24) + refgenie alias set --aliases ${family} --digest ${digest} -f -c /data/bi/references/refgenie/genome_config.yaml + mkdir -p /data/bi/references/refgenie/data/${digest}/fasta/${ref}/ + wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" + if [ $? -eq 0 ]; then + echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}/." + refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${ref}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml + else + echo_blinking_red "An error occurred during file downloading." + fi else - echo_blinking_red "An error occurred during file downloading." + echo "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.fasta." + digest=$(refgenie alias get -a ${family} -c /data/bi/references/refgenie/genome_config.yaml) + mkdir -p /data/bi/references/refgenie/data/${digest}/fasta/${ref}/ + wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" + if [ $? -eq 0 ]; then + echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}/." + refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${ref}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml + else + echo_blinking_red "An error occurred during file downloading." + fi fi else - echo -e "File ${ref}.fasta is ALREADY available in $REF_FASTA. \xE2\x9C\x85" + echo -e "File ${ref}.fasta is ALREADY available in $(dirname $REF_FASTA). \xE2\x9C\x85" fi # Check if GFF file is already downloaded - REF_GFF=$(find /data/bi/references/virus/ -maxdepth 2 -type f -name "${ref}.gff*") - if [ -z $REF_GFF ]; then + REF_GFF=$(refgenie seek ${family}/gff.gff:${ref} -c /data/bi/references/refgenie/genome_config.yaml 2>&1) + if echo "$REF_GFF" | grep -q "Traceback"; then echo "File ${ref}.gff is not yet downloaded." - if [ ! -v family ]; then obtain_family; if [ -z $family ]; then return; fi; fi - if [ ! -e "/data/bi/references/virus/$family" ]; then - echo "Creating new directory: /data/bi/references/virus/${family}/" - mkdir /data/bi/references/virus/${family}/; chgrp bi /data/bi/references/virus/${family}/ - fi - echo "Downloading ${ref}.gff file..." - wget -q -O "/data/bi/references/virus/${family}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}" - if [ $? -eq 0 ]; then - REF_GFF="/data/bi/references/virus/${family}/${ref}.gff" - chgrp bi $REF_GFF - echo_green "File ${ref}.gff downloaded in $REF_GFF." - else - echo_blinking_red "An error occurred during file downloading." - fi + if [ ! -v family ]; then obtain_family; if [ -z ${family} ]; then return; fi; fi + if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exist + echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.gff in /data/bi/references/refgenie/alias/${family}/gff/${ref}." + digest=$(openssl rand -hex 24) + refgenie alias set --aliases ${family} --digest ${digest} -f -c /data/bi/references/refgenie/genome_config.yaml + mkdir -p /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/ + wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}" + if [ $? -eq 0 ]; then + echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/." + refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${ref}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml + else + echo_blinking_red "An error occurred during file downloading." + fi + else + echo "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.gff." + digest=$(refgenie alias get -a ${family} -c /data/bi/references/refgenie/genome_config.yaml) + mkdir -p /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/ + wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}" + if [ $? -eq 0 ]; then + echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/." + refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${ref}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml + else + echo_blinking_red "An error occurred during file downloading." + fi + fi else - echo -e "File ${ref}.gff is ALREADY available in $REF_GFF. \xE2\x9C\x85" + echo -e "File ${ref}.gff is ALREADY available in $(dirname $REF_GFF). \xE2\x9C\x85" fi unset family From 852b0111bdca39e1b64cadd1670a587d1b692ff0 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Tue, 9 Jul 2024 16:40:56 +0200 Subject: [PATCH 095/321] Updated CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b97d8b2e3..b1be3e55b 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,7 @@ Code contributions to the new version: - Added amrfinderplus to characterization template.
[#289] (https://github.com/BU-ISCIII/buisciii-tools/pull/289) - Updated all files so that paths referring to /pipelines/ are updated according to the new structure [#287](https://github.com/BU-ISCIII/buisciii-tools/pull/287) - Updated assembly, ariba, snippy, amrfinderplus and iqtree templates, removed genomeev and mtbseq_assembly templates and updated services.json [#295](https://github.com/BU-ISCIII/buisciii-tools/pull/295) +- Changed viralrecon's lablog so that references are available within refgenie [#296](https://github.com/BU-ISCIII/buisciii-tools/pull/296) ### Modules From 0d53c4aeef9ecf16e2c70faae1cb7bae8d90a6b6 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 10 Jul 2024 16:22:06 +0200 Subject: [PATCH 096/321] Fixed minor mistake in the lablog --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 824278ca3..9d523a5ba 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -153,10 +153,10 @@ check_references() { } # Check if FASTA sequence is already downloaded - obtain_family; REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml 2>&1) if echo "$REF_FASTA" | grep -q "Traceback"; then - echo "File ${ref}.fasta is not yet downloaded." + obtain_family; + echo "File ${ref}.fasta is not yet downloaded." if [ -z ${family} ]; then return; fi if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exists echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}." From 543c121c7609426b302584259ceec1a887191d5f Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 10 Jul 2024 16:40:02 +0200 Subject: [PATCH 097/321] Moved obtain_family function inside if statement, for speed purposes --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 9d523a5ba..669f1af0d 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -154,10 +154,9 @@ check_references() { # Check if FASTA sequence is already downloaded REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml 2>&1) - if echo "$REF_FASTA" | grep -q "Traceback"; then - obtain_family; - echo "File ${ref}.fasta is not yet downloaded." - if [ -z ${family} ]; then return; fi + if echo "$REF_FASTA" | grep -q "Traceback"; then + echo "File ${ref}.fasta is not yet downloaded." + obtain_family; if [ -z $family ]; then return; fi if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exists echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}." 
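[Editor's aside — illustrative, not part of the patches: refgenie resolves a human-readable alias (here, the virus family) to a digest-named directory under data/, which is why the template mints a random digest for a family it has never seen before registering assets against it. Stripped of the error handling, the lookup/register round-trip the lablog builds on is:

    # query the store; prints an absolute path if the asset exists,
    # and a Python traceback (which the script greps for) if it does not
    refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml

    # register a freshly downloaded file kept under data/${digest}/fasta/${ref}/
    refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ \
        --seek-keys '{"fasta" : "'"${ref}.fasta"'"}' \
        -c /data/bi/references/refgenie/genome_config.yaml

Both commands are copied from the surrounding diffs; only the layout of this aside is editorial.]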
digest=$(openssl rand -hex 24) From b6248bd89c59986f4d68012e6de560663fb7cf3d Mon Sep 17 00:00:00 2001 From: victor5lm Date: Thu, 11 Jul 2024 12:44:29 +0200 Subject: [PATCH 098/321] Moved obtain_family function out of the ifstatement, and removed some variables from places where they should not be --- .../viralrecon/ANALYSIS/lablog_viralrecon | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 669f1af0d..8c58a3a98 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -153,10 +153,10 @@ check_references() { } # Check if FASTA sequence is already downloaded + obtain_family; if [ -z $family ]; then return; fi REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml 2>&1) if echo "$REF_FASTA" | grep -q "Traceback"; then echo "File ${ref}.fasta is not yet downloaded." - obtain_family; if [ -z $family ]; then return; fi if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exists echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}." digest=$(openssl rand -hex 24) @@ -164,7 +164,7 @@ check_references() { mkdir -p /data/bi/references/refgenie/data/${digest}/fasta/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" if [ $? -eq 0 ]; then - echo_green "File ${ref}.fasta downloaded in $REF_FASTA." + echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}" refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${ref}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml else echo_blinking_red "An error occurred during file downloading." @@ -175,7 +175,7 @@ check_references() { mkdir -p /data/bi/references/refgenie/data/${digest}/fasta/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" if [ $? -eq 0 ]; then - echo_green "File ${ref}.fasta downloaded in $REF_FASTA." + echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}" refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${ref}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml else echo_blinking_red "An error occurred during file downloading." @@ -186,10 +186,10 @@ check_references() { fi # Check if GFF file is already downloaded + if [ ! -v family ]; then obtain_family; if [ -z ${family} ]; then return; fi; fi REF_GFF=$(refgenie seek ${family}/gff.gff:${ref} -c /data/bi/references/refgenie/genome_config.yaml 2>&1) if echo "$REF_GFF" | grep -q "Traceback"; then - echo "File ${ref}.gff is not yet downloaded." - if [ ! -v ${family} ]; then obtain_family; if [ -z ${family} ]; then return; fi; fi + echo "File ${ref}.gff is not yet downloaded." if [ ! 
-e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exist echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.gff in /data/bi/references/refgenie/alias/${family}/gff/${ref}." digest=$(openssl rand -hex 24) @@ -197,7 +197,7 @@ check_references() { mkdir -p /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}" if [ $? -eq 0 ]; then - echo_green "File ${ref}.gff downloaded in $REF_GFF." + echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}" refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${ref}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml else echo_blinking_red "An error occurred during file downloading." @@ -208,7 +208,7 @@ check_references() { mkdir -p /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}" if [ $? -eq 0 ]; then - echo_green "File ${ref}.gff downloaded in $REF_GFF." + echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}" refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${ref}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml else echo_blinking_red "An error occurred during file downloading." From 84d259687f5bf36d9393e2cae86fe6424a206502 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Mon, 22 Jul 2024 17:19:38 +0200 Subject: [PATCH 099/321] Updated services.json with the new bacass version, fixed a refgenie-related issue in viralrecon's lablog, fixed the wrong date grep issue in mtbseq's lablog and updated assembly's config file --- bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config | 2 +- .../mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog | 5 +++-- bu_isciii/templates/services.json | 2 +- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 4 ++++ 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config b/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config index 284208fbb..8325bcd5f 100644 --- a/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config +++ b/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config @@ -217,7 +217,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: 'MULTIQC' { + withName: 'MULTIQC_CUSTOM' { publishDir = [ [ path: { "${params.outdir}/99-stats/multiqc" }, diff --git a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog index d60b322f2..0263c8986 100644 --- a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog +++ b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog @@ -2,6 +2,7 @@ # srun parameters scratch_dir=$(echo $(pwd) | sed "s@/data/bi/scratch_tmp/@/scratch/@g" ) +analysis_year=$(pwd | awk -F'ANALYSIS/' '{print substr($2, 1, 4)}') mkdir logs cat ../samples_id.txt | xargs -I % echo "mkdir %; ln -s ../../01-preprocessing/%/%_R1_filtered.fastq.gz %/%_lib1_R1.fastq.gz" > _00_prepareRaw.sh @@ -12,9 +13,9 @@ cat ../samples_id.txt | xargs -I @@ echo -e "srun --job-name MTBSEQ.@@ --output # classification echo "mkdir classification_all" > _03_gather_results.sh -echo "FIRST_SAMPLE=$( head -n1 ../samples_id.txt ); head -n 1 \${FIRST_SAMPLE}/Classification/Strain_Classification.tab > classification_all/strain_classification_all.tab; grep \"^'2023\" */Classification/Strain_Classification.tab | cut -d \":\" -f 2 >> classification_all/strain_classification_all.tab" >> _03_gather_results.sh +echo "FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n1 ${FIRST_SAMPLE}/Classification/Strain_Classification.tab > classification_all/strain_classification_all.tab; grep \"^'$analysis_year\" */Classification/Strain_Classification.tab | cut -d ":" -f 2 >> classification_all/strain_classification_all.tab" >> _03_gather_results.sh # resistances echo "mkdir resistances_all" >> _03_gather_results.sh cat ../samples_id.txt | xargs -I % echo "cp %/Amend/NONE_joint_cf4_cr4_fr75_ph4_samples1_amended.tab resistances_all/%_var_res.tab" >> _03_gather_results.sh # stats -echo "mkdir stats_all; FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 \$FIRST_SAMPLE/Statistics/Mapping_and_Variant_Statistics.tab > stats_all/statistics_all.tab; grep \"^'2023\" */Statistics/Mapping_and_Variant_Statistics.tab | cut -d \":\" -f 2 >> stats_all/statistics_all.tab" >> _03_gather_results.sh +echo "mkdir stats_all; FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 "$FIRST_SAMPLE/Statistics/Mapping_and_Variant_Statistics.tab" > stats_all/statistics_all.tab; grep \"^'$analysis_year\" */Statistics/Mapping_and_Variant_Statistics.tab | cut -d ":" -f 2 >> stats_all/statistics_all.tab" >> _03_gather_results.sh diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json index 3107c913f..c9e4a108f 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -2,7 +2,7 @@ "assembly_annotation": { "label": "", "template": "assembly", - "url": "https://github.com/Daniel-VM/bacass/tree/buisciii-develop", + "url": "https://github.com/nf-core/bacass/tree/2.3.1", "order": 1, "begin": "", "end": "", diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 8c58a3a98..a98cf487f 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -166,6 +166,7 @@ check_references() { if [ $? 
-eq 0 ]; then echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}" refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${ref}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml + REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml) else echo_blinking_red "An error occurred during file downloading." fi @@ -177,6 +178,7 @@ check_references() { if [ $? -eq 0 ]; then echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}" refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${ref}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml + REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml) else echo_blinking_red "An error occurred during file downloading." fi @@ -199,6 +201,7 @@ check_references() { if [ $? -eq 0 ]; then echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}" refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${ref}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml + REF_GFF=$(refgenie seek ${family}/gff.gff:${ref} -c /data/bi/references/refgenie/genome_config.yaml) else echo_blinking_red "An error occurred during file downloading." fi @@ -210,6 +213,7 @@ check_references() { if [ $? -eq 0 ]; then echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}" refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${ref}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml + REF_GFF=$(refgenie seek ${family}/gff.gff:${ref} -c /data/bi/references/refgenie/genome_config.yaml) else echo_blinking_red "An error occurred during file downloading." 
fi From d894ceee5828eba11572809424472d10fde5d823 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Mon, 22 Jul 2024 17:40:50 +0200 Subject: [PATCH 100/321] Removed some extra quotation marks from the mtbseq template --- .../mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog index 0263c8986..bc1c80fa8 100644 --- a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog +++ b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog @@ -12,10 +12,10 @@ cat ../samples_id.txt | xargs -I % echo "cd %;ls *.fastq.gz | tr '_' '\t' | cut cat ../samples_id.txt | xargs -I @@ echo -e "srun --job-name MTBSEQ.@@ --output logs/MTBSEQ.@@.%j.log --partition middle_idx --mem 48G --chdir ${scratch_dir}/@@ --cpus-per-task 10 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/mtbseq:1.0.4--hdfd78af_2 MTBseq --step TBfull --threads 10 --samples samples.txt &" > _02_mtbseq.sh # classification -echo "mkdir classification_all" > _03_gather_results.sh -echo "FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n1 ${FIRST_SAMPLE}/Classification/Strain_Classification.tab > classification_all/strain_classification_all.tab; grep \"^'$analysis_year\" */Classification/Strain_Classification.tab | cut -d ":" -f 2 >> classification_all/strain_classification_all.tab" >> _03_gather_results.sh +echo "mkdir classification_all" > _03_gather_results.sh +echo "FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 ${FIRST_SAMPLE}/Classification/Strain_Classification.tab > classification_all/strain_classification_all.tab; grep \"^'$analysis_year\" */Classification/Strain_Classification.tab | cut -d ":" -f 2 >> classification_all/strain_classification_all.tab" >> _03_gather_results.sh # resistances echo "mkdir resistances_all" >> _03_gather_results.sh cat ../samples_id.txt | xargs -I % echo "cp %/Amend/NONE_joint_cf4_cr4_fr75_ph4_samples1_amended.tab resistances_all/%_var_res.tab" >> _03_gather_results.sh # stats -echo "mkdir stats_all; FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 "$FIRST_SAMPLE/Statistics/Mapping_and_Variant_Statistics.tab" > stats_all/statistics_all.tab; grep \"^'$analysis_year\" */Statistics/Mapping_and_Variant_Statistics.tab | cut -d ":" -f 2 >> stats_all/statistics_all.tab" >> _03_gather_results.sh +echo "mkdir stats_all; FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 ${FIRST_SAMPLE}/Statistics/Mapping_and_Variant_Statistics.tab > stats_all/statistics_all.tab; grep \"^'$analysis_year\" */Statistics/Mapping_and_Variant_Statistics.tab | cut -d ":" -f 2 >> stats_all/statistics_all.tab" >> _03_gather_results.sh From 5a75ae83d62028d451ae35ab8d4def951a499e42 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Mon, 22 Jul 2024 17:42:46 +0200 Subject: [PATCH 101/321] Updated CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1be3e55b..d82b79be0 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -53,6 +53,7 @@ Code contributions to the new version: - Updated all files so that paths referring to /pipelines/ are updated according to the new structure [#287](https://github.com/BU-ISCIII/buisciii-tools/pull/287) - Updated assembly, ariba, snippy, amrfinderplus and iqtree templates, removed genomeev and mtbseq_assembly templates and updated services.json 
[#295](https://github.com/BU-ISCIII/buisciii-tools/pull/295) - Changed viralrecon's lablog so that references are available within refgenie [#296](https://github.com/BU-ISCIII/buisciii-tools/pull/296) +- Updated services.json, mtbseq's lablog, viralrecon's lablog and assembly's config file [#299](https://github.com/BU-ISCIII/buisciii-tools/pull/299) ### Modules From 913cbb33b2802564f236ec16fd6fbe3b91aa3f94 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Tue, 23 Jul 2024 11:17:41 +0200 Subject: [PATCH 102/321] Added the corresponding \ characters when doing cut --- .../mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog index bc1c80fa8..e5bdac16e 100644 --- a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog +++ b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog @@ -13,9 +13,9 @@ cat ../samples_id.txt | xargs -I @@ echo -e "srun --job-name MTBSEQ.@@ --output # classification echo "mkdir classification_all" > _03_gather_results.sh -echo "FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 ${FIRST_SAMPLE}/Classification/Strain_Classification.tab > classification_all/strain_classification_all.tab; grep \"^'$analysis_year\" */Classification/Strain_Classification.tab | cut -d ":" -f 2 >> classification_all/strain_classification_all.tab" >> _03_gather_results.sh +echo "FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 ${FIRST_SAMPLE}/Classification/Strain_Classification.tab > classification_all/strain_classification_all.tab; grep \"^'$analysis_year\" */Classification/Strain_Classification.tab | cut -d \":\" -f 2 >> classification_all/strain_classification_all.tab" >> _03_gather_results.sh # resistances echo "mkdir resistances_all" >> _03_gather_results.sh cat ../samples_id.txt | xargs -I % echo "cp %/Amend/NONE_joint_cf4_cr4_fr75_ph4_samples1_amended.tab resistances_all/%_var_res.tab" >> _03_gather_results.sh # stats -echo "mkdir stats_all; FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 ${FIRST_SAMPLE}/Statistics/Mapping_and_Variant_Statistics.tab > stats_all/statistics_all.tab; grep \"^'$analysis_year\" */Statistics/Mapping_and_Variant_Statistics.tab | cut -d ":" -f 2 >> stats_all/statistics_all.tab" >> _03_gather_results.sh +echo "mkdir stats_all; FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 ${FIRST_SAMPLE}/Statistics/Mapping_and_Variant_Statistics.tab > stats_all/statistics_all.tab; grep \"^'$analysis_year\" */Statistics/Mapping_and_Variant_Statistics.tab | cut -d \":\" -f 2 >> stats_all/statistics_all.tab" >> _03_gather_results.sh From 75c4f549efd2b43f772aeec871b44ad07955d05b Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Mon, 29 Jul 2024 15:42:35 +0200 Subject: [PATCH 103/321] Fixed symlink creation in 99-stats folder (MAG). 
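[Editor's aside — context for this fix, not part of the original commit message: `ln -s TARGET .` names the link after TARGET's basename, so the link is only correct when the source file already carries the sample name. With a made-up sample id, the before/after of the relinking loop that this patch and PATCH 104 converge on looks like:

    ln -s ../x/kraken2_report.txt ./sample1_kraken2_report.txt   # old: fixed source name, link renamed per sample
    ln -s ../x/sample1.kraken2_report.txt .                      # new: per-sample source name, basename reused

]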
--- bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog index 246dae5d4..f3e40c093 100644 --- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog +++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog @@ -1,6 +1,6 @@ #module load singularity -cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/kraken2_report.txt ./${in}_kraken2_report.txt; done +cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/${in}kraken2_report.txt .; done scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") From d3385bb2c1f3869fb9333e957a5c3b11ec61c152 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Mon, 29 Jul 2024 15:46:55 +0200 Subject: [PATCH 104/321] Added dot --- bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog index f3e40c093..915ddf942 100644 --- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog +++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog @@ -1,6 +1,6 @@ #module load singularity -cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/${in}kraken2_report.txt .; done +cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/${in}.kraken2_report.txt .; done scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") From db8306039392e613f538c33a6c16cb8ffe763d91 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Mon, 29 Jul 2024 15:55:01 +0200 Subject: [PATCH 105/321] Updated CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d82b79be0..974e17a77 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,7 @@ Code contributions to the new version: - Updated assembly, ariba, snippy, amrfinderplus and iqtree templates, removed genomeev and mtbseq_assembly templates and updated services.json [#295](https://github.com/BU-ISCIII/buisciii-tools/pull/295) - Changed viralrecon's lablog so that references are available within refgenie [#296](https://github.com/BU-ISCIII/buisciii-tools/pull/296) - Updated services.json, mtbseq's lablog, viralrecon's lablog and assembly's config file [#299](https://github.com/BU-ISCIII/buisciii-tools/pull/299) +- Fixed 99-stats (MAG) template. 
[#301](https://github.com/BU-ISCIII/buisciii-tools/pull/301) ### Modules From 8d70a3a7ec37344386eb1dbe2553e7136b80ee1b Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 26 Jul 2024 16:02:09 +0200 Subject: [PATCH 106/321] added emmtyper template --- .../04-emmtyper/lablog | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog new file mode 100644 index 000000000..59033b0ec --- /dev/null +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog @@ -0,0 +1,78 @@ +#!/bin/sh + +# Create folders +mkdir -p data +mkdir -p .slurm_logs_NC + +# Find all .gz files and write them to a file list +# TODO: add if to check >1 fasta files are available in assembly results +# FIXME: set path to assembly files (tmp: assembly template file path) +find ../../../../assembly/ANALYSIS/*_ASSEMBLY01/results/assembly/unicycler/*.fasta.gz > data/assembly_file_list.txt +ASSEMBLY_LIST=data/assembly_file_list.txt + +# Get the number of files +num_files=$(wc -l < $ASSEMBLY_LIST) + +scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") + +# STEP 1: Set up jobarray to unzip fasta files +cat <<EOF > _00_unzip_jobarray.sbatch +#!/bin/bash +#SBATCH --job-name=unzip_fasta +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=2 +#SBATCH --mem=8G +#SBATCH --time=2:00:00 +#SBATCH --partition short_idx +#SBATCH --array=1-$num_files +#SBATCH --chdir $scratch_dir +#SBATCH --output .slurm_logs_NC/slurm-%A_%a.out +#SBATCH --error .slurm_logs_NC/slurm-%A_%a.err + +# Get the file to process +file=\$(sed -n "\${SLURM_ARRAY_TASK_ID}p" $ASSEMBLY_LIST) + +# Unzip the file to the destination directory +gzip -dkc \$file > data/\$(basename "\$file" .gz) + +EOF + +# FIXME: symlinks to BLAST DATABASE? +# FIXME: conda & singularity load +# STEP 2: Setup exe file to perform unzip and emmtyper.
+cat <<EOF > _01_emmtyper.sbatch +#!/bin/bash +#SBATCH --job-name emmtyper +#SBATCH --ntasks 1 +#SBATCH --cpus-per-task 4 +#SBATCH --mem 24G +#SBATCH --time 4:00:00 +#SBATCH --partition short_idx +#SBATCH --chdir $scratch_dir +#SBATCH --output ./$(date '+%Y%m%d')_emmtyper.log + +# module load singularity +# conda activate emmtyper-0.2.0 + +# create results folder +mkdir -p 01-typing +mkdir -p 01-typing/tmps + +# Run emmtyper +emmtyper \\ + -w blast \\ + --keep \\ + --blast_db 'path_to_blastdatabase' \\ + --percent-identity 95 \\ + --culling-limit 5 \\ + --output 01-typing/results_emmtyper.out \\ + --output-format verbose \\ + ../data/*.fasta + +mv *.tmp 01-typing/tmps + +EOF + +echo "#!/bin/bash" > _ALLSTEPS_emmtyper.sh +echo "unzip_job_id=\$(sbatch _00_unzip_jobarray.sbatch | awk '{print \$4}')" >> _ALLSTEPS_emmtyper.sh +echo "sbatch --dependency=afterok:\${unzip_job_id} _01_emmtyper.sbatch" >> _ALLSTEPS_emmtyper.sh From f5df419a086a1439c70a7afe0d8ebbfda7b5b720 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 29 Jul 2024 11:22:38 +0200 Subject: [PATCH 107/321] fixing paths and folder names --- .../04-emmtyper/lablog | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog index 59033b0ec..bd6caf0b6 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog @@ -1,14 +1,13 @@ #!/bin/sh # Create folders -mkdir -p data +mkdir -p data_NC mkdir -p .slurm_logs_NC # Find all .gz files and write them to a file list # TODO: add if to check >1 fasta files are available in assembly results -# FIXME: set path to assembly files (tmp: assembly template file path) -find ../../../../assembly/ANALYSIS/*_ASSEMBLY01/results/assembly/unicycler/*.fasta.gz > data/assembly_file_list.txt -ASSEMBLY_LIST=data/assembly_file_list.txt +find ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/*.fasta.gz > data_NC/assembly_file_list.txt +ASSEMBLY_LIST=data_NC/assembly_file_list.txt # Get the number of files num_files=$(wc -l < $ASSEMBLY_LIST) @@ -33,11 +32,10 @@ cat <<EOF > _00_unzip_jobarray.sbatch file=\$(sed -n "\${SLURM_ARRAY_TASK_ID}p" $ASSEMBLY_LIST) # Unzip the file to the destination directory -gzip -dkc \$file > data/\$(basename "\$file" .gz) +gzip -dkc \$file > data_NC/\$(basename "\$file" .gz) EOF -# FIXME: symlinks to BLAST DATABASE? # FIXME: conda & singularity load # STEP 2: Setup exe file to perform unzip and emmtyper.
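[Editor's aside — illustrative, not part of the patch: the two heredocs in this template only write sbatch scripts; nothing is submitted until the generated _ALLSTEPS_emmtyper.sh runs. Reduced to plain bash, the chain it sets up is:

    # sbatch prints "Submitted batch job <id>"; awk keeps the id
    unzip_job_id=$(sbatch _00_unzip_jobarray.sbatch | awk '{print $4}')
    # afterok: start emmtyper only once every unzip array task has exited 0
    sbatch --dependency=afterok:${unzip_job_id} _01_emmtyper.sbatch

Both lines mirror the echo'd strings in PATCH 106 with the shell escaping removed.]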
cat <<EOF > _01_emmtyper.sbatch @@ -62,13 +60,14 @@ mkdir -p 01-typing/tmps emmtyper \\ -w blast \\ --keep \\ - --blast_db 'path_to_blastdatabase' \\ + --blast_db '/data/bi/references/cdc_emm_blastdb/cdc_emm_database29042024' \\ --percent-identity 95 \\ --culling-limit 5 \\ --output 01-typing/results_emmtyper.out \\ --output-format verbose \\ - ../data/*.fasta + ./data_NC/*.fasta +mv *emmtyper.log 01-typing/ mv *.tmp 01-typing/tmps EOF From 806cd2c14dab4d12cd899c577e80255aaca8854f Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 29 Jul 2024 11:23:21 +0200 Subject: [PATCH 108/321] add emmtyper to results lablog --- .../RESULTS/lablog_characterization_results | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results index a12cfbd96..a3585ea46 100644 --- a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results +++ b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results @@ -1,6 +1,7 @@ DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01" mkdir -p "${DELIVERY_FOLDER}/characterization/amrfinderplus" +mkdir -p "${DELIVERY_FOLDER}/characterization/emmtyper" # ARIBA characterization service cd $DELIVERY_FOLDER/characterization @@ -11,4 +12,7 @@ cd amrfinderplus ln -s ../../../../ANALYSIS/*CHARACTERIZATION/*amrfinderplus/*tsv . find .. -xtype l -delete -cd ../.. +cd ../emmtyper +ln -s ../../../../ANALYSIS/*CHARACTERIZATION/*emmtyper/01-typing/results_emmtyper.out . + +cd ../../ From e55bb8d5d83fc8bd5da9630c5e0081c90939d1b2 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 29 Jul 2024 12:11:17 +0200 Subject: [PATCH 109/321] fixed folder name and dir accessing --- .../ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog | 2 +- .../characterization/RESULTS/lablog_characterization_results | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog index bd6caf0b6..735921f9e 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog @@ -6,7 +6,7 @@ mkdir -p .slurm_logs_NC # Find all .gz files and write them to a file list # TODO: add if to check >1 fasta files are available in assembly results -find ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/*.fasta.gz > data_NC/assembly_file_list.txt +find ../../*ANALYSIS*ASSEMBLY/*-assembly/unicycler/*.fasta.gz > data_NC/assembly_file_list.txt ASSEMBLY_LIST=data_NC/assembly_file_list.txt # Get the number of files diff --git a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results index a3585ea46..9d617543a 100644 --- a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results +++ b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results @@ -12,7 +12,8 @@ cd amrfinderplus ln -s ../../../../ANALYSIS/*CHARACTERIZATION/*amrfinderplus/*tsv . find .. -xtype l -delete -cd ../emmtyper +cd .. +cd emmtyper ln -s ../../../../ANALYSIS/*CHARACTERIZATION/*emmtyper/01-typing/results_emmtyper.out .
cd ../../ From f55e7bc1874f034efc036659fb6331362d93da0b Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 29 Jul 2024 12:11:43 +0200 Subject: [PATCH 110/321] allow emmtyper to run via singularity --- .../04-emmtyper/lablog | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog index 735921f9e..7ff21e7a1 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog @@ -36,7 +36,6 @@ gzip -dkc \$file > data_NC/\$(basename "\$file" .gz) EOF -# FIXME: conda & singularity load # STEP 2: Setup exe file to perform unzip and emmtyper. cat < _01_emmtyper.sbatch #!/bin/bash @@ -50,28 +49,35 @@ cat < _01_emmtyper.sbatch #SBATCH --output ./$(date '+%Y%m%d')_emmtyper.log # module load singularity -# conda activate emmtyper-0.2.0 # create results folder mkdir -p 01-typing mkdir -p 01-typing/tmps +blastdb_path=/data/bi/references/cdc_emm_blastdb # Run emmtyper -emmtyper \\ +singularity exec \\ + --bind ${scratch_dir} \\ + --bind ${scratch_dir}/../../ \\ + --bind \$blastdb_path \\ + /data/bi/pipelines/singularity-images/singularity-emmtyper.0.2.0--py_0 emmtyper \\ -w blast \\ --keep \\ - --blast_db '/data/bi/references/cdc_emm_blastdb/cdc_emm_database29042024' \\ + --blast_db "${blastdb_path}/cdc_emm_database29042024" \\ --percent-identity 95 \\ --culling-limit 5 \\ --output 01-typing/results_emmtyper.out \\ --output-format verbose \\ ./data_NC/*.fasta -mv *emmtyper.log 01-typing/ mv *.tmp 01-typing/tmps EOF +# Bash script that performs all steps above echo "#!/bin/bash" > _ALLSTEPS_emmtyper.sh +echo "# # module load singularity" >> _ALLSTEPS_emmtyper.sh echo "unzip_job_id=\$(sbatch _00_unzip_jobarray.sbatch | awk '{print \$4}')" >> _ALLSTEPS_emmtyper.sh echo "sbatch --dependency=afterok:\${unzip_job_id} _01_emmtyper.sbatch" >> _ALLSTEPS_emmtyper.sh + +chmod +x _ALLSTEPS_emmtyper.sh From 56262a49a9a3363ace4b73a29bacb53abfc132d4 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 29 Jul 2024 12:25:03 +0200 Subject: [PATCH 111/321] update changelog in #300 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 974e17a77..8641155ed 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,7 @@ Code contributions to the new version: - Updated assembly, ariba, snippy, amrfinderplus and iqtree templates, removed genomeev and mtbseq_assembly templates and updated services.json [#295](https://github.com/BU-ISCIII/buisciii-tools/pull/295) - Changed viralrecon's lablog so that references are available within refgenie [#296](https://github.com/BU-ISCIII/buisciii-tools/pull/296) - Updated services.json, mtbseq's lablog, viralrecon's lablog and assembly's config file [#299](https://github.com/BU-ISCIII/buisciii-tools/pull/299) +- Added lablog to automate gene characterization with emmtyper, including unzipping assemblies. [#300](https://github.com/BU-ISCIII/buisciii-tools/pull/300) - Fixed 99-stats (MAG) template. 
[#301](https://github.com/BU-ISCIII/buisciii-tools/pull/301) ### Modules From 78554bb9d8bf89d6f3b6740296b110eac7a2b443 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 29 Jul 2024 13:03:03 +0200 Subject: [PATCH 112/321] added reviewer suggestions in #300 --- .../04-emmtyper/lablog | 16 ++++++++-------- bu_isciii/templates/services.json | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog index 7ff21e7a1..a2eec9ce0 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog @@ -1,13 +1,13 @@ #!/bin/sh # Create folders -mkdir -p data_NC -mkdir -p .slurm_logs_NC +mkdir -p fasta_inputs +mkdir -p slurm_logs # Find all .gz files and write them to a file list # TODO: add if to check >1 fasta files are available in assembly results -find ../../*ANALYSIS*ASSEMBLY/*-assembly/unicycler/*.fasta.gz > data_NC/assembly_file_list.txt -ASSEMBLY_LIST=data_NC/assembly_file_list.txt +find ../../*ANALYSIS*ASSEMBLY/*-assembly/unicycler/*.fasta.gz > fasta_inputs/assembly_file_list.txt +ASSEMBLY_LIST=fasta_inputs/assembly_file_list.txt # Get the number of files num_files=$(wc -l < $ASSEMBLY_LIST) @@ -25,14 +25,14 @@ cat < _00_unzip_jobarray.sbatch #SBATCH --partition short_idx #SBATCH --array=1-$num_files #SBATCH --chdir $scratch_dir -#SBATCH --output .slurm_logs_NC/slurm-%A_%a.out -#SBATCH --error .slurm_logs_NC/slurm-%A_%a.err +#SBATCH --output slurm_logs/slurm-%A_%a.out +#SBATCH --error slurm_logs/slurm-%A_%a.err # Get the file to process file=\$(sed -n "\${SLURM_ARRAY_TASK_ID}p" $ASSEMBLY_LIST) # Unzip the file to the destination directory -gzip -dkc \$file > data_NC/\$(basename "\$file" .gz) +gzip -dkc \$file > fasta_inputs/\$(basename "\$file" .gz) EOF @@ -68,7 +68,7 @@ singularity exec \\ --culling-limit 5 \\ --output 01-typing/results_emmtyper.out \\ --output-format verbose \\ - ./data_NC/*.fasta + ./fasta_inputs/*.fasta mv *.tmp 01-typing/tmps diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json index c9e4a108f..3e57d041b 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -198,7 +198,7 @@ "folders":[], "files":[] }, - "no_copy": ["RAW", "TMP", "00-reads"], + "no_copy": ["RAW", "TMP", "00-reads", "fasta_inputs"], "last_folder":"REFERENCES", "delivery_md": "", "results_md": "" From b9f1498ca87dc1b7a933553276b22eb5b3e033aa Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 29 Jul 2024 13:03:40 +0200 Subject: [PATCH 113/321] fix singularity bind --- .../ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog index a2eec9ce0..c93c8fd6c 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog @@ -63,7 +63,7 @@ singularity exec \\ /data/bi/pipelines/singularity-images/singularity-emmtyper.0.2.0--py_0 emmtyper \\ -w blast \\ --keep \\ - --blast_db "${blastdb_path}/cdc_emm_database29042024" \\ + --blast_db 
"\${blastdb_path}/cdc_emm_database29042024" \\ --percent-identity 95 \\ --culling-limit 5 \\ --output 01-typing/results_emmtyper.out \\ From 2f7fd5e8ef6eacd4ea9cffffee40ff4c1442439b Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 30 Jul 2024 09:45:11 +0200 Subject: [PATCH 114/321] renamed logs folder --- .../ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog index c93c8fd6c..d5897933c 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog @@ -2,7 +2,7 @@ # Create folders mkdir -p fasta_inputs -mkdir -p slurm_logs +mkdir -p logs # Find all .gz files and write them to a file list # TODO: add if to check >1 fasta files are available in assembly results @@ -25,8 +25,8 @@ cat < _00_unzip_jobarray.sbatch #SBATCH --partition short_idx #SBATCH --array=1-$num_files #SBATCH --chdir $scratch_dir -#SBATCH --output slurm_logs/slurm-%A_%a.out -#SBATCH --error slurm_logs/slurm-%A_%a.err +#SBATCH --output logs/slurm-%A_%a.out +#SBATCH --error logs/slurm-%A_%a.err # Get the file to process file=\$(sed -n "\${SLURM_ARRAY_TASK_ID}p" $ASSEMBLY_LIST) From 309dd28bb244401bb16ed956b402f71e1e76cc35 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Thu, 1 Aug 2024 14:46:00 +0200 Subject: [PATCH 115/321] Fixed IRMA's lablog so that the info is not displayed several times neither within the files of each type nor in all_samples_completo.txt --- .../IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) mode change 100755 => 100644 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog old mode 100755 new mode 100644 index c694a5b91..0e7704fe0 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog @@ -5,12 +5,14 @@ mkdir logs scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") -cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/bi/pipelines/flu-amd/flu-amd-1.1.4/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} --external-config ../../../DOC/irma_config.sh &"; done > _01_irma.sh +cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/bi/tmp/pipelines_new/flu-amd/flu-amd-1.1.4/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} --external-config ../../../DOC/irma_config.sh &"; done > _01_irma.sh echo 'bash create_irma_stats.sh' > _02_create_stats.sh echo "ls */*HA*.fasta | cut -d '/' -f2 | cut -d '.' 
-f1 | sort -u | cut -d '_' -f3 | sed '/^\$/d' | sed 's/^/A_/g' > HA_types.txt" > _03_post_processing.sh +echo 'cat HA_types.txt | while read type; do if test -d ${type}; then rm -rf ${type}; fi; done; if test -d B ; then rm -rf B; fi; if test -d C; then rm -rf C; fi' >> _03_post_processing.sh + echo "cat HA_types.txt | while read in; do mkdir \${in}; done" >> _03_post_processing.sh echo "if grep -qw 'B__' irma_stats.txt; then mkdir B; fi" >> _03_post_processing.sh From 790a728fb1d341624147a7f7b6c48927cd78fd45 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Thu, 1 Aug 2024 14:50:58 +0200 Subject: [PATCH 116/321] Substituted pipelines_new by pipelines in line 8 --- .../templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog index 0e7704fe0..e66a50a9d 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog @@ -5,7 +5,7 @@ mkdir logs scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") -cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/bi/tmp/pipelines_new/flu-amd/flu-amd-1.1.4/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} --external-config ../../../DOC/irma_config.sh &"; done > _01_irma.sh +cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/bi/pipelines/flu-amd/flu-amd-1.1.4/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} --external-config ../../../DOC/irma_config.sh &"; done > _01_irma.sh echo 'bash create_irma_stats.sh' > _02_create_stats.sh From 2cf15d9a35a61e8833e1ef6e3e29149f143f1f76 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Thu, 1 Aug 2024 14:53:53 +0200 Subject: [PATCH 117/321] Updated CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8641155ed..6417f4b9b 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,6 +56,7 @@ Code contributions to the new version: - Updated services.json, mtbseq's lablog, viralrecon's lablog and assembly's config file [#299](https://github.com/BU-ISCIII/buisciii-tools/pull/299) - Added lablog to automate gene characterization with emmtyper, including unzipping assemblies. [#300](https://github.com/BU-ISCIII/buisciii-tools/pull/300) - Fixed 99-stats (MAG) template. 
[#301](https://github.com/BU-ISCIII/buisciii-tools/pull/301) +- Fixed IRMA's lablog so that the sequences of the samples are not displayed several times neither in the .txt files of each influenza type nor in all_samples_completo.txt [#305](https://github.com/BU-ISCIII/buisciii-tools/pull/305) ### Modules From 8fa9e2a44e09a8fe91e28a4c80b5df1fbfb205bd Mon Sep 17 00:00:00 2001 From: victor5lm Date: Mon, 5 Aug 2024 13:52:13 +0200 Subject: [PATCH 118/321] Added new line to remove all_samples_completo.txt before creating it, in case it already existed previously --- .../templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog index e66a50a9d..5e9d933b6 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog @@ -13,6 +13,8 @@ echo "ls */*HA*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | sort -u | cut -d '_' - echo 'cat HA_types.txt | while read type; do if test -d ${type}; then rm -rf ${type}; fi; done; if test -d B ; then rm -rf B; fi; if test -d C; then rm -rf C; fi' >> _03_post_processing.sh +echo 'if test -f all_samples_completo.txt; then rm all_samples_completo.txt; fi' >> _03_post_processing.sh + echo "cat HA_types.txt | while read in; do mkdir \${in}; done" >> _03_post_processing.sh echo "if grep -qw 'B__' irma_stats.txt; then mkdir B; fi" >> _03_post_processing.sh From 4620576ce721db3eea3e17b5644a7c34a131b5a8 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 7 Aug 2024 13:29:10 +0200 Subject: [PATCH 119/321] Modified the email_creation function so that new lines are applied when adding the delivery notes into the body of the email --- bu_isciii/bioinfo_doc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py index 5583ab8a8..ad7b20a46 100755 --- a/bu_isciii/bioinfo_doc.py +++ b/bu_isciii/bioinfo_doc.py @@ -603,11 +603,11 @@ def email_creation(self): if bu_isciii.utils.prompt_yn_question( f"Do you want to use notes from {self.provided_txt}?", dflt=False ): - email_data["email_notes"] = self.delivery_notes + email_data["email_notes"] = self.delivery_notes.replace("\n", "
") else: email_data["email_notes"] = bu_isciii.utils.ask_for_some_text( msg="Write email notes" - ) + ).replace("\n", "
") email_data["user_data"] = self.resolution_info["service_user_id"] email_data["service_id"] = self.service_name.split("_", 5)[0] From 66da332950059be6e1af1156405582ee58e5095e Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 7 Aug 2024 13:36:39 +0200 Subject: [PATCH 120/321] Updated CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6417f4b9b..13c20e788 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ Code contributions to the new version: - Added lablog to automate gene characterization with emmtyper, including unzipping assemblies. [#300](https://github.com/BU-ISCIII/buisciii-tools/pull/300) - Fixed 99-stats (MAG) template. [#301](https://github.com/BU-ISCIII/buisciii-tools/pull/301) - Fixed IRMA's lablog so that the sequences of the samples are not displayed several times neither in the .txt files of each influenza type nor in all_samples_completo.txt [#305](https://github.com/BU-ISCIII/buisciii-tools/pull/305) +- Modified bioinfo_doc.py so that new lines in the delivery message are applied in the email [#307](https://github.com/BU-ISCIII/buisciii-tools/pull/307) ### Modules From f209cead765c767b629d483c398197f1390cd479 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Thu, 8 Aug 2024 11:44:22 +0200 Subject: [PATCH 121/321] Ran black --check and black commands so that black_lint does not fail --- bu_isciii/bioinfo_doc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py index ad7b20a46..37512a40e 100755 --- a/bu_isciii/bioinfo_doc.py +++ b/bu_isciii/bioinfo_doc.py @@ -603,7 +603,9 @@ def email_creation(self): if bu_isciii.utils.prompt_yn_question( f"Do you want to use notes from {self.provided_txt}?", dflt=False ): - email_data["email_notes"] = self.delivery_notes.replace("\n", "
") + email_data["email_notes"] = self.delivery_notes.replace( + "\n", "
" + ) else: email_data["email_notes"] = bu_isciii.utils.ask_for_some_text( msg="Write email notes" From e72dfbc42be47e800c52d348e70428368d1674dd Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Fri, 2 Aug 2024 16:25:18 +0200 Subject: [PATCH 122/321] Added log to pangolin-data update stage. Added check for correct update --- .../viralrecon/ANALYSIS/lablog_viralrecon | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index a98cf487f..6ae5230da 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -16,7 +16,7 @@ The functions performed by the script can be listed as follows: - Checking of the last available version of the Pangolin container. Download if necessary. Database update. File configuration. - Checking the last available version of the Nextclade container. Download if necessary. Extraction of the tag for the analysis dataset. File configuration. -- Checking of required references (fasta and gff) and downloading of from NCBI if necessary. +- Checking if required references (fasta and gff) are locally available and downloading them from NCBI if necessary. - Creation of the necessary directories for the analysis hosts, and subdirectories for each of the references. ' @@ -65,11 +65,15 @@ update_pangolin() { echo -e "Pangolin database is UP TO DATE. \xE2\x9C\x85" else mkdir "$(date '+%Y%m%d')" - srun --partition short_idx singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/ - # log file creation - echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tmkdir $(date '+%Y%m%d')" >> $(date '+%Y%m%d')/log - echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tsrun --partition short_idx singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/)" >> $(date '+%Y%m%d')/log - echo_green "Pangolin database UPDATED." + echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tmkdir $(date '+%Y%m%d')" >> $(date '+%Y%m%d')/command.log + echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tsrun --partition short_idx --output ${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/)" >> $(date '+%Y%m%d')/command.log + srun --partition short_idx --output ${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/ + if [ $? -eq 0 ]; then + echo_green "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" + echo_green "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" + else + echo_blinking_red "Error during pangolin database update." 
+ fi fi cd - From 3f1d9fb197e1c0395b9097a230fc60313afa3c51 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Fri, 2 Aug 2024 16:28:00 +0200 Subject: [PATCH 123/321] Fixed typo --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 6ae5230da..0de45e34a 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -255,7 +255,7 @@ done echo_bold "\nPlease specify the method to be performed." echo_bold "1. Mapping" - echo_bold "2. De novo assemby" + echo_bold "2. De novo assembly" echo_bold "3. Both" while true; do echo -ne "\e[1;38;5;220m"; read -n 1 method; tput sgr0; echo From c3e5dc6817b1feec02a806daf543d12277465a86 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Fri, 2 Aug 2024 16:49:17 +0200 Subject: [PATCH 124/321] Enabled module load singularity inside of the lablog --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 0de45e34a..7ca50950b 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -21,7 +21,7 @@ The functions performed by the script can be listed as follows: ' -# module load singularity +module load singularity # If there is more than 1 reference, please prepare the samples_ref.txt file before running this lablog. From 8675931eeca3a442fd9a34df3555b7c5b1618dca Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Mon, 5 Aug 2024 12:15:27 +0200 Subject: [PATCH 125/321] Moved nextclade_dataset_tag and nextclade_dataset_name inside sbatch file instead of PARAMS_FILE --- .../viralrecon/ANALYSIS/lablog_viralrecon | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 7ca50950b..e6ec4dced 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -128,9 +128,7 @@ update_nextclade() { # Updating params file echo "Updating $PARAMS_FILE file..." sed -i "s|skip_nextclade: true|skip_nextclade: false|" "$PARAMS_FILE" - echo "nextclade_dataset_name: '$virus_tag'" >> $PARAMS_FILE echo "nextclade_dataset: false" >> $PARAMS_FILE - echo "nextclade_dataset_tag: '$nextclade_tag'" >> $PARAMS_FILE echo_bold "File $PARAMS_FILE UPDATED." echo_bold "Finished NEXTCLADE check/update" @@ -348,7 +346,6 @@ else elif [ "$virus_tag" == "rsv" ]; then # Update Nextclade update_nextclade - sed -i '/^nextclade_dataset_name/d' $PARAMS_FILE echo_bold "\nRemember to provide the complete route to primer_bed and primer_fasta files, and specify the nextclade_dataset_name in every sbatch file before running the pipeline."
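[Editor's aside — illustrative, not author text: after this patch the Nextclade choice travels as command-line flags written into each per-reference lablog rather than as params-file keys. The generated pipeline invocation ends up shaped roughly like the sketch below; the run line and the dataset values are made-up placeholders, only the flag names and -resume come from the diff:

    nextflow run viralrecon ... \
        --nextclade_dataset_tag "$nextclade_tag" \
        --nextclade_dataset_name 'rsv_a' \
        -resume

]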
@@ -398,15 +395,20 @@ do echo " --outdir ${ref}_$(date '+%Y%m%d')_viralrecon_mapping \\\\" >> ${FOLDER_NAME}/lablog echo " --fasta ${REF_FASTA} \\\\" >> ${FOLDER_NAME}/lablog echo " --gff ${REF_GFF} \\\\" >> ${FOLDER_NAME}/lablog - if [ "$virus_tag" == 'rsv' ]; then - echo " --primer_bed ../../REFERENCES/XXXX \\\\" >> ${FOLDER_NAME}/lablog - echo " --primer_fasta ../../REFERENCES/XXXX \\\\" >> ${FOLDER_NAME}/lablog - if [ $ref == "EPI_ISL_18668201" ]; then - echo " --nextclade_dataset_name 'rsv_a' \\\\" >> ${FOLDER_NAME}/lablog - elif [ $ref == "EPI_ISL_1653999" ]; then - echo " --nextclade_dataset_name 'rsv_b' \\\\" >> ${FOLDER_NAME}/lablog - else - echo " --nextclade_dataset_name 'rsv_X' \\\\" >> ${FOLDER_NAME}/lablog + if [ -n "$virus_tag" ]; then + echo " --nextclade_dataset_tag '$nextclade_tag' \\\\" >> ${FOLDER_NAME}/lablog + if [ "$virus_tag" == 'rsv' ]; then + echo " --primer_bed ../../REFERENCES/XXXX \\\\" >> ${FOLDER_NAME}/lablog + echo " --primer_fasta ../../REFERENCES/XXXX \\\\" >> ${FOLDER_NAME}/lablog + if [ $ref == "EPI_ISL_18668201" ]; then + echo " --nextclade_dataset_name 'rsv_a' \\\\" >> ${FOLDER_NAME}/lablog + elif [ $ref == "EPI_ISL_1653999" ]; then + echo " --nextclade_dataset_name 'rsv_b' \\\\" >> ${FOLDER_NAME}/lablog + else + echo " --nextclade_dataset_name 'rsv_X' \\\\" >> ${FOLDER_NAME}/lablog + fi + else + echo " --nextclade_dataset_name '$virus_tag' \\\\" >> ${FOLDER_NAME}/lablog fi fi echo " -resume" >> ${FOLDER_NAME}/lablog From 6879896377b2ed55511a49054e60fc80c053750e Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Tue, 6 Aug 2024 09:54:42 +0200 Subject: [PATCH 126/321] Added singularity module load check and screen confirmation messages --- .../templates/viralrecon/ANALYSIS/lablog_viralrecon | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index e6ec4dced..4a84b3f15 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -21,7 +21,6 @@ The functions performed by the script can be listed as follows: ' -module load singularity # If there is more than 1 reference, please prepare the samples_ref.txt file before running this lablog. @@ -34,6 +33,16 @@ echo_red() { echo -e "\e[31m$1\e[0m"; } echo_green() { echo -e "\e[32m$1\e[0m"; } echo_blinking_red() { echo -e "\e[1;5;97;5;41m$1\e[0m"; } +# Loading singularity module +module load singularity +singularity_loaded=$(module list | grep singularity | awk '{print $2}') +if [ -n "$singularity_loaded" ]; then + echo_green "$singularity_loaded module succesfully loaded." +else + echo_blinking_red "Singularity module not loaded. Exiting..." + exit 1 +fi + # Updating pangolin. Checks last image available and if is already downloaded. If not, downloads it. This function also updates pangolin database. 
Update related config files with pangolin info
update_pangolin() {

From 5738917285e2c04fa0b5a63363d867a5e78d6da5 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Tue, 6 Aug 2024 10:57:50 +0200
Subject: [PATCH 127/321] Modified name of fasta and gff files (family instead of ref) when using refgenie add function

---
 bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index 4a84b3f15..25d958530 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -176,7 +176,7 @@ check_references() {
 wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text"
 if [ $? -eq 0 ]; then
 echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}"
- refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${ref}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml
+ refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${family}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml
 REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml)
 else
 echo_blinking_red "An error occurred during file downloading."
@@ -188,7 +188,7 @@ check_references() {
 wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text"
 if [ $? -eq 0 ]; then
 echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}"
- refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${ref}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml
+ refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${family}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml
 REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml)
 else
 echo_blinking_red "An error occurred during file downloading."
@@ -211,7 +211,7 @@ check_references() {
 wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}"
 if [ $? -eq 0 ]; then
 echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}"
- refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${ref}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml
+ refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml
 REF_GFF=$(refgenie seek ${family}/gff.gff:${ref} -c /data/bi/references/refgenie/genome_config.yaml)
 else
 echo_blinking_red "An error occurred during file downloading."
@@ -223,7 +223,7 @@ check_references() {
 wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}"
 if [ $?
-eq 0 ]; then echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}" - refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${ref}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml + refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml REF_GFF=$(refgenie seek ${family}/gff.gff:${ref} -c /data/bi/references/refgenie/genome_config.yaml) else echo_blinking_red "An error occurred during file downloading." From 8e412a7a6b9de884c9464e52821dd8dab6f7cbbc Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Tue, 6 Aug 2024 11:14:59 +0200 Subject: [PATCH 128/321] Updated path for artic SARS primer.bed inside refgenie folder --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 25d958530..400f6220c 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -350,7 +350,7 @@ else update_nextclade update_pangolin - echo "primer_bed: '/data/bi/references/virus/2019-nCoV/amplicons/NC_045512.2/V4.1/artic_v4-1_ncov-2019-primer.scheme.bed'" >> $PARAMS_FILE + echo "primer_bed: '/data/bi/references/refgenie/alias/coronaviridae/primer_schemes/NC_045512.2/artic_v4-1_ncov-2019-primer.scheme.bed'" >> $PARAMS_FILE elif [ "$virus_tag" == "rsv" ]; then # Update Nextclade From 9ad6d573c7203c47461730a836283a8ccde9f5ea Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Tue, 6 Aug 2024 12:30:01 +0200 Subject: [PATCH 129/321] Moved obtain_family function inside if statement, in order to use it just when it is necessary --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 400f6220c..e50efbc49 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -164,10 +164,10 @@ check_references() { } # Check if FASTA sequence is already downloaded - obtain_family; if [ -z $family ]; then return; fi REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml 2>&1) if echo "$REF_FASTA" | grep -q "Traceback"; then echo "File ${ref}.fasta is not yet downloaded." + obtain_family; if [ -z $family ]; then return; fi if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exists echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}." digest=$(openssl rand -hex 24) @@ -199,10 +199,10 @@ check_references() { fi # Check if GFF file is already downloaded - if [ ! -v family ]; then obtain_family; if [ -z ${family} ]; then return; fi; fi REF_GFF=$(refgenie seek ${family}/gff.gff:${ref} -c /data/bi/references/refgenie/genome_config.yaml 2>&1) - if echo "$REF_GFF" | grep -q "Traceback"; then - echo "File ${ref}.gff is not yet downloaded." + if echo "$REF_GFF" | grep -q "Traceback"; then + echo "File ${ref}.gff is not yet downloaded." + if [ ! 
-v family ]; then obtain_family; if [ -z ${family} ]; then return; fi; fi
 if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exist
 echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.gff in /data/bi/references/refgenie/alias/${family}/gff/${ref}."
 digest=$(openssl rand -hex 24)

From 74896c3041348b65b9ce2ddf816675a76c306885 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Tue, 6 Aug 2024 13:06:59 +0200
Subject: [PATCH 130/321] Modified the way the references are found. Replaced refgenie seek with awk in references.txt. Added regeneration of references.txt when new references are included

---
 .../viralrecon/ANALYSIS/lablog_viralrecon | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index e50efbc49..77ab1d0b5 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -164,8 +164,8 @@ check_references() {
 }

 # Check if FASTA sequence is already downloaded
- REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml 2>&1)
- if echo "$REF_FASTA" | grep -q "Traceback"; then
+ REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt)
+ if [ -z "$REF_FASTA" ]; then
 echo "File ${ref}.fasta is not yet downloaded."
 obtain_family; if [ -z $family ]; then return; fi
 if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exists
@@ -177,7 +177,8 @@ check_references() {
 if [ $? -eq 0 ]; then
 echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}"
 refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${family}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml
- REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml)
+ bash /data/bi/references/refgenie/alias/ref.sh
+ REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt)
 else
 echo_blinking_red "An error occurred during file downloading."
 fi
@@ -189,7 +190,8 @@ check_references() {
 if [ $? -eq 0 ]; then
 echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}"
 refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${family}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml
- REF_FASTA=$(refgenie seek ${family}/fasta.fasta:${ref} -c /data/bi/references/refgenie/genome_config.yaml)
+ bash /data/bi/references/refgenie/alias/ref.sh
+ REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt)
 else
 echo_blinking_red "An error occurred during file downloading."
 fi
@@ -199,8 +201,8 @@ check_references() {
 fi

 # Check if GFF file is already downloaded
- REF_GFF=$(refgenie seek ${family}/gff.gff:${ref} -c /data/bi/references/refgenie/genome_config.yaml 2>&1)
- if echo "$REF_GFF" | grep -q "Traceback"; then
+ REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt)
+ if [ -z "$REF_GFF" ]; then
 echo "File ${ref}.gff is not yet downloaded."
 if [ !
-v family ]; then obtain_family; if [ -z ${family} ]; then return; fi; fi
 if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exist
@@ -212,7 +214,8 @@ check_references() {
 if [ $? -eq 0 ]; then
 echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}"
 refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml
- REF_GFF=$(refgenie seek ${family}/gff.gff:${ref} -c /data/bi/references/refgenie/genome_config.yaml)
+ bash /data/bi/references/refgenie/alias/ref.sh
+ REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt)
 else
 echo_blinking_red "An error occurred during file downloading."
 fi
@@ -224,7 +227,8 @@ check_references() {
 if [ $? -eq 0 ]; then
 echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}"
 refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml
+ bash /data/bi/references/refgenie/alias/ref.sh
+ REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt)
 else
 echo_blinking_red "An error occurred during file downloading."
 fi

From 5a026a25792b10a36c262ee1d2689f1fcb4465fd Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Tue, 6 Aug 2024 13:37:03 +0200
Subject: [PATCH 131/321] Added copy stage for config and params files and added date and time to filenames in order to regenerate these config files when this lablog is run more than once, for several analyses

---
 bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index 77ab1d0b5..07946196e 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -242,8 +242,11 @@ check_references() {
 ####################################

 # Setting work variables
-CONFIG_FILE="../DOC/viralrecon.config"
-PARAMS_FILE="../DOC/viralrecon_params.yml"
+timeset=$(date +"%Y-%m-%d_%H-%M-%S")_
+cp ../DOC/viralrecon.config ../DOC/${timeset}_viralrecon.config
+cp ../DOC/viralrecon_params.yml ../DOC/${timeset}_viralrecon_params.yml
+CONFIG_FILE="../DOC/${timeset}_viralrecon.config"
+PARAMS_FILE="../DOC/${timeset}_viralrecon_params.yml"

 # Setting the type of analysis
 echo_bold "\nPlease specify the type of analysis."

From ceb0e062e910ac18b650d03789683d2610b851e7 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Tue, 6 Aug 2024 15:45:07 +0200
Subject: [PATCH 132/321] Replaced refgenie add with refgenie build when including new references.
Added SAMtools module loading --- .../viralrecon/ANALYSIS/lablog_viralrecon | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 07946196e..ade96b6b0 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -34,6 +34,7 @@ echo_green() { echo -e "\e[32m$1\e[0m"; } echo_blinking_red() { echo -e "\e[1;5;97;5;41m$1\e[0m"; } # Loading singularity module +module purge module load singularity singularity_loaded=$(module list | grep singularity | awk '{print $2}') if [ -n "$singularity_loaded" ]; then @@ -168,15 +169,24 @@ check_references() { if [ -z "$REF_FASTA" ]; then echo "File ${ref}.fasta is not yet downloaded." obtain_family; if [ -z $family ]; then return; fi + # Loading SAMtools module + module load SAMtools + SAMtools_loaded=$(module list | grep -o 'SAMtools/[0-9.]\+-GCC-[0-9.]\+') + if [ -n "$SAMtools_loaded" ]; then + echo_green "$SAMtools_loaded module succesfully loaded." + else + echo_blinking_red "SAMtools module not loaded. Exiting..." + exit 1 + fi if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exists echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}." digest=$(openssl rand -hex 24) - refgenie alias set --aliases ${family} --digest ${digest} -f -c /data/bi/references/refgenie/genome_config.yaml mkdir -p /data/bi/references/refgenie/data/${digest}/fasta/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" if [ $? -eq 0 ]; then echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}" - refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${family}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml + gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta + refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else @@ -189,8 +199,8 @@ check_references() { wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" if [ $? 
-eq 0 ]; then echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}" - refgenie add ${family}/fasta:${ref} --path data/${digest}/fasta/${ref}/ --seek-keys '{"fasta" : "'"${family}.fasta"'"}' -c /data/bi/references/refgenie/genome_config.yaml - bash /data/bi/references/refgenie/alias/ref.sh + gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta + refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else echo_blinking_red "An error occurred during file downloading." From 8115d72a7d300f44311e3681c06c4b565810b2e0 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Tue, 6 Aug 2024 17:15:16 +0200 Subject: [PATCH 133/321] Modified final message. Moved module load singularity stage outside defining functions area --- .../viralrecon/ANALYSIS/lablog_viralrecon | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index ade96b6b0..d33037a0a 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -33,17 +33,6 @@ echo_red() { echo -e "\e[31m$1\e[0m"; } echo_green() { echo -e "\e[32m$1\e[0m"; } echo_blinking_red() { echo -e "\e[1;5;97;5;41m$1\e[0m"; } -# Loading singularity module -module purge -module load singularity -singularity_loaded=$(module list | grep singularity | awk '{print $2}') -if [ -n "$singularity_loaded" ]; then - echo_green "$singularity_loaded module succesfully loaded." -else - echo_blinking_red "Singularity module not loaded. Exiting..." - exit 1 -fi - # Updating pangolin. Checks last image available and if is already downloaded. If not, downloads it. This function also updates pangolin database. Update related config files with pangolin info update_pangolin() { @@ -251,12 +240,27 @@ check_references() { } #################################### +# Loading singularity module +module purge +module load singularity +singularity_loaded=$(module list | grep singularity | awk '{print $2}') +if [ -n "$singularity_loaded" ]; then + echo_green "$singularity_loaded module succesfully loaded." + echo -e "$(date +'%Y-%m-%d %H:%M:%S')\t${singularity_loaded} module succesfully loaded." >> $(date '+%Y%m%d')/command.log +else + echo_blinking_red "Singularity module not loaded. Exiting..." + exit 1 +fi + # Setting work variables timeset=$(date +"%Y-%m-%d_%H-%M-%S")_ cp ../DOC/viralrecon.config ../DOC/${timeset}_viralrecon.config cp ../DOC/viralrecon_params.yml ../DOC/${timeset}_viralrecon_params.yml CONFIG_FILE="../DOC/${timeset}_viralrecon.config" PARAMS_FILE="../DOC/${timeset}_viralrecon_params.yml" +echo "Created $CONFIG_FILE file." +echo "Created $PARAMS_FILE file." +echo # Setting the type of analysis echo_bold "\nPlease specify the type of analysis." @@ -459,4 +463,4 @@ rm percentajeNs.py rm _02_create_run_percentage_Ns.sh cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd .. -echo_green "\nLablog_viralrecon execution has been completed successfully!" 
+echo_green "\nLablog_viralrecon execution has been completed. Please verify all the configurations are set up correctly." From c3188b3750fff10aa1b93f7773ca49eb38189c7d Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Tue, 6 Aug 2024 17:48:27 +0200 Subject: [PATCH 134/321] Created lablog_viralrecon.log --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index d33037a0a..468e8cd6a 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -24,6 +24,9 @@ The functions performed by the script can be listed as follows: # If there is more than 1 reference, please prepare the samples_ref.txt file before running this lablog. +echo "Starting lablog_viralrecon execution." +echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tStarting lablog_viralrecon execution." > lablog_viralrecon.log + #################################### # Defining functions @@ -38,6 +41,7 @@ echo_blinking_red() { echo -e "\e[1;5;97;5;41m$1\e[0m"; } update_pangolin() { echo echo_bold "Starting PANGOLIN check/update." + echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tStarting lablog_viralrecon execution." >> lablog_viralrecon.log echo "Checking Pangolin container version..." url=$(curl -s "https://depot.galaxyproject.org/singularity/") latest_version_pangolin=$(echo "$url" | grep -oP 'pangolin:[^"]+' | sort -V | tail -n 1 | awk -F'>' '{print $1}' | sed 's/<\/a//') @@ -246,7 +250,7 @@ module load singularity singularity_loaded=$(module list | grep singularity | awk '{print $2}') if [ -n "$singularity_loaded" ]; then echo_green "$singularity_loaded module succesfully loaded." - echo -e "$(date +'%Y-%m-%d %H:%M:%S')\t${singularity_loaded} module succesfully loaded." >> $(date '+%Y%m%d')/command.log + echo -e "$(date +'%Y-%m-%d %H:%M:%S')\t${singularity_loaded} module succesfully loaded." >> lablog_viralrecon.log else echo_blinking_red "Singularity module not loaded. Exiting..." exit 1 From a3196c623eaeba3ed1ac31608c1daa1bf9a47d16 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Wed, 7 Aug 2024 09:10:05 +0200 Subject: [PATCH 135/321] Added log_message function --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 468e8cd6a..e668e0673 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -30,6 +30,13 @@ echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tStarting lablog_viralrecon execution." 
> #################################### # Defining functions +# Log message saving +log_message() { + local message="$1" + echo "$(date '+%Y-%m-%d %H:%M:%S') - $message" >> ./lablog_viralrecon.log +} + + # Coloring messages echo_bold() { echo -e "\e[1;37m$1\e[0m"; } echo_red() { echo -e "\e[31m$1\e[0m"; } From b7cfcc3670dee256721974c7b7166fcbdf92f5e7 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Wed, 7 Aug 2024 09:56:51 +0200 Subject: [PATCH 136/321] Added log_message to every message in the lablog --- .../viralrecon/ANALYSIS/lablog_viralrecon | 155 ++++++++++-------- 1 file changed, 85 insertions(+), 70 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index e668e0673..47b8ebf7e 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -24,8 +24,6 @@ The functions performed by the script can be listed as follows: # If there is more than 1 reference, please prepare the samples_ref.txt file before running this lablog. -echo "Starting lablog_viralrecon execution." -echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tStarting lablog_viralrecon execution." > lablog_viralrecon.log #################################### # Defining functions @@ -33,7 +31,7 @@ echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tStarting lablog_viralrecon execution." > # Log message saving log_message() { local message="$1" - echo "$(date '+%Y-%m-%d %H:%M:%S') - $message" >> ./lablog_viralrecon.log + echo -e "$(date '+%Y-%m-%d %H:%M:%S') - $message" >> lablog_viralrecon.log } @@ -47,219 +45,233 @@ echo_blinking_red() { echo -e "\e[1;5;97;5;41m$1\e[0m"; } # Updating pangolin. Checks last image available and if is already downloaded. If not, downloads it. This function also updates pangolin database. Update related config files with pangolin info update_pangolin() { echo - echo_bold "Starting PANGOLIN check/update." - echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tStarting lablog_viralrecon execution." >> lablog_viralrecon.log - echo "Checking Pangolin container version..." + echo_bold "Starting PANGOLIN check/update."; log_message "Starting PANGOLIN check/update." + echo "Checking Pangolin container version..."; log_message "Checking Pangolin container version..." url=$(curl -s "https://depot.galaxyproject.org/singularity/") latest_version_pangolin=$(echo "$url" | grep -oP 'pangolin:[^"]+' | sort -V | tail -n 1 | awk -F'>' '{print $1}' | sed 's/<\/a//') echo_bold "Latest version available of Pangolin:\e[1;38;5;220m $latest_version_pangolin"; tput sgr0 + log_message "Latest version available of Pangolin: $latest_version_pangolin" - echo "Checking if latest version of Pangolin image is already downloaded..." + echo "Checking if latest version of Pangolin image is already downloaded..."; log_message "Checking if latest version of Pangolin image is already downloaded..." if [ -e "/data/bi/pipelines/singularity-images/$latest_version_pangolin" ]; then - echo "File $latest_version_pangolin already downloaded." - echo -e "Pangolin container is UP TO DATE. \xE2\x9C\x85" + echo "File $latest_version_pangolin already downloaded."; log_message "File $latest_version_pangolin already downloaded." + echo -e "Pangolin container is UP TO DATE. \xE2\x9C\x85"; log_message "Pangolin container is UP TO DATE. \xE2\x9C\x85" else - echo "Downloading $latest_version_pangolin file..." + echo "Downloading $latest_version_pangolin file..."; log_message "Downloading $latest_version_pangolin file..." 
wget -P "/data/bi/pipelines/singularity-images/" "https://depot.galaxyproject.org/singularity/$latest_version_pangolin" if [ $? -eq 0 ]; then - echo_green "$latest_version_pangolin file succesfully downloaded." + echo_green "$latest_version_pangolin file succesfully downloaded."; log_message "$latest_version_pangolin file succesfully downloaded." else - echo_blinking_red "An error occurred during file downloading." + echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." fi fi # Updating Pangolin database - echo "Setting datadir for Pangolin database." + echo "Setting datadir for Pangolin database."; log_message "Setting datadir for Pangolin database." cd /data/bi/references/pangolin/ if [ -e "./$(date '+%Y%m%d')" ]; then - echo -e "Pangolin database is UP TO DATE. \xE2\x9C\x85" + echo -e "Pangolin database is UP TO DATE. \xE2\x9C\x85"; log_message "Pangolin database is UP TO DATE. \xE2\x9C\x85" else mkdir "$(date '+%Y%m%d')" - echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tmkdir $(date '+%Y%m%d')" >> $(date '+%Y%m%d')/command.log - echo -e "$(date +'%Y-%m-%d %H:%M:%S')\tsrun --partition short_idx --output ${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/)" >> $(date '+%Y%m%d')/command.log + echo -e "$(date +'%Y-%m-%d %H:%M:%S') - mkdir $(date '+%Y%m%d')" >> $(date '+%Y%m%d')/command.log + echo -e "$(date +'%Y-%m-%d %H:%M:%S') - srun --partition short_idx --output ${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/)" >> $(date '+%Y%m%d')/command.log srun --partition short_idx --output ${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/ if [ $? -eq 0 ]; then - echo_green "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" - echo_green "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" + echo_green "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")"; log_message "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" + echo_green "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")"; log_message "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" else - echo_blinking_red "Error during pangolin database update." + echo_blinking_red "Error during pangolin database update."; log_message "Error during pangolin database update." fi fi cd - # Updating config file - echo "Updating $CONFIG_FILE file..." + echo "Updating $CONFIG_FILE file..."; log_message "Updating $CONFIG_FILE file..." sed -i "s|pangolin:4.3--pyhdfd78af_2|$latest_version_pangolin|" "$CONFIG_FILE" sed -i "s|--datadir XXXX|--datadir $(ls -dt /data/bi/references/pangolin/*/ | head -n 1)|" "$CONFIG_FILE" - echo_bold "File $CONFIG_FILE UPDATED." + echo_bold "File $CONFIG_FILE UPDATED."; log_message "File $CONFIG_FILE UPDATED." # Updating params file - echo "Updating $PARAMS_FILE file..." + echo "Updating $PARAMS_FILE file..."; log_message "Updating $PARAMS_FILE file..." 
sed -i "s|skip_pangolin: true|skip_pangolin: false|" "$PARAMS_FILE" - echo_bold "File $PARAMS_FILE UPDATED." + echo_bold "File $PARAMS_FILE UPDATED."; log_message "File $PARAMS_FILE UPDATED." - echo_bold "Finished PANGOLIN check/update" + echo_bold "Finished PANGOLIN check/update"; log_message "Finished PANGOLIN check/update" echo } # Updating Nextclade. Checks last image available and if is already downloaded. If not, downloads it. Update related config files with nextclade info update_nextclade() { echo - echo_bold "Starting NEXTCLADE check/update." - echo "Checking Nextclade container version..." + echo_bold "Starting NEXTCLADE check/update."; log_message "Starting NEXTCLADE check/update." + echo "Checking Nextclade container version..."; log_message "Checking Nextclade container version..." url=$(curl -s "https://depot.galaxyproject.org/singularity/") latest_version_nextclade=$(echo "$url" | grep -oP 'nextclade:[^"]+' | sort -V | tail -n 1 | awk -F'>' '{print $1}' | sed 's/<\/a//') echo_bold "Latest version available of Nextclade:\e[1;38;5;220m $latest_version_nextclade"; tput sgr0 + log_message "Latest version available of Nextclade: $latest_version_nextclade" - echo "Checking if latest version of Nextclade image is already downloaded..." + echo "Checking if latest version of Nextclade image is already downloaded..."; log_message "Checking if latest version of Nextclade image is already downloaded..." if [ -e "/data/bi/pipelines/singularity-images/$latest_version_nextclade" ]; then - echo "File $latest_version_nextclade already downloaded." - echo -e "Nextclade container is UP TO DATE. \xE2\x9C\x85" + echo "File $latest_version_nextclade already downloaded."; log_message "File $latest_version_nextclade already downloaded." + echo -e "Nextclade container is UP TO DATE. \xE2\x9C\x85"; log_message "Nextclade container is UP TO DATE. \xE2\x9C\x85" else - echo "Downloading $latest_version_nextclade file..." + echo "Downloading $latest_version_nextclade file..."; log_message "Downloading $latest_version_nextclade file..." wget -P "/data/bi/pipelines/singularity-images" "https://depot.galaxyproject.org/singularity/$latest_version_nextclade" if [ $? -eq 0 ]; then - echo_green "$latest_version_nextclade file succesfully downloaded." + echo_green "$latest_version_nextclade file succesfully downloaded."; log_message "$latest_version_nextclade file succesfully downloaded." else - echo_blinking_red "An error occurred during file downloading." + echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." fi fi # Extracting the current Nextclade data TAG - echo "Extracting Nextclade data TAG..." + echo "Extracting Nextclade data TAG..."; log_message "Extracting Nextclade data TAG..." nextclade_tag=$(singularity run /data/bi/pipelines/singularity-images/$latest_version_nextclade nextclade dataset list --json | grep -zoP "\"path\":\s*\"nextstrain/${virus_tag}[^\"]*\"[\s\S]*?\"tag\":\s*\"\K[^\"]*" | tr '\0' '\n' | head -n 1) echo_bold "Latest \e[1;38;5;220m${virus_tag^^} \e[1;37mNextclade dataset version TAG:\e[1;38;5;220m $nextclade_tag"; tput sgr0 + log_message "Latest ${virus_tag^^} Nextclade dataset version TAG: $nextclade_tag" # Updating config file - echo "Updating $CONFIG_FILE file..." + echo "Updating $CONFIG_FILE file..."; log_message "Updating $CONFIG_FILE file..." sed -i "s|nextclade:3.5.0--h9ee0642_0|$latest_version_nextclade|" "$CONFIG_FILE" - echo_bold "File $CONFIG_FILE UPDATED." 
+ echo_bold "File $CONFIG_FILE UPDATED."; log_message "File $CONFIG_FILE UPDATED." # Updating params file - echo "Updating $PARAMS_FILE file..." + echo "Updating $PARAMS_FILE file..."; log_message "Updating $PARAMS_FILE file..." sed -i "s|skip_nextclade: true|skip_nextclade: false|" "$PARAMS_FILE" echo "nextclade_dataset: false" >> $PARAMS_FILE - echo_bold "File $PARAMS_FILE UPDATED." + echo_bold "File $PARAMS_FILE UPDATED."; log_message "File $PARAMS_FILE UPDATED." - echo_bold "Finished NEXTCLADE check/update" + echo_bold "Finished NEXTCLADE check/update"; log_message "Finished NEXTCLADE check/update" echo } # Checks if fasta and gff references are downloaded. If not, it downloads them (and creates family folder if neccesary) check_references() { echo - echo_bold "Processing reference: ${ref}." + echo_bold "Processing reference: ${ref}."; log_message "Processing reference: ${ref}." # Obtaining family information obtain_family() { organism_id=$(curl -s "https://www.ncbi.nlm.nih.gov/nuccore/${ref}" | grep -o 'ORGANISM=[0-9]\+' | head -n 1 | awk -F '=' '{print $2}') if [ -z $organism_id ]; then - echo_blinking_red "$ref not found in NCBI. Please download it manually." + echo_blinking_red "$ref not found in NCBI. Please download it manually."; log_message "$ref not found in NCBI. Please download it manually." return fi family=$(curl -s "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=${organism_id}" | grep -o 'TITLE="family">.*<' | awk -F 'TITLE="family">' '{print $2}' | cut -d '<' -f 1 | tr '[:upper:]' '[:lower:]') if [ -z $family ]; then family=$(curl -s "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=${organism_id}" | grep -o 'ALT="family">.*<' | awk -F 'ALT="family">' '{print $2}' | cut -d '<' -f 1 | tr '[:upper:]' '[:lower:]') fi - echo "Reference $ref belongs to $family family." + echo "Reference $ref belongs to $family family."; log_message "Reference $ref belongs to $family family." } # Check if FASTA sequence is already downloaded REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) if [ -z "$REF_FASTA" ]; then - echo "File ${ref}.fasta is not yet downloaded." + echo "File ${ref}.fasta is not yet downloaded."; log_message "File ${ref}.fasta is not yet downloaded." obtain_family; if [ -z $family ]; then return; fi # Loading SAMtools module module load SAMtools SAMtools_loaded=$(module list | grep -o 'SAMtools/[0-9.]\+-GCC-[0-9.]\+') if [ -n "$SAMtools_loaded" ]; then - echo_green "$SAMtools_loaded module succesfully loaded." + echo_green "$SAMtools_loaded module succesfully loaded."; log_message "$SAMtools_loaded module succesfully loaded." else - echo_blinking_red "SAMtools module not loaded. Exiting..." + echo_blinking_red "SAMtools module not loaded. Exiting..."; log_message "SAMtools module not loaded. Exiting..." exit 1 fi if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exists echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}." + log_message "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}." 
digest=$(openssl rand -hex 24) mkdir -p /data/bi/references/refgenie/data/${digest}/fasta/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" if [ $? -eq 0 ]; then - echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}" + echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." + log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else - echo_blinking_red "An error occurred during file downloading." + echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." fi else echo "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.fasta." + log_message "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.fasta." digest=$(refgenie alias get -a ${family} -c /data/bi/references/refgenie/genome_config.yaml) mkdir -p /data/bi/references/refgenie/data/${digest}/fasta/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" if [ $? -eq 0 ]; then - echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}" + echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." + log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else - echo_blinking_red "An error occurred during file downloading." + echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." fi fi else - echo -e "File ${ref}.fasta is ALREADY available in $(dirname $REF_FASTA). \xE2\x9C\x85" + echo -e "File ${ref}.fasta is ALREADY available in $(dirname $REF_FASTA). \xE2\x9C\x85"; log_message "File ${ref}.fasta is ALREADY available in $(dirname $REF_FASTA). \xE2\x9C\x85" fi # Check if GFF file is already downloaded REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) if [ -z "$REF_GFF" ]; then - echo "File ${ref}.gff is not yet downloaded." + echo "File ${ref}.gff is not yet downloaded."; log_message "File ${ref}.gff is not yet downloaded." if [ ! -v family ]; then obtain_family; if [ -z ${family} ]; then return; fi; fi if [ ! 
-e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exist echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.gff in /data/bi/references/refgenie/alias/${family}/gff/${ref}." + log_message "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.gff in /data/bi/references/refgenie/alias/${family}/gff/${ref}." digest=$(openssl rand -hex 24) refgenie alias set --aliases ${family} --digest ${digest} -f -c /data/bi/references/refgenie/genome_config.yaml mkdir -p /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}" if [ $? -eq 0 ]; then - echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}" + echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." + log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml bash /data/bi/references/refgenie/alias/ref.sh REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else - echo_blinking_red "An error occurred during file downloading." + echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." fi else echo "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.gff." + log_message "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.gff." digest=$(refgenie alias get -a ${family} -c /data/bi/references/refgenie/genome_config.yaml) mkdir -p /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}" if [ $? -eq 0 ]; then - echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}" + echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." + log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml bash /data/bi/references/refgenie/alias/ref.sh REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else - echo_blinking_red "An error occurred during file downloading." + echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." fi fi else - echo -e "File ${ref}.gff is ALREADY available in $(dirname $REF_GFF). \xE2\x9C\x85" + echo -e "File ${ref}.gff is ALREADY available in $(dirname $REF_GFF). \xE2\x9C\x85"; log_message "File ${ref}.gff is ALREADY available in $(dirname $REF_GFF). \xE2\x9C\x85" fi unset family } #################################### + +echo_bold "Starting lablog_viralrecon execution." 
+echo -e "$(date +'%Y-%m-%d %H:%M:%S') - Starting lablog_viralrecon execution." > lablog_viralrecon.log + # Loading singularity module -module purge module load singularity singularity_loaded=$(module list | grep singularity | awk '{print $2}') if [ -n "$singularity_loaded" ]; then echo_green "$singularity_loaded module succesfully loaded." - echo -e "$(date +'%Y-%m-%d %H:%M:%S')\t${singularity_loaded} module succesfully loaded." >> lablog_viralrecon.log + log_message "${singularity_loaded} module succesfully loaded." else echo_blinking_red "Singularity module not loaded. Exiting..." + log_message "Singularity module not loaded. Exiting..." exit 1 fi @@ -269,8 +281,8 @@ cp ../DOC/viralrecon.config ../DOC/${timeset}_viralrecon.config cp ../DOC/viralrecon_params.yml ../DOC/${timeset}_viralrecon_params.yml CONFIG_FILE="../DOC/${timeset}_viralrecon.config" PARAMS_FILE="../DOC/${timeset}_viralrecon_params.yml" -echo "Created $CONFIG_FILE file." -echo "Created $PARAMS_FILE file." +echo "Created $CONFIG_FILE file."; log_message "Created $CONFIG_FILE file." +echo "Created $PARAMS_FILE file."; log_message "Created $PARAMS_FILE file." echo # Setting the type of analysis @@ -281,11 +293,11 @@ while true; do echo -ne "\e[1;38;5;220m"; read -n 1 ANALYSIS_TYPE; tput sgr0; echo if [ "$ANALYSIS_TYPE" == "1" ]; then ANALYSIS_TYPE="METAGENOMIC" - echo_green "$ANALYSIS_TYPE analysis selected." + echo_green "$ANALYSIS_TYPE analysis selected."; log_message "$ANALYSIS_TYPE analysis selected." break elif [ "$ANALYSIS_TYPE" == "2" ]; then ANALYSIS_TYPE="AMPLICONS" - echo_green "$ANALYSIS_TYPE analysis selected." + echo_green "$ANALYSIS_TYPE analysis selected."; log_message "$ANALYSIS_TYPE analysis selected." break else echo_red "Invalid input. Please enter 1 or 2." @@ -299,15 +311,15 @@ echo_bold "\nPlease specify the method to be performed." while true; do echo -ne "\e[1;38;5;220m"; read -n 1 method; tput sgr0; echo if [ "$method" == "1" ]; then - echo_green "Mapping method selected." + echo_green "Mapping method selected."; log_message "Mapping method selected." break elif [ "$method" == "2" ]; then - echo_green "De novo assembly method selected." + echo_green "De novo assembly method selected."; log_message "De novo assembly method selected." sed -i "s|skip_assembly: true|skip_assembly: false|" "$PARAMS_FILE" sed -i "s|skip_variants: false|skip_variants: true|" "$PARAMS_FILE" break elif [ "$method" == "3" ]; then - echo_green "Mapping + de novo assembly methods selected." + echo_green "Mapping + de novo assembly methods selected."; log_message "Mapping + de novo assembly methods selected." sed -i "s|skip_assembly: true|skip_assembly: false|" "$PARAMS_FILE" break else @@ -320,17 +332,17 @@ echo_bold "\nPlease specify the method to be performed." echo read -p $'\e[1;37mIs samples_ref.txt file already prepared? [y/N]: \e[1;38;5;220m' -n 1 samples_ref_prepared; tput sgr0; echo if [ "$samples_ref_prepared" == "y" ]; then - echo -e "File samples_ref.txt READY. \xE2\x9C\x85" + echo -e "File samples_ref.txt READY. \xE2\x9C\x85"; log_message "File samples_ref.txt READY. \xE2\x9C\x85" else : > samples_ref.txt - echo "File samples_ref NOT prepared." + echo "File samples_ref NOT prepared."; log_message "File samples_ref NOT prepared." 
while [ -z "$host" ] || [ -z "$reference" ] || [ "$answer" = "n" ]; do read -p $'\e[1;37mPlease specify the host: \e[1;38;5;220m' host read -p $'\e[1;37mPlease specify the reference: \e[1;38;5;220m' reference read -p $'\e[1;37mAre host [\e[1;38;5;220m'"${host^^}"$'\e[1;37m] and reference [\e[1;38;5;220m'"${reference}"$'\e[1;37m] correct? [Y/n]: \e[1;38;5;220m' -n 1 answer; tput sgr0; echo done while read in; do echo -e "${in}\t${reference}\t${host^^}" >> samples_ref.txt; done < samples_id.txt - echo -e "File samples_ref.txt READY. \xE2\x9C\x85" + echo -e "File samples_ref.txt READY. \xE2\x9C\x85"; log_message "File samples_ref.txt READY. \xE2\x9C\x85. Host: ${host^^}. Reference: ${reference}." fi @@ -361,15 +373,15 @@ else echo -ne "\e[1;38;5;220m"; read -n 1 virus_tag; tput sgr0; echo if [ "$virus_tag" == "1" ]; then virus_tag="sars-cov-2" - echo_green "${virus_tag^^} virus selected." + echo_green "${virus_tag^^} virus selected."; log_message "${virus_tag^^} virus selected." break elif [ "$virus_tag" == "2" ]; then virus_tag="rsv" - echo_green "${virus_tag^^} virus selected." + echo_green "${virus_tag^^} virus selected."; log_message "${virus_tag^^} virus selected." break elif [ "$virus_tag" == "3" ]; then virus_tag="Other" - echo_green "$virus_tag virus selected." + echo_green "$virus_tag virus selected."; log_message "$virus_tag virus selected." break else echo_red "Invalid input. Please select a valid number." @@ -389,10 +401,12 @@ else update_nextclade echo_bold "\nRemember to provide the complete route to primer_bed and primer_fasta files, and specify the nextclade_dataset_name in every sbatch file before running the pipeline." + log_message "Remember to provide the complete route to primer_bed and primer_fasta files, and specify the nextclade_dataset_name in every sbatch file before running the pipeline." else echo "primer_bed: '../REFERENCES/XXXX'" >> $PARAMS_FILE echo_bold "\nRemember to provide the complete route to PRIMER_BED file in $PARAMS_FILE file before running the pipeline." + log_message "Remember to provide the complete route to PRIMER_BED file in $PARAMS_FILE file before running the pipeline." fi fi @@ -475,3 +489,4 @@ rm _02_create_run_percentage_Ns.sh cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd .. echo_green "\nLablog_viralrecon execution has been completed. Please verify all the configurations are set up correctly." +log_message "Lablog_viralrecon execution has been completed. Please verify all the configurations are set up correctly." 
From 4385ce50d7fa560e0e1e0f323f0bb93b082a1f79 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Wed, 7 Aug 2024 10:00:42 +0200 Subject: [PATCH 137/321] Fixed typo --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 47b8ebf7e..7587b64da 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -276,7 +276,7 @@ else fi # Setting work variables -timeset=$(date +"%Y-%m-%d_%H-%M-%S")_ +timeset=$(date +"%Y-%m-%d_%H-%M-%S") cp ../DOC/viralrecon.config ../DOC/${timeset}_viralrecon.config cp ../DOC/viralrecon_params.yml ../DOC/${timeset}_viralrecon_params.yml CONFIG_FILE="../DOC/${timeset}_viralrecon.config" From caf50f31cf64447c61eb7f0d118fae73ad0e6b11 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Wed, 7 Aug 2024 13:17:29 +0200 Subject: [PATCH 138/321] Fixed log_message function when used outside service folder (pangolin database update) --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 7587b64da..2f8c29a7b 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -29,9 +29,10 @@ The functions performed by the script can be listed as follows: # Defining functions # Log message saving +current_dir=$PWD log_message() { local message="$1" - echo -e "$(date '+%Y-%m-%d %H:%M:%S') - $message" >> lablog_viralrecon.log + echo -e "$(date '+%Y-%m-%d %H:%M:%S') - $message" >> $current_dir/lablog_viralrecon.log } From 76581f4784d84294ccd285eb58b9d83d1c4e386c Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Wed, 7 Aug 2024 14:01:52 +0200 Subject: [PATCH 139/321] Fixed bug in check_references function --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 1 + 1 file changed, 1 insertion(+) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 2f8c29a7b..1a245b7bc 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -206,6 +206,7 @@ check_references() { log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R bash /data/bi/references/refgenie/alias/ref.sh + bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." 
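A note on the fix in [PATCH 138/321]: update_pangolin() changes into the pangolin references directory before logging, so a log_message() that appends to a relative lablog_viralrecon.log would silently start writing inside that directory instead of the service folder. Capturing $PWD once at startup pins the log to an absolute path. A minimal sketch of the pitfall and the remedy, with illustrative paths rather than the real service layout:

# Sketch of the relative-path logging pitfall (illustrative paths only).
log_relative() { echo "$1" >> lablog_viralrecon.log; }                  # resolves against $PWD at call time

current_dir=$PWD                                                        # captured once, before any cd
log_absolute() { echo "$1" >> "${current_dir}/lablog_viralrecon.log"; } # always the startup directory

log_relative "message one"    # written to ./lablog_viralrecon.log
cd /tmp                       # like update_pangolin() entering the pangolin references dir
log_relative "message two"    # now written to /tmp/lablog_viralrecon.log, the wrong file
log_absolute "message three"  # still appended to the original log
cd - > /dev/null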
From c86be8204bb33866477c5eee3e85a2420b171e9b Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Wed, 7 Aug 2024 14:16:10 +0200 Subject: [PATCH 140/321] Added some messages when building and adding new references --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 1a245b7bc..294dbedfe 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -189,6 +189,7 @@ check_references() { echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta + echo "Building asset for ${ref}.fasta file..."; log_message "Building asset for ${ref}.fasta file..." refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) @@ -205,6 +206,7 @@ check_references() { echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta + echo "Building asset for ${ref}.fasta file..."; log_message "Building asset for ${ref}.fasta file..." refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R bash /data/bi/references/refgenie/alias/ref.sh bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) @@ -231,6 +233,7 @@ check_references() { if [ $? -eq 0 ]; then echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." + echo "Adding asset for ${ref}.gff file..."; log_message "Building asset for ${ref}.gff file..." refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml bash /data/bi/references/refgenie/alias/ref.sh REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) @@ -246,6 +249,7 @@ check_references() { if [ $? -eq 0 ]; then echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." + echo "Adding asset for ${ref}.gff file..."; log_message "Building asset for ${ref}.gff file..." 
refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml bash /data/bi/references/refgenie/alias/ref.sh REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) From b22611af202916761aada880f135e3f3b3bf7e4a Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Wed, 7 Aug 2024 15:51:42 +0200 Subject: [PATCH 141/321] Added srun to refgenie build and refgenie add functions. Created prompt and log messages --- .../viralrecon/ANALYSIS/lablog_viralrecon | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 294dbedfe..e293974d6 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -190,7 +190,9 @@ check_references() { log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta echo "Building asset for ${ref}.fasta file..."; log_message "Building asset for ${ref}.fasta file..." - refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R + srun --partition short_idx --output ${ref}.fasta_build.log refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R + echo_bold "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" + log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else @@ -207,7 +209,9 @@ check_references() { log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta echo "Building asset for ${ref}.fasta file..."; log_message "Building asset for ${ref}.fasta file..." - refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R bash /data/bi/references/refgenie/alias/ref.sh + srun --partition short_idx --output ${ref}.fasta_build.log refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R + echo_bold "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" + log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else @@ -233,8 +237,10 @@ check_references() { if [ $? -eq 0 ]; then echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." 
log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." - echo "Adding asset for ${ref}.gff file..."; log_message "Building asset for ${ref}.gff file..." - refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml + echo "Adding asset for ${ref}.gff file..."; log_message "Adding asset for ${ref}.gff file..." + srun --partition short_idx --output ${ref}.gff_add.log refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml + echo_bold "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" + log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" bash /data/bi/references/refgenie/alias/ref.sh REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else @@ -249,8 +255,10 @@ check_references() { if [ $? -eq 0 ]; then echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." - echo "Adding asset for ${ref}.gff file..."; log_message "Building asset for ${ref}.gff file..." - refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml + echo "Adding asset for ${ref}.gff file..."; log_message "Adding asset for ${ref}.gff file..." + srun --partition short_idx --output ${ref}.gff_add.log refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml + echo_bold "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" + log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" bash /data/bi/references/refgenie/alias/ref.sh REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else From 8e846ffd45a2754bad4009746d1771cabdaf4354 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Thu, 8 Aug 2024 13:07:03 +0200 Subject: [PATCH 142/321] Resolved CHANGELOG.md conflict --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 13c20e788..35b6a0478 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,6 +58,7 @@ Code contributions to the new version: - Fixed 99-stats (MAG) template. 
[#301](https://github.com/BU-ISCIII/buisciii-tools/pull/301)
- Fixed IRMA's lablog so that the sequences of the samples are not displayed several times neither in the .txt files of each influenza type nor in all_samples_completo.txt [#305](https://github.com/BU-ISCIII/buisciii-tools/pull/305)
- Modified bioinfo_doc.py so that new lines in the delivery message are applied in the email [#307](https://github.com/BU-ISCIII/buisciii-tools/pull/307)
+- Added several improvements in lablog_viralrecon (created log files, modified check_references function behaviour, enabled config files regeneration) [#306](https://github.com/BU-ISCIII/buisciii-tools/pull/306)

### Modules

From 1638ad84fca4cfe457120d7ef585f1a17743da74 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 8 Aug 2024 12:55:03 +0200
Subject: [PATCH 143/321] Modified message when pangolin-data is already updated. Added confirmation messages using grep (pangolin and constellations) with logfile inside database folder

---
bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index e293974d6..bc6d04c1a 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -71,7 +71,10 @@ update_pangolin() {
echo "Setting datadir for Pangolin database."; log_message "Setting datadir for Pangolin database."
cd /data/bi/references/pangolin/
if [ -e "./$(date '+%Y%m%d')" ]; then
- echo -e "Pangolin database is UP TO DATE. \xE2\x9C\x85"; log_message "Pangolin database is UP TO DATE. \xE2\x9C\x85"
+ echo -e "Directory /data/bi/references/pangolin/$(date '+%Y%m%d') already exists. Assuming that a BU-ISCIII member previously updated pangolin database today. \xE2\x9C\x85"
+ log_message "Directory /data/bi/references/pangolin/$(date '+%Y%m%d') already exists. Assuming that a BU-ISCIII member previously updated pangolin database today. \xE2\x9C\x85"
+ echo_green "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")"; log_message "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")"
+ echo_green "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")"; log_message "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")"
else
mkdir "$(date '+%Y%m%d')"
echo -e "$(date +'%Y-%m-%d %H:%M:%S') - mkdir $(date '+%Y%m%d')" >> $(date '+%Y%m%d')/command.log

From 9792924d3f15fa47bf2491122238f5de8b52d371 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Thu, 8 Aug 2024 13:05:25 +0200
Subject: [PATCH 144/321] Added checking stage for proper performance of refgenie build and refgenie add functions

---
.../viralrecon/ANALYSIS/lablog_viralrecon | 52 +++++++++++++------
1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index bc6d04c1a..3ee0241de 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -194,10 +194,15 @@ check_references() {
gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta
echo "Building asset for ${ref}.fasta file..."; log_message "Building asset for ${ref}.fasta file..."
srun --partition short_idx --output ${ref}.fasta_build.log refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R - echo_bold "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" - log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" - bash /data/bi/references/refgenie/alias/ref.sh - REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) + if [ $? -eq 0 ]; then + echo_bold "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" + log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" + bash /data/bi/references/refgenie/alias/ref.sh + REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) + else + echo_blinking_red "An error ocurred during building asset for ${ref}.fasta file." + log_message "An error ocurred during building asset for ${ref}.fasta file." + fi else echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." fi @@ -213,10 +218,15 @@ check_references() { gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta echo "Building asset for ${ref}.fasta file..."; log_message "Building asset for ${ref}.fasta file..." srun --partition short_idx --output ${ref}.fasta_build.log refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R - echo_bold "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" - log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" - bash /data/bi/references/refgenie/alias/ref.sh - REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) + if [ $? -eq 0 ]; then + echo_bold "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" + log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" + bash /data/bi/references/refgenie/alias/ref.sh + REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) + else + echo_blinking_red "An error ocurred during building asset for ${ref}.fasta file." + log_message "An error ocurred during building asset for ${ref}.fasta file." + fi else echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." fi @@ -242,10 +252,15 @@ check_references() { log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." echo "Adding asset for ${ref}.gff file..."; log_message "Adding asset for ${ref}.gff file..." 
srun --partition short_idx --output ${ref}.gff_add.log refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml - echo_bold "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" - log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" - bash /data/bi/references/refgenie/alias/ref.sh - REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) + if [ $? -eq 0 ]; then + echo_bold "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" + log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" + bash /data/bi/references/refgenie/alias/ref.sh + REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) + else + echo_blinking_red "An error ocurred during adding asset for ${ref}.gff file." + log_message "An error ocurred during adding asset for ${ref}.gff file." + fi else echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." fi @@ -260,10 +275,15 @@ check_references() { log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." echo "Adding asset for ${ref}.gff file..."; log_message "Adding asset for ${ref}.gff file..." srun --partition short_idx --output ${ref}.gff_add.log refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml - echo_bold "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" - log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" - bash /data/bi/references/refgenie/alias/ref.sh - REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) + if [ $? -eq 0 ]; then + echo_bold "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" + log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" + bash /data/bi/references/refgenie/alias/ref.sh + REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) + else + echo_blinking_red "An error ocurred during adding asset for ${ref}.gff file." + log_message "An error ocurred during adding asset for ${ref}.gff file." + fi else echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." fi From b1586df8ae47c959f52c9cae7fd4094eba8511f5 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Thu, 8 Aug 2024 16:36:24 +0200 Subject: [PATCH 145/321] Updated log_message function including prompt message in terminal. 
Removed all duplicated messages --- .../viralrecon/ANALYSIS/lablog_viralrecon | 223 +++++++++--------- 1 file changed, 110 insertions(+), 113 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index 3ee0241de..e705f12de 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -28,11 +28,32 @@ The functions performed by the script can be listed as follows: #################################### # Defining functions -# Log message saving -current_dir=$PWD +# Coloring messages and log saving +logfile=$(echo "$PWD/lablog_viralrecon.log") log_message() { local message="$1" - echo -e "$(date '+%Y-%m-%d %H:%M:%S') - $message" >> $current_dir/lablog_viralrecon.log + case "$2" in + "bold") + echo -e "\e[1;37m$message\e[0m" + echo -e "$(date '+%Y-%m-%d %H:%M:%S') - \e[1;37m$message\e[0m" >> $logfile + ;; + "red") + echo -e "\e[31m$message\e[0m" + echo -e "$(date '+%Y-%m-%d %H:%M:%S') - \e[31m$message\e[0m" >> $logfile + ;; + "green") + echo -e "\e[32m$message\e[0m" + echo -e "$(date '+%Y-%m-%d %H:%M:%S') - \e[32m$message\e[0m" >> $logfile + ;; + "blk_red") + echo -e "\e[1;5;97;5;41m$message\e[0m" + echo -e "$(date '+%Y-%m-%d %H:%M:%S') - \e[1;5;97;5;41m$message\e[0m" >> $logfile + ;; + *) + echo -e "$message" + echo -e "$(date '+%Y-%m-%d %H:%M:%S') - $message" >> $logfile + ;; + esac } @@ -46,250 +67,230 @@ echo_blinking_red() { echo -e "\e[1;5;97;5;41m$1\e[0m"; } # Updating pangolin. Checks last image available and if is already downloaded. If not, downloads it. This function also updates pangolin database. Update related config files with pangolin info update_pangolin() { echo - echo_bold "Starting PANGOLIN check/update."; log_message "Starting PANGOLIN check/update." - echo "Checking Pangolin container version..."; log_message "Checking Pangolin container version..." + log_message "Starting PANGOLIN check/update." bold + log_message "Checking Pangolin container version..." url=$(curl -s "https://depot.galaxyproject.org/singularity/") latest_version_pangolin=$(echo "$url" | grep -oP 'pangolin:[^"]+' | sort -V | tail -n 1 | awk -F'>' '{print $1}' | sed 's/<\/a//') - echo_bold "Latest version available of Pangolin:\e[1;38;5;220m $latest_version_pangolin"; tput sgr0 - log_message "Latest version available of Pangolin: $latest_version_pangolin" + log_message "Latest version available of Pangolin:\e[1;38;5;220m $latest_version_pangolin" bold - echo "Checking if latest version of Pangolin image is already downloaded..."; log_message "Checking if latest version of Pangolin image is already downloaded..." + log_message "Checking if latest version of Pangolin image is already downloaded..." if [ -e "/data/bi/pipelines/singularity-images/$latest_version_pangolin" ]; then - echo "File $latest_version_pangolin already downloaded."; log_message "File $latest_version_pangolin already downloaded." - echo -e "Pangolin container is UP TO DATE. \xE2\x9C\x85"; log_message "Pangolin container is UP TO DATE. \xE2\x9C\x85" + log_message "File $latest_version_pangolin already downloaded." + log_message "Pangolin container is UP TO DATE. \xE2\x9C\x85" else - echo "Downloading $latest_version_pangolin file..."; log_message "Downloading $latest_version_pangolin file..." + log_message "Downloading $latest_version_pangolin file..." 
wget -P "/data/bi/pipelines/singularity-images/" "https://depot.galaxyproject.org/singularity/$latest_version_pangolin" if [ $? -eq 0 ]; then - echo_green "$latest_version_pangolin file succesfully downloaded."; log_message "$latest_version_pangolin file succesfully downloaded." + log_message "$latest_version_pangolin file succesfully downloaded." green else - echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." + log_message "An error occurred during file downloading." blk_red fi fi # Updating Pangolin database - echo "Setting datadir for Pangolin database."; log_message "Setting datadir for Pangolin database." + log_message "Setting datadir for Pangolin database." cd /data/bi/references/pangolin/ if [ -e "./$(date '+%Y%m%d')" ]; then - echo -e "Directory /data/bi/references/pangolin/$(date '+%Y%m%d') already exists. Assuming that a BU-ISCIII member previously updated pangolin database today. \xE2\x9C\x85" log_message "Directory /data/bi/references/pangolin/$(date '+%Y%m%d') already exists. Assuming that a BU-ISCIII member previously updated pangolin database today. \xE2\x9C\x85" - echo_green "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")"; log_message "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" - echo_green "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")"; log_message "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" + log_message "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" green + log_message "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" green else mkdir "$(date '+%Y%m%d')" echo -e "$(date +'%Y-%m-%d %H:%M:%S') - mkdir $(date '+%Y%m%d')" >> $(date '+%Y%m%d')/command.log echo -e "$(date +'%Y-%m-%d %H:%M:%S') - srun --partition short_idx --output ${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/)" >> $(date '+%Y%m%d')/command.log srun --partition short_idx --output ${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log singularity run -B ${PWD} /data/bi/pipelines/singularity-images/$latest_version_pangolin pangolin --update-data --datadir ${PWD}/$(date '+%Y%m%d')/ if [ $? -eq 0 ]; then - echo_green "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")"; log_message "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" - echo_green "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")"; log_message "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" + log_message "$(grep pangolin "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" green + log_message "$(grep constellations "${PWD}/$(date '+%Y%m%d')/$(date '+%Y%m%d')_pangolin.log")" green else - echo_blinking_red "Error during pangolin database update."; log_message "Error during pangolin database update." + log_message "Error during pangolin database update." blk_red fi fi cd - # Updating config file - echo "Updating $CONFIG_FILE file..."; log_message "Updating $CONFIG_FILE file..." + log_message "Updating $CONFIG_FILE file..." 
sed -i "s|pangolin:4.3--pyhdfd78af_2|$latest_version_pangolin|" "$CONFIG_FILE" sed -i "s|--datadir XXXX|--datadir $(ls -dt /data/bi/references/pangolin/*/ | head -n 1)|" "$CONFIG_FILE" - echo_bold "File $CONFIG_FILE UPDATED."; log_message "File $CONFIG_FILE UPDATED." + log_message "File $CONFIG_FILE UPDATED." # Updating params file - echo "Updating $PARAMS_FILE file..."; log_message "Updating $PARAMS_FILE file..." + log_message "Updating $PARAMS_FILE file..." sed -i "s|skip_pangolin: true|skip_pangolin: false|" "$PARAMS_FILE" - echo_bold "File $PARAMS_FILE UPDATED."; log_message "File $PARAMS_FILE UPDATED." + log_message "File $PARAMS_FILE UPDATED." - echo_bold "Finished PANGOLIN check/update"; log_message "Finished PANGOLIN check/update" + log_message "Finished PANGOLIN check/update" bold echo } # Updating Nextclade. Checks last image available and if is already downloaded. If not, downloads it. Update related config files with nextclade info update_nextclade() { echo - echo_bold "Starting NEXTCLADE check/update."; log_message "Starting NEXTCLADE check/update." - echo "Checking Nextclade container version..."; log_message "Checking Nextclade container version..." + log_message "Starting NEXTCLADE check/update." bold + log_message "Checking Nextclade container version..." url=$(curl -s "https://depot.galaxyproject.org/singularity/") latest_version_nextclade=$(echo "$url" | grep -oP 'nextclade:[^"]+' | sort -V | tail -n 1 | awk -F'>' '{print $1}' | sed 's/<\/a//') - echo_bold "Latest version available of Nextclade:\e[1;38;5;220m $latest_version_nextclade"; tput sgr0 - log_message "Latest version available of Nextclade: $latest_version_nextclade" + log_message "Latest version available of Nextclade:\e[1;38;5;220m $latest_version_nextclade" - echo "Checking if latest version of Nextclade image is already downloaded..."; log_message "Checking if latest version of Nextclade image is already downloaded..." + log_message "Checking if latest version of Nextclade image is already downloaded..." if [ -e "/data/bi/pipelines/singularity-images/$latest_version_nextclade" ]; then - echo "File $latest_version_nextclade already downloaded."; log_message "File $latest_version_nextclade already downloaded." - echo -e "Nextclade container is UP TO DATE. \xE2\x9C\x85"; log_message "Nextclade container is UP TO DATE. \xE2\x9C\x85" + log_message "File $latest_version_nextclade already downloaded." + log_message "Nextclade container is UP TO DATE. \xE2\x9C\x85" else - echo "Downloading $latest_version_nextclade file..."; log_message "Downloading $latest_version_nextclade file..." + log_message "Downloading $latest_version_nextclade file..." wget -P "/data/bi/pipelines/singularity-images" "https://depot.galaxyproject.org/singularity/$latest_version_nextclade" if [ $? -eq 0 ]; then - echo_green "$latest_version_nextclade file succesfully downloaded."; log_message "$latest_version_nextclade file succesfully downloaded." + log_message "$latest_version_nextclade file succesfully downloaded." green else - echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." + log_message "An error occurred during file downloading." blk_red fi fi # Extracting the current Nextclade data TAG - echo "Extracting Nextclade data TAG..."; log_message "Extracting Nextclade data TAG..." + log_message "Extracting Nextclade data TAG..." 
nextclade_tag=$(singularity run /data/bi/pipelines/singularity-images/$latest_version_nextclade nextclade dataset list --json | grep -zoP "\"path\":\s*\"nextstrain/${virus_tag}[^\"]*\"[\s\S]*?\"tag\":\s*\"\K[^\"]*" | tr '\0' '\n' | head -n 1) - echo_bold "Latest \e[1;38;5;220m${virus_tag^^} \e[1;37mNextclade dataset version TAG:\e[1;38;5;220m $nextclade_tag"; tput sgr0 - log_message "Latest ${virus_tag^^} Nextclade dataset version TAG: $nextclade_tag" + log_message "Latest \e[1;38;5;220m${virus_tag^^} \e[1;37mNextclade dataset version TAG:\e[1;38;5;220m $nextclade_tag" bold # Updating config file - echo "Updating $CONFIG_FILE file..."; log_message "Updating $CONFIG_FILE file..." + log_message "Updating $CONFIG_FILE file..." sed -i "s|nextclade:3.5.0--h9ee0642_0|$latest_version_nextclade|" "$CONFIG_FILE" - echo_bold "File $CONFIG_FILE UPDATED."; log_message "File $CONFIG_FILE UPDATED." + log_message "File $CONFIG_FILE UPDATED." # Updating params file - echo "Updating $PARAMS_FILE file..."; log_message "Updating $PARAMS_FILE file..." + log_message "Updating $PARAMS_FILE file..." sed -i "s|skip_nextclade: true|skip_nextclade: false|" "$PARAMS_FILE" echo "nextclade_dataset: false" >> $PARAMS_FILE - echo_bold "File $PARAMS_FILE UPDATED."; log_message "File $PARAMS_FILE UPDATED." + log_message "File $PARAMS_FILE UPDATED." - echo_bold "Finished NEXTCLADE check/update"; log_message "Finished NEXTCLADE check/update" + log_message "Finished NEXTCLADE check/update" bold echo } # Checks if fasta and gff references are downloaded. If not, it downloads them (and creates family folder if neccesary) check_references() { echo - echo_bold "Processing reference: ${ref}."; log_message "Processing reference: ${ref}." + log_message "Processing reference: ${ref}." bold # Obtaining family information obtain_family() { organism_id=$(curl -s "https://www.ncbi.nlm.nih.gov/nuccore/${ref}" | grep -o 'ORGANISM=[0-9]\+' | head -n 1 | awk -F '=' '{print $2}') if [ -z $organism_id ]; then - echo_blinking_red "$ref not found in NCBI. Please download it manually."; log_message "$ref not found in NCBI. Please download it manually." + log_message "$ref not found in NCBI. Please download it manually." blk_red return fi family=$(curl -s "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=${organism_id}" | grep -o 'TITLE="family">.*<' | awk -F 'TITLE="family">' '{print $2}' | cut -d '<' -f 1 | tr '[:upper:]' '[:lower:]') if [ -z $family ]; then family=$(curl -s "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=${organism_id}" | grep -o 'ALT="family">.*<' | awk -F 'ALT="family">' '{print $2}' | cut -d '<' -f 1 | tr '[:upper:]' '[:lower:]') fi - echo "Reference $ref belongs to $family family."; log_message "Reference $ref belongs to $family family." + log_message "Reference $ref belongs to $family family." } # Check if FASTA sequence is already downloaded REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) if [ -z "$REF_FASTA" ]; then - echo "File ${ref}.fasta is not yet downloaded."; log_message "File ${ref}.fasta is not yet downloaded." + log_message "File ${ref}.fasta is not yet downloaded." obtain_family; if [ -z $family ]; then return; fi # Loading SAMtools module module load SAMtools SAMtools_loaded=$(module list | grep -o 'SAMtools/[0-9.]\+-GCC-[0-9.]\+') if [ -n "$SAMtools_loaded" ]; then - echo_green "$SAMtools_loaded module succesfully loaded."; log_message "$SAMtools_loaded module succesfully loaded." 
+ log_message "$SAMtools_loaded module succesfully loaded." green else - echo_blinking_red "SAMtools module not loaded. Exiting..."; log_message "SAMtools module not loaded. Exiting..." + log_message "SAMtools module not loaded. Exiting..." blk_red exit 1 fi if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exists - echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}." log_message "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}." digest=$(openssl rand -hex 24) mkdir -p /data/bi/references/refgenie/data/${digest}/fasta/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" if [ $? -eq 0 ]; then - echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." - log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." + log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." green gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta - echo "Building asset for ${ref}.fasta file..."; log_message "Building asset for ${ref}.fasta file..." + log_message "Building asset for ${ref}.fasta file..." srun --partition short_idx --output ${ref}.fasta_build.log refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R if [ $? -eq 0 ]; then - echo_bold "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" - log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" + log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" bold bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else - echo_blinking_red "An error ocurred during building asset for ${ref}.fasta file." - log_message "An error ocurred during building asset for ${ref}.fasta file." + log_message "An error ocurred during building asset for ${ref}.fasta file." blk_red fi else - echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." + log_message "An error occurred during file downloading." blk_red fi else - echo "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.fasta." log_message "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.fasta." digest=$(refgenie alias get -a ${family} -c /data/bi/references/refgenie/genome_config.yaml) mkdir -p /data/bi/references/refgenie/data/${digest}/fasta/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=${ref}&rettype=fasta&retmode=text" if [ $? -eq 0 ]; then - echo_green "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." 
- log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." + log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." green gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta - echo "Building asset for ${ref}.fasta file..."; log_message "Building asset for ${ref}.fasta file..." + log_message "Building asset for ${ref}.fasta file..." srun --partition short_idx --output ${ref}.fasta_build.log refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R if [ $? -eq 0 ]; then - echo_bold "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" - log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" + log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" bold bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else - echo_blinking_red "An error ocurred during building asset for ${ref}.fasta file." - log_message "An error ocurred during building asset for ${ref}.fasta file." + log_message "An error ocurred during building asset for ${ref}.fasta file." blk_red fi else - echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." + log_message "An error occurred during file downloading." blk_red fi fi else - echo -e "File ${ref}.fasta is ALREADY available in $(dirname $REF_FASTA). \xE2\x9C\x85"; log_message "File ${ref}.fasta is ALREADY available in $(dirname $REF_FASTA). \xE2\x9C\x85" + log_message "File ${ref}.fasta is ALREADY available in $(dirname $REF_FASTA). \xE2\x9C\x85" fi # Check if GFF file is already downloaded REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) if [ -z "$REF_GFF" ]; then - echo "File ${ref}.gff is not yet downloaded."; log_message "File ${ref}.gff is not yet downloaded." + log_message "File ${ref}.gff is not yet downloaded." if [ ! -v family ]; then obtain_family; if [ -z ${family} ]; then return; fi; fi if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exist - echo "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.gff in /data/bi/references/refgenie/alias/${family}/gff/${ref}." log_message "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.gff in /data/bi/references/refgenie/alias/${family}/gff/${ref}." digest=$(openssl rand -hex 24) refgenie alias set --aliases ${family} --digest ${digest} -f -c /data/bi/references/refgenie/genome_config.yaml mkdir -p /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}" if [ $? -eq 0 ]; then - echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." - log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." 
- echo "Adding asset for ${ref}.gff file..."; log_message "Adding asset for ${ref}.gff file..." + log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." green + log_message "Adding asset for ${ref}.gff file..." srun --partition short_idx --output ${ref}.gff_add.log refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml if [ $? -eq 0 ]; then - echo_bold "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" - log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" + log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" bold bash /data/bi/references/refgenie/alias/ref.sh REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else - echo_blinking_red "An error ocurred during adding asset for ${ref}.gff file." - log_message "An error ocurred during adding asset for ${ref}.gff file." + log_message "An error ocurred during adding asset for ${ref}.gff file." blk_red fi else - echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." + log_message "An error occurred during file downloading." blk_red fi else - echo "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.gff." log_message "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.gff." digest=$(refgenie alias get -a ${family} -c /data/bi/references/refgenie/genome_config.yaml) mkdir -p /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/ wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}" if [ $? -eq 0 ]; then - echo_green "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." - log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." - echo "Adding asset for ${ref}.gff file..."; log_message "Adding asset for ${ref}.gff file..." + log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." green + log_message "Adding asset for ${ref}.gff file..." srun --partition short_idx --output ${ref}.gff_add.log refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml if [ $? -eq 0 ]; then - echo_bold "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" - log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" + log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" bold bash /data/bi/references/refgenie/alias/ref.sh REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) else - echo_blinking_red "An error ocurred during adding asset for ${ref}.gff file." - log_message "An error ocurred during adding asset for ${ref}.gff file." + log_message "An error ocurred during adding asset for ${ref}.gff file." 
blk_red fi else - echo_blinking_red "An error occurred during file downloading."; log_message "An error occurred during file downloading." + log_message "An error occurred during file downloading." blk_red fi fi else - echo -e "File ${ref}.gff is ALREADY available in $(dirname $REF_GFF). \xE2\x9C\x85"; log_message "File ${ref}.gff is ALREADY available in $(dirname $REF_GFF). \xE2\x9C\x85" + log_message "File ${ref}.gff is ALREADY available in $(dirname $REF_GFF). \xE2\x9C\x85" fi unset family @@ -297,18 +298,16 @@ check_references() { #################################### -echo_bold "Starting lablog_viralrecon execution." -echo -e "$(date +'%Y-%m-%d %H:%M:%S') - Starting lablog_viralrecon execution." > lablog_viralrecon.log +echo -e "\e[1;37mStarting lablog_viralrecon execution.\e[0m" +echo -e "$(date +'%Y-%m-%d %H:%M:%S') - \e[1;37mStarting lablog_viralrecon execution.\e[0m" > $logfile # Loading singularity module module load singularity singularity_loaded=$(module list | grep singularity | awk '{print $2}') if [ -n "$singularity_loaded" ]; then - echo_green "$singularity_loaded module succesfully loaded." - log_message "${singularity_loaded} module succesfully loaded." + log_message "${singularity_loaded} module succesfully loaded." green else - echo_blinking_red "Singularity module not loaded. Exiting..." - log_message "Singularity module not loaded. Exiting..." + log_message "Singularity module not loaded. Exiting..." blk_red exit 1 fi @@ -318,8 +317,8 @@ cp ../DOC/viralrecon.config ../DOC/${timeset}_viralrecon.config cp ../DOC/viralrecon_params.yml ../DOC/${timeset}_viralrecon_params.yml CONFIG_FILE="../DOC/${timeset}_viralrecon.config" PARAMS_FILE="../DOC/${timeset}_viralrecon_params.yml" -echo "Created $CONFIG_FILE file."; log_message "Created $CONFIG_FILE file." -echo "Created $PARAMS_FILE file."; log_message "Created $PARAMS_FILE file." +log_message "Created $CONFIG_FILE file." +log_message "Created $PARAMS_FILE file." echo # Setting the type of analysis @@ -330,11 +329,11 @@ while true; do echo -ne "\e[1;38;5;220m"; read -n 1 ANALYSIS_TYPE; tput sgr0; echo if [ "$ANALYSIS_TYPE" == "1" ]; then ANALYSIS_TYPE="METAGENOMIC" - echo_green "$ANALYSIS_TYPE analysis selected."; log_message "$ANALYSIS_TYPE analysis selected." + log_message "$ANALYSIS_TYPE analysis selected." green break elif [ "$ANALYSIS_TYPE" == "2" ]; then ANALYSIS_TYPE="AMPLICONS" - echo_green "$ANALYSIS_TYPE analysis selected."; log_message "$ANALYSIS_TYPE analysis selected." + log_message "$ANALYSIS_TYPE analysis selected." green break else echo_red "Invalid input. Please enter 1 or 2." @@ -348,15 +347,15 @@ echo_bold "\nPlease specify the method to be performed." while true; do echo -ne "\e[1;38;5;220m"; read -n 1 method; tput sgr0; echo if [ "$method" == "1" ]; then - echo_green "Mapping method selected."; log_message "Mapping method selected." + log_message "Mapping method selected." green break elif [ "$method" == "2" ]; then - echo_green "De novo assembly method selected."; log_message "De novo assembly method selected." + log_message "De novo assembly method selected." green sed -i "s|skip_assembly: true|skip_assembly: false|" "$PARAMS_FILE" sed -i "s|skip_variants: false|skip_variants: true|" "$PARAMS_FILE" break elif [ "$method" == "3" ]; then - echo_green "Mapping + de novo assembly methods selected."; log_message "Mapping + de novo assembly methods selected." + log_message "Mapping + de novo assembly methods selected." 
green sed -i "s|skip_assembly: true|skip_assembly: false|" "$PARAMS_FILE" break else @@ -369,17 +368,17 @@ echo_bold "\nPlease specify the method to be performed." echo read -p $'\e[1;37mIs samples_ref.txt file already prepared? [y/N]: \e[1;38;5;220m' -n 1 samples_ref_prepared; tput sgr0; echo if [ "$samples_ref_prepared" == "y" ]; then - echo -e "File samples_ref.txt READY. \xE2\x9C\x85"; log_message "File samples_ref.txt READY. \xE2\x9C\x85" + log_message "File samples_ref.txt READY. \xE2\x9C\x85" else : > samples_ref.txt - echo "File samples_ref NOT prepared."; log_message "File samples_ref NOT prepared." + log_message "File samples_ref NOT prepared." while [ -z "$host" ] || [ -z "$reference" ] || [ "$answer" = "n" ]; do read -p $'\e[1;37mPlease specify the host: \e[1;38;5;220m' host read -p $'\e[1;37mPlease specify the reference: \e[1;38;5;220m' reference read -p $'\e[1;37mAre host [\e[1;38;5;220m'"${host^^}"$'\e[1;37m] and reference [\e[1;38;5;220m'"${reference}"$'\e[1;37m] correct? [Y/n]: \e[1;38;5;220m' -n 1 answer; tput sgr0; echo done while read in; do echo -e "${in}\t${reference}\t${host^^}" >> samples_ref.txt; done < samples_id.txt - echo -e "File samples_ref.txt READY. \xE2\x9C\x85"; log_message "File samples_ref.txt READY. \xE2\x9C\x85. Host: ${host^^}. Reference: ${reference}." + log_message "File samples_ref.txt READY. \xE2\x9C\x85. Host: ${host^^}. Reference: ${reference}." fi @@ -391,6 +390,7 @@ if [ "$ANALYSIS_TYPE" = "METAGENOMIC" ]; then read -p $'\e[1;37mDo the sequences correspond to monkeypox virus (MPV)? [y/N]: \e[1;38;5;220m' -n 1 monkeypox; tput sgr0; echo if [ "$monkeypox" == "y" ]; then + log_message "Monkeypox virus (MPV) analisys selected" virus_tag='mpox' # Update Nextclade update_nextclade @@ -410,15 +410,15 @@ else echo -ne "\e[1;38;5;220m"; read -n 1 virus_tag; tput sgr0; echo if [ "$virus_tag" == "1" ]; then virus_tag="sars-cov-2" - echo_green "${virus_tag^^} virus selected."; log_message "${virus_tag^^} virus selected." + log_message "${virus_tag^^} virus selected." green break elif [ "$virus_tag" == "2" ]; then virus_tag="rsv" - echo_green "${virus_tag^^} virus selected."; log_message "${virus_tag^^} virus selected." + log_message "${virus_tag^^} virus selected." green break elif [ "$virus_tag" == "3" ]; then virus_tag="Other" - echo_green "$virus_tag virus selected."; log_message "$virus_tag virus selected." + log_message "$virus_tag virus selected." green break else echo_red "Invalid input. Please select a valid number." @@ -437,13 +437,11 @@ else # Update Nextclade update_nextclade - echo_bold "\nRemember to provide the complete route to primer_bed and primer_fasta files, and specify the nextclade_dataset_name in every sbatch file before running the pipeline." - log_message "Remember to provide the complete route to primer_bed and primer_fasta files, and specify the nextclade_dataset_name in every sbatch file before running the pipeline." + log_message "Remember to provide the complete route to primer_bed and primer_fasta files, and specify the nextclade_dataset_name in every sbatch file before running the pipeline." bold else echo "primer_bed: '../REFERENCES/XXXX'" >> $PARAMS_FILE - echo_bold "\nRemember to provide the complete route to PRIMER_BED file in $PARAMS_FILE file before running the pipeline." - log_message "Remember to provide the complete route to PRIMER_BED file in $PARAMS_FILE file before running the pipeline." + log_message "Remember to provide the complete route to PRIMER_BED file in $PARAMS_FILE file before running the pipeline." 
bold fi fi @@ -525,5 +523,4 @@ rm percentajeNs.py rm _02_create_run_percentage_Ns.sh cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd .. -echo_green "\nLablog_viralrecon execution has been completed. Please verify all the configurations are set up correctly." -log_message "Lablog_viralrecon execution has been completed. Please verify all the configurations are set up correctly." +log_message "Lablog_viralrecon execution has been completed. Please verify all the configurations are set up correctly." green From cbd7183dd9b50c4690002c090a7f374965f0fced Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Thu, 8 Aug 2024 16:41:33 +0200 Subject: [PATCH 146/321] Fixed bug --- bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index e705f12de..b31243c2e 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -130,7 +130,7 @@ update_nextclade() { log_message "Checking Nextclade container version..." url=$(curl -s "https://depot.galaxyproject.org/singularity/") latest_version_nextclade=$(echo "$url" | grep -oP 'nextclade:[^"]+' | sort -V | tail -n 1 | awk -F'>' '{print $1}' | sed 's/<\/a//') - log_message "Latest version available of Nextclade:\e[1;38;5;220m $latest_version_nextclade" + log_message "Latest version available of Nextclade:\e[1;38;5;220m $latest_version_nextclade" bold log_message "Checking if latest version of Nextclade image is already downloaded..." 
if [ -e "/data/bi/pipelines/singularity-images/$latest_version_nextclade" ]; then From 63f69d771fe772785bcc4e57e302a5e2f8066302 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Fri, 9 Aug 2024 11:29:40 +0200 Subject: [PATCH 147/321] Added timestamp variable to log_message function --- .../viralrecon/ANALYSIS/lablog_viralrecon | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon index b31243c2e..02c5b055a 100644 --- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon +++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon @@ -30,28 +30,29 @@ The functions performed by the script can be listed as follows: # Coloring messages and log saving logfile=$(echo "$PWD/lablog_viralrecon.log") +timestamp=$(date +"%Y-%m-%d_%H-%M-%S") log_message() { local message="$1" case "$2" in "bold") echo -e "\e[1;37m$message\e[0m" - echo -e "$(date '+%Y-%m-%d %H:%M:%S') - \e[1;37m$message\e[0m" >> $logfile + echo -e "$timestamp - \e[1;37m$message\e[0m" >> $logfile ;; "red") echo -e "\e[31m$message\e[0m" - echo -e "$(date '+%Y-%m-%d %H:%M:%S') - \e[31m$message\e[0m" >> $logfile + echo -e "$timestamp - \e[31m$message\e[0m" >> $logfile ;; "green") echo -e "\e[32m$message\e[0m" - echo -e "$(date '+%Y-%m-%d %H:%M:%S') - \e[32m$message\e[0m" >> $logfile + echo -e "$timestamp - \e[32m$message\e[0m" >> $logfile ;; "blk_red") echo -e "\e[1;5;97;5;41m$message\e[0m" - echo -e "$(date '+%Y-%m-%d %H:%M:%S') - \e[1;5;97;5;41m$message\e[0m" >> $logfile + echo -e "$timestamp - \e[1;5;97;5;41m$message\e[0m" >> $logfile ;; *) echo -e "$message" - echo -e "$(date '+%Y-%m-%d %H:%M:%S') - $message" >> $logfile + echo -e "$timestamp - $message" >> $logfile ;; esac } @@ -312,11 +313,10 @@ else fi # Setting work variables -timeset=$(date +"%Y-%m-%d_%H-%M-%S") -cp ../DOC/viralrecon.config ../DOC/${timeset}_viralrecon.config -cp ../DOC/viralrecon_params.yml ../DOC/${timeset}_viralrecon_params.yml -CONFIG_FILE="../DOC/${timeset}_viralrecon.config" -PARAMS_FILE="../DOC/${timeset}_viralrecon_params.yml" +cp ../DOC/viralrecon.config ../DOC/${timestamp}_viralrecon.config +cp ../DOC/viralrecon_params.yml ../DOC/${timestamp}_viralrecon_params.yml +CONFIG_FILE="../DOC/${timestamp}_viralrecon.config" +PARAMS_FILE="../DOC/${timestamp}_viralrecon_params.yml" log_message "Created $CONFIG_FILE file." log_message "Created $PARAMS_FILE file." echo From dbf8de4412f7edc3f22d704407d17650fb31d000 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Fri, 9 Aug 2024 14:54:52 +0200 Subject: [PATCH 148/321] Fixed bug. 
Now, references with no family are assigned to miscellaneous

---
bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index 02c5b055a..3537bd36b 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -182,6 +182,11 @@ check_references() {
family=$(curl -s "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=${organism_id}" | grep -o 'TITLE="family">.*<' | awk -F 'TITLE="family">' '{print $2}' | cut -d '<' -f 1 | tr '[:upper:]' '[:lower:]')
if [ -z $family ]; then
family=$(curl -s "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=${organism_id}" | grep -o 'ALT="family">.*<' | awk -F 'ALT="family">' '{print $2}' | cut -d '<' -f 1 | tr '[:upper:]' '[:lower:]')
+ if [ -z $family ]; then
+ family="miscellanous"
+ log_message "Reference $ref does not currently belong to any family. Assigned to $family."
+ break
+ fi
fi
log_message "Reference $ref belongs to $family family."
}

From e1c81495f8a0775caef796dd6d86ab45af8351c9 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Fri, 9 Aug 2024 14:57:47 +0200
Subject: [PATCH 149/321] Replaced break with return

---
bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index 3537bd36b..727a308bd 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -185,7 +185,7 @@ check_references() {
if [ -z $family ]; then
family="miscellanous"
log_message "Reference $ref does not currently belong to any family. Assigned to $family."
- break
+ return
fi
fi
log_message "Reference $ref belongs to $family family."

From 43fc68e9e5288b1a089846e1567f0bc2802685f8 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Tue, 13 Aug 2024 15:42:26 +0200
Subject: [PATCH 150/321] Changed color of last message

---
bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index 727a308bd..12d6c6670 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -528,4 +528,4 @@
rm percentajeNs.py
rm _02_create_run_percentage_Ns.sh
cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd ..
-log_message "Lablog_viralrecon execution has been completed. Please verify all the configurations are set up correctly." green
+log_message "Lablog_viralrecon execution has been completed. Please verify all the configurations are set up correctly." bold

From 2b08b9c0f54c9150bf3c8dc1d7f4590cea774665 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Wed, 14 Aug 2024 15:49:51 +0200
Subject: [PATCH 151/321] Modified name of gff file (now named after the family)

---
bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index 12d6c6670..ff8cdea2e 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -260,7 +260,7 @@ check_references() {
digest=$(openssl rand -hex 24)
refgenie alias set --aliases ${family} --digest ${digest} -f -c /data/bi/references/refgenie/genome_config.yaml
mkdir -p /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/
- wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}"
+ wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${family}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}"
if [ $? -eq 0 ]; then
log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." green
log_message "Adding asset for ${ref}.gff file..."
@@ -279,7 +279,7 @@ check_references() {
log_message "Directory /data/bi/references/refgenie/alias/${family}/ ALREADY EXISTS. Downloading ${ref}.gff."
digest=$(refgenie alias get -a ${family} -c /data/bi/references/refgenie/genome_config.yaml)
mkdir -p /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/
- wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${ref}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}"
+ wget -q -O "/data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}/${family}.gff" "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${ref}"
if [ $? -eq 0 ]; then
log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." green
log_message "Adding asset for ${ref}.gff file..."

From b75d1c18b7e4f2f7814e259c9cf07af5043c89fe Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Wed, 14 Aug 2024 16:03:01 +0200
Subject: [PATCH 152/321] Removed srun from refgenie build and refgenie add in order to avoid wrong behaviour of the loop at line 471.

---
.../viralrecon/ANALYSIS/lablog_viralrecon | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index ff8cdea2e..6477bd408 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -214,8 +214,8 @@ check_references() {
log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." green
gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta
log_message "Building asset for ${ref}.fasta file..."
- srun --partition short_idx --output ${ref}.fasta_build.log refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R
- if [ $?
-eq 0 ]; then + refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R > ${ref}.fasta_build.log 2>&1 + if grep -q "Created" "${ref}.fasta_build.log"; then log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" bold bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) @@ -234,8 +234,8 @@ check_references() { log_message "File ${ref}.fasta downloaded in /data/bi/references/refgenie/data/${digest}/fasta/${ref}." green gzip /data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta log_message "Building asset for ${ref}.fasta file..." - srun --partition short_idx --output ${ref}.fasta_build.log refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R - if [ $? -eq 0 ]; then + refgenie build ${family}/fasta:${ref} --files fasta=/data/bi/references/refgenie/data/${digest}/fasta/${ref}/${ref}.fasta.gz -c /data/bi/references/refgenie/genome_config.yaml -R > ${ref}.fasta_build.log 2>&1 + if grep -q "Created" "${ref}.fasta_build.log"; then log_message "$(grep Created ${ref}.fasta_build.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.fasta_build.log)" bold bash /data/bi/references/refgenie/alias/ref.sh REF_FASTA=$(awk -v ref="$ref" '$0 ~ ref && /fasta/ {print $4}' /data/bi/references/refgenie/alias/references.txt) @@ -264,8 +264,8 @@ check_references() { if [ $? -eq 0 ]; then log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." green log_message "Adding asset for ${ref}.gff file..." - srun --partition short_idx --output ${ref}.gff_add.log refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml - if [ $? -eq 0 ]; then + refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml > ${ref}.gff_add.log 2>&1 + if grep -q "Created" "${ref}.gff_add.log"; then log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" bold bash /data/bi/references/refgenie/alias/ref.sh REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt) @@ -283,8 +283,8 @@ check_references() { if [ $? -eq 0 ]; then log_message "File ${ref}.gff downloaded in /data/bi/references/refgenie/data/${digest}/ensembl_rb/${ref}." green log_message "Adding asset for ${ref}.gff file..." - srun --partition short_idx --output ${ref}.gff_add.log refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml - if [ $? 
-eq 0 ]; then
+            refgenie add ${family}/gff:${ref} --path data/${digest}/ensembl_rb/${ref}/ --seek-keys '{"gff" : "'"${family}.gff"'"}' -c /data/bi/references/refgenie/genome_config.yaml > ${ref}.gff_add.log 2>&1
+            if grep -q "Created" "${ref}.gff_add.log"; then
                 log_message "$(grep Created ${ref}.gff_add.log) $(grep "/data/bi/references/refgenie/alias/" ${ref}.gff_add.log)" bold
                 bash /data/bi/references/refgenie/alias/ref.sh
                 REF_GFF=$(awk -v ref="$ref" '$0 ~ ref && /gff/ {print $4}' /data/bi/references/refgenie/alias/references.txt)

From 003a2267b2d244a648f11f86b24829cd5a2aedf1 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Fri, 16 Aug 2024 10:41:23 +0200
Subject: [PATCH 153/321] Added loading stage for micromamba refgenie environment

---
 .../viralrecon/ANALYSIS/lablog_viralrecon | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
index 6477bd408..ec944be38 100644
--- a/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/lablog_viralrecon
@@ -205,6 +205,15 @@ check_references() {
         log_message "SAMtools module not loaded. Exiting..." blk_red
         exit 1
     fi
+    eval "$(micromamba shell hook --shell bash)"
+    micromamba activate refgenie_v0.12.1
+    environment=$(micromamba info | awk '/environment/ && /active/ {print $3}')
+    if [[ $environment == *"refgenie"* ]]; then
+        log_message "$environment environment successfully activated." green
+    else
+        log_message "Refgenie environment is NOT ACTIVE. Exiting..." blk_red
+        exit 1
+    fi
     if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exist
         log_message "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.fasta in /data/bi/references/refgenie/alias/${family}/fasta/${ref}."
         digest=$(openssl rand -hex 24)
@@ -255,6 +264,16 @@ check_references() {
     if [ -z "$REF_GFF" ]; then
         log_message "File ${ref}.gff is not yet downloaded."
         if [ ! -v family ]; then obtain_family; if [ -z ${family} ]; then return; fi; fi
+        if [[ $environment != *"refgenie"* ]]; then
+            eval "$(micromamba shell hook --shell bash)"
+            micromamba activate refgenie_v0.12.1
+            environment=$(micromamba info | awk '/environment/ && /active/ {print $3}')
+            if [[ $environment == *"refgenie"* ]]; then
+                log_message "$environment environment successfully activated." green
+            else
+                log_message "Refgenie environment is NOT ACTIVE. Exiting..." blk_red
+            fi
+        fi
         if [ ! -e "/data/bi/references/refgenie/alias/${family}" ]; then # Check if directory doesn't exist
             log_message "Creating new directory: /data/bi/references/refgenie/alias/${family}/ and saving file ${ref}.gff in /data/bi/references/refgenie/alias/${family}/gff/${ref}."
             digest=$(openssl rand -hex 24)
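Context for the micromamba block introduced in PATCH 153: micromamba activate is a shell function that only exists after the shell hook has been evaluated, so a non-interactive script such as this lablog must bootstrap it explicitly. A minimal standalone sketch of the pattern (the environment name is taken from the patch; the rest is illustrative):

#!/usr/bin/env bash
set -e
eval "$(micromamba shell hook --shell bash)"  # defines the activate/deactivate shell functions
micromamba activate refgenie_v0.12.1          # would fail without the hook line above
micromamba info | awk '/environment/ && /active/ {print $3}'  # prints the active environment

The second copy of the block (in the gff branch) checks $environment first, so the hook is only re-evaluated when the earlier activation did not run.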
From 2f6bfdcea3f1b80aa2b270d3567c2dcf0278803 Mon Sep 17 00:00:00 2001
From: "jaime.ozaez"
Date: Fri, 16 Aug 2024 10:56:48 +0200
Subject: [PATCH 154/321] Updated CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 35b6a0478..d2dc42510 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -59,6 +59,7 @@ Code contributions to the new version:
 - Fixed IRMA's lablog so that the sequences of the samples are not displayed several times neither in the .txt files of each influenza type nor in all_samples_completo.txt [#305](https://github.com/BU-ISCIII/buisciii-tools/pull/305)
 - Modified bioinfo_doc.py so that new lines in the delivery message are applied in the email [#307](https://github.com/BU-ISCIII/buisciii-tools/pull/307)
 - Added several improvements in lablog_viralrecon (created log files, modified check_references function behaviour, enabled config files regeneration) [#306](https://github.com/BU-ISCIII/buisciii-tools/pull/306)
+- Fixed bug when lablog_viralrecon tries to download references that don't belong to any family. [#310](https://github.com/BU-ISCIII/buisciii-tools/pull/310)

 ### Modules

From 936df9a1ace69d9a60d2e1079088501ece2feaf8 Mon Sep 17 00:00:00 2001
From: svarona
Date: Thu, 1 Aug 2024 11:29:43 +0200
Subject: [PATCH 155/321] created python for irma vcf

---
 .../04-irma/create_irma_vcf.py | 712 ++++++++++++++++++
 1 file changed, 712 insertions(+)
 create mode 100644 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py

diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py
new file mode 100644
index 000000000..bf909cb60
--- /dev/null
+++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py
@@ -0,0 +1,712 @@
+# imports
+from Bio import SeqIO
+import statistics
+import argparse
+import sys
+
+
+def parse_args(args=None):
+    Description = "Convert alignment between IRMA consensus and reference fasta to VCF file using IRMA stats"
+    Epilog = """Example usage: python create_irma_vcf.py -a <alignment> -i <irma_alleles> -o <out_vcf>"""
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument(
+        "-a",
+        "--alignment",
+        type=str,
+        required=True,
+        help="Alignment file",
+    )
+    parser.add_argument(
+        "-i",
+        "--irma_alleles",
+        type=str,
+        required=True,
+        help="IRMA allAlleles.txt file",
+    )
+    parser.add_argument(
+        "-o",
+        "--out_vcf",
+        type=str,
+        required=True,
+        help="Output vcf file",
+    )
+    return parser.parse_args(args)
+
+
+def alleles_to_dict(alleles_file):
+    """Convert IRMA's allAlleles file to dictionary.
+
+    Parameters
+    ----------
+    alleles_file : str
+        Path to the alleles file.
+
+    Returns
+    -------
+    alleles_dict
+        Dictionary containing alleles information with positions as keys.
+ E.g: + { + 1: { + 'Reference_Name': 'rsv_a2', + 'Position': '1', + 'Allele': 'A', + 'Count': '2', + 'Total': '2', + 'Frequency': '1', + 'Average_Quality': '29.5', + 'ConfidenceNotMacErr': '0.998877981545698', + 'PairedUB': '1', + 'QualityUB': '1', + 'Allele_Type': 'Consensus' + } + } + """ + + alleles_dict = {} + with open(alleles_file, "r") as file: + header = file.readline().strip().split('\t') + for line in file: + while line.count('\t') < len(header) - 1: + line += file.readline() + line_data = line.strip().split('\t') + position = int(line_data[1]) + allele_type = line_data[10] + if allele_type == "Consensus": + entry_dict = {header[i]: line_data[i] for i in range(len(header))} + alleles_dict[position] = entry_dict + return alleles_dict + + +def align2dict(alignment_file): + """Convert alignment file to dictionary. + + Parameters + ---------- + alignment_file : str + Path to the alignment file in fasta format. + + Returns + ------- + vcf_dict + Dictionary containing alignment information with alignment positions as keys. + E.g.: + { + "10": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 1, + "SAMPLE_POS": [ + 8, + 9 + ], + "REF": "A", + "ALT": "AAA", + "TYPE": "INS" + }, + "19": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10, + "SAMPLE_POS": [ + 19 + ], + "REF": "T", + "ALT": "A", + "TYPE": "SNP" + }, + "7542": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7542 + ], + "REF": "T", + "ALT": "TT", + "TYPE": "INS" + }, + "7543": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7543 + ], + "REF": "T", + "ALT": "TC", + "TYPE": "INS" + }, + "7544": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7544 + ], + "REF": "C", + "ALT": "CA", + "TYPE": "INS" + }, + "10081": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10068, + "SAMPLE_POS": [ + 10079 + ], + "REF": "AA", + "ALT": "A", + "TYPE": "DEL" + }, + "10082": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10069, + "SAMPLE_POS": [ + 10079 + ], + "REF": "-C", + "ALT": "-", + "TYPE": "DEL" + }, + "10083": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10070, + "SAMPLE_POS": [ + 10079 + ], + "REF": "-T", + "ALT": "-", + "TYPE": "DEL" + } + } + """ + sequences_dict = {} + with open(alignment_file, "r") as alignment: + for sequence in SeqIO.parse(alignment, "fasta"): + sequences_dict[sequence.id] = str(sequence.seq) + sample_id, sample_seq = list(sequences_dict.items())[0] + ref_id, ref_seq = list(sequences_dict.items())[1] + sample_position = 0 + ref_position = 0 + vcf_dict = {} + CHROM = ref_id + ALT = "" + SAMPLE_POS = [] + for i, (sample_base, ref_base) in enumerate(zip(sample_seq, ref_seq)): + align_position = i + 1 + if sample_base != "-": + sample_position += 1 + if ref_base != "-": + ref_position += 1 + if ref_base == "-" and sample_base != "N": + if ref_position == 0: + ALT += sample_base + SAMPLE_POS.append(sample_position) + else: + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position, + "SAMPLE_POS": [sample_position], + "REF": sample_seq[i-1], + "ALT": sample_seq[i-1] + sample_base, + "TYPE": "INS" + } + vcf_dict[align_position] = content_dict + elif ref_position == 1 and len(SAMPLE_POS) > 1: + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position, + "SAMPLE_POS": SAMPLE_POS, + "REF": ref_base, + "ALT": ALT + sample_base, + "TYPE": "INS" + } + vcf_dict[align_position] = content_dict + elif sample_base == "-" and ref_base != "N": + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position-1, + "SAMPLE_POS": [sample_position], + "REF": 
sample_seq[i-1] + ref_base, + "ALT": sample_seq[i-1], + "TYPE": "DEL" + } + vcf_dict[align_position] = content_dict + elif ref_base != sample_base and ref_base != "N" and ref_base != "-" and sample_base != "N" and sample_base != "-": + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position, + "SAMPLE_POS": [sample_position], + "REF": ref_base, + "ALT": sample_base, + "TYPE": "SNP" + } + vcf_dict[align_position] = content_dict + return vcf_dict + + +def stats_vcf(vcf_dictionary, alleles_dictionary): + """Add stats to VCF dictionary. + + Parameters + ---------- + vcf_dictionary : dict + Dictionary containing VCF information. + alleles_dictionary : dict + Dictionary containing alleles information. + + Returns + ------- + af_vcf_dict + Updated dictionary with allele frequencies and other metrics. + E.g: + { + "10": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 1, + "SAMPLE_POS": [ + 8, + 9 + ], + "REF": "A", + "ALT": "AAA", + "TYPE": "INS", + "DP": [ + "9" + ], + "AF": [ + "1" + ], + "QUAL": [ + "33.7777777777778" + ] + }, + "19": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10, + "SAMPLE_POS": [ + 19 + ], + "REF": "T", + "ALT": "A", + "TYPE": "SNP", + "DP": [ + "60" + ], + "AF": [ + "0.833333333333333" + ], + "QUAL": [ + "34.0166666666667" + ] + }, + "7542": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7542 + ], + "REF": "T", + "ALT": "TT", + "TYPE": "INS", + "DP": [ + "74" + ], + "AF": [ + "0.986666666666667" + ], + "QUAL": [ + "34.8648648648649" + ] + }, + "7543": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7543 + ], + "REF": "T", + "ALT": "TC", + "TYPE": "INS", + "DP": [ + "75" + ], + "AF": [ + "1" + ], + "QUAL": [ + "35.04" + ] + }, + "7544": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7544 + ], + "REF": "C", + "ALT": "CA", + "TYPE": "INS", + "DP": [ + "75" + ], + "AF": [ + "1" + ], + "QUAL": [ + "33.8533333333333" + ] + }, + "10081": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10068, + "SAMPLE_POS": [ + 10079 + ], + "REF": "AA", + "ALT": "A", + "TYPE": "DEL", + "DP": [ + "10" + ], + "AF": [ + "1" + ], + "QUAL": [ + "34.3" + ] + }, + "10082": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10069, + "SAMPLE_POS": [ + 10079 + ], + "REF": "-C", + "ALT": "-", + "TYPE": "DEL", + "DP": [ + "10" + ], + "AF": [ + "1" + ], + "QUAL": [ + "34.3" + ] + }, + "10083": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10070, + "SAMPLE_POS": [ + 10079 + ], + "REF": "-T", + "ALT": "-", + "TYPE": "DEL", + "DP": [ + "10" + ], + "AF": [ + "1" + ], + "QUAL": [ + "34.3" + ] + } + } + """ + af_vcf_dict = {} + for key, value in vcf_dictionary.items(): + DP = [] + AF = [] + QUAL = [] + content_dict = { + "CHROM": value["CHROM"], + "REF_POS": value["REF_POS"], + "SAMPLE_POS": value["SAMPLE_POS"], + "REF": value["REF"], + "ALT": value["ALT"], + "TYPE": value["TYPE"] + } + for position in value["SAMPLE_POS"]: + if position in alleles_dictionary: + alleles_info = alleles_dictionary[position] + if alleles_info["Allele"] == value["ALT"] or value["TYPE"] in ["INS", "DEL"]: + DP.append(alleles_info["Count"]) + AF.append(alleles_info["Frequency"]) + QUAL.append(alleles_info["Average_Quality"]) + break + else: + print("SNP not the same in .fasta file and alleles file") + print(value) + print(alleles_info) + else: + print("Position not detected in allele file!") + print("Position") + print(value["SAMPLE_POS"]) + print(value) + content_dict.update({"DP": DP, "AF": AF, "QUAL": QUAL}) + af_vcf_dict[key] = content_dict + return af_vcf_dict + + 
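# Aside (illustration only, not part of the committed script): combine_indels
# below collapses the single-base insertion records that align2dict emits
# one-by-one into a single left-anchored record. A minimal sketch of that merge,
# reusing sample positions 7542-7544 from the docstring examples:
records = [
    {"REF_POS": 7531, "REF": "T", "ALT": "TT", "TYPE": "INS"},
    {"REF_POS": 7531, "REF": "T", "ALT": "TC", "TYPE": "INS"},
    {"REF_POS": 7531, "REF": "C", "ALT": "CA", "TYPE": "INS"},
]
merged = {}
for rec in records:
    pos = rec["REF_POS"]
    if pos in merged and merged[pos]["TYPE"] == "INS":
        # keep the anchor base already stored, append only the newly inserted base
        merged[pos]["ALT"] += rec["ALT"].replace(rec["REF"], "")
    else:
        merged[pos] = dict(rec)
print(merged[7531]["ALT"])  # -> "TTCA": one INS record instead of three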
+def combine_indels(vcf_dictionary):
+    """Combine insertion and deletion positions in the VCF dictionary.
+
+    Parameters
+    ----------
+    vcf_dictionary : dict
+        Dictionary containing VCF information.
+
+    Returns
+    -------
+    combined_vcf_dict
+        Updated dictionary with combined insertion and deletion variants.
+        {
+            "1": {
+                "CHROM": "EPI_ISL_18668201",
+                "REF_POS": 1,
+                "SAMPLE_POS": [
+                    8,
+                    9
+                ],
+                "REF": "A",
+                "ALT": "AAA",
+                "DP": [
+                    "9"
+                ],
+                "AF": [
+                    "1"
+                ],
+                "QUAL": [
+                    "33.7777777777778"
+                ],
+                "TYPE": "INS"
+            },
+            "10": {
+                "CHROM": "EPI_ISL_18668201",
+                "REF_POS": 10,
+                "SAMPLE_POS": [
+                    19
+                ],
+                "REF": "T",
+                "ALT": "A",
+                "DP": [
+                    "60"
+                ],
+                "AF": [
+                    "0.833333333333333"
+                ],
+                "QUAL": [
+                    "34.0166666666667"
+                ],
+                "TYPE": "SNP"
+            },
+            "7531": {
+                "CHROM": "EPI_ISL_18668201",
+                "REF_POS": 7531,
+                "SAMPLE_POS": [
+                    7542,
+                    7543,
+                    7544
+                ],
+                "REF": "T",
+                "ALT": "TTCA",
+                "DP": [
+                    "74",
+                    "75",
+                    "75"
+                ],
+                "AF": [
+                    "0.986666666666667",
+                    "1",
+                    "1"
+                ],
+                "QUAL": [
+                    "34.8648648648649",
+                    "35.04",
+                    "33.8533333333333"
+                ],
+                "TYPE": "INS"
+            },
+            "10068": {
+                "CHROM": "EPI_ISL_18668201",
+                "REF_POS": 10068,
+                "SAMPLE_POS": [
+                    10079
+                ],
+                "REF": "AACT",
+                "ALT": "A",
+                "DP": [
+                    "10"
+                ],
+                "AF": [
+                    "1"
+                ],
+                "QUAL": [
+                    "34.3"
+                ],
+                "TYPE": "DEL"
+            }
+        }
+
+    """
+    combined_vcf_dict = {}
+    for key, value in vcf_dictionary.items():
+        content_dict = {
+            "CHROM": value["CHROM"],
+            "REF_POS": value["REF_POS"],
+            "SAMPLE_POS": value["SAMPLE_POS"],
+            "REF": value["REF"],
+            "ALT": value["ALT"],
+            "DP": value["DP"],
+            "AF": value["AF"],
+            "QUAL": value["QUAL"],
+            "TYPE": value["TYPE"]
+        }
+        if value["TYPE"] == "INS":
+            if value["REF_POS"] in combined_vcf_dict:
+                if value["TYPE"] == combined_vcf_dict[value["REF_POS"]]["TYPE"]:
+                    NEW_ALT = value["ALT"].replace(value["REF"], "")
+                    combined_vcf_dict[value["REF_POS"]]["ALT"] += NEW_ALT
+                    combined_vcf_dict[value["REF_POS"]]["SAMPLE_POS"].append(value["SAMPLE_POS"][0])
+                    combined_vcf_dict[value["REF_POS"]]["DP"].append(value["DP"][0])
+                    combined_vcf_dict[value["REF_POS"]]["AF"].append(value["AF"][0])
+                    combined_vcf_dict[value["REF_POS"]]["QUAL"].append(value["QUAL"][0])
+                else:
+                    print("Same position annotated with multiple variant types")
+                    print("value")
+                    print(value)
+                    print("combined_vcf_dict")
+                    print(combined_vcf_dict[value["REF_POS"]])
+            else:
+                combined_vcf_dict[value["REF_POS"]] = content_dict
+        elif value["TYPE"] == "DEL":
+            sample_found = False
+            for pos, data in combined_vcf_dict.items():
+                var_type = data["TYPE"]
+                if var_type == "DEL":
+                    if value["SAMPLE_POS"] == data["SAMPLE_POS"]:
+                        if value["TYPE"] == var_type:
+                            sample_found = data["REF_POS"]
+                            break
+                        else:
+                            print("Same position annotated with multiple variant types")
+                            print("value")
+                            print(value)
+                            print("combined_vcf_dict")
+                            print(combined_vcf_dict[value["REF_POS"]])
+            if sample_found:
+                NEW_REF = value["REF"].replace(value["ALT"], "")
+                combined_vcf_dict[sample_found]["REF"] += NEW_REF
+            else:
+                combined_vcf_dict[value["REF_POS"]] = content_dict
+        elif value["TYPE"] == "SNP":
+            if value["REF_POS"] in combined_vcf_dict:
+                if value["TYPE"] == combined_vcf_dict[value["REF_POS"]]["TYPE"]:
+                    print("Repeated SNP!!!")
+                else:
+                    print("Same position annotated with multiple variant types")
+                    print("value")
+                    print(value)
+                    print("combined_vcf_dict")
+                    print(combined_vcf_dict[value["REF_POS"]])
+            else:
+                combined_vcf_dict[value["REF_POS"]] = content_dict
+        else:
+            print("Different annotation type found")
+    return combined_vcf_dict
+
+
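# Aside: in get_vcf_header below, the '##contig=', '##INFO=' and '##FORMAT='
# strings appear with their angle-bracket bodies stripped. For orientation,
# a VCFv4.2 header of the shape the function assembles (IDs inferred from the
# "DP:AF" FORMAT string used in create_vcf; descriptions are assumptions, not
# taken from the patch) would read roughly:
#
#   ##fileformat=VCFv4.2
#   ##source=custom
#   ##contig=<ID=EPI_ISL_18668201>
#   ##INFO=<ID=TYPE,Number=1,Type=String,Description="Variant type: SNP, INS or DEL">
#   ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth supporting the allele">
#   ##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
#   #CHROM  POS  REF  ALT  QUAL  INFO  FORMAT  <sample_name>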
+def get_vcf_header(chromosome, sample_name):
+    """Create the VCF header for VCFv4.2
+
+    Parameters
+    ----------
+    chromosome : str
+        Chromosome name.
+    sample_name : str
+        Sample name.
+
+    Returns
+    -------
+    header
+        String containing all the VCF header lines separated by newline.
+    """
+    # Define VCF header
+    header_source = ["##fileformat=VCFv4.2", "##source=custom"]
+    header_contig = []
+    if chromosome:
+        header_contig += [
+            "##contig="
+        ]
+    header_source += header_contig
+
+    header_info = [
+        '##INFO=',
+    ]
+    header_format = [
+        '##FORMAT=',
+        '##FORMAT=',
+    ]
+    columns = [
+        '#CHROM\tPOS\tREF\tALT\tQUAL\tINFO\tFORMAT\t' + sample_name
+    ]
+    header = header_source + header_info + header_format + columns
+    return header
+
+
+def create_vcf(variants_dict, out_vcf, alignment):
+    """Create VCF file from variants dictionary.
+
+    Parameters
+    ----------
+    variants_dict : dict
+        Dictionary containing variants information.
+    out_vcf : str
+        Path to the output VCF file.
+    alignment : str
+        Path to the alignment file.
+
+    Returns
+    -------
+    None
+    """
+
+    chrom = next(iter(variants_dict.values()))["CHROM"]
+    sample = alignment.replace(".align.fasta", "")
+    vcf_header = "\n".join(get_vcf_header(chrom, sample))
+    FORMAT = "DP:AF"
+    with open(out_vcf, "w") as file_out:
+        file_out.write(vcf_header + "\n")
+        for key, value in variants_dict.items():
+            CHROM = value["CHROM"]
+            POS = value["REF_POS"]
+            REF = value["REF"]
+            ALT = value["ALT"]
+            QUAL_list = [float(number) for number in value["QUAL"]]
+            QUAL = str(round(statistics.mean(QUAL_list), 2))
+            INFO = "TYPE=" + value["TYPE"]
+            DP_list = [int(number) for number in value["DP"]]
+            AF_list = [float(number) for number in value["AF"]]
+            SAMPLE = str(round(statistics.mean(DP_list))) + ':' + str(round(statistics.mean(AF_list), 4))
+            oline = CHROM + '\t' + str(POS) + '\t' + REF + '\t' + ALT + '\t' + str("".join(QUAL)) + '\t' + INFO + '\t' + FORMAT + '\t' + SAMPLE
+            file_out.write(oline + "\n")
+
+
+def main(args=None):
+    # Process args
+    args = parse_args(args)
+
+    # Initialize vars
+    alignment = args.alignment
+    all_alleles = args.irma_alleles
+    output_vcf = args.out_vcf
+
+    # Start analysis
+    alleles_dict = alleles_to_dict(all_alleles)
+    alignment_dict = align2dict(alignment)
+    af_vcf_dict = stats_vcf(alignment_dict, alleles_dict)
+    combined_vcf_dict = combine_indels(af_vcf_dict)
+    create_vcf(combined_vcf_dict, output_vcf, alignment)
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 84ab72bba8be324c5913759dda57d24fdecca02a Mon Sep 17 00:00:00 2001
From: svarona
Date: Thu, 1 Aug 2024 11:52:42 +0200
Subject: [PATCH 156/321] @saramonzon changes to add minority variants WIP

---
 .../04-irma/create_irma_vcf.py | 157 +++++++++++-------
 1 file changed, 101 insertions(+), 56 deletions(-)

diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py
index bf909cb60..e9b09e6d3 100644
--- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py
+++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py
@@ -45,23 +45,48 @@ def alleles_to_dict(alleles_file):
     Returns
     -------
     alleles_dict
-        Dictionary containing alleles information with positions as keys.
- E.g: - { - 1: { - 'Reference_Name': 'rsv_a2', - 'Position': '1', - 'Allele': 'A', - 'Count': '2', - 'Total': '2', - 'Frequency': '1', - 'Average_Quality': '29.5', - 'ConfidenceNotMacErr': '0.998877981545698', - 'PairedUB': '1', - 'QualityUB': '1', - 'Allele_Type': 'Consensus' - } - } + Dictionary containing alleles information with chrom+positions+allele as key. e.g. + { + "rsv_a2_1_A": { + "Reference_Name": "rsv_a2", + "Position": "1", + "Allele": "A", + "Count": "2", + "Total": "2", + "Frequency": "1", + "Average_Quality": "29.5", + "ConfidenceNotMacErr": "0.998877981545698", + "PairedUB": "1", + "QualityUB": "1", + "Allele_Type": "Consensus" + }, + "rsv_a2_2204_A": { + "Reference_Name": "rsv_a2", + "Position": "2204", + "Allele": "A", + "Count": "6532", + "Total": "15323", + "Frequency": "0.426287280558637", + "Average_Quality": "34.5708818126148", + "ConfidenceNotMacErr": "0.999181140401206", + "PairedUB": "0.00396999257813604", + "QualityUB": "0.0010642711614851", + "Allele_Type": "Minority" + }, + "rsv_a2_2204_G": { + "Reference_Name": "rsv_a2", + "Position": "2204", + "Allele": "G", + "Count": "8768", + "Total": "15323", + "Frequency": "0.5722117078901", + "Average_Quality": "35.0286268248175", + "ConfidenceNotMacErr": "0.999450989591763", + "PairedUB": "0.00396999257813604", + "QualityUB": "0.00100698799816366", + "Allele_Type": "Consensus" + }, + } """ alleles_dict = {} @@ -72,10 +97,11 @@ def alleles_to_dict(alleles_file): line += file.readline() line_data = line.strip().split('\t') position = int(line_data[1]) - allele_type = line_data[10] - if allele_type == "Consensus": + variant_af = float(line_data[5]) + if variant_af > 0.25: entry_dict = {header[i]: line_data[i] for i in range(len(header))} - alleles_dict[position] = entry_dict + variant = str(line_data[0]) + "_" + str(position) + "_" + str(line_data[2]) + alleles_dict[variant] = entry_dict return alleles_dict @@ -104,15 +130,15 @@ def align2dict(alignment_file): "ALT": "AAA", "TYPE": "INS" }, - "19": { + "11": { "CHROM": "EPI_ISL_18668201", - "REF_POS": 10, + "REF_POS": 2, "SAMPLE_POS": [ - 19 + 11 ], - "REF": "T", + "REF": "A", "ALT": "A", - "TYPE": "SNP" + "TYPE": "REF" }, "7542": { "CHROM": "EPI_ISL_18668201", @@ -238,6 +264,16 @@ def align2dict(alignment_file): "TYPE": "SNP" } vcf_dict[align_position] = content_dict + elif ref_base != "N" and ref_base != "-" and sample_base != "N" and sample_base != "-": + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position, + "SAMPLE_POS": [sample_position], + "REF": ref_base, + "ALT": sample_base, + "TYPE": "REF" + } + vcf_dict[align_position] = content_dict return vcf_dict @@ -257,7 +293,7 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): Updated dictionary with allele frequencies and other metrics. 
E.g: { - "10": { + "EPI_ISL_18668201_1_AAA": { "CHROM": "EPI_ISL_18668201", "REF_POS": 1, "SAMPLE_POS": [ @@ -268,13 +304,16 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "ALT": "AAA", "TYPE": "INS", "DP": [ - "9" + "9", + "10" ], "AF": [ + "1", "1" ], "QUAL": [ - "33.7777777777778" + "33.7777777777778", + "34" ] }, "19": { @@ -413,37 +452,43 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): } """ af_vcf_dict = {} - for key, value in vcf_dictionary.items(): - DP = [] - AF = [] - QUAL = [] - content_dict = { - "CHROM": value["CHROM"], - "REF_POS": value["REF_POS"], - "SAMPLE_POS": value["SAMPLE_POS"], - "REF": value["REF"], - "ALT": value["ALT"], - "TYPE": value["TYPE"] - } - for position in value["SAMPLE_POS"]: - if position in alleles_dictionary: - alleles_info = alleles_dictionary[position] - if alleles_info["Allele"] == value["ALT"] or value["TYPE"] in ["INS", "DEL"]: - DP.append(alleles_info["Count"]) - AF.append(alleles_info["Frequency"]) - QUAL.append(alleles_info["Average_Quality"]) - break + for _, value in alleles_dictionary.items(): + pos = value["Position"] + for _, subdict in vcf_dictionary.items(): + if value["Allele_Type"] == "Consensus" and subdict["TYPE"] == "REF": + continue + if 'SAMPLE_POS' in subdict and int(pos) in subdict['SAMPLE_POS']: + DP = [] + AF = [] + QUAL = [] + content_dict = { + "CHROM": subdict["CHROM"], + "REF_POS": subdict["REF_POS"], + "SAMPLE_POS": subdict["SAMPLE_POS"], + "REF": subdict["REF"], + "ALT": subdict["ALT"], + "TYPE": subdict["TYPE"] + } + if value["Allele"] == content_dict["ALT"] or value["Allele_Type"] == "Minority" or content_dict["TYPE"] in ["INS", "DEL", "REF"]: + DP.append(value["Count"]) + AF.append(value["Frequency"]) + QUAL.append(value["Average_Quality"]) else: print("SNP not the same in .fasta file and alleles file") print(value) - print(alleles_info) - else: - print("Position not detected in allele file!") - print("Position") - print(value["SAMPLE_POS"]) - print(value) - content_dict.update({"DP": DP, "AF": AF, "QUAL": QUAL}) - af_vcf_dict[key] = content_dict + print(content_dict) + + content_dict.update({"DP": DP, "AF": AF, "QUAL": QUAL}) + variant = content_dict["CHROM"] + "_" + str(content_dict["REF_POS"]) + "_" + content_dict["ALT"] + + if variant in af_vcf_dict: + af_vcf_dict[variant]["DP"] += DP + af_vcf_dict[variant]["AF"] += AF + af_vcf_dict[variant]["QUAL"] += QUAL + else: + af_vcf_dict[variant] = content_dict + break + return af_vcf_dict From fc748cad49394270707da2e4dd379ad09c3672b0 Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 11:57:00 +0200 Subject: [PATCH 157/321] Added frequency and depth as parameter filters --- .../04-irma/create_irma_vcf.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index e9b09e6d3..03a3f1c7a 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -31,10 +31,26 @@ def parse_args(args=None): required=True, help="Output vcf file", ) + parser.add_argument( + "-f", + "--frequency", + type=float, + default=0.25, + required=True, + help="Minimum Allele Frequency for a variant to be included in the .vcf file. 
Default 0.25.", + ) + parser.add_argument( + "-d", + "--depth", + type=int, + default=10, + required=True, + help="Minimum depth for a variant to be included in the .vcf file. Default 10X.", + ) return parser.parse_args(args) -def alleles_to_dict(alleles_file): +def alleles_to_dict(alleles_file, frequency, depth): """Convert IRMA's allAlleles file to dictionary. Parameters @@ -98,7 +114,8 @@ def alleles_to_dict(alleles_file): line_data = line.strip().split('\t') position = int(line_data[1]) variant_af = float(line_data[5]) - if variant_af > 0.25: + position_dp = float(line_data[4]) + if variant_af >= frequency and position_dp >= depth: entry_dict = {header[i]: line_data[i] for i in range(len(header))} variant = str(line_data[0]) + "_" + str(position) + "_" + str(line_data[2]) alleles_dict[variant] = entry_dict @@ -744,9 +761,11 @@ def main(args=None): alignment = args.alignment all_alleles = args.irma_alleles output_vcf = args.out_vcf + freq = args.frequency + dp = args.depth # Start analysis - alleles_dict = alleles_to_dict(all_alleles) + alleles_dict = alleles_to_dict(all_alleles, freq, dp) alignment_dict = align2dict(alignment) af_vcf_dict = stats_vcf(alignment_dict, alleles_dict) combined_vcf_dict = combine_indels(af_vcf_dict) From ac9dcce7f081fb7e0226992086785b5d72c37017 Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 12:05:23 +0200 Subject: [PATCH 158/321] Added TOTAL_DP to stats --- .../04-irma/create_irma_vcf.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 03a3f1c7a..e18d25eb3 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -476,6 +476,7 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): continue if 'SAMPLE_POS' in subdict and int(pos) in subdict['SAMPLE_POS']: DP = [] + TOTAL_DP = [] AF = [] QUAL = [] content_dict = { @@ -494,12 +495,12 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): print("SNP not the same in .fasta file and alleles file") print(value) print(content_dict) - - content_dict.update({"DP": DP, "AF": AF, "QUAL": QUAL}) + content_dict.update({"DP": DP, "TOTAL_DP": TOTAL_DP, "AF": AF, "QUAL": QUAL}) variant = content_dict["CHROM"] + "_" + str(content_dict["REF_POS"]) + "_" + content_dict["ALT"] if variant in af_vcf_dict: af_vcf_dict[variant]["DP"] += DP + af_vcf_dict[variant]["TOTAL_DP"] += TOTAL_DP af_vcf_dict[variant]["AF"] += AF af_vcf_dict[variant]["QUAL"] += QUAL else: @@ -610,6 +611,7 @@ def combine_indels(vcf_dictionary): } """ + combined_vcf_dict = {} for key, value in vcf_dictionary.items(): content_dict = { @@ -619,6 +621,7 @@ def combine_indels(vcf_dictionary): "REF": value["REF"], "ALT": value["ALT"], "DP": value["DP"], + "TOTAL_DP": value["TOTAL_DP"], "AF": value["AF"], "QUAL": value["QUAL"], "TYPE": value["TYPE"] @@ -630,6 +633,7 @@ def combine_indels(vcf_dictionary): combined_vcf_dict[value["REF_POS"]]["ALT"] += NEW_ALT combined_vcf_dict[value["REF_POS"]]["SAMPLE_POS"].append(value["SAMPLE_POS"][0]) combined_vcf_dict[value["REF_POS"]]["DP"].append(value["DP"][0]) + combined_vcf_dict[value["REF_POS"]]["TOTAL_DP"].append(value["TOTAL_DP"][0]) combined_vcf_dict[value["REF_POS"]]["AF"].append(value["AF"][0]) combined_vcf_dict[value["REF_POS"]]["QUAL"].append(value["QUAL"][0]) else: @@ -692,7 +696,7 
@@ def get_vcf_header(chromosome, sample_name): header String containing all the VCF header lines separated by newline. """ - # Define VCF header + header_source = ["##fileformat=VCFv4.2", "##source=custom"] header_contig = [] if chromosome: @@ -743,10 +747,8 @@ def create_vcf(variants_dict, out_vcf, alignment): POS = value["REF_POS"] REF = value["REF"] ALT = value["ALT"] - QUAL_list = [float(number) for number in value["QUAL"]] - QUAL = str(round(statistics.mean(QUAL_list), 2)) - INFO = "TYPE=" + value["TYPE"] - DP_list = [int(number) for number in value["DP"]] + TOTAL_DP_list = [int(number) for number in value["TOTAL_DP"]] + INFO = "TYPE=" + value["TYPE"] + ';' + "DP=" + str(round(statistics.mean(TOTAL_DP_list))) AF_list = [float(number) for number in value["AF"]] SAMPLE = str(round(statistics.mean(DP_list))) + ':' + str(round(statistics.mean(AF_list), 4)) oline = CHROM + '\t' + str(POS) + '\t' + REF + '\t' + ALT + '\t' + str("".join(QUAL)) + '\t' + INFO + '\t' + FORMAT + '\t' + SAMPLE From 0673ef949cca539380a44d8238afc1e7b43c27b7 Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 12:06:23 +0200 Subject: [PATCH 159/321] Updated VCF data to fit in VCFv4.2 format --- .../04-irma/create_irma_vcf.py | 32 +++++++++++++++---- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index e18d25eb3..44f7b945d 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -707,15 +707,21 @@ def get_vcf_header(chromosome, sample_name): header_info = [ '##INFO=', + '##INFO=' + ] + header_filter = [ + '##FILTER=', ] header_format = [ - '##FORMAT=', - '##FORMAT=', + '##FORMAT=', + '##FORMAT=', + '##FORMAT=', + '##FORMAT=', ] columns = [ - '#CHROM\tPOS\tREF\tALT\tQUAL\tINFO\tFORMAT\t' + sample_name + '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name ] - header = header_source + header_info + header_format + columns + header = header_source + header_info + header_filter + header_format + columns return header @@ -739,7 +745,11 @@ def create_vcf(variants_dict, out_vcf, alignment): chrom = next(iter(variants_dict.values()))["CHROM"] sample = alignment.replace(".align.fasta", "") vcf_header = "\n".join(get_vcf_header(chrom, sample)) - FORMAT = "DP:AF" + FORMAT = "GT:ALT_DP:ALT_QUAL:ALT_FREQ" + ID = "." + QUAL = "." 
+ FILTER = "PASS" + GT = "1" with open(out_vcf, "w") as file_out: file_out.write(vcf_header + "\n") for key, value in variants_dict.items(): @@ -749,9 +759,17 @@ def create_vcf(variants_dict, out_vcf, alignment): ALT = value["ALT"] TOTAL_DP_list = [int(number) for number in value["TOTAL_DP"]] INFO = "TYPE=" + value["TYPE"] + ';' + "DP=" + str(round(statistics.mean(TOTAL_DP_list))) + ALT_QUAL_list = [] + for number in value["QUAL"]: + if number != "NA": + ALT_QUAL_list.append(float(number)) + ALT_QUAL = str(round(statistics.mean(ALT_QUAL_list), 2)) + else: + ALT_QUAL = "NA" + ALT_DP_list = [int(number) for number in value["DP"]] AF_list = [float(number) for number in value["AF"]] - SAMPLE = str(round(statistics.mean(DP_list))) + ':' + str(round(statistics.mean(AF_list), 4)) - oline = CHROM + '\t' + str(POS) + '\t' + REF + '\t' + ALT + '\t' + str("".join(QUAL)) + '\t' + INFO + '\t' + FORMAT + '\t' + SAMPLE + SAMPLE = GT + ':' + str(round(statistics.mean(ALT_DP_list))) + ':' + ALT_QUAL + ':' + str(round(statistics.mean(AF_list), 4)) + oline = CHROM + '\t' + str(POS) + '\t' + ID + '\t' + REF + '\t' + ALT + '\t' + QUAL + '\t' + FILTER + '\t' + INFO + '\t' + FORMAT + '\t' + SAMPLE file_out.write(oline + "\n") From 3254106accc6ad22d8665b50a8207d51876f486f Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 12:07:34 +0200 Subject: [PATCH 160/321] Exclude reference alles in both alignment and stats files --- .../ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 44f7b945d..218e6b49a 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -468,11 +468,12 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): } } """ + af_vcf_dict = {} for _, value in alleles_dictionary.items(): pos = value["Position"] - for _, subdict in vcf_dictionary.items(): - if value["Allele_Type"] == "Consensus" and subdict["TYPE"] == "REF": + for align_pos, subdict in vcf_dictionary.items(): + if (value["Allele_Type"] == "Consensus" and subdict["TYPE"] == "REF") or (value["Allele"] == subdict['REF'] and subdict['TYPE'] not in ["DEL", "INS"]): continue if 'SAMPLE_POS' in subdict and int(pos) in subdict['SAMPLE_POS']: DP = [] From a036242079d1c79ffbc02647b40a51af9c828af2 Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 12:11:44 +0200 Subject: [PATCH 161/321] Added code to update minority allele --- .../ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 218e6b49a..e76548233 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -489,6 +489,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "TYPE": subdict["TYPE"] } if value["Allele"] == content_dict["ALT"] or value["Allele_Type"] == "Minority" or content_dict["TYPE"] in ["INS", "DEL", "REF"]: + if value["Allele_Type"] == "Minority": + content_dict.update({"ALT": value["Allele"]}) + content_dict.update({"TYPE": "SNP"}) 
DP.append(value["Count"]) AF.append(value["Frequency"]) QUAL.append(value["Average_Quality"]) From 2b80fa500b4b2dcd26bb357d708158f5f8d58487 Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 12:12:51 +0200 Subject: [PATCH 162/321] Added code for minority deletions --- .../ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index e76548233..8d8a0860c 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -492,6 +492,13 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): if value["Allele_Type"] == "Minority": content_dict.update({"ALT": value["Allele"]}) content_dict.update({"TYPE": "SNP"}) + if value["Allele"] == "-" and value["Allele_Type"] == "Minority": + REF = vcf_dictionary[align_pos-1]["REF"]+subdict["REF"] + ALT = vcf_dictionary[align_pos-1]["REF"] + content_dict.update({"REF_POS": vcf_dictionary[align_pos-1]["REF_POS"]}) + content_dict.update({"REF": REF}) + content_dict.update({"ALT": ALT}) + content_dict.update({"TYPE": "DEL"}) DP.append(value["Count"]) AF.append(value["Frequency"]) QUAL.append(value["Average_Quality"]) From 5276af32ffc5f1437f2c95dd0fd6c2b2198dc2de Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 12:13:36 +0200 Subject: [PATCH 163/321] Fixed line that was excluding deletions --- .../ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 8d8a0860c..9fdbaab73 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -516,7 +516,7 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): af_vcf_dict[variant]["QUAL"] += QUAL else: af_vcf_dict[variant] = content_dict - break + pass return af_vcf_dict From 321d1e3c0a220e631923c213168cbe2262688632 Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 12:13:49 +0200 Subject: [PATCH 164/321] Added total dp --- .../IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 9fdbaab73..c94585904 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -500,6 +500,7 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): content_dict.update({"ALT": ALT}) content_dict.update({"TYPE": "DEL"}) DP.append(value["Count"]) + TOTAL_DP.append(value["Total"]) AF.append(value["Frequency"]) QUAL.append(value["Average_Quality"]) else: From 15166b12439f7846022794419ca9d02b67e6799d Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 12:14:30 +0200 Subject: [PATCH 165/321] refactored a little bit --- .../ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git 
a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index c94585904..5e26c29ab 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -625,7 +625,7 @@ def combine_indels(vcf_dictionary): """ combined_vcf_dict = {} - for key, value in vcf_dictionary.items(): + for _, value in vcf_dictionary.items(): content_dict = { "CHROM": value["CHROM"], "REF_POS": value["REF_POS"], @@ -658,11 +658,10 @@ def combine_indels(vcf_dictionary): combined_vcf_dict[value["REF_POS"]] = content_dict elif value["TYPE"] == "DEL": sample_found = False - for pos, data in combined_vcf_dict.items(): - var_type = data["TYPE"] - if var_type == "DEL": + for _, data in combined_vcf_dict.items(): + if data["TYPE"] == "DEL": if value["SAMPLE_POS"] == data["SAMPLE_POS"]: - if value["TYPE"] == var_type: + if value["TYPE"] == data["TYPE"]: sample_found = data["REF_POS"] break else: From 2eb5e3bd789746eab737dd3f80cc0ae1f7a80a49 Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 12:24:11 +0200 Subject: [PATCH 166/321] Updated docstrings --- .../04-irma/create_irma_vcf.py | 66 +++++++++++++++---- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 5e26c29ab..e96ea42f8 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -324,6 +324,10 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "9", "10" ], + "TOTAL_DP": [ + "9", + "10" + ], "AF": [ "1", "1" @@ -333,7 +337,7 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "34" ] }, - "19": { + "EPI_ISL_18668201_10_A": { "CHROM": "EPI_ISL_18668201", "REF_POS": 10, "SAMPLE_POS": [ @@ -345,6 +349,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "DP": [ "60" ], + "TOTAL_DP": [ + "72" + ], "AF": [ "0.833333333333333" ], @@ -352,7 +359,7 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "34.0166666666667" ] }, - "7542": { + "EPI_ISL_18668201_7531_TT": { "CHROM": "EPI_ISL_18668201", "REF_POS": 7531, "SAMPLE_POS": [ @@ -364,6 +371,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "DP": [ "74" ], + "TOTAL_DP": [ + "75" + ], "AF": [ "0.986666666666667" ], @@ -371,7 +381,7 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "34.8648648648649" ] }, - "7543": { + "EPI_ISL_18668201_7531_TC": { "CHROM": "EPI_ISL_18668201", "REF_POS": 7531, "SAMPLE_POS": [ @@ -383,6 +393,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "DP": [ "75" ], + "TOTAL_DP": [ + "75" + ], "AF": [ "1" ], @@ -390,7 +403,7 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "35.04" ] }, - "7544": { + "EPI_ISL_18668201_7531_CA": { "CHROM": "EPI_ISL_18668201", "REF_POS": 7531, "SAMPLE_POS": [ @@ -402,6 +415,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "DP": [ "75" ], + "TOTAL_DP": [ + "75" + ], "AF": [ "1" ], @@ -409,9 +425,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "33.8533333333333" ] }, - "10081": { + "EPI_ISL_18668201_10067_A": { "CHROM": "EPI_ISL_18668201", - "REF_POS": 10068, + "REF_POS": 10067, "SAMPLE_POS": [ 10079 ], @@ -421,6 +437,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "DP": [ "10" ], + "TOTAL_DP": 
[ + "10" + ], "AF": [ "1" ], @@ -428,9 +447,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "34.3" ] }, - "10082": { + "EPI_ISL_18668201_10068_-": { "CHROM": "EPI_ISL_18668201", - "REF_POS": 10069, + "REF_POS": 10068, "SAMPLE_POS": [ 10079 ], @@ -440,6 +459,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "DP": [ "10" ], + "TOTAL_DP": [ + "10" + ], "AF": [ "1" ], @@ -447,9 +469,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "34.3" ] }, - "10083": { + "EPI_ISL_18668201_10069_-": { "CHROM": "EPI_ISL_18668201", - "REF_POS": 10070, + "REF_POS": 10069, "SAMPLE_POS": [ 10079 ], @@ -459,6 +481,9 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "DP": [ "10" ], + "TOTAL_DP": [ + "10" + ], "AF": [ "1" ], @@ -547,6 +572,10 @@ def combine_indels(vcf_dictionary): "DP": [ "9" ], + "TOTAL_DP": [ + "9", + "10" + ], "AF": [ "1" ], @@ -564,7 +593,10 @@ def combine_indels(vcf_dictionary): "REF": "T", "ALT": "A", "DP": [ - "60" + "72" + ], + "TOTAL_DP": [ + "10" ], "AF": [ "0.833333333333333" @@ -589,6 +621,11 @@ def combine_indels(vcf_dictionary): "75", "75" ], + "TOTAL_DP": [ + "75", + "75", + "75" + ], "AF": [ "0.986666666666667", "1", @@ -601,9 +638,9 @@ def combine_indels(vcf_dictionary): ], "TYPE": "INS" }, - "10068": { + "10067": { "CHROM": "EPI_ISL_18668201", - "REF_POS": 10068, + "REF_POS": 10067, "SAMPLE_POS": [ 10079 ], @@ -612,6 +649,9 @@ def combine_indels(vcf_dictionary): "DP": [ "10" ], + "TOTAL_DP": [ + "10", + ], "AF": [ "1" ], From 182af9b41ef3a63a18a980d0ae3c0238c91834f6 Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 12:25:17 +0200 Subject: [PATCH 167/321] black --- .../04-irma/create_irma_vcf.py | 140 +++++++++++++----- 1 file changed, 102 insertions(+), 38 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index e96ea42f8..b12d27ec2 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -107,17 +107,19 @@ def alleles_to_dict(alleles_file, frequency, depth): alleles_dict = {} with open(alleles_file, "r") as file: - header = file.readline().strip().split('\t') + header = file.readline().strip().split("\t") for line in file: - while line.count('\t') < len(header) - 1: + while line.count("\t") < len(header) - 1: line += file.readline() - line_data = line.strip().split('\t') + line_data = line.strip().split("\t") position = int(line_data[1]) variant_af = float(line_data[5]) position_dp = float(line_data[4]) if variant_af >= frequency and position_dp >= depth: entry_dict = {header[i]: line_data[i] for i in range(len(header))} - variant = str(line_data[0]) + "_" + str(position) + "_" + str(line_data[2]) + variant = ( + str(line_data[0]) + "_" + str(position) + "_" + str(line_data[2]) + ) alleles_dict[variant] = entry_dict return alleles_dict @@ -246,9 +248,9 @@ def align2dict(alignment_file): "CHROM": CHROM, "REF_POS": ref_position, "SAMPLE_POS": [sample_position], - "REF": sample_seq[i-1], - "ALT": sample_seq[i-1] + sample_base, - "TYPE": "INS" + "REF": sample_seq[i - 1], + "ALT": sample_seq[i - 1] + sample_base, + "TYPE": "INS", } vcf_dict[align_position] = content_dict elif ref_position == 1 and len(SAMPLE_POS) > 1: @@ -258,37 +260,48 @@ def align2dict(alignment_file): "SAMPLE_POS": SAMPLE_POS, "REF": ref_base, "ALT": ALT + sample_base, - "TYPE": "INS" + "TYPE": "INS", } 
vcf_dict[align_position] = content_dict elif sample_base == "-" and ref_base != "N": content_dict = { "CHROM": CHROM, - "REF_POS": ref_position-1, + "REF_POS": ref_position - 1, "SAMPLE_POS": [sample_position], - "REF": sample_seq[i-1] + ref_base, - "ALT": sample_seq[i-1], - "TYPE": "DEL" + "REF": sample_seq[i - 1] + ref_base, + "ALT": sample_seq[i - 1], + "TYPE": "DEL", } vcf_dict[align_position] = content_dict - elif ref_base != sample_base and ref_base != "N" and ref_base != "-" and sample_base != "N" and sample_base != "-": + elif ( + ref_base != sample_base + and ref_base != "N" + and ref_base != "-" + and sample_base != "N" + and sample_base != "-" + ): content_dict = { "CHROM": CHROM, "REF_POS": ref_position, "SAMPLE_POS": [sample_position], "REF": ref_base, "ALT": sample_base, - "TYPE": "SNP" + "TYPE": "SNP", } vcf_dict[align_position] = content_dict - elif ref_base != "N" and ref_base != "-" and sample_base != "N" and sample_base != "-": + elif ( + ref_base != "N" + and ref_base != "-" + and sample_base != "N" + and sample_base != "-" + ): content_dict = { "CHROM": CHROM, "REF_POS": ref_position, "SAMPLE_POS": [sample_position], "REF": ref_base, "ALT": sample_base, - "TYPE": "REF" + "TYPE": "REF", } vcf_dict[align_position] = content_dict return vcf_dict @@ -498,9 +511,12 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): for _, value in alleles_dictionary.items(): pos = value["Position"] for align_pos, subdict in vcf_dictionary.items(): - if (value["Allele_Type"] == "Consensus" and subdict["TYPE"] == "REF") or (value["Allele"] == subdict['REF'] and subdict['TYPE'] not in ["DEL", "INS"]): + if (value["Allele_Type"] == "Consensus" and subdict["TYPE"] == "REF") or ( + value["Allele"] == subdict["REF"] + and subdict["TYPE"] not in ["DEL", "INS"] + ): continue - if 'SAMPLE_POS' in subdict and int(pos) in subdict['SAMPLE_POS']: + if "SAMPLE_POS" in subdict and int(pos) in subdict["SAMPLE_POS"]: DP = [] TOTAL_DP = [] AF = [] @@ -511,16 +527,22 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): "SAMPLE_POS": subdict["SAMPLE_POS"], "REF": subdict["REF"], "ALT": subdict["ALT"], - "TYPE": subdict["TYPE"] + "TYPE": subdict["TYPE"], } - if value["Allele"] == content_dict["ALT"] or value["Allele_Type"] == "Minority" or content_dict["TYPE"] in ["INS", "DEL", "REF"]: + if ( + value["Allele"] == content_dict["ALT"] + or value["Allele_Type"] == "Minority" + or content_dict["TYPE"] in ["INS", "DEL", "REF"] + ): if value["Allele_Type"] == "Minority": content_dict.update({"ALT": value["Allele"]}) content_dict.update({"TYPE": "SNP"}) if value["Allele"] == "-" and value["Allele_Type"] == "Minority": - REF = vcf_dictionary[align_pos-1]["REF"]+subdict["REF"] - ALT = vcf_dictionary[align_pos-1]["REF"] - content_dict.update({"REF_POS": vcf_dictionary[align_pos-1]["REF_POS"]}) + REF = vcf_dictionary[align_pos - 1]["REF"] + subdict["REF"] + ALT = vcf_dictionary[align_pos - 1]["REF"] + content_dict.update( + {"REF_POS": vcf_dictionary[align_pos - 1]["REF_POS"]} + ) content_dict.update({"REF": REF}) content_dict.update({"ALT": ALT}) content_dict.update({"TYPE": "DEL"}) @@ -532,8 +554,16 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): print("SNP not the same in .fasta file and alleles file") print(value) print(content_dict) - content_dict.update({"DP": DP, "TOTAL_DP": TOTAL_DP, "AF": AF, "QUAL": QUAL}) - variant = content_dict["CHROM"] + "_" + str(content_dict["REF_POS"]) + "_" + content_dict["ALT"] + content_dict.update( + {"DP": DP, "TOTAL_DP": TOTAL_DP, "AF": AF, "QUAL": QUAL} + ) + variant = ( + 
content_dict["CHROM"] + + "_" + + str(content_dict["REF_POS"]) + + "_" + + content_dict["ALT"] + ) if variant in af_vcf_dict: af_vcf_dict[variant]["DP"] += DP @@ -676,16 +706,20 @@ def combine_indels(vcf_dictionary): "TOTAL_DP": value["TOTAL_DP"], "AF": value["AF"], "QUAL": value["QUAL"], - "TYPE": value["TYPE"] + "TYPE": value["TYPE"], } if value["TYPE"] == "INS": if value["REF_POS"] in combined_vcf_dict: if value["TYPE"] == combined_vcf_dict[value["REF_POS"]]["TYPE"]: NEW_ALT = value["ALT"].replace(value["REF"], "") combined_vcf_dict[value["REF_POS"]]["ALT"] += NEW_ALT - combined_vcf_dict[value["REF_POS"]]["SAMPLE_POS"].append(value["SAMPLE_POS"][0]) + combined_vcf_dict[value["REF_POS"]]["SAMPLE_POS"].append( + value["SAMPLE_POS"][0] + ) combined_vcf_dict[value["REF_POS"]]["DP"].append(value["DP"][0]) - combined_vcf_dict[value["REF_POS"]]["TOTAL_DP"].append(value["TOTAL_DP"][0]) + combined_vcf_dict[value["REF_POS"]]["TOTAL_DP"].append( + value["TOTAL_DP"][0] + ) combined_vcf_dict[value["REF_POS"]]["AF"].append(value["AF"][0]) combined_vcf_dict[value["REF_POS"]]["QUAL"].append(value["QUAL"][0]) else: @@ -751,14 +785,12 @@ def get_vcf_header(chromosome, sample_name): header_source = ["##fileformat=VCFv4.2", "##source=custom"] header_contig = [] if chromosome: - header_contig += [ - "##contig=" - ] + header_contig += ["##contig="] header_source += header_contig header_info = [ '##INFO=', - '##INFO=' + '##INFO=', ] header_filter = [ '##FILTER=', @@ -769,9 +801,7 @@ def get_vcf_header(chromosome, sample_name): '##FORMAT=', '##FORMAT=', ] - columns = [ - '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name - ] + columns = ["#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample_name] header = header_source + header_info + header_filter + header_format + columns return header @@ -809,7 +839,13 @@ def create_vcf(variants_dict, out_vcf, alignment): REF = value["REF"] ALT = value["ALT"] TOTAL_DP_list = [int(number) for number in value["TOTAL_DP"]] - INFO = "TYPE=" + value["TYPE"] + ';' + "DP=" + str(round(statistics.mean(TOTAL_DP_list))) + INFO = ( + "TYPE=" + + value["TYPE"] + + ";" + + "DP=" + + str(round(statistics.mean(TOTAL_DP_list))) + ) ALT_QUAL_list = [] for number in value["QUAL"]: if number != "NA": @@ -819,8 +855,36 @@ def create_vcf(variants_dict, out_vcf, alignment): ALT_QUAL = "NA" ALT_DP_list = [int(number) for number in value["DP"]] AF_list = [float(number) for number in value["AF"]] - SAMPLE = GT + ':' + str(round(statistics.mean(ALT_DP_list))) + ':' + ALT_QUAL + ':' + str(round(statistics.mean(AF_list), 4)) - oline = CHROM + '\t' + str(POS) + '\t' + ID + '\t' + REF + '\t' + ALT + '\t' + QUAL + '\t' + FILTER + '\t' + INFO + '\t' + FORMAT + '\t' + SAMPLE + SAMPLE = ( + GT + + ":" + + str(round(statistics.mean(ALT_DP_list))) + + ":" + + ALT_QUAL + + ":" + + str(round(statistics.mean(AF_list), 4)) + ) + oline = ( + CHROM + + "\t" + + str(POS) + + "\t" + + ID + + "\t" + + REF + + "\t" + + ALT + + "\t" + + QUAL + + "\t" + + FILTER + + "\t" + + INFO + + "\t" + + FORMAT + + "\t" + + SAMPLE + ) file_out.write(oline + "\n") From a698f50ef8017524e513e76b28859588959fdebc Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 1 Aug 2024 13:14:25 +0200 Subject: [PATCH 168/321] Updated changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2dc42510..8da193baa 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,6 +56,7 @@ Code contributions to the new version: - Updated services.json, mtbseq's lablog, viralrecon's 
lablog and assembly's config file [#299](https://github.com/BU-ISCIII/buisciii-tools/pull/299) - Added lablog to automate gene characterization with emmtyper, including unzipping assemblies. [#300](https://github.com/BU-ISCIII/buisciii-tools/pull/300) - Fixed 99-stats (MAG) template. [#301](https://github.com/BU-ISCIII/buisciii-tools/pull/301) +- Created a python script to process IRMA's results and create a standard vcf file against reference. [#304](https://github.com/BU-ISCIII/buisciii-tools/pull/304) - Fixed IRMA's lablog so that the sequences of the samples are not displayed several times neither in the .txt files of each influenza type nor in all_samples_completo.txt [#305](https://github.com/BU-ISCIII/buisciii-tools/pull/305) - Modified bioinfo_doc.py so that new lines in the delivery message are applied in the email [#307](https://github.com/BU-ISCIII/buisciii-tools/pull/307) - Added several improvements in lablog_viralrecon (created log files, modified check_references function behaviour, enabled config files regeneration) [#306](https://github.com/BU-ISCIII/buisciii-tools/pull/306) From 432ebd88f7cb9f5bb8008c33f4ea0421930d718e Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 6 Aug 2024 12:58:04 +0200 Subject: [PATCH 169/321] Managed deletions at the begining of alignment --- .../04-irma/create_irma_vcf.py | 61 +++++++++++++++---- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index b12d27ec2..2d1129446 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -264,15 +264,26 @@ def align2dict(alignment_file): } vcf_dict[align_position] = content_dict elif sample_base == "-" and ref_base != "N": - content_dict = { - "CHROM": CHROM, - "REF_POS": ref_position - 1, - "SAMPLE_POS": [sample_position], - "REF": sample_seq[i - 1] + ref_base, - "ALT": sample_seq[i - 1], - "TYPE": "DEL", - } - vcf_dict[align_position] = content_dict + if sample_position == 0: + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position, + "SAMPLE_POS": [sample_position], + "REF": ref_base + ref_seq[i + 1], + "ALT": ref_seq[i + 1], + "TYPE": "DEL", + } + vcf_dict[align_position] = content_dict + else: + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position - 1, + "SAMPLE_POS": [sample_position], + "REF": sample_seq[i - 1] + ref_base, + "ALT": sample_seq[i - 1], + "TYPE": "DEL", + } + vcf_dict[align_position] = content_dict elif ( ref_base != sample_base and ref_base != "N" @@ -516,6 +527,29 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): and subdict["TYPE"] not in ["DEL", "INS"] ): continue + if 0 in subdict["SAMPLE_POS"] and len(subdict["SAMPLE_POS"]) == 1: + content_dict = { + "CHROM": subdict["CHROM"], + "REF_POS": subdict["REF_POS"], + "SAMPLE_POS": subdict["SAMPLE_POS"], + "REF": subdict["REF"], + "ALT": subdict["ALT"], + "TYPE": subdict["TYPE"], + "DP": ["NA"], + "TOTAL_DP": ["NA"], + "AF": ["NA"], + "QUAL": ["NA"], + } + variant = ( + content_dict["CHROM"] + + "_" + + str(content_dict["REF_POS"]) + + "_" + + content_dict["ALT"] + ) + af_vcf_dict[variant] = content_dict + pass + if "SAMPLE_POS" in subdict and int(pos) in subdict["SAMPLE_POS"]: DP = [] TOTAL_DP = [] @@ -745,8 +779,12 @@ def combine_indels(vcf_dictionary): print("combined_vcf_dict") 
print(combined_vcf_dict[value["REF_POS"]]) if sample_found: - NEW_REF = value["REF"].replace(value["ALT"], "") - combined_vcf_dict[sample_found]["REF"] += NEW_REF + if 0 in value["SAMPLE_POS"] and len(value["SAMPLE_POS"]) == 1: + combined_vcf_dict[sample_found]["REF"] += value["ALT"] + combined_vcf_dict[sample_found]["ALT"] = value["ALT"] + else: + NEW_REF = value["REF"][len(value["ALT"]):] + combined_vcf_dict[sample_found]["REF"] += NEW_REF else: combined_vcf_dict[value["REF_POS"]] = content_dict elif value["TYPE"] == "SNP": @@ -909,3 +947,4 @@ def main(args=None): if __name__ == "__main__": sys.exit(main()) + From 69f7193421eca9bc6a79a4d39f345ce5f7241c30 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 6 Aug 2024 12:58:44 +0200 Subject: [PATCH 170/321] Managed missing data in stats --- .../04-irma/create_irma_vcf.py | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 2d1129446..613d7f909 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -876,31 +876,57 @@ def create_vcf(variants_dict, out_vcf, alignment): POS = value["REF_POS"] REF = value["REF"] ALT = value["ALT"] - TOTAL_DP_list = [int(number) for number in value["TOTAL_DP"]] + TOTAL_DP_list = [] + for number in value["TOTAL_DP"]: + if number != "NA": + TOTAL_DP_list.append(int(number)) + if TOTAL_DP_list: + TOTAL_DP = str(round(statistics.mean(TOTAL_DP_list))) + else: + TOTAL_DP = "NA" + INFO = ( "TYPE=" + value["TYPE"] + ";" + "DP=" - + str(round(statistics.mean(TOTAL_DP_list))) + + TOTAL_DP ) ALT_QUAL_list = [] for number in value["QUAL"]: if number != "NA": ALT_QUAL_list.append(float(number)) - ALT_QUAL = str(round(statistics.mean(ALT_QUAL_list), 2)) - else: - ALT_QUAL = "NA" - ALT_DP_list = [int(number) for number in value["DP"]] - AF_list = [float(number) for number in value["AF"]] + if ALT_QUAL_list: + ALT_QUAL = str(round(statistics.mean(ALT_QUAL_list), 2)) + else: + ALT_QUAL = "NA" + + ALT_DP_list = [] + for number in value["DP"]: + if number != "NA": + ALT_DP_list.append(int(number)) + if ALT_DP_list: + ALT_DP = str(round(statistics.mean(ALT_DP_list), 0)) + else: + ALT_DP = "NA" + + AF_list = [] + for number in value["AF"]: + if number != "NA": + AF_list.append(float(number)) + if AF_list: + AF = str(round(statistics.mean(AF_list), 4)) + else: + AF = "NA" + SAMPLE = ( GT + ":" - + str(round(statistics.mean(ALT_DP_list))) + + ALT_DP + ":" + ALT_QUAL + ":" - + str(round(statistics.mean(AF_list), 4)) + + AF ) oline = ( CHROM From d5c64ba109cfa13ddaebf69c2feb1e52d8787de6 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 6 Aug 2024 12:59:44 +0200 Subject: [PATCH 171/321] Managed minority deletions --- .../ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 613d7f909..9f62e73dc 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -778,6 +778,9 @@ def combine_indels(vcf_dictionary): print(value) print("combined_vcf_dict") 
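                        # annotation (not in the original patch): the new branch added below
                        # also matches a minority (<0.5 AF) deletion whose preceding sample
                        # position falls inside an already-open DEL record, so low-frequency
                        # deletion runs can extend it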
print(combined_vcf_dict[value["REF_POS"]]) + elif minority and prev_sample_pos in data["SAMPLE_POS"]: + sample_found = data["REF_POS"] + break if sample_found: if 0 in value["SAMPLE_POS"] and len(value["SAMPLE_POS"]) == 1: combined_vcf_dict[sample_found]["REF"] += value["ALT"] @@ -785,6 +788,11 @@ def combine_indels(vcf_dictionary): else: NEW_REF = value["REF"][len(value["ALT"]):] combined_vcf_dict[sample_found]["REF"] += NEW_REF + if minority: + combined_vcf_dict[sample_found]["SAMPLE_POS"] += value["SAMPLE_POS"] + combined_vcf_dict[sample_found]["DP"] += value["DP"] + combined_vcf_dict[sample_found]["TOTAL_DP"] += value["TOTAL_DP"] + combined_vcf_dict[sample_found]["AF"] += value["AF"] else: combined_vcf_dict[value["REF_POS"]] = content_dict elif value["TYPE"] == "SNP": From 8ba38eeea871d11b8dbe6aacfb8052855db53bd6 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 6 Aug 2024 13:03:08 +0200 Subject: [PATCH 172/321] managed minority deletions --- .../ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 9f62e73dc..47f0f7859 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -766,6 +766,14 @@ def combine_indels(vcf_dictionary): combined_vcf_dict[value["REF_POS"]] = content_dict elif value["TYPE"] == "DEL": sample_found = False + minority = False + for af in value["AF"]: + if float(af) < 0.5: + minority = True + prev_sample_pos = "" + if minority and len(value["SAMPLE_POS"]) == 1: + sample_pos = value["SAMPLE_POS"][0] + prev_sample_pos = sample_pos - 1 for _, data in combined_vcf_dict.items(): if data["TYPE"] == "DEL": if value["SAMPLE_POS"] == data["SAMPLE_POS"]: From 878a42b1aed6e2f17ee0bb53c8e14d18b0de4b1e Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 6 Aug 2024 13:05:47 +0200 Subject: [PATCH 173/321] black --- .../04-irma/create_irma_vcf.py | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 47f0f7859..462d19b57 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -794,10 +794,12 @@ def combine_indels(vcf_dictionary): combined_vcf_dict[sample_found]["REF"] += value["ALT"] combined_vcf_dict[sample_found]["ALT"] = value["ALT"] else: - NEW_REF = value["REF"][len(value["ALT"]):] + NEW_REF = value["REF"][len(value["ALT"]) :] combined_vcf_dict[sample_found]["REF"] += NEW_REF if minority: - combined_vcf_dict[sample_found]["SAMPLE_POS"] += value["SAMPLE_POS"] + combined_vcf_dict[sample_found]["SAMPLE_POS"] += value[ + "SAMPLE_POS" + ] combined_vcf_dict[sample_found]["DP"] += value["DP"] combined_vcf_dict[sample_found]["TOTAL_DP"] += value["TOTAL_DP"] combined_vcf_dict[sample_found]["AF"] += value["AF"] @@ -901,13 +903,7 @@ def create_vcf(variants_dict, out_vcf, alignment): else: TOTAL_DP = "NA" - INFO = ( - "TYPE=" - + value["TYPE"] - + ";" - + "DP=" - + TOTAL_DP - ) + INFO = "TYPE=" + value["TYPE"] + ";" + "DP=" + TOTAL_DP ALT_QUAL_list = [] for number in value["QUAL"]: if number != "NA": @@ -935,15 +931,7 @@ def 
create_vcf(variants_dict, out_vcf, alignment): else: AF = "NA" - SAMPLE = ( - GT - + ":" - + ALT_DP - + ":" - + ALT_QUAL - + ":" - + AF - ) + SAMPLE = GT + ":" + ALT_DP + ":" + ALT_QUAL + ":" + AF oline = ( CHROM + "\t" @@ -989,4 +977,3 @@ def main(args=None): if __name__ == "__main__": sys.exit(main()) - From b1f4ba6d7cddebe5ed41f7d0bfbabbe6e4722cb9 Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 7 Aug 2024 09:48:50 +0200 Subject: [PATCH 174/321] Fixed alt_allele in insertions --- .../ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 462d19b57..47c7c2523 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -745,7 +745,7 @@ def combine_indels(vcf_dictionary): if value["TYPE"] == "INS": if value["REF_POS"] in combined_vcf_dict: if value["TYPE"] == combined_vcf_dict[value["REF_POS"]]["TYPE"]: - NEW_ALT = value["ALT"].replace(value["REF"], "") + NEW_ALT = value["ALT"][len(value["REF"]) :] combined_vcf_dict[value["REF_POS"]]["ALT"] += NEW_ALT combined_vcf_dict[value["REF_POS"]]["SAMPLE_POS"].append( value["SAMPLE_POS"][0] From 258c2cbfdfac9b5452b3f8b13a195b3074d063c0 Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 7 Aug 2024 09:49:33 +0200 Subject: [PATCH 175/321] Added management of low af insertions at the end of the sequence --- .../04-irma/create_irma_vcf.py | 45 ++++++++++++++++++- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py index 47c7c2523..1cd416723 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py @@ -318,7 +318,7 @@ def align2dict(alignment_file): return vcf_dict -def stats_vcf(vcf_dictionary, alleles_dictionary): +def stats_vcf(vcf_dictionary, alleles_dictionary, last_pos, last_allele): """Add stats to VCF dictionary. 
Parameters @@ -521,6 +521,41 @@ def stats_vcf(vcf_dictionary, alleles_dictionary): af_vcf_dict = {} for _, value in alleles_dictionary.items(): pos = value["Position"] + chrom = next(iter(vcf_dictionary.values()))["CHROM"] + + if int(pos) > last_pos and value["Allele_Type"] == "Minority": + content_dict = { + "CHROM": chrom, + "REF_POS": last_pos, + "SAMPLE_POS": [pos], + "REF": last_allele, + "ALT": last_allele + value["Allele"], + "TYPE": "INS", + "DP": [value["Count"]], + "TOTAL_DP": [value["Total"]], + "AF": [value["Frequency"]], + "QUAL": [value["Frequency"]], + } + + variant = ( + content_dict["CHROM"] + + "_" + + str(content_dict["REF_POS"]) + + "_" + + "final_ins" + ) + + if variant in af_vcf_dict: + af_vcf_dict[variant]["DP"] += content_dict["DP"] + af_vcf_dict[variant]["TOTAL_DP"] += content_dict["TOTAL_DP"] + af_vcf_dict[variant]["AF"] += content_dict["AF"] + af_vcf_dict[variant]["QUAL"] += content_dict["QUAL"] + af_vcf_dict[variant]["SAMPLE_POS"] += content_dict["SAMPLE_POS"] + af_vcf_dict[variant]["ALT"] += value["Allele"] + else: + af_vcf_dict[variant] = content_dict + pass + for align_pos, subdict in vcf_dictionary.items(): if (value["Allele_Type"] == "Consensus" and subdict["TYPE"] == "REF") or ( value["Allele"] == subdict["REF"] @@ -970,7 +1005,13 @@ def main(args=None): # Start analysis alleles_dict = alleles_to_dict(all_alleles, freq, dp) alignment_dict = align2dict(alignment) - af_vcf_dict = stats_vcf(alignment_dict, alleles_dict) + last_ref_pos = max(position["REF_POS"] for position in alignment_dict.values()) + last_ref_allele = None + for _, value in alignment_dict.items(): + if value["REF_POS"] == last_ref_pos: + last_ref_allele = value["REF"] + break + af_vcf_dict = stats_vcf(alignment_dict, alleles_dict, last_ref_pos, last_ref_allele) combined_vcf_dict = combine_indels(af_vcf_dict) create_vcf(combined_vcf_dict, output_vcf, alignment) From e229ed7d3fcca5ef1abb87fb0adb2733f3a0994c Mon Sep 17 00:00:00 2001 From: svarona Date: Fri, 16 Aug 2024 12:47:43 +0200 Subject: [PATCH 176/321] Added mvmoneo to sftp users --- bu_isciii/templates/sftp_user.json | 1 + 1 file changed, 1 insertion(+) diff --git a/bu_isciii/templates/sftp_user.json b/bu_isciii/templates/sftp_user.json index fff8c4a4b..04773c8fb 100755 --- a/bu_isciii/templates/sftp_user.json +++ b/bu_isciii/templates/sftp_user.json @@ -50,5 +50,6 @@ "svazquez": ["Labvirusres"], "ycampos": ["LabUfiecMithocondrial"], "anadonoso": ["Labenterovirus"], + "mvmoneo": ["SpainUDP"], "bioinfoadm": ["test"] } From 6e6e27ef3364b59f1aeb4a4df6a45fa34c3770b0 Mon Sep 17 00:00:00 2001 From: svarona Date: Fri, 16 Aug 2024 12:48:56 +0200 Subject: [PATCH 177/321] Updated changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8da193baa..5b352a739 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,7 @@ Code contributions to the new version: - Modified bioinfo_doc.py so that new lines in the delivery message are applied in the email [#307](https://github.com/BU-ISCIII/buisciii-tools/pull/307) - Added several improvements in lablog_viralrecon (created log files, modified check_references function behaviour, enabled config files regeneration) [#306](https://github.com/BU-ISCIII/buisciii-tools/pull/306) - Fixed bug when lablog_viralrecon tries to download references that don't belong to any family. [#310](https://github.com/BU-ISCIII/buisciii-tools/pull/310) +-Added mvmoneo to SFTP users. 
[#317](https://github.com/BU-ISCIII/buisciii-tools/pull/317)

### Modules

From 5db8f81a3c03c41b3a4daf4ba0644cecd5e2dc14 Mon Sep 17 00:00:00 2001
From: svarona
Date: Fri, 16 Aug 2024 12:50:06 +0200
Subject: [PATCH 178/321] fixed typo

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5b352a739..c81d1d99d 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -61,7 +61,7 @@ Code contributions to the new version:
 - Modified bioinfo_doc.py so that new lines in the delivery message are applied in the email [#307](https://github.com/BU-ISCIII/buisciii-tools/pull/307)
 - Added several improvements in lablog_viralrecon (created log files, modified check_references function behaviour, enabled config files regeneration) [#306](https://github.com/BU-ISCIII/buisciii-tools/pull/306)
 - Fixed bug when lablog_viralrecon tries to download references that don't belong to any family. [#310](https://github.com/BU-ISCIII/buisciii-tools/pull/310)
--Added mvmoneo to SFTP users. [#317](https://github.com/BU-ISCIII/buisciii-tools/pull/317)
+- Added mvmoneo to SFTP users. [#317](https://github.com/BU-ISCIII/buisciii-tools/pull/317)

 ### Modules

From 051504cbc664cd9a880647a66be7dca0adf26280 Mon Sep 17 00:00:00 2001
From: svarona
Date: Wed, 14 Aug 2024 13:35:34 +0200
Subject: [PATCH 179/321] updated new tx2gene table name in nf-core pipeline

---
 .../02-differential_expression/differential_expression.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/differential_expression.R b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/differential_expression.R
index 160d69dcc..f684a1841 100644
--- a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/differential_expression.R
+++ b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/differential_expression.R
@@ -550,13 +550,14 @@ cat(blue("########################\nStarting with loading data\n################

 ####LOAD TRANSCRIPT RELATION DATA FILE #########################
 if (opt$differential_expression != "DEM") {
-  tx2gene <- read.table(file.path(opt$rnaseq_dir, "star_salmon", "salmon_tx2gene.tsv"), header = F)
+  tx2gene <- read.table(file.path(opt$rnaseq_dir, "star_salmon", "tx2gene.tsv"), header = F)
   colnames(tx2gene) <- c("TXNAME", "GENEID", "gene_name")
   if ( opt$differential_expression == "DEG") {
     gene_genename <- tx2gene[,c(2:3)]
     gene_genename <- gene_genename %>% distinct()
   }
 }
+
 ####LOAD CLINICAL DATA FILE #########################
 samples_clin_data <- load_sample_data(clinical_data = opt$sample_data, group = opt$group_col)

From 7c82f7ee132b10cfc9464ed28b7c71335c156f66 Mon Sep 17 00:00:00 2001
From: svarona
Date: Wed, 14 Aug 2024 13:35:49 +0200
Subject: [PATCH 180/321] updated plots for DETs

---
 .../differential_expression.R | 27 ++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/differential_expression.R b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/differential_expression.R
index f684a1841..f1a7f397a 100644
--- a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/differential_expression.R
+++ b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/differential_expression.R
@@ -363,9
+363,17 @@ differential_plots <- function(res_de, de_results, ntd_subset, dds_subset){ rownames(df) <- colnames(ntd_subset) to_plot <- assay_ntd[select,] to_plot_geneid <- as.data.frame(rownames(to_plot)) - colnames(to_plot_geneid) <- "GeneID" - to_plot_geneid_merged <- merge(x = to_plot_geneid, y = gene_genename, by.x="GeneID", by.y = "GENEID", all.x = TRUE, all.y = FALSE) - rownames(to_plot) <- to_plot_geneid_merged$gene_name + if ( opt$differential_expression == "DEG") { + colnames(to_plot_geneid) <- "GeneID" + to_plot_geneid_merged <- merge(x = to_plot_geneid, y = gene_genename, by.x="GeneID", by.y = "GENEID", all.x = TRUE, all.y = FALSE) + rownames(to_plot) <- to_plot_geneid_merged$gene_name + } + + if ( opt$differential_expression == "DET") { + colnames(to_plot_geneid) <- "TranscriptID" + rownames(to_plot) <- to_plot_geneid$TranscriptID + } + pdf(file="Differential_expression/DESeq2/heatmapCount_top20_differentially_expressed.pdf") pheatmap(to_plot, cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=df, main="Top 20 significant genes") @@ -444,9 +452,16 @@ quality_plots <- function(data_subset){ to_plot <- assay(data_subset$subset_ntd)[select,] to_plot_geneid <- as.data.frame(rownames(to_plot)) - colnames(to_plot_geneid) <- "GeneID" - to_plot_geneid_merged <- merge(x = to_plot_geneid, y = gene_genename, by.x="GeneID", by.y = "GENEID", all.x = TRUE, all.y = FALSE) - rownames(to_plot) <- to_plot_geneid_merged$gene_name + if ( opt$differential_expression == "DEG") { + colnames(to_plot_geneid) <- "GeneID" + to_plot_geneid_merged <- merge(x = to_plot_geneid, y = gene_genename, by.x="GeneID", by.y = "GENEID", all.x = TRUE, all.y = FALSE) + rownames(to_plot) <- to_plot_geneid_merged$gene_name + } + + if ( opt$differential_expression == "DET") { + colnames(to_plot_geneid) <- "TranscriptID" + rownames(to_plot) <- to_plot_geneid$TranscriptID + } pdf(file="Quality_plots/DESeq2/heatmapCount_top20_highest_expression.pdf") pheatmap(to_plot, cluster_rows=FALSE, show_rownames=TRUE, From 1d42ab1a58d73b0c420ee4423823624c051fd258 Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 14 Aug 2024 13:36:12 +0200 Subject: [PATCH 181/321] added original timeseries script --- .../time_series_differential_expression.R | 343 ++++++++++++++++++ 1 file changed, 343 insertions(+) create mode 100644 bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R diff --git a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R new file mode 100644 index 000000000..5ad14850b --- /dev/null +++ b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R @@ -0,0 +1,343 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## LOAD LIBRARIES ## +################################################ +################################################ + +####DESeq2 libraries +library(DESeq2) +library(tximport) +library(readr) + +####fishpond libraries +library(fishpond) +library(tximeta) +library(SummarizedExperiment) +####Other libraries +library(optparse) +#library(xlsx) +#options(java.parameters = "-Xmx4G") +library(dplyr) +library(pheatmap) +library(RColorBrewer) +library(ggplot2) +library(vsn) +library(crayon) 
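+# NOTE (annotation, not in the original template): tidytable/data.table below are
+# loaded for fast data-frame wrangling; as with the commented-out xlsx import
+# above, not every package in this header may be strictly required by all paths.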
+library(tidytable)
+library(data.table)
+
+
+################################################
+################################################
+## PARSE COMMAND-LINE PARAMETERS ##
+################################################
+################################################
+cat(cyan$bgRed$bold("########################\nStarting differential expression pipeline\n###############################\n"))
+
+option_list <- list(
+  make_option(c("-r", "--rnaseq_dir" ), type="character" , default='../../01-rnaseq' , metavar="path" , help="Path to rna-seq results" ),
+  make_option(c("-c", "--clinical_data" ), type="character" , default='./clinical_data.txt' , metavar="path" , help="Path to clinical data file" ),
+  make_option(c("-g", "--group_col" ), type="character" , default='Group' , metavar="string" , help="Colname with the sample classes in sample_data of the experiment for the DE." ),
+  make_option(c("-n", "--norm_counts" ), type="logical" , default=FALSE , metavar="boolean", help="Create table with normalized counts" ),
+  make_option(c("-q", "--quality_plots" ), type="logical" , default=TRUE , metavar="boolean", help="Create quality plots or not." )
+)
+
+opt_parser <- OptionParser(option_list=option_list)
+opt <- parse_args(opt_parser)
+
+cat(blue$bold("########################\nRunning analysis with the following params:\n###############################\n"))
+cat(blue("-Path to RNAseq input folder: ")) + cat(blue(opt$rnaseq_dir))+cat(blue("\n"))
+cat(blue("-Path to samples clinical data: ")) + cat(blue(opt$clinical_data))+cat(blue("\n"))
+cat(blue("-Column with the group info: ")) + cat(blue(opt$group_col))+cat(blue("\n"))
+if (opt$norm_counts) {
+  cat(blue("-Saving normalized counts to file\n"))
+} else{
+  cat(blue("-Not saving normalized counts to file\n"))
+}
+if (opt$quality_plots) {
+  cat(blue("-Creating quality plots\n"))
+} else{
+  cat(blue("-Skipping quality plots\n"))
+}
+
+
+################################################
+################################################
+## FUNCTIONS ##
+################################################
+################################################
+
+################################################
+## LOAD DATA ##
+################################################
+
+####LOAD CLINICAL DATA FILE#########################
+load_sample_data <- function(clinical_data, group) {
+  samples <- read.table(clinical_data, header = T)
+  compare_col <- which(colnames(samples) %in% group)
+  time_col <- which(colnames(samples) %in% c("time"))
+  samples <- samples[,c(1,compare_col, time_col)]
+  colnames(samples) <- c("names","condition", "time")
+  rownames(samples) <- samples$names
+  return(samples)
+}
+
+################################################
+## DESEQ2 ##
+################################################
+
+####DIFFERENTIAL EXPRESSION#########################
+
+deseq2_analysis <- function(txi_data, samples, compare_char1, compare_char2){
+  ddsTxi <- DESeqDataSetFromTximport(txi_data,
+                                     colData = samples,
+                                     design = ~ condition + time + condition:time)
+  dds <- ddsTxi[ rowSums(counts(ddsTxi)) >= 1, ]
+  dds <- DESeq(dds, test = "LRT", reduced = ~ condition + time)
+  res <- results(dds)
+  return(list(dds_matrix = dds, results =res))
+}
+
+####NORMALIZATION#########################
+
+normalized_counts <- function(dds_table){
+  ntd <- normTransform(dds_table)
+  rld <- rlog(dds_table, blind=FALSE)
+  vsd <- varianceStabilizingTransformation(dds_table, blind=FALSE)
+  return(list(dds_norm=dds_table, norm = ntd, rlogtrans =rld, varstab=vsd))
+}
+
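+# For orientation: the LRT above compares the full model (with condition:time)
+# against the reduced one without it, so significant genes are those whose
+# expression trajectory over time differs between conditions. A minimal
+# standalone sketch (object names illustrative, not from this template):
+#   dds <- DESeqDataSetFromTximport(txi, colData = samples,
+#                                   design = ~ condition + time + condition:time)
+#   dds <- DESeq(dds, test = "LRT", reduced = ~ condition + time)
+#   res <- results(dds)  # p-values test the condition:time interaction term
+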
+####DE PLOTS######################### + +differential_plots <- function(res_de, de_results, ntd_subset, dds_subset){ + #MA-plotThe MA-plot shows the log2 fold changes from the treatment over the meanof normalized counts. + #The average of counts normalized by size factor. + pdf(file="Differential_expression/DESeq2/maPlot_all.pdf") + plotMA( res_de, ylim = c(-1, 1) ) + dev.off() + + #############DISPERSION PLOTS################ + pdf(file="Differential_expression/DESeq2/pvalues.pdf") + hist( res_de$pvalue, breaks=20, col="grey", main = "pvalues test for differential expression") + dev.off() + + ##############PHEATMAP############## + assay_ntd <- assay(ntd_subset) + ordered_table <- de_results[order(de_results$pvalue, -abs(de_results$log2FoldChange)),] + ordered_table$identifier <- rownames(ordered_table) + col_num <- which(colnames(ordered_table) == "identifier") + top_sig_genes <- ordered_table[1:20,col_num] + select <- which(rownames(assay_ntd) %in% top_sig_genes) + df <- as.data.frame(colData(dds_subset)[,c("condition")]) + colnames(df) <- c("condition") + rownames(df) <- colnames(ntd_subset) + to_plot <- assay_ntd[select,] + to_plot_geneid <- as.data.frame(rownames(to_plot)) + colnames(to_plot_geneid) <- "GeneID" + to_plot_geneid_merged <- merge(x = to_plot_geneid, y = gene_genename, by.x="GeneID", by.y = "GENEID", all.x = TRUE, all.y = FALSE) + rownames(to_plot) <- to_plot_geneid_merged$gene_name + pdf(file="Differential_expression/DESeq2/heatmapCount_top20_differentially_expressed.pdf") + pheatmap(to_plot, cluster_rows=TRUE, show_rownames=TRUE, + cluster_cols=TRUE, annotation_col=df, main="Top 20 significant genes") + dev.off() +} + + +####QUALITY PLOTS######################### + +quality_plots <- function(norm_data){ + ###########SAMPLE DISTANCE############## + sampleDists <- dist( t( assay(norm_data$rlogtrans) ) ) + + sampleDistMatrix <- as.matrix( sampleDists ) + colours = colorRampPalette(rev(brewer.pal(9, "Blues"))) (255) + pdf(file="Quality_plots/DESeq2/heatmap_sample_to_sample.pdf") + pheatmap(sampleDistMatrix, + clustering_distance_rows=sampleDists, + clustering_distance_cols=sampleDists, + col=colours) + dev.off() + + #############PCA PLOTS################ + pcaData <- plotPCA(norm_data$rlogtrans, intgroup=c("condition"), returnData=TRUE) + pcaData_2 <- plotPCA(norm_data$varstab, intgroup=c("condition"), returnData=TRUE) + percentVar <- round(100 * attr(pcaData, "percentVar")) + pdf(file="Quality_plots/DESeq2/plotPCA.pdf") + pca_plot_rld <- ggplot(pcaData, aes(PC1, PC2, color=condition)) + + geom_point(size=3) + + xlab(paste0("PC1: ",percentVar[1],"% variance")) + + ylab(paste0("PC2: ",percentVar[2],"% variance")) + + geom_text(aes(label = name), color = "black", size=2, position = position_nudge(y = 0.8)) + + labs(title="PCA: rlog") + + coord_fixed() + pca_plot_vsd <- ggplot(pcaData_2, aes(PC1, PC2, color=condition)) + + geom_point(size=3) + + xlab(paste0("PC1: ",percentVar[1],"% variance")) + + ylab(paste0("PC2: ",percentVar[2],"% variance")) + + geom_text(aes(label = name), color = "black", size=2, position = position_nudge(y = 0.8)) + + labs(title="PCA: vsd") + + coord_fixed() + print(pca_plot_rld) + print(pca_plot_vsd) + dev.off() + + #############BOX PLOTS################ + pdf(file="Quality_plots/DESeq2/boxplot.pdf") + boxplot(assay(norm_data$norm), col="blue", las =2) + title(main="Boxplot: normalized counts") + boxplot(log10(assays(norm_data$dds_norm)[["cooks"]]), range=0, las=2) + title(main="Boxplot see outliers: cooks distance") + dev.off() + + 
#############DISPERSION PLOTS################
+  pdf(file="Quality_plots/DESeq2/plotDispersions.pdf")
+  plotDispEsts(norm_data$dds_norm)
+  dev.off()
+
+  #############DEVIATION PLOT################
+  pdf(file="Quality_plots/DESeq2/plotSD.pdf")
+  meanSdPlot(assay(norm_data$norm))
+  dev.off()
+
+  ##############HCLUST###################
+  assay_ntd <- assay(norm_data$norm)
+  pdf(file="Quality_plots/DESeq2/cluster_dendrogram.pdf")
+  plot(hclust(dist(t(assay_ntd)),method="average"))
+  dev.off()
+
+  ##############PHEATMAP##############
+  select <- order(rowMeans(counts(norm_data$dds_norm,normalized=TRUE)),
+                  decreasing=TRUE)[1:20]
+  df <- as.data.frame(colData(norm_data$dds_norm)[,c("condition")])
+  colnames(df) <- c("Condition")
+  rownames(df) <- colnames(norm_data$norm)
+
+  to_plot <- assay(norm_data$norm)[select,]
+  to_plot_geneid <- as.data.frame(rownames(to_plot))
+  colnames(to_plot_geneid) <- "GeneID"
+  to_plot_geneid_merged <- merge(x = to_plot_geneid, y = gene_genename, by.x="GeneID", by.y = "GENEID", all.x = TRUE, all.y = FALSE)
+  rownames(to_plot) <- to_plot_geneid_merged$gene_name
+
+  pdf(file="Quality_plots/DESeq2/heatmapCount_top20_highest_expression.pdf")
+  pheatmap(to_plot, cluster_rows=FALSE, show_rownames=TRUE,
+           cluster_cols=TRUE, annotation_col=df, main="Normalized counts top 20 more expressed genes")
+  dev.off()
+
+  ######FULL PHEATMAP#################
+  pdf(file="Quality_plots/DESeq2/heatmapCount_all_genes.pdf")
+  pheatmap(assay(norm_data$norm), cluster_rows=FALSE, show_rownames=FALSE,
+           cluster_cols=TRUE,main="Normalized counts", annotation_col=df)
+  dev.off()
+}
+
+################################################
+## WARNINGS ##
+################################################
+
+test_data <- function(samples_data, txi_data){
+  if (all(rownames(samples_data) %in% colnames(txi_data$counts)) == FALSE) {
+    print("Warning: Check sample names")
+  }
+  if (all(rownames(samples_data) == colnames(txi_data$counts)) == FALSE) {
+    print("Warning: Check sample names")
+  }
+}
+
+
+##############################################################################################################################################
+##############################################################################################################################################
+##################################################### MAIN ##################################################
+##############################################################################################################################################
+##############################################################################################################################################
+
+
+################################################
+################################################
+## LOAD DATA ##
+################################################
+################################################
+
+cat(blue("########################\nStarting with loading data\n###############################\n"))
+
+####LOAD TRANSCRIPT RELATION DATA FILE #########################
+
+tx2gene <- read.table(file.path(opt$rnaseq_dir, "star_salmon", "salmon_tx2gene.tsv"), header = F)
+colnames(tx2gene) <- c("TXNAME", "GENEID", "gene_name")
+gene_genename <- tx2gene[,c(2:3)]
+gene_genename <- gene_genename %>% distinct()
+
+####LOAD CLINICAL DATA FILE #########################
+samples_clin_data <- load_sample_data(clinical_data = opt$clinical_data, group = opt$group_col)
+
+####LOAD EXPRESSION DATA #########################
+files 
<- file.path(opt$rnaseq_dir,"star_salmon", samples_clin_data$names, "quant.sf")
+names(files) <- samples_clin_data$names
+coldata <- data.frame(files, samples_clin_data, stringsAsFactors=FALSE)
+if (!all(file.exists(coldata$files))) {
+  cat(red("############WARNING############\nNot all files exist\n###############################\n"))
+}
+
+
+################################################
+################################################
+## DIFFERENTIAL EXPRESSION ##
+################################################
+################################################
+
+
+################################################
+################################################
+## DIFFERENTIAL EXPRESSION DESEQ2 ##
+################################################
+################################################
+
+cat(blue("########################\nStarting with DESeq2\n###############################\n"))
+txi <- tximport(files, type="salmon", tx2gene=tx2gene)
+test_data(samples_data = samples_clin_data, txi_data = txi)
+
+cat(blue("########################\nStarting with differential expression\n###############################\n"))
+deseq2_results <- deseq2_analysis(txi_data = txi, samples = samples_clin_data, compare_char1 = opt$treatment, opt$control)
+mcols(deseq2_results$results, use.names = T)
+DE_results <- as.data.frame(deseq2_results$results)
+
+DE_results$GeneID <- row.names(DE_results)
+DE_results_merged <- merge(x = gene_genename, y = DE_results, by.x = "GENEID", by.y= "GeneID", all.y = T, all.x=F)
+
+DE_results_merged_sig <- subset(x = DE_results_merged, padj <= 0.05 & (log2FoldChange <= -2 | log2FoldChange >= 2))
+
+dir.create("Differential_expression",showWarnings = FALSE)
+dir.create("Differential_expression/DESeq2",showWarnings = FALSE)
+write.table(x = DE_results_merged, file = "Differential_expression/DESeq2/Differential_expression.csv", sep = ",", quote = F, col.names = T, row.names = F)
+#write.xlsx(x = DE_results_merged, file = "Differential_expression/DESeq2/Differential_expression.xlsx", sheetName = "Diff_exp", col.names = TRUE, row.names = FALSE, append = FALSE, showNA = TRUE, password = NULL)
+
+
+cat(blue("########################\nStarting with normalization\n###############################\n"))
+norm_count <- normalized_counts(dds_table = deseq2_results$dds_matrix)
+
+if (opt$norm_counts) {
+  ntd_gene <- as.data.frame(assay(norm_count$norm))
+  ntd_gene$GeneID <- rownames(ntd_gene)
+  norm_name_table <- merge(x = gene_genename, y = ntd_gene, by.x = "GENEID", by.y= "GeneID", all.y = T, all.x=F)
+  write.table(x = norm_name_table, file = "normalized_expression.csv", quote = F, sep = ",", row.names = F, col.names = T)
+  #write.xlsx(x = norm_name_table, file = "normalized_expression.xlsx", sheetName = "Norm_exp", col.names = TRUE, row.names = FALSE, append = FALSE, showNA = TRUE, password = NULL)
+}
+
+cat(blue("########################\nStarting with data subsetting\n###############################\n"))
+
+differential_plots(res_de = deseq2_results$results, de_results = DE_results, ntd_subset = norm_count$norm, dds_subset = norm_count$dds_norm)
+
+if (opt$quality_plots) {
+  cat(blue("########################\nStarting with Quality plots\n###############################\n"))
+  dir.create("Quality_plots",showWarnings = FALSE)
+  dir.create("Quality_plots/DESeq2",showWarnings = FALSE)
+  quality_plots(norm_data = norm_count)
+}
+
+save.image()
+cat(blue("########################\nNumber of genes with padj < 0.05 and log2FC >= |2|:\n"))
+cat(blue(nrow(DE_results_merged_sig)))
+cat(blue("\n###############################\n"))
+
+cat(green("########################\nPipeline completed successfully\n###############################\n"))

From 577d0b55a7fdf2f559510ca2ec30ba001a81486d Mon Sep 17 00:00:00 2001
From: svarona
Date: Wed, 14 Aug 2024 14:06:42 +0200
Subject: [PATCH 182/321] prettier

---
 .../time_series_differential_expression.R | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R
index 5ad14850b..de296357f 100644
--- a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R
+++ b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R
@@ -37,11 +37,11 @@ library(data.table)
 cat(cyan$bgRed$bold("########################\nStarting differential expression pipeline\n###############################\n"))

 option_list <- list(
-  make_option(c("-r", "--rnaseq_dir" ), type="character" , default='../../01-rnaseq' , metavar="path" , help="Path to rna-seq results" ),
-  make_option(c("-c", "--clinical_data" ), type="character" , default='./clinical_data.txt' , metavar="path" , help="Path to clinical data file" ),
-  make_option(c("-g", "--group_col" ), type="character" , default='Group' , metavar="string" , help="Colname with the sample classes in sample_data of the experiment for the DE." ),
-  make_option(c("-n", "--norm_counts" ), type="logical" , default=FALSE , metavar="boolean", help="Create table with normalized counts" ),
-  make_option(c("-q", "--quality_plots" ), type="logical" , default=TRUE , metavar="boolean", help="Create quality plots or not." )
+  make_option(c("-r", "--rnaseq_dir" ), type="character" , default='../../01-rnaseq' , metavar="path" , help="Path to rna-seq results" ),
+  make_option(c("-c", "--clinical_data" ), type="character" , default='../clinical_data.txt' , metavar="path" , help="Path to clinical data file" ),
+  make_option(c("-g", "--group_col" ), type="character" , default='Group' , metavar="string" , help="Colname with the sample classes in sample_data of the experiment for the DE." ),
+  make_option(c("-n", "--norm_counts" ), type="logical" , default=FALSE , metavar="boolean", help="Create table with normalized counts" ),
+  make_option(c("-q", "--quality_plots" ), type="logical" , default=TRUE , metavar="boolean", help="Create quality plots or not."
) ) opt_parser <- OptionParser(option_list=option_list) From dbf382d352ecb627be7edc755b496400198a2033 Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 14 Aug 2024 14:08:15 +0200 Subject: [PATCH 183/321] Fixed new nf-core pipeline file name --- .../time_series_differential_expression.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R index de296357f..b7b407e3b 100644 --- a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R +++ b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R @@ -263,7 +263,7 @@ cat(blue("########################\nStarting with loading data\n################ ####LOAD TRANSCRIPT RELATION DATA FILE ######################### -tx2gene <- read.table(file.path(opt$rnaseq_dir, "star_salmon", "salmon_tx2gene.tsv"), header = F) +tx2gene <- read.table(file.path(opt$rnaseq_dir, "star_salmon", "tx2gene.tsv"), header = F) colnames(tx2gene) <- c("TXNAME", "GENEID", "gene_name") gene_genename <- tx2gene[,c(2:3)] gene_genename <- gene_genename %>% distinct() From 4c9155b179ee348ed14dbea30c272cac9df73bcd Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 14 Aug 2024 16:38:10 +0200 Subject: [PATCH 184/321] removed unnecessary variables --- .../time_series_differential_expression.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R index b7b407e3b..d3358cd2c 100644 --- a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R +++ b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R @@ -90,7 +90,7 @@ load_sample_data <- function(clinical_data, group) { ####DIFFERENTIAL EXPRESSION######################### -deseq2_analysis <- function(txi_data, samples, compare_char1, compare_char2){ +deseq2_analysis <- function(txi_data, samples){ ddsTxi <- DESeqDataSetFromTximport(txi_data, colData = samples, design = ~ condition + time + condition:time) @@ -298,7 +298,7 @@ txi <- tximport(files, type="salmon", tx2gene=tx2gene) test_data(samples_data = samples_clin_data, txi_data = txi) cat(blue("########################\nStarting with differential expression\n###############################\n")) -deseq2_results <- deseq2_analysis(txi_data = txi, samples = samples_clin_data, compare_char1 = opt$treatment, opt$control) +deseq2_results <- deseq2_analysis(txi_data = txi, samples = samples_clin_data) mcols(deseq2_results$results, use.names = T) DE_results <- as.data.frame(deseq2_results$results) From f0289652edeb9de25364f8e41164ca0603c6669a Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 14 Aug 2024 16:39:00 +0200 Subject: [PATCH 185/321] Added time series plots --- .../time_series_differential_expression.R | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R 
b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R index d3358cd2c..57d1725f4 100644 --- a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R +++ b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R @@ -232,6 +232,49 @@ quality_plots <- function(norm_data){ dev.off() } +####TIME SERIES PLOTS######################### +time_series_plots <- function(gene, res, dds) { + plot_name <- paste(gene, "expression.pdf", sep = "_") + file_path <- paste("Time_series_plots", plot_name, sep = "/") + index = which(rownames(res) == gene, arr.ind = TRUE) + gene_name <- DE_results_merged[index,2] + fiss <- plotCounts(dds, index, + intgroup = c("time","condition"), returnData = TRUE) + fiss$time <- factor(fiss$time, levels = time_order) + p <- ggplot(fiss, aes(x = time, y = count, color = condition, group = condition)) + + geom_point() + + stat_summary(fun = mean, geom = "line") + + scale_y_log10() + + labs(title = paste("Expression evolution of gene: ", gene_name, sep = "")) + pdf(file=file_path) + print(p) + dev.off() +} + +####WALD TEST FOR TIME SERIES######################### + +wald_test <- function(dds, condition){ + condition_test <- results(dds, name=condition, test="Wald") + print(condition_test[which.min(condition_test$padj),]) +} + +####BETAS PLOT FOR TIME SERIES######################### + +betas_plot <- function(res, dds) { + betas <- coef(dds) + colnames(betas) + + topGenes <- head(order(res$padj),20) + mat <- betas[topGenes, -c(1,2)] + thr <- 3 + mat[mat < -thr] <- -thr + mat[mat > thr] <- thr + pdf(file="Time_series_plots/betas_pheatmap.pdf") + pheatmap(mat, breaks=seq(from=-thr, to=thr, length=101), + cluster_col=TRUE, main = "log2FC of top20 significant genes") + dev.off() +} + ################################################ ## WARNINGS ## ################################################ @@ -335,6 +378,23 @@ if (opt$quality_plots) { quality_plots(norm_data = norm_count) } +cat(blue("########################\nStarting with time series plots\n###############################\n")) + +top4 <- rownames(head(deseq2_results$dds_matrix[order(deseq2_results$results$padj),], 4)) + +dir.create("Time_series_plots",showWarnings = FALSE) + +for (gene in top4) { + time_series_plots(gene, res = deseq2_results$results, dds = deseq2_results$dds_matrix) +} + +all_conditions <- resultsNames(deseq2_results$dds_matrix) +for (condition in all_conditions) { + wald_test(dds = deseq2_results$dds_matrix, condition) +} + +betas_plot(res = deseq2_results$results, dds = deseq2_results$dds_matrix) + save.image() cat(blue("########################\nNumber of genes with padj < 0.05 and log2FC >= |2|:\n")) cat(blue(nrow(DE_results_merged_sig))) From dd749833271dfe32d649613b07a19932e3e88ae2 Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 14 Aug 2024 16:39:25 +0200 Subject: [PATCH 186/321] Created option to pass date specific order --- .../time_series_differential_expression.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R index 57d1725f4..ce5b964e9 100644 --- 
a/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R +++ b/bu_isciii/templates/rnaseq/ANALYSIS/DATE_ANALYSIS01_RNASEQ/02-differential_expression/time_series_differential_expression.R @@ -41,7 +41,8 @@ option_list <- list( make_option(c("-c", "--clinical_data" ), type="character" , default='../clinical_data.txt' , metavar="path" , help="Path to clinical data file" ), make_option(c("-g", "--group_col" ), type="character" , default='Group' , metavar="string" , help="Colname with the sample classes in sample_data of the experiment for the DE." ), make_option(c("-n", "--norm_counts" ), type="logical" , default=FALSE , metavar="boolean", help="Create table with normalized counts" ), - make_option(c("-q", "--quality_plots" ), type="logical" , default=TRUE , metavar="boolean", help="Create quality plots or not." ) + make_option(c("-q", "--quality_plots" ), type="logical" , default=TRUE , metavar="boolean", help="Create quality plots or not." ), + make_option(c("-t", "--time_order" ), type="character" , default=NULL , metavar="string" , help="Order to plot the dates as list, eg: 15D,45D,3M." ) ) opt_parser <- OptionParser(option_list=option_list) @@ -51,6 +52,14 @@ cat(blue$bold("########################\nRunning analysis with the following par cat(blue("-Path to RNAseq input folder: ")) + cat(blue(opt$rnaseq_dir))+cat(blue("\n")) cat(blue("-Path to samples clinical data: ")) + cat(blue(opt$clinical_data))+cat(blue("\n")) cat(blue("-Column with the group info: ")) + cat(blue(opt$group_col))+cat(blue("\n")) + +if (is.null(opt$time_order)) { + print_help(opt_parser) + stop("You need to specify the order for the dates.", call.=FALSE) +} else { + time_order <- unlist(strsplit(opt$time_order, ",")) +} + if (opt$norm_counts) { cat(blue("-Saving normalized counts to file\n")) } else{ @@ -62,6 +71,7 @@ if (opt$quality_plots) { cat(blue("-Skipping quality plots\n")) } +cat(blue("Time order: ")) + cat(blue(time_order)) +cat(blue("\n")) ################################################ ################################################ From 2bc885194208f0409a728b71cc79e05f4f8a0999 Mon Sep 17 00:00:00 2001 From: svarona Date: Fri, 16 Aug 2024 12:46:24 +0200 Subject: [PATCH 187/321] Updated changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c81d1d99d..5cf42bc41 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ Code contributions to the new version: - Added several improvements in lablog_viralrecon (created log files, modified check_references function behaviour, enabled config files regeneration) [#306](https://github.com/BU-ISCIII/buisciii-tools/pull/306) - Fixed bug when lablog_viralrecon tries to download references that don't belong to any family. [#310](https://github.com/BU-ISCIII/buisciii-tools/pull/310) - Added mvmoneo to SFTP users. 
[#317](https://github.com/BU-ISCIII/buisciii-tools/pull/317) +- Added scripts for time series RNAseq and updated differential expression code for differentially expressed transcripts [#316](https://github.com/BU-ISCIII/buisciii-tools/pull/316) ### Modules From 424a4adb70cc29be837aa824f2a91f7d9aaaf8b6 Mon Sep 17 00:00:00 2001 From: svarona Date: Fri, 16 Aug 2024 13:12:14 +0200 Subject: [PATCH 188/321] Added bbaladron to SFTP users --- bu_isciii/templates/sftp_user.json | 1 + 1 file changed, 1 insertion(+) diff --git a/bu_isciii/templates/sftp_user.json b/bu_isciii/templates/sftp_user.json index 04773c8fb..f8ee42f60 100755 --- a/bu_isciii/templates/sftp_user.json +++ b/bu_isciii/templates/sftp_user.json @@ -51,5 +51,6 @@ "ycampos": ["LabUfiecMithocondrial"], "anadonoso": ["Labenterovirus"], "mvmoneo": ["SpainUDP"], + "bbaladron": ["SpainUDP"], "bioinfoadm": ["test"] } From 834c08836e7ecb0024fa424598eb5d4cbd833af7 Mon Sep 17 00:00:00 2001 From: svarona Date: Fri, 16 Aug 2024 13:12:48 +0200 Subject: [PATCH 189/321] Updated changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5cf42bc41..cc58e59f7 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,7 +62,8 @@ Code contributions to the new version: - Added several improvements in lablog_viralrecon (created log files, modified check_references function behaviour, enabled config files regeneration) [#306](https://github.com/BU-ISCIII/buisciii-tools/pull/306) - Fixed bug when lablog_viralrecon tries to download references that don't belong to any family. [#310](https://github.com/BU-ISCIII/buisciii-tools/pull/310) - Added mvmoneo to SFTP users. [#317](https://github.com/BU-ISCIII/buisciii-tools/pull/317) -- Added scripts for time series RNAseq and updated differential expression code for differentially expressed transcripts [#316](https://github.com/BU-ISCIII/buisciii-tools/pull/316) +- Added scripts for time series RNAseq and updated differential expression code for differentially expressed transcripts [#316](https://github.com/BU-ISCIII/buisciii-tools/pull/316). +- Added bbaladron to SFTP users [#316](https://github.com/BU-ISCIII/buisciii-tools/pull/316). 
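For orientation, a minimal invocation of the new time-series script might look
like this (values are illustrative, taken from the script's own defaults and the
--time_order help text rather than from any lablog in this series):

    Rscript time_series_differential_expression.R \
      --rnaseq_dir ../../01-rnaseq \
      --clinical_data ../clinical_data.txt \
      --group_col Group \
      --time_order 15D,45D,3M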
### Modules From fdf6271682aa04e9e6429c5aa5513cd0c4937120 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Mon, 19 Aug 2024 17:12:59 +0200 Subject: [PATCH 190/321] Added pikavirus configuration --- bu_isciii/templates/services.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json index 3e57d041b..147277fe4 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -42,10 +42,10 @@ "url": "https://github.com/BU-ISCIII/PikaVirus", "description": "PikaVirus, a mapping-based tool for metagenome analysis of virus.", "clean": { - "folders":[], + "folders":["virus_coverage/plots"], "files":[] }, - "no_copy": ["RAW", "TMP"], + "no_copy": ["RAW", "TMP", "01-PikaVirus-results"], "last_folder":"REFERENCES", "delivery_md": "assets/reports/md/pikavirus.md", "results_md": "assets/reports/results/pikavirus.md" From 6c9da287452afaecbc98b1a8bc2f5b7455ff02b3 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Mon, 19 Aug 2024 17:29:50 +0200 Subject: [PATCH 191/321] Added plasmidID configuration --- bu_isciii/templates/services.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json index 147277fe4..d60131998 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -60,7 +60,7 @@ "description": "Plasmid identification tool based on mapping and assisted by assembly", "clean": { "folders":["01-preprocessing/trimmed_sequences"], - "files":[] + "files":["mapping/sample_name.sorted.bam", "kmer/database.msh"] }, "no_copy": ["RAW", "TMP"], "delivery_md": "assets/reports/md/plasmidid.md", From 67f491bf8e67f1f1a6cf0180664e73a0243ecdc7 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Mon, 19 Aug 2024 17:32:11 +0200 Subject: [PATCH 192/321] Added wgmlst_chewbbaca configuration --- bu_isciii/templates/services.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json index d60131998..7c8049fed 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -94,7 +94,7 @@ "description": "Multilocus sequence typing (MLST) using chewBBACA", "depends_on": "assembly_annotation", "clean": { - "folders":["03-assembly/trimming/trimmed", "01-preprocessing/{sample_name}"], + "folders":[], "files":[] }, "no_copy": ["RAW", "TMP"], From f6e8d05b88b2f3e59f4b2af87e37f0b3296e0915 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Mon, 19 Aug 2024 17:32:52 +0200 Subject: [PATCH 193/321] Added wgmlst_ctaranis configuration --- bu_isciii/templates/services.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json index 7c8049fed..abb0a1a2d 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -76,7 +76,7 @@ "description": "Multilocus sequence typing (MLST) using Taranis", "depends_on": "assembly_annotation", "clean": { - "folders":["03-assembly/trimming/trimmed", "01-preprocessing"], + "folders":[], "files":[] }, "no_copy": ["RAW", "TMP"], From 616b08e0f12f5d812b360c70025aac78ea3f8865 Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Mon, 19 Aug 2024 17:38:48 +0200 Subject: [PATCH 194/321] Added characterization configuration --- bu_isciii/templates/services.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/services.json 
b/bu_isciii/templates/services.json index abb0a1a2d..346a98727 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -195,7 +195,7 @@ "end": "", "description": "", "clean": { - "folders":[], + "folders":["01-preprocessing"], "files":[] }, "no_copy": ["RAW", "TMP", "00-reads", "fasta_inputs"], From c39b700247e6a30a64945f76ee376d3f356bd69d Mon Sep 17 00:00:00 2001 From: "jaime.ozaez" Date: Mon, 19 Aug 2024 17:56:10 +0200 Subject: [PATCH 195/321] Updated CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc58e59f7..42e236f2c 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,6 +80,7 @@ Code contributions to the new version: - Fixed autoclean-sftp function. [#281](https://github.com/BU-ISCIII/buisciii-tools/pull/281) - Fixed bioinfo_doc.py. Modified it so that this module creates a .pdf file including new-line characters, without merging lines into one single line [#259](https://github.com/BU-ISCIII/buisciii-tools/pull/259). - PR [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288) Fixed updating service's state to in_progress multiple times, related with issue [#285](https://github.com/BU-ISCIII/buisciii-tools/issues/285) +- Review and update of services.json for files and folders cleaning [#318](https://github.com/BU-ISCIII/buisciii-tools/pull/318). #### Changed From 3bb2d2270e57b09962cbb4b184d23b5deffafaaf Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 23 Aug 2024 16:13:03 +0200 Subject: [PATCH 196/321] added files of template characterization-taxprofiler --- .../ANALYSIS/ANALYSIS02_TAXPROFILING/lablog | 91 +++++++++++++++++++ .../ANALYSIS/lablog_taxprofiling | 6 ++ .../characterization/DOC/databasesheet.csv | 6 ++ .../DOC/hpc_slurm_taxprofiler.config | 35 +++++++ .../RESULTS/lablog_taxprofiling_results | 8 ++ 5 files changed, 146 insertions(+) create mode 100644 bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog create mode 100644 bu_isciii/templates/characterization/ANALYSIS/lablog_taxprofiling create mode 100644 bu_isciii/templates/characterization/DOC/databasesheet.csv create mode 100644 bu_isciii/templates/characterization/DOC/hpc_slurm_taxprofiler.config create mode 100644 bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog new file mode 100644 index 000000000..817c77e88 --- /dev/null +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog @@ -0,0 +1,91 @@ +# SETUP INTPUT SAMPLE SHEET +ln -s ../00-reads . +ln -s ../samples_id.txt . + +# Function to print colored text +print_color() { + case "$2" in + "red") + echo -e "\e[1;31m$1\e[0m" + ;; + "green") + echo -e "\e[1;32m$1\e[0m" + ;; + "blue") + echo -e "\e[1;34m$1\e[0m" + ;; + *) + echo "$1" + ;; + esac +} + +# Function to prompt with color +prompt_with_color() { + read -p "$(print_color $1 'blue') $2" response +} + +# Select whether to save trimmed reads +trim_options=("Yes" "No") +print_color "Do you want to save trimmed reads in outdir?" 'blue' +select TRIMMED in "${trim_options[@]}"; do + if [ -n "$TRIMMED" ]; then + # rename trimmed + if [ "$TRIMMED" == "Yes" ] || [ "$TRIMMED" == "y" ]; then + SAVETRIMMED="true" + else + SAVETRIMMED="false" + fi + + break + else + print_color "Invalid input. Please select a valid option." 
'red' + fi +done +print_color "Selected trimmed file option: $TRIMMED save trimmed" 'green' + + +# Samples sheet setup +echo "sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta" > samplesheet.csv +cat samples_id.txt | while read in; do + echo "${in},run1,ILLUMINA,00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz," +done >> samplesheet.csv + +scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") + +# slurm sbatch file setup +cat < taxprofiler.sbatch +#!/bin/sh +#SBATCH --ntasks 1 +#SBATCH --cpus-per-task 2 +#SBATCH --mem 4G +#SBATCH --time 2:00:00 +#SBATCH --partition middle_idx +#SBATCH --output $(date '+%Y%m%d')_taxprofiler.log +#SBATCH --chdir $scratch_dir + +# module load Nextflow/23.10.0 singularity +export NXF_OPTS="-Xms500M -Xmx10G" + +nextflow run /data/bi/pipelines/nf-core-taxprofiler/nf-core-taxprofiler-1.1.8 \\ + -profile singularity \\ + -c ../../DOC/hpc_slurm_taxprofiler.config \\ + --input samplesheet.csv \\ + --outdir ./ \\ + --databases ../../DOC/databasesheet.csv \\ + --preprocessing_qc_tool fastqc \\ + --save_preprocessed_reads ${SAVETRIMMED} \\ + --perform_shortread_qc true \\ + --shortread_qc_tool fastp \\ + --perform_shortread_hostremoval true \\ + --hostremoval_reference /data/bi/references/eukaria/homo_sapiens/hg38/NCBI/genome/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz \\ + --run_kraken2 true \\ + --run_bracken true \\ + --run_centrifuge true \\ + --run_kaiju true \\ + --run_metaphlan true \\ + --run_krona true \\ + -resume +EOF + +echo "sbatch taxprofiler.sbatch" > _01_nf_taxprofiler.sh diff --git a/bu_isciii/templates/characterization/ANALYSIS/lablog_taxprofiling b/bu_isciii/templates/characterization/ANALYSIS/lablog_taxprofiling new file mode 100644 index 000000000..e4a813bf5 --- /dev/null +++ b/bu_isciii/templates/characterization/ANALYSIS/lablog_taxprofiling @@ -0,0 +1,6 @@ +mkdir -p 00-reads + +cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd - +cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - + +mv ANALYSIS02_TAXPROFILING $(date '+%Y%m%d')_ANALYSIS02_TAXPROFILING diff --git a/bu_isciii/templates/characterization/DOC/databasesheet.csv b/bu_isciii/templates/characterization/DOC/databasesheet.csv new file mode 100644 index 000000000..811e462c1 --- /dev/null +++ b/bu_isciii/templates/characterization/DOC/databasesheet.csv @@ -0,0 +1,6 @@ +tool,db_name,db_params,db_path +kraken2,db1,,/data/bi/references/kraken/minikraken_8GB_20200312.tgz +bracken,db2,,/data/bi/references/bracken/bracken_minikraken_8GB_20200312.tgz +centrifuge,db3,,/data/bi/references/centrifuge/201612_centrifuge_index_p+h+v.tar.gz +metaphlan,db4,,/data/bi/references/metaphlan/mpa_vJun23_CHOCOPhlAnSGB_20240/ +kaiju,db5,,/data/bi/references/kaiju/nr_euk_2023-05-10/ \ No newline at end of file diff --git a/bu_isciii/templates/characterization/DOC/hpc_slurm_taxprofiler.config b/bu_isciii/templates/characterization/DOC/hpc_slurm_taxprofiler.config new file mode 100644 index 000000000..bfbc4ac15 --- /dev/null +++ b/bu_isciii/templates/characterization/DOC/hpc_slurm_taxprofiler.config @@ -0,0 +1,35 @@ +/* + HPC XTUTATIS CONFIGURATION +*/ + +singularity { + enabled = true + autoMounts = true + singularity.cacheDir = '/data/cnm/ratb/pipelines/singularity-images/' +} + +process { + executor = 'slurm' + queue = 'middle_idx' + jobName = { "$task.name - $task.hash" } + conda = null + + errorStrategy = { task.exitStatus in 
((130..145) + 104) ? 'retry' : 'finish' } + + withName:'KAIJU_KAIJU' { + errorStrategy = { task.exitStatus in [143,137,21,1] ? 'retry' : 'finish' } + maxRetries = 3 + memory = { 72.GB * task.attempt } + time = { 8.h } + } +} + +params { + max_memory = 376.GB + max_cpus = 32 + max_time = '24.h' +} + +/* + Custom base.config +*/ diff --git a/bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results b/bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results new file mode 100644 index 000000000..f980497de --- /dev/null +++ b/bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results @@ -0,0 +1,8 @@ +DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01" +mkdir -p $DELIVERY_FOLDER/taxprofiling + +# Taxprofiling service +cd $DELIVERY_FOLDER/taxprofiling + +# Links to reports +ln -s ../../../ANALYSIS/*ANALYSIS02_TAXPROFILING/results/multiqc/multiqc_report.html . \ No newline at end of file From f74e4848db9c91e704c15d6a36f4e243343a3263 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 23 Aug 2024 16:24:26 +0200 Subject: [PATCH 197/321] fixed file end lines --- bu_isciii/templates/characterization/DOC/databasesheet.csv | 2 +- .../characterization/RESULTS/lablog_taxprofiling_results | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/characterization/DOC/databasesheet.csv b/bu_isciii/templates/characterization/DOC/databasesheet.csv index 811e462c1..a227b08c7 100644 --- a/bu_isciii/templates/characterization/DOC/databasesheet.csv +++ b/bu_isciii/templates/characterization/DOC/databasesheet.csv @@ -3,4 +3,4 @@ kraken2,db1,,/data/bi/references/kraken/minikraken_8GB_20200312.tgz bracken,db2,,/data/bi/references/bracken/bracken_minikraken_8GB_20200312.tgz centrifuge,db3,,/data/bi/references/centrifuge/201612_centrifuge_index_p+h+v.tar.gz metaphlan,db4,,/data/bi/references/metaphlan/mpa_vJun23_CHOCOPhlAnSGB_20240/ -kaiju,db5,,/data/bi/references/kaiju/nr_euk_2023-05-10/ \ No newline at end of file +kaiju,db5,,/data/bi/references/kaiju/nr_euk_2023-05-10/ diff --git a/bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results b/bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results index f980497de..7d5cce5cf 100644 --- a/bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results +++ b/bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results @@ -5,4 +5,4 @@ mkdir -p $DELIVERY_FOLDER/taxprofiling cd $DELIVERY_FOLDER/taxprofiling # Links to reports -ln -s ../../../ANALYSIS/*ANALYSIS02_TAXPROFILING/results/multiqc/multiqc_report.html . \ No newline at end of file +ln -s ../../../ANALYSIS/*ANALYSIS02_TAXPROFILING/results/multiqc/multiqc_report.html . From d61a555cecf36b22feaa3659b593a77a85d2a5a1 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 23 Aug 2024 16:38:03 +0200 Subject: [PATCH 198/321] update changelog #320 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42e236f2c..42f918ab2 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -64,6 +64,7 @@ Code contributions to the new version: - Added mvmoneo to SFTP users. [#317](https://github.com/BU-ISCIII/buisciii-tools/pull/317) - Added scripts for time series RNAseq and updated differential expression code for differentially expressed transcripts [#316](https://github.com/BU-ISCIII/buisciii-tools/pull/316). - Added bbaladron to SFTP users [#316](https://github.com/BU-ISCIII/buisciii-tools/pull/316). 
+- Added new template for comprehensive taxonomy profiling using the nf-core/taxprofiler pipeline [#320](https://github.com/BU-ISCIII/buisciii-tools/pull/320). ### Modules From f0a183ee42cd2f7482d6192efba93b63fdee5934 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 30 Aug 2024 09:45:47 +0200 Subject: [PATCH 199/321] uppdate execution time and database paths --- .../characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog | 2 +- bu_isciii/templates/characterization/DOC/databasesheet.csv | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog index 817c77e88..e16eab703 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog @@ -59,7 +59,7 @@ cat < taxprofiler.sbatch #SBATCH --ntasks 1 #SBATCH --cpus-per-task 2 #SBATCH --mem 4G -#SBATCH --time 2:00:00 +#SBATCH --time 24:00:00 #SBATCH --partition middle_idx #SBATCH --output $(date '+%Y%m%d')_taxprofiler.log #SBATCH --chdir $scratch_dir diff --git a/bu_isciii/templates/characterization/DOC/databasesheet.csv b/bu_isciii/templates/characterization/DOC/databasesheet.csv index a227b08c7..18b4afee6 100644 --- a/bu_isciii/templates/characterization/DOC/databasesheet.csv +++ b/bu_isciii/templates/characterization/DOC/databasesheet.csv @@ -1,6 +1,6 @@ tool,db_name,db_params,db_path -kraken2,db1,,/data/bi/references/kraken/minikraken_8GB_20200312.tgz -bracken,db2,,/data/bi/references/bracken/bracken_minikraken_8GB_20200312.tgz +kraken2,db1,,/data/bi/references/kraken/k2_standard_16gb_20240605.tar.gz +bracken,db2,,/data/bi/references/kraken/k2_standard_16gb_20240605.tar.gz centrifuge,db3,,/data/bi/references/centrifuge/201612_centrifuge_index_p+h+v.tar.gz metaphlan,db4,,/data/bi/references/metaphlan/mpa_vJun23_CHOCOPhlAnSGB_20240/ kaiju,db5,,/data/bi/references/kaiju/nr_euk_2023-05-10/ From 180447ba23800402a6c7210c05b96d8134907541 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 30 Aug 2024 10:14:32 +0200 Subject: [PATCH 200/321] relocated template taxprofiler --- .../characterization/RESULTS/lablog_taxprofiling_results | 8 -------- .../ANALYSIS/ANALYSIS01_TAXPROFILER}/lablog | 4 ++-- .../ANALYSIS/lablog_taxprofiler} | 2 +- .../{characterization => mag}/DOC/databasesheet.csv | 0 .../DOC/taxprofiler.config} | 0 .../templates/mag/RESULTS/lablog_taxprofiler_results | 8 ++++++++ 6 files changed, 11 insertions(+), 11 deletions(-) delete mode 100644 bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results rename bu_isciii/templates/{characterization/ANALYSIS/ANALYSIS02_TAXPROFILING => mag/ANALYSIS/ANALYSIS01_TAXPROFILER}/lablog (96%) rename bu_isciii/templates/{characterization/ANALYSIS/lablog_taxprofiling => mag/ANALYSIS/lablog_taxprofiler} (78%) rename bu_isciii/templates/{characterization => mag}/DOC/databasesheet.csv (100%) rename bu_isciii/templates/{characterization/DOC/hpc_slurm_taxprofiler.config => mag/DOC/taxprofiler.config} (100%) create mode 100755 bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results diff --git a/bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results b/bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results deleted file mode 100644 index 7d5cce5cf..000000000 --- a/bu_isciii/templates/characterization/RESULTS/lablog_taxprofiling_results +++ /dev/null @@ -1,8 +0,0 @@ -DELIVERY_FOLDER="$(date 
'+%Y%m%d')_entrega01" -mkdir -p $DELIVERY_FOLDER/taxprofiling - -# Taxprofiling service -cd $DELIVERY_FOLDER/taxprofiling - -# Links to reports -ln -s ../../../ANALYSIS/*ANALYSIS02_TAXPROFILING/results/multiqc/multiqc_report.html . diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_TAXPROFILER/lablog similarity index 96% rename from bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog rename to bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_TAXPROFILER/lablog index e16eab703..06bd2f6bf 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS02_TAXPROFILING/lablog +++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_TAXPROFILER/lablog @@ -65,11 +65,11 @@ cat < taxprofiler.sbatch #SBATCH --chdir $scratch_dir # module load Nextflow/23.10.0 singularity -export NXF_OPTS="-Xms500M -Xmx10G" +export NXF_OPTS="-Xms500M -Xmx6G" nextflow run /data/bi/pipelines/nf-core-taxprofiler/nf-core-taxprofiler-1.1.8 \\ -profile singularity \\ - -c ../../DOC/hpc_slurm_taxprofiler.config \\ + -c ../../DOC/taxprofiler.config \\ --input samplesheet.csv \\ --outdir ./ \\ --databases ../../DOC/databasesheet.csv \\ diff --git a/bu_isciii/templates/characterization/ANALYSIS/lablog_taxprofiling b/bu_isciii/templates/mag/ANALYSIS/lablog_taxprofiler similarity index 78% rename from bu_isciii/templates/characterization/ANALYSIS/lablog_taxprofiling rename to bu_isciii/templates/mag/ANALYSIS/lablog_taxprofiler index e4a813bf5..f6a3f119b 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/lablog_taxprofiling +++ b/bu_isciii/templates/mag/ANALYSIS/lablog_taxprofiler @@ -3,4 +3,4 @@ mkdir -p 00-reads cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd - cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - -mv ANALYSIS02_TAXPROFILING $(date '+%Y%m%d')_ANALYSIS02_TAXPROFILING +mv ANALYSIS01_TAXPROFILER $(date '+%Y%m%d')_ANALYSIS01_TAXPROFILER diff --git a/bu_isciii/templates/characterization/DOC/databasesheet.csv b/bu_isciii/templates/mag/DOC/databasesheet.csv similarity index 100% rename from bu_isciii/templates/characterization/DOC/databasesheet.csv rename to bu_isciii/templates/mag/DOC/databasesheet.csv diff --git a/bu_isciii/templates/characterization/DOC/hpc_slurm_taxprofiler.config b/bu_isciii/templates/mag/DOC/taxprofiler.config similarity index 100% rename from bu_isciii/templates/characterization/DOC/hpc_slurm_taxprofiler.config rename to bu_isciii/templates/mag/DOC/taxprofiler.config diff --git a/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results b/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results new file mode 100755 index 000000000..d6231a54b --- /dev/null +++ b/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results @@ -0,0 +1,8 @@ +DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01" +mkdir -p $DELIVERY_FOLDER/taxprofiler + +# Taxprofiling service +cd $DELIVERY_FOLDER/taxprofiler + +# Links to reports +ln -s ../../../ANALYSIS/*ANALYSIS01_TAXPROFILER/results/multiqc/multiqc_report.html . 
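+
+# Editor's note (hypothetical addition, not part of the original template): before
+# delivery it can be worth checking that none of the links created above are
+# dangling, e.g. with:
+#   find . -xtype l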
From 9ccbdcba28ab452c3d66686eb348415a037a97d1 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 30 Aug 2024 11:33:02 +0200 Subject: [PATCH 201/321] udpdate documentation of template taxprofiler --- README.md | 7 +- bu_isciii/assets/reports/md/mag.md | 687 +++++++++++++++++++++++- bu_isciii/assets/reports/results/mag.md | 9 + bu_isciii/templates/services.json | 4 +- 4 files changed, 701 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 138380a1a..fa4f2e68f 100644 --- a/README.md +++ b/README.md @@ -148,9 +148,10 @@ Output: │ │ control, host removal and exploratory │ │ │ │ analysis of samples. │ │ │ ariba_characterization │ │ │ -│ mag_met │ Bioinformatics best-practise analysis │ https://github.com/nf-core/mag │ -│ │ pipeline for assembly, binning and │ │ -│ │ annotation of metagenomes. │ │ +│ mag_met │ 1- Bioinformatics best-practise analysis │ https://github.com/nf-core/mag or │ +│ │ for taxonomic classification and │ https://github.com/nf-core/taxprofiler │ +│ │ profiling; 2- Bioinformatics best-practise│ │ +│ │ analysis pipeline for assembly, binning │ │ └────────────────────────┴───────────────────────────────────────────┴────────────────────────────────────────────┘ ``` diff --git a/bu_isciii/assets/reports/md/mag.md b/bu_isciii/assets/reports/md/mag.md index 1b216b9ea..67ef902a2 100644 --- a/bu_isciii/assets/reports/md/mag.md +++ b/bu_isciii/assets/reports/md/mag.md @@ -749,4 +749,689 @@ Summary tool-specific plots and tables of following tools are currently displaye -[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. \ No newline at end of file +[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. + +# nf-core/taxprofiler: Output + +## Introduction + +This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. + +The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
+
+## Pipeline overview
+
+The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
+
+- [UNTAR](#untar) - Optionally saved decompressed input databases
+- [FastQC](#fastqc) - Raw read QC
+- [falco](#fastqc) - Alternative to FastQC for raw read QC
+- [fastp](#fastp) - Adapter trimming for Illumina data
+- [AdapterRemoval](#adapterremoval) - Adapter trimming for Illumina data
+- [Porechop](#porechop) - Adapter removal for Oxford Nanopore data
+- [BBDuk](#bbduk) - Quality trimming and filtering for Illumina data
+- [PRINSEQ++](#prinseq) - Quality trimming and filtering for Illumina data
+- [Filtlong](#filtlong) - Quality trimming and filtering for Nanopore data
+- [Bowtie2](#bowtie2) - Host removal for Illumina reads
+- [minimap2](#minimap2) - Host removal for Nanopore reads
+- [SAMtools stats](#samtools-stats) - Statistics from host removal
+- [SAMtools fastq](#samtools-fastq) - Converts unmapped BAM file to fastq format (minimap2 only)
+- [Analysis Ready Reads](#analysis-ready-reads) - Optional results directory containing the final processed reads used as input for classification/profiling.
+- [Bracken](#bracken) - Taxonomic classifier using k-mers and abundance estimations
+- [Kraken2](#kraken2) - Taxonomic classifier using exact k-mer matches
+- [KrakenUniq](#krakenuniq) - Taxonomic classifier that combines the k-mer-based classification and the number of unique k-mers found in each species
+- [Centrifuge](#centrifuge) - Taxonomic classifier that uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index.
+- [Kaiju](#kaiju) - Taxonomic classifier that finds maximum (in-)exact matches on the protein-level.
+- [Diamond](#diamond) - Sequence aligner for protein and translated DNA searches.
+- [MALT](#malt) - Sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics
+- [MetaPhlAn](#metaphlan) - Genome-level marker gene based taxonomic classifier
+- [mOTUs](#motus) - Tool for marker gene-based OTU (mOTU) profiling.
+- [KMCP](#kmcp) - Taxonomic classifier that utilizes genome coverage information by splitting the reference genomes into chunks and stores k-mers in a modified and optimized COBS index for fast alignment-free sequence searching.
+- [ganon](#ganon) - Taxonomic classifier and profiler that uses Interleaved Bloom Filters as indices based on k-mers/minimizers.
+- [TAXPASTA](#taxpasta) - Tool to standardise taxonomic profiles as well as merge profiles across samples from the same database and classifier/profiler.
+- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
+- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+
+![](images/taxprofiler_tube.png)
+
+### untar
+
+untar is used in nf-core/taxprofiler to decompress various input files ending in `.tar.gz`. This process is mainly used for decompressing input database archive files.
+
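+As a hedged illustration of the caching idea described here (directory and archive names are placeholders, not pipeline defaults), a database archive can be decompressed once into a shared location and re-used on later runs:
+
+```bash
+# Decompress a database archive once into a central cache (example paths)
+mkdir -p /data/databases/cache/kraken2_db
+tar -xzf kraken2_db.tar.gz -C /data/databases/cache/kraken2_db
+# Later runs can point the database sheet at the decompressed directory
+# instead of the .tar.gz archive, skipping the untar step entirely.
+```
+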
+Output files + +- `untar/` + - `database/` + - ``: directory containing contents of the decompressed archive + +
+
+This directory will only be present if `--save_untarred_databases` is supplied. The contained directories can be useful for moving the decompressed directories to a central 'cache' location, allowing users to re-use the same databases. This avoids spending unnecessary computational time on decompressing the archives on every run.
+
+### FastQC or Falco
+
+Output files + +- `{fastqc,falco}/` + - {raw,preprocessed} + - `*html`: FastQC or Falco report containing quality metrics in HTML format. + - `*.txt`: FastQC or Falco report containing quality metrics in TXT format. + - `*.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images (FastQC only). + +
+
+[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+
+If preprocessing is turned on, nf-core/taxprofiler runs FastQC/Falco twice - once before and once after adapter removal/read merging - to allow evaluation of the performance of these preprocessing steps. Note in the General Stats table, the columns of these two instances of FastQC/Falco are placed next to each other to make it easier to evaluate. However, the columns of the actual preprocessing steps (i.e., fastp, AdapterRemoval, and Porechop) will be displayed _after_ the two FastQC/Falco columns, even if they were run 'between' the two FastQC/Falco jobs in the pipeline itself.
+
+:::info
+Falco produces identical output to FastQC but in the `falco/` directory.
+:::
+
+![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
+
+![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png)
+
+![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
+
+:::note
+The FastQC plots displayed in the MultiQC report show _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
+:::
+
+### fastp
+
+[fastp](https://github.com/OpenGene/fastp) is a FASTQ pre-processing tool for quality control, trimming of adapters, quality filtering and other features.
+
+It is used in nf-core/taxprofiler for adapter trimming of short-reads.
+
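+For orientation only - the pipeline assembles the exact fastp command internally - a roughly equivalent standalone invocation for paired-end adapter trimming might look like this (file names are placeholders):
+
+```bash
+fastp \
+    --in1 sample_R1.fastq.gz --in2 sample_R2.fastq.gz \
+    --out1 sample_trimmed_R1.fastq.gz --out2 sample_trimmed_R2.fastq.gz \
+    --json sample.fastp.json --html sample.fastp.html \
+    --thread 4
+```
+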
+Output files + +- `fastp/` + - `.fastp.fastq.gz`: File with the trimmed unmerged fastq reads. + - `.merged.fastq.gz`: File with the reads that were successfully merged. + - `.*{log,html,json}`: Log files in different formats. + +
+ +By default nf-core/taxprofiler will only provide the `.fastp.fastq.gz` file if fastp is selected. The file `.merged.fastq.gz` will be available in the output folder if you provide the argument ` --shortread_qc_mergepairs` (optionally retaining un-merged pairs when in combination with `--shortread_qc_includeunmerged`). + +You can change the default value for low complexity filtering by using the argument `--shortread_complexityfilter_fastp_threshold`. + +### AdapterRemoval + +[AdapterRemoval](https://adapterremoval.readthedocs.io/en/stable/) searches for and removes remnant adapter sequences from High-Throughput Sequencing (HTS) data and (optionally) trims low quality bases from the 3' end of reads following adapter removal. It is popular in the field of palaeogenomics. The output logs are stored in the results folder, and as a part of the MultiQC report. + +
+Output files
+
+- `adapterremoval/`
+  - `.settings`: AdapterRemoval log file containing general adapter removal, read trimming and merging statistics
+  - `.collapsed.fastq.gz` - read-pairs that merged and did not undergo trimming (only when `--shortread_qc_mergepairs` supplied)
+  - `.collapsed.truncated.fastq.gz` - read-pairs that merged and underwent quality trimming (only when `--shortread_qc_mergepairs` supplied)
+  - `.pair1.truncated.fastq.gz` - read 1 of pairs that underwent quality trimming
+  - `.pair2.truncated.fastq.gz` - read 2 of pairs that underwent quality trimming (and could not merge if `--shortread_qc_mergepairs` supplied)
+  - `.singleton.truncated.fastq.gz` - orphaned read pairs where one of the pair was discarded
+  - `.discard.fastq.gz` - reads that were discarded due to length or quality filtering
+
+
+By default nf-core/taxprofiler will only provide the `.settings` file if AdapterRemoval is selected.
+
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+:::warning
+The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as complexity filtering, host removal, run merging etc..
+:::
+
+### Porechop
+
+[Porechop](https://github.com/rrwick/Porechop) is a tool for finding and removing adapters from Oxford Nanopore reads. Adapters on the ends of reads are trimmed, and if a read has an adapter in its middle, it is considered chimeric and is chopped into separate reads.
+
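+As a sketch of what this step does (the pipeline wires up the call itself; file names here are placeholders):
+
+```bash
+# Trim ONT adapters from reads, splitting reads with internal adapters
+porechop -i sample_ont.fastq.gz -o sample_ont_trimmed.fastq.gz --threads 4
+```
+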
+Output files + +- `porechop/` + - `.log`: Log file containing trimming statistics + - `.fastq.gz`: Adapter-trimmed file + +
+
+The output logs are saved in the output folder and are part of the MultiQC report. You do not normally need to check these manually.
+
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+:::warning
+We do **not** recommend using Porechop if you are already trimming the adapters with ONT's basecaller Guppy.
+:::
+
+### BBDuk
+
+[BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) stands for Decontamination Using Kmers. BBDuk was developed to combine most common data-quality-related trimming, filtering, and masking operations into a single high-performance tool.
+
+It is used in nf-core/taxprofiler for complexity filtering using different algorithms. This means that it will remove reads with low sequence diversity (e.g. mono- or dinucleotide repeats).
+
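+Purely as an illustration of entropy-based complexity filtering (the threshold and file names below are example values, not necessarily the pipeline's defaults):
+
+```bash
+# Discard reads whose sequence entropy falls below 0.3
+bbduk.sh in=sample_R1.fastq.gz in2=sample_R2.fastq.gz \
+    out=sample_filtered_R1.fastq.gz out2=sample_filtered_R2.fastq.gz \
+    entropy=0.3
+```
+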
+Output files + +- `bbduk/` + - `.bbduk.log`: log file containing filtering statistics + - `.fastq.gz`: resulting FASTQ file without low-complexity reads + +
+ +By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::warning +The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc.. +::: + +### PRINSEQ++ + +[PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus) is a C++ implementation of the [prinseq-lite.pl](https://prinseq.sourceforge.net/) program. It can be used to filter, reformat or trim genomic and metagenomic sequence data. + +It is used in nf-core/taxprofiler for complexity filtering using different algorithms. This means that it will remove reads with low sequence diversity (e.g. mono- or dinucleotide repeats). + +
+Output files + +- `prinseqplusplus/` + - `.log`: log file containing number of reads. Row IDs correspond to: `min_len, max_len, min_gc, max_gc, min_qual_score, min_qual_mean, ns_max_n, noiupac, derep, lc_entropy, lc_dust, trim_tail_left, trim_tail_right, trim_qual_left, trim_qual_right, trim_left, trim_right` + - `_good_out.fastq.gz`: resulting FASTQ file without low-complexity reads + +
+
+By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity-filtered `.fastq` files in your results directory if you supply ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+:::warning
+The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc..
+:::
+
+### Filtlong
+
+[Filtlong](https://github.com/rrwick/Filtlong) is a quality filtering tool for long reads. It can take a set of long reads and produce a smaller, better subset.
+
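+A minimal sketch of a standalone Filtlong call (thresholds and file names are example values only):
+
+```bash
+# Keep the best 90% of bases and drop reads shorter than 1 kb
+filtlong --min_length 1000 --keep_percent 90 sample_ont.fastq.gz \
+    | gzip > sample_ont_filtered.fastq.gz
+```
+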
+Output files + +- `filtlong/` + - `_filtered.fastq.gz`: Quality or short read data filtered file + - `_filtered.log`: log file containing summary statistics + +
+ +You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::warning +We do _not_ recommend using Filtlong if you are performing filtering of low quality reads with ONT's basecaller Guppy. +::: + +### Bowtie2 + +[Bowtie 2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. It is particularly good at aligning reads of about 50 up to 100s or 1,000s of characters, and particularly good at aligning to relatively long (e.g. mammalian) genomes. + +It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/or other possible contaminant reads (e.g. Phi X) from short-read `.fastq` files prior to profiling. + +
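+To make the host-removal idea concrete - this is a hedged sketch, not the pipeline's internal command; the index prefix and file names are placeholders:
+
+```bash
+# Align against a pre-built host index and keep only read pairs that do NOT
+# map (bowtie2 replaces % in the --un-conc filename with the mate number)
+bowtie2 -x GRCh38_index \
+    -1 sample_R1.fastq.gz -2 sample_R2.fastq.gz \
+    --un-conc-gz sample_unmapped_R%.fastq.gz \
+    -S /dev/null
+```
+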
+Output files
+
+- `bowtie2/`
+  - `build/`
+    - `*.bt2`: Bowtie2 indices of reference genome, only if `--save_hostremoval_index` supplied.
+  - `align/`
+    - `.bam`: BAM file containing reads that aligned against the user-supplied reference genome as well as unmapped reads
+    - `.bowtie2.log`: log file about the mapped reads
+    - `.unmapped.fastq.gz`: the off-target reads from the mapping that are used in downstream steps.
+
+ +By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::info +Unmapped reads in FASTQ are only found in this directory for short-reads, for long-reads see [`samtools/fastq/`](#samtools-fastq). +::: + +:::info +The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as run merging etc.. +::: + +:::info +While there is a dedicated section in the MultiQC HTML for Bowtie2, these values are not displayed by default in the General Stats table. Rather, alignment statistics to host genome is reported via samtools stats module in MultiQC report for direct comparison with minimap2 (see below). +::: + +### minimap2 + +[minimap2](https://github.com/lh3/minimap2) is an alignment tool suited to mapping long reads to reference sequences. + +It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) or other possible contaminant reads from long-read `.fastq` files prior to taxonomic classification/profiling. + +
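+For long reads, the equivalent host-removal logic can be sketched as follows (reference and file names are placeholders; the pipeline performs these steps via its own minimap2 and SAMtools modules):
+
+```bash
+# Map ONT reads to the host genome, keep only unmapped reads (SAM flag 4),
+# and convert them back to FASTQ
+minimap2 -ax map-ont host_genome.fna reads_ont.fastq.gz \
+    | samtools view -bh -f 4 - \
+    | samtools fastq - \
+    | gzip > reads_ont_hostremoved.fastq.gz
+```
+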
+Output files + +- `minimap2/` + - `build/` + - `*.mmi2`: minimap2 indices of reference genome, only if `--save_hostremoval_index` supplied. + - `align/` + - `.bam`: Alignment file in BAM format containing both mapped and unmapped reads. + +
+ +By default, nf-core/taxprofiler will only provide the `.bam` file containing mapped and unmapped reads if saving of host removal for long reads is turned on via `--save_hostremoval_bam`. + +:::info +minimap2 is not yet supported as a module in MultiQC and therefore there is no dedicated section in the MultiQC HTML. Rather, alignment statistics to host genome is reported via samtools stats module in MultiQC report. +::: + +:::info +Unlike Bowtie2, minimap2 does not produce an unmapped FASTQ file by itself. See [`samtools/fastq`](#samtools-fastq). +::: + +### SAMtools fastq + +[SAMtools fastq](http://www.htslib.org/doc/1.1/samtools.html) converts a `.sam`, `.bam`, or `.cram` alignment file to FASTQ format + +
+Output files + +- `samtools/stats/` + - `_interleaved.fq.gz`: Unmapped reads only in FASTQ gzip format + +
+ +This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::info +For short-read unmapped reads, see [bowtie2](#bowtie2). +::: + +### Analysis Ready Reads + +:::info +This optional results directory will only be present in the pipeline results when supplying `--save_analysis_ready_reads`. +::: + +
+Output files + +- `samtools/stats/` + - `_{fq,fastq}.gz`: Final reads that underwent preprocessing and were sent for classification/profiling. + +
+ +The results directory will contain the 'final' processed reads used as input for classification/profiling. It will _only_ include the output of the _last_ step of any combinations of preprocessing steps that may have been specified in the run configuration. For example, if you perform the read QC and host-removal preprocessing steps, the final reads that are sent to classification/profiling are the host-removed FASTQ files - those will be the ones present in this directory. + +:::warning +If you turn off all preprocessing steps, then no results will be present in this directory. This happens independently for short- and long-reads. I.e. you will only have FASTQ files for short reads in this directory if you skip all long-read preprocessing. +::: + +### SAMtools stats + +[SAMtools stats](http://www.htslib.org/doc/samtools-stats.html) collects statistics from a `.sam`, `.bam`, or `.cram` alignment file and outputs in a text format. + +
+Output files + +- `samtools/stats/` + - `.stats`: File containing samtools stats output. + +
+ +In most cases you do not need to check this file, as it is rendered in the MultiQC run report. + +### Run Merging + +nf-core/taxprofiler offers the option to merge FASTQ files of multiple sequencing runs or libraries that derive from the same sample, as specified in the input samplesheet. + +This is the last possible preprocessing step, so if you have multiple runs or libraries (and run merging turned on), this will represent the final reads that will go into classification/profiling steps. + +
+Output files + +- `run_merging/` + - `*.fastq.gz`: Concatenated FASTQ files on a per-sample basis + +
+
+Note that you will only find samples that went through the run merging step in this directory. Samples with only a single run or library will not go through this step of the pipeline and thus will not be present in this directory.
+
+This directory and its FASTQ files will only be present if you supply `--save_runmerged_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+### Bracken
+
+[Bracken](https://ccb.jhu.edu/software/bracken/) (Bayesian Reestimation of Abundance with Kraken) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample. Bracken uses the taxonomy labels assigned by Kraken, a highly accurate metagenomics classification algorithm, to estimate the number of reads originating from each species present in a sample.
+
+:::info
+The first step of using Bracken requires running Kraken2, therefore the initial results before abundance estimation will be found in `/kraken2/`.
+:::
+
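+As a hedged standalone sketch of the re-estimation step (database path, read length and file names are placeholders):
+
+```bash
+# Re-estimate species-level (-l S) abundances from an existing Kraken2 report
+bracken -d /path/to/kraken2_db \
+    -i sample.kraken2.report.txt \
+    -o sample.bracken.tsv \
+    -r 150 -l S
+```
+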
+Output files + +- `bracken/` + - `/` + - `bracken__combined_reports.txt`: combined bracken results as output from Bracken's `combine_bracken_outputs.py` script + - `/` + - `_.tsv`: TSV file containing per-sample summary of Bracken results with abundance information + - `_.report_bracken_species.txt`: Kraken2 style report with Bracken abundance information + +
+
+The main taxonomic profiling file from Bracken is the `*.tsv` file. This provides the basic results from Kraken2 but with the corrected abundance information. Note that the raw Kraken2 version of the upstream step of Bracken can be found in the `kraken2/` directory with the suffix of `_.bracken.report.txt` (with a 6 column variant when `--save_minimizers` specified).
+
+### Kraken2
+
+[Kraken](https://ccb.jhu.edu/software/kraken2/) is a taxonomic sequence classifier that assigns taxonomic labels to DNA sequences. Kraken examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps k-mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer.
+
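+A minimal standalone sketch of this classification step, assuming a local Kraken2 database (paths and file names are placeholders; the pipeline builds its own command):
+
+```bash
+# Classify a gzipped paired-end sample against a Kraken2 database
+kraken2 --db /path/to/kraken2_db --threads 8 \
+    --paired --gzip-compressed \
+    --report sample.kraken2.report.txt \
+    --output sample.kraken2.classifiedreads.txt \
+    sample_R1.fastq.gz sample_R2.fastq.gz
+```
+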
+Output files + +- `kraken2/` + - `_combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `krakentools`) + - If you have also run Bracken, the original Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name if you supply `--bracken_save_intermediatekraken2` to the run. For example: `kraken2--bracken.tsv`. However in most cases you want to use the actual Bracken file (i.e., `bracken_.tsv`). + - `/` + - `_.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample + - `_.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample + - `_.report.txt`: A Kraken2 report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits in the Kraken2 run for a given sample. Will be 6 column rather than 8 if `--save_minimizers` specified. This report will **only** be included if you supply `--bracken_save_intermediatekraken2` to the run. + - `_.classifiedreads.txt`: A list of read IDs and the hits each read had against each database for a given sample + +
+
+The main taxonomic classification file from Kraken2 is the `_combined_reports.txt` or `*report.txt` file. The former provides you with the broadest overview of the taxonomic classification results across all samples against a single database, where you get two columns for each sample e.g. `2_all` and `2_lvl`, as well as a summarised column summing up across all samples `tot_all` and `tot_lvl`. The latter gives you the most information for a single sample. The report file is also used for the taxpasta step.
+
+You will only receive the `.fastq` and `*classifiedreads.txt` file if you supply `--kraken2_save_reads` and/or `--kraken2_save_readclassifications` parameters to the pipeline.
+
+When running Bracken, you will only get the 'intermediate' Kraken2 report files in this directory if you supply `--bracken_save_intermediatekraken2` to the run.
+
+### KrakenUniq
+
+[KrakenUniq](https://github.com/fbreitwieser/krakenuniq) (formerly KrakenHLL) is an extension to the fast k-mer-based classification performed by [Kraken](https://github.com/DerrickWood/kraken) with an efficient algorithm for additionally assessing the coverage of unique k-mers found in each species in a dataset.
+
+Output files
+
+- `krakenuniq/`
+  - `/`
+    - `_[.merged].classified.fast{a,q}.gz`: Optional FASTA file containing all reads that had a hit against a reference in the database for a given sample. Paired-end input reads are merged in this output.
+    - `_[.merged].unclassified.fast{a,q}.gz`: Optional FASTA file containing all reads that did not have a hit in the database for a given sample. Paired-end input reads are merged in this output.
+    - `_.krakenuniq.report.txt`: A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits, with an additional column for k-mer coverage, that allows for more accurate distinguishing between false-positive/true-positive hits.
+    - `_.krakenuniq.classified.txt`: An optional list of read IDs and the hits each read had against each database for a given sample.
+
+
+The main taxonomic classification file from KrakenUniq is the `*.krakenuniq.report.txt` file. This is an extension of the Kraken2 report with the additional k-mer coverage information that provides more information about the accuracy of hits.
+
+You will only receive the `.fasta.gz` and `*.krakenuniq.classified.txt` file if you supply `--krakenuniq_save_reads` and/or `--krakenuniq_save_readclassification` parameters to the pipeline.
+
+:::info
+The output system of KrakenUniq can result in other `stdout` or `stderr` logging information being saved in the report file, therefore you must check your report files before downstream use!
+:::
+
+### Centrifuge
+
+[Centrifuge](https://github.com/DaehwanKimLab/centrifuge) is a taxonomic sequence classifier that uses a Burrows-Wheeler transform and Ferragina-Manzini index for storing and mapping sequences.
+
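+A hedged sketch of the classification plus Kraken-style report conversion (index prefix and file names are placeholders):
+
+```bash
+# Classify reads, then convert results to a Kraken-style report
+centrifuge -x /path/to/centrifuge_index \
+    -1 sample_R1.fastq.gz -2 sample_R2.fastq.gz \
+    -S sample.centrifuge.results.txt \
+    --report-file sample.centrifuge.report.txt
+centrifuge-kreport -x /path/to/centrifuge_index \
+    sample.centrifuge.results.txt > sample.centrifuge.txt
+```
+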
+Output files + +- `centrifuge/` + - `/` + - `.centrifuge.mapped.fastq.gz`: `FASTQ` files containing all mapped reads + - `.centrifuge.report.txt`: A classification report that summarises the taxonomic ID, the taxonomic rank, length of genome sequence, number of classified and uniquely classified reads + - `.centrifuge.results.txt`: A file that summarises the classification assignment for a read, i.e read ID, sequence ID, score for the classification, score for the next best classification, number of classifications for this read + - `.centrifuge.txt`: A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits in the centrifuge run for a given sample + - `.centrifuge.unmapped.fastq.gz`: FASTQ file containing all unmapped reads + +
+ +The main taxonomic classification files from Centrifuge are the `_combined_reports.txt`, `*report.txt`, `*results.txt` and the `*centrifuge.txt`. The latter is used by the taxpasta step. You will receive the `.fastq` files if you supply `--centrifuge_save_reads`. + +### Kaiju + +[Kaiju](https://github.com/bioinformatics-centre/kaiju) is a taxonomic classifier that finds maximum exact matches on the protein-level using the Burrows-Wheeler transform. + +
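+As an illustration of protein-level classification followed by summarisation (taxdump and database file names are placeholders; the pipeline runs these as separate modules):
+
+```bash
+# Classify read pairs at the protein level, then build a per-taxon table
+kaiju -z 8 -t nodes.dmp -f kaiju_db.fmi \
+    -i sample_R1.fastq.gz -j sample_R2.fastq.gz \
+    -o sample.kaiju.tsv
+kaiju2table -t nodes.dmp -n names.dmp -r species \
+    -o sample.kaijutable.txt sample.kaiju.tsv
+```
+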
+Output files
+
+- `kaiju/`
+  - `kaiju__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by kaiju2table)
+  - `/`
+    - `_.kaiju.tsv`: Raw output from Kaiju with taxonomic rank, read ID and taxonomic ID
+    - `_.kaijutable.txt`: Summarised Kaiju output with fraction abundance, taxonomic ID, number of reads, and taxonomic names (as generated by `kaiju2table`)
+
+
+The most useful summary file is the `_combined_reports.txt` file, which summarises hits across all reads and samples. Separate per-sample summaries can be seen in `/*.txt`. However, if you wish to look at more precise information on a per-read basis, see the `*tsv` file. The default taxonomic rank is `species`. You can provide a different one by updating the argument `--kaiju_taxon_rank`.
+
+### DIAMOND
+
+[DIAMOND](https://github.com/bbuchfink/diamond) is a sequence aligner for translated DNA searches or protein sequences against a protein reference database such as NR. It is a replacement for the NCBI BLAST software tools. It has many key features and it is used as a taxonomic classifier in nf-core/taxprofiler.
+
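+A minimal sketch of a translated search with taxonomic output, assuming a pre-built DIAMOND database (database and file names are placeholders):
+
+```bash
+# --outfmt 102 emits one taxonomic assignment per query sequence
+diamond blastx --db nr.dmnd \
+    --query sample_R1.fastq.gz \
+    --out sample.diamond.tsv \
+    --outfmt 102 --threads 8
+```
+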
+Output files + +- `diamond/` + - `/` + - `.log`: A log file containing stdout information + - `*.{blast,xml,txt,daa,sam,tsv,paf}`: A file containing alignment information in various formats, or taxonomic information in a text-based format. Exact output depends on user choice. + +
+ +By default you will receive a TSV output. Alternatively, you will receive a `*.sam` file if you provide the parameter `--diamond_save_reads` but in this case no taxonomic classification will be available(!), only the aligned reads in sam format. + +:::info +DIAMOND has many output formats, so depending on your [choice](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options) with ` --diamond_output_format` you will receive the taxonomic information in a different format. +::: + +### MALT + +[MALT](https://software-ab.cs.uni-tuebingen.de/download/malt) is a fast replacement for BLASTX, BLASTP and BLASTN, and provides both local and semi-global alignment capabilities. + +
+Output files + +- `malt/` + - `/` + - `.blastn.sam`: sparse SAM file containing alignments of each hit + - `.megan`: summary file that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer. Generated by MEGAN6 companion tool `rma2info` + - `.rma6`: binary file containing all alignments and taxonomic information of hits that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer + - `.txt.gz`: text file containing taxonomic IDs and read counts against each taxon. Generated by MEGAN6 companion tool `rma2info` + +
+
+The main output of MALT is the `.rma6` file format, which can only be loaded into MEGAN and its related tools. We provide the `rma2info` text files for improved compatibility with spreadsheet programs and other programmatic data manipulation tools, however this has only limited information compared to the 'binary' RMA6 file format (the `.txt` file only contains taxonomic ID and count, whereas RMA6 has taxonomic lineage information).
+
+You will only receive the `.sam` and `.megan` files if you supply `--malt_save_reads` and/or `--malt_generate_megansummary` parameters to the pipeline.
+
+### MetaPhlAn
+
+[MetaPhlAn](https://github.com/biobakery/metaphlan) is a computational tool for profiling the composition of microbial communities (Bacteria, Archaea and Eukaryotes) from metagenomic shotgun sequencing data (i.e. not 16S) with species-level resolution via marker genes.
+
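+A hedged standalone sketch of marker-gene profiling for a paired-end sample (database location and file names are placeholders):
+
+```bash
+# Profile a sample against the MetaPhlAn marker database
+metaphlan sample_R1.fastq.gz,sample_R2.fastq.gz \
+    --input_type fastq \
+    --bowtie2db /path/to/metaphlan_db \
+    --bowtie2out sample.bowtie2out.txt \
+    --nproc 8 \
+    -o sample_profile.txt
+```
+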
+Output files + +- `metaphlan/` + - `metaphlan__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `metaphlan_merge_tables`) + - `/` + - `.biom`: taxonomic profile in BIOM format + - `.bowtie2out.txt`: BowTie2 alignment information (can be re-used for skipping alignment when re-running MetaPhlAn with different parameters) + - `_profile.txt`: MetaPhlAn taxonomic profile including abundance estimates + +
+
+The output contains a file named `*_combined_reports.txt`, which provides an overview of the classification results for all samples. The main taxonomic profiling file from MetaPhlAn is the `*_profile.txt` file. This provides the abundance estimates from MetaPhlAn, however it does not include raw counts by default. Additionally, it contains intermediate Bowtie2 output `.bowtie2out.txt`, which presents a condensed representation of the mapping results of your sequencing reads to MetaPhlAn's marker gene sequences. The alignments are listed in tab-separated columns, including Read ID and Marker Gene ID, with each alignment represented on a separate line.
+
+### mOTUs
+
+[mOTUs](https://github.com/motu-tool/mOTUs) is a taxonomic profiler that maps reads to a unique marker-specific database and estimates the relative abundance of known and unknown species.
+
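+As a rough sketch only (flag usage should be checked against the mOTUs documentation; file names are placeholders):
+
+```bash
+# Profile a paired-end sample; -p adds NCBI taxonomy IDs to the output
+motus profile -f sample_R1.fastq.gz -r sample_R2.fastq.gz \
+    -t 8 -p -o sample.out
+```
+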
+Output files + +- `motus/` + - `/` + - `.log`: A log file that contains summary statistics + - `.out`: A classification file that summarises taxonomic identifiers, by default at the rank of mOTUs (i.e., species level), and their relative abundances in the profiled sample. + - `motus__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `motus_merge`) + +
+ +Normally `*_combined_reports.txt` is the most useful file for downstream analyses, but the per sample `.out` file can provide additional more specific information. By default, nf-core/taxprofiler is providing a column describing NCBI taxonomic ID as this is used in the taxpasta step. You can disable this column by activating the argument `--motus_remove_ncbi_ids`. +You will receive the relative abundance instead of read counts if you provide the argument `--motus_use_relative_abundance`. + +### KMCP + +[KMCP](https://github.com/shenwei356/kmcp) utilises genome coverage information by splitting the reference genomes into chunks and stores k-mers in a modified and optimised COBS index for fast alignment-free sequence searching. KMCP combines k-mer similarity and genome coverage information to reduce the false positive rate of k-mer-based taxonomic classification and profiling methods. + +
+Output files + +- `kmcp/` + + - `/` + - `.gz`: output of `kmcp_search` containing search sequences against a database in tab-delimited format with 15 columns. + - `_kmcp.profile`: output of `kmcp_profile` containing the taxonomic profile from search results. + +
+ +You will receive the `.gz` file if you supply `--kmcp_save_search`. Please note that there is no taxonomic label assignment in this output file. + +The main taxonomic classification file from KMCP is the `*kmcp.profile` which is also used by the taxpasta step. + +### ganon + +[ganon](https://pirovc.github.io/ganon/) is designed to index large sets of genomic reference sequences and to classify reads against them efficiently. The tool uses Interleaved Bloom Filters as indices based on k-mers/minimizers. It was mainly developed, but not limited, to the metagenomics classification problem: quickly assign sequence fragments to their closest reference among thousands of references. After classification, taxonomic abundance is estimated and reported. + +
+Output files + +- `ganon/` + + - `/` + + - `_report.tre`: output of `ganon report` containing taxonomic classifications with possible formatting and/or filtering depending on options specified. + - ``.tre: output of `ganon classify` containing raw taxonomic classifications and abundance estimations with no additional formatting or filtering. + - ``.rep: 'raw' report of counts against each taxon. + - ``.all: per-read summary of all hits of each reads. + - ``.lca: per-read summary of the best single hit after LCA for each read. + - ``.unc: list of read IDs with no hits. + - ``.log: the stdout console messages printed by `ganon classify`, containing some classification summary information + + - `ganon__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `ganon table`) + +
+ +Generally you will want to refer to the `combined_reports.txt` or `_report.tre` file. For further descriptions of the contents of each file, see the [ganon documentation](https://pirovc.github.io/ganon/outputfiles/). + +You will only receive the `.all`, `.lca`, and `.unc` files if you supply the `--ganon_save_readclassifications` parameter to the pipeline. + +### Krona + +[Krona](https://github.com/marbl/Krona) allows the exploration of (metagenomic) hierarchical data with interactive zooming, multi-layered pie charts. + +Krona charts will be generated by the pipeline for supported tools (Kraken2, Centrifuge, Kaiju, and MALT) + +
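+The pipeline generates these charts through its own modules; purely to illustrate the idea, a Kraken2-style report can be converted and rendered by hand with KrakenTools and KronaTools (both assumed to be installed; file names are placeholders):
+
+```bash
+# Convert a Kraken2 report to Krona text input, then render the chart
+kreport2krona.py -r sample.kraken2.report.txt -o sample.krona.txt
+ktImportText sample.krona.txt -o sample.krona.html
+```
+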
+Output files + +- `krona/` + - `_.html`: per-tool/per-database interactive HTML file containing hierarchical piecharts + +
+
+The resulting HTML files can be loaded into your web browser for exploration. Each file will have a dropdown to allow you to switch between each sample aligned against the given database of the tool.
+
+### TAXPASTA
+
+[TAXPASTA](https://github.com/taxprofiler/taxpasta) standardises and optionally merges two or more taxonomic profiles across samples into one single table. It supports multiple different classifiers, simplifying comparison of taxonomic classification results between tools and databases.
+
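+A hedged sketch of merging per-sample Kraken2 reports into one standardised table (taxdump path and file names are placeholders):
+
+```bash
+# Merge profiles and attach taxon names from a local NCBI taxdump
+taxpasta merge --profiler kraken2 \
+    --output kraken2_merged.tsv \
+    --add-name --taxonomy /path/to/taxdump \
+    sample1.kraken2.report.txt sample2.kraken2.report.txt
+```
+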
+Output files + +- `taxpasta/` + + - `_*.{tsv,csv,arrow,parquet,biom}`: Standardised taxon table containing multiple samples. The standard format is the `tsv`. + - The first column describes the taxonomy ID and the rest of the columns describe the read counts for each sample. + - Note that the file naming scheme will apply regardless of whether `TAXPASTA_MERGE` (multiple sample run) or `TAXPASTA_STANDARDISE` (single sample run) are executed. + - If you have also run Bracken, the initial Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name. For example: `kraken2--bracken.tsv`. However in most cases you want to use the actual Bracken file (i.e., `bracken_.tsv`). + +
+
+By providing the path to a directory containing taxdump files to `--taxpasta_taxonomy_dir`, the taxon name, the taxon rank, the taxon's entire lineage including taxon names and/or the taxon's entire lineage including taxon identifiers can also be added in the output in addition to just the taxon ID. Addition of this extra information can be turned on by using the parameters `--taxpasta_add_name`, `--taxpasta_add_rank`, `--taxpasta_add_lineage` and `--taxpasta_add_idlineage` respectively.
+
+These files will likely be the most useful files for the comparison of differences in classification between different tools or for building consensuses, with the caveat that they have slightly less information than the actual output from each tool (which may have non-standard information e.g. taxonomic rank, percentage of hits, abundance estimations).
+
+The following report files are used for the taxpasta step:
+
+- Bracken: `_.tsv` Taxpasta uses the `new_est_reads` column for the standardised profile.
+- Centrifuge: `.centrifuge.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile.
+- Diamond: `` Taxpasta summarises the number of reads per NCBI taxonomy ID for the standardised profile.
+- Kaiju: `_.kaijutable.txt` Taxpasta uses the `reads` column from kaiju2table for the standardised profile.
+- KrakenUniq: `_.report.txt` Taxpasta uses the `reads` column for the standardised profile.
+- Kraken2: `_.report.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile.
+- MALT: `.txt.gz` Taxpasta uses the `count` (second) column from the output of MEGAN6's rma2info for the standardised profile.
+- MetaPhlAn: `_profile.txt` Taxpasta uses the `relative_abundance` column multiplied with a fixed number to yield an integer for the standardised profile.
+- mOTUs: `.out` Taxpasta uses the `read_count` column for the standardised profile.
+
+:::warning
+Please be aware that the outputs of each tool's standardised profile _may not_ be directly comparable between tools. Some may report raw read counts, whereas others may report abundance information. Please always refer to the list above for which information is used for each tool.
+:::
+
+Output files + +- `multiqc/` + - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. + - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. + - `multiqc_plots/`: directory containing static images from the report in various formats. + +
+
+[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+
+Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+
+All tools in taxprofiler supported by MultiQC will have a dedicated section showing summary statistics of each tool based on information stored in log files.
+
+You can expect in the MultiQC reports either sections and/or general stats columns for the following tools:
+
+- fastqc
+- adapterRemoval
+- fastp
+- bbduk
+- prinseqplusplus
+- porechop
+- filtlong
+- bowtie2
+- minimap2
+- samtools (stats)
+- kraken
+- bracken
+- centrifuge
+- kaiju
+- diamond
+- malt
+- motus
+
+:::info
+The 'General Stats' table by default will only show statistics referring to pre-processing steps, and will not display possible values from each classifier/profiler, unless turned on by the user within the 'Configure Columns' menu or via a custom MultiQC config file (`--multiqc_config`)
+:::
+
+### Pipeline information
+
+Output files
+
+- `pipeline_info/`
+  - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
+  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
+  - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
+  - Parameters used by the pipeline run: `params.json`.
+
+ +[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. + diff --git a/bu_isciii/assets/reports/results/mag.md b/bu_isciii/assets/reports/results/mag.md index 90193de6d..dd7848b73 100644 --- a/bu_isciii/assets/reports/results/mag.md +++ b/bu_isciii/assets/reports/results/mag.md @@ -6,3 +6,12 @@ Here we describe the results from the MAG pipeline for multispecies metagenomic > [!WARNING] > Software's versions used in this analysis can be obtained from the `MultiQC` report. + +## Taxprofiler + +Here we describe the results from the (nf-core/taxprofiler)[https://nf-co.re/taxprofiler/1.1.8] pipeline for multispecies taxonomic classification and profiling of shorgun short- and long-read. + +* multiqc_report.html​ : Final HTML report collecting numerical stats from each module executed in this pipeline. + +> [!WARNING] +> Software's versions used in this analysis can be obtained from the `MultiQC` report. \ No newline at end of file diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json index 346a98727..44b5f5b68 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -209,8 +209,8 @@ "order": 2, "begin": "base", "end": "", - "url": "https://github.com/nf-core/mag", - "description": "Bioinformatics best-practise analysis pipeline for assembly, binning and annotation of metagenomes.", + "url": "https://github.com/nf-core/mag or https://github.com/nf-core/taxprofiler", + "description": "1- Bioinformatics best-practise analysis for taxonomic classification and profiling; 2- Bioinformatics best-practise analysis pipeline for assembly, binning and annotation of metagenomes.", "clean": { "folders":[], "files":[] From a2fb0aebbcc2bde4a5ced1f04bf42e14464e18f7 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 30 Aug 2024 13:06:08 +0200 Subject: [PATCH 202/321] fixed path to results in taxprofiler results lablog --- bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results b/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results index d6231a54b..9867e2f1d 100755 --- a/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results +++ b/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results @@ -5,4 +5,4 @@ mkdir -p $DELIVERY_FOLDER/taxprofiler cd $DELIVERY_FOLDER/taxprofiler # Links to reports -ln -s ../../../ANALYSIS/*ANALYSIS01_TAXPROFILER/results/multiqc/multiqc_report.html . +ln -s ../../../ANALYSIS/*ANALYSIS01_TAXPROFILER/multiqc/multiqc_report.html . 
From fad629309cef22efc38a25a17e9facd0cfa425a5 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 30 Aug 2024 13:06:39 +0200 Subject: [PATCH 203/321] fixed indentation --- .../templates/mag/DOC/taxprofiler.config | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/bu_isciii/templates/mag/DOC/taxprofiler.config b/bu_isciii/templates/mag/DOC/taxprofiler.config index bfbc4ac15..60569c0ac 100644 --- a/bu_isciii/templates/mag/DOC/taxprofiler.config +++ b/bu_isciii/templates/mag/DOC/taxprofiler.config @@ -3,31 +3,31 @@ */ singularity { - enabled = true - autoMounts = true - singularity.cacheDir = '/data/cnm/ratb/pipelines/singularity-images/' + enabled = true + autoMounts = true + singularity.cacheDir = '/data/cnm/ratb/pipelines/singularity-images/' } process { - executor = 'slurm' - queue = 'middle_idx' - jobName = { "$task.name - $task.hash" } - conda = null + executor = 'slurm' + queue = 'middle_idx' + jobName = { "$task.name - $task.hash" } + conda = null - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } withName:'KAIJU_KAIJU' { - errorStrategy = { task.exitStatus in [143,137,21,1] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [143,137,21,1] ? 'retry' : 'finish' } maxRetries = 3 - memory = { 72.GB * task.attempt } - time = { 8.h } - } + memory = { 72.GB * task.attempt } + time = { 8.h } + } } params { - max_memory = 376.GB - max_cpus = 32 - max_time = '24.h' + max_memory = 376.GB + max_cpus = 32 + max_time = '24.h' } /* From 2f16d06fb9336485fd64653af09b2ec1739f1aa7 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 30 Aug 2024 14:15:41 +0200 Subject: [PATCH 204/321] added krona rhtml to results folder --- bu_isciii/assets/reports/results/mag.md | 5 +++-- bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/bu_isciii/assets/reports/results/mag.md b/bu_isciii/assets/reports/results/mag.md index dd7848b73..cec03e6ba 100644 --- a/bu_isciii/assets/reports/results/mag.md +++ b/bu_isciii/assets/reports/results/mag.md @@ -2,7 +2,7 @@ Here we describe the results from the MAG pipeline for multispecies metagenomic analysis. -* krona_results.html​ : Final HTML report with the top 5 species most present in all samples. +* `krona_results.html`​ : Final HTML report with the top 5 species most present in all samples. > [!WARNING] > Software's versions used in this analysis can be obtained from the `MultiQC` report. @@ -11,7 +11,8 @@ Here we describe the results from the MAG pipeline for multispecies metagenomic Here we describe the results from the (nf-core/taxprofiler)[https://nf-co.re/taxprofiler/1.1.8] pipeline for multispecies taxonomic classification and profiling of shorgun short- and long-read. -* multiqc_report.html​ : Final HTML report collecting numerical stats from each module executed in this pipeline. +* `multiqc_report.html​`: Final HTML report collecting numerical stats from each module executed in this pipeline. +* `krona/*.html`: Interactive HTML files generated by Krona, displaying the results of taxonomic classification. > [!WARNING] > Software's versions used in this analysis can be obtained from the `MultiQC` report. 
\ No newline at end of file diff --git a/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results b/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results index 9867e2f1d..7c846010e 100755 --- a/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results +++ b/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results @@ -6,3 +6,5 @@ cd $DELIVERY_FOLDER/taxprofiler # Links to reports ln -s ../../../ANALYSIS/*ANALYSIS01_TAXPROFILER/multiqc/multiqc_report.html . +ln -s ../../../ANALYSIS/*ANALYSIS01_TAXPROFILER/krona . + From 573758e4a57ac7830cd8e0efb4743ea09386a2c8 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 30 Aug 2024 14:21:29 +0200 Subject: [PATCH 205/321] added user to sftp --- bu_isciii/templates/sftp_user.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bu_isciii/templates/sftp_user.json b/bu_isciii/templates/sftp_user.json index f8ee42f60..2461c05d9 100755 --- a/bu_isciii/templates/sftp_user.json +++ b/bu_isciii/templates/sftp_user.json @@ -52,5 +52,6 @@ "anadonoso": ["Labenterovirus"], "mvmoneo": ["SpainUDP"], "bbaladron": ["SpainUDP"], - "bioinfoadm": ["test"] + "bioinfoadm": ["test"], + "s.varona": ["misc"] } From 708b855dd70bbe91312397b3b90c547760f9d3bd Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 30 Aug 2024 15:17:38 +0200 Subject: [PATCH 206/321] added prefix to krona html results --- bu_isciii/assets/reports/results/mag.md | 2 +- bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/bu_isciii/assets/reports/results/mag.md b/bu_isciii/assets/reports/results/mag.md index cec03e6ba..fa6ea6cb7 100644 --- a/bu_isciii/assets/reports/results/mag.md +++ b/bu_isciii/assets/reports/results/mag.md @@ -12,7 +12,7 @@ Here we describe the results from the MAG pipeline for multispecies metagenomic Here we describe the results from the (nf-core/taxprofiler)[https://nf-co.re/taxprofiler/1.1.8] pipeline for multispecies taxonomic classification and profiling of shorgun short- and long-read. * `multiqc_report.html​`: Final HTML report collecting numerical stats from each module executed in this pipeline. -* `krona/*.html`: Interactive HTML files generated by Krona, displaying the results of taxonomic classification. +* `krona/database_*.html`: Interactive HTML files generated by Krona, displaying the results of taxonomic classification for supported tools (Kraken2, Centrifuge, Kaiju, and MALT) > [!WARNING] > Software's versions used in this analysis can be obtained from the `MultiQC` report. \ No newline at end of file diff --git a/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results b/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results index 7c846010e..cfb01fa4c 100755 --- a/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results +++ b/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results @@ -6,5 +6,9 @@ cd $DELIVERY_FOLDER/taxprofiler # Links to reports ln -s ../../../ANALYSIS/*ANALYSIS01_TAXPROFILER/multiqc/multiqc_report.html . -ln -s ../../../ANALYSIS/*ANALYSIS01_TAXPROFILER/krona . 
+
+mkdir -p krona
+for file in ../../../ANALYSIS/*ANALYSIS01_TAXPROFILER/krona/*.html; do
+    base=$(basename "$file")
+    ln -s "../$file" "krona/database_${base}"
+done

From e61ac18b78ff2a83c52f0edb69b7d03db71d2be4 Mon Sep 17 00:00:00 2001
From: Dani VM
Date: Thu, 22 Aug 2024 09:00:37 +0200
Subject: [PATCH 207/321] added init config for mag template

---
 bu_isciii/templates/mag/DOC/mag.config | 52 +++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 10 deletions(-)

diff --git a/bu_isciii/templates/mag/DOC/mag.config b/bu_isciii/templates/mag/DOC/mag.config
index 732980bf1..fdb5093e4 100644
--- a/bu_isciii/templates/mag/DOC/mag.config
+++ b/bu_isciii/templates/mag/DOC/mag.config
@@ -1,19 +1,51 @@
+/*
+    HPC XTUTATIS CONFIGURATION
+*/
+
 singularity {
-    enabled = true
-    autoMounts = true
+    enabled = true
+    autoMounts = true
+    singularity.cacheDir = '/data/bi/pipelines/singularity-images'
 }
 
 process {
-    executor = 'slurm'
-    queue = 'middle_idx'
-    queue = 'middle_idx'
-    errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 'ignore' : 'finish' }
-    maxRetries = 1
-    maxErrors = '-1'
+    executor = 'slurm'
+    queue = 'long_idx'
+    jobName = { "$task.name - $task.hash" }
+    conda = null
+
+    errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
+
+    withName:'SPADES|MEGAHIT' {
+        errorStrategy = { task.exitStatus in [143,137,21,1] ? 'retry' : 'finish' }
+        maxRetries = 2
+        cpus = { 16 * task.attempt }
+        memory = { 64.GB * task.attempt }
+        time = { 24.h }
+    }
+    withName:'MAXBIN2' {
+        // often fails when insufficient information, so we allow it to gracefully fail without failing the pipeline
+        errorStrategy = { task.exitStatus in [ 1, 255 ] ? 'ignore' : 'retry' }
+        time = { 8.h * task.attempt }
+    }
+    withName:CONCOCT_CONCOCT {
+        errorStrategy = { task.exitStatus in [140] ? 'retry' : 'finish' }
+        maxRetries = 2
+        cpus = { 12 * task.attempt }
+        memory = { 64.GB * task.attempt }
+        time = { 12.h * task.attempt }
+    }
+    withName:CHECKM_LINEAGEWF {
+        errorStrategy = { task.exitStatus in [1] ? 'retry' : 'finish' }
+        maxRetries = 3
+        cpus = { 8 * task.attempt }
+        memory = { 32.GB * task.attempt }
+        time = { 4.h * task.attempt }
+    }
 }
 
 params {
-    max_memory = 376.GB
+    max_memory = 128.GB
     max_cpus = 32
-    max_time = '48.h'
+    max_time = '84.h'
 }

From 0d9e3803fedbbad4df33f9046e5c077996dfd4c0 Mon Sep 17 00:00:00 2001
From: Dani VM
Date: Thu, 22 Aug 2024 09:10:36 +0200
Subject: [PATCH 208/321] added mag-complete lablogs

---
 .../ANALYSIS02_MAG/lablog_mag_complete | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog_mag_complete

diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog_mag_complete b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog_mag_complete
new file mode 100644
index 000000000..1cc51ad6e
--- /dev/null
+++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog_mag_complete
@@ -0,0 +1,37 @@
+# SETUP INPUT SAMPLE SHEET
+ln -s ../00-reads .
+ln -s ../samples_id.txt .
+
+# Setup samplesheet
+echo "sample,group,short_reads_1,short_reads_2,long_reads" > samplesheet.csv
+cat samples_id.txt | while read in; do
+    echo "${in},,00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz,"
+done >> samplesheet.csv
+
+scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
+
+
+cat <<EOF > complete_mag.sbatch
+#!/bin/sh
+#SBATCH --ntasks 1
+#SBATCH --cpus-per-task 2
+#SBATCH --mem 8G
+#SBATCH --time 120:00:00
+#SBATCH --partition long_idx
+#SBATCH --output $(date '+%Y%m%d')_metagenomics01.log
+#SBATCH --chdir $scratch_dir
+
+# module load Nextflow/23.10.0 singularity
+export NXF_OPTS="-Xms500M -Xmx8G"
+
+nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.5.3/workflow/main.nf \\
+    -c ../../DOC/hpc_slurm_mag.config \\
+    -profile singularity \\
+    --input samplesheet.csv \\
+    --kraken2_db '/data/bi/references/kraken/minikraken_8GB_20200312.tgz' \\
+    --skip_spadeshybrid true \\
+    --outdir $(date '+%Y%m%d')_mag_complete \\
+    -resume
+EOF
+
+echo "sbatch complete_mag.sbatch" > _01_run_complete_mag.sh

From 1711849ff3aafaec748f296872c6364635d70d86 Mon Sep 17 00:00:00 2001
From: Dani VM
Date: Thu, 22 Aug 2024 10:29:16 +0200
Subject: [PATCH 209/321] changed comp queue

---
 bu_isciii/templates/mag/DOC/mag.config | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bu_isciii/templates/mag/DOC/mag.config b/bu_isciii/templates/mag/DOC/mag.config
index fdb5093e4..86327ddcc 100644
--- a/bu_isciii/templates/mag/DOC/mag.config
+++ b/bu_isciii/templates/mag/DOC/mag.config
@@ -10,7 +10,7 @@ singularity {
 
 process {
     executor = 'slurm'
-    queue = 'long_idx'
+    queue = 'middle_idx'
     jobName = { "$task.name - $task.hash" }
     conda = null
 
@@ -28,14 +28,14 @@ process {
         errorStrategy = { task.exitStatus in [ 1, 255 ] ? 'ignore' : 'retry' }
         time = { 8.h * task.attempt }
     }
-    withName:CONCOCT_CONCOCT {
+    withName:'CONCOCT_CONCOCT' {
         errorStrategy = { task.exitStatus in [140] ? 'retry' : 'finish' }
         maxRetries = 2
         cpus = { 12 * task.attempt }
         memory = { 64.GB * task.attempt }
         time = { 12.h * task.attempt }
     }
-    withName:CHECKM_LINEAGEWF {
+    withName:'CHECKM_LINEAGEWF' {
         errorStrategy = { task.exitStatus in [1] ?
'retry' : 'finish' } maxRetries = 3 cpus = { 8 * task.attempt } @@ -45,7 +45,7 @@ process { } params { - max_memory = 128.GB + max_memory = 376.GB max_cpus = 32 - max_time = '84.h' + max_time = '48.h' } From 7dc3947a6fb5a105bede07a89a1ca0fb54603927 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Thu, 22 Aug 2024 10:30:32 +0200 Subject: [PATCH 210/321] renaming folders in mag template --- .../lablog_mag_complete => ANALYSIS01_MAG_ALL/lablog} | 8 ++++---- .../99-stats/lablog | 0 .../99-stats/multiqc_config.yaml | 0 .../{ANALYSIS02_MAG => ANALYSIS02_MAG_TAXONOMICS}/lablog | 0 bu_isciii/templates/mag/ANALYSIS/lablog_mag_all | 5 +++++ .../mag/ANALYSIS/{lablog_mag => lablog_mag_taxonomics} | 0 6 files changed, 9 insertions(+), 4 deletions(-) rename bu_isciii/templates/mag/ANALYSIS/{ANALYSIS02_MAG/lablog_mag_complete => ANALYSIS01_MAG_ALL/lablog} (83%) rename bu_isciii/templates/mag/ANALYSIS/{ANALYSIS02_MAG => ANALYSIS02_MAG_TAXONOMICS}/99-stats/lablog (100%) rename bu_isciii/templates/mag/ANALYSIS/{ANALYSIS02_MAG => ANALYSIS02_MAG_TAXONOMICS}/99-stats/multiqc_config.yaml (100%) rename bu_isciii/templates/mag/ANALYSIS/{ANALYSIS02_MAG => ANALYSIS02_MAG_TAXONOMICS}/lablog (100%) create mode 100644 bu_isciii/templates/mag/ANALYSIS/lablog_mag_all rename bu_isciii/templates/mag/ANALYSIS/{lablog_mag => lablog_mag_taxonomics} (100%) diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog_mag_complete b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_MAG_ALL/lablog similarity index 83% rename from bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog_mag_complete rename to bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_MAG_ALL/lablog index 1cc51ad6e..341c46c38 100644 --- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog_mag_complete +++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_MAG_ALL/lablog @@ -11,21 +11,21 @@ done >> samplesheet.csv scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") -cat < complete_mag.sbatch +cat < mag_complete.sbatch #!/bin/sh #SBATCH --ntasks 1 #SBATCH --cpus-per-task 2 #SBATCH --mem 8G #SBATCH --time 120:00:00 #SBATCH --partition long_idx -#SBATCH --output $(date '+%Y%m%d')_metagenomics01.log +#SBATCH --output $(date '+%Y%m%d')_mag_complete.log #SBATCH --chdir $scratch_dir # module load Nextflow/23.10.0 singularity export NXF_OPTS="-Xms500M -Xmx8G" nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.5.3/workflow/main.nf \\ - -c ../../DOC/hpc_slurm_mag.config \\ + -c ../../DOC/mag.config \\ -profile singularity \\ --input samplesheet.csv \\ --kraken2_db '/data/bi/references/kraken/minikraken_8GB_20200312.tgz' \\ @@ -34,4 +34,4 @@ nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.5.3/workflow/main.nf \ -resume EOF -echo "sbatch complete_mag.sbatch" > _01_run_complete_mag.sh +echo "sbatch mag_complete.sbatch" > _01_run_mag_complete.sh diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/lablog similarity index 100% rename from bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/lablog rename to bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/lablog diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/multiqc_config.yaml b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/multiqc_config.yaml similarity index 100% rename from bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/99-stats/multiqc_config.yaml rename to 
bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/multiqc_config.yaml diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/lablog similarity index 100% rename from bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG/lablog rename to bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/lablog diff --git a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all new file mode 100644 index 000000000..68cd3b09f --- /dev/null +++ b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all @@ -0,0 +1,5 @@ +mkdir 00-reads +cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd - +cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - + +mv ANALYSIS01_MAG_ALL $(date '+%Y%m%d')_ANALYSIS01_MAG_ALL \ No newline at end of file diff --git a/bu_isciii/templates/mag/ANALYSIS/lablog_mag b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics similarity index 100% rename from bu_isciii/templates/mag/ANALYSIS/lablog_mag rename to bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics From 9d542db069710b3ea0266a4c7510659ba8b4df86 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Thu, 22 Aug 2024 15:47:22 +0200 Subject: [PATCH 211/321] update mag config params --- bu_isciii/templates/mag/DOC/mag.config | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/mag/DOC/mag.config b/bu_isciii/templates/mag/DOC/mag.config index 86327ddcc..a61f7686e 100644 --- a/bu_isciii/templates/mag/DOC/mag.config +++ b/bu_isciii/templates/mag/DOC/mag.config @@ -19,7 +19,7 @@ process { withName:'SPADES|MEGAHIT' { errorStrategy = { task.exitStatus in [143,137,21,1] ? 'retry' : 'finish' } maxRetries = 2 - cpus = { 16 * task.attempt } + cpus = { 10 * task.attempt } memory = { 64.GB * task.attempt } time = { 24.h } } @@ -31,7 +31,7 @@ process { withName:'CONCOCT_CONCOCT' { errorStrategy = { task.exitStatus in [140] ? 
'retry' : 'finish' } maxRetries = 2 - cpus = { 12 * task.attempt } + cpus = { 8 * task.attempt } memory = { 64.GB * task.attempt } time = { 12.h * task.attempt } } @@ -42,6 +42,9 @@ process { memory = { 32.GB * task.attempt } time = { 4.h * task.attempt } } + withName:'BOWTIE2_PHIX_REMOVAL_BUILD'{ + time = 12.h + } } params { From 72721a95d9ed4bde0dd9fb1b7ea9183a0edf892b Mon Sep 17 00:00:00 2001 From: Dani VM Date: Thu, 22 Aug 2024 15:50:52 +0200 Subject: [PATCH 212/321] fix lablog in mag_tax --- bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics | 2 +- bu_isciii/templates/mag/RESULTS/lablog_mag_results | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics index 5076f4425..72d7d463c 100644 --- a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics +++ b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics @@ -1 +1 @@ -mv ANALYSIS02_MAG $(date '+%Y%m%d')_ANALYSIS02_MAG +mv ANALYSIS02_MAG_TAXONOMICS $(date '+%Y%m%d')_ANALYSIS02_MAG_TAXONOMICS diff --git a/bu_isciii/templates/mag/RESULTS/lablog_mag_results b/bu_isciii/templates/mag/RESULTS/lablog_mag_results index 55bc4c0e5..11667ce47 100755 --- a/bu_isciii/templates/mag/RESULTS/lablog_mag_results +++ b/bu_isciii/templates/mag/RESULTS/lablog_mag_results @@ -3,4 +3,4 @@ cd $(date '+%Y%m%d')_entrega01 #Create symbolic links depending on the analysis #Individual files -ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html +ln -s ../../ANALYSIS/*_ANALYSIS02_MAG_TAXONOMICS/99-stats/multiqc_report.html ./krona_results.html From 092ee8fcb7f8b41b634e0d32fe9d391c631bdd70 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 23 Aug 2024 12:11:13 +0200 Subject: [PATCH 213/321] minor config update --- bu_isciii/templates/mag/DOC/mag.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/mag/DOC/mag.config b/bu_isciii/templates/mag/DOC/mag.config index a61f7686e..f6e3c8b2c 100644 --- a/bu_isciii/templates/mag/DOC/mag.config +++ b/bu_isciii/templates/mag/DOC/mag.config @@ -17,7 +17,7 @@ process { errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } withName:'SPADES|MEGAHIT' { - errorStrategy = { task.exitStatus in [143,137,21,1] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [143,137,21,12,1] ? 'retry' : 'finish' } maxRetries = 2 cpus = { 10 * task.attempt } memory = { 64.GB * task.attempt } From d1506ac2fcfde9576e46a2ed285d9278ed8adae9 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 23 Aug 2024 16:35:47 +0200 Subject: [PATCH 214/321] update changelog #321 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42f918ab2..e988773d5 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,7 @@ Code contributions to the new version: - Added scripts for time series RNAseq and updated differential expression code for differentially expressed transcripts [#316](https://github.com/BU-ISCIII/buisciii-tools/pull/316). - Added bbaladron to SFTP users [#316](https://github.com/BU-ISCIII/buisciii-tools/pull/316). - Added new template for comprehensive taxonomy profiling using the nf-core/taxprofiler pipeline [#320](https://github.com/BU-ISCIII/buisciii-tools/pull/320). +- Added full execution support for the MAG template [#321](https://github.com/BU-ISCIII/buisciii-tools/pull/321). 
### Modules

From 73893ddd5520cb5249624406607acd757e0b5f33 Mon Sep 17 00:00:00 2001
From: Dani VM
Date: Fri, 23 Aug 2024 16:42:44 +0200
Subject: [PATCH 215/321] dummy changes

---
 bu_isciii/templates/mag/ANALYSIS/lablog_mag_all | 2 +-
 bu_isciii/templates/mag/DOC/mag.config | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all
index 68cd3b09f..a6e0982eb 100644
--- a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all
+++ b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all
@@ -2,4 +2,4 @@ mkdir 00-reads
 cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd -
 cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd -
 
-mv ANALYSIS01_MAG_ALL $(date '+%Y%m%d')_ANALYSIS01_MAG_ALL
\ No newline at end of file
+mv ANALYSIS01_MAG_ALL $(date '+%Y%m%d')_ANALYSIS01_MAG_ALL
diff --git a/bu_isciii/templates/mag/DOC/mag.config b/bu_isciii/templates/mag/DOC/mag.config
index f6e3c8b2c..3a70811d8 100644
--- a/bu_isciii/templates/mag/DOC/mag.config
+++ b/bu_isciii/templates/mag/DOC/mag.config
@@ -18,7 +18,7 @@ process {
 
     withName:'SPADES|MEGAHIT' {
         errorStrategy = { task.exitStatus in [143,137,21,12,1] ? 'retry' : 'finish' }
-        maxRetries = 2
+        maxRetries = 2
         cpus = { 10 * task.attempt }
         memory = { 64.GB * task.attempt }
         time = { 24.h }

From f46ea4d758cbe0c710ad278e17fd23cfd7968a04 Mon Sep 17 00:00:00 2001
From: Dani VM
Date: Tue, 3 Sep 2024 13:51:37 +0200
Subject: [PATCH 216/321] created template for mag_all

---
 .../lablog | 10 ++++-----
 bu_isciii/templates/mag/ANALYSIS/lablog_mag_all | 2 +-
 bu_isciii/templates/mag/DOC/mag.config | 22 ++++++++++++++++++-
 .../mag/RESULTS/lablog_mag_all_results | 6 +++++
 ..._results => lablog_mag_taxonomics_results} | 0
 5 files changed, 33 insertions(+), 7 deletions(-)
 rename bu_isciii/templates/mag/ANALYSIS/{ANALYSIS01_MAG_ALL => ANALYSIS03_MAG_ALL}/lablog (80%)
 create mode 100644 bu_isciii/templates/mag/RESULTS/lablog_mag_all_results
 rename bu_isciii/templates/mag/RESULTS/{lablog_mag_results => lablog_mag_taxonomics_results} (100%)

diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_MAG_ALL/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog
similarity index 80%
rename from bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_MAG_ALL/lablog
rename to bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog
index 341c46c38..142680656 100644
--- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_MAG_ALL/lablog
+++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog
@@ -11,14 +11,14 @@ done >> samplesheet.csv
 
 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
 
 
-cat <<EOF > mag_complete.sbatch
+cat <<EOF > mag_all.sbatch
 #!/bin/sh
 #SBATCH --ntasks 1
 #SBATCH --cpus-per-task 2
 #SBATCH --mem 8G
-#SBATCH --time 120:00:00
+#SBATCH --time 72:00:00
 #SBATCH --partition long_idx
-#SBATCH --output $(date '+%Y%m%d')_mag_complete.log
+#SBATCH --output $(date '+%Y%m%d')_mag_all.log
 #SBATCH --chdir $scratch_dir
 
 # module load Nextflow/23.10.0 singularity
@@ -30,8 +30,8 @@ nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.5.3/workflow/main.nf \\
     --input samplesheet.csv \\
     --kraken2_db '/data/bi/references/kraken/minikraken_8GB_20200312.tgz' \\
     --skip_spadeshybrid true \\
-    --outdir $(date '+%Y%m%d')_mag_complete \\
+    --outdir $(date '+%Y%m%d')_mag_all \\
     -resume
 EOF
 
-echo "sbatch mag_complete.sbatch" > _01_run_mag_complete.sh
+echo "sbatch mag_all.sbatch" > _01_run_mag_all.sh
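A side note on the heredocs used by these lablogs: with an unquoted `EOF` delimiter, `$(date ...)` expands when the lablog runs, while the escaped `\\` survives into the generated sbatch file as a literal line continuation. A minimal sketch of the difference:

```bash
# Unquoted delimiter: $(date ...) expands now, so a fixed date is stamped in.
cat <<EOF > demo_expanded.txt
run_stamp: $(date '+%Y%m%d')
EOF

# Quoted delimiter: nothing expands; the command text survives verbatim.
cat <<'EOF' > demo_literal.txt
run_stamp: $(date '+%Y%m%d')
EOF

diff demo_expanded.txt demo_literal.txt
```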
diff --git a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all
index a6e0982eb..caa07af92 100644
--- a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all
+++ b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all
@@ -2,4 +2,4 @@ mkdir 00-reads
 cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd -
 cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd -
 
-mv ANALYSIS01_MAG_ALL $(date '+%Y%m%d')_ANALYSIS01_MAG_ALL
+mv ANALYSIS03_MAG_ALL $(date '+%Y%m%d')_ANALYSIS03_MAG_ALL
diff --git a/bu_isciii/templates/mag/DOC/mag.config b/bu_isciii/templates/mag/DOC/mag.config
index 3a70811d8..05678359e 100644
--- a/bu_isciii/templates/mag/DOC/mag.config
+++ b/bu_isciii/templates/mag/DOC/mag.config
@@ -43,7 +43,7 @@ process {
     }
     withName:'BOWTIE2_PHIX_REMOVAL_BUILD'{
-        time = 12.h
+        time = 18.h
     }
 }
 
@@ -52,3 +52,23 @@ params {
     max_cpus = 32
     max_time = '48.h'
 }
+
+/*
+    CUSTOM OUTPUT FOLDER STRUCTURE -- modules.config
+*/
+params { publish_dir_mode = 'copy' }
+process {
+    withName: 'MULTIQC' {
+        publishDir = [
+            path: { "${params.outdir}/99-stats" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename ->
+                if (filename.equals('versions.yml') || filename.endsWith('.csv')) {
+                    null
+                } else {
+                    filename
+                }
+            }
+        ]
+    }
+}
diff --git a/bu_isciii/templates/mag/RESULTS/lablog_mag_all_results b/bu_isciii/templates/mag/RESULTS/lablog_mag_all_results
new file mode 100644
index 000000000..698a21e7e
--- /dev/null
+++ b/bu_isciii/templates/mag/RESULTS/lablog_mag_all_results
@@ -0,0 +1,6 @@
+mkdir $(date '+%Y%m%d')_entrega01
+cd $(date '+%Y%m%d')_entrega01
+
+#Create symbolic links depending on the analysis
+#Individual files
+ln -s ../../ANALYSIS/*_ANALYSIS03_MAG_ALL/99-stats/multiqc_report.html ./krona_results.html
\ No newline at end of file
diff --git a/bu_isciii/templates/mag/RESULTS/lablog_mag_results b/bu_isciii/templates/mag/RESULTS/lablog_mag_taxonomics_results
similarity index 100%
rename from bu_isciii/templates/mag/RESULTS/lablog_mag_results
rename to bu_isciii/templates/mag/RESULTS/lablog_mag_taxonomics_results

From b67f886e85322c670c4ca2b71c25638adf96d800 Mon Sep 17 00:00:00 2001
From: Dani VM
Date: Thu, 5 Sep 2024 11:06:23 +0200
Subject: [PATCH 217/321] increase time limit in mag module and add todo

---
 bu_isciii/templates/mag/DOC/mag.config | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bu_isciii/templates/mag/DOC/mag.config b/bu_isciii/templates/mag/DOC/mag.config
index 05678359e..837c8dfe6 100644
--- a/bu_isciii/templates/mag/DOC/mag.config
+++ b/bu_isciii/templates/mag/DOC/mag.config
@@ -28,12 +28,13 @@ process {
         errorStrategy = { task.exitStatus in [ 1, 255 ] ? 'ignore' : 'retry' }
         time = { 8.h * task.attempt }
     }
+    // TODO: This binning tool takes ~24h to finish... Consider skipping it.
     withName:'CONCOCT_CONCOCT' {
         errorStrategy = { task.exitStatus in [140] ? 'retry' : 'finish' }
         maxRetries = 2
         cpus = { 8 * task.attempt }
         memory = { 64.GB * task.attempt }
-        time = { 12.h * task.attempt }
+        time = { 24.h * task.attempt }
     }
     withName:'CHECKM_LINEAGEWF' {
         errorStrategy = { task.exitStatus in [1] ? 'retry' : 'finish' }
        maxRetries = 3
         cpus = { 8 * task.attempt }
         memory = { 32.GB * task.attempt }
         time = { 4.h * task.attempt }
     }
 }

From 0f1f4ace148c25f09cb63a59eec7139181e6edb8 Mon Sep 17 00:00:00 2001
From: Dani VM
Date: Thu, 5 Sep 2024 11:13:27 +0200
Subject: [PATCH 218/321] add mag_all results lablog and documentation

---
 bu_isciii/assets/reports/results/mag.md | 12 ++++++--
 .../mag/RESULTS/lablog_mag_all_results | 29 +++++++++++++++----
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/bu_isciii/assets/reports/results/mag.md b/bu_isciii/assets/reports/results/mag.md
index fa6ea6cb7..ce9685141 100644
--- a/bu_isciii/assets/reports/results/mag.md
+++ b/bu_isciii/assets/reports/results/mag.md
@@ -2,17 +2,25 @@
 
 Here we describe the results from the MAG pipeline for multispecies metagenomic analysis.
 
+### MAG - TAXONOMIC ANALYSIS
+
 * `krona_results.html` : Final HTML report with the top 5 species most present in all samples.
 
 > [!WARNING]
 > Software's versions used in this analysis can be obtained from the `MultiQC` report.
 
+### MAG - COMPLETE ANALYSIS
+
+* `mag_all/krona/${sample_name}.${tool}.report.html`: A Krona interactive visualization report for each sample, based on the Kraken2 (or other) taxonomic classification method.
+* `mag_all/quast/${sample_name}.${tool}.report.html`: A QUAST report on the assembly quality of each sample, assembled with MEGAHIT, SPAdes or another supported assembler.
+* `mag_all/multiqc_report.html`: A combined report generated by MultiQC summarizing various quality control results for all samples.
+
 ## Taxprofiler
 
 Here we describe the results from the [nf-core/taxprofiler](https://nf-co.re/taxprofiler/1.1.8) pipeline for multispecies taxonomic classification and profiling of shotgun short- and long-read data.
 
-* `multiqc_report.html`: Final HTML report collecting numerical stats from each module executed in this pipeline.
-* `krona/database_*.html`: Interactive HTML files generated by Krona, displaying the results of taxonomic classification for supported tools (Kraken2, Centrifuge, Kaiju, and MALT)
+* `taxprofiler/multiqc_report.html`: Final HTML report collecting numerical stats from each module executed in this pipeline.
+* `taxprofiler/krona/database_*.html`: Interactive HTML files generated by Krona, displaying the results of taxonomic classification for supported tools (Kraken2, Centrifuge, Kaiju, and MALT).
 
 > [!WARNING]
 > Software's versions used in this analysis can be obtained from the `MultiQC` report.
\ No newline at end of file
diff --git a/bu_isciii/templates/mag/RESULTS/lablog_mag_all_results b/bu_isciii/templates/mag/RESULTS/lablog_mag_all_results
index 698a21e7e..f467632eb 100644
--- a/bu_isciii/templates/mag/RESULTS/lablog_mag_all_results
+++ b/bu_isciii/templates/mag/RESULTS/lablog_mag_all_results
@@ -1,6 +1,25 @@
-mkdir $(date '+%Y%m%d')_entrega01
-cd $(date '+%Y%m%d')_entrega01
+DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01"
+mkdir -p $DELIVERY_FOLDER/mag_all
 
-#Create symbolic links depending on the analysis
-#Individual files
-ln -s ../../ANALYSIS/*_ANALYSIS03_MAG_ALL/99-stats/multiqc_report.html ./krona_results.html
+# MAG all service
+cd $DELIVERY_FOLDER/mag_all
+ANALYSIS_FOLDER=../../../ANALYSIS/*_ANALYSIS03_MAG_ALL/*_mag_all
+
+# multiqc report
+ln -s ${ANALYSIS_FOLDER}/99-stats/multiqc_report.html .
+
+# quast reports
+mkdir -p quast
+for quast_report in ${ANALYSIS_FOLDER}/Assembly/*/QC/*/QUAST/report.html; do
+    assembly_tool=$(echo "$quast_report" | awk -F'/' '{print $8}')
+    sample_name=$(echo "$quast_report" | awk -F'/' '{print $10}')
+    ln -s "../$quast_report" "quast/${sample_name}.${assembly_tool}.report.html"
+done
+
+# krona reports
+mkdir -p krona
+for krona_report in ${ANALYSIS_FOLDER}/Taxonomy/*/*/taxonomy.krona.html; do
+    taxonomy_tool=$(echo "$krona_report" | awk -F'/' '{print $8}')
+    sample_name=$(echo "$krona_report" | awk -F'/' '{print $9}')
+    ln -s "../$krona_report" "krona/${sample_name}.${taxonomy_tool}.report.html"
+done
\ No newline at end of file

From 34a852dd732ad01b14f1d925ae3f70088a076b56 Mon Sep 17 00:00:00 2001
From: Dani VM
Date: Thu, 5 Sep 2024 11:13:51 +0200
Subject: [PATCH 219/321] allow step for refinement

---
 bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog
index 142680656..55e064df7 100644
--- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog
+++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog
@@ -30,6 +30,7 @@ nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.5.3/workflow/main.nf \\
     --input samplesheet.csv \\
     --kraken2_db '/data/bi/references/kraken/minikraken_8GB_20200312.tgz' \\
     --skip_spadeshybrid true \\
+    --refine_bins_dastool true \\
     --outdir $(date '+%Y%m%d')_mag_all \\
     -resume
 EOF

From 730ed93aa44863972c3ae316bb0aaec8f1bb2c72 Mon Sep 17 00:00:00 2001
From: Dani VM
Date: Thu, 5 Sep 2024 12:07:30 +0200
Subject: [PATCH 220/321] skip concoct due to excess in time limit

---
 .../mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog | 1 +
 bu_isciii/templates/mag/DOC/mag.config | 16 ++++++++--------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog
index 55e064df7..46875ca53 100644
--- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog
+++ b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog
@@ -30,6 +30,7 @@ nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.5.3/workflow/main.nf \\
     --input samplesheet.csv \\
     --kraken2_db '/data/bi/references/kraken/minikraken_8GB_20200312.tgz' \\
     --skip_spadeshybrid true \\
+    --skip_concoct true \\
     --refine_bins_dastool true \\
     --outdir $(date '+%Y%m%d')_mag_all \\
     -resume
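The `awk -F'/'` calls in the results lablog above hard-code the depth at which the tool and sample names sit in the results tree, so they silently break if `ANALYSIS_FOLDER` gains or loses a path component. A depth-independent sketch, assuming the same `.../Taxonomy/<tool>/<sample>/taxonomy.krona.html` layout:

```bash
for krona_report in ${ANALYSIS_FOLDER}/Taxonomy/*/*/taxonomy.krona.html; do
    # Walk upwards from the file instead of counting fields from the root:
    sample_name=$(basename "$(dirname "$krona_report")")
    taxonomy_tool=$(basename "$(dirname "$(dirname "$krona_report")")")
    echo "${sample_name}.${taxonomy_tool}.report.html"
done
```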
diff --git a/bu_isciii/templates/mag/DOC/mag.config b/bu_isciii/templates/mag/DOC/mag.config
index 837c8dfe6..5f8420a97 100644
--- a/bu_isciii/templates/mag/DOC/mag.config
+++ b/bu_isciii/templates/mag/DOC/mag.config
@@ -28,14 +28,14 @@ process {
         errorStrategy = { task.exitStatus in [ 1, 255 ] ? 'ignore' : 'retry' }
         time = { 8.h * task.attempt }
     }
-    // TODO: This binning tool takes ~24h to finish... Consider skipping it.
-    withName:'CONCOCT_CONCOCT' {
-        errorStrategy = { task.exitStatus in [140] ? 'retry' : 'finish' }
-        maxRetries = 2
-        cpus = { 8 * task.attempt }
-        memory = { 64.GB * task.attempt }
-        time = { 24.h * task.attempt }
-    }
+    // TODO: This binning tool takes ~24h to finish... skip was added in the lablog, however it can be enabled.
+    // withName:'CONCOCT_CONCOCT' {
+    //     errorStrategy = { task.exitStatus in [140] ? 'retry' : 'finish' }
+    //     maxRetries = 2
+    //     cpus = { 8 * task.attempt }
+    //     memory = { 64.GB * task.attempt }
+    //     time = { 24.h * task.attempt }
+    // }
     withName:'CHECKM_LINEAGEWF' {
         errorStrategy = { task.exitStatus in [1] ? 'retry' : 'finish' }

From 3b0a8e8fea74acc04e32f5efbe95dd20dba2c5c0 Mon Sep 17 00:00:00 2001
From: Dani VM
Date: Thu, 5 Sep 2024 12:17:26 +0200
Subject: [PATCH 221/321] update readme in #321

---
 bu_isciii/templates/services.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json
index 44b5f5b68..3aea8fce3 100755
--- a/bu_isciii/templates/services.json
+++ b/bu_isciii/templates/services.json
@@ -210,7 +210,7 @@
         "begin": "base",
         "end": "",
         "url": "https://github.com/nf-core/mag or https://github.com/nf-core/taxprofiler",
-        "description": "1- Bioinformatics best-practise analysis for taxonomic classification and profiling; 2- Bioinformatics best-practise analysis pipeline for assembly, binning and annotation of metagenomes.",
+        "description": "1- Bioinformatics best-practise analysis for taxonomic classification and/or genome binning; 2- Bioinformatics best-practise analysis pipeline for assembly, binning and annotation of metagenomes.",
         "clean": {
             "folders":[],
             "files":[]
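The next patch collects `versions.yml` files for the delivery PDF via a shell `find`. The equivalent can be previewed by hand before running the tool; the service name below is a hypothetical placeholder:

```bash
# SERVICE_NAME is illustrative; mirrors the find call used by load_versions().
SERVICE_NAME="SRVCNM1234_20240906_MAGTEST_user_S"
find /data/bi/services_and_colaborations/*/*/"$SERVICE_NAME" -name '*versions.yml' -print0 \
    | xargs -0 md5sum \
    | sort    # identical checksums reveal the duplicate contents that are later skipped
```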
From 7452159f12fd3194160dca5c7e90cff8a4ae3290 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Fri, 6 Sep 2024 11:53:54 +0200
Subject: [PATCH 222/321] Added several changes in bioinfo_doc.py to allow the
 inclusion of versions.yml in the pdf associated with the service delivery

---
 bu_isciii/bioinfo_doc.py | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py
index 37512a40e..94a6c2c33 100755
--- a/bu_isciii/bioinfo_doc.py
+++ b/bu_isciii/bioinfo_doc.py
@@ -11,6 +11,7 @@
 import markdown
 import pdfkit
 import PyPDF2
+import yaml
 import subprocess
 import json
 import shutil
@@ -77,7 +78,7 @@ def __init__(
             conf_api["server"], conf_api["api_url"], api_user, api_password
         )
         self.resolution_info = self.rest_api.get_request(
-            request_info="service-data", safe=True, resolution=self.resolution_id
+            request_info="service-data", safe=False, resolution=self.resolution_id
         )
         if self.resolution_info == 404:
             print("Received Error 404 from Iskylims API. Aborting")
@@ -92,7 +93,7 @@ def __init__(
         else:
             self.post_delivery_info()
             self.resolution_info = self.rest_api.get_request(
-                request_info="service-data", safe=True, resolution=self.resolution_id
+                request_info="service-data", safe=False, resolution=self.resolution_id
             )
         self.services_requested = self.resolution_info["resolutions"][0][
             "available_services"
@@ -184,6 +185,7 @@ def __init__(
             self.path, self.conf["services_path"], year, self.service_name
         )
         self.samples = self.resolution_info.get("samples", None)
+        self.versions = self.load_versions()
         self.handled_services = None
         try:
             self.config_pdfkit = pdfkit.configuration()
@@ -206,6 +208,30 @@ def __init__(
         else:
             self.email_psswd = email_psswd
 
+        if self.type == "delivery":
+            service_list = {}
+            for service_id_requested in self.service_ids_requested_list:
+                service_list[service_id_requested] = bu_isciii.service_json.ServiceJson().get_find(service_id_requested, "label")
+            self.all_services = service_list
+
+    def load_versions(self):
+        """Load and parse the versions.yml files found for this service."""
+        result = subprocess.run(f"find /data/bi/services_and_colaborations/*/*/{self.service_name} -name '*versions.yml'", stdout=subprocess.PIPE, text=True, shell=True)
+        versions_files = result.stdout.strip().split("\n")
+        if versions_files == [""]:
+            stderr.print(f"[red] No versions.yml files found for the service {self.service_name}!")
+            return "No software versions data available for this service"
+        else:
+            versions_data = {}
+            loaded_contents = []
+            for versions_file in versions_files:
+                with open(versions_file, 'r') as f:
+                    content = yaml.safe_load(f)
+                if content not in loaded_contents:
+                    versions_data[versions_file] = content
+                    loaded_contents.append(content)
+            return versions_data
+
     def create_structure(self):
         if os.path.exists(self.service_folder):
             log.info("Already created the service folder for %s", self.service_folder)
@@ -332,6 +358,8 @@ def create_markdown(self, file_path):
         # service related information
         markdown_data["service"] = self.resolution_info
         markdown_data["user_data"] = self.resolution_info["service_user_id"]
+        markdown_data["software_versions"] = self.versions
+        markdown_data["services_list"] = self.all_services
 
         samples_in_service = {}
         if self.samples is not None:

From 1bea0ec53d75d03122a05d0272608f3e28755cca Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Fri, 6 Sep 2024 11:55:17 +0200
Subject: [PATCH 223/321] Added a software versions section in the jinja
 delivery template

---
 .../templates/jinja_template_delivery.j2 | 42 +++++++++++++++++--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/bu_isciii/templates/jinja_template_delivery.j2 b/bu_isciii/templates/jinja_template_delivery.j2
index b55828d3e..058b07901 100644
--- a/bu_isciii/templates/jinja_template_delivery.j2
+++ b/bu_isciii/templates/jinja_template_delivery.j2
@@ -74,9 +74,8 @@ Here we describe information about the resolution delivery.
 {% endif %}
 
 {% if samples %}
-## Samples sequenced at iSCIII:
-
-Here we describe information about the project associated to the service:
+## Samples sequenced at ISCIII:
+Here we describe information about the project associated with the service:
 {% if service_sequencing_center -%}
 * Sequencing center: {{ service_sequencing_center }}{% endif %}
 {% for run , projects in samples.items() %}
 * Run name: {{ run }}
@@ -88,6 +87,43 @@
 {% endfor %}
 {% endif %}
 
+
+
+ +## Software versions: + +{% if services_list is mapping and software_versions is mapping %} +{%- set service_list = services_list.items() | list %} +{%- set file_version_list = software_versions.items() | list %} + +{%- for index in range(service_list | length) %} + {%- if index < file_version_list | length %} + {%- set service_id, description = service_list[index] %} +* {{ description }} ({{ service_id }}): + {%- set file_path, processes = file_version_list[index] %} + {%- if processes | length > 0 %} + {%- for process, tools in processes.items() %} + - {{ process }}: + {%- for tool, version in tools.items() %} + - {{ tool }}: {{ version }} + {%- endfor %} + {%- endfor %} + {%- else %} + - No software versions data available for this file path. + {%- endif %} + {%- else %} + {%- set service_id, description = service_list[index] %} +* {{ description }} ({{ service_id }}): + - No software versions data available for this service. + {%- endif %} +{%- endfor %} +{% else %} +No software versions data available for this service. +{% endif %} +
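Template additions like the software-versions block above are easiest to sanity-check outside the tool. A minimal smoke test of the new context variables, assuming the `jinja2` Python package is installed and using made-up service data:

```bash
python3 - <<'PY'
from jinja2 import Template

# Stub snippet exercising only the services_list loop from the template above;
# the service id and label are hypothetical examples.
snippet = Template("""
{% for service_id, description in services_list.items() -%}
* {{ description }} ({{ service_id }})
{% endfor -%}
""")
print(snippet.render(services_list={"mag_met": "Metagenomics service"}))
PY
```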