From 9b6e545c5439247a6021a403967d8907a7ff5d62 Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Mon, 30 Dec 2024 17:40:39 +0100 Subject: [PATCH 1/3] test: unit tests for output accumulator postproc --- tests/unit_tests/conftest.py | 21 +++++++ tests/unit_tests/test_outputaccumulator.py | 66 +++++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py index e6c8ecf3..e5f25adc 100644 --- a/tests/unit_tests/conftest.py +++ b/tests/unit_tests/conftest.py @@ -63,6 +63,7 @@ def mock_precursor_df( df = pd.DataFrame( { + "decoy": decoy, "mz_library": precursor_mz, "rt_library": random_rt, @@ -167,6 +168,26 @@ def mock_fragment_df(n_fragments: int = 10, n_precursor: int = 20): } ) +def mock_fragment_correlation_df(fragments_df: pd.DataFrame): + """Create a mock fragment correlation dataframe as found in the individual search outputs + + Parameters + ---------- + + fragments_df : pd.DataFrame + A mock fragment dataframe + + Returns + ------- + + fragment_correlation_df : pd.DataFrame + A mock fragment correlation dataframe + """ + # create random correlation values between 0 and 1 + fragments_shape = fragments_df.shape + fragment_correlation = np.random.rand(*fragments_shape) + fragment_correlation_df = pd.DataFrame(fragment_correlation, columns=fragments_df.columns) + return fragment_correlation_df def pytest_configure(config): test_data_path = os.environ.get("TEST_DATA_DIR", None) diff --git a/tests/unit_tests/test_outputaccumulator.py b/tests/unit_tests/test_outputaccumulator.py index 896aa06d..321443ac 100644 --- a/tests/unit_tests/test_outputaccumulator.py +++ b/tests/unit_tests/test_outputaccumulator.py @@ -5,11 +5,12 @@ import numpy as np import pandas as pd from alphabase.spectral_library.base import SpecLibBase -from conftest import mock_fragment_df, mock_precursor_df +from alphabase.spectral_library.flat import SpecLibFlat +from conftest import mock_fragment_df, 
mock_precursor_df,mock_fragment_correlation_df from alphadia import outputtransform from alphadia.workflow.base import QUANT_FOLDER_NAME - +from alphadia.outputaccumulator import ms2_quality_control def prepare_input_data(): """ @@ -247,3 +248,64 @@ def test_default_column_assignment(): assert built_lib.precursor_df[f"{col}"].equals( built_lib.precursor_df[f"{col}_library"] ), f"{col} != {col}_library" + +def test_non_nan_fragments(): + """ + Test that the accumulated fragments data frame has no nan values + """ + # Given: + config, temp_folder, raw_folders, psm_dfs, fragment_dfs = prepare_input_data() + keep_top = 2 + config["transfer_library"]["top_k_samples"] = keep_top + + # When: + output = outputtransform.SearchPlanOutput(config, temp_folder) + _ = output.build_transfer_library(raw_folders, save=True) + built_lib = SpecLibBase() + built_lib.load_hdf( + os.path.join(temp_folder, f"{output.TRANSFER_OUTPUT}.hdf"), load_mod_seq=True + ) + + # Then: The fragment dataframe should have no nan values + assert not built_lib.fragment_intensity_df.isnull().values.any(), "There are nan values in the fragment dataframe" + + shutil.rmtree(temp_folder) + +def test_use_for_ms2(): + """ + Test that the ms2 quality control is correctly applied by checking the use_for_ms2 column in the precursor_df + """ + # Given: + psm_flat_df = mock_precursor_df(n_precursor=100, with_decoy=True) + fragment_flat_df = mock_fragment_df(n_precursor=100, n_fragments=10) + psm_flat_df = psm_flat_df.sort_values(by="precursor_idx") + fragment_flat_df = fragment_flat_df.sort_values(by="precursor_idx") + psm_flat_df["flat_frag_start_idx"] = np.arange(0, len(psm_flat_df) * 10, 10) + psm_flat_df["flat_frag_stop_idx"] = np.arange(0, len(psm_flat_df) * 10, 10) + 9 + psm_flat_df['nAA'] =psm_flat_df.sequence.str.len().astype(np.int32) + fragment_flat_df["loss_type"] = 0 + flat_spec_lib = SpecLibFlat() + flat_spec_lib._precursor_df = psm_flat_df + flat_spec_lib._fragment_df = fragment_flat_df + spec_lib = 
flat_spec_lib.to_SpecLibBase() + fragment_correlation_base_df = mock_fragment_correlation_df(spec_lib.fragment_intensity_df) + spec_lib._fragment_correlation_df = fragment_correlation_base_df + precursor_correlation_cutoff = 0.5 + fragment_correlation_ratio = 0.75 + + base_precursor_df = spec_lib.precursor_df.copy() + base_fragment_df = spec_lib.fragment_intensity_df.copy() + # When: + ms2_quality_control(spec_lib, precursor_correlation_cutoff, fragment_correlation_ratio) + + # Then: The use_for_ms2 column should be correctly assigned for precursors with median fragment correlation above precursor_correlation_cutoff + target_use_for_ms2 = [] + for frag_start,frag_stop in zip(base_precursor_df["frag_start_idx"],base_precursor_df["frag_stop_idx"]): + frag_corr = fragment_correlation_base_df.iloc[frag_start:frag_stop].values + frag_intensities = base_fragment_df.iloc[frag_start:frag_stop].values + # median corr of non zero intensities + frag_corr = frag_corr[frag_intensities>0] + median_frag_corr = np.median(frag_corr) if len(frag_corr) > 0 else 0 + target_use_for_ms2.append(median_frag_corr > precursor_correlation_cutoff) + + np.testing.assert_array_equal(spec_lib.precursor_df["use_for_ms2"].values, target_use_for_ms2) From 83dfdc0f602270620299dfca6d25716c8a650e57 Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Tue, 14 Jan 2025 11:29:13 +0100 Subject: [PATCH 2/3] Update test_outputaccumulator.py --- tests/unit_tests/test_outputaccumulator.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/unit_tests/test_outputaccumulator.py b/tests/unit_tests/test_outputaccumulator.py index 321443ac..8c1592c4 100644 --- a/tests/unit_tests/test_outputaccumulator.py +++ b/tests/unit_tests/test_outputaccumulator.py @@ -137,7 +137,6 @@ def test_complete_output_accumulation(): == number_of_unique_precursors ), f"{len(np.unique(built_lib.precursor_df['precursor_idx']))} != {number_of_unique_precursors}" - shutil.rmtree(temp_folder) def 
test_selection_of_precursors(): @@ -179,7 +178,6 @@ def test_selection_of_precursors(): f"{selected_probas} != {target_kept_probas}", ) - shutil.rmtree(temp_folder) def test_keep_top_constraint(): @@ -211,7 +209,6 @@ def test_keep_top_constraint(): <= keep_top ), f"{len(built_lib.precursor_df[built_lib.precursor_df['precursor_idx'] == precursor_idx])} != {keep_top}" - shutil.rmtree(temp_folder) def test_default_column_assignment(): @@ -269,7 +266,6 @@ def test_non_nan_fragments(): # Then: The fragment dataframe should have no nan values assert not built_lib.fragment_intensity_df.isnull().values.any(), "There are nan values in the fragment dataframe" - shutil.rmtree(temp_folder) def test_use_for_ms2(): """ @@ -287,6 +283,7 @@ def test_use_for_ms2(): flat_spec_lib = SpecLibFlat() flat_spec_lib._precursor_df = psm_flat_df flat_spec_lib._fragment_df = fragment_flat_df + # TODO: to_SpecLibBase will be deprecated and this should be adapted to use to_speclib_base spec_lib = flat_spec_lib.to_SpecLibBase() fragment_correlation_base_df = mock_fragment_correlation_df(spec_lib.fragment_intensity_df) spec_lib._fragment_correlation_df = fragment_correlation_base_df From 69d8273af001351a42522b54c3602b77589af949 Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Tue, 14 Jan 2025 12:01:01 +0100 Subject: [PATCH 3/3] Update test_outputaccumulator.py --- tests/unit_tests/test_outputaccumulator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit_tests/test_outputaccumulator.py b/tests/unit_tests/test_outputaccumulator.py index 8c1592c4..e2dabb8b 100644 --- a/tests/unit_tests/test_outputaccumulator.py +++ b/tests/unit_tests/test_outputaccumulator.py @@ -1,16 +1,16 @@ import os -import shutil import tempfile import numpy as np import pandas as pd from alphabase.spectral_library.base import SpecLibBase from alphabase.spectral_library.flat import SpecLibFlat -from conftest import mock_fragment_df, mock_precursor_df,mock_fragment_correlation_df +from 
conftest import mock_fragment_correlation_df, mock_fragment_df, mock_precursor_df from alphadia import outputtransform -from alphadia.workflow.base import QUANT_FOLDER_NAME from alphadia.outputaccumulator import ms2_quality_control +from alphadia.workflow.base import QUANT_FOLDER_NAME + def prepare_input_data(): """