Skip to content

Commit

Permalink
Request up to 5 principal components in pca process
Browse files Browse the repository at this point in the history
  • Loading branch information
jkokosar committed Dec 19, 2024
1 parent fb15da5 commit 99f5ebb
Show file tree
Hide file tree
Showing 13 changed files with 59 additions and 11 deletions.
1 change: 1 addition & 0 deletions docs/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Added

Changed
-------
- Calculate up to 5 principal components in ``pca`` process

Fixed
-----
Expand Down
10 changes: 6 additions & 4 deletions resolwe_bio/processes/clustering/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def component_top_factors(component, allgenes_array, max_size=20):
return list(zip(np.array(allgenes_array)[ixs].tolist(), component[ixs].tolist()))


def get_pca(expressions=pd.DataFrame(), gene_labels=[]):
def get_pca(expressions=pd.DataFrame(), gene_labels=[], n_components=5):
"""Compute PCA."""
if not gene_labels:
gene_labels = expressions.index
Expand All @@ -41,11 +41,13 @@ def get_pca(expressions=pd.DataFrame(), gene_labels=[]):
all_components = [[], []]
all_explained_variance_ratios = [0.0, 0.0]
else:
pca = PCA(n_components=2, whiten=True)
pca_components = min(expressions.shape[0], expressions.shape[1], n_components)
pca = PCA(n_components=pca_components, whiten=True)
pca_expressions = pca.fit_transform(expressions.transpose())

coordinates = [
t[:2].tolist() if len(t) > 1 else [t[0], 0.0] for t in pca_expressions
t[:pca_components].tolist() if len(t) > 1 else [t[0], 0.0]
for t in pca_expressions
]
all_components = [
component_top_factors(component=component, allgenes_array=gene_labels)
Expand Down Expand Up @@ -197,7 +199,7 @@ class PrinicipalComponentAnalysis(Process):
},
}
data_name = "PCA"
version = "3.0.0"
version = "3.1.0"
process_type = "data:pca"
category = "Enrichment and Clustering"
scheduling_class = SchedulingClass.INTERACTIVE
Expand Down
Binary file added resolwe_bio/tests/files/exp_6_rc.tab.gz
Binary file not shown.
Binary file added resolwe_bio/tests/files/exp_6_tpm.tab.gz
Binary file not shown.
Binary file added resolwe_bio/tests/files/exp_7_rc.tab.gz
Binary file not shown.
Binary file added resolwe_bio/tests/files/exp_7_tpm.tab.gz
Binary file not shown.
Binary file added resolwe_bio/tests/files/exp_8_rc.tab.gz
Binary file not shown.
Binary file added resolwe_bio/tests/files/exp_8_tpm.tab.gz
Binary file not shown.
Binary file modified resolwe_bio/tests/files/pca_plot.json.gz
Binary file not shown.
Binary file modified resolwe_bio/tests/files/pca_plot_2.json.gz
Binary file not shown.
Binary file modified resolwe_bio/tests/files/pca_plot_3.json.gz
Binary file not shown.
Binary file modified resolwe_bio/tests/files/pca_plot_4.json.gz
Binary file not shown.
59 changes: 52 additions & 7 deletions resolwe_bio/tests/processes/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,62 @@ def test_pca(self):
source="DICTYBASE",
species="Dictyostelium discoideum",
)
expression_5 = self.prepare_expression(
f_rc="exp_6_rc.tab.gz",
f_exp="exp_6_tpm.tab.gz",
f_type="TPM",
source="DICTYBASE",
species="Dictyostelium discoideum",
)
expression_6 = self.prepare_expression(
f_rc="exp_7_rc.tab.gz",
f_exp="exp_7_tpm.tab.gz",
f_type="TPM",
source="DICTYBASE",
species="Dictyostelium discoideum",
)
expression_7 = self.prepare_expression(
f_rc="exp_8_rc.tab.gz",
f_exp="exp_8_tpm.tab.gz",
f_type="TPM",
source="DICTYBASE",
species="Dictyostelium discoideum",
)

inputs = {
"exps": [expression_1.pk, expression_2.pk],
"exps": [
expression_1.pk,
expression_2.pk,
expression_5.pk,
expression_6.pk,
expression_7.pk,
],
"source": "DICTYBASE",
"species": "Dictyostelium discoideum",
}
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot.json.gz", pca.output["pca"])
# Returns 4 PCA components. The last component differs between systems,
# so only the first three are compared here.
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"]),
round_elements(saved_json["flot"]["data"]),
round_elements(test_json["flot"]["data"][0][:3]),
round_elements(saved_json["flot"]["data"][0][:3]),
)
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"][1][:3]),
round_elements(saved_json["flot"]["data"][1][:3]),
)
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"][2][:3]),
round_elements(saved_json["flot"]["data"][2][:3]),
)
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"][3][:3]),
round_elements(saved_json["flot"]["data"][3][:3]),
)
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"][4][:3]),
round_elements(saved_json["flot"]["data"][4][:3]),
)
self.assertAlmostEqualGeneric(
round_elements(test_json["explained_variance_ratios"]),
Expand All @@ -84,8 +129,8 @@ def test_pca(self):
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot_2.json.gz", pca.output["pca"])
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"]),
round_elements(saved_json["flot"]["data"]),
round_elements(test_json["flot"]["data"][0][:3]),
round_elements(saved_json["flot"]["data"][0][:3]),
)

self.assertEqual(len(pca.process_warning), 0)
Expand All @@ -95,8 +140,8 @@ def test_pca(self):
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot_3.json.gz", pca.output["pca"])
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"]),
round_elements(saved_json["flot"]["data"]),
round_elements(test_json["flot"]["data"][0][:3]),
round_elements(saved_json["flot"]["data"][0][:3]),
)

self.assertEqual(len(pca.process_warning), 0)
Expand Down

0 comments on commit 99f5ebb

Please sign in to comment.