Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Request up to 5 principal components in PCA process #1404

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ jobs:
- name: Migrations
run: |
tox -e migrations

- name: Run entire test suite
if: github.event_name != 'pull_request'
timeout-minutes: 120
Expand All @@ -109,6 +109,20 @@ jobs:
run: |
tox -e ${{ matrix.toxenv }}-partial

- name: List files in test data directory
if: always()
run: |
ls -a -R ${{ github.workspace }}/tests/.test_data

- name: Upload artifacts
if: always()
uses: actions/upload-artifact@v4
with:
path: ${{ github.workspace }}/tests/.test_data/*
retention-days: 1
name: test_files
include-hidden-files: true

build:
runs-on: arc-runner
needs: test
Expand Down
2 changes: 2 additions & 0 deletions docs/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ Unreleased

Added
-----
- Save test folder data using actions/upload-artifact

Changed
-------
- Calculate up to 5 principal components in ``pca`` process

Fixed
-----
Expand Down
10 changes: 6 additions & 4 deletions resolwe_bio/processes/clustering/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def component_top_factors(component, allgenes_array, max_size=20):
return list(zip(np.array(allgenes_array)[ixs].tolist(), component[ixs].tolist()))


def get_pca(expressions=pd.DataFrame(), gene_labels=[]):
def get_pca(expressions=pd.DataFrame(), gene_labels=[], n_components=5):
"""Compute PCA."""
if not gene_labels:
gene_labels = expressions.index
Expand All @@ -41,11 +41,13 @@ def get_pca(expressions=pd.DataFrame(), gene_labels=[]):
all_components = [[], []]
all_explained_variance_ratios = [0.0, 0.0]
else:
pca = PCA(n_components=2, whiten=True)
pca_components = min(expressions.shape[0], expressions.shape[1], n_components)
pca = PCA(n_components=pca_components, whiten=True)
pca_expressions = pca.fit_transform(expressions.transpose())

coordinates = [
t[:2].tolist() if len(t) > 1 else [t[0], 0.0] for t in pca_expressions
t[:pca_components].tolist() if len(t) > 1 else [t[0], 0.0]
for t in pca_expressions
]
all_components = [
component_top_factors(component=component, allgenes_array=gene_labels)
Expand Down Expand Up @@ -197,7 +199,7 @@ class PrinicipalComponentAnalysis(Process):
},
}
data_name = "PCA"
version = "3.0.0"
version = "3.1.0"
process_type = "data:pca"
category = "Enrichment and Clustering"
scheduling_class = SchedulingClass.INTERACTIVE
Expand Down
Binary file added resolwe_bio/tests/files/exp_6_rc.tab.gz
Binary file not shown.
Binary file added resolwe_bio/tests/files/exp_6_tpm.tab.gz
Binary file not shown.
Binary file added resolwe_bio/tests/files/exp_7_rc.tab.gz
Binary file not shown.
Binary file added resolwe_bio/tests/files/exp_7_tpm.tab.gz
Binary file not shown.
Binary file added resolwe_bio/tests/files/exp_8_rc.tab.gz
Binary file not shown.
Binary file added resolwe_bio/tests/files/exp_8_tpm.tab.gz
Binary file not shown.
Binary file modified resolwe_bio/tests/files/pca_plot.json.gz
Binary file not shown.
Binary file modified resolwe_bio/tests/files/pca_plot_2.json.gz
Binary file not shown.
Binary file modified resolwe_bio/tests/files/pca_plot_3.json.gz
Binary file not shown.
Binary file modified resolwe_bio/tests/files/pca_plot_4.json.gz
Binary file not shown.
59 changes: 52 additions & 7 deletions resolwe_bio/tests/processes/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,62 @@ def test_pca(self):
source="DICTYBASE",
species="Dictyostelium discoideum",
)
expression_5 = self.prepare_expression(
f_rc="exp_6_rc.tab.gz",
f_exp="exp_6_tpm.tab.gz",
f_type="TPM",
source="DICTYBASE",
species="Dictyostelium discoideum",
)
expression_6 = self.prepare_expression(
f_rc="exp_7_rc.tab.gz",
f_exp="exp_7_tpm.tab.gz",
f_type="TPM",
source="DICTYBASE",
species="Dictyostelium discoideum",
)
expression_7 = self.prepare_expression(
f_rc="exp_8_rc.tab.gz",
f_exp="exp_8_tpm.tab.gz",
f_type="TPM",
source="DICTYBASE",
species="Dictyostelium discoideum",
)

inputs = {
"exps": [expression_1.pk, expression_2.pk],
"exps": [
expression_1.pk,
expression_2.pk,
expression_5.pk,
expression_6.pk,
expression_7.pk,
],
"source": "DICTYBASE",
"species": "Dictyostelium discoideum",
}
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot.json.gz", pca.output["pca"])
# The process returns 4 PCA components. The last component differs between
# systems, so it is not tested here.
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"]),
round_elements(saved_json["flot"]["data"]),
round_elements(test_json["flot"]["data"][0][:3]),
round_elements(saved_json["flot"]["data"][0][:3]),
)
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"][1][:3]),
round_elements(saved_json["flot"]["data"][1][:3]),
)
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"][2][:3]),
round_elements(saved_json["flot"]["data"][2][:3]),
)
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"][3][:3]),
round_elements(saved_json["flot"]["data"][3][:3]),
)
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"][4][:3]),
round_elements(saved_json["flot"]["data"][4][:3]),
)
self.assertAlmostEqualGeneric(
round_elements(test_json["explained_variance_ratios"]),
Expand All @@ -84,8 +129,8 @@ def test_pca(self):
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot_2.json.gz", pca.output["pca"])
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"]),
round_elements(saved_json["flot"]["data"]),
round_elements(test_json["flot"]["data"][0][:3]),
round_elements(saved_json["flot"]["data"][0][:3]),
)

self.assertEqual(len(pca.process_warning), 0)
Expand All @@ -95,8 +140,8 @@ def test_pca(self):
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot_3.json.gz", pca.output["pca"])
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"]),
round_elements(saved_json["flot"]["data"]),
round_elements(test_json["flot"]["data"][0][:3]),
round_elements(saved_json["flot"]["data"][0][:3]),
)

self.assertEqual(len(pca.process_warning), 0)
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ commands =
partial: --only-changes-to \
partial: {env:RESOLWE_TEST_ONLY_CHANGES_TO:master} \
partial: --changes-file-types .resolwebio-filetypes.yml \
--verbosity 2 --parallel
--verbosity 2 --parallel --keep-data

# Check types.
[testenv:mypy]
Expand Down