Skip to content

Commit

Permalink
Alternative ways to get genomic positions (#150)
Browse files Browse the repository at this point in the history
* Update README.md

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Make gtfparse optional

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Draft function to get genomic position from biomart

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add gtf testdata

* WIP

* Update tutorial

* Add unit test for gtfs

* use UV in CI

* pre-commit

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove gtfparse from hard dependencies

* update api doc

* Fix link in README

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
grst and pre-commit-ci[bot] authored Jan 21, 2025
1 parent 842ab1f commit a39ae4f
Show file tree
Hide file tree
Showing 12 changed files with 4,416 additions and 23 deletions.
16 changes: 6 additions & 10 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,15 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python }}
cache: "pip"
cache-dependency-path: "**/pyproject.toml"
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.r }}
use-public-rspm: true
- name: Install uv
uses: astral-sh/setup-uv@v5

- name: Install test dependencies
run: |
python -m pip install --upgrade pip wheel
- name: Install dependencies
run: |
pip install ".[dev,test,copykat]"
- name: Install the project
run: uv sync --extra dev --extra test --extra gtf --extra copykat

- name: Install R dependencies
run: |
Expand All @@ -73,9 +69,9 @@ jobs:
PLATFORM: ${{ matrix.os }}
DISPLAY: :42
run: |
coverage run -m pytest -v --color=yes
uv run coverage run -m pytest -v --color=yes
- name: Report coverage
run: |
coverage report
uv run coverage report
- name: Upload coverage
uses: codecov/codecov-action@v3
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
.DS_Store
*~
buck-out/
.pybiomart.sqlite

# Compiled files
.venv/
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[![Documentation][badge-docs]][link-docs]
[![PyPI][badge-pypi]][link-pypi]

[badge-tests]: https://img.shields.io/github/actions/workflow/status/icbi-lab/infercnvpy/test.yaml?branch=main
[badge-tests]: https://github.com/icbi-lab/infercnvpy/actions/workflows/test.yaml/badge.svg
[link-tests]: https://github.com/icbi-lab/infercnvpy/actions/workflows/test.yaml
[badge-docs]: https://img.shields.io/readthedocs/infercnvpy
[badge-pypi]: https://img.shields.io/pypi/v/infercnvpy?logo=PyPI
Expand Down Expand Up @@ -80,5 +80,5 @@ n/a
[scverse-discourse]: https://discourse.scverse.org/
[issue-tracker]: https://github.com/icbi-lab/infercnvpy/issues
[changelog]: https://infercnvpy.readthedocs.io/latest/changelog.html
[link-docs]: https://infercnvpy.readthedocs.io
[link-docs]: https://infercnvpy.readthedocs.io/
[link-api]: https://infercnvpy.readthedocs.io/en/latest/api.html
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Input/Output: `io`
.. autosummary::
:toctree: ./generated

genomic_position_from_biomart
genomic_position_from_gtf
read_scevan

Expand Down
16 changes: 10 additions & 6 deletions docs/notebooks/tutorial_3k.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,10 @@
"cell_type": "raw",
"id": "respected-outreach",
"metadata": {
"raw_mimetype": "text/restructuredtext"
"raw_mimetype": "text/restructuredtext",
"vscode": {
"languageId": "raw"
}
},
"source": [
".. note::\n",
Expand All @@ -98,8 +101,9 @@
" the start and end positions on that chromosome for each gene, \n",
" respectively. \n",
" \n",
" Infercnvpy provides the :func:`infercnvpy.io.genomic_position_from_gtf` function\n",
" to read these information from a GTF file and add them to `adata.var`. \n",
" Infercnvpy provides the :func:`infercnvpy.io.genomic_position_from_biomart` and \n",
" :func:`infercnvpy.io.genomic_position_from_gtf` functions\n",
" to get this information online or from a GTF file and store it in `adata.var`. \n",
" \n",
"The example dataset is already appropriately preprocessed. "
]
Expand Down Expand Up @@ -1448,9 +1452,9 @@
"notebook_metadata_filter": "-kernelspec"
},
"kernelspec": {
"display_name": "Python [conda env:micromamba-infercnvpy]",
"display_name": ".venv",
"language": "python",
"name": "conda-env-micromamba-infercnvpy-py"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -1462,7 +1466,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.11.10"
}
},
"nbformat": 4,
Expand Down
12 changes: 10 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,24 @@ urls.Source = "https://github.com/icbi-lab/infercnvpy"
urls.Home-page = "https://github.com/icbi-lab/infercnvpy"
dependencies = [
'anndata>=0.7.3',
"scanpy>=1.9",
"scanpy>=1.10",
'pandas>=1',
'numpy>=1.20', # includes type annotations
'tqdm>=4.63.0', # fixes tqdm.auto
'pytoml',
'gtfparse>=2.1',
'pycairo>=1.20; sys_platform == "win32"',
'leidenalg',
'pyreadr',
'pytest-benchmark',
# for debug logging (referenced from the issue template)
"session-info",
"pybiomart>=0.2.0",
]

[project.optional-dependencies]
gtf = [
'gtfparse>=2.1'
]
copykat = [
'rpy2'
]
Expand All @@ -60,10 +63,12 @@ doc = [
'pycairo',
'jupyter_client',
"pandas",
"setuptools", # required for sphinxcontrib-bibtex
]
test = [
"pytest",
"coverage",
"openpyxl", # required for one of the scanpy datasets used in the tests
]

[tool.hatch.version]
Expand Down Expand Up @@ -155,3 +160,6 @@ skip = [
"docs/references.md",
"docs/notebooks/example.ipynb",
]

[tool.uv.sources]
gtfparse = { git = "https://github.com/lrauschning/gtfparse.git", rev = "dev" }
4 changes: 3 additions & 1 deletion src/infercnvpy/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from ._genepos import genomic_position_from_gtf
from ._genepos import genomic_position_from_biomart, genomic_position_from_gtf
from ._scevan import read_scevan

__all__ = ["genomic_position_from_gtf", "genomic_position_from_biomart", "read_scevan"]
100 changes: 98 additions & 2 deletions src/infercnvpy/io/_genepos.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,96 @@
from pathlib import Path
from typing import Literal

import gtfparse
import numpy as np
import pandas as pd
import scanpy.queries
from anndata import AnnData
from scanpy import logging


def genomic_position_from_biomart(
    adata: AnnData | None = None,
    *,
    adata_gene_id: str | None = None,
    biomart_gene_id: str = "ensembl_gene_id",
    species: str = "hsapiens",
    inplace: bool = True,
    **kwargs,
) -> pd.DataFrame | None:
    """
    Get genomic gene positions from ENSEMBL Biomart.

    Parameters
    ----------
    adata
        Adds the genomic positions to `adata.var`. If adata is None, returns
        a data frame with the genomic positions instead.
    adata_gene_id
        Column in `adata.var` that contains (ENSEMBL) gene IDs. If not specified,
        use `adata.var_names`.
    biomart_gene_id
        The biomart column to use as gene identifier. Typically this would be `ensembl_gene_id` or `hgnc_symbol`,
        but could be different for other species.
    species
        Species (biomart organism) to query, passed as first argument to
        :func:`scanpy.queries.biomart_annotations`.
    inplace
        If True, add the annotations directly to adata, otherwise return a dataframe.
        Ignored if `adata` is None.
    **kwargs
        passed on to :func:`scanpy.queries.biomart_annotations`

    Returns
    -------
    If `adata` is None, the biomart annotation table with the columns `biomart_gene_id`,
    `start`, `end` and `chromosome`. Otherwise, if `inplace` is False, a copy of `adata.var`
    with these columns merged in. Returns None if the annotations were stored in `adata.var`.
    """
    biomart_annot = (
        scanpy.queries.biomart_annotations(
            species,
            [
                biomart_gene_id,
                "start_position",
                "end_position",
                "chromosome_name",
            ],
            **kwargs,
        )
        .rename(
            columns={
                "start_position": "start",
                "end_position": "end",
                "chromosome_name": "chromosome",
            }
        )
        # use chr prefix for chromosome
        .assign(chromosome=lambda x: "chr" + x["chromosome"])
    )

    if adata is not None:
        # Compare against the full biomart table, i.e. before removing duplicated identifiers.
        gene_ids_adata = (adata.var_names if adata_gene_id is None else adata.var[adata_gene_id]).values
        missing_from_biomart = len(set(gene_ids_adata) - set(biomart_annot[biomart_gene_id].values))
        if missing_from_biomart:
            logging.warning(
                f"Biomart misses annotation for {missing_from_biomart} genes in adata. Did you use ENSEMBL ids?"
            )

    # Identifiers that occur more than once are ambiguous: drop *all* their rows (keep=False)
    # so that the merge below stays one-to-one.
    duplicated_symbols = np.sum(biomart_annot[biomart_gene_id].duplicated())
    if duplicated_symbols:
        logging.warning(f"Skipped {duplicated_symbols} genes because of duplicate identifiers in biomart.")
        biomart_annot = biomart_annot.loc[~biomart_annot[biomart_gene_id].duplicated(keep=False), :]

    if adata is None:
        # Nothing to annotate: hand back the position table itself, as documented.
        return biomart_annot

    # Move the index into a regular column so it can serve as merge key and be restored afterwards.
    tmp_var = adata.var.copy()
    orig_index_name = tmp_var.index.name
    TMP_INDEX_NAME = "adata_var_index"
    tmp_var.index.name = TMP_INDEX_NAME
    tmp_var.reset_index(inplace=True)
    # Left merge keeps every row of adata.var; genes without a biomart match get missing values.
    var_annotated = tmp_var.merge(
        biomart_annot,
        how="left",
        left_on=TMP_INDEX_NAME if adata_gene_id is None else adata_gene_id,
        right_on=biomart_gene_id,
        validate="one_to_one",
    )
    var_annotated.set_index(TMP_INDEX_NAME, inplace=True)
    var_annotated.index.name = orig_index_name

    if inplace:
        adata.var = var_annotated
        return None
    return var_annotated


def genomic_position_from_gtf(
gtf_file: Path | str,
adata: AnnData | None = None,
Expand All @@ -16,7 +99,8 @@ def genomic_position_from_gtf(
adata_gene_id: str | None = None,
inplace: bool = True,
) -> pd.DataFrame | None:
"""Get genomic gene positions from a GTF file.
"""
Get genomic gene positions from a GTF file.
The GTF file needs to match the genome annotation used for your single cell dataset.
Expand All @@ -38,6 +122,12 @@ def genomic_position_from_gtf(
inplace
If True, add the annotations directly to adata, otherwise return a dataframe.
"""
try:
import gtfparse
except ImportError:
raise ImportError(
"genomic_position_from_gtf requires gtfparse as optional dependency. Please install it using `pip install gtfparse`."
) from None
gtf = gtfparse.read_gtf(
gtf_file, usecols=["seqname", "feature", "start", "end", "gene_id", "gene_name"]
).to_pandas()
Expand All @@ -49,6 +139,8 @@ def genomic_position_from_gtf(
.drop_duplicates()
.rename(columns={"seqname": "chromosome"})
)
# remove ensembl versions
gtf["gene_id"] = gtf["gene_id"].str.replace(r"\.\d+$", "", regex=True)

gene_ids_adata = (adata.var_names if adata_gene_id is None else adata.var[adata_gene_id]).values
gtf = gtf.loc[gtf[gtf_gene_id].isin(gene_ids_adata), :]
Expand Down Expand Up @@ -77,6 +169,10 @@ def genomic_position_from_gtf(
var_annotated.set_index(TMP_INDEX_NAME, inplace=True)
var_annotated.index.name = orig_index_name

# if not a gencode GTF, let's add 'chr' prefix:
if np.all(~var_annotated["chromosome"].dropna().str.startswith("chr")):
var_annotated["chromosome"] = "chr" + var_annotated["chromosome"]

if inplace:
adata.var = var_annotated
else:
Expand Down
7 changes: 7 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import numpy as np
import pandas as pd
import pytest
Expand All @@ -7,6 +9,11 @@
import infercnvpy as cnv


@pytest.fixture()
def testdata():
    """Path of the ``data`` directory that sits next to this conftest file."""
    conftest_dir = Path(__file__).parent
    return conftest_dir / "data"


@pytest.fixture(params=[np.array, sp.csr_matrix, sp.csc_matrix])
def adata_oligodendroma(request):
"""Adata with raw counts in .X parametrized to be either sparse or dense."""
Expand Down
Loading

0 comments on commit a39ae4f

Please sign in to comment.