diff --git a/README.md b/README.md
index 16bc00859..faf21f79e 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,17 @@
-[](https://github.com/astral-sh/ruff)
-[](https://pypi.python.org/pypi/gentropy/)
-[](https://badge.fury.io/py/gentropy)
-[](https://opentargets.github.io/gentropy/)
-[](https://codecov.io/gh/opentargets/gentropy)
-[](https://opensource.org/licenses/Apache-2.0)
-[](https://doi.org/10.5281/zenodo.10527086)
-
+
+
+
+
+
+
+
+
+
+
Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery.
## Installation
diff --git a/docs/__init__.py b/docs/__init__.py
new file mode 100644
index 000000000..a7f0c2caf
--- /dev/null
+++ b/docs/__init__.py
@@ -0,0 +1 @@
+"""Docs package."""
diff --git a/docs/howto/_howto.md b/docs/howto/_howto.md
index 1e8e26b83..6772c0fbb 100644
--- a/docs/howto/_howto.md
+++ b/docs/howto/_howto.md
@@ -2,4 +2,7 @@
This page contains a collection of how-to guides for the project.
+- [**Command line interface**](command_line/_command_line.md): Learn how to use the Gentropy CLI.
+- [**Python API**](python_api/_python_api.md): Learn how to use the Gentropy Python package.
+
For additional information please visit [https://community.opentargets.org/](https://community.opentargets.org/)
diff --git a/docs/howto/command_line/_command_line.md b/docs/howto/command_line/_command_line.md
new file mode 100644
index 000000000..117a49416
--- /dev/null
+++ b/docs/howto/command_line/_command_line.md
@@ -0,0 +1,7 @@
+---
+title: Command line interface
+---
+
+# Command line interface
+
+Gentropy steps can be run using the command line interface (CLI). This section contains a collection of how-to guides for the CLI.
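+
+For example, a step is invoked by naming it and passing its parameters on the command line (this mirrors the `gene_index` example covered in the following guide; the paths are placeholders):
+
+```bash
+gentropy step=gene_index step.target_path=/path/to/target step.gene_index_path=/path/to/gene_index
+```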
diff --git a/docs/howto/run_step_in_cli.md b/docs/howto/command_line/run_step_in_cli.md
similarity index 97%
rename from docs/howto/run_step_in_cli.md
rename to docs/howto/command_line/run_step_in_cli.md
index 965c7e079..ac7d55ff9 100644
--- a/docs/howto/run_step_in_cli.md
+++ b/docs/howto/command_line/run_step_in_cli.md
@@ -41,4 +41,4 @@ On most occasions, some mandatory values will be required to run the step. For
gentropy step=gene_index step.target_path=/path/to/target step.gene_index_path=/path/to/gene_index
```
-You can find more about the available steps in the [documentation](../python_api/steps/_steps.md).
+You can find more about the available steps in the [documentation](../../python_api/steps/_steps.md).
diff --git a/docs/howto/run_step_using_config.md b/docs/howto/command_line/run_step_using_config.md
similarity index 100%
rename from docs/howto/run_step_using_config.md
rename to docs/howto/command_line/run_step_using_config.md
diff --git a/docs/howto/python_api/_python_api.md b/docs/howto/python_api/_python_api.md
new file mode 100644
index 000000000..86bf96519
--- /dev/null
+++ b/docs/howto/python_api/_python_api.md
@@ -0,0 +1,5 @@
+---
+title: Python API
+---
+
+This section explains how to use Gentropy in a Python environment, providing a foundational understanding of how to perform genetic analyses with the package. It is aimed at users who want to use Gentropy in their own projects.
diff --git a/docs/howto/python_api/a_creating_spark_session.md b/docs/howto/python_api/a_creating_spark_session.md
new file mode 100644
index 000000000..a28615fe5
--- /dev/null
+++ b/docs/howto/python_api/a_creating_spark_session.md
@@ -0,0 +1,33 @@
+---
+title: Creating a Spark Session
+---
+
+In this section, we'll guide you through creating a Spark session using Gentropy's `Session` class. Gentropy uses _Apache PySpark_ as the underlying framework for distributed computing. The `Session` class provides a convenient way to initialize a Spark session with pre-configured settings.
+
+## Creating a Default Session
+
+To begin your journey with Gentropy, start by creating a default Spark session. This is the simplest way to initialize your environment.
+
+```python
+--8<-- "src_snippets/howto/python_api/a_creating_spark_session.py:default_session"
+```
+
+The above code snippet sets up a default Spark session with pre-configured settings. This is ideal for getting started quickly without needing to tweak any configurations.
+
+## Customizing Your Spark Session
+
+Gentropy allows you to customize the Spark session to suit your specific needs. You can modify various parameters such as memory allocation, number of executors, and more. This flexibility is particularly useful for optimizing performance in steps that are more computationally intensive.
+
+### Example: Increasing Driver Memory
+
+If you require more memory for the Spark driver, you can easily adjust this setting:
+
+```python
+--8<-- "src_snippets/howto/python_api/a_creating_spark_session.py:custom_session"
+```
+
+This code snippet demonstrates how to increase the memory allocated to the Spark driver to 4 gigabytes. You can customize other Spark settings similarly, according to your project's requirements.
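+
+For instance, here is a quick sketch combining several Spark properties in a single call (these are standard Spark configuration keys, not Gentropy-specific options, and the values are illustrative):
+
+```python
+from gentropy.common.session import Session
+
+# Pass any number of standard Spark properties as a dictionary
+session = Session(
+    extended_spark_conf={
+        "spark.driver.memory": "16g",
+        "spark.executor.memory": "8g",
+        "spark.sql.shuffle.partitions": "200",
+    }
+)
+```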
+
+## What's next?
+
+Now that you've created a Spark session, you're ready to start using Gentropy. In the next section, we'll show you how to load your data into Gentropy's datasets, such as the _SummaryStatistics_ datatype.
diff --git a/docs/howto/python_api/b_create_dataset.md b/docs/howto/python_api/b_create_dataset.md
new file mode 100644
index 000000000..ef9e0e8ce
--- /dev/null
+++ b/docs/howto/python_api/b_create_dataset.md
@@ -0,0 +1,61 @@
+---
+title: Create a dataset
+---
+
+Gentropy provides a collection of `Dataset`s that encapsulate key concepts in the field of genetics. For example, to represent summary statistics, you'll use the [`SummaryStatistics`](../../python_api/datasets/summary_statistics.md) class. This datatype comes with a set of useful operations to disentangle the genetic architecture of a trait or disease.
+
+The full list of `Dataset`s is available in the Python API [documentation](../../python_api/datasets/_datasets.md).
+
+!!! info "Any instance of Dataset will have two common attributes"
+
+ - **df**: the Spark DataFrame that contains the data
+ - **schema**: the definition of the data structure in Spark format
+
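+As a quick illustration (assuming `summary_stats` is an existing `SummaryStatistics` instance):
+
+```python
+# The underlying Spark DataFrame and its schema are always at hand
+summary_stats.df.show(5)
+print(summary_stats.schema)
+```
+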
+In this section you'll learn the different ways to create a `Dataset` instance.
+
+## Creating a dataset from parquet
+
+All the `Dataset`s have a `from_parquet` method that allows you to create any `Dataset` instance from a parquet file or directory.
+
+```python
+--8<-- "src_snippets/howto/python_api/b_create_dataset.py:create_from_parquet_import"
+path = "path/to/summary/stats"
+--8<-- "src_snippets/howto/python_api/b_create_dataset.py:create_from_parquet"
+```
+
+!!! info "Parquet files"
+
+ Parquet is a columnar storage format that is widely used in the Spark ecosystem. It is the recommended format for storing large datasets. For more information about parquet, please visit [https://parquet.apache.org/](https://parquet.apache.org/).
+
+## Creating a dataset from a data source
+
+Alternatively, `Dataset`s can be created using a [data source](../../python_api/datasources/_datasources.md) harmonisation method. For example, to create a `SummaryStatistics` object from FinnGen's raw summary statistics, you can use the [`FinnGen`](../../python_api/datasources/finngen/summary_stats.md) data source.
+
+```python
+--8<-- "src_snippets/howto/python_api/b_create_dataset.py:create_from_source_import"
+path = "path/to/finngen/summary/stats"
+--8<-- "src_snippets/howto/python_api/b_create_dataset.py:create_from_source"
+```
+
+## Creating a dataset from a pandas DataFrame
+
+If none of our data sources fit your needs, you can create a `Dataset` object from your own data. To do so, you need to transform your data to fit the `Dataset` schema.
+
+!!! info "The schema of a Dataset is defined in Spark format"
+
+ The Dataset schemas can be found in the documentation of each Dataset. For example, the schema of the `SummaryStatistics` dataset can be found [here](../../python_api/datasets/summary_statistics.md).
+
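+Because the schema is a plain Spark `StructType`, you can also retrieve it programmatically when validating your own data, using the `get_schema` class method shown in the snippets below:
+
+```python
+from gentropy.dataset.summary_statistics import SummaryStatistics
+
+# Print the Spark schema that your DataFrame must conform to
+print(SummaryStatistics.get_schema())
+```
+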
+You can also create a `Dataset` from a pandas DataFrame. This is useful for small datasets that fit in memory. Note that the example below uses the pandas-on-Spark API (`pyspark.pandas`), which mirrors the pandas API.
+
+```python
+--8<-- "src_snippets/howto/python_api/b_create_dataset.py:create_from_pandas_import"
+
+# Load your transformed data into a pandas-on-Spark DataFrame
+path = "path/to/your/data"
+custom_summary_stats_pandas_df = ps.read_csv(path)
+--8<-- "src_snippets/howto/python_api/b_create_dataset.py:create_from_pandas"
+```
+
+## What's next?
+
+In the next section, we will explore how to apply well-established algorithms that transform and analyse genetic data within the Gentropy framework.
diff --git a/docs/howto/python_api/c_applying_methods.md b/docs/howto/python_api/c_applying_methods.md
new file mode 100644
index 000000000..4dc8d139d
--- /dev/null
+++ b/docs/howto/python_api/c_applying_methods.md
@@ -0,0 +1,36 @@
+---
+title: Applying methods
+---
+
+The available methods implement well-established algorithms that transform and analyse data. Methods usually take predefined `Dataset`(s) as input and produce one or several `Dataset`(s) as output. This section explains how to apply methods to your data.
+
+The full list of available methods can be found in the Python API [documentation](../../python_api/methods/_methods.md).
+
+## Apply a class method
+
+Some methods are implemented as class methods. For example, the `finemap` method is a class method of the [`PICS`](../../python_api/methods/pics.md) class that performs fine-mapping using the PICS algorithm.
+
+```python
+--8<-- "src_snippets/howto/python_api/c_applying_methods.py:apply_class_method_pics"
+```
+
+## Apply a `Dataset` instance method
+
+Some methods are implemented as instance methods of the `Dataset` class. For example, the `window_based_clumping` method is an instance method of the `SummaryStatistics` class. This method performs window-based clumping on summary statistics.
+
+```python
+--8<-- "src_snippets/howto/python_api/c_applying_methods.py:apply_instance_method"
+```
+
+!!! info "The `window_based_clumping` method is also available as a class method"
+
+    The same operation is also available as a class method of the `WindowBasedClumping` class:
+
+ ```python
+ # Perform window-based clumping on summary statistics
+ --8<-- "src_snippets/howto/python_api/c_applying_methods.py:apply_class_method_clumping"
+ ```
+
+## What's next?
+
+Up next, we'll show you how to inspect your data to ensure its integrity and the success of your transformations.
diff --git a/docs/howto/python_api/d_inspect_dataset.md b/docs/howto/python_api/d_inspect_dataset.md
new file mode 100644
index 000000000..c274b8335
--- /dev/null
+++ b/docs/howto/python_api/d_inspect_dataset.md
@@ -0,0 +1,37 @@
+---
+title: Inspect a dataset
+---
+
+We have seen how to create and transform a `Dataset` instance. This section guides you through inspecting your data to ensure its integrity and the success of your transformations.
+
+## Inspect data in a `Dataset`
+
+The `df` attribute of a Dataset instance is key to interacting with and inspecting the stored data.
+
+!!! info "By accessing the df attribute, you can apply any method that you would typically use on a PySpark DataFrame. See the [PySpark documentation](https://spark.apache.org/docs/3.1.1/api/python/reference/pyspark.sql.html#dataframe-apis) for more information."
+
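+For example, ordinary PySpark operations work directly on `df`. A sketch, assuming `summary_stats` is an existing `SummaryStatistics` instance with the standard `chromosome` column:
+
+```python
+# Count rows and filter with regular PySpark DataFrame methods
+print(summary_stats.df.count())
+summary_stats.df.filter(summary_stats.df.chromosome == "18").show(5)
+```
+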
+### View data samples
+
+```python
+--8<-- "src_snippets/howto/python_api/d_inspect_dataset.py:print_dataframe"
+```
+
+This method displays the first 10 rows of your dataset, giving you a snapshot of your data's structure and content.
+
+### Understand the schema
+
+```python
+--8<-- "src_snippets/howto/python_api/d_inspect_dataset.py:get_dataset_schema"
+
+--8<-- "src_snippets/howto/python_api/d_inspect_dataset.py:print_dataframe"
+```
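+
+The first snippet returns the expected schema of the `Dataset` as a Spark `StructType`, while the second prints the schema of the DataFrame stored in `df`, letting you verify that your data conforms to the definition.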
+
+## Write a `Dataset` to disk
+
+```python
+--8<-- "src_snippets/howto/python_api/d_inspect_dataset.py:write_parquet"
+
+--8<-- "src_snippets/howto/python_api/d_inspect_dataset.py:write_csv"
+```
+
+Consider the format's compatibility with your tools and, for large datasets, a partitioning strategy to optimize performance, as sketched below.
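+
+A sketch of such a partitioned write (the `chromosome` partition column assumes the standard `SummaryStatistics` schema):
+
+```python
+# Partition the output by chromosome to speed up per-chromosome reads downstream
+summary_stats.df.write.partitionBy("chromosome").parquet("path/to/summary/stats")
+```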
diff --git a/docs/index.md b/docs/index.md
index c04921d37..2147ceb14 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -7,7 +7,10 @@ hide:
-
+
+

+
+
+
-[](https://github.com/astral-sh/ruff)
-[](https://pypi.python.org/pypi/gentropy/)
-[](https://badge.fury.io/py/gentropy)
-[](https://opentargets.github.io/gentropy/)
-[](https://codecov.io/gh/opentargets/gentropy)
-[](https://opensource.org/licenses/Apache-2.0)
-[](https://doi.org/10.5281/zenodo.10527086)
-
+
+
+
+
+
+
+
+
+
---
Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. This package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery.
+#### Key Features
+
+- **Specialized Datatypes**: Introduces essential genetics datatypes like _StudyLocus_, _LocusToGene_, and _SummaryStatistics_.
+- **Performance-Oriented**: Optimized for large-scale genetic data analysis, including locus-to-gene scoring, fine-mapping, and colocalization analysis.
+- **User-Friendly**: The package is designed to be intuitive, allowing both beginners and experienced researchers to conduct complex genetic analyses with ease.
+
## About Open Targets
Open Targets is a pre-competitive, public-private partnership that uses human genetics and genomics data to systematically identify and prioritise drug targets. Through large-scale genomic experiments and the development of innovative computational techniques, the partnership aims to help researchers select the best targets for the development of new therapies. For more information, visit the Open Targets [website](https://www.opentargets.org).
diff --git a/docs/src_snippets/howto/python_api/a_creating_spark_session.py b/docs/src_snippets/howto/python_api/a_creating_spark_session.py
new file mode 100644
index 000000000..947b22086
--- /dev/null
+++ b/docs/src_snippets/howto/python_api/a_creating_spark_session.py
@@ -0,0 +1,32 @@
+"""Docs to create a default Spark Session."""
+from gentropy.common.session import Session
+
+
+def default_session() -> Session:
+ """Create a default Spark Session.
+
+ Returns:
+ Session: Spark Session.
+ """
+ # --8<-- [start:default_session]
+ from gentropy.common.session import Session
+
+ # Create a default Spark Session
+ session = Session()
+ # --8<-- [end:default_session]
+ return session
+
+
+def custom_session() -> Session:
+ """Create a custom Spark Session.
+
+ Returns:
+ Session: Spark Session.
+ """
+ # --8<-- [start:custom_session]
+ from gentropy.common.session import Session
+
+ # Create a Spark session with increased driver memory
+ session = Session(extended_spark_conf={"spark.driver.memory": "4g"})
+ # --8<-- [end:custom_session]
+ return session
diff --git a/docs/src_snippets/howto/python_api/b_create_dataset.py b/docs/src_snippets/howto/python_api/b_create_dataset.py
new file mode 100644
index 000000000..813cd1a02
--- /dev/null
+++ b/docs/src_snippets/howto/python_api/b_create_dataset.py
@@ -0,0 +1,59 @@
+"""Docs to create a dataset."""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from gentropy.common.session import Session
+
+if TYPE_CHECKING:
+ from gentropy.dataset.summary_statistics import SummaryStatistics
+
+
+def create_from_parquet(session: Session) -> SummaryStatistics:
+ """Create a dataset from a path with parquet files."""
+ # --8<-- [start:create_from_parquet_import]
+ # Create a SummaryStatistics object by loading data from the specified path
+ from gentropy.dataset.summary_statistics import SummaryStatistics
+
+ # --8<-- [end:create_from_parquet_import]
+
+ path = "tests/data_samples/sumstats_sample/GCST005523_chr18.parquet"
+ # --8<-- [start:create_from_parquet]
+ summary_stats = SummaryStatistics.from_parquet(session, path)
+ # --8<-- [end:create_from_parquet]
+ return summary_stats
+
+
+def create_from_source(session: Session) -> SummaryStatistics:
+ """Create a dataset from a path with parquet files."""
+ # --8<-- [start:create_from_source_import]
+ # Create a SummaryStatistics object by loading raw data from Finngen
+ from gentropy.datasource.finngen.summary_stats import FinnGenSummaryStats
+
+ # --8<-- [end:create_from_source_import]
+ path = "tests/data_samples/finngen_R9_AB1_ACTINOMYCOSIS.gz"
+ # --8<-- [start:create_from_source]
+ summary_stats = FinnGenSummaryStats.from_source(session.spark, path)
+ # --8<-- [end:create_from_source]
+ return summary_stats
+
+
+def create_from_pandas() -> SummaryStatistics:
+ """Create a dataset from a path with Pandas files."""
+ # --8<-- [start:create_from_pandas_import]
+ import pyspark.pandas as ps
+ from gentropy.dataset.summary_statistics import SummaryStatistics
+
+ # --8<-- [end:create_from_pandas_import]
+
+ path = "tests/data_samples/sumstats_sample/GCST005523_chr18.parquet"
+ custom_summary_stats_pandas_df = ps.read_parquet(path)
+ # --8<-- [start:create_from_pandas]
+
+ # Create a SummaryStatistics object specifying the data and schema
+ custom_summary_stats_df = custom_summary_stats_pandas_df.to_spark()
+ custom_summary_stats = SummaryStatistics(
+ _df=custom_summary_stats_df, _schema=SummaryStatistics.get_schema()
+ )
+ # --8<-- [end:create_from_pandas]
+ return custom_summary_stats
diff --git a/docs/src_snippets/howto/python_api/c_applying_methods.py b/docs/src_snippets/howto/python_api/c_applying_methods.py
new file mode 100644
index 000000000..12eaf61ac
--- /dev/null
+++ b/docs/src_snippets/howto/python_api/c_applying_methods.py
@@ -0,0 +1,39 @@
+"""Docs to apply a method on a dataset."""
+from __future__ import annotations
+
+from gentropy.dataset.study_locus import StudyLocus
+from gentropy.dataset.summary_statistics import SummaryStatistics
+
+
+def apply_class_method_pics(study_locus_ld_annotated: StudyLocus) -> StudyLocus:
+ """Docs to apply the PICS class method to mock study loci."""
+ # --8<-- [start:apply_class_method_pics]
+ from gentropy.method.pics import PICS
+
+ finemapped_study_locus = PICS.finemap(
+ study_locus_ld_annotated
+ ).annotate_credible_sets()
+ # --8<-- [end:apply_class_method_pics]
+ return finemapped_study_locus
+
+
+def apply_class_method_clumping(summary_stats: SummaryStatistics) -> StudyLocus:
+ """Docs to apply the clumping class method to mock summary statistics."""
+ # --8<-- [start:apply_class_method_clumping]
+ from gentropy.method.window_based_clumping import WindowBasedClumping
+
+ clumped_summary_statistics = WindowBasedClumping.clump(
+ summary_stats, window_length=500_000
+ )
+ # --8<-- [end:apply_class_method_clumping]
+ return clumped_summary_statistics
+
+
+def apply_instance_method(summary_stats: SummaryStatistics) -> StudyLocus:
+ """Docs to apply the clumping instance method to mock summary statistics."""
+ # --8<-- [start:apply_instance_method]
+ # Perform window-based clumping on summary statistics
+ # By default, the method uses a 1Mb window and a p-value threshold of 5e-8
+ clumped_summary_statistics = summary_stats.window_based_clumping()
+ # --8<-- [end:apply_instance_method]
+ return clumped_summary_statistics
diff --git a/docs/src_snippets/howto/python_api/d_inspect_dataset.py b/docs/src_snippets/howto/python_api/d_inspect_dataset.py
new file mode 100644
index 000000000..ad4b14b5b
--- /dev/null
+++ b/docs/src_snippets/howto/python_api/d_inspect_dataset.py
@@ -0,0 +1,43 @@
+"""Docs to inspect a dataset."""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from gentropy.dataset.summary_statistics import SummaryStatistics
+
+if TYPE_CHECKING:
+ from pyspark.sql.types import StructType
+
+
+def interact_w_dataframe(summary_stats: SummaryStatistics) -> SummaryStatistics:
+ """Docs to interact with the `df` attribute of a dataset."""
+ # --8<-- [start:print_dataframe]
+ # Inspect the first 10 rows of the data
+ summary_stats.df.show(10)
+ # --8<-- [end:print_dataframe]
+
+ # --8<-- [start:print_dataframe_schema]
+ # Print the schema of the data
+ summary_stats.df.printSchema()
+ # --8<-- [end:print_dataframe_schema]
+ return summary_stats
+
+
+def get_dataset_schema(summary_stats: SummaryStatistics) -> StructType:
+ """Docs to get the schema of a dataset."""
+ # --8<-- [start:get_dataset_schema]
+ # Get the Spark schema of any `Dataset` as a `StructType` object
+ schema = summary_stats.get_schema()
+ # --8<-- [end:get_dataset_schema]
+ return schema
+
+
+def write_data(summary_stats: SummaryStatistics) -> None:
+ """Docs to write a dataset to disk."""
+ # --8<-- [start:write_parquet]
+ # Write the data to disk in parquet format
+ summary_stats.df.write.parquet("path/to/summary/stats")
+ # --8<-- [end:write_parquet]
+
+ # --8<-- [start:write_csv]
+ # Write the data to disk in csv format
+ summary_stats.df.write.csv("path/to/summary/stats")
+ # --8<-- [end:write_csv]
diff --git a/mkdocs.yml b/mkdocs.yml
index d25076a73..180c6fbe1 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -40,6 +40,7 @@ markdown_extensions:
- pymdownx.superfences
- pymdownx.snippets:
base_path: "docs"
+ dedent_subsections: true
- pymdownx.highlight
- pymdownx.superfences
- toc:
diff --git a/pyproject.toml b/pyproject.toml
index 114f72eac..ad7f07736 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -260,7 +260,7 @@ convention = "google"
[tool.pydoclint]
style = 'google'
-exclude = '\.git|\.venv|__init__.py|tests'
+exclude = '\.git|\.venv|__init__.py|tests|docs'
require-return-section-when-returning-nothing = false
check-return-types = true
allow-init-docstring = true
diff --git a/tests/docs/test_applying_methods.py b/tests/docs/test_applying_methods.py
new file mode 100644
index 000000000..255752f46
--- /dev/null
+++ b/tests/docs/test_applying_methods.py
@@ -0,0 +1,30 @@
+"""Testing applying methods docs."""
+from typing import Any
+
+import pytest
+from gentropy.dataset.study_locus import StudyLocus
+from gentropy.dataset.summary_statistics import SummaryStatistics
+
+from docs.src_snippets.howto.python_api.c_applying_methods import (
+ apply_class_method_clumping,
+ apply_class_method_pics,
+ apply_instance_method,
+)
+
+
+@pytest.mark.parametrize(
+ "func",
+ [
+ apply_class_method_clumping,
+ apply_class_method_pics,
+ apply_instance_method,
+ ],
+)
+def test_apply_methods(
+ func: Any, mock_study_locus: StudyLocus, mock_summary_statistics: SummaryStatistics
+) -> None:
+ """Test any method in applying_methods returns an instance of StudyLocus."""
+ if func in [apply_class_method_clumping, apply_instance_method]:
+ assert isinstance(func(mock_summary_statistics), StudyLocus)
+ elif func == apply_class_method_pics:
+ assert isinstance(func(mock_study_locus), StudyLocus)
diff --git a/tests/docs/test_create_dataset.py b/tests/docs/test_create_dataset.py
new file mode 100644
index 000000000..663254753
--- /dev/null
+++ b/tests/docs/test_create_dataset.py
@@ -0,0 +1,26 @@
+"""Testing creating dataset docs."""
+from typing import Any
+
+import pytest
+from gentropy.common.session import Session
+from gentropy.dataset.summary_statistics import SummaryStatistics
+
+from docs.src_snippets.howto.python_api.b_create_dataset import (
+ create_from_pandas,
+ create_from_parquet,
+ create_from_source,
+)
+
+
+@pytest.mark.parametrize(
+ "func",
+ [
+ create_from_parquet,
+ create_from_source,
+ create_from_pandas,
+ ],
+)
+def test_create_dataset(func: Any, session: Session) -> None:
+ """Test any method in create_dataset returns an instance of SummaryStatistics."""
+ tested_func = func(session) if func != create_from_pandas else func()
+ assert isinstance(tested_func, SummaryStatistics)
diff --git a/tests/docs/test_creating_spark_session.py b/tests/docs/test_creating_spark_session.py
new file mode 100644
index 000000000..6e1cf1ca5
--- /dev/null
+++ b/tests/docs/test_creating_spark_session.py
@@ -0,0 +1,19 @@
+"""Testing creating spark session docs."""
+from gentropy.common.session import Session
+
+from docs.src_snippets.howto.python_api.a_creating_spark_session import (
+ custom_session,
+ default_session,
+)
+
+
+def test_default_session() -> None:
+ """Test default session."""
+ session = default_session()
+ assert isinstance(session, Session)
+
+
+def test_custom_session() -> None:
+ """Test custom session."""
+ session = custom_session()
+ assert isinstance(session, Session)
diff --git a/tests/docs/test_inspect_dataset.py b/tests/docs/test_inspect_dataset.py
new file mode 100644
index 000000000..682fe784a
--- /dev/null
+++ b/tests/docs/test_inspect_dataset.py
@@ -0,0 +1,18 @@
+"""Testing inspecting dataset docs."""
+from gentropy.dataset.summary_statistics import SummaryStatistics
+from pyspark.sql.types import StructType
+
+from docs.src_snippets.howto.python_api.d_inspect_dataset import (
+ get_dataset_schema,
+ interact_w_dataframe,
+)
+
+
+def test_interact_w_dataframe(mock_summary_statistics: SummaryStatistics) -> None:
+ """Test interact_w_dataframe returns a SummaryStatistics."""
+ assert isinstance(interact_w_dataframe(mock_summary_statistics), SummaryStatistics)
+
+
+def test_get_dataset_schema(mock_summary_statistics: SummaryStatistics) -> None:
+ """Test get_dataset_schema returns a StructType."""
+ assert isinstance(get_dataset_schema(mock_summary_statistics), StructType)