From 14afd5dc4e2637cac1ff5899be34866aadb824e6 Mon Sep 17 00:00:00 2001
From: Nok
Date: Tue, 20 Sep 2022 15:11:11 +0100
Subject: [PATCH 01/38] Release/0.18.3 (#1856)

* Update release version and release notes

Signed-off-by: Nok Chan

* Update missing release notes

Signed-off-by: Nok Chan

* update version

Signed-off-by: Nok Chan

* update release notes

Signed-off-by: Nok Chan

Signed-off-by: Nok Chan
Signed-off-by: Ahdra Merali
---
 CITATION.cff | 4 ++--
 RELEASE.md | 16 +++++++++++++---
 docs/source/deployment/databricks.md | 6 +++---
 docs/source/development/commands_reference.md | 2 +-
 docs/source/extend_kedro/plugins.md | 2 +-
 docs/source/tutorial/tutorial_template.md | 4 ++--
 docs/source/tutorial/visualise_pipeline.md | 4 ++--
 kedro/__init__.py | 2 +-
 8 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/CITATION.cff b/CITATION.cff
index f8c9436e74..336c59be9c 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -36,6 +36,6 @@ authors:
- family-names: Theisen
  given-names: Merel
title: Kedro
-version: 0.18.2
-date-released: 2022-06-07
+version: 0.18.3
+date-released: 2022-09-20
url: https://github.com/kedro-org/kedro

diff --git a/RELEASE.md b/RELEASE.md
index d366727c29..82a1b51a6d 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -8,8 +8,15 @@

## Migration guide from Kedro 0.18.* to 0.19.*

+# Upcoming Release 0.18.4
-# Upcoming Release 0.18.3
+## Major features and improvements
+
+## Bug fixes and other changes
+
+## Breaking changes to the API
+
+# Release 0.18.3

## Major features and improvements
* Implemented autodiscovery of project pipelines. A pipeline created with `kedro pipeline create <pipeline_name>` can now be accessed immediately without needing to explicitly register it in `src/<python_package>/pipeline_registry.py`, either individually by name (e.g. `kedro run --pipeline=<pipeline_name>`) or as part of the combined default pipeline (e.g. `kedro run`). By default, the simplified `register_pipelines()` function in `pipeline_registry.py` looks like:
@@ -28,20 +35,23 @@
* The Kedro IPython extension should now be loaded with `%load_ext kedro.ipython`.
* The line magic `%reload_kedro` now accepts keyword arguments, e.g. `%reload_kedro --env=prod`.
+* Improved resume pipeline suggestion for `SequentialRunner`; it now backtracks to the closest persisted inputs to resume.

## Bug fixes and other changes
-* Use default `False` value for rich logging `set_locals`, to make sure credentials and other sensitive data isn't shown in logs.
+* Changed the default value of `show_locals` to `False` for `rich` logging, to make sure credentials and other sensitive data aren't shown in logs.
* Rich traceback handling is disabled on Databricks so that exceptions now halt execution as expected. This is a workaround for a [bug in `rich`](https://github.com/Textualize/rich/issues/2455).
* When using `kedro run -n [some_node]`, if `some_node` is missing a namespace the resulting error message will suggest the correct node name.
-* Update documentation for `rich` logging.
+* Updated documentation for `rich` logging.
* Updated Prefect deployment documentation to allow for reruns with saved versioned datasets.
* The Kedro IPython extension now surfaces errors when it cannot load a Kedro project.
* Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x.
* Added `gdrive` to list of cloud protocols, enabling Google Drive paths for datasets.
+* Added an SVG logo resource for the IPython kernel.
## Upcoming deprecations for Kedro 0.19.0 * The Kedro IPython extension will no longer be available as `%load_ext kedro.extras.extensions.ipython`; use `%load_ext kedro.ipython` instead. +* `kedro jupyter convert`, `kedro build-docs`, `kedro build-reqs` and `kedro activate-nbstripout` will be deprecated. # Release 0.18.2 diff --git a/docs/source/deployment/databricks.md b/docs/source/deployment/databricks.md index 7ee533b68b..1ecaa4daee 100644 --- a/docs/source/deployment/databricks.md +++ b/docs/source/deployment/databricks.md @@ -34,7 +34,7 @@ conda create --name iris_databricks python=3.7 -y conda activate iris_databricks # install Kedro and create a new project -pip install "kedro~=0.18.2" +pip install "kedro~=0.18.3" # name your project Iris Databricks when prompted for it kedro new --starter pyspark-iris ``` @@ -169,10 +169,10 @@ In your newly-created notebook, put each of the below code snippets into a separ %sh rm -rf ~/projects/iris-databricks && git clone --single-branch --branch main https://${GITHUB_USER}:${GITHUB_TOKEN}@github.com/${GITHUB_USER}/.git ~/projects/iris-databricks ``` -* Install the latest version of Kedro compatible with version `0.18.2` +* Install the latest version of Kedro compatible with version `0.18.3` ```console -%pip install "kedro[spark.SparkDataSet]~=0.18.2" +%pip install "kedro[spark.SparkDataSet]~=0.18.3" ``` * Copy input data into DBFS diff --git a/docs/source/development/commands_reference.md b/docs/source/development/commands_reference.md index 6ee404f642..5f955176b6 100644 --- a/docs/source/development/commands_reference.md +++ b/docs/source/development/commands_reference.md @@ -114,7 +114,7 @@ Returns output similar to the following, depending on the version of Kedro used | |/ / _ \/ _` | '__/ _ \ | < __/ (_| | | | (_) | |_|\_\___|\__,_|_| \___/ -v0.18.2 +v0.18.3 Kedro is a Python framework for creating reproducible, maintainable diff --git a/docs/source/extend_kedro/plugins.md b/docs/source/extend_kedro/plugins.md index 1f4a5fcfb6..2bad011c10 100644 --- a/docs/source/extend_kedro/plugins.md +++ b/docs/source/extend_kedro/plugins.md @@ -84,7 +84,7 @@ setup( After that you can use this starter with `kedro new --starter=test_plugin_starter`. ```{note} -If your starter lives on a git repository, by default Kedro attempts to use a tag or branch labelled with your version of Kedro, e.g. `0.18.2.`. This means that you can host different versions of your starter template on the same repository, and the correct one will automatically be used. If you do not wish to follow this structure, you should override it with the `checkout` flag, e.g. `kedro new --starter=test_plugin_starter --checkout=main`. +If your starter lives on a git repository, by default Kedro attempts to use a tag or branch labelled with your version of Kedro, e.g. `0.18.3.`. This means that you can host different versions of your starter template on the same repository, and the correct one will automatically be used. If you do not wish to follow this structure, you should override it with the `checkout` flag, e.g. `kedro new --starter=test_plugin_starter --checkout=main`. 
``` ## Working with `click` diff --git a/docs/source/tutorial/tutorial_template.md b/docs/source/tutorial/tutorial_template.md index 2ee4dfab8e..a894dd8149 100644 --- a/docs/source/tutorial/tutorial_template.md +++ b/docs/source/tutorial/tutorial_template.md @@ -29,7 +29,7 @@ ipython==7.0 # Used for an IPython session with `kedro ipython` isort~=5.0 # Used for linting code with `kedro lint` jupyter~=1.0 # Used to open a Kedro-session in Jupyter Notebook & Lab jupyterlab~=3.0 # Used to open a Kedro-session in Jupyter Lab -kedro~=0.18.2 +kedro~=0.18.3 nbstripout~=0.4 # Strips the output of a Jupyter Notebook and writes the outputless version to the original file pytest-cov~=3.0 # Produces test coverage reports pytest-mock>=1.7.1, <2.0 # Wrapper around the mock package for easier use with pytest @@ -45,7 +45,7 @@ The dependencies above might be sufficient for some projects, but for this tutor Add the following lines to your `src/requirements.txt` file: ```text -kedro[pandas.CSVDataSet, pandas.ExcelDataSet, pandas.ParquetDataSet]==0.18.2 # Specify optional Kedro dependencies +kedro[pandas.CSVDataSet, pandas.ExcelDataSet, pandas.ParquetDataSet]==0.18.3 # Specify optional Kedro dependencies kedro-viz~=5.0 # Visualise your pipelines scikit-learn~=1.0 # For modelling in the data science pipeline ``` diff --git a/docs/source/tutorial/visualise_pipeline.md b/docs/source/tutorial/visualise_pipeline.md index 064a61db28..87c8f95908 100644 --- a/docs/source/tutorial/visualise_pipeline.md +++ b/docs/source/tutorial/visualise_pipeline.md @@ -110,7 +110,7 @@ We have also used the Plotly integration to allow users to [visualise metrics fr You must update the `requirements.txt` file in your Kedro project and add the following datasets to enable Plotly for your project. -`kedro[plotly.PlotlyDataSet, plotly.JSONDataSet]==0.18.2` +`kedro[plotly.PlotlyDataSet, plotly.JSONDataSet]==0.18.3` You can view Plotly charts in Kedro-Viz when you use Kedro's plotly datasets. @@ -248,7 +248,7 @@ The MatplotlibWriter dataset converts Matplotlib objects to image files. This me You can view Matplotlib charts in Kedro-Viz when you use the [Kedro MatplotLibWriter dataset](/kedro.extras.datasets.matplotlib.MatplotlibWriter). You must update the `src/requirements.txt` file in your Kedro project by adding the following dataset to enable Matplotlib for your project: ``` -kedro[matplotlib.MatplotlibWriter]==0.18.2 +kedro[matplotlib.MatplotlibWriter]==0.18.3 ``` To use this dataset, configure your plot in your Kedro node. The below functions should be added to the `nodes.py` and `pipeline.py` files respectively. diff --git a/kedro/__init__.py b/kedro/__init__.py index 235e5431e9..f61ba5612c 100644 --- a/kedro/__init__.py +++ b/kedro/__init__.py @@ -3,7 +3,7 @@ configuration and pipeline assembly. 
""" -__version__ = "0.18.2" +__version__ = "0.18.3" import logging From 570c5740bcb629797ed50d5b61a19ff66ab8ffb9 Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Fri, 21 Oct 2022 08:51:54 +0100 Subject: [PATCH 02/38] Remove comment from code example Signed-off-by: Ahdra Merali --- kedro/extras/datasets/pandas/csv_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro/extras/datasets/pandas/csv_dataset.py b/kedro/extras/datasets/pandas/csv_dataset.py index 521b5da09e..0a8905062f 100644 --- a/kedro/extras/datasets/pandas/csv_dataset.py +++ b/kedro/extras/datasets/pandas/csv_dataset.py @@ -60,7 +60,6 @@ class CSVDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = CSVDataSet(filepath="gcs://bucket/test.csv") >>> data_set = CSVDataSet(filepath="test.csv") >>> data_set.save(data) >>> reloaded = data_set.load() From 981dcfed3071527073d06a55735090be0e47583d Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Fri, 21 Oct 2022 09:25:00 +0100 Subject: [PATCH 03/38] Remove more comments Signed-off-by: Ahdra Merali --- kedro/extras/datasets/pandas/excel_dataset.py | 1 - kedro/extras/datasets/pandas/feather_dataset.py | 1 - kedro/extras/datasets/pandas/generic_dataset.py | 1 - kedro/extras/datasets/pandas/hdf_dataset.py | 1 - kedro/extras/datasets/pandas/json_dataset.py | 1 - kedro/extras/datasets/pandas/parquet_dataset.py | 1 - kedro/extras/datasets/pandas/xml_dataset.py | 1 - kedro/extras/datasets/pickle/pickle_dataset.py | 1 - kedro/extras/datasets/pillow/image_dataset.py | 1 - 9 files changed, 9 deletions(-) diff --git a/kedro/extras/datasets/pandas/excel_dataset.py b/kedro/extras/datasets/pandas/excel_dataset.py index 2bc6a88c33..ab71bc96f1 100644 --- a/kedro/extras/datasets/pandas/excel_dataset.py +++ b/kedro/extras/datasets/pandas/excel_dataset.py @@ -59,7 +59,6 @@ class ExcelDataSet( >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = ExcelDataSet(filepath="gcs://bucket/test.xlsx") >>> data_set = ExcelDataSet(filepath="test.xlsx") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/feather_dataset.py b/kedro/extras/datasets/pandas/feather_dataset.py index 9adba3c9b3..f43edfef44 100644 --- a/kedro/extras/datasets/pandas/feather_dataset.py +++ b/kedro/extras/datasets/pandas/feather_dataset.py @@ -37,7 +37,6 @@ class FeatherDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = FeatherDataSet(filepath="gcs://bucket/test.feather") >>> data_set = FeatherDataSet(filepath="test.feather") >>> >>> data_set.save(data) diff --git a/kedro/extras/datasets/pandas/generic_dataset.py b/kedro/extras/datasets/pandas/generic_dataset.py index f1569faf15..0c32c92794 100644 --- a/kedro/extras/datasets/pandas/generic_dataset.py +++ b/kedro/extras/datasets/pandas/generic_dataset.py @@ -73,7 +73,6 @@ class GenericDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = GenericDataSet(filepath="s3://test.csv", file_format='csv') >>> data_set = GenericDataSet(filepath="test.csv", file_format='csv') >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/hdf_dataset.py b/kedro/extras/datasets/pandas/hdf_dataset.py index da73c1def5..08d759f2f8 100644 --- 
a/kedro/extras/datasets/pandas/hdf_dataset.py +++ b/kedro/extras/datasets/pandas/hdf_dataset.py @@ -45,7 +45,6 @@ class HDFDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = HDFDataSet(filepath="gcs://bucket/test.hdf", key='data') >>> data_set = HDFDataSet(filepath="test.h5", key='data') >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/json_dataset.py b/kedro/extras/datasets/pandas/json_dataset.py index 21689c11d1..b8c478e76a 100644 --- a/kedro/extras/datasets/pandas/json_dataset.py +++ b/kedro/extras/datasets/pandas/json_dataset.py @@ -52,7 +52,6 @@ class JSONDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = JSONDataSet(filepath="gcs://bucket/test.json") >>> data_set = JSONDataSet(filepath="test.json") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/parquet_dataset.py b/kedro/extras/datasets/pandas/parquet_dataset.py index 1c889c93cc..5c83740f38 100644 --- a/kedro/extras/datasets/pandas/parquet_dataset.py +++ b/kedro/extras/datasets/pandas/parquet_dataset.py @@ -64,7 +64,6 @@ class ParquetDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = ParquetDataSet(filepath="gcs://bucket/test.parquet") >>> data_set = ParquetDataSet(filepath="test.parquet") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/xml_dataset.py b/kedro/extras/datasets/pandas/xml_dataset.py index 3e5d59989b..4b9194eced 100644 --- a/kedro/extras/datasets/pandas/xml_dataset.py +++ b/kedro/extras/datasets/pandas/xml_dataset.py @@ -35,7 +35,6 @@ class XMLDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = XMLDataSet(filepath="gcs://bucket/test.xml") >>> data_set = XMLDataSet(filepath="test.xml") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pickle/pickle_dataset.py b/kedro/extras/datasets/pickle/pickle_dataset.py index f565edd37d..7a6e1a420d 100644 --- a/kedro/extras/datasets/pickle/pickle_dataset.py +++ b/kedro/extras/datasets/pickle/pickle_dataset.py @@ -55,7 +55,6 @@ class PickleDataSet(AbstractVersionedDataSet[Any, Any]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = PickleDataSet(filepath="gcs://bucket/test.pkl") >>> data_set = PickleDataSet(filepath="test.pkl", backend="pickle") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pillow/image_dataset.py b/kedro/extras/datasets/pillow/image_dataset.py index c498361815..74c1ec0164 100644 --- a/kedro/extras/datasets/pillow/image_dataset.py +++ b/kedro/extras/datasets/pillow/image_dataset.py @@ -26,7 +26,6 @@ class ImageDataSet(AbstractVersionedDataSet[Image.Image, Image.Image]): >>> from kedro.extras.datasets.pillow import ImageDataSet >>> - >>> # data_set = ImageDataSet(filepath="gcs://bucket/test.png") >>> data_set = ImageDataSet(filepath="test.png") >>> image = data_set.load() >>> image.show() From c60aef6b56a7e901ddbad6e75f115494633001fb Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Fri, 21 Oct 2022 09:26:03 +0100 Subject: [PATCH 04/38] Add YAML 
formatting Signed-off-by: Ahdra Merali --- kedro/extras/datasets/plotly/plotly_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/extras/datasets/plotly/plotly_dataset.py b/kedro/extras/datasets/plotly/plotly_dataset.py index 4fa3880a0c..576b37f154 100644 --- a/kedro/extras/datasets/plotly/plotly_dataset.py +++ b/kedro/extras/datasets/plotly/plotly_dataset.py @@ -23,7 +23,7 @@ class PlotlyDataSet(JSONDataSet): the JSON file directly from a pandas DataFrame through ``plotly_args``. Example configuration for a PlotlyDataSet in the catalog: - :: + .. code-block:: yaml >>> bar_plot: >>> type: plotly.PlotlyDataSet From 07159ff35bd95d6b3e66c9d0eeb0982648280fc5 Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Fri, 21 Oct 2022 09:29:14 +0100 Subject: [PATCH 05/38] Add missing import Signed-off-by: Ahdra Merali --- kedro/extras/datasets/redis/redis_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro/extras/datasets/redis/redis_dataset.py b/kedro/extras/datasets/redis/redis_dataset.py index e762afcf71..0c098797d9 100644 --- a/kedro/extras/datasets/redis/redis_dataset.py +++ b/kedro/extras/datasets/redis/redis_dataset.py @@ -43,6 +43,7 @@ class PickleDataSet(AbstractDataSet[Any, Any]): :: >>> from kedro.extras.datasets.redis import PickleDataSet + >>> import pandas as pd >>> >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) From 09b0d46e517f65aa08cd10a4d3641fad69a4fc97 Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Fri, 21 Oct 2022 09:50:57 +0100 Subject: [PATCH 06/38] Remove even more comments Signed-off-by: Ahdra Merali --- kedro/extras/datasets/text/text_dataset.py | 1 - kedro/extras/datasets/tracking/json_dataset.py | 1 - kedro/extras/datasets/tracking/metrics_dataset.py | 1 - kedro/extras/datasets/yaml/yaml_dataset.py | 1 - 4 files changed, 4 deletions(-) diff --git a/kedro/extras/datasets/text/text_dataset.py b/kedro/extras/datasets/text/text_dataset.py index a2e01e6a93..9953e17a00 100644 --- a/kedro/extras/datasets/text/text_dataset.py +++ b/kedro/extras/datasets/text/text_dataset.py @@ -27,7 +27,6 @@ class TextDataSet(AbstractVersionedDataSet[str, str]): >>> >>> string_to_write = "This will go in a file." 
>>> - >>> # data_set = TextDataSet(filepath="gcs://bucket/test.md") >>> data_set = TextDataSet(filepath="test.md") >>> data_set.save(string_to_write) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/tracking/json_dataset.py b/kedro/extras/datasets/tracking/json_dataset.py index 5e8e7ea69f..481bf4eb9b 100644 --- a/kedro/extras/datasets/tracking/json_dataset.py +++ b/kedro/extras/datasets/tracking/json_dataset.py @@ -21,7 +21,6 @@ class JSONDataSet(JDS): >>> >>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002} >>> - >>> # data_set = JSONDataSet(filepath="gcs://bucket/test.json") >>> data_set = JSONDataSet(filepath="test.json") >>> data_set.save(data) diff --git a/kedro/extras/datasets/tracking/metrics_dataset.py b/kedro/extras/datasets/tracking/metrics_dataset.py index 10ff4e9b87..ef9f0e223f 100644 --- a/kedro/extras/datasets/tracking/metrics_dataset.py +++ b/kedro/extras/datasets/tracking/metrics_dataset.py @@ -23,7 +23,6 @@ class MetricsDataSet(JSONDataSet): >>> >>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002} >>> - >>> # data_set = MetricsDataSet(filepath="gcs://bucket/test.json") >>> data_set = MetricsDataSet(filepath="test.json") >>> data_set.save(data) diff --git a/kedro/extras/datasets/yaml/yaml_dataset.py b/kedro/extras/datasets/yaml/yaml_dataset.py index affa38afcf..8ab08fa48c 100644 --- a/kedro/extras/datasets/yaml/yaml_dataset.py +++ b/kedro/extras/datasets/yaml/yaml_dataset.py @@ -28,7 +28,6 @@ class YAMLDataSet(AbstractVersionedDataSet[Dict, Dict]): >>> >>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]} >>> - >>> # data_set = YAMLDataSet(filepath="gcs://bucket/test.yaml") >>> data_set = YAMLDataSet(filepath="test.yaml") >>> data_set.save(data) >>> reloaded = data_set.load() From 48ba4719ce1e880c5db426be29691c84f1801c75 Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Fri, 21 Oct 2022 10:51:50 +0100 Subject: [PATCH 07/38] Remove more even more comments Signed-off-by: Ahdra Merali --- kedro/extras/datasets/email/message_dataset.py | 1 - kedro/extras/datasets/geopandas/geojson_dataset.py | 5 +---- kedro/extras/datasets/json/json_dataset.py | 5 ----- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/kedro/extras/datasets/email/message_dataset.py b/kedro/extras/datasets/email/message_dataset.py index 08dc157fc6..7e26f8da46 100644 --- a/kedro/extras/datasets/email/message_dataset.py +++ b/kedro/extras/datasets/email/message_dataset.py @@ -46,7 +46,6 @@ class EmailMessageDataSet( >>> msg["From"] = '"sin studly17"' >>> msg["To"] = '"strong bad"' >>> - >>> # data_set = EmailMessageDataSet(filepath="gcs://bucket/test") >>> data_set = EmailMessageDataSet(filepath="test") >>> data_set.save(msg) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/geopandas/geojson_dataset.py b/kedro/extras/datasets/geopandas/geojson_dataset.py index 6e82eae7de..bc0f1307e9 100644 --- a/kedro/extras/datasets/geopandas/geojson_dataset.py +++ b/kedro/extras/datasets/geopandas/geojson_dataset.py @@ -37,10 +37,7 @@ class GeoJSONDataSet( >>> >>> data = gpd.GeoDataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}, geometry=[Point(1,1), Point(2,4)]) - >>> # data_set = GeoJSONDataSet(filepath="gcs://bucket/test.geojson", - >>> save_args=None) - >>> data_set = GeoJSONDataSet(filepath="test.geojson", - >>> save_args=None) + >>> data_set = GeoJSONDataSet(filepath="test.geojson", save_args=None) >>> data_set.save(data) >>> reloaded = data_set.load() >>> diff --git a/kedro/extras/datasets/json/json_dataset.py b/kedro/extras/datasets/json/json_dataset.py index 
3efabfc0e1..f1c0279d46 100644 --- a/kedro/extras/datasets/json/json_dataset.py +++ b/kedro/extras/datasets/json/json_dataset.py @@ -28,8 +28,6 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): >>> json_dataset: >>> type: json.JSONDataSet >>> filepath: data/01_raw/location.json - >>> load_args: - >>> lines: True >>> >>> cars: >>> type: json.JSONDataSet @@ -37,8 +35,6 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): >>> fs_args: >>> project: my-project >>> credentials: my_gcp_credentials - >>> load_args: - >>> lines: True Example using Python API: :: @@ -47,7 +43,6 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): >>> >>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]} >>> - >>> # data_set = JSONDataSet(filepath="gcs://bucket/test.json") >>> data_set = JSONDataSet(filepath="test.json") >>> data_set.save(data) >>> reloaded = data_set.load() From 770e3b9e6a8a2f2175891e6ae85fceca8a8cd275 Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Fri, 21 Oct 2022 12:29:16 +0100 Subject: [PATCH 08/38] Add pickle requirement to extras_require Signed-off-by: Ahdra Merali --- kedro/extras/datasets/pickle/pickle_dataset.py | 1 - setup.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro/extras/datasets/pickle/pickle_dataset.py b/kedro/extras/datasets/pickle/pickle_dataset.py index 7a6e1a420d..4c483135fa 100644 --- a/kedro/extras/datasets/pickle/pickle_dataset.py +++ b/kedro/extras/datasets/pickle/pickle_dataset.py @@ -60,7 +60,6 @@ class PickleDataSet(AbstractVersionedDataSet[Any, Any]): >>> reloaded = data_set.load() >>> assert data.equals(reloaded) >>> - >>> # Add "compress_pickle[lz4]" to requirements.txt >>> data_set = PickleDataSet(filepath="test.pickle.lz4", >>> backend="compress_pickle", >>> load_args={"compression":"lz4"}, diff --git a/setup.py b/setup.py index 40acb38d36..fe856e635a 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ def _collect_requirements(requires): "pandas.XMLDataSet": [PANDAS, "lxml~=4.6"], "pandas.GenericDataSet": [PANDAS], } +pickle_require = {"pickle.PickleDataSet": ["compress-pickle~=2.1.0"]} pillow_require = {"pillow.ImageDataSet": ["Pillow~=9.0"]} plotly_require = { "plotly.PlotlyDataSet": [PANDAS, "plotly>=4.8.0, <6.0"], @@ -120,6 +121,7 @@ def _collect_requirements(requires): "holoviews": _collect_requirements(holoviews_require), "networkx": _collect_requirements(networkx_require), "pandas": _collect_requirements(pandas_require), + "pickle": _collect_requirements(pickle_require), "pillow": _collect_requirements(pillow_require), "plotly": _collect_requirements(plotly_require), "redis": _collect_requirements(redis_require), @@ -134,6 +136,7 @@ def _collect_requirements(requires): **holoviews_require, **networkx_require, **pandas_require, + **pickle_require, **pillow_require, **plotly_require, **spark_require, From 66efe0dc42a5f9084c0ad1f955e3466412c8068e Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Fri, 21 Oct 2022 12:49:16 +0100 Subject: [PATCH 09/38] Try fix YAML docs Signed-off-by: Ahdra Merali --- .../extras/datasets/plotly/plotly_dataset.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kedro/extras/datasets/plotly/plotly_dataset.py b/kedro/extras/datasets/plotly/plotly_dataset.py index 576b37f154..54b3d9c881 100644 --- a/kedro/extras/datasets/plotly/plotly_dataset.py +++ b/kedro/extras/datasets/plotly/plotly_dataset.py @@ -26,18 +26,18 @@ class PlotlyDataSet(JSONDataSet): .. 
code-block:: yaml >>> bar_plot: - >>> type: plotly.PlotlyDataSet - >>> filepath: data/08_reporting/bar_plot.json - >>> plotly_args: - >>> type: bar - >>> fig: - >>> x: features - >>> y: importance - >>> orientation: h - >>> layout: - >>> xaxis_title: x - >>> yaxis_title: y - >>> title: Test + >>> type: plotly.PlotlyDataSet + >>> filepath: data/08_reporting/bar_plot.json + >>> plotly_args: + >>> type: bar + >>> fig: + >>> x: features + >>> y: importance + >>> orientation: h + >>> layout: + >>> xaxis_title: x + >>> yaxis_title: y + >>> title: Test """ # pylint: disable=too-many-arguments From 80151f8a66cbe9360bb69060058f333fb148597a Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Fri, 21 Oct 2022 13:18:02 +0100 Subject: [PATCH 10/38] Try fix YAML docs pt 2 Signed-off-by: Ahdra Merali --- kedro/extras/datasets/plotly/plotly_dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kedro/extras/datasets/plotly/plotly_dataset.py b/kedro/extras/datasets/plotly/plotly_dataset.py index 54b3d9c881..2a804bd42f 100644 --- a/kedro/extras/datasets/plotly/plotly_dataset.py +++ b/kedro/extras/datasets/plotly/plotly_dataset.py @@ -31,13 +31,13 @@ class PlotlyDataSet(JSONDataSet): >>> plotly_args: >>> type: bar >>> fig: - >>> x: features - >>> y: importance - >>> orientation: h + >>> x: features + >>> y: importance + >>> orientation: h >>> layout: - >>> xaxis_title: x - >>> yaxis_title: y - >>> title: Test + >>> xaxis_title: x + >>> yaxis_title: y + >>> title: Title """ # pylint: disable=too-many-arguments From 6e52ba6e1942d8fa7b99b4394de7ca7ae65c0f8a Mon Sep 17 00:00:00 2001 From: Ahdra Merali <90615669+AhdraMeraliQB@users.noreply.github.com> Date: Tue, 27 Sep 2022 11:24:35 +0100 Subject: [PATCH 11/38] Fix code snippets in docs (#1876) * Fix code snippets Signed-off-by: Ahdra Merali * Separate code blocks Signed-off-by: Ahdra Merali * Lint Signed-off-by: Ahdra Merali Signed-off-by: Ahdra Merali --- docs/source/tutorial/visualise_pipeline.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/source/tutorial/visualise_pipeline.md b/docs/source/tutorial/visualise_pipeline.md index 87c8f95908..871ca03934 100644 --- a/docs/source/tutorial/visualise_pipeline.md +++ b/docs/source/tutorial/visualise_pipeline.md @@ -126,13 +126,16 @@ Below is an example of how to visualise plots on Kedro-Viz using `plotly.PlotlyD The below functions can be added to the `nodes.py` and `pipeline.py` files respectively. ```python +# nodes.py import pandas as pd def compare_passenger_capacity(preprocessed_shuttles: pd.DataFrame): return preprocessed_shuttles.groupby(["shuttle_type"]).mean().reset_index() +``` - +```python +# pipeline.py def create_pipeline(**kwargs) -> Pipeline: """This is a simple pipeline which generates a plot""" return pipeline( @@ -175,6 +178,7 @@ Below is an example of how to visualise plots using [Plotly Express](https://plo The below functions can be added to the `nodes.py` and `pipeline.py` files respectively. ```python +# nodes.py import plotly.express as px import pandas as pd @@ -200,8 +204,10 @@ def compare_passenger_capacity(preprocessed_shuttles: pd.DataFrame): ] ) return fig +``` - +```python +# pipeline.py def create_pipeline(**kwargs) -> Pipeline: """This is a simple pipeline which generates a plot""" return pipeline( @@ -256,6 +262,7 @@ To use this dataset, configure your plot in your Kedro node. 
The below functions ```python # nodes.py import matplotlib.pyplot as plt +import seaborn as sn def create_confusion_matrix(companies: pd.DataFrame): @@ -268,8 +275,9 @@ def create_confusion_matrix(companies: pd.DataFrame): ) sn.heatmap(confusion_matrix, annot=True) return plt +``` - +```python # pipeline.py def create_pipeline(**kwargs) -> Pipeline: """This is a simple pipeline which generates a plot""" @@ -287,7 +295,7 @@ def create_pipeline(**kwargs) -> Pipeline: You must also specify the output type in the `catalog.yml` file, like below. Remember to set the versioned flag to `true` if you want to add the plots to experiment tracking as well. ```yaml -reporting.dummy_confusion_matrix: +dummy_confusion_matrix: type: matplotlib.MatplotlibWriter filepath: ${base_location}/08_reporting/dummy_confusion_matrix.png versioned: true From 4e3e7b4835676210320d4dd5a031cd5686bbf73c Mon Sep 17 00:00:00 2001 From: Jimmy Stammers Date: Wed, 28 Sep 2022 09:39:44 +0100 Subject: [PATCH 12/38] Fix issue with specifying format for SparkHiveDataSet (#1857) Signed-off-by: jstammers Signed-off-by: Ahdra Merali --- RELEASE.md | 1 + kedro/extras/datasets/spark/spark_hive_dataset.py | 2 +- .../extras/datasets/spark/test_spark_hive_dataset.py | 11 +++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index 82a1b51a6d..8a1dfc3796 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -48,6 +48,7 @@ * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. * Added `gdrive` to list of cloud protocols, enabling Google Drive paths for datasets. * Added svg logo resource for ipython kernel. +* Fixed `format` in `save_args` for `SparkHiveDataSet`. ## Upcoming deprecations for Kedro 0.19.0 * The Kedro IPython extension will no longer be available as `%load_ext kedro.extras.extensions.ipython`; use `%load_ext kedro.ipython` instead. 
diff --git a/kedro/extras/datasets/spark/spark_hive_dataset.py b/kedro/extras/datasets/spark/spark_hive_dataset.py index 5bb3f96624..549204dea2 100644 --- a/kedro/extras/datasets/spark/spark_hive_dataset.py +++ b/kedro/extras/datasets/spark/spark_hive_dataset.py @@ -114,7 +114,7 @@ def __init__( self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) if save_args is not None: self._save_args.update(save_args) - self._format = self._save_args.get("format") or "hive" + self._format = self._save_args.pop("format", None) or "hive" self._eager_checkpoint = self._save_args.pop("eager_checkpoint", None) or True def _describe(self) -> Dict[str, Any]: diff --git a/tests/extras/datasets/spark/test_spark_hive_dataset.py b/tests/extras/datasets/spark/test_spark_hive_dataset.py index 9cab7bb62e..aa570697e6 100644 --- a/tests/extras/datasets/spark/test_spark_hive_dataset.py +++ b/tests/extras/datasets/spark/test_spark_hive_dataset.py @@ -301,3 +301,14 @@ def test_read_from_non_existent_table(self): r"table_doesnt_exist\], \[\], false\n", ): dataset.load() + + def test_save_delta_format(self, mocker): + dataset = SparkHiveDataSet( + database="default_1", table="delta_table", save_args={"format": "delta"} + ) + mocked_save = mocker.patch("pyspark.sql.DataFrameWriter.saveAsTable") + dataset.save(_generate_spark_df_one()) + mocked_save.assert_called_with( + "default_1.delta_table", mode="errorifexists", format="delta" + ) + assert dataset._format == "delta" From d9551356807c900c06fcf509367f77ddaf860047 Mon Sep 17 00:00:00 2001 From: Nok Date: Wed, 28 Sep 2022 11:42:20 +0100 Subject: [PATCH 13/38] Update RELEASE.md (#1883) * Update RELEASE.md * fix broken link * Update RELEASE.md Co-authored-by: Merel Theisen <49397448+MerelTheisenQB@users.noreply.github.com> Co-authored-by: Merel Theisen <49397448+MerelTheisenQB@users.noreply.github.com> Signed-off-by: Ahdra Merali --- RELEASE.md | 3 ++- docs/source/deployment/databricks.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 8a1dfc3796..9c95e7f0be 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -14,6 +14,8 @@ ## Bug fixes and other changes +* Fixed `format` in `save_args` for `SparkHiveDataSet`, previously it didn't allow you to save it as delta format. + ## Breaking changes to the API # Release 0.18.3 @@ -48,7 +50,6 @@ * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. * Added `gdrive` to list of cloud protocols, enabling Google Drive paths for datasets. * Added svg logo resource for ipython kernel. -* Fixed `format` in `save_args` for `SparkHiveDataSet`. ## Upcoming deprecations for Kedro 0.19.0 * The Kedro IPython extension will no longer be available as `%load_ext kedro.extras.extensions.ipython`; use `%load_ext kedro.ipython` instead. diff --git a/docs/source/deployment/databricks.md b/docs/source/deployment/databricks.md index 1ecaa4daee..8461f9b21e 100644 --- a/docs/source/deployment/databricks.md +++ b/docs/source/deployment/databricks.md @@ -159,7 +159,7 @@ Then press `Confirm` button. Your cluster will be restarted to apply the changes Congratulations, you are now ready to run your Kedro project from the Databricks! -[Create your Databricks notebook](https://docs.databricks.com/notebooks/notebooks-manage.html#create-a-notebook) and remember to [attach it to the cluster](https://docs.databricks.com/notebooks/notebooks-manage.html#attach) you have just configured. 
+[Create your Databricks notebook](https://docs.databricks.com/notebooks/notebooks-manage.html#create-a-notebook) and remember to attach it to the cluster you have just configured. In your newly-created notebook, put each of the below code snippets into a separate cell, then [run all cells](https://docs.databricks.com/notebooks/notebooks-use.html#run-notebooks): From 10972d40cc910c8544cfa8854fdcfa547ef78ba4 Mon Sep 17 00:00:00 2001 From: Nok Date: Thu, 29 Sep 2022 17:11:55 +0100 Subject: [PATCH 14/38] Deprecate `kedro test` and `kedro lint` (#1873) * Deprecating `kedro test` and `kedro lint` Signed-off-by: Nok Chan * Deprecate commands Signed-off-by: Nok Chan * Make kedro looks prettier * Update Linting Signed-off-by: Nok Signed-off-by: Nok Chan Signed-off-by: Nok Signed-off-by: Ahdra Merali --- RELEASE.md | 5 +++-- kedro/framework/cli/project.py | 26 +++++++++++++++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 9c95e7f0be..2dad086227 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -14,9 +14,10 @@ ## Bug fixes and other changes -* Fixed `format` in `save_args` for `SparkHiveDataSet`, previously it didn't allow you to save it as delta format. +## Upcoming deprecations for Kedro 0.19.0 -## Breaking changes to the API +* `kedro test` and `kedro lint` will be deprecated. +* Fixed `format` in `save_args` for `SparkHiveDataSet`, previously it didn't allow you to save it as delta format. # Release 0.18.3 diff --git a/kedro/framework/cli/project.py b/kedro/framework/cli/project.py index 0f61394049..bcea4c4320 100644 --- a/kedro/framework/cli/project.py +++ b/kedro/framework/cli/project.py @@ -71,7 +71,14 @@ def project_group(): # pragma: no cover @forward_command(project_group, forward_help=True) @click.pass_obj # this will pass the metadata as first argument def test(metadata: ProjectMetadata, args, **kwargs): # pylint: disable=unused-argument - """Run the test suite.""" + """Run the test suite. (DEPRECATED)""" + deprecation_message = ( + "DeprecationWarning: Command 'kedro test' is deprecated and " + "will not be available from Kedro 0.19.0. " + "Use the command 'pytest' instead. " + ) + click.secho(deprecation_message, fg="red") + try: _check_module_importable("pytest") except KedroCliError as exc: @@ -90,7 +97,13 @@ def test(metadata: ProjectMetadata, args, **kwargs): # pylint: disable=unused-a def lint( metadata: ProjectMetadata, files, check_only, **kwargs ): # pylint: disable=unused-argument - """Run flake8, isort and black.""" + """Run flake8, isort and black. (DEPRECATED)""" + deprecation_message = ( + "DeprecationWarning: Command 'kedro lint' is deprecated and " + "will not be available from Kedro 0.19.0." + ) + click.secho(deprecation_message, fg="red") + source_path = metadata.source_dir package_name = metadata.package_name files = files or (str(source_path / "tests"), str(source_path / package_name)) @@ -171,13 +184,15 @@ def package(metadata: ProjectMetadata): @click.pass_obj # this will pass the metadata as first argument def build_docs(metadata: ProjectMetadata, open_docs): """Build the project documentation. (DEPRECATED)""" - source_path = metadata.source_dir - package_name = metadata.package_name deprecation_message = ( "DeprecationWarning: Command 'kedro build-docs' is deprecated and " "will not be available from Kedro 0.19.0." 
) click.secho(deprecation_message, fg="red") + + source_path = metadata.source_dir + package_name = metadata.package_name + python_call("pip", ["install", str(source_path / "[docs]")]) python_call("pip", ["install", "-r", str(source_path / "requirements.txt")]) python_call("ipykernel", ["install", "--user", f"--name={package_name}"]) @@ -262,12 +277,13 @@ def activate_nbstripout( metadata: ProjectMetadata, **kwargs ): # pylint: disable=unused-argument """Install the nbstripout git hook to automatically clean notebooks. (DEPRECATED)""" - source_path = metadata.source_dir deprecation_message = ( "DeprecationWarning: Command 'kedro activate-nbstripout' is deprecated and " "will not be available from Kedro 0.19.0." ) click.secho(deprecation_message, fg="red") + + source_path = metadata.source_dir click.secho( ( "Notebook output cells will be automatically cleared before committing" From 57384cc6753f44f9c5951022472c99a6e30e025a Mon Sep 17 00:00:00 2001 From: Florian Gaudin-Delrieu <9217921+FlorianGD@users.noreply.github.com> Date: Fri, 30 Sep 2022 15:01:04 +0200 Subject: [PATCH 15/38] Fix micro package pull from PyPI (#1848) Signed-off-by: Florian Gaudin-Delrieu Signed-off-by: Ahdra Merali --- RELEASE.md | 7 +++++-- kedro/framework/cli/micropkg.py | 13 ++++++------- tests/framework/cli/micropkg/test_micropkg_pull.py | 7 +++++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 2dad086227..b66aae9b76 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -13,11 +13,14 @@ ## Major features and improvements ## Bug fixes and other changes +* Fixed `kedro micropkg pull` for packages on PyPI. +* Fixed `format` in `save_args` for `SparkHiveDataSet`, previously it didn't allow you to save it as delta format. -## Upcoming deprecations for Kedro 0.19.0 +## Minor breaking changes to the API +## Upcoming deprecations for Kedro 0.19.0 * `kedro test` and `kedro lint` will be deprecated. -* Fixed `format` in `save_args` for `SparkHiveDataSet`, previously it didn't allow you to save it as delta format. + # Release 0.18.3 diff --git a/kedro/framework/cli/micropkg.py b/kedro/framework/cli/micropkg.py index fa6557fa09..3242e18e81 100644 --- a/kedro/framework/cli/micropkg.py +++ b/kedro/framework/cli/micropkg.py @@ -141,18 +141,17 @@ def _pull_package( ): with tempfile.TemporaryDirectory() as temp_dir: temp_dir_path = Path(temp_dir).resolve() - _unpack_sdist(package_path, temp_dir_path, fs_args) - sdist_file_name = Path(package_path).name.rstrip(".tar.gz") - egg_info_file = list((temp_dir_path / sdist_file_name).glob("*.egg-info")) - if len(egg_info_file) != 1: + egg_info_files = list((temp_dir_path).rglob("*.egg-info")) + if len(egg_info_files) != 1: raise KedroCliError( f"More than 1 or no egg-info files found from {package_path}. " f"There has to be exactly one egg-info directory." 
) - package_name = egg_info_file[0].stem - package_requirements = temp_dir_path / sdist_file_name / "setup.py" + egg_info_file = egg_info_files[0] + package_name = egg_info_file.stem + package_requirements = egg_info_file.parent / "setup.py" # Finds a string representation of 'install_requires' list from setup.py reqs_list_pattern = r"install_requires\=(.*?)\,\n" @@ -172,7 +171,7 @@ def _pull_package( _install_files( metadata, package_name, - temp_dir_path / sdist_file_name, + egg_info_file.parent, env, alias, destination, diff --git a/tests/framework/cli/micropkg/test_micropkg_pull.py b/tests/framework/cli/micropkg/test_micropkg_pull.py index f557e40d08..1893fd580b 100644 --- a/tests/framework/cli/micropkg/test_micropkg_pull.py +++ b/tests/framework/cli/micropkg/test_micropkg_pull.py @@ -627,9 +627,12 @@ def test_pull_from_pypi( options = ["-e", env] if env else [] options += ["--alias", alias] if alias else [] + + package_name = "my-pipeline" + result = CliRunner().invoke( fake_project_cli, - ["micropkg", "pull", f"{PIPELINE_NAME}-{version}", *options], + ["micropkg", "pull", package_name, *options], obj=fake_metadata, ) assert result.exit_code == 0 @@ -642,7 +645,7 @@ def test_pull_from_pypi( "--no-deps", "--dest", str(tmp_path), - f"{PIPELINE_NAME}-{version}", + package_name, ], ) From d6feaace5d5e897d1c787cafb91dfcbafe6f6e91 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Mon, 3 Oct 2022 11:30:47 +0100 Subject: [PATCH 16/38] Update Error message for `VersionNotFoundError` to handle Permission related issues better (#1881) * Update message for VersionNotFoundError Signed-off-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> * Add test for VersionNotFoundError for cloud protocols * Update test_data_catalog.py Update NoVersionFoundError test * minor linting update * update docs link + styling changes * Revert "update docs link + styling changes" This reverts commit 6088e00159a9ee844dfee312673654b6d248f931. * Update test with styling changes * Update RELEASE.md Signed-off-by: ankatiyar Signed-off-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Signed-off-by: ankatiyar Co-authored-by: Ahdra Merali <90615669+AhdraMeraliQB@users.noreply.github.com> Signed-off-by: Ahdra Merali --- RELEASE.md | 1 + kedro/io/core.py | 11 +++++++++-- tests/io/test_data_catalog.py | 13 ++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index b66aae9b76..bae2e311fb 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -15,6 +15,7 @@ ## Bug fixes and other changes * Fixed `kedro micropkg pull` for packages on PyPI. * Fixed `format` in `save_args` for `SparkHiveDataSet`, previously it didn't allow you to save it as delta format. +* Updated error message for `VersionNotFoundError` to handle insufficient permission issues for cloud storage. ## Minor breaking changes to the API diff --git a/kedro/io/core.py b/kedro/io/core.py index 9765e0baea..fc6dea587c 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -538,9 +538,16 @@ def _fetch_latest_load_version(self) -> str: most_recent = next( (path for path in version_paths if self._exists_function(path)), None ) - + protocol = getattr(self, "_protocol", None) if not most_recent: - raise VersionNotFoundError(f"Did not find any versions for {self}") + if protocol in CLOUD_PROTOCOLS: + message = ( + f"Did not find any versions for {self}. This could be " + f"due to insufficient permission." 
+ ) + else: + message = f"Did not find any versions for {self}" + raise VersionNotFoundError(message) return PurePath(most_recent).parent.name diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index f1c7b608ea..76e18dcae3 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -19,7 +19,7 @@ LambdaDataSet, MemoryDataSet, ) -from kedro.io.core import VERSION_FORMAT, generate_timestamp +from kedro.io.core import VERSION_FORMAT, Version, generate_timestamp @pytest.fixture @@ -652,3 +652,14 @@ def test_replacing_nonword_characters(self): assert "ds2_spark" in catalog.datasets.__dict__ assert "ds3__csv" in catalog.datasets.__dict__ assert "jalapeño" in catalog.datasets.__dict__ + + def test_no_versions_with_cloud_protocol(self): + """Check the error if no versions are available for load from cloud storage""" + version = Version(load=None, save=None) + versioned_dataset = CSVDataSet("s3://bucket/file.csv", version=version) + pattern = re.escape( + f"Did not find any versions for {versioned_dataset}. " + f"This could be due to insufficient permission." + ) + with pytest.raises(DataSetError, match=pattern): + versioned_dataset.load() From ce070f50346e3c172956640a6f1e70781d5244be Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+MerelTheisenQB@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:24:57 +0100 Subject: [PATCH 17/38] Update experiment tracking documentation with working examples (#1893) Signed-off-by: Merel Theisen Signed-off-by: Ahdra Merali --- RELEASE.md | 1 + docs/source/deployment/databricks.md | 2 +- docs/source/logging/experiment_tracking.md | 40 ++++++++++++++-- .../tutorial/set_up_experiment_tracking.md | 46 +++++++++++++++++-- docs/source/tutorial/visualise_pipeline.md | 2 +- requirements.txt | 3 +- 6 files changed, 81 insertions(+), 13 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index bae2e311fb..08fd3d7eb3 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -16,6 +16,7 @@ * Fixed `kedro micropkg pull` for packages on PyPI. * Fixed `format` in `save_args` for `SparkHiveDataSet`, previously it didn't allow you to save it as delta format. * Updated error message for `VersionNotFoundError` to handle insufficient permission issues for cloud storage. +* Updated Experiment Tracking docs with working examples. ## Minor breaking changes to the API diff --git a/docs/source/deployment/databricks.md b/docs/source/deployment/databricks.md index 8461f9b21e..e9b5e407e6 100644 --- a/docs/source/deployment/databricks.md +++ b/docs/source/deployment/databricks.md @@ -64,7 +64,7 @@ You should get a similar output: If you already have an active cluster with runtime version `7.1`, you can skip this step. Here is [how to find clusters in your Databricks workspace](https://docs.databricks.com/clusters/clusters-manage.html). -Follow the [Databricks official guide to create a new cluster](https://docs.databricks.com/clusters/create.html). For the purpose of this tutorial (and to minimise costs) we recommend the following settings: +Follow the [Databricks official guide to create a new cluster](https://docs.databricks.com/clusters/create-cluster.html). 
For the purpose of this tutorial (and to minimise costs) we recommend the following settings:
* Runtime: `7.1 (Scala 2.12, Spark 3.0.0)`
* Enable autoscaling: `off`
* Terminate after 120 minutes of inactivity: `on`

diff --git a/docs/source/logging/experiment_tracking.md b/docs/source/logging/experiment_tracking.md
index 58a309a8db..2d230b50f6 100644
--- a/docs/source/logging/experiment_tracking.md
+++ b/docs/source/logging/experiment_tracking.md
@@ -13,6 +13,27 @@ Experiment tracking in Kedro adds in the missing pieces and will be developed in
The following section outlines the setup within your Kedro project to enable experiment tracking. You can also refer to the [tutorial on setting up experiment tracking](../tutorial/set_up_experiment_tracking.md) for a step-by-step process to access your tracking datasets on Kedro-Viz.

## Enable experiment tracking
+
+### Set up the session store
+
+In the domain of experiment tracking, each pipeline run is considered a session. A session store records all related metadata for each pipeline run, from logged metrics to other run-related data such as timestamp, git username and branch. The session store is a [SQLite](https://www.sqlite.org/index.html) database that is generated during your first pipeline run after it has been set up in your project.
+
+To set up the session store, go to the `src/settings.py` file and add the following:
+
+```python
+from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
+from pathlib import Path
+
+SESSION_STORE_CLASS = SQLiteStore
+SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")}
+```
+
+This will specify the creation of the `SQLiteStore` under the `/data` subfolder, using the `SQLiteStore` setup from your installed Kedro-Viz plugin.
+
+Please ensure that your installed version of Kedro-Viz is at least 4.1.1. This step is crucial to enable experiment tracking features on Kedro-Viz, as the session store is the database used to serve all run data to the Kedro-Viz front-end.
+
+### Set up tracking datasets
+
Use either one of the [`tracking.MetricsDataSet`](/kedro.extras.datasets.tracking.MetricsDataSet) or [`tracking.JSONDataSet`](/kedro.extras.datasets.tracking.JSONDataSet) in your data catalog. These datasets are versioned by default to ensure a historical record is kept of the logged data.

The `tracking.MetricsDataSet` should be used for tracking numerical metrics and the `tracking.JSONDataSet` can be used for tracking any other JSON-compatible data. In Kedro-Viz these datasets will be visualised in the metadata side panel.
@@ -24,16 +45,21 @@ metrics:
```

+### Set up your nodes and pipelines to log metrics
+
Add a node that returns the data to be tracked. The `report_accuracy` node below returns metrics.
```python # nodes.py +from sklearn.metrics import accuracy_score + + +def report_accuracy(): """Node for reporting the accuracy of the predictions.""" - # Get true class index - target = np.argmax(test_y.to_numpy(), axis=1) - # Calculate accuracy of predictions - accuracy = np.sum(predictions == target) / target.shape[0] + test_y = [0, 2, 1, 3] + predictions = [0, 1, 2, 3] + accuracy = accuracy_score(test_y, predictions) # Return the accuracy of the model return {"accuracy": accuracy} ``` @@ -42,12 +68,16 @@ Add the node to your pipeline and ensure that the output name matches the name o ```python # pipeline.py +from kedro.pipeline import Pipeline, node, pipeline +from .nodes import report_accuracy + + def create_pipeline(**kwargs) -> Pipeline: return pipeline( [ node( report_accuracy, - ["example_predictions", "example_test_y"], + [], "metrics", name="report", ), diff --git a/docs/source/tutorial/set_up_experiment_tracking.md b/docs/source/tutorial/set_up_experiment_tracking.md index f5c4cb2f66..af8b49e26c 100644 --- a/docs/source/tutorial/set_up_experiment_tracking.md +++ b/docs/source/tutorial/set_up_experiment_tracking.md @@ -25,7 +25,7 @@ We assume that you have already [installed Kedro](../get_started/install.md) and kedro new --starter=spaceflights ``` -Feel free to name your project as you like, but this guide will assume the project is named **Kedro Experiment Tracking Tutorial**, and that your project is in a sub-folder in your working directory that was created by `kedro new`, named `kedro-experiment-tracking-tutorial`. To keep the default names for the `repo_name` and `python_package` when prompted, press the enter key. +Feel free to name your project as you like, but this guide will assume the project is named **Kedro Experiment Tracking Tutorial**, and that your project is in a sub-folder in your working directory that was created by `kedro new`, named `kedro-experiment-tracking-tutorial`. ## Set up the session store @@ -176,21 +176,57 @@ You can now access, compare and pin your runs by toggling the `Compare runs` but ## View and compare plot data -Experiment tracking in Kedro-Viz also supports the display and comparison of plots, such as Plotly and Matplotlib. Once you have [created your plots](../tutorial/visualise_pipeline.md#visualise-charts-in-kedro-viz) in your kedro-project, you must set the versioned flag to `true` within the project catalog to include them in experiment tracking. +From Kedro-Viz version 5.0.0 experiment tracking also supports the display and comparison of plots, such as Plotly and Matplotlib. +Add a new node to the `data_processing` nodes (`src/kedro-experiment-tracking-tutorial/pipelines/data_processing/nodes.py`): + +```python +import matplotlib.pyplot as plt +import seaborn as sn + + +def create_confusion_matrix(companies: pd.DataFrame): + actuals = [0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1] + predicted = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1] + data = {"y_Actual": actuals, "y_Predicted": predicted} + df = pd.DataFrame(data, columns=["y_Actual", "y_Predicted"]) + confusion_matrix = pd.crosstab( + df["y_Actual"], df["y_Predicted"], rownames=["Actual"], colnames=["Predicted"] + ) + sn.heatmap(confusion_matrix, annot=True) + return plt +``` + +> You might have to execute `pip install seaborn` if the [seaborn library](https://seaborn.pydata.org/) is not installed yet. 
+ +And now add this node to the `data_processing` pipeline (`src/kedro-experiment-tracking-tutorial/pipelines/data_processing/pipeline.py`) + +```python +node( + func=create_confusion_matrix, + inputs="companies", + outputs="confusion_matrix", +), ``` + +In the catalog add the `confusion_matrix` data definition, making sure to set the versioned flag to `true` within the project catalog to include the plot in experiment tracking. + +```yaml # conf/base/catalog.yml -reporting.confusion_matrix: +data_processing.confusion_matrix: type: matplotlib.MatplotlibWriter - filepath: ${base_location}/08_reporting/confusion_matrix.png + filepath: data/09_tracking/confusion_matrix.png versioned: true ``` -Clicking on a plot will expand it. When in comparison view, expanding a plot will show all the plots in that view for them to be compared side-by-side. +After running the pipeline with `kedro run`, the plot will be saved and you will be able to see the plot in the experiment tracking panel when you execute `kedro viz`. Clicking on a plot will expand it. When in comparison view, expanding a plot will show all the plots in that view for them to be compared side-by-side. ![](../meta/images/expand-plot-comparison-view.gif) + +Read more about [creating plots and visualising them in Kedro viz in the visualise pipeline section.](../tutorial/visualise_pipeline.md#visualise-charts-in-kedro-viz) + ## View your metrics timeline Additionally, you can monitor the changes to metrics over time from the pipeline visualisation tab ![](../meta/images/pipeline_visualisation_icon.png). Clicking on any [MetricsDataset](https://kedro.readthedocs.io/en/stable/kedro.extras.datasets.tracking.MetricsDataSet.html) node will open a side panel displaying how the metric value has changed over time. diff --git a/docs/source/tutorial/visualise_pipeline.md b/docs/source/tutorial/visualise_pipeline.md index 871ca03934..8e038db9e8 100644 --- a/docs/source/tutorial/visualise_pipeline.md +++ b/docs/source/tutorial/visualise_pipeline.md @@ -297,7 +297,7 @@ You must also specify the output type in the `catalog.yml` file, like below. Rem ```yaml dummy_confusion_matrix: type: matplotlib.MatplotlibWriter - filepath: ${base_location}/08_reporting/dummy_confusion_matrix.png + filepath: data/08_reporting/dummy_confusion_matrix.png versioned: true ``` diff --git a/requirements.txt b/requirements.txt index f736ddf8bb..30e2386c72 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,8 @@ cookiecutter>=2.1.1, <3.0 dynaconf>=3.1.2, <4.0 fsspec>=2021.4, <=2022.7.1 gitpython~=3.0 -importlib_metadata>=3.6 # The "selectable" entry points were introduced in `importlib_metadata` 3.6 and Python 3.10. +importlib-metadata>=3.6; python_version >= '3.8' +importlib_metadata>=3.6, <5.0; python_version < '3.8' # The "selectable" entry points were introduced in `importlib_metadata` 3.6 and Python 3.10. Bandit on Python 3.7 relies on a library with `importlib_metadata` < 5.0 importlib_resources>=1.3 # The `files()` API was introduced in `importlib_resources` 1.3 and Python 3.9. 
jmespath>=0.9.5, <1.0 pip-tools~=6.5 From 4f459057d3f606f29a3179cc59fa229e8cda5b68 Mon Sep 17 00:00:00 2001 From: Yetunde Dada <43755008+yetudada@users.noreply.github.com> Date: Tue, 4 Oct 2022 15:34:24 +0100 Subject: [PATCH 18/38] Add NHS AI Lab and ReSpo.Vision to companies list (#1878) Signed-off-by: Ahdra Merali --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e067f59548..db34985a86 100644 --- a/README.md +++ b/README.md @@ -106,9 +106,11 @@ There are Kedro users across the world, who work at start-ups, major enterprises [Mosaic Data Science](https://www.youtube.com/watch?v=fCWGevB366g), [NaranjaX](https://www.youtube.com/watch?v=_0kMmRfltEQ), [NASA](https://github.com/nasa/ML-airport-taxi-out), +[NHS AI Lab](https://nhsx.github.io/skunkworks/synthetic-data-pipeline), [Open Data Science LatAm](https://www.odesla.org/), [Prediqt](https://prediqt.co/), [QuantumBlack](https://medium.com/quantumblack/introducing-kedro-the-open-source-library-for-production-ready-machine-learning-code-d1c6d26ce2cf), +[ReSpo.Vision](https://neptune.ai/customers/respo-vision), [Retrieva](https://tech.retrieva.jp/entry/2020/07/28/181414), [Roche](https://www.roche.com/), [Sber](https://www.linkedin.com/posts/seleznev-artem_welcome-to-kedros-documentation-kedro-activity-6767523561109385216-woTt), @@ -120,7 +122,8 @@ There are Kedro users across the world, who work at start-ups, major enterprises [WovenLight](https://www.wovenlight.com/) and [XP](https://youtu.be/wgnGOVNkXqU?t=2210). -Kedro has also won [Best Technical Tool or Framework for AI](https://awards.ai/the-awards/previous-awards/the-4th-ai-award-winners/) in the 2019 Awards AI competition and a merit award for the 2020 [UK Technical Communication Awards](https://uktcawards.com/announcing-the-award-winners-for-2020/). It is listed on the 2020 [ThoughtWorks Technology Radar](https://www.thoughtworks.com/radar/languages-and-frameworks/kedro) and the 2020 [Data & AI Landscape](https://mattturck.com/data2020/). +Kedro won [Best Technical Tool or Framework for AI](https://awards.ai/the-awards/previous-awards/the-4th-ai-award-winners/) in the 2019 Awards AI competition and a merit award for the 2020 [UK Technical Communication Awards](https://uktcawards.com/announcing-the-award-winners-for-2020/). It is listed on the 2020 [ThoughtWorks Technology Radar](https://www.thoughtworks.com/radar/languages-and-frameworks/kedro) and the 2020 [Data & AI Landscape](https://mattturck.com/data2020/). Kedro has received an [honorable mention in the User Experience category in Fast Company’s 2022 Innovation by Design Awards](https://www.fastcompany.com/90772252/user-experience-innovation-by-design-2022). + ## How can I cite Kedro? From 1108411999a836b6b4dccae23d2d10879118ad59 Mon Sep 17 00:00:00 2001 From: Jannic <37243923+jmholzer@users.noreply.github.com> Date: Tue, 4 Oct 2022 18:28:46 +0100 Subject: [PATCH 19/38] Document how users can use pytest instead of kedro test (#1879) * Add best_practices.md with introductory sections Signed-off-by: Jannic Holzer * Add pytest and pytest-cov sections Signed-off-by: Jannic Holzer * Add pytest-cov coverage report Signed-off-by: Jannic Holzer * Add sections on pytest-cov Signed-off-by: Jannic Holzer * Add automated_testing to index.rst Signed-off-by: Jannic Holzer * Reformat third-party library names and clean grammar. 
Signed-off-by: Jannic Holzer * Add link to virtual environment docs Signed-off-by: Jannic Holzer * Add example of good test naming Signed-off-by: Jannic Holzer * Improve link accessibility Signed-off-by: Jannic Holzer * Improve pytest docs link accessibility Signed-off-by: Jannic Holzer * Add reminder link to virtual environment docs Signed-off-by: Jannic Holzer * Fix formatting in link to coverage docs Signed-off-by: Jannic Holzer * Remove reference to /src under 'Run your tests' Signed-off-by: Jannic Holzer * Modify references to to Signed-off-by: Jannic Holzer * Fix sentence structure Signed-off-by: Jannic Holzer * Fix broken databricks doc link Signed-off-by: Jannic Holzer Signed-off-by: Jannic Holzer Signed-off-by: Ahdra Merali --- docs/source/development/automated_testing.md | 167 +++++++++++++++++++ docs/source/index.rst | 1 + 2 files changed, 168 insertions(+) create mode 100644 docs/source/development/automated_testing.md diff --git a/docs/source/development/automated_testing.md b/docs/source/development/automated_testing.md new file mode 100644 index 0000000000..e6825e1835 --- /dev/null +++ b/docs/source/development/automated_testing.md @@ -0,0 +1,167 @@ +# Automated Testing + +An important step towards achieving high code quality and maintainability in your Kedro project is the use of automated tests. Let's look at how you can set this up. +## Introduction + +Software testing is the process of checking that the code you have written fulfills its requirements. Software testing can either be **manual** or **automated**. In the context of Kedro: +- **Manual testing** is when you run part or all of your project and check that the results are what you expect. +- **Automated testing** is writing new code (using libraries called _testing frameworks_) that runs part or all of your project and automatically checks the results against what you expect. + +As a project grows larger, new code will increasingly rely on existing code. As these interdependencies grow, making changes in one part of the code base can unexpectedly break the intended functionality in another part. + +The major disadvantage of manual testing is that it is time-consuming. Manual tests are usually run once, directly after new functionality has been added. It is impractical to repeat manual tests for the entire code base each time a change is made, which means this strategy often misses breaking changes. + +The solution to this problem is automated testing. Automated testing allows many tests across the whole code base to be run in seconds, every time a new feature is added or an old one is changed. In this way, breaking changes can be discovered during development rather than in production. + +## Set up automated testing with `pytest` + +There are many testing frameworks available for Python. One of the most popular is `pytest` (see the [project's home page](https://docs.pytest.org/en/7.1.x/) for a quick overview). `pytest` is often used in Python projects for its short, readable tests and powerful set of features. + +Let's look at how you can start working with `pytest` in your Kedro project. + +### Install `pytest` + +Install `pytest` as you would install other packages with `pip`, making sure your project's virtual environment is active. If you're unfamiliar with virtual environments, see our [docs page on the subject](https://kedro.readthedocs.io/en/stable/get_started/prerequisites.html#virtual-environments) for a quick primer. 
+
+```bash
+pip install pytest
+```
+
+### Create a `/tests` directory
+
+Now that `pytest` is installed, you will need a place to put your tests. Create a `/tests` folder in the `/src` directory of your project.
+
+```bash
+mkdir /src/tests
+```
+
+### Test directory structure
+
+The subdirectories in your project's `/tests` directory should mirror the directory structure of your project's `/src/<package_name>/` directory. All files in the `/tests` folder should be named `test_<file_name>.py`. See an example `/src` folder below.
+
+```
+src
+│   ...
+└───<package_name>
+│   └───pipelines
+│       └───dataprocessing
+│       │   ...
+│       │   nodes.py
+│       │   ...
+│
+└───tests
+│   └───pipelines
+│       └───dataprocessing
+│       │   ...
+│       │   test_nodes.py
+│       │   ...
+```
+
+### Create an example test
+
+Now that you have a place to put your tests, you can create an example test in the new file `/src/tests/test_run.py`. The example test simply checks that the `project_path` attribute of a specially-defined `KedroContext` object has been correctly set.
+
+```python
+from pathlib import Path
+
+import pytest
+from kedro.config import ConfigLoader
+from kedro.framework.context import KedroContext
+from kedro.framework.hooks import _create_hook_manager
+
+
+@pytest.fixture
+def config_loader():
+    return ConfigLoader(conf_source=str(Path.cwd()))
+
+
+@pytest.fixture
+def project_context(config_loader):
+    return KedroContext(
+        package_name=<package_name>,
+        project_path=Path.cwd(),
+        config_loader=config_loader,
+        hook_manager=_create_hook_manager(),
+    )
+
+
+class TestProjectContext:
+    def test_project_path(self, project_context):
+        assert project_context.project_path == Path.cwd()
+```
+
+This test is redundant, but it introduces a few of `pytest`'s core features and demonstrates the layout of a test file:
+- [Fixtures](https://docs.pytest.org/en/7.1.x/explanation/fixtures.html#about-fixtures) are used to define resources used in tests.
+- Tests are implemented in methods or functions beginning with `test_` and classes beginning with `Test`.
+- The `assert` statement is used to compare the result of the test with an expected value.
+
+Tests should be named as descriptively as possible, especially if you are working with other people. For example, it is easier to understand the purpose of a test with the name `test_node_passes_with_valid_input` than a test with the name `test_passes`.
+
+You can read more about the [basics of using `pytest` on the getting started page](https://docs.pytest.org/en/7.1.x/getting-started.html). For help writing your own tests and using all of the features of `pytest`, see the [project documentation](https://docs.pytest.org/).
+
+### Run your tests
+
+To run your tests, run `pytest` from within your project's root directory.
+
+```bash
+cd <project_root>
+pytest
+```
+
+If you created the example test in the previous section, you should see the following output in your shell.
+
+```
+============================= test session starts ==============================
+...
+collected 1 item
+
+src/tests/test_run.py .                                                  [100%]
+
+============================== 1 passed in 0.38s ===============================
+```
+
+This output indicates that one test ran successfully in the file `src/tests/test_run.py`.
+
+## Add test coverage reports with `pytest-cov`
+
+It can be useful to see how much of your project is covered by tests. For this, you can install and configure the [`pytest-cov`](https://pypi.org/project/pytest-cov/) plugin for `pytest`, which is based on the popular [`coverage.py` library](https://coverage.readthedocs.io/).
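+
+Once the plugin is installed (covered in the next section), a minimal way to try it out, assuming your source code lives under `src/` as in the default Kedro project template, is to pass the coverage options straight to `pytest` on the command line:
+
+```bash
+# run the test suite and print per-file coverage, listing the line numbers not yet tested
+pytest --cov src/ --cov-report term-missing
+```
+
+This is equivalent to the `addopts` configuration shown below, without persisting anything to `pyproject.toml`.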
+
+### Install `pytest-cov`
+
+Install `pytest-cov` as you would install other packages with `pip`, making sure your project's virtual environment is active (see our [docs page on virtual environments](https://kedro.readthedocs.io/en/stable/get_started/prerequisites.html#virtual-environments)).
+
+```bash
+pip install pytest-cov
+```
+
+### Configure `pytest` to use `pytest-cov`
+
+To configure `pytest` to generate a coverage report using `pytest-cov`, you can add the following lines to your `/pyproject.toml` file (creating it if it does not exist).
+
+```toml
+[tool.pytest.ini_options]
+addopts = """
+--cov-report term-missing \
+--cov src/ -ra"""
+```
+
+### Run `pytest` with `pytest-cov`
+
+Running `pytest` in the spaceflights starter with `pytest-cov` installed results in the following additional report.
+
+```
+Name                                                     Stmts   Miss  Cover   Missing
+--------------------------------------------------------------------------------------
+src/spaceflights/__init__.py                                 1      1     0%   4
+src/spaceflights/__main__.py                                30     30     0%   4-47
+src/spaceflights/pipeline_registry.py                        7      7     0%   2-16
+src/spaceflights/pipelines/__init__.py                       0      0   100%
+src/spaceflights/pipelines/data_processing/__init__.py      1      1     0%   3
+src/spaceflights/pipelines/data_processing/nodes.py         25     25     0%   1-67
+src/spaceflights/pipelines/data_processing/pipeline.py       5      5     0%   1-8
+src/spaceflights/pipelines/data_science/__init__.py         1      1     0%   3
+src/spaceflights/pipelines/data_science/nodes.py            20     20     0%   1-55
+src/spaceflights/pipelines/data_science/pipeline.py          8      8     0%   1-40
+src/spaceflights/settings.py                                 0      0   100%
+--------------------------------------------------------------------------------------
+TOTAL                                                       98     98     0%
+```
+
+This is the simplest report that `coverage.py` (via `pytest-cov`) will produce. It gives an overview of how many of the executable statements in each project file are covered by tests. For detail on the full set of features offered, see the [`coverage.py` docs](https://coverage.readthedocs.io/).

diff --git a/docs/source/index.rst b/docs/source/index.rst
index a830a4fb32..634da07d6b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -145,6 +145,7 @@ Welcome to Kedro's documentation!
    development/set_up_pycharm
    development/commands_reference
    development/debugging
+   development/automated_testing

 .. toctree::
    :maxdepth: 2

From f0a6f9fa50d6880fd524136598717ee26b539ffc Mon Sep 17 00:00:00 2001
From: Yash Agrawal <96697569+yash6318@users.noreply.github.com>
Date: Thu, 6 Oct 2022 12:01:33 +0530
Subject: [PATCH 20/38] Capitalise Kedro-Viz in the "Visualize layers" section (#1899)

* Capitalised kedro-viz

Signed-off-by: yash6318

* capitalised Kedro viz

Signed-off-by: yash6318

* Updated set_up_experiment_tracking.md

Co-authored-by: Deepyaman Datta
Signed-off-by: yash6318

Signed-off-by: yash6318
Co-authored-by: Deepyaman Datta
Signed-off-by: Ahdra Merali
---
 docs/source/tutorial/set_up_experiment_tracking.md | 2 +-
 docs/source/tutorial/visualise_pipeline.md         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/tutorial/set_up_experiment_tracking.md b/docs/source/tutorial/set_up_experiment_tracking.md
index af8b49e26c..54b65fc2b6 100644
--- a/docs/source/tutorial/set_up_experiment_tracking.md
+++ b/docs/source/tutorial/set_up_experiment_tracking.md
@@ -225,7 +225,7 @@ After running the pipeline with `kedro run`, the plot will be saved and you will

 ![](../meta/images/expand-plot-comparison-view.gif)

-Read more about [creating plots and visualising them in Kedro viz in the visualise pipeline section.](../tutorial/visualise_pipeline.md#visualise-charts-in-kedro-viz)
+Read more about [creating plots and visualising them in Kedro-Viz in the visualise pipeline section.](../tutorial/visualise_pipeline.md#visualise-charts-in-kedro-viz)

 ## View your metrics timeline

diff --git a/docs/source/tutorial/visualise_pipeline.md b/docs/source/tutorial/visualise_pipeline.md
index 8e038db9e8..8d28ce83c1 100644
--- a/docs/source/tutorial/visualise_pipeline.md
+++ b/docs/source/tutorial/visualise_pipeline.md
@@ -74,7 +74,7 @@ regressor:
   layer: models
 ```

-Run kedro-viz again with `kedro viz` and observe how your visualisation has changed to indicate the layers:
+Run Kedro-Viz again with `kedro viz` and observe how your visualisation has changed to indicate the layers:

 ![](../meta/images/pipeline_visualisation_with_layers.png)

From 6fa2048023ce4c63bbb092308fb18c6b9df74484 Mon Sep 17 00:00:00 2001
From: Merel Theisen <49397448+MerelTheisenQB@users.noreply.github.com>
Date: Thu, 6 Oct 2022 13:26:38 +0100
Subject: [PATCH 21/38] Fix linting on automated test page (#1906)

Signed-off-by: Merel Theisen
Signed-off-by: Ahdra Merali
---
 docs/source/development/automated_testing.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/development/automated_testing.md b/docs/source/development/automated_testing.md
index e6825e1835..9f49c6a833 100644
--- a/docs/source/development/automated_testing.md
+++ b/docs/source/development/automated_testing.md
@@ -9,7 +9,7 @@ Software testing is the process of checking that the code you have written fulfi

 As a project grows larger, new code will increasingly rely on existing code. As these interdependencies grow, making changes in one part of the code base can unexpectedly break the intended functionality in another part.

-The major disadvantage of manual testing is that it is time-consuming. Manual tests are usually run once, directly after new functionality has been added.
It is impractical to repeat manual tests for the entire code base each time a change is made, which means this strategy often misses breaking changes. The solution to this problem is automated testing. Automated testing allows many tests across the whole code base to be run in seconds, every time a new feature is added or an old one is changed. In this way, breaking changes can be discovered during development rather than in production. @@ -48,7 +48,7 @@ src │ │ ... │ │ nodes.py │ │ ... -│ +│ └───tests │ └───pipelines │ └───dataprocessing From ed06e8d1992746c1b7f428a1aa6a92f0c7826ffa Mon Sep 17 00:00:00 2001 From: Carla Vieira Date: Fri, 7 Oct 2022 05:38:20 -0300 Subject: [PATCH 22/38] Add _SINGLE_PROCESS property to CachedDataSet (#1905) Signed-off-by: Carla Vieira Signed-off-by: Ahdra Merali --- kedro/io/cached_dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kedro/io/cached_dataset.py b/kedro/io/cached_dataset.py index 9d926da02b..3a54727f34 100644 --- a/kedro/io/cached_dataset.py +++ b/kedro/io/cached_dataset.py @@ -27,6 +27,11 @@ class CachedDataSet(AbstractDataSet): class as shown above. """ + # this dataset cannot be used with ``ParallelRunner``, + # therefore it has the attribute ``_SINGLE_PROCESS = True`` + # for parallelism please consider ``ThreadRunner`` instead + _SINGLE_PROCESS = True + def __init__( self, dataset: Union[AbstractDataSet, Dict], From 6de1e01998da1d5a3811afd7f1c90873de71d0d8 Mon Sep 17 00:00:00 2001 From: Kuan Tung Date: Fri, 7 Oct 2022 17:56:51 +0200 Subject: [PATCH 23/38] Update the tutorial of "Visualise pipelines" (#1913) * Change a file extention to match the previous article Signed-off-by: dinotuku * Add a missing import Signed-off-by: dinotuku * Change both preprocessed datasets to parquet files Signed-off-by: dinotuku * Change data type to ParquetDataSet for parquet files Signed-off-by: dinotuku * Add a note for installing seaborn if it is not installed Signed-off-by: dinotuku Signed-off-by: dinotuku Signed-off-by: Ahdra Merali --- docs/source/tutorial/visualise_pipeline.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/source/tutorial/visualise_pipeline.md b/docs/source/tutorial/visualise_pipeline.md index 8d28ce83c1..a742cc90e4 100644 --- a/docs/source/tutorial/visualise_pipeline.md +++ b/docs/source/tutorial/visualise_pipeline.md @@ -53,18 +53,18 @@ shuttles: layer: raw preprocessed_companies: - type: pandas.CSVDataSet - filepath: data/02_intermediate/preprocessed_companies.csv + type: pandas.ParquetDataSet + filepath: data/02_intermediate/preprocessed_companies.pq layer: intermediate preprocessed_shuttles: - type: pandas.CSVDataSet - filepath: data/02_intermediate/preprocessed_shuttles.csv + type: pandas.ParquetDataSet + filepath: data/02_intermediate/preprocessed_shuttles.pq layer: intermediate model_input_table: - type: pandas.CSVDataSet - filepath: data/03_primary/model_input_table.csv + type: pandas.ParquetDataSet + filepath: data/03_primary/model_input_table.pq layer: primary regressor: @@ -180,6 +180,7 @@ The below functions can be added to the `nodes.py` and `pipeline.py` files respe ```python # nodes.py import plotly.express as px +import plotly.graph_objs as go import pandas as pd # the below function uses plotly.express @@ -277,6 +278,8 @@ def create_confusion_matrix(companies: pd.DataFrame): return plt ``` +> You might have to execute `pip install seaborn` if the [seaborn library](https://seaborn.pydata.org/) is not installed yet. 
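+
+A minimal sketch of doing that while also recording the dependency for collaborators (the `~=0.12` pin below is illustrative rather than from this tutorial; `src/requirements.txt` is where a default Kedro project declares its dependencies):
+
+```bash
+# add a pin for seaborn, then install everything the project declares
+echo "seaborn~=0.12" >> src/requirements.txt
+pip install -r src/requirements.txt
+```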
+ ```python # pipeline.py def create_pipeline(**kwargs) -> Pipeline: From 5de89d59b95984a2aa315a24a60b5b2a822eecc3 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Mon, 10 Oct 2022 15:54:12 +0100 Subject: [PATCH 24/38] Document how users can use linting tools instead of `kedro lint` (#1904) * Add documentation for linting tools Signed-off-by: Ankita Katiyar * Revert changes to commands_reference.md Signed-off-by: Ankita Katiyar * Update linting docs with suggestions Signed-off-by: Ankita Katiyar * Update linting doc Signed-off-by: Ankita Katiyar Signed-off-by: Ankita Katiyar Signed-off-by: Ahdra Merali --- docs/source/development/linting.md | 99 ++++++++++++++++++++++++++++++ docs/source/index.rst | 1 + 2 files changed, 100 insertions(+) create mode 100644 docs/source/development/linting.md diff --git a/docs/source/development/linting.md b/docs/source/development/linting.md new file mode 100644 index 0000000000..545cf679e2 --- /dev/null +++ b/docs/source/development/linting.md @@ -0,0 +1,99 @@ +# Linting + +## Introduction +Linting tools are used to improve the quality of your code by checking for errors and making sure it meets a stylistic +standard before it is run. As a project grows and goes through various stages of development, it becomes important to +maintain the code quality. Linting makes your code more readable, easy to debug and maintain, and stylistically +consistent. + +## Set up linting tools +There are a variety of linting tools available to use with your Kedro projects. This guide shows you how to use +[`black`](https://github.com/psf/black), [`flake8`](https://gitlab.com/pycqa/flake8), and +[`isort`](https://github.com/PyCQA/isort) to lint your Kedro projects. +- **`black`** is a [PEP 8](https://peps.python.org/pep-0008/) compliant opinionated Python code formatter. `black` can +check for styling inconsistencies and reformat your files in place. +[You can read more in the `black` documentation](https://black.readthedocs.io/en/stable/). +- **`flake8`** is a wrapper around [`pep8`](https://pypi.org/project/pep8/), +[`pyflakes`](https://pypi.org/project/pyflakes/), and [`mccabe`](https://pypi.org/project/mccabe/) which can flag +programming errors and coding style inconsistencies with respect to [PEP 8](https://peps.python.org/pep-0008/), +and check the cyclomatic complexity of your code base. +[You can read more in the `flake8` documentation](https://flake8.pycqa.org/en/latest/). +- **`isort`** is a Python library used to sort imports alphabetically and automatically separate them into sections by +type. [You can read more in the `isort` documentation](https://pycqa.github.io/isort/). 
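+
+To make the division of labour concrete, here is a small before-and-after sketch. The module is hypothetical and not taken from any Kedro starter; it only illustrates the kind of rewrites `isort` and `black` apply (and the spacing that `flake8` would otherwise flag):
+
+```python
+# before: third-party import listed ahead of the standard library, non-PEP 8 spacing
+import pandas as pd
+import os
+def load_and_clean( path ):
+    return pd.read_csv( os.path.expanduser(path) ).dropna()
+```
+
+```python
+# after `isort` and `black`: import groups sorted and separated,
+# two blank lines before the top-level function, normalised spacing
+import os
+
+import pandas as pd
+
+
+def load_and_clean(path):
+    return pd.read_csv(os.path.expanduser(path)).dropna()
+```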
+ +### Install linting tools +You can install `black`, `flake8`, and `isort` by adding the following lines to your project's `src/requirements.txt` +file: +```text +black # Used for formatting code +flake8 # Used for linting code +isort # Used for linting code +``` +To install all the project-specific dependencies, including the linting tools, navigate to the root directory of the +project and run: +```bash +pip install -r src/requirements.txt +``` +Alternatively, you can individually install the linting tools using the following shell commands: +```bash +pip install black +pip install flake8 +pip install isort +``` + +### Run linting tools +Use the following commands to run lint checks: +```bash +black --check +flake8 +isort --profile black --check +``` +You can also have `black` and `isort` automatically format your code by omitting the `--check` flag. Since `isort` and +`black` both format your imports, adding `--profile black` to the `isort` run helps avoid potential conflicts. + +## Automating linting with `pre-commit` hooks + +You can automate linting by using [`pre-commit`](https://github.com/pre-commit/pre-commit) hooks. +These hooks are run before committing your code to your repositories to automatically point out formatting issues, +making code reviews easier and less time-consuming. + +### Install `pre-commit` +You can install `pre-commit` along with other dependencies by including it in the `src/requirements.txt` file of your +Kedro project by adding the following line: +```text +pre-commit +``` +You can also install `pre-commit` using the following command: +```bash +pip install pre-commit +``` +### Add `pre-commit` configuration file +Create a file named `.pre-commit-config.yaml` in your Kedro project root directory. You can add entries for the hooks +you want to run before each `commit`. +Below is a sample `YAML` file with entries for `black`,`flake8`, and `isort`: +```yaml +repos: + - repo: https://github.com/pycqa/isort + rev: 5.10.1 + hooks: + - id: isort + name: isort (python) + args: ["--profile", "black"] + + - repo: https://github.com/pycqa/flake8 + rev: '' # pick a git hash / tag to point to + hooks: + - id: flake8 + + - repo: https://github.com/psf/black + rev: 22.8.0 + hooks: + - id: black + language_version: python3.9 +``` +### Install git hook scripts +Run the following command to complete installation: +```bash +pre-commit install +``` +This enables `pre-commit` hooks to run automatically every time you execute `git commit`. diff --git a/docs/source/index.rst b/docs/source/index.rst index 634da07d6b..f2ef351b37 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -146,6 +146,7 @@ Welcome to Kedro's documentation! development/commands_reference development/debugging development/automated_testing + development/linting .. 
toctree:: :maxdepth: 2 From 72d9b967ff515ee4e851078f2ea917e10bb350d5 Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+MerelTheisenQB@users.noreply.github.com> Date: Tue, 11 Oct 2022 13:41:57 +0100 Subject: [PATCH 25/38] Make core config accessible in dict get way (#1870) Signed-off-by: Merel Theisen Signed-off-by: Ahdra Merali --- .circleci/continue_config.yml | 2 +- RELEASE.md | 2 ++ docs/conf.py | 11 ++++++ .../kedro_project_setup/configuration.md | 14 ++++---- kedro/config/abstract_config.py | 10 ++---- kedro/config/config.py | 31 ++++++++++++----- kedro/config/templated_config.py | 16 +++++++-- kedro/framework/context/context.py | 11 ++---- kedro/framework/session/session.py | 4 +-- tests/config/test_config.py | 34 +++++++++++++++++++ tests/config/test_templated_config.py | 9 +++++ 11 files changed, 108 insertions(+), 36 deletions(-) diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index 6b2bf81fd7..401c282a8e 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -371,7 +371,7 @@ jobs: docs_linkcheck: executor: name: docker - python_version: "3.7" + python_version: "3.8" steps: - setup - run: diff --git a/RELEASE.md b/RELEASE.md index 08fd3d7eb3..c7af457002 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -11,6 +11,8 @@ # Upcoming Release 0.18.4 ## Major features and improvements +* The config loader objects now implement `UserDict` and the configuration is accessed through `conf_loader['catalog']` +* You can configure config file patterns through `settings.py` without creating a custom config loader ## Bug fixes and other changes * Fixed `kedro micropkg pull` for packages on PyPI. diff --git a/docs/conf.py b/docs/conf.py index 80da0bed65..c917366d63 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -143,6 +143,17 @@ "pluggy._manager.PluginManager", "_DI", "_DO", + # The statements below were added after subclassing UserDict in AbstractConfigLoader. + "None. Remove all items from D.", + "a shallow copy of D", + "a set-like object providing a view on D's items", + "a set-like object providing a view on D's keys", + "v, remove specified key and return the corresponding value.", + "None. Update D from dict/iterable E and F.", + "an object providing a view on D's values", + "(k, v), remove and return some (key, value) pair", + "D.get(k,d), also set D[k]=d if k not in D", + "None. 
Update D from mapping/iterable E and F.", ), "py:data": ( "typing.Any", diff --git a/docs/source/kedro_project_setup/configuration.md b/docs/source/kedro_project_setup/configuration.md index 252ca87fb7..4d9e89cdaa 100644 --- a/docs/source/kedro_project_setup/configuration.md +++ b/docs/source/kedro_project_setup/configuration.md @@ -19,7 +19,7 @@ from kedro.framework.project import settings conf_path = str(project_path / settings.CONF_SOURCE) conf_loader = ConfigLoader(conf_source=conf_path, env="local") -conf_catalog = conf_loader.get("catalog*", "catalog*/**") +conf_catalog = conf_loader["catalog"] ``` This recursively scans for configuration files firstly in the `conf/base/` (`base` being the default environment) and then in the `conf/local/` (`local` being the designated overriding environment) directory according to the following rules: @@ -180,7 +180,7 @@ from kedro.framework.project import settings conf_path = str(project_path / settings.CONF_SOURCE) conf_loader = ConfigLoader(conf_source=conf_path, env="local") -parameters = conf_loader.get("parameters*", "parameters*/**") +parameters = conf_loader["parameters"] ``` This will load configuration files from any subdirectories in `conf` that have a filename starting with `parameters`, or are located inside a folder with name starting with `parameters`. @@ -189,7 +189,7 @@ This will load configuration files from any subdirectories in `conf` that have a Since `local` is set as the environment, the configuration path `conf/local` takes precedence in the example above. Hence any overlapping top-level keys from `conf/base` will be overwritten by the ones from `conf/local`. ``` -Calling `conf_loader.get()` in the example above will throw a `MissingConfigException` error if no configuration files match the given patterns in any of the specified paths. If this is a valid workflow for your application, you can handle it as follows: +Calling `conf_loader[key]` in the example above will throw a `MissingConfigException` error if no configuration files match the given key. If this is a valid workflow for your application, you can handle it as follows: ```python from kedro.config import ConfigLoader, MissingConfigException @@ -199,7 +199,7 @@ conf_path = str(project_path / settings.CONF_SOURCE) conf_loader = ConfigLoader(conf_source=conf_path, env="local") try: - parameters = conf_loader.get("parameters*", "parameters*/**", "**/parameters*") + parameters = conf_loader["parameters"] except MissingConfigException: parameters = {} ``` @@ -315,7 +315,7 @@ from kedro.framework.project import settings conf_path = str(project_path / settings.CONF_SOURCE) conf_loader = ConfigLoader(conf_source=conf_path, env="local") -credentials = conf_loader.get("credentials*", "credentials*/**") +credentials = conf_loader["credentials"] ``` This will load configuration files from `conf/base` and `conf/local` whose filenames start with `credentials`, or that are located inside a folder with a name that starts with `credentials`. @@ -324,7 +324,7 @@ This will load configuration files from `conf/base` and `conf/local` whose filen Since `local` is set as the environment, the configuration path `conf/local` takes precedence in the example above. Hence, any overlapping top-level keys from `conf/base` will be overwritten by the ones from `conf/local`. ``` -Calling `conf_loader.get()` in the example above throws a `MissingConfigException` error if no configuration files match the given patterns in any of the specified paths. 
If this is a valid workflow for your application, you can handle it as follows: +Calling `conf_loader[key]` in the example above throws a `MissingConfigException` error if no configuration files match the given key. If this is a valid workflow for your application, you can handle it as follows: ```python from kedro.config import ConfigLoader, MissingConfigException @@ -334,7 +334,7 @@ conf_path = str(project_path / settings.CONF_SOURCE) conf_loader = ConfigLoader(conf_source=conf_path, env="local") try: - credentials = conf_loader.get("credentials*", "credentials*/**") + credentials = conf_loader["credentials"] except MissingConfigException: credentials = {} ``` diff --git a/kedro/config/abstract_config.py b/kedro/config/abstract_config.py index 18994578ae..4174917a0d 100644 --- a/kedro/config/abstract_config.py +++ b/kedro/config/abstract_config.py @@ -1,11 +1,11 @@ """This module provides ``kedro.abstract_config`` with the baseline class model for a `ConfigLoader` implementation. """ -from abc import ABC, abstractmethod +from collections import UserDict from typing import Any, Dict -class AbstractConfigLoader(ABC): +class AbstractConfigLoader(UserDict): """``AbstractConfigLoader`` is the abstract base class for all `ConfigLoader` implementations. All user-defined `ConfigLoader` implementations should inherit @@ -19,15 +19,11 @@ def __init__( runtime_params: Dict[str, Any] = None, **kwargs # pylint: disable=unused-argument ): + super().__init__() self.conf_source = conf_source self.env = env self.runtime_params = runtime_params - @abstractmethod # pragma: no cover - def get(self) -> Dict[str, Any]: - """Required method to get all configurations.""" - pass - class BadConfigException(Exception): """Raised when a configuration file cannot be loaded, for instance diff --git a/kedro/config/config.py b/kedro/config/config.py index 2665a4b60c..b32e7bb036 100644 --- a/kedro/config/config.py +++ b/kedro/config/config.py @@ -2,7 +2,7 @@ or more configuration files from specified paths. """ from pathlib import Path -from typing import Any, Dict, Iterable +from typing import Any, Dict, Iterable, List from kedro.config import AbstractConfigLoader from kedro.config.common import _get_config_from_patterns, _remove_duplicates @@ -56,11 +56,11 @@ class ConfigLoader(AbstractConfigLoader): >>> conf_path = str(project_path / settings.CONF_SOURCE) >>> conf_loader = ConfigLoader(conf_source=conf_path, env="local") >>> - >>> conf_logging = conf_loader.get('logging*') + >>> conf_logging = conf_loader["logging"] >>> logging.config.dictConfig(conf_logging) # set logging conf >>> - >>> conf_catalog = conf_loader.get('catalog*', 'catalog*/**') - >>> conf_params = conf_loader.get('**/parameters.yml') + >>> conf_catalog = conf_loader["catalog"] + >>> conf_params = conf_loader["parameters"] """ @@ -69,6 +69,7 @@ def __init__( conf_source: str, env: str = None, runtime_params: Dict[str, Any] = None, + config_patterns: Dict[str, List[str]] = None, *, base_env: str = "base", default_run_env: str = "local", @@ -86,18 +87,32 @@ def __init__( This is used in the `conf_paths` property method to construct the configuration paths. Can be overriden by supplying the `env` argument. 
""" - super().__init__( - conf_source=conf_source, env=env, runtime_params=runtime_params - ) self.base_env = base_env self.default_run_env = default_run_env + self.config_patterns = { + "catalog": ["catalog*", "catalog*/**", "**/catalog*"], + "parameters": ["parameters*", "parameters*/**", "**/parameters*"], + "credentials": ["credentials*", "credentials*/**", "**/credentials*"], + "logging": ["logging*", "logging*/**", "**/logging*"], + } + self.config_patterns.update(config_patterns or {}) + + super().__init__( + conf_source=conf_source, + env=env, + runtime_params=runtime_params, + ) + + def __getitem__(self, key): + return self.get(*self.config_patterns[key]) + @property def conf_paths(self): """Property method to return deduplicated configuration paths.""" return _remove_duplicates(self._build_conf_paths()) - def get(self, *patterns: str) -> Dict[str, Any]: + def get(self, *patterns: str) -> Dict[str, Any]: # type: ignore return _get_config_from_patterns( conf_paths=self.conf_paths, patterns=list(patterns) ) diff --git a/kedro/config/templated_config.py b/kedro/config/templated_config.py index 07cda9f335..f8b555eef9 100644 --- a/kedro/config/templated_config.py +++ b/kedro/config/templated_config.py @@ -5,7 +5,7 @@ import re from copy import deepcopy from pathlib import Path -from typing import Any, Dict, Iterable, Optional +from typing import Any, Dict, Iterable, List, Optional import jmespath @@ -92,6 +92,7 @@ def __init__( conf_source: str, env: str = None, runtime_params: Dict[str, Any] = None, + config_patterns: Dict[str, List[str]] = None, *, base_env: str = "base", default_run_env: str = "local", @@ -114,6 +115,14 @@ def __init__( obtained from the globals_pattern. In case of duplicate keys, the ``globals_dict`` keys take precedence. """ + self.config_patterns = { + "catalog": ["catalog*", "catalog*/**", "**/catalog*"], + "parameters": ["parameters*", "parameters*/**", "**/parameters*"], + "credentials": ["credentials*", "credentials*/**", "**/credentials*"], + "logging": ["logging*", "logging*/**", "**/logging*"], + } + self.config_patterns.update(config_patterns or {}) + super().__init__( conf_source=conf_source, env=env, runtime_params=runtime_params ) @@ -132,12 +141,15 @@ def __init__( globals_dict = deepcopy(globals_dict) or {} self._config_mapping = {**self._config_mapping, **globals_dict} + def __getitem__(self, key): + return self.get(*self.config_patterns[key]) + @property def conf_paths(self): """Property method to return deduplicated configuration paths.""" return _remove_duplicates(self._build_conf_paths()) - def get(self, *patterns: str) -> Dict[str, Any]: + def get(self, *patterns: str) -> Dict[str, Any]: # type: ignore """Tries to resolve the template variables in the config dictionary provided by the ``ConfigLoader`` (super class) ``get`` method using the dictionary of replacement values obtained in the ``__init__`` method. diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 5941c38d9f..df7ad635ab 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -240,10 +240,7 @@ def params(self) -> Dict[str, Any]: extra parameters passed at initialization. 
""" try: - # '**/parameters*' reads modular pipeline configs - params = self.config_loader.get( - "parameters*", "parameters*/**", "**/parameters*" - ) + params = self.config_loader["parameters"] except MissingConfigException as exc: warn(f"Parameters not found in your Kedro project config.\n{str(exc)}") params = {} @@ -275,7 +272,7 @@ def _get_catalog( """ # '**/catalog*' reads modular pipeline configs - conf_catalog = self.config_loader.get("catalog*", "catalog*/**", "**/catalog*") + conf_catalog = self.config_loader["catalog"] # turn relative paths in conf_catalog into absolute paths # before initializing the catalog conf_catalog = _convert_paths_to_absolute_posix( @@ -337,9 +334,7 @@ def _add_param_to_feed_dict(param_name, param_value): def _get_config_credentials(self) -> Dict[str, Any]: """Getter for credentials specified in credentials directory.""" try: - conf_creds = self.config_loader.get( - "credentials*", "credentials*/**", "**/credentials*" - ) + conf_creds = self.config_loader["credentials"] except MissingConfigException as exc: warn(f"Credentials not found in your Kedro project config.\n{str(exc)}") conf_creds = {} diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py index 9dfbfbd6f4..bd3a787fc3 100644 --- a/kedro/framework/session/session.py +++ b/kedro/framework/session/session.py @@ -182,9 +182,7 @@ def create( # pylint: disable=too-many-arguments return session def _get_logging_config(self) -> Dict[str, Any]: - logging_config = self._get_config_loader().get( - "logging*", "logging*/**", "**/logging*" - ) + logging_config = self._get_config_loader()["logging"] # turn relative paths in logging config into absolute path # before initialising loggers logging_config = _convert_paths_to_absolute_posix( diff --git a/tests/config/test_config.py b/tests/config/test_config.py index 8d0c89c9e0..0a7565b7d2 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -92,6 +92,19 @@ def proj_catalog_nested(tmp_path): class TestConfigLoader: + @use_config_dir + def test_load_core_config_dict_get(self, tmp_path): + """Make sure core config can be fetched with a dict [] access.""" + conf = ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV) + params = conf["parameters"] + catalog = conf["catalog"] + + assert params["param1"] == 1 + assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["boats"]["type"] == "MemoryDataSet" + assert not catalog["cars"]["save_args"]["index"] + @use_config_dir def test_load_local_config(self, tmp_path): """Make sure that configs from `local/` override the ones @@ -239,6 +252,27 @@ def test_no_files_found(self, tmp_path): with pytest.raises(MissingConfigException, match=pattern): ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV).get("non-existent-pattern") + @use_config_dir + def test_key_not_found_dict_get(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + with pytest.raises(KeyError): + # pylint: disable=expression-not-assigned + ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV)["non-existent-pattern"] + + @use_config_dir + def test_no_files_found_dict_get(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + pattern = ( + r"No files found in " + r"\[\'.*base\', " + r"\'.*local\'\] " + r"matching the glob pattern\(s\): " + r"\[\'credentials\*\', \'credentials\*/\**\', \'\**/credentials\*\'\]" + ) + with pytest.raises(MissingConfigException, match=pattern): + # pylint: 
disable=expression-not-assigned + ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV)["credentials"] + def test_duplicate_paths(self, tmp_path, caplog): """Check that trying to load the same environment config multiple times logs a warning and skips the reload""" diff --git a/tests/config/test_templated_config.py b/tests/config/test_templated_config.py index ca1c1b78a3..76bdf83782 100644 --- a/tests/config/test_templated_config.py +++ b/tests/config/test_templated_config.py @@ -207,6 +207,15 @@ def proj_catalog_param_with_default(tmp_path, param_config_with_default): class TestTemplatedConfigLoader: + @pytest.mark.usefixtures("proj_catalog_param") + def test_get_catalog_config_with_dict_get(self, tmp_path, template_config): + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict=template_config + ) + config_loader.default_run_env = "" + catalog = config_loader["catalog"] + assert catalog["boats"]["type"] == "SparkDataSet" + @pytest.mark.usefixtures("proj_catalog_param") def test_catalog_parameterized_w_dict(self, tmp_path, template_config): """Test parameterized config with input from dictionary with values""" From ed37d707b7265f96d03162f42671ccb189609ce5 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Wed, 12 Oct 2022 12:29:17 +0100 Subject: [PATCH 26/38] Create dependabot.yml configuration file for version updates (#1862) * Create dependabot.yml configuration file * Update dependabot.yml Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * add target-branch Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * Update dependabot.yml Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * limit dependabot to just dependency folder Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * Update test_requirements.txt Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * Update MANIFEST.in Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * fix e2e Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * Update continue_config.yml Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * Update requirements.txt Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * Update requirements.txt Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * fix link Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * revert Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * Delete requirements.txt Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> Signed-off-by: Ahdra Merali --- .circleci/continue_config.yml | 8 ++++---- .github/dependabot.yml | 15 +++++++++++++++ MANIFEST.in | 2 +- requirements.txt => dependency/requirements.txt | 0 features/environment.py | 2 +- setup.py | 2 +- test_requirements.txt | 2 +- 7 files changed, 23 insertions(+), 8 deletions(-) create mode 100644 .github/dependabot.yml rename requirements.txt => dependency/requirements.txt (100%) diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index 401c282a8e..a035109bdb 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -146,7 +146,7 @@ commands: steps: - restore_cache: name: Restore package cache - key: kedro-deps-v1-win-{{ checksum "requirements.txt" }}-{{ checksum 
"test_requirements.txt" }} + key: kedro-deps-v1-win-{{ checksum "dependency/requirements.txt" }}-{{ checksum "test_requirements.txt" }} # We don't restore the conda environment cache for python 3.10 as it conflicts with the # 'Install GDAL, Fiona and pytables' step breaking the conda environment (missing zlib.dll). - unless: @@ -155,7 +155,7 @@ commands: steps: - restore_cache: name: Restore conda environment cache - key: kedro-deps-v1-win-<>-{{ checksum "requirements.txt" }}-{{ checksum "test_requirements.txt" }} + key: kedro-deps-v1-win-<>-{{ checksum "dependency/requirements.txt" }}-{{ checksum "test_requirements.txt" }} # pytables and Fiona have a series of binary dependencies under Windows that # are best handled by conda-installing instead of pip-installing them. # Dependency resolution works best when installing these altogether in one @@ -334,7 +334,7 @@ jobs: steps: - save_cache: name: Save Python package cache - key: kedro-deps-v1-win-{{ checksum "requirements.txt" }}-{{ checksum "test_requirements.txt" }} + key: kedro-deps-v1-win-{{ checksum "dependency/requirements.txt" }}-{{ checksum "test_requirements.txt" }} paths: # Cache pip cache and conda packages directories - c:\tools\miniconda3\pkgs @@ -347,7 +347,7 @@ jobs: steps: - save_cache: name: Save conda environment cache - key: kedro-deps-v1-win-<>-{{ checksum "requirements.txt" }}-{{ checksum "test_requirements.txt" }} + key: kedro-deps-v1-win-<>-{{ checksum "dependency/requirements.txt" }}-{{ checksum "test_requirements.txt" }} paths: - c:\tools\miniconda3\envs\kedro_builder - run: diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..34ef1af16f --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,15 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. 
+# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/dependency" # Location of package manifests + schedule: + interval: "daily" + target-branch: "dependency-update" + labels: + - "dependencies" + open-pull-requests-limit: 50 # Allow up to 50 open pull requests diff --git a/MANIFEST.in b/MANIFEST.in index baeea6c63d..245671a5e0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ include README.md include LICENSE.md -include requirements.txt +include dependency/requirements.txt include test_requirements.txt include kedro/framework/project/default_logging.yml include kedro/ipython/*.png diff --git a/requirements.txt b/dependency/requirements.txt similarity index 100% rename from requirements.txt rename to dependency/requirements.txt diff --git a/features/environment.py b/features/environment.py index a0a7f40fa9..cb2f7ab3b2 100644 --- a/features/environment.py +++ b/features/environment.py @@ -56,7 +56,7 @@ def _setup_context_with_venv(context, venv_dir): context.pip = str(bin_dir / "pip") context.python = str(bin_dir / "python") context.kedro = str(bin_dir / "kedro") - context.requirements_path = Path("requirements.txt").resolve() + context.requirements_path = Path("dependency/requirements.txt").resolve() # clone the environment, remove any condas and venvs and insert our venv context.env = os.environ.copy() diff --git a/setup.py b/setup.py index fe856e635a..f909489b09 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ version = result.group(1) # get the dependencies and installs -with open("requirements.txt", encoding="utf-8") as f: +with open("dependency/requirements.txt", encoding="utf-8") as f: requires = [x.strip() for x in f if x.strip()] # get test dependencies and installs diff --git a/test_requirements.txt b/test_requirements.txt index bd09f9efc2..10a4ebfcaf 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,4 +1,4 @@ --r requirements.txt +-r dependency/requirements.txt adlfs>=2021.7.1, <=2022.2 bandit>=1.6.2, <2.0 behave==1.2.6 From 5d8bd9bdad74cdbd98277140a2118257e66df642 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Wed, 12 Oct 2022 14:23:14 +0100 Subject: [PATCH 27/38] Update dependabot config (#1928) Signed-off-by: Ahdra Merali --- .github/dependabot.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 34ef1af16f..9a8065aaa1 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -8,8 +8,7 @@ updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/dependency" # Location of package manifests schedule: - interval: "daily" + interval: "weekly" target-branch: "dependency-update" labels: - "dependencies" - open-pull-requests-limit: 50 # Allow up to 50 open pull requests From 81cf5a4159b3391082cf914edb842b32700b1f1d Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Wed, 12 Oct 2022 14:31:45 +0100 Subject: [PATCH 28/38] Update robots.txt (#1929) Signed-off-by: Ahdra Merali --- docs/robots.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/robots.txt b/docs/robots.txt index 41afdb0e0a..9bd9ee90da 100644 --- a/docs/robots.txt +++ b/docs/robots.txt @@ -2,4 +2,4 @@ User-agent: * Disallow: * Allow: 
/en/stable Allow: /en/latest -Allow: /en/0.17.* +Allow: /en/0.18.* From 0803e7478d63290faea5ca95a35ca3de9704be9c Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Tue, 18 Oct 2022 15:16:42 +0100 Subject: [PATCH 29/38] fix broken link (#1950) Signed-off-by: Ahdra Merali --- docs/source/deployment/airflow_astronomer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/deployment/airflow_astronomer.md b/docs/source/deployment/airflow_astronomer.md index 88972b218e..1dbdefc435 100644 --- a/docs/source/deployment/airflow_astronomer.md +++ b/docs/source/deployment/airflow_astronomer.md @@ -12,7 +12,7 @@ The general strategy to deploy a Kedro pipeline on Apache Airflow is to run ever To follow this tutorial, ensure you have the following: -* An Airflow cluster: you can follow [Astronomer's quickstart guide](https://docs.astronomer.io/astro/#get-started) to set one up. +* An Airflow cluster: you can follow [Astronomer's quickstart guide](https://docs.astronomer.io/astro/category/install-astro) to set one up. * The [Astro CLI installed](https://docs.astronomer.io/astro/install-cli) * `kedro>=0.17` installed From 4dfb14e159a3a146e64c5c6434d767b962540cb5 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:57:26 +0100 Subject: [PATCH 30/38] Update dependabot.yml config (#1938) * Update dependabot.yml Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * pin jupyterlab_services to requirments Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> * lint Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com> Signed-off-by: Ahdra Merali --- .github/dependabot.yml | 1 - .../{{ cookiecutter.repo_name }}/src/requirements.txt | 1 + .../project/{{ cookiecutter.repo_name }}/src/requirements.txt | 1 + test_requirements.txt | 1 + 4 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 9a8065aaa1..d13ed3ced4 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -9,6 +9,5 @@ updates: directory: "/dependency" # Location of package manifests schedule: interval: "weekly" - target-branch: "dependency-update" labels: - "dependencies" diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt index 56fcfab7ac..e2e87efa92 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt @@ -3,6 +3,7 @@ flake8>=3.7.9, <5.0 ipython>=7.31.1, <8.0 isort~=5.0 jupyter~=1.0 +jupyterlab_server>=2.11.1, <2.16.0 jupyterlab~=3.0 kedro[pandas.CSVDataSet]=={{ cookiecutter.kedro_version }} kedro-telemetry~=0.2.0 diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt index d16a18d649..e18edf9ed0 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt @@ -3,6 +3,7 @@ flake8>=3.7.9, <5.0 ipython>=7.31.1, <8.0 isort~=5.0 jupyter~=1.0 +jupyterlab_server>=2.11.1, <2.16.0 jupyterlab~=3.0 kedro~={{ cookiecutter.kedro_version }} kedro-telemetry~=0.2.0 diff --git a/test_requirements.txt b/test_requirements.txt index 
10a4ebfcaf..52305e865a 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -20,6 +20,7 @@ ipython>=7.31.1, <8.0 isort~=5.0 Jinja2<3.1.0 joblib>=0.14 +jupyterlab_server>=2.11.1, <2.16.0 # 2.16.0 requires importlib_metedata >= 4.8.3 which conflicts with flake8 requirement jupyterlab~=3.0 jupyter~=1.0 lxml~=4.6 From 3ce5dcb4f58710fb8a3aef1141af7e7b7bc981cb Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Wed, 19 Oct 2022 13:11:34 +0100 Subject: [PATCH 31/38] Update setup.py Jinja2 dependencies (#1954) Signed-off-by: Ahdra Merali --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f909489b09..df5c899fed 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,7 @@ def _collect_requirements(requires): "ipykernel>=5.3, <7.0", "sphinxcontrib-mermaid~=0.7.1", "myst-parser~=0.17.2", + "Jinja2<3.1.0", ], "geopandas": _collect_requirements(geopandas_require), "matplotlib": _collect_requirements(matplotlib_require), From 5051e1cc25aab70d672577fe46b7d321c6128070 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 19 Oct 2022 13:59:13 +0100 Subject: [PATCH 32/38] Update pip-tools requirement from ~=6.5 to ~=6.9 in /dependency (#1957) Updates the requirements on [pip-tools](https://github.com/jazzband/pip-tools) to permit the latest version. - [Release notes](https://github.com/jazzband/pip-tools/releases) - [Changelog](https://github.com/jazzband/pip-tools/blob/master/CHANGELOG.md) - [Commits](https://github.com/jazzband/pip-tools/compare/6.5.0...6.9.0) --- updated-dependencies: - dependency-name: pip-tools dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Ahdra Merali --- dependency/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependency/requirements.txt b/dependency/requirements.txt index 30e2386c72..9eb1ba531b 100644 --- a/dependency/requirements.txt +++ b/dependency/requirements.txt @@ -10,7 +10,7 @@ importlib-metadata>=3.6; python_version >= '3.8' importlib_metadata>=3.6, <5.0; python_version < '3.8' # The "selectable" entry points were introduced in `importlib_metadata` 3.6 and Python 3.10. Bandit on Python 3.7 relies on a library with `importlib_metadata` < 5.0 importlib_resources>=1.3 # The `files()` API was introduced in `importlib_resources` 1.3 and Python 3.9. jmespath>=0.9.5, <1.0 -pip-tools~=6.5 +pip-tools~=6.9 pluggy~=1.0.0 PyYAML>=4.2, <7.0 rich~=12.0 From 252fb3a29e7ffdc4b14da595ce6939a17e4b40eb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 19 Oct 2022 15:05:42 +0100 Subject: [PATCH 33/38] Update toposort requirement from ~=1.5 to ~=1.7 in /dependency (#1956) Updates the requirements on [toposort]() to permit the latest version. --- updated-dependencies: - dependency-name: toposort dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Signed-off-by: Ahdra Merali --- dependency/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependency/requirements.txt b/dependency/requirements.txt index 9eb1ba531b..3c84b01ff4 100644 --- a/dependency/requirements.txt +++ b/dependency/requirements.txt @@ -17,4 +17,4 @@ rich~=12.0 rope~=0.21.0 # subject to LGPLv3 license setuptools>=38.0 toml~=0.10 -toposort~=1.5 # Needs to be at least 1.5 to be able to raise CircularDependencyError +toposort~=1.7 # Needs to be at least 1.5 to be able to raise CircularDependencyError From 0760011ecefc56d94429ff28998de5eda7939f6d Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+MerelTheisenQB@users.noreply.github.com> Date: Wed, 19 Oct 2022 15:50:07 +0100 Subject: [PATCH 34/38] Add deprecation warning to package_name argument in session create() (#1953) Signed-off-by: Merel Theisen Signed-off-by: Ahdra Merali --- docs/source/extend_kedro/plugins.md | 3 +-- docs/source/kedro_project_setup/session.md | 6 +++--- kedro/framework/session/session.py | 9 ++++----- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/source/extend_kedro/plugins.md b/docs/source/extend_kedro/plugins.md index 2bad011c10..df31250500 100644 --- a/docs/source/extend_kedro/plugins.md +++ b/docs/source/extend_kedro/plugins.md @@ -106,8 +106,7 @@ from kedro.framework.session import KedroSession project_path = Path.cwd() -metadata = _get_project_metadata(project_path) -session = KedroSession.create(metadata.package_name, project_path) +session = KedroSession.create(project_path=project_path) context = session.load_context() ``` diff --git a/docs/source/kedro_project_setup/session.md b/docs/source/kedro_project_setup/session.md index 767afe2d22..5c498dde61 100644 --- a/docs/source/kedro_project_setup/session.md +++ b/docs/source/kedro_project_setup/session.md @@ -25,12 +25,12 @@ from kedro.framework.session import KedroSession from kedro.framework.startup import bootstrap_project from pathlib import Path -metadata = bootstrap_project(Path.cwd()) -with KedroSession.create(metadata.package_name) as session: +bootstrap_project(Path.cwd()) +with KedroSession.create() as session: session.run() ``` -You must tell `KedroSession` the package name of your Kedro project so it can load your settings, nodes and pipelines. Additionally, you can provide the following optional arguments in `KedroSession.create()`: +You can provide the following optional arguments in `KedroSession.create()`: - `project_path`: Path to the project root directory - `save_on_close`: A boolean value to indicate whether or not to save the session to disk when it's closed diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py index bd3a787fc3..68f6a0a08b 100644 --- a/kedro/framework/session/session.py +++ b/kedro/framework/session/session.py @@ -71,7 +71,7 @@ class KedroSessionError(Exception): class KedroSession: """``KedroSession`` is the object that is responsible for managing the lifecycle - of a Kedro run. Use `KedroSession.create("")` as + of a Kedro run. Use `KedroSession.create()` as a context manager to construct a new KedroSession with session data provided (see the example below). @@ -87,8 +87,8 @@ class KedroSession: >>> # If you are creating a session outside of a Kedro project (i.e. 
not using >>> # `kedro run` or `kedro jupyter`), you need to run `bootstrap_project` to >>> # let Kedro find your configuration. - >>> metadata = bootstrap_project(Path("")) - >>> with KedroSession.create(metadata.package_name) as session: + >>> bootstrap_project(Path("")) + >>> with KedroSession.create() as session: >>> session.run() """ @@ -125,7 +125,7 @@ def create( # pylint: disable=too-many-arguments Args: package_name: Package name for the Kedro project the session is - created for. + created for. The package_name argument will be removed in Kedro `0.19.0`. project_path: Path to the project root directory. Default is current working directory Path.cwd(). save_on_close: Whether or not to save the session when it's closed. @@ -138,7 +138,6 @@ def create( # pylint: disable=too-many-arguments Returns: A new ``KedroSession`` instance. """ - validate_settings() session = cls( From 35d5c286193ba15fc01ca6d7c4ea7b742098de4a Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Thu, 20 Oct 2022 11:11:19 +0100 Subject: [PATCH 35/38] Remove redundant `resolve_load_version` call (#1911) * remove a redundant function call Signed-off-by: Nok Chan * Remove redundant resolove_load_version & fix test Signed-off-by: Nok Chan * Fix HoloviewWriter tests with more specific error message pattern & Lint Signed-off-by: Nok Chan * Rename tests Signed-off-by: Nok Chan Signed-off-by: Nok Chan Signed-off-by: Ahdra Merali --- kedro/config/abstract_config.py | 2 +- kedro/io/core.py | 4 +--- tests/extras/datasets/holoviews/test_holoviews_writer.py | 6 ++++-- tests/extras/datasets/matplotlib/test_matplotlib_writer.py | 6 ++++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/kedro/config/abstract_config.py b/kedro/config/abstract_config.py index 4174917a0d..0515b96ee7 100644 --- a/kedro/config/abstract_config.py +++ b/kedro/config/abstract_config.py @@ -17,7 +17,7 @@ def __init__( conf_source: str, env: str = None, runtime_params: Dict[str, Any] = None, - **kwargs # pylint: disable=unused-argument + **kwargs ): super().__init__() self.conf_source = conf_source diff --git a/kedro/io/core.py b/kedro/io/core.py index fc6dea587c..98f2bb1d6e 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -548,7 +548,6 @@ def _fetch_latest_load_version(self) -> str: else: message = f"Did not find any versions for {self}" raise VersionNotFoundError(message) - return PurePath(most_recent).parent.name # 'key' is set to prevent cache key overlapping for load and save: @@ -601,8 +600,7 @@ def _get_save_path(self) -> PurePosixPath: def _get_versioned_path(self, version: str) -> PurePosixPath: return self._filepath / version / self._filepath.name - def load(self) -> _DO: - self.resolve_load_version() # Make sure last load version is set + def load(self) -> _DO: # pylint: disable=useless-parent-delegation return super().load() def save(self, data: _DI) -> None: diff --git a/tests/extras/datasets/holoviews/test_holoviews_writer.py b/tests/extras/datasets/holoviews/test_holoviews_writer.py index 43233543c9..521b4d468a 100644 --- a/tests/extras/datasets/holoviews/test_holoviews_writer.py +++ b/tests/extras/datasets/holoviews/test_holoviews_writer.py @@ -174,9 +174,11 @@ def test_http_filesystem_no_versioning(self): filepath="https://example.com/file.png", version=Version(None, None) ) - def test_no_versions(self, versioned_hv_writer): + def test_load_not_supported(self, versioned_hv_writer): """Check the error if no versions are available for load.""" - pattern = r"Did not find any versions for HoloviewsWriter\(.+\)" + pattern = 
diff --git a/tests/extras/datasets/holoviews/test_holoviews_writer.py b/tests/extras/datasets/holoviews/test_holoviews_writer.py
index 43233543c9..521b4d468a 100644
--- a/tests/extras/datasets/holoviews/test_holoviews_writer.py
+++ b/tests/extras/datasets/holoviews/test_holoviews_writer.py
@@ -174,9 +174,11 @@ def test_http_filesystem_no_versioning(self):
             filepath="https://example.com/file.png", version=Version(None, None)
         )

-    def test_no_versions(self, versioned_hv_writer):
+    def test_load_not_supported(self, versioned_hv_writer):
         """Check the error if no versions are available for load."""
-        pattern = r"Did not find any versions for HoloviewsWriter\(.+\)"
+        pattern = (
+            rf"Loading not supported for '{versioned_hv_writer.__class__.__name__}'"
+        )
         with pytest.raises(DataSetError, match=pattern):
             versioned_hv_writer.load()

diff --git a/tests/extras/datasets/matplotlib/test_matplotlib_writer.py b/tests/extras/datasets/matplotlib/test_matplotlib_writer.py
index 9d5b0732f4..55a18b9e70 100644
--- a/tests/extras/datasets/matplotlib/test_matplotlib_writer.py
+++ b/tests/extras/datasets/matplotlib/test_matplotlib_writer.py
@@ -324,9 +324,11 @@ def test_http_filesystem_no_versioning(self):
             filepath="https://example.com/file.png", version=Version(None, None)
         )

-    def test_no_versions(self, versioned_plot_writer):
+    def test_load_not_supported(self, versioned_plot_writer):
         """Check the error if no versions are available for load."""
-        pattern = r"Did not find any versions for MatplotlibWriter\(.+\)"
+        pattern = (
+            rf"Loading not supported for '{versioned_plot_writer.__class__.__name__}'"
+        )
         with pytest.raises(DataSetError, match=pattern):
             versioned_plot_writer.load()

From 011b5bb6a1fea9b8ce35473eb263257afeb84bd3 Mon Sep 17 00:00:00 2001
From: Deepyaman Datta
Date: Thu, 20 Oct 2022 12:42:33 -0400
Subject: [PATCH 36/38] Make docstring in test starter match real starters (#1916)

Signed-off-by: Ahdra Merali
---
 .../{{ cookiecutter.python_package }}/pipeline_registry.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py
index 5cef76b4fc..2566f5e95d 100644
--- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py
+++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py
@@ -8,10 +8,6 @@
 def register_pipelines() -> Dict[str, Pipeline]:
     """Register the project's pipelines.

-    Since Kedro 0.18.3, projects can use the ``find_pipelines`` function
-    to autodiscover pipelines. However, projects that require more fine-
-    grained control can still construct the pipeline mapping without it.
-
     Returns:
         A mapping from pipeline names to ``Pipeline`` objects.
     """
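For context on the docstring trimmed above: since 0.18.3 the registry that real starters generate relies on pipeline autodiscovery. A sketch of that default, based on the template shipped with 0.18.3 (shown for orientation, not part of this patch):

```python
from typing import Dict

from kedro.framework.project import find_pipelines
from kedro.pipeline import Pipeline


def register_pipelines() -> Dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from pipeline names to ``Pipeline`` objects.
    """
    # find_pipelines() imports each create_pipeline() under the project's
    # pipelines package, so new pipelines need no manual registration here.
    pipelines = find_pipelines()
    pipelines["__default__"] = sum(pipelines.values())
    return pipelines
```

Projects needing finer-grained control can still build the mapping by hand, which is why the test starter keeps only the plain docstring.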
From 2ec5c20128145455fde4882eb929ca5af19b9ba9 Mon Sep 17 00:00:00 2001
From: Merel Theisen
Date: Tue, 8 Nov 2022 13:41:06 +0000
Subject: [PATCH 37/38] Try to fix formatting error

Signed-off-by: Merel Theisen
---
 kedro/extras/datasets/plotly/plotly_dataset.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kedro/extras/datasets/plotly/plotly_dataset.py b/kedro/extras/datasets/plotly/plotly_dataset.py
index da368f822a..6b3a3a81a0 100644
--- a/kedro/extras/datasets/plotly/plotly_dataset.py
+++ b/kedro/extras/datasets/plotly/plotly_dataset.py
@@ -27,6 +27,7 @@ class PlotlyDataSet(JSONDataSet):
     the JSON file directly from a pandas DataFrame through ``plotly_args``.

     Example configuration for a PlotlyDataSet in the catalog:
+    .. code-block:: yaml

     >>> bar_plot:

From d0fa348cb338598a71a1958fe57889c64b987e3e Mon Sep 17 00:00:00 2001
From: Ahdra Merali <90615669+AhdraMeraliQB@users.noreply.github.com>
Date: Wed, 9 Nov 2022 07:31:41 +0000
Subject: [PATCH 38/38] Specify pickle import

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index df5c899fed..edb7cd3e78 100644
--- a/setup.py
+++ b/setup.py
@@ -78,7 +78,7 @@ def _collect_requirements(requires):
     "pandas.XMLDataSet": [PANDAS, "lxml~=4.6"],
     "pandas.GenericDataSet": [PANDAS],
 }
-pickle_require = {"pickle.PickleDataSet": ["compress-pickle~=2.1.0"]}
+pickle_require = {"pickle.PickleDataSet": ["compress-pickle[lz4]~=2.1.0"]}
 pillow_require = {"pillow.ImageDataSet": ["Pillow~=9.0"]}
 plotly_require = {
     "plotly.PlotlyDataSet": [PANDAS, "plotly>=4.8.0, <6.0"],
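The `[lz4]` extra matters because the documented `compress_pickle` backend for `PickleDataSet` advertises LZ4 compression, which fails at runtime without the `lz4` package. A quick sanity check (a sketch, assuming the extra is installed, e.g. via `pip install "kedro[pickle.PickleDataSet]"`):

```python
from kedro.extras.datasets.pickle import PickleDataSet

# The compress_pickle backend honours compression="lz4" only when the lz4
# dependency is present, hence the tightened pin above.
dataset = PickleDataSet(
    filepath="data/model.pkl.lz4",
    backend="compress_pickle",
    load_args={"compression": "lz4"},
    save_args={"compression": "lz4"},
)
dataset.save({"weights": [0.1, 0.2, 0.7]})
assert dataset.load() == {"weights": [0.1, 0.2, 0.7]}
```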