From 9398a214d4ef2a8108a65ae0abcaba678930a7a8 Mon Sep 17 00:00:00 2001
From: Nate Parsons
Date: Thu, 9 May 2024 14:08:10 -0500
Subject: [PATCH 1/4] use filter in tarfile.extractall

---
 featuretools/entityset/deserialize.py | 10 +++++++++-
 .../tests/entityset_tests/test_serialization.py | 14 +++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/featuretools/entityset/deserialize.py b/featuretools/entityset/deserialize.py
index fbcd6c6ef8..9ab5844617 100644
--- a/featuretools/entityset/deserialize.py
+++ b/featuretools/entityset/deserialize.py
@@ -2,6 +2,7 @@
 import os
 import tarfile
 import tempfile
+from inspect import getfullargspec
 
 import pandas as pd
 import woodwork.type_sys.type_system as ww_type_system
@@ -140,6 +141,8 @@ def read_data_description(path):
 def read_entityset(path, profile_name=None, **kwargs):
     """Read entityset from disk, S3 path, or URL.
 
+    NOTE: Never attempt to read an archived entityset from an untrusted source.
+
     Args:
         path (str): Directory on disk, S3 path, or URL to read `data_description.json`.
         profile_name (str, bool): The AWS profile specified to write to S3. Will default to None and search for AWS credentials.
@@ -159,7 +162,12 @@ def read_entityset(path, profile_name=None, **kwargs):
                 use_smartopen_es(local_path, path, transport_params)
 
             with tarfile.open(str(local_path)) as tar:
-                tar.extractall(path=tmpdir)
+                if "filter" in getfullargspec(tar.extractall).kwonlyargs:
+                    tar.extractall(path=tmpdir, filter="data")
+                else:
+                    raise RuntimeError(
+                        "Please upgrade your Python version to the latest patch release to allow for safe extraction of the EntitySet archive.",
+                    )
 
             data_description = read_data_description(tmpdir)
             return description_to_entityset(data_description, **kwargs)
diff --git a/featuretools/tests/entityset_tests/test_serialization.py b/featuretools/tests/entityset_tests/test_serialization.py
index deb9da2d77..6511725a5e 100644
--- a/featuretools/tests/entityset_tests/test_serialization.py
+++ b/featuretools/tests/entityset_tests/test_serialization.py
@@ -2,7 +2,7 @@
 import logging
 import os
 import tempfile
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 from urllib.request import urlretrieve
 
 import boto3
@@ -292,6 +292,18 @@ def test_deserialize_local_tar(es):
         assert es.__eq__(new_es, deep=True)
 
 
+@patch("featuretools.entityset.deserialize.getfullargspec")
+def test_deserialize_errors_if_python_version_unsafe(mock_inspect, es):
+    mock_response = MagicMock()
+    mock_response.kwonlyargs = []
+    mock_inspect.return_value = mock_response
+    with tempfile.TemporaryDirectory() as tmp_path:
+        temp_tar_filepath = os.path.join(tmp_path, TEST_FILE)
+        urlretrieve(URL, filename=temp_tar_filepath)
+        with pytest.raises(RuntimeError, match=""):
+            deserialize.read_entityset(temp_tar_filepath)
+
+
 def test_deserialize_url_csv(es):
     new_es = deserialize.read_entityset(URL)
     assert es.__eq__(new_es, deep=True)

From bc1e60c587457e26e94d21dc6f307d8176f0655a Mon Sep 17 00:00:00 2001
From: Nate Parsons
Date: Thu, 9 May 2024 14:11:02 -0500
Subject: [PATCH 2/4] update release notes

---
 docs/source/release_notes.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 77eb5b071a..9df35edd74 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -13,6 +13,7 @@ Future Release
         * Temporarily restrict Dask version (:pr:`2694`)
         * Remove support for creating ``EntitySets`` from Dask or Pyspark dataframes (:pr:`2705`)
         * Bump minimum versions of ``tqdm`` and ``pip`` in requirements files (:pr:`2716`)
+        * Use ``filter`` arg in call to ``tarfile.extractall`` to safely deserialize EntitySets (:pr:`2722`)
     * Documentation Changes
     * Testing Changes
         * Fix serialization test to work with pytest 8.1.1 (:pr:`2694`)

From ee9d0360e983f62a1085ebc1949c58167232ee8b Mon Sep 17 00:00:00 2001
From: Nate Parsons
Date: Thu, 9 May 2024 15:23:58 -0500
Subject: [PATCH 3/4] update release notes action

---
 .github/workflows/release_notes_updated.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/release_notes_updated.yaml b/.github/workflows/release_notes_updated.yaml
index def24cbb32..1c88ebd111 100644
--- a/.github/workflows/release_notes_updated.yaml
+++ b/.github/workflows/release_notes_updated.yaml
@@ -10,6 +10,8 @@ jobs:
       - name: Check for development branch
         id: branch
         shell: python
+        env:
+          REF: ${{ github.event.pull_request.head.ref }}
         run: |
           from re import compile
           main = '^main$'
@@ -19,7 +21,7 @@ jobs:
           min_dep_update = '^min-dep-update-[a-f0-9]{7}$'
           regex = main, release, backport, dep_update, min_dep_update
           patterns = list(map(compile, regex))
-          ref = "${{ github.event.pull_request.head.ref }}"
+          ref = "$REF"
           is_dev = not any(pattern.match(ref) for pattern in patterns)
           print('::set-output name=is_dev::' + str(is_dev))
       - if: ${{ steps.branch.outputs.is_dev == 'true' }}

From d9191eb4c38aaf092819d1e041d43be921046d5b Mon Sep 17 00:00:00 2001
From: Nate Parsons
Date: Thu, 9 May 2024 15:25:48 -0500
Subject: [PATCH 4/4] update docstring

---
 featuretools/entityset/deserialize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/featuretools/entityset/deserialize.py b/featuretools/entityset/deserialize.py
index 9ab5844617..8a7d448b46 100644
--- a/featuretools/entityset/deserialize.py
+++ b/featuretools/entityset/deserialize.py
@@ -141,7 +141,7 @@ def read_data_description(path):
 def read_entityset(path, profile_name=None, **kwargs):
     """Read entityset from disk, S3 path, or URL.
 
-    NOTE: Never attempt to read an archived entityset from an untrusted source.
+    NOTE: Never attempt to read an archived EntitySet from an untrusted source.
 
     Args:
         path (str): Directory on disk, S3 path, or URL to read `data_description.json`.