Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add __str__ methods to the various parts of the profiler options #1115

Open
wants to merge 11 commits into
base: dev
Choose a base branch
from
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -80,11 +80,11 @@ repos:

# requirements-ml.txt
scikit-learn>=0.23.2,
keras>=2.4.3,
'keras>=2.4.3,<3.0.0',
rapidfuzz>=2.6.1,
tensorflow>=2.6.4; sys.platform != 'darwin',
tensorflow>=2.6.4; sys_platform == 'darwin' and platform_machine != 'arm64',
tensorflow-macos>=2.6.4; sys_platform == 'darwin' and platform_machine == 'arm64',
"tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
"tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
"tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
tqdm>=4.0.0,

# requirements-reports.txt
135 changes: 135 additions & 0 deletions dataprofiler/profilers/profiler_options.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@

import abc
import copy
import json
import re
import warnings
from typing import Any, Generic, TypeVar, cast
@@ -193,6 +194,15 @@ def __init__(self, is_enabled: bool = True) -> None:
"""
self.is_enabled = is_enabled

def __str__(self) -> str:
    """
    Render the option as text.

    :return: the ``is_enabled`` value converted with ``str``
    :rtype: str
    """
    enabled_flag = self.is_enabled
    # ``!s`` forces str() conversion, matching an explicit str() call.
    return f"{enabled_flag!s}"

def _validate_helper(self, variable_path: str = "BooleanOption") -> list[str]:
"""
Validate the options do not conflict and cause errors.
@@ -958,6 +968,25 @@ def __init__(
self.cms_relative_error = cms_relative_error
self.cms_max_num_heavy_hitters = cms_max_num_heavy_hitters

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Only the public, non-callable attributes stored on the instance are
    reported, sorted by name. Every value is rendered with ``str`` so the
    result can always be serialized to JSON.

    :return: JSON-formatted str of the option properties
    :rtype: str
    """
    # vars() restricts the report to attributes set on the instance.
    # Scanning dir() would also list class-level internals and evaluate
    # every property (twice) just to print it.
    option_entries = [
        {name: str(value)}
        for name, value in sorted(vars(self).items())
        if not name.startswith("_") and not callable(value)
    ]
    return json.dumps({"CategoricalOptions": option_entries}, indent=4)

Comment on lines +971 to +989
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be good to find a way to abstract this a bit more so that it ends up in BaseOption. 90%+ of this code is repeated with only string changes, so I think there is room to make this DRY-er.

def _validate_helper(self, variable_path: str = "CategoricalOptions") -> list[str]:
"""
Validate the options do not conflict and cause errors.
@@ -1182,6 +1211,25 @@ def __init__(
)
self.null_count: BooleanOption = BooleanOption(is_enabled=null_count)

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Only the public, non-callable attributes stored on the instance are
    reported, sorted by name. Every value is rendered with ``str`` so the
    result can always be serialized to JSON.

    :return: JSON-formatted str of the option properties
    :rtype: str
    """
    # vars() restricts the report to attributes set on the instance.
    # Scanning dir() would also list class-level internals and evaluate
    # every property (twice) just to print it.
    option_entries = [
        {name: str(value)}
        for name, value in sorted(vars(self).items())
        if not name.startswith("_") and not callable(value)
    ]
    return json.dumps({"RowStatisticsOptions": option_entries}, indent=4)

def _validate_helper(
self, variable_path: str = "RowStatisticsOptions"
) -> list[str]:
@@ -1228,6 +1276,25 @@ def __init__(self) -> None:
self.max_sample_size: int | None = None
self.data_labeler_object: BaseDataLabeler | None = None

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Only the public, non-callable attributes stored on the instance are
    reported, sorted by name. Every value is rendered with ``str`` so the
    result can always be serialized to JSON.

    :return: JSON-formatted str of the option properties
    :rtype: str
    """
    # vars() restricts the report to attributes set on the instance.
    # Scanning dir() would also list class-level internals and evaluate
    # every property (twice) just to print it.
    option_entries = [
        {name: str(value)}
        for name, value in sorted(vars(self).items())
        if not name.startswith("_") and not callable(value)
    ]
    return json.dumps({"DataLabelerOptions": option_entries}, indent=4)

def __deepcopy__(self, memo: dict) -> DataLabelerOptions:
"""
Override deepcopy for data labeler object.
@@ -1370,6 +1437,25 @@ def __init__(
self.vocab: BooleanOption = BooleanOption(is_enabled=True)
self.words: BooleanOption = BooleanOption(is_enabled=True)

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Only the public, non-callable attributes stored on the instance are
    reported, sorted by name. Every value is rendered with ``str`` so the
    result can always be serialized to JSON.

    :return: JSON-formatted str of the option properties
    :rtype: str
    """
    # vars() restricts the report to attributes set on the instance.
    # Scanning dir() would also list class-level internals and evaluate
    # every property (twice) just to print it.
    option_entries = [
        {name: str(value)}
        for name, value in sorted(vars(self).items())
        if not name.startswith("_") and not callable(value)
    ]
    return json.dumps({"TextProfilerOptions": option_entries}, indent=4)

def _validate_helper(self, variable_path: str = "TextProfilerOptions") -> list[str]:
"""
Validate the options do not conflict and cause errors.
@@ -1488,6 +1574,25 @@ def __init__(
self.column_null_values = column_null_values
self.sampling_ratio = sampling_ratio

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Only the public, non-callable attributes stored on the instance are
    reported, sorted by name. Every value is rendered with ``str`` so the
    result can always be serialized to JSON.

    :return: JSON-formatted str of the option properties
    :rtype: str
    """
    # vars() restricts the report to attributes set on the instance.
    # Scanning dir() would also list class-level internals and evaluate
    # every property (twice) just to print it.
    option_entries = [
        {name: str(value)}
        for name, value in sorted(vars(self).items())
        if not name.startswith("_") and not callable(value)
    ]
    return json.dumps({"StructuredOptions": option_entries}, indent=4)

@property
def enabled_profiles(self) -> list[str]:
"""Return a list of the enabled profilers for columns."""
@@ -1638,6 +1743,25 @@ def __init__(self) -> None:
self.text = TextProfilerOptions()
self.data_labeler = DataLabelerOptions()

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Only the public, non-callable attributes stored on the instance are
    reported, sorted by name. Every value is rendered with ``str`` so the
    result can always be serialized to JSON.

    :return: JSON-formatted str of the option properties
    :rtype: str
    """
    # vars() restricts the report to attributes set on the instance.
    # Scanning dir() would also list class-level internals and evaluate
    # every property (twice) just to print it.
    option_entries = [
        {name: str(value)}
        for name, value in sorted(vars(self).items())
        if not name.startswith("_") and not callable(value)
    ]
    return json.dumps({"UnstructuredOptions": option_entries}, indent=4)

@property
def enabled_profiles(self) -> list[str]:
"""Return a list of the enabled profilers."""
@@ -1715,6 +1839,17 @@ def __init__(self, presets: str = None) -> None:
else:
raise ValueError("The preset entered is not a valid preset.")

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    The result has three newline-separated sections: the preset name,
    the structured options, and the unstructured options.

    :return: str of the option presets and properties
    :rtype: str
    """
    # Join explicit pieces instead of using backslash continuations inside
    # one string literal; a continuation would embed the source file's
    # indentation in the returned text.
    sections = (
        f"Presets: {self.presets}",
        str(self.structured_options),
        str(self.unstructured_options),
    )
    return "\n".join(sections)

Comment on lines +1842 to +1852
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this makes sense here and you would test in test_profiler_options.py

def _complete_presets(self) -> None:
self.set({"*.is_enabled": True})

8 changes: 4 additions & 4 deletions requirements-ml.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
scikit-learn>=0.23.2
keras>=2.4.3
keras>=2.4.3,<3.0.0
rapidfuzz>=2.6.1
tensorflow>=2.6.4; sys.platform != 'darwin'
tensorflow>=2.6.4; sys_platform == 'darwin' and platform_machine != 'arm64'
tensorflow-macos>=2.6.4; sys_platform == 'darwin' and platform_machine == 'arm64'
tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'
tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'
tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'
tqdm>=4.0.0
2 changes: 1 addition & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
coverage>=5.0.1
dask>=2.29.0
dask>=2.29.0,<2024.2.0
fsspec>=0.3.3
pytest>=6.0.1
pytest-cov>=2.8.1