Merge pull request #369 from capitalone/develop
Release v0.16.0
fdosani authored Jan 10, 2025
2 parents 6b6affc + 43f0d36 commit 2fdb769
Showing 19 changed files with 97 additions and 93 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/publish-docs.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: '3.9'
+          python-version: '3.10'
       - name: Install dependencies
         run: python -m pip install .[dev]
       - name: Build
2 changes: 1 addition & 1 deletion .github/workflows/publish-package.yml
@@ -18,7 +18,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: '3.9'
+          python-version: '3.10'
       - name: Install dependencies
         run: python -m pip install .[dev]
       - name: Build and publish
14 changes: 6 additions & 8 deletions .github/workflows/test-package.yml
@@ -17,10 +17,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - name: Set up Python 3.9
+      - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.9"
+          python-version: "3.10"
       - name: Install dependencies
         run: python -m pip install .[qa]
       - name: Linting by ruff
@@ -33,7 +33,7 @@
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9, '3.10', '3.11', '3.12']
+        python-version: ['3.10', '3.11', '3.12']
         spark-version: [3.2.4, 3.3.4, 3.4.2, 3.5.1]
         pandas-version: [2.2.3, 1.5.3]
         numpy-version: [2.1.2, 1.26.4]
@@ -42,8 +42,6 @@
             spark-version: 3.2.4
           - python-version: '3.11'
             spark-version: 3.3.4
-          - python-version: 3.9
-            numpy-version: 2.1.2
           - pandas-version: 1.5.3
             numpy-version: 2.1.2
     env:
@@ -77,7 +75,7 @@ jobs:
         run: |
           python -m pip install .[dev_no_snowflake]
-      - name: Install Datacompy with all dev dependencies if Python 3.9, 3.10, or 3.11
+      - name: Install Datacompy with all dev dependencies if Python 3.10, or 3.11
         if: ${{ matrix.python-version != '3.12' }}
         run: |
           python -m pip install .[dev]
@@ -92,7 +90,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9, '3.10', '3.11', '3.12']
+        python-version: ['3.10', '3.11', '3.12']

     env:
       PYTHON_VERSION: ${{ matrix.python-version }}
@@ -119,7 +117,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9, '3.10', '3.11']
+        python-version: ['3.10', '3.11']
     env:
       PYTHON_VERSION: ${{ matrix.python-version }}

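The exclude entries in the hunk above follow GitHub Actions matrix semantics: a combination is dropped when every key/value pair of an exclude entry matches it. A minimal Python sketch of that pruning logic (version lists are from the workflow above; the exclude list is abbreviated to what this diff actually shows, and the collapsed region likely contains more entries):

from itertools import product

python_versions = ["3.10", "3.11", "3.12"]
spark_versions = ["3.2.4", "3.3.4", "3.4.2", "3.5.1"]
pandas_versions = ["2.2.3", "1.5.3"]
numpy_versions = ["2.1.2", "1.26.4"]

# Only the excludes visible in this diff hunk.
excludes = [
    {"python-version": "3.11", "spark-version": "3.3.4"},
    {"pandas-version": "1.5.3", "numpy-version": "2.1.2"},
]

def is_excluded(combo: dict) -> bool:
    # GitHub Actions drops a combination when all keys of an exclude entry match it.
    return any(all(combo.get(k) == v for k, v in ex.items()) for ex in excludes)

combos = [
    {"python-version": py, "spark-version": sp, "pandas-version": pa, "numpy-version": nv}
    for py, sp, pa, nv in product(
        python_versions, spark_versions, pandas_versions, numpy_versions
    )
]
print(sum(1 for c in combos if not is_excluded(c)), "matrix jobs remain")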
4 changes: 3 additions & 1 deletion CONTRIBUTORS
@@ -4,4 +4,6 @@
 - Mark Zhou
 - Ian Whitestone
 - Faisal Dosani
-- Lorenzo Mercado
+- Lorenzo Mercado
+- Jacob Dawang
+- Raymond Haffar
19 changes: 13 additions & 6 deletions README.md
@@ -7,11 +7,19 @@
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/datacompy)


-DataComPy is a package to compare two Pandas DataFrames. Originally started to
-be something of a replacement for SAS's ``PROC COMPARE`` for Pandas DataFrames
-with some more functionality than just ``Pandas.DataFrame.equals(Pandas.DataFrame)``
-(in that it prints out some stats, and lets you tweak how accurate matches have to be).
-Then extended to carry that functionality over to Spark Dataframes.
+DataComPy is a package to compare two DataFrames (or tables) such as Pandas, Spark, Polars, and
+even Snowflake. Originally it was created to be something of a replacement
+for SAS's ``PROC COMPARE`` for Pandas DataFrames with some more functionality than
+just ``Pandas.DataFrame.equals(Pandas.DataFrame)`` (in that it prints out some stats,
+and lets you tweak how accurate matches have to be). Supported types include:
+
+- Pandas
+- Polars
+- Spark
+- Snowflake (via snowpark)
+- Dask (via Fugue)
+- DuckDB (via Fugue)
+

 ## Quick Installation

@@ -69,7 +77,6 @@ with the Pandas on Spark implementation. Spark plans to support Pandas 2 in [Spa

 |             | Spark 3.2.4 | Spark 3.3.4 | Spark 3.4.2 | Spark 3.5.1 |
 |-------------|-------------|-------------|-------------|-------------|
-| Python 3.9  | ✅          | ✅          | ✅          | ✅          |
 | Python 3.10 | ✅          | ✅          | ✅          | ✅          |
 | Python 3.11 | ❌          | ❌          | ✅          | ✅          |
 | Python 3.12 | ❌          | ❌          | ✅          | ✅          |
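As a reader aid for the README text above, here is a minimal sketch of the pandas workflow it describes (the frames and column names are invented for illustration):

import pandas as pd

import datacompy

# Two toy frames that share a join key and differ slightly in one value.
df1 = pd.DataFrame({"acct_id": [1, 2, 3], "balance": [100.0, 200.0, 300.00]})
df2 = pd.DataFrame({"acct_id": [1, 2, 3], "balance": [100.0, 200.0, 300.05]})

compare = datacompy.Compare(
    df1,
    df2,
    join_columns="acct_id",  # column(s) to join the frames on
    abs_tol=0.1,             # absolute tolerance for numeric comparisons
    rel_tol=0,               # relative tolerance
    df1_name="original",
    df2_name="new",
)
print(compare.matches())  # True: 300.05 is within abs_tol of 300.00
print(compare.report())   # the human-readable comparison report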
10 changes: 2 additions & 8 deletions ROADMAP.rst
@@ -2,13 +2,7 @@ datacompy Roadmap
 -----------------

 At this current time ``datacompy`` is in a stable state. We are planning on continuing to
-add features and functionality as the community of users asks for them, but there are no
+add features and functionality as the community of users asks for them, but there are no
 pressing issues which we are looking to add in immediately.

-There are some longer term issues which are open for people to work on, and some which are more of a nice to have.
-We are looking for contributors and also maintaners to help with the project.
-
-- Add in docs how to change the number of mismatches in report `#6 <https://github.com/capitalone/datacompy/issues/6>`_
-- Make duplicate handling better `#7 <https://github.com/capitalone/datacompy/issues/7>`_
-- Refactor Spark datacompy `#13 <https://github.com/capitalone/datacompy/issues/13>`_
-- Drop Python 3.7 suport `#173 <https://github.com/capitalone/datacompy/issues/173>`_
+Please feel free to check the issues section of the repository for the most up to date list.
2 changes: 1 addition & 1 deletion datacompy/__init__.py
@@ -18,7 +18,7 @@
 Then extended to carry that functionality over to Spark Dataframes.
 """

-__version__ = "0.15.0"
+__version__ = "0.16.0"

 import platform
 from warnings import warn
4 changes: 2 additions & 2 deletions datacompy/base.py
@@ -22,7 +22,7 @@
 """

 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any

 from ordered_set import OrderedSet

@@ -154,7 +154,7 @@ def report(
         self,
         sample_count: int = 10,
         column_count: int = 10,
-        html_file: Optional[str] = None,
+        html_file: str | None = None,
     ) -> str:
         """Return a string representation of a report."""
         pass
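The typing changes in this and the following files are the payoff of dropping Python 3.9: PEP 604 union syntax (`str | None`) replaces `Optional`/`Union`. A side-by-side sketch with illustrative signatures (not code from this commit):

from typing import Optional


# Pre-PEP 604 spelling, required while Python 3.9 was supported:
def report_old(sample_count: int = 10, html_file: Optional[str] = None) -> str:
    return f"{sample_count} samples -> {html_file or 'stdout'}"


# PEP 604 spelling adopted in this release; evaluated as a runtime annotation
# it needs Python 3.10+ unless `from __future__ import annotations` is in effect.
def report_new(sample_count: int = 10, html_file: str | None = None) -> str:
    return f"{sample_count} samples -> {html_file or 'stdout'}"


print(report_old() == report_new())  # True: only the annotations differ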
10 changes: 5 additions & 5 deletions datacompy/core.py
@@ -22,7 +22,7 @@
 """

 import os
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, cast

 import numpy as np
 import pandas as pd
@@ -84,7 +84,7 @@ def __init__(
         self,
         df1: pd.DataFrame,
         df2: pd.DataFrame,
-        join_columns: Optional[Union[List[str], str]] = None,
+        join_columns: List[str] | str | None = None,
         on_index: bool = False,
         abs_tol: float = 0,
         rel_tol: float = 0,
@@ -100,7 +100,7 @@ def __init__(
         elif on_index:
             self.on_index = True
             self.join_columns = []
-        elif isinstance(join_columns, (str, int, float)):
+        elif isinstance(join_columns, str | int | float):
             self.join_columns = [
                 str(join_columns).lower()
                 if self.cast_column_names_lower
@@ -564,7 +564,7 @@ def report(
         self,
         sample_count: int = 10,
         column_count: int = 10,
-        html_file: Optional[str] = None,
+        html_file: str | None = None,
     ) -> str:
         """Return a string representation of a report.
@@ -728,7 +728,7 @@ def df_to_str(pdf: pd.DataFrame) -> str:
     return report


-def render(filename: str, *fields: Union[int, float, str]) -> str:
+def render(filename: str, *fields: int | float | str) -> str:
     """Render out an individual template.

     This basically just reads in a
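Note that the `isinstance` change above is more than cosmetic: `X | Y` builds a `types.UnionType`, which `isinstance` only accepts at runtime on Python 3.10+, so that single line would break 3.9. A quick sketch:

value = "acct_id"

# Tuple form: works on every Python 3 version.
print(isinstance(value, (str, int, float)))  # True

# Union form: equivalent, but raises TypeError before Python 3.10,
# which is why it could only land once 3.9 support was dropped.
print(isinstance(value, str | int | float))  # True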
24 changes: 12 additions & 12 deletions datacompy/fugue.py
@@ -17,7 +17,7 @@

 import pickle
 from collections import defaultdict
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, Iterable, List, Tuple, cast

 import pandas as pd
 from ordered_set import OrderedSet
@@ -105,15 +105,15 @@ def all_columns_match(df1: "AnyDataFrame", df2: "AnyDataFrame") -> bool:
 def is_match(
     df1: "AnyDataFrame",
     df2: "AnyDataFrame",
-    join_columns: Union[str, List[str]],
+    join_columns: str | List[str],
     abs_tol: float = 0,
     rel_tol: float = 0,
     df1_name: str = "df1",
     df2_name: str = "df2",
     ignore_spaces: bool = False,
     ignore_case: bool = False,
     cast_column_names_lower: bool = True,
-    parallelism: Optional[int] = None,
+    parallelism: int | None = None,
     strict_schema: bool = False,
 ) -> bool:
     """Check whether two dataframes match.
@@ -204,15 +204,15 @@ def is_match(
 def all_rows_overlap(
     df1: "AnyDataFrame",
     df2: "AnyDataFrame",
-    join_columns: Union[str, List[str]],
+    join_columns: str | List[str],
     abs_tol: float = 0,
     rel_tol: float = 0,
     df1_name: str = "df1",
     df2_name: str = "df2",
     ignore_spaces: bool = False,
     ignore_case: bool = False,
     cast_column_names_lower: bool = True,
-    parallelism: Optional[int] = None,
+    parallelism: int | None = None,
     strict_schema: bool = False,
 ) -> bool:
     """Check if the rows are all present in both dataframes.
@@ -300,15 +300,15 @@ def all_rows_overlap(
 def count_matching_rows(
     df1: "AnyDataFrame",
     df2: "AnyDataFrame",
-    join_columns: Union[str, List[str]],
+    join_columns: str | List[str],
     abs_tol: float = 0,
     rel_tol: float = 0,
     df1_name: str = "df1",
     df2_name: str = "df2",
     ignore_spaces: bool = False,
     ignore_case: bool = False,
     cast_column_names_lower: bool = True,
-    parallelism: Optional[int] = None,
+    parallelism: int | None = None,
     strict_schema: bool = False,
 ) -> int:
     """Count the number of rows match (on overlapping fields).
@@ -395,7 +395,7 @@ def count_matching_rows(
 def report(
     df1: "AnyDataFrame",
     df2: "AnyDataFrame",
-    join_columns: Union[str, List[str]],
+    join_columns: str | List[str],
     abs_tol: float = 0,
     rel_tol: float = 0,
     df1_name: str = "df1",
@@ -405,8 +405,8 @@
     cast_column_names_lower: bool = True,
     sample_count: int = 10,
     column_count: int = 10,
-    html_file: Optional[str] = None,
-    parallelism: Optional[int] = None,
+    html_file: str | None = None,
+    parallelism: int | None = None,
 ) -> str:
     """Return a string representation of a report.
@@ -648,7 +648,7 @@ def _any(col: str) -> int:
 def _distributed_compare(
     df1: "AnyDataFrame",
     df2: "AnyDataFrame",
-    join_columns: Union[str, List[str]],
+    join_columns: str | List[str],
     return_obj_func: Callable[[Compare], Any],
     abs_tol: float = 0,
     rel_tol: float = 0,
@@ -657,7 +657,7 @@
     ignore_spaces: bool = False,
     ignore_case: bool = False,
     cast_column_names_lower: bool = True,
-    parallelism: Optional[int] = None,
+    parallelism: int | None = None,
     strict_schema: bool = False,
 ) -> List[Any]:
     """Compare the data distributively using the core Compare class.
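The signatures above are the fugue module's public comparison helpers. A minimal sketch of calling one of them, with toy frames and the default backend (the import path is taken from this diff; the exact values are invented):

import pandas as pd

from datacompy.fugue import is_match

df1 = pd.DataFrame({"id": [1, 2, 3], "val": [1.00, 2.00, 3.00]})
df2 = pd.DataFrame({"id": [1, 2, 3], "val": [1.00, 2.00, 3.0001]})

# Any Fugue-compatible DataFrame (pandas, Spark, Dask, DuckDB, ...) is accepted;
# with parallelism=None the execution engine's default is used.
print(is_match(df1, df2, join_columns="id", rel_tol=1e-3))  # True within tolerance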
12 changes: 6 additions & 6 deletions datacompy/polars.py
@@ -23,7 +23,7 @@

 import os
 from copy import deepcopy
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, cast

 import numpy as np
 import polars as pl
@@ -85,7 +85,7 @@ def __init__(
         self,
         df1: "pl.DataFrame",
         df2: "pl.DataFrame",
-        join_columns: Union[List[str], str],
+        join_columns: List[str] | str,
         abs_tol: float = 0,
         rel_tol: float = 0,
         df1_name: str = "df1",
@@ -327,8 +327,8 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
         creates a column column_match which is True for matches, False
         otherwise.
         """
-        match_cnt: Union[int, float]
-        null_diff: Union[int, float]
+        match_cnt: int | float
+        null_diff: int | float

         LOG.debug("Comparing intersection")
         row_cnt = len(self.intersect_rows)
@@ -571,7 +571,7 @@ def report(
         self,
         sample_count: int = 10,
         column_count: int = 10,
-        html_file: Optional[str] = None,
+        html_file: str | None = None,
     ) -> str:
         """Return a string representation of a report.
@@ -734,7 +734,7 @@ def df_to_str(pdf: "pl.DataFrame") -> str:
     return report


-def render(filename: str, *fields: Union[int, float, str]) -> str:
+def render(filename: str, *fields: int | float | str) -> str:
     """Render out an individual template.

     This basically just reads in a
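The Polars comparator shown above has the same shape as the pandas one. A sketch assuming the `PolarsCompare` class name used by recent datacompy releases (the class name itself is not visible in this diff, and the frames are invented):

import polars as pl

from datacompy import PolarsCompare  # class name assumed; not shown in this diff

df1 = pl.DataFrame({"id": [1, 2, 3], "val": [1.0, 2.0, 3.0]})
df2 = pl.DataFrame({"id": [1, 2, 3], "val": [1.0, 2.0, 3.1]})

compare = PolarsCompare(df1, df2, join_columns="id", abs_tol=0.2)
print(compare.matches())               # True: 3.1 is within abs_tol of 3.0
print(compare.report(sample_count=5))  # report with up to 5 sample rows per section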