Skip to content

Commit

Permalink
ran black, isort, etc.
Browse files Browse the repository at this point in the history
  • Loading branch information
yoid2000 committed Jul 22, 2024
1 parent a596f7c commit 79dfcac
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 17 deletions.
8 changes: 5 additions & 3 deletions syndiffix/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,11 @@ def _create_child_leaf(self, child_index: int, initial_row: RowId) -> Leaf:

# Set child's subnodes to the matching-interval children of the parent's subnodes.
subnodes = tuple(
subnode.children.get(Branch._remove_dimension_from_index(dim_index, child_index))
if isinstance(subnode, Branch)
else None
(
subnode.children.get(Branch._remove_dimension_from_index(dim_index, child_index))
if isinstance(subnode, Branch)
else None
)
for dim_index, subnode in enumerate(self.subnodes)
)

Expand Down
45 changes: 31 additions & 14 deletions tests/my_debugger.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,40 @@
import pandas as pd
import numpy as np
import os

import numpy as np
import pandas as pd
from pandas.errors import ParserError

from syndiffix import Synthesizer


def count_identical_rows(df1, df2):
    """Return how many rows df1 and df2 have in common.

    Rows are compared on the columns the two frames share, which is the key
    set ``pd.merge`` picks when no ``on=`` is given. Duplicated rows are
    matched one-to-one: a row that occurs ``m`` times in ``df1`` and ``n``
    times in ``df2`` contributes ``min(m, n)`` to the count. A plain inner
    merge would pair every copy with every copy and contribute ``m * n``,
    which can exceed the length of either frame. For frames without duplicate
    rows the result equals ``len(pd.merge(df1, df2, how="inner"))``.

    Args:
        df1: First dataframe.
        df2: Second dataframe.

    Returns:
        The number of one-to-one row matches, as an int.

    Raises:
        Whatever ``pd.merge`` raises when the frames cannot be merged, e.g.
        when they share no columns.
    """
    shared_cols = [col for col in df1.columns if col in df2.columns]
    if not shared_cols:
        # Nothing to compare on: defer to pd.merge so the caller sees the
        # same error as a direct merge of the two frames would produce.
        return len(pd.merge(df1, df2, how="inner"))

    # Pick a helper column name that cannot clash with a real column.
    occurrence_col = "_occurrence"
    while occurrence_col in shared_cols:
        occurrence_col += "_"

    def with_occurrence(df):
        # Number the repeats of each distinct row (0, 1, 2, ...) so the k-th
        # copy in one frame can only match the k-th copy in the other.
        # dropna=False keeps rows containing nulls in their own groups.
        keyed = df[shared_cols].copy()
        keyed[occurrence_col] = keyed.groupby(shared_cols, dropna=False).cumcount()
        return keyed

    # Inner merge on the shared columns plus the occurrence number.
    merged_df = pd.merge(with_occurrence(df1), with_occurrence(df2), how="inner")

    # The number of identical rows is the number of rows in the merged dataframe.
    return len(merged_df)


def test2():
catCols = [ 'SEX', 'MSP', 'HISP', 'RAC1P', 'HOUSING_TYPE',
'OWN_RENT', 'INDP_CAT', 'EDU', 'PINCP_DECILE',
'DVET', 'DREM', 'DEYE', 'DEAR', 'DPHY',
]
csv_path = os.path.join('c:\\', 'paul', 'sdnist', 'diverse_communities_data_excerpts', 'texas', 'tx2019.csv')
catCols = [
"SEX",
"MSP",
"HISP",
"RAC1P",
"HOUSING_TYPE",
"OWN_RENT",
"INDP_CAT",
"EDU",
"PINCP_DECILE",
"DVET",
"DREM",
"DEYE",
"DEAR",
"DPHY",
]
csv_path = os.path.join("c:\\", "paul", "sdnist", "diverse_communities_data_excerpts", "texas", "tx2019.csv")
print(csv_path)
df = pd.read_csv(csv_path, low_memory=False)
for col in catCols:
Expand All @@ -31,15 +46,16 @@ def test2():
print(f" {null_values.sum()} null values")
df = df.sample(n=1000)
# HISP and SEX are numeric, so let's change only HISP to string
df['HISP'] = df['HISP'].astype(str)
synth = Synthesizer(df[['HISP','SEX']])
df["HISP"] = df["HISP"].astype(str)
synth = Synthesizer(df[["HISP", "SEX"]])
df_syn = synth.sample()
print(df_syn.dtypes)
print(df_syn.head())
pass


def test1():
csv_path = os.path.join('c:\\', 'paul', 'datasets', 'banking.loans', 'original', 'loan_account_card_clients.csv')
csv_path = os.path.join("c:\\", "paul", "datasets", "banking.loans", "original", "loan_account_card_clients.csv")
print(csv_path)
df = pd.read_csv(csv_path, keep_default_na=False, na_values=[""], low_memory=False)
# Try to infer datetime columns.
Expand All @@ -61,7 +77,8 @@ def test1():
cnt = count_identical_rows(df_syn1, df_syn2)
print(f"There are {cnt} identical rows")


if False:
test1()
if True:
test2()
test2()

0 comments on commit 79dfcac

Please sign in to comment.