Skip to content

Commit

Permalink
Fix mismatch bug
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Oct 2, 2019
1 parent 545d2d2 commit 62eab74
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,5 @@ examples/many=columns\.parquet/
examples/many-columns.parquet/

examples/sandbox.ipynb

examples/dask-worker-space/
2 changes: 1 addition & 1 deletion optimus/bumblebee.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def send(self, message):
logger.print(message)
self.token = self._encrypt(self._compress(message)).decode()

- logger.print(message)
+ logger.print(self.token)
try:
headers = {'content-type': 'application/json'}

Expand Down
11 changes: 8 additions & 3 deletions optimus/profiler/profiler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import configparser
import copy

import humanize
import imgkit
Expand Down Expand Up @@ -67,7 +68,7 @@ def _count_data_types(self, df, columns, infer=False, mismatch=None):
columns = parse_columns(df, columns)

count_by_data_type = df.cols.count_by_dtypes(columns, infer=infer, mismatch=mismatch)

count_by_data_type_no_mismatch = copy.deepcopy(count_by_data_type)
# Info from all the columns
type_details = {}

Expand All @@ -78,9 +79,13 @@ def _count_data_types(self, df, columns, infer=False, mismatch=None):
:param col_name:
:return:
"""
+ # Not count mismatch
+ if "mismatch" in count_by_data_type_no_mismatch[col_name]:
+ count_by_data_type_no_mismatch[col_name].pop("mismatch")

# Get the greatest count by column data type
- greatest_data_type_count = max(count_by_data_type[col_name], key=count_by_data_type[col_name].get)
+ greatest_data_type_count = max(count_by_data_type_no_mismatch[col_name],
+ key=count_by_data_type_no_mismatch[col_name].get)
if greatest_data_type_count == "string" or greatest_data_type_count == "boolean":
cat = "categorical"
elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal":
Expand All @@ -99,7 +104,7 @@ def _count_data_types(self, df, columns, infer=False, mismatch=None):
assign(type_details, col_name + ".dtype", greatest_data_type_count, dict)
assign(type_details, col_name + ".type", cat, dict)
assign(type_details, col_name + ".stats", count_by_data_type[col_name], dict)

# print(type_details)
return type_details

@time_it
Expand Down

0 comments on commit 62eab74

Please sign in to comment.