Skip to content

Commit

Permalink
add skiprows
Browse files (browse the repository at this point in the history)
  • Loading branch information
armandleopold committed Jan 8, 2024
1 parent 89a6b45 commit 55efe7e
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 19 deletions.
18 changes: 11 additions & 7 deletions profiling_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,18 @@ def denormalize(data):
denormalized[index] = content
return denormalized


# Load the configuration file
print("Load source_conf.json")
with open("source_conf.json", "r", encoding="utf-8") as file:
config = json.load(file)
source_config = json.load(file)

# Load the pack configuration file
print("Load pack_conf.json")
with open("pack_conf.json", "r", encoding="utf-8") as file:
pack_config = json.load(file)

# Load data using the opener.py logic
df = load_data(config)
df = load_data(source_config, pack_config)

# Run the profiling report
profile = ProfileReport(df, minimal=True, title="Profiling Report")
Expand Down Expand Up @@ -111,7 +115,7 @@ def denormalize(data):
entry = {
"key": key,
"value": round_if_numeric(value),
"scope": {"perimeter": "dataset", "value": config["name"]},
"scope": {"perimeter": "dataset", "value": source_config["name"]},
}
new_format_data.append(entry)
general_data = new_format_data
Expand Down Expand Up @@ -147,7 +151,7 @@ def denormalize(data):
{
"key": "score",
"value": str(round(score_value, 2)),
"scope": {"perimeter": "dataset", "value": config["name"]},
"scope": {"perimeter": "dataset", "value": source_config["name"]},
},
index=[0],
)
Expand All @@ -169,8 +173,8 @@ def denormalize(data):
schemas_data = [
{
"key": "dataset",
"value": config["name"],
"scope": {"perimeter": "dataset", "value": config["name"]},
"value": source_config["name"],
"scope": {"perimeter": "dataset", "value": source_config["name"]},
}
]

Expand Down
41 changes: 30 additions & 11 deletions profiling_pack/opener.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,32 @@


# Function to load data file
def load_data_file(file_path, pack_config=None):
    """Load a CSV or XLSX file into a pandas DataFrame.

    Args:
        file_path: Path of the file to read. The format is chosen from the
            extension (``.csv`` or ``.xlsx``).
        pack_config: Optional pack configuration mapping. When
            ``pack_config["job"]["source"]["skiprows"]`` is present and
            truthy, it is forwarded to the pandas reader as ``skiprows``.
            Missing keys (or no config at all) mean no rows are skipped.

    Returns:
        The loaded DataFrame, or ``None`` when the extension is neither
        ``.csv`` nor ``.xlsx``.
    """
    # "skiprows" is an optional setting: walk the nested config with .get so
    # a pack_conf.json that omits "job", "source" or "skiprows" does not
    # raise KeyError before any data is read.
    source_options = ((pack_config or {}).get("job") or {}).get("source") or {}
    # Falsy values (0, "", [], None) all mean "skip nothing", which is the
    # readers' default behaviour for skiprows=None.
    skiprows = source_options.get("skiprows") or None

    if file_path.endswith(".csv"):
        return pd.read_csv(
            file_path,
            low_memory=False,
            memory_map=True,
            skiprows=skiprows,
            # Report malformed lines instead of silently dropping them.
            on_bad_lines="warn",
        )

    if file_path.endswith(".xlsx"):
        return pd.read_excel(file_path, engine="openpyxl", skiprows=skiprows)

    # Unsupported extension: callers validate the extension beforehand.
    return None


# Function to create database connection
Expand Down Expand Up @@ -69,15 +88,15 @@ def load_data_from_db(engine):


# Function to load data based on the configuration
def load_data(config):
source_type = config["type"]
def load_data(source_config, pack_config):
source_type = source_config["type"]

if source_type == "file":
path = config["config"]["path"]
path = source_config["config"]["path"]

if os.path.isfile(path):
if path.endswith(".csv") or path.endswith(".xlsx"):
return load_data_file(path)
return load_data_file(path, pack_config)
else:
raise ValueError(
"Unsupported file type. Only CSV and XLSX are supported."
Expand All @@ -91,14 +110,14 @@ def load_data(config):
"No CSV or XLSX files found in the provided path."
)
first_data_file = data_files[0]
return load_data_file(first_data_file)
return load_data_file(first_data_file, pack_config)
else:
raise FileNotFoundError(
f"The path {path} is neither a file nor a directory. Or can't be reached."
)

elif source_type == "database":
db_config = config["config"]
db_config = source_config["config"]
engine = create_db_connection(db_config)
return load_data_from_db(engine)

Expand Down
2 changes: 1 addition & 1 deletion profiling_pack/properties.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ icon: icon.png
name: profiling
type: completeness
url: https://github.com/qalita-io/packs/tree/main/profiling_pack
version: 1.0.22
version: 1.0.29
visibility: public

0 comments on commit 55efe7e

Please sign in to comment.