From 5cf7fca8ff938fcd82f8a63ac8a055f98f2a3fe2 Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:14:07 +0100 Subject: [PATCH 01/12] Updated gitignore to ignore SQLite database files. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3e46839..2f9a36e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ # Files config.json **/*tmp.* +*.db # Folders input output From 26ad6ea1382ad625bb1b0f7ec5193f04f3e686bc Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:14:34 +0100 Subject: [PATCH 02/12] Add requirements for the package/repo. --- requirements.txt | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index b23fc69..7998be1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,13 @@ -black==23.3.0 -isort==5.12.0 -pytest==7.1.2 -tqdm==4.64.0 \ No newline at end of file +colorama==0.4.6 +et-xmlfile==1.1.0 +Faker==19.6.2 +greenlet==2.0.2 +numpy==1.26.0 +openpyxl==3.1.2 +pandas==1.5.3 +python-dateutil==2.8.2 +pytz==2023.3.post1 +six==1.16.0 +SQLAlchemy==2.0.21 +tqdm==4.64.0 +typing_extensions==4.8.0 From 35f425b4017c8e9d6996a894ff43229c1633782c Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:16:30 +0100 Subject: [PATCH 03/12] Added backend/faker_schema.py to translate a schema CSV to fake data using faker. --- backend/faker_schema.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 backend/faker_schema.py diff --git a/backend/faker_schema.py b/backend/faker_schema.py new file mode 100644 index 0000000..522a648 --- /dev/null +++ b/backend/faker_schema.py @@ -0,0 +1,33 @@ +from faker import Faker + + +class FakerSchema(object): + + def __init__(self, faker=None, locale=None, providers=None, includes=None): + self._faker = faker or Faker(locale=locale, providers=providers, includes=includes) + + def generate_fake(self, schema, iterations=1): + result = [self._generate_one_fake(schema) for _ in range(iterations)] + return result[0] if len(result) == 1 else result + + def _generate_one_fake(self, schema): + """ + Recursively traverse schema dictionary and for each "leaf node", evaluate the fake + value + + Implementation: + For each key-value pair: + 1) If value is not an iterable (i.e. dict or list), evaluate the fake data (base case) + 2) If value is a dictionary, recurse + 3) If value is a list, iteratively recurse over each item + """ + data = {} + for k, (v, kwargs) in schema.items(): + if isinstance(v, dict): + data[k] = self._generate_one_fake(v) + elif isinstance(v, list): + data[k] = [self._generate_one_fake(item) for item in v] + else: + data[k] = getattr(self._faker, v)(**kwargs) + + return data From 713dc7def68a325f7309e532af4e64dced175396 Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:17:02 +0100 Subject: [PATCH 04/12] Constants and mappings used by backend/faker_schema.py --- backend/constants.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 backend/constants.py diff --git a/backend/constants.py b/backend/constants.py new file mode 100644 index 0000000..6044328 --- /dev/null +++ b/backend/constants.py @@ -0,0 +1,44 @@ +# Imports +from pathlib import Path +from sqlalchemy import BIGINT, CHAR, DATETIME, DECIMAL, NVARCHAR, TIME +from sqlalchemy import Date, Integer, Numeric, SmallInteger, String + +# Paths +DIR_PATH: Path = Path(__file__).parent.parent +CONFIG_PATH: Path = DIR_PATH / 'config.json' +INPUT_PATH: Path = DIR_PATH / 'input' +OUTPUT_PATH: Path = DIR_PATH / 'output' + +# Mapping +data_type_mapping: dict[str, str] = dict( + CHAR='pystr', + VARCHAR='pystr', + SMALLINT='pyint', + DATETIME='date_time', + BIGINT='pyint', + DECIMAL='pyfloat', + INT='pyint', + TINYINT='pyint', + DATE='date', + NVARCHAR='pystr', + NUMERIC='pyint', + SMALLDATETIME='date_time', + UNIQUEIDENTIFIER='ean', +) + +SQLTYPE_MAPPING: dict[str, object] = dict( + BIGINT=BIGINT, + BIT=Integer, + CHAR=CHAR, + DATE=Date, + DATETIME=DATETIME, + DATETIME2=DATETIME, + DECIMAL=DECIMAL, + INT=Integer, + NUMERIC=Numeric, + NVARCHAR=NVARCHAR, + SMALLINT=SmallInteger, + TIME=TIME, + TINYINT=Integer, + VARCHAR=String, +) From a18509f5b0c8ac417dfcc36593ef6e6a74f53154 Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:17:24 +0100 Subject: [PATCH 05/12] init file for backend package --- backend/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/__init__.py b/backend/__init__.py index b842064..e69de29 100644 --- a/backend/__init__.py +++ b/backend/__init__.py @@ -1,2 +0,0 @@ -from .logger import get_logger -from .time import TimeIt From 1059ab130ab4c1692e0dc8f1a497962431643587 Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:18:44 +0100 Subject: [PATCH 06/12] Updated default config. --- metadata/config.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/metadata/config.json b/metadata/config.json index a42bd19..9ef2e0d 100644 --- a/metadata/config.json +++ b/metadata/config.json @@ -1,4 +1,6 @@ { - "output_path": null, - "report_name": "test_run" + "input": "ecds-raw-faker-schema.csv", + "output": "ecds-raw-fake-data.csv", + "null_nullables": false, + "number_of_rows": 10 } \ No newline at end of file From d1ed27ec2a7552af4415bc66c9c612d365b0f004 Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:19:52 +0100 Subject: [PATCH 07/12] Corrected typos --- metadata/config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata/config.json b/metadata/config.json index 9ef2e0d..11ea5ef 100644 --- a/metadata/config.json +++ b/metadata/config.json @@ -1,6 +1,6 @@ { - "input": "ecds-raw-faker-schema.csv", - "output": "ecds-raw-fake-data.csv", + "input": "raw-faker-schema.csv", + "output": "raw-synthetic-data.csv", "null_nullables": false, "number_of_rows": 10 } \ No newline at end of file From a3803225b9a12798a89d4a474e04809fd4bc937a Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:21:14 +0100 Subject: [PATCH 08/12] A script to create fake data CSV from a schema CSV. --- scripts/create_fake_data.py | 110 ++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 scripts/create_fake_data.py diff --git a/scripts/create_fake_data.py b/scripts/create_fake_data.py new file mode 100644 index 0000000..c53466a --- /dev/null +++ b/scripts/create_fake_data.py @@ -0,0 +1,110 @@ +# Imports +from backend.faker_schema import FakerSchema +from json import load, loads +from pandas import concat, DataFrame, isna, read_csv, Series +from pathlib import Path + +from backend.constants import CONFIG_PATH, data_type_mapping, DIR_PATH + +# Variables +DEFAULT_DATA_TYPE: str = 'VARCHAR' +DEFAULT_NULLABLE: str = 'N' +DEFAULT_FAKER_KWARGS: str = "{}" + + +# Functions and classes +def clean_input(df: DataFrame) -> DataFrame: + # 'Data Type', + column: str = 'Data Type' + nan_check: Series = df[column].isna() + df[column][nan_check]: Series = DEFAULT_DATA_TYPE + df[column] = df[column].apply(lambda x: x.upper()) + df[column] = df[column].apply(lambda x: x.split('(')[0]) + # 'Nullable', + column: str = 'Nullable' + nan_check: Series = df[column].isna() + df[column][nan_check]: Series = DEFAULT_NULLABLE + # 'Faker Type' + column: str = 'Faker Type' + nan_check: Series = df[column].isna() + df[column][nan_check]: Series = df['Data Type'][nan_check].apply( + lambda x: data_type_mapping[x] + ) + # 'Faker kwargs' + column: str = 'Faker kwargs' + if column not in df.columns: + df[column] = DEFAULT_FAKER_KWARGS + nan_check: Series = df[column].isna() + df[column][nan_check] = DEFAULT_FAKER_KWARGS + + return df + + +def get_fake_schema(CONFIG: dict) -> dict[str, str]: + # Controls + RAW_SCHEMA_PATH: Path = DIR_PATH / 'input' / CONFIG['input'] + NULL_NULLABLES: bool = CONFIG['null_nullables'] + # Load the raw schema + df: DataFrame = read_csv(RAW_SCHEMA_PATH) + # Clean input + df = clean_input(df) + df.to_csv(RAW_SCHEMA_PATH, index=False) + # Null check + nullable_key: str = 'Nullable' + if NULL_NULLABLES and nullable_key in df: + # Find where nullable + df[nullable_key]: Series = df[nullable_key] == 'Y' + # Define faker types to null + df['Faker Type'][df['Nullable'].isin([True])] = 'pyobject' + + # Build faker-schema dict + faker_schema: list[tuple] = [] + for _, row in df.iterrows(): + kwargs = loads(row['Faker kwargs']) + faker_schema.append( + (row['Source Fields'], (row['Faker Type'], kwargs)) + ) + # Convert to dict + faker_schema: dict[str, str] = dict(faker_schema) + + return faker_schema + + +def create_fake_data(config_path: Path = CONFIG_PATH) -> None: + # Config + with open(config_path, 'r') as f: + CONFIG: dict[str] = load(f) + DATA_OUTPUT_PATH: Path = DIR_PATH / 'output' / CONFIG['output'] + NUMBER_OF_ROWS: int = CONFIG['number_of_rows'] + # Get faker-schema schema + schema: dict[str, str] = get_fake_schema(CONFIG) + # Produce fake data + faker: FakerSchema = FakerSchema() + data: list[dict] = faker.generate_fake(schema, iterations=NUMBER_OF_ROWS) + # Build output DataFrame/CSV + _data: dict + frames: list[DataFrame] = [] + for _data in data: + # Format to data + _df: DataFrame = DataFrame( + [list(_data.values())], + columns=list(_data.keys()) + ) + # Cache + frames.append(_df) + # Concat Frames + df: DataFrame = concat(frames) + del data, frames + # Format output + df.reset_index(drop=True, inplace=True) + # Save + df.to_csv(DATA_OUTPUT_PATH, index=False) + + +def main() -> None: + create_fake_data() + pass + + +if __name__ == "__main__": + main() From 5634b446aaff00a47e5330fc5e89be63f84eac31 Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:21:56 +0100 Subject: [PATCH 09/12] Script to load schema and data and dynamically built SQLite class and load to a database. --- scripts/create_sql_table.py | 108 ++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 scripts/create_sql_table.py diff --git a/scripts/create_sql_table.py b/scripts/create_sql_table.py new file mode 100644 index 0000000..87632e9 --- /dev/null +++ b/scripts/create_sql_table.py @@ -0,0 +1,108 @@ +# Imports +from json import load +from pandas import DataFrame, read_csv, Series +from sqlalchemy import create_engine, sql +from sqlalchemy import Column, MetaData +from sqlalchemy.orm import declarative_base, sessionmaker + +from backend.constants import CONFIG_PATH, DIR_PATH, SQLTYPE_MAPPING + +# Variables +# Config +with open(CONFIG_PATH, 'r') as f: + CONFIG: dict[str] = load(f) +TABLE_NAME: str = CONFIG['table_name'] +SQL_LITE_ENGINE_ADDRESS: str = f'sqlite:///data-store-15-9-23.db' + + +# Functions +def build_column(datatype: str, nullable: str, primary_key: bool = False, autoincrement: bool = False) -> Column: + # Format inputs + nullable: bool = True if nullable == 'Y' else False + sqltype: object = SQLTYPE_MAPPING[datatype] + # Build columns + column: Column = Column( + sqltype, + nullable=nullable, + primary_key=primary_key, + autoincrement=autoincrement + ) + + return column + + +def build_table_inputs() -> dict: + class_inputs: dict + # Load metadata + df_schema: DataFrame = read_csv(DIR_PATH / 'input' / CONFIG['input']) + # Construct table input + index: int + row: Series + columns: list[tuple[str, Column]] = [] + for index, row in df_schema.iterrows(): + primary_key: bool = True if index == 0 else False + col: Column = build_column(row['Data Type'], row['Nullable'], primary_key=primary_key) + columns.append( + (row['Source Fields'], col) + ) + + class_inputs: dict = dict(columns) + + return class_inputs + + +def create_sql_table() -> None: + # Create a Session + engine = create_engine(SQL_LITE_ENGINE_ADDRESS) + Session = sessionmaker(bind=engine) + session = Session() + Base = declarative_base() + + # Create table + class_inputs: dict = build_table_inputs() + class_inputs['__tablename__'] = TABLE_NAME + Test = type(TABLE_NAME, (Base,), class_inputs) + # Create the table if it doesn't exist + Base.metadata.create_all(engine) + # # Template: empty the table + # rows_to_delete = session.query(Test).all() + # for row in rows_to_delete: + # session.delete(row) + + # Pass table object to sqlalchemy session + session.query(Test).all() + metadata = MetaData() # bind=self.engine) + metadata.reflect(bind=engine) + + # Set up SQL connection and print example queries + connection = engine.connect() + + # Add data to table + df_data: DataFrame = read_csv(DIR_PATH / 'output' / CONFIG['output']) + df_data.to_sql(TABLE_NAME, con=connection, if_exists='replace') + + # Print queries + columns: list[str] = [x.name for x in metadata.tables[TABLE_NAME].columns.values()] + analytics = connection.execute( + sql.text( + f"SELECT * FROM {TABLE_NAME}" + ) + ).fetchall() # Returns a list of rows without columns names + + print(columns) + print(analytics) + + # Commit the changes and close the session + session.commit() + session.close() + + pass + + +def main() -> None: + create_sql_table() + pass + + +if __name__ == "__main__": + main() From 68ba75bd4e8f5aecbc4e658958af06bacceec527 Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:22:34 +0100 Subject: [PATCH 10/12] Updated to run create_fake_data and create_sql_table --- scripts/main.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/scripts/main.py b/scripts/main.py index 5447643..a7df3f9 100644 --- a/scripts/main.py +++ b/scripts/main.py @@ -1,22 +1,13 @@ # Imports -from pathlib import Path - -from backend import TimeIt, get_logger - +from create_fake_data import create_fake_data +from create_sql_table import create_sql_table # Variables -logger = get_logger(Path(__file__).name) -# Functions and classes -@TimeIt +# Functions def main() -> None: - logger.info("Started main!") - logger.debug("This is a debug message") - logger.info("This is an info message") - logger.warning("This is a warning message") - logger.error("This is an error message") - logger.critical("This is a critical message") - logger.info("Completed main!") + create_fake_data() + create_sql_table() pass From c1bae08edaa84f8f17b987437a7524e489ab971c Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:24:30 +0100 Subject: [PATCH 11/12] init file for scripts folder as they are currently importing from each other. --- scripts/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 scripts/__init__.py diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 From b8712176328c4c1bff502c8a28ea34e2db1b0e00 Mon Sep 17 00:00:00 2001 From: Daljeet Gahle Date: Tue, 17 Oct 2023 10:26:12 +0100 Subject: [PATCH 12/12] Added tmp folder to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2f9a36e..217e0ba 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ config.json # Folders input output +tmp # Pycharm .idea