diff --git a/.gitignore b/.gitignore
index 3e46839..217e0ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,11 @@
 # Files
 config.json
 **/*tmp.*
+*.db

 # Folders
 input
 output
+tmp

 # Pycharm
 .idea
diff --git a/backend/__init__.py b/backend/__init__.py
index b842064..e69de29 100644
--- a/backend/__init__.py
+++ b/backend/__init__.py
@@ -1,2 +0,0 @@
-from .logger import get_logger
-from .time import TimeIt
diff --git a/backend/constants.py b/backend/constants.py
new file mode 100644
index 0000000..6044328
--- /dev/null
+++ b/backend/constants.py
@@ -0,0 +1,44 @@
+# Imports
+from pathlib import Path
+from sqlalchemy import BIGINT, CHAR, DATETIME, DECIMAL, NVARCHAR, TIME
+from sqlalchemy import Date, Integer, Numeric, SmallInteger, String
+
+# Paths
+DIR_PATH: Path = Path(__file__).parent.parent
+CONFIG_PATH: Path = DIR_PATH / 'config.json'
+INPUT_PATH: Path = DIR_PATH / 'input'
+OUTPUT_PATH: Path = DIR_PATH / 'output'
+
+# Mapping: raw SQL type name -> Faker provider name
+data_type_mapping: dict[str, str] = dict(
+    CHAR='pystr',
+    VARCHAR='pystr',
+    SMALLINT='pyint',
+    DATETIME='date_time',
+    BIGINT='pyint',
+    DECIMAL='pyfloat',
+    INT='pyint',
+    TINYINT='pyint',
+    DATE='date',
+    NVARCHAR='pystr',
+    NUMERIC='pyint',
+    SMALLDATETIME='date_time',
+    UNIQUEIDENTIFIER='ean',
+)
+
+# Mapping: raw SQL type name -> SQLAlchemy column type
+SQLTYPE_MAPPING: dict[str, object] = dict(
+    BIGINT=BIGINT,
+    BIT=Integer,
+    CHAR=CHAR,
+    DATE=Date,
+    DATETIME=DATETIME,
+    DATETIME2=DATETIME,
+    DECIMAL=DECIMAL,
+    INT=Integer,
+    NUMERIC=Numeric,
+    NVARCHAR=NVARCHAR,
+    SMALLINT=SmallInteger,
+    TIME=TIME,
+    TINYINT=Integer,
+    VARCHAR=String,
+)
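A note on how these two mappings are consumed downstream — a minimal sketch, assuming a raw type string as it would appear in the schema CSV (the variable names here are illustrative only, not part of the diff):

    from backend.constants import data_type_mapping, SQLTYPE_MAPPING

    # Normalise a raw SQL type the same way clean_input() does below:
    # upper-case it, then strip any length suffix such as '(50)'.
    raw_type = 'varchar(50)'
    base_type = raw_type.upper().split('(')[0]       # -> 'VARCHAR'

    faker_provider = data_type_mapping[base_type]    # -> 'pystr'
    sqlalchemy_type = SQLTYPE_MAPPING[base_type]     # -> sqlalchemy String
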
diff --git a/backend/faker_schema.py b/backend/faker_schema.py
new file mode 100644
index 0000000..522a648
--- /dev/null
+++ b/backend/faker_schema.py
@@ -0,0 +1,33 @@
+from faker import Faker
+
+
+class FakerSchema:
+
+    def __init__(self, faker=None, locale=None, providers=None, includes=None):
+        self._faker = faker or Faker(locale=locale, providers=providers, includes=includes)
+
+    def generate_fake(self, schema, iterations=1):
+        # Note: returns a single dict when iterations == 1, otherwise a list of dicts
+        result = [self._generate_one_fake(schema) for _ in range(iterations)]
+        return result[0] if len(result) == 1 else result
+
+    def _generate_one_fake(self, schema):
+        """
+        Recursively traverse the schema dictionary and, for each leaf node,
+        evaluate the fake value. Schema values are (provider, kwargs) tuples.
+
+        Implementation, for each key-value pair:
+        1) If the value is not a dict or list, call the named Faker provider (base case)
+        2) If the value is a dictionary, recurse
+        3) If the value is a list, recurse over each item
+        """
+        data = {}
+        for k, (v, kwargs) in schema.items():
+            if isinstance(v, dict):
+                data[k] = self._generate_one_fake(v)
+            elif isinstance(v, list):
+                data[k] = [self._generate_one_fake(item) for item in v]
+            else:
+                data[k] = getattr(self._faker, v)(**kwargs)
+
+        return data
diff --git a/metadata/config.json b/metadata/config.json
index a42bd19..11ea5ef 100644
--- a/metadata/config.json
+++ b/metadata/config.json
@@ -1,4 +1,6 @@
 {
-  "output_path": null,
-  "report_name": "test_run"
+  "input": "raw-faker-schema.csv",
+  "output": "raw-synthetic-data.csv",
+  "null_nullables": false,
+  "number_of_rows": 10
 }
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b23fc69..7998be1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,13 @@
-black==23.3.0
-isort==5.12.0
-pytest==7.1.2
-tqdm==4.64.0
\ No newline at end of file
+colorama==0.4.6
+et-xmlfile==1.1.0
+Faker==19.6.2
+greenlet==2.0.2
+numpy==1.26.0
+openpyxl==3.1.2
+pandas==1.5.3
+python-dateutil==2.8.2
+pytz==2023.3.post1
+six==1.16.0
+SQLAlchemy==2.0.21
+tqdm==4.64.0
+typing_extensions==4.8.0
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
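Before the script that drives it, a quick usage sketch of the FakerSchema class defined above — schema values are (provider, kwargs) tuples; the field names and kwargs here are invented for illustration:

    from backend.faker_schema import FakerSchema

    schema = {
        'customer_name': ('pystr', {}),                  # provider with no kwargs
        'order_total': ('pyfloat', {'positive': True}),  # kwargs forwarded to the provider
    }

    faker = FakerSchema()
    rows = faker.generate_fake(schema, iterations=3)  # list of 3 dicts
    row = faker.generate_fake(schema)                 # single dict when iterations == 1
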
diff --git a/scripts/create_fake_data.py b/scripts/create_fake_data.py
new file mode 100644
index 0000000..c53466a
--- /dev/null
+++ b/scripts/create_fake_data.py
@@ -0,0 +1,110 @@
+# Imports
+from json import load, loads
+from pathlib import Path
+
+from pandas import DataFrame, read_csv, Series
+
+from backend.constants import CONFIG_PATH, data_type_mapping, DIR_PATH
+from backend.faker_schema import FakerSchema
+
+# Variables
+DEFAULT_DATA_TYPE: str = 'VARCHAR'
+DEFAULT_NULLABLE: str = 'N'
+DEFAULT_FAKER_KWARGS: str = "{}"
+
+
+# Functions and classes
+def clean_input(df: DataFrame) -> DataFrame:
+    # 'Data Type': default missing values, then normalise e.g. 'varchar(50)' -> 'VARCHAR'
+    column: str = 'Data Type'
+    nan_check: Series = df[column].isna()
+    df.loc[nan_check, column] = DEFAULT_DATA_TYPE
+    df[column] = df[column].apply(lambda x: x.upper())
+    df[column] = df[column].apply(lambda x: x.split('(')[0])
+    # 'Nullable': default missing values
+    column = 'Nullable'
+    nan_check = df[column].isna()
+    df.loc[nan_check, column] = DEFAULT_NULLABLE
+    # 'Faker Type': infer from 'Data Type' where missing
+    column = 'Faker Type'
+    nan_check = df[column].isna()
+    df.loc[nan_check, column] = df.loc[nan_check, 'Data Type'].apply(
+        lambda x: data_type_mapping[x]
+    )
+    # 'Faker kwargs': default missing values to an empty JSON object
+    column = 'Faker kwargs'
+    if column not in df.columns:
+        df[column] = DEFAULT_FAKER_KWARGS
+    nan_check = df[column].isna()
+    df.loc[nan_check, column] = DEFAULT_FAKER_KWARGS
+
+    return df
+
+
+def get_fake_schema(CONFIG: dict) -> dict[str, tuple]:
+    # Controls
+    RAW_SCHEMA_PATH: Path = DIR_PATH / 'input' / CONFIG['input']
+    NULL_NULLABLES: bool = CONFIG['null_nullables']
+    # Load the raw schema
+    df: DataFrame = read_csv(RAW_SCHEMA_PATH)
+    # Clean input and write the normalised schema back to the input file
+    df = clean_input(df)
+    df.to_csv(RAW_SCHEMA_PATH, index=False)
+    # Null check
+    nullable_key: str = 'Nullable'
+    if NULL_NULLABLES and nullable_key in df:
+        # Find where nullable
+        df[nullable_key] = df[nullable_key] == 'Y'
+        # Null out the faker type for nullable fields ('pyobject' yields None)
+        df.loc[df[nullable_key], 'Faker Type'] = 'pyobject'
+
+    # Build the faker-schema dict: field name -> (provider, kwargs)
+    faker_schema: dict[str, tuple] = {}
+    for _, row in df.iterrows():
+        kwargs = loads(row['Faker kwargs'])
+        faker_schema[row['Source Fields']] = (row['Faker Type'], kwargs)
+
+    return faker_schema
+
+
+def create_fake_data(config_path: Path = CONFIG_PATH) -> None:
+    # Config
+    with open(config_path, 'r') as f:
+        CONFIG: dict = load(f)
+    DATA_OUTPUT_PATH: Path = DIR_PATH / 'output' / CONFIG['output']
+    NUMBER_OF_ROWS: int = CONFIG['number_of_rows']
+    # Get the faker schema
+    schema: dict[str, tuple] = get_fake_schema(CONFIG)
+    # Produce fake data (a single dict when NUMBER_OF_ROWS == 1, else a list of dicts)
+    faker: FakerSchema = FakerSchema()
+    data = faker.generate_fake(schema, iterations=NUMBER_OF_ROWS)
+    if isinstance(data, dict):
+        data = [data]
+    # Build the output DataFrame; one row per generated dict
+    df: DataFrame = DataFrame(data)
+    # Save
+    df.to_csv(DATA_OUTPUT_PATH, index=False)
+
+
+def main() -> None:
+    create_fake_data()
+
+
+if __name__ == "__main__":
+    main()
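To make the CSV-to-schema flow concrete, here is a hedged sketch of an input file and the schema dict get_fake_schema() would build from it — the field names and rows are invented; only the column headers come from the code above:

    # raw-faker-schema.csv — columns read by clean_input()/get_fake_schema():
    #
    #   Source Fields,Data Type,Nullable,Faker Type,Faker kwargs
    #   customer_id,BIGINT,N,,
    #   customer_name,VARCHAR(50),Y,,"{""max_chars"": 50}"
    #
    # After clean_input() fills 'Faker Type' from data_type_mapping:
    schema = {
        'customer_id': ('pyint', {}),
        'customer_name': ('pystr', {'max_chars': 50}),
    }
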
diff --git a/scripts/create_sql_table.py b/scripts/create_sql_table.py
new file mode 100644
index 0000000..87632e9
--- /dev/null
+++ b/scripts/create_sql_table.py
@@ -0,0 +1,108 @@
+# Imports
+from json import load
+
+from pandas import DataFrame, read_csv, Series
+from sqlalchemy import Column, MetaData, create_engine, sql
+from sqlalchemy.orm import declarative_base, sessionmaker
+
+from backend.constants import CONFIG_PATH, DIR_PATH, SQLTYPE_MAPPING
+
+# Variables
+# Config
+with open(CONFIG_PATH, 'r') as f:
+    CONFIG: dict = load(f)
+# config.json in this change does not define 'table_name'; fall back to a default
+TABLE_NAME: str = CONFIG.get('table_name', 'synthetic_data')
+SQL_LITE_ENGINE_ADDRESS: str = 'sqlite:///data-store-15-9-23.db'
+
+
+# Functions
+def build_column(datatype: str, nullable: str, primary_key: bool = False, autoincrement: bool = False) -> Column:
+    # Format inputs
+    is_nullable: bool = nullable == 'Y'
+    sqltype: object = SQLTYPE_MAPPING[datatype]
+    # Build the column
+    column: Column = Column(
+        sqltype,
+        nullable=is_nullable,
+        primary_key=primary_key,
+        autoincrement=autoincrement
+    )
+
+    return column
+
+
+def build_table_inputs() -> dict:
+    # Load metadata
+    df_schema: DataFrame = read_csv(DIR_PATH / 'input' / CONFIG['input'])
+    # Construct the table input: treat the first field as the primary key
+    index: int
+    row: Series
+    columns: list[tuple[str, Column]] = []
+    for index, row in df_schema.iterrows():
+        primary_key: bool = index == 0
+        col: Column = build_column(row['Data Type'], row['Nullable'], primary_key=primary_key)
+        columns.append(
+            (row['Source Fields'], col)
+        )
+
+    class_inputs: dict = dict(columns)
+
+    return class_inputs
+
+
+def create_sql_table() -> None:
+    # Create a session
+    engine = create_engine(SQL_LITE_ENGINE_ADDRESS)
+    Session = sessionmaker(bind=engine)
+    session = Session()
+    Base = declarative_base()
+
+    # Build the mapped class dynamically from the schema CSV
+    class_inputs: dict = build_table_inputs()
+    class_inputs['__tablename__'] = TABLE_NAME
+    Test = type(TABLE_NAME, (Base,), class_inputs)
+    # Create the table if it doesn't exist
+    Base.metadata.create_all(engine)
+    # # Template: empty the table
+    # rows_to_delete = session.query(Test).all()
+    # for row in rows_to_delete:
+    #     session.delete(row)
+
+    # Pass the table object to the sqlalchemy session, then reflect it back
+    session.query(Test).all()
+    metadata = MetaData()
+    metadata.reflect(bind=engine)
+
+    # Set up the SQL connection
+    connection = engine.connect()
+
+    # Add data to the table; note that if_exists='replace' drops and recreates
+    # the table just built by create_all()
+    df_data: DataFrame = read_csv(DIR_PATH / 'output' / CONFIG['output'])
+    df_data.to_sql(TABLE_NAME, con=connection, if_exists='replace')
+
+    # Print example queries
+    columns: list[str] = [x.name for x in metadata.tables[TABLE_NAME].columns.values()]
+    analytics = connection.execute(
+        sql.text(
+            f"SELECT * FROM {TABLE_NAME}"
+        )
+    ).fetchall()  # Returns a list of rows without column names
+
+    print(columns)
+    print(analytics)
+
+    # Commit the changes and close the session
+    session.commit()
+    session.close()
+
+
+def main() -> None:
+    create_sql_table()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/main.py b/scripts/main.py
index 5447643..a7df3f9 100644
--- a/scripts/main.py
+++ b/scripts/main.py
@@ -1,22 +1,13 @@
 # Imports
-from pathlib import Path
-
-from backend import TimeIt, get_logger
-
+from create_fake_data import create_fake_data
+from create_sql_table import create_sql_table

 # Variables
-logger = get_logger(Path(__file__).name)


-# Functions and classes
-@TimeIt
+# Functions
 def main() -> None:
-    logger.info("Started main!")
-    logger.debug("This is a debug message")
-    logger.info("This is an info message")
-    logger.warning("This is a warning message")
-    logger.error("This is an error message")
-    logger.critical("This is a critical message")
-    logger.info("Completed main!")
+    create_fake_data()
+    create_sql_table()
     pass
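Finally, the `type(TABLE_NAME, (Base,), class_inputs)` call in create_sql_table.py is the load-bearing trick: it builds a SQLAlchemy declarative model from a dict assembled at runtime. A minimal self-contained sketch, with an invented table name and an in-memory database so it stays side-effect free:

    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    # Equivalent to hand-writing `class DemoTable(Base): ...`, except the
    # attributes arrive as a dict built at runtime (hard-coded here).
    class_inputs = {
        '__tablename__': 'demo_table',
        'id': Column(Integer, primary_key=True),
        'name': Column(String, nullable=True),
    }
    DemoTable = type('DemoTable', (Base,), class_inputs)

    # An in-memory SQLite database keeps the example self-contained
    engine = create_engine('sqlite:///:memory:')
    Base.metadata.create_all(engine)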