
initial codes #4

Draft · wants to merge 12 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,9 +2,11 @@
 # Files
 config.json
 **/*tmp.*
+*.db
 # Folders
 input
 output
+tmp
 
 # Pycharm
 .idea
2 changes: 0 additions & 2 deletions backend/__init__.py
@@ -1,2 +0,0 @@
-from .logger import get_logger
-from .time import TimeIt
44 changes: 44 additions & 0 deletions backend/constants.py
@@ -0,0 +1,44 @@
# Imports
from pathlib import Path
from sqlalchemy import BIGINT, CHAR, DATETIME, DECIMAL, NVARCHAR, TIME
from sqlalchemy import Date, Integer, Numeric, SmallInteger, String

# Paths
DIR_PATH: Path = Path(__file__).parent.parent
CONFIG_PATH: Path = DIR_PATH / 'config.json'
INPUT_PATH: Path = DIR_PATH / 'input'
OUTPUT_PATH: Path = DIR_PATH / 'output'

# Mappings
# Raw SQL type name -> Faker provider name
data_type_mapping: dict[str, str] = dict(
    CHAR='pystr',
    VARCHAR='pystr',
    SMALLINT='pyint',
    DATETIME='date_time',
    BIGINT='pyint',
    DECIMAL='pyfloat',
    INT='pyint',
    TINYINT='pyint',
    DATE='date',
    NVARCHAR='pystr',
    NUMERIC='pyint',
    SMALLDATETIME='date_time',
    UNIQUEIDENTIFIER='ean',
)

# Raw SQL type name -> SQLAlchemy column type
SQLTYPE_MAPPING: dict[str, object] = dict(
    BIGINT=BIGINT,
    BIT=Integer,
    CHAR=CHAR,
    DATE=Date,
    DATETIME=DATETIME,
    DATETIME2=DATETIME,
    DECIMAL=DECIMAL,
    INT=Integer,
    NUMERIC=Numeric,
    NVARCHAR=NVARCHAR,
    SMALLINT=SmallInteger,
    TIME=TIME,
    TINYINT=Integer,
    VARCHAR=String,
)
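For orientation, a minimal sketch (not part of the diff) of how these two mappings are meant to be consumed; the 'NVARCHAR(50)' input string is invented for illustration:

# Sketch only: resolve one raw SQL column type through both mappings.
from faker import Faker

from backend.constants import data_type_mapping, SQLTYPE_MAPPING

raw = 'NVARCHAR(50)'.upper().split('(')[0]  # normalise -> 'NVARCHAR'
provider = data_type_mapping[raw]           # -> 'pystr'
print(getattr(Faker(), provider)())         # random string from Faker
print(SQLTYPE_MAPPING[raw])                 # SQLAlchemy NVARCHAR type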
33 changes: 33 additions & 0 deletions backend/faker_schema.py
@@ -0,0 +1,33 @@
from faker import Faker


class FakerSchema:

    def __init__(self, faker=None, locale=None, providers=None, includes=None):
        self._faker = faker or Faker(locale=locale, providers=providers, includes=includes)

    def generate_fake(self, schema, iterations=1):
        result = [self._generate_one_fake(schema) for _ in range(iterations)]
        return result[0] if len(result) == 1 else result

    def _generate_one_fake(self, schema):
        """
        Recursively traverse the schema dictionary and, for each leaf node,
        evaluate the fake value.

        Implementation:
        For each key-value pair, where the value is a (spec, kwargs) tuple:
        1) If the spec is neither a dict nor a list, call the named Faker
           provider with the given kwargs (base case)
        2) If the spec is a dictionary, recurse
        3) If the spec is a list, recurse over each item
        """
        data = {}
        for k, (v, kwargs) in schema.items():
            if isinstance(v, dict):
                data[k] = self._generate_one_fake(v)
            elif isinstance(v, list):
                data[k] = [self._generate_one_fake(item) for item in v]
            else:
                data[k] = getattr(self._faker, v)(**kwargs)

        return data
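A quick usage sketch for the class above; the schema shape (field name -> (provider name, kwargs)) mirrors what get_fake_schema builds in scripts/create_fake_data.py, and the field names here are invented:

# Illustrative only: field names and kwargs are made up for the example.
from backend.faker_schema import FakerSchema

schema = {
    'CustomerName': ('pystr', {'max_chars': 10}),
    'CreatedOn': ('date_time', {}),
    'OrderCount': ('pyint', {'min_value': 0, 'max_value': 100}),
}
rows = FakerSchema().generate_fake(schema, iterations=3)  # list of 3 dicts
print(rows[0])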
6 changes: 4 additions & 2 deletions metadata/config.json
@@ -1,4 +1,6 @@
 {
-    "output_path": null,
-    "report_name": "test_run"
+    "input": "raw-faker-schema.csv",
+    "output": "raw-synthetic-data.csv",
+    "null_nullables": false,
+    "number_of_rows": 10
 }
17 changes: 13 additions & 4 deletions requirements.txt
@@ -1,4 +1,13 @@
-black==23.3.0
-isort==5.12.0
-pytest==7.1.2
-tqdm==4.64.0
+colorama==0.4.6
+et-xmlfile==1.1.0
+Faker==19.6.2
+greenlet==2.0.2
+numpy==1.26.0
+openpyxl==3.1.2
+pandas==1.5.3
+python-dateutil==2.8.2
+pytz==2023.3.post1
+six==1.16.0
+SQLAlchemy==2.0.21
+tqdm==4.64.0
+typing_extensions==4.8.0
Empty file added scripts/__init__.py
110 changes: 110 additions & 0 deletions scripts/create_fake_data.py
@@ -0,0 +1,110 @@
# Imports
from json import load, loads
from pandas import concat, DataFrame, read_csv, Series
from pathlib import Path

from backend.constants import CONFIG_PATH, data_type_mapping, DIR_PATH
from backend.faker_schema import FakerSchema

# Variables
DEFAULT_DATA_TYPE: str = 'VARCHAR'
DEFAULT_NULLABLE: str = 'N'
DEFAULT_FAKER_KWARGS: str = "{}"


# Functions and classes
def clean_input(df: DataFrame) -> DataFrame:
    # 'Data Type': default missing values, then normalise e.g. 'nvarchar(50)' -> 'NVARCHAR'
    column: str = 'Data Type'
    nan_check: Series = df[column].isna()
    df.loc[nan_check, column] = DEFAULT_DATA_TYPE
    df[column] = df[column].str.upper().str.split('(').str[0]
    # 'Nullable': default missing values
    column = 'Nullable'
    nan_check = df[column].isna()
    df.loc[nan_check, column] = DEFAULT_NULLABLE
    # 'Faker Type': derive missing values from the SQL data type
    column = 'Faker Type'
    nan_check = df[column].isna()
    df.loc[nan_check, column] = df.loc[nan_check, 'Data Type'].map(data_type_mapping)
    # 'Faker kwargs': default missing values to an empty JSON object
    column = 'Faker kwargs'
    if column not in df.columns:
        df[column] = DEFAULT_FAKER_KWARGS
    nan_check = df[column].isna()
    df.loc[nan_check, column] = DEFAULT_FAKER_KWARGS

    return df


def get_fake_schema(CONFIG: dict) -> dict[str, tuple]:
    # Controls
    RAW_SCHEMA_PATH: Path = DIR_PATH / 'input' / CONFIG['input']
    NULL_NULLABLES: bool = CONFIG['null_nullables']
    # Load the raw schema
    df: DataFrame = read_csv(RAW_SCHEMA_PATH)
    # Clean input and write the cleaned schema back
    df = clean_input(df)
    df.to_csv(RAW_SCHEMA_PATH, index=False)
    # Null check
    nullable_key: str = 'Nullable'
    if NULL_NULLABLES and nullable_key in df:
        # Find where nullable
        df[nullable_key] = df[nullable_key] == 'Y'
        # Null the nullable fields: 'pyobject' with no kwargs yields None
        df.loc[df[nullable_key], 'Faker Type'] = 'pyobject'

    # Build the faker-schema pairs: field name -> (provider name, kwargs)
    schema_pairs: list[tuple] = []
    for _, row in df.iterrows():
        kwargs = loads(row['Faker kwargs'])
        schema_pairs.append(
            (row['Source Fields'], (row['Faker Type'], kwargs))
        )
    # Convert to dict
    faker_schema: dict[str, tuple] = dict(schema_pairs)

    return faker_schema


def create_fake_data(config_path: Path = CONFIG_PATH) -> None:
    # Config
    with open(config_path, 'r') as f:
        CONFIG: dict = load(f)
    DATA_OUTPUT_PATH: Path = DIR_PATH / 'output' / CONFIG['output']
    NUMBER_OF_ROWS: int = CONFIG['number_of_rows']
    # Get faker-schema schema
    schema: dict[str, tuple] = get_fake_schema(CONFIG)
    # Produce fake data
    faker: FakerSchema = FakerSchema()
    data: list[dict] = faker.generate_fake(schema, iterations=NUMBER_OF_ROWS)
    # generate_fake returns a single dict when iterations == 1
    if isinstance(data, dict):
        data = [data]
    # Build output DataFrame/CSV: one single-row frame per record
    frames: list[DataFrame] = []
    for _data in data:
        frames.append(
            DataFrame([list(_data.values())], columns=list(_data.keys()))
        )
    # Concat frames
    df: DataFrame = concat(frames)
    del data, frames
    # Format output
    df.reset_index(drop=True, inplace=True)
    # Save
    df.to_csv(DATA_OUTPUT_PATH, index=False)


def main() -> None:
    create_fake_data()


if __name__ == "__main__":
    main()
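For reference, a hypothetical input/raw-faker-schema.csv consistent with the columns this script reads ('Source Fields', 'Data Type', 'Nullable', 'Faker Type', 'Faker kwargs'); the rows are invented, and blank 'Faker Type' cells are filled from data_type_mapping by clean_input:

Source Fields,Data Type,Nullable,Faker Type,Faker kwargs
CustomerId,BIGINT,N,,{}
CustomerName,NVARCHAR(50),Y,,{}
CreatedOn,DATETIME,N,,{}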
108 changes: 108 additions & 0 deletions scripts/create_sql_table.py
@@ -0,0 +1,108 @@
# Imports
from json import load
from pandas import DataFrame, read_csv, Series
from sqlalchemy import create_engine, sql
from sqlalchemy import Column, MetaData
from sqlalchemy.orm import declarative_base, sessionmaker

from backend.constants import CONFIG_PATH, DIR_PATH, SQLTYPE_MAPPING

# Variables
# Config
with open(CONFIG_PATH, 'r') as f:
    CONFIG: dict = load(f)
TABLE_NAME: str = CONFIG['table_name']
SQL_LITE_ENGINE_ADDRESS: str = 'sqlite:///data-store-15-9-23.db'


# Functions
def build_column(datatype: str, nullable: str, primary_key: bool = False, autoincrement: bool = False) -> Column:
    # Format inputs
    is_nullable: bool = nullable == 'Y'
    sqltype: object = SQLTYPE_MAPPING[datatype]
    # Build column
    column: Column = Column(
        sqltype,
        nullable=is_nullable,
        primary_key=primary_key,
        autoincrement=autoincrement
    )

    return column


def build_table_inputs() -> dict:
    # Load metadata
    df_schema: DataFrame = read_csv(DIR_PATH / 'input' / CONFIG['input'])
    # Construct the table inputs: attribute name -> Column, first column as primary key
    row: Series
    columns: list[tuple[str, Column]] = []
    for index, row in df_schema.iterrows():
        primary_key: bool = index == 0
        col: Column = build_column(row['Data Type'], row['Nullable'], primary_key=primary_key)
        columns.append(
            (row['Source Fields'], col)
        )

    class_inputs: dict = dict(columns)

    return class_inputs


def create_sql_table() -> None:
    # Create a session
    engine = create_engine(SQL_LITE_ENGINE_ADDRESS)
    Session = sessionmaker(bind=engine)
    session = Session()
    Base = declarative_base()

    # Build the declarative model dynamically from the schema CSV
    class_inputs: dict = build_table_inputs()
    class_inputs['__tablename__'] = TABLE_NAME
    Test = type(TABLE_NAME, (Base,), class_inputs)
    # Create the table if it doesn't exist
    Base.metadata.create_all(engine)
    # # Template: empty the table
    # rows_to_delete = session.query(Test).all()
    # for row in rows_to_delete:
    #     session.delete(row)

    # Pass the table object to the sqlalchemy session
    session.query(Test).all()
    # Reflect the created table so its columns can be listed below
    metadata = MetaData()
    metadata.reflect(bind=engine)

    # Set up the SQL connection
    connection = engine.connect()

    # Add data to the table; note that if_exists='replace' drops and recreates it
    df_data: DataFrame = read_csv(DIR_PATH / 'output' / CONFIG['output'])
    df_data.to_sql(TABLE_NAME, con=connection, if_exists='replace')

    # Print example queries
    columns: list[str] = [x.name for x in metadata.tables[TABLE_NAME].columns.values()]
    analytics = connection.execute(
        sql.text(
            f"SELECT * FROM {TABLE_NAME}"
        )
    ).fetchall()  # Returns a list of rows without column names

    print(columns)
    print(analytics)

    # Commit the changes and close the session
    session.commit()
    session.close()


def main() -> None:
    create_sql_table()


if __name__ == "__main__":
    main()
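The type(TABLE_NAME, (Base,), class_inputs) call builds the declarative model at runtime; for a two-column schema it is equivalent to a hand-written class along these lines (names here are invented for illustration):

# Hypothetical static equivalent of the dynamically built model.
from sqlalchemy import BIGINT, Column, NVARCHAR
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Test(Base):
    __tablename__ = 'Test'
    CustomerId = Column(BIGINT, primary_key=True, nullable=False)
    CustomerName = Column(NVARCHAR, nullable=True)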
19 changes: 5 additions & 14 deletions scripts/main.py
@@ -1,22 +1,13 @@
 # Imports
-from pathlib import Path
-
-from backend import TimeIt, get_logger
-
+from create_fake_data import create_fake_data
+from create_sql_table import create_sql_table
-# Variables
-logger = get_logger(Path(__file__).name)
-
-
-# Functions and classes
-@TimeIt
+# Functions
 def main() -> None:
-    logger.info("Started main!")
-    logger.debug("This is a debug message")
-    logger.info("This is an info message")
-    logger.warning("This is a warning message")
-    logger.error("This is an error message")
-    logger.critical("This is a critical message")
-    logger.info("Completed main!")
+    create_fake_data()
+    create_sql_table()
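Assuming the repository root is on the import path (the backend package lives there), the pipeline can presumably be run end to end with something like `PYTHONPATH=. python scripts/main.py`, which generates the fake CSV and then loads it into the SQLite table.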

