From 5cf7fca8ff938fcd82f8a63ac8a055f98f2a3fe2 Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:14:07 +0100
Subject: [PATCH 01/12] Updated gitignore to ignore SQLite database files.

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 3e46839..2f9a36e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 # Files
 config.json
 **/*tmp.*
+*.db
 # Folders
 input
 output

From 26ad6ea1382ad625bb1b0f7ec5193f04f3e686bc Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:14:34 +0100
Subject: [PATCH 02/12] Add requirements for the package/repo.

---
 requirements.txt | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b23fc69..7998be1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,13 @@
-black==23.3.0
-isort==5.12.0
-pytest==7.1.2
-tqdm==4.64.0
\ No newline at end of file
+colorama==0.4.6
+et-xmlfile==1.1.0
+Faker==19.6.2
+greenlet==2.0.2
+numpy==1.26.0
+openpyxl==3.1.2
+pandas==1.5.3
+python-dateutil==2.8.2
+pytz==2023.3.post1
+six==1.16.0
+SQLAlchemy==2.0.21
+tqdm==4.64.0
+typing_extensions==4.8.0

From 35f425b4017c8e9d6996a894ff43229c1633782c Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:16:30 +0100
Subject: [PATCH 03/12] Added backend/faker_schema.py to translate a schema CSV
 to fake data using faker.

---
 backend/faker_schema.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 backend/faker_schema.py

diff --git a/backend/faker_schema.py b/backend/faker_schema.py
new file mode 100644
index 0000000..522a648
--- /dev/null
+++ b/backend/faker_schema.py
@@ -0,0 +1,33 @@
+from faker import Faker
+
+
+class FakerSchema(object):
+    """Generate fake records from a schema of ``{key: (faker_method_name, kwargs)}``."""
+    def __init__(self, faker=None, locale=None, providers=None, includes=None):
+        self._faker = faker or Faker(locale=locale, providers=providers, includes=includes)
+
+    def generate_fake(self, schema, iterations=1):
+        result = [self._generate_one_fake(schema) for _ in range(iterations)]
+        return result[0] if len(result) == 1 else result  # one record -> bare dict, else list
+
+    def _generate_one_fake(self, schema):
+        """
+        Build one fake record (a dict with the same keys as ``schema``).
+
+        Each schema value is unpacked as a 2-tuple ``(v, kwargs)``:
+        1) If ``v`` is a dictionary, recurse into it as a nested schema
+        2) If ``v`` is a list, recurse into each item as a nested schema
+        3) Otherwise (base case) ``v`` names an attribute of the Faker instance,
+           which is called with ``**kwargs`` to produce the value
+        ``kwargs`` is only used in the base case.
+        """
+        data = {}
+        for k, (v, kwargs) in schema.items():
+            if isinstance(v, dict):
+                data[k] = self._generate_one_fake(v)
+            elif isinstance(v, list):
+                data[k] = [self._generate_one_fake(item) for item in v]
+            else:
+                data[k] = getattr(self._faker, v)(**kwargs)
+
+        return data

From 713dc7def68a325f7309e532af4e64dced175396 Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:17:02 +0100
Subject: [PATCH 04/12] Constants and mappings used by backend/faker_schema.py

---
 backend/constants.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 backend/constants.py

diff --git a/backend/constants.py b/backend/constants.py
new file mode 100644
index 0000000..6044328
--- /dev/null
+++ b/backend/constants.py
@@ -0,0 +1,44 @@
+# Imports
+from pathlib import Path
+from sqlalchemy import BIGINT, CHAR, DATETIME, DECIMAL, NVARCHAR, TIME
+from sqlalchemy import Date, Integer, Numeric, SmallInteger, String
+
+# Paths (DIR_PATH is two levels up from this file: the parent of its directory)
+DIR_PATH: Path = Path(__file__).parent.parent
+CONFIG_PATH: Path = DIR_PATH / 'config.json'
+INPUT_PATH: Path = DIR_PATH / 'input'
+OUTPUT_PATH: Path = DIR_PATH / 'output'
+
+# Maps an upper-case SQL data type name to the name of the Faker method that fakes it
+data_type_mapping: dict[str, str] = dict(
+    CHAR='pystr',
+    VARCHAR='pystr',
+    SMALLINT='pyint',
+    DATETIME='date_time',
+    BIGINT='pyint',
+    DECIMAL='pyfloat',
+    INT='pyint',
+    TINYINT='pyint',
+    DATE='date',
+    NVARCHAR='pystr',
+    NUMERIC='pyint',
+    SMALLDATETIME='date_time',
+    UNIQUEIDENTIFIER='ean',
+)
+# SQL data type name -> SQLAlchemy type. NOTE(review): key set differs from data_type_mapping
+SQLTYPE_MAPPING: dict[str, object] = dict(
+    BIGINT=BIGINT,
+    BIT=Integer,
+    CHAR=CHAR,
+    DATE=Date,
+    DATETIME=DATETIME,
+    DATETIME2=DATETIME,
+    DECIMAL=DECIMAL,
+    INT=Integer,
+    NUMERIC=Numeric,
+    NVARCHAR=NVARCHAR,
+    SMALLINT=SmallInteger,
+    TIME=TIME,
+    TINYINT=Integer,
+    VARCHAR=String,
+)

From a18509f5b0c8ac417dfcc36593ef6e6a74f53154 Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:17:24 +0100
Subject: [PATCH 05/12] init file for backend package

---
 backend/__init__.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backend/__init__.py b/backend/__init__.py
index b842064..e69de29 100644
--- a/backend/__init__.py
+++ b/backend/__init__.py
@@ -1,2 +0,0 @@
-from .logger import get_logger
-from .time import TimeIt

From 1059ab130ab4c1692e0dc8f1a497962431643587 Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:18:44 +0100
Subject: [PATCH 06/12] Updated default config.

---
 metadata/config.json | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/metadata/config.json b/metadata/config.json
index a42bd19..9ef2e0d 100644
--- a/metadata/config.json
+++ b/metadata/config.json
@@ -1,4 +1,6 @@
 {
-  "output_path": null,
-  "report_name": "test_run"
+  "input": "ecds-raw-faker-schema.csv",
+  "output": "ecds-raw-fake-data.csv",
+  "null_nullables": false,
+  "number_of_rows": 10
 }
\ No newline at end of file

From d1ed27ec2a7552af4415bc66c9c612d365b0f004 Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:19:52 +0100
Subject: [PATCH 07/12] Corrected typos

---
 metadata/config.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/metadata/config.json b/metadata/config.json
index 9ef2e0d..11ea5ef 100644
--- a/metadata/config.json
+++ b/metadata/config.json
@@ -1,6 +1,6 @@
 {
-  "input": "ecds-raw-faker-schema.csv",
-  "output": "ecds-raw-fake-data.csv",
+  "input": "raw-faker-schema.csv",
+  "output": "raw-synthetic-data.csv",
   "null_nullables": false,
   "number_of_rows": 10
 }
\ No newline at end of file

From a3803225b9a12798a89d4a474e04809fd4bc937a Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:21:14 +0100
Subject: [PATCH 08/12] A script to create fake data CSV from a schema CSV.

---
 scripts/create_fake_data.py | 110 ++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 scripts/create_fake_data.py

diff --git a/scripts/create_fake_data.py b/scripts/create_fake_data.py
new file mode 100644
index 0000000..c53466a
--- /dev/null
+++ b/scripts/create_fake_data.py
@@ -0,0 +1,110 @@
+# Imports
+from backend.faker_schema import FakerSchema
+from json import load, loads
+from pandas import concat, DataFrame, isna, read_csv, Series
+from pathlib import Path
+
+from backend.constants import CONFIG_PATH, data_type_mapping, DIR_PATH
+
+# Defaults that clean_input() substitutes for missing schema cells
+DEFAULT_DATA_TYPE: str = 'VARCHAR'
+DEFAULT_NULLABLE: str = 'N'
+DEFAULT_FAKER_KWARGS: str = "{}"
+
+
+# Functions and classes
+def clean_input(df: DataFrame) -> DataFrame:
+    """Fill gaps in the schema frame with defaults; mutates and returns ``df``."""
+    # NOTE: `.loc[mask, column]` is used instead of `df[column][mask] = ...`;
+    # chained assignment can write to a temporary copy and be silently lost.
+    # 'Data Type': default when missing, then upper-case and drop any '(n)' suffix
+    column: str = 'Data Type'
+    nan_check: Series = df[column].isna()
+    df.loc[nan_check, column] = DEFAULT_DATA_TYPE
+    df[column] = df[column].apply(lambda x: x.upper().split('(')[0])
+    # 'Nullable': default when missing
+    column = 'Nullable'
+    nan_check = df[column].isna()
+    df.loc[nan_check, column] = DEFAULT_NULLABLE
+    # 'Faker Type': when missing, derive it from the cleaned 'Data Type'
+    column = 'Faker Type'
+    nan_check = df[column].isna()
+    faker_types: Series = df.loc[nan_check, 'Data Type'].apply(lambda x: data_type_mapping[x])
+    df.loc[nan_check, column] = faker_types
+    # 'Faker kwargs': optional column holding a JSON object string per row
+    column = 'Faker kwargs'
+    if column not in df.columns:
+        df[column] = DEFAULT_FAKER_KWARGS
+    nan_check = df[column].isna()
+    df.loc[nan_check, column] = DEFAULT_FAKER_KWARGS
+    return df
+
+
+def get_fake_schema(CONFIG: dict) -> dict[str, str]:
+    """Load the schema CSV named by CONFIG['input'] and return the faker schema.
+
+    The result maps each 'Source Fields' value to ``(faker type, kwargs dict)``
+    (so the ``dict[str, str]`` annotation understates the value type).
+    Side effect: the cleaned schema is written back over the input CSV.
+    """
+    # Controls
+    RAW_SCHEMA_PATH: Path = DIR_PATH / 'input' / CONFIG['input']
+    NULL_NULLABLES: bool = CONFIG['null_nullables']
+    # Load and clean the raw schema, then persist the cleaned version
+    df: DataFrame = clean_input(read_csv(RAW_SCHEMA_PATH))
+    df.to_csv(RAW_SCHEMA_PATH, index=False)
+    # Null check
+    nullable_key: str = 'Nullable'
+    if NULL_NULLABLES and nullable_key in df:
+        # Flag nullable rows ('Y' -> True)
+        df[nullable_key] = df[nullable_key] == 'Y'
+        # Nullable fields use 'pyobject'; .loc avoids a lost chained assignment
+        df.loc[df[nullable_key], 'Faker Type'] = 'pyobject'
+
+    # Build faker-schema dict: field name -> (faker type, kwargs)
+    faker_schema: dict = {}
+    for _, row in df.iterrows():
+        # 'Faker kwargs' holds a JSON object string, e.g. "{}"
+        faker_schema[row['Source Fields']] = (row['Faker Type'], loads(row['Faker kwargs']))
+
+    return faker_schema
+
+
+def create_fake_data(config_path: Path = CONFIG_PATH) -> None:
+    """Generate CONFIG['number_of_rows'] fake records and save them as a CSV.
+
+    Reads the JSON config at ``config_path`` and writes to output/<CONFIG['output']>.
+    """
+    # Config
+    with open(config_path, 'r') as f:
+        CONFIG: dict = load(f)
+    DATA_OUTPUT_PATH: Path = DIR_PATH / 'output' / CONFIG['output']
+    NUMBER_OF_ROWS: int = CONFIG['number_of_rows']
+    # Get faker-schema schema
+    schema: dict[str, str] = get_fake_schema(CONFIG)
+    # Produce fake data
+    faker: FakerSchema = FakerSchema()
+    data = faker.generate_fake(schema, iterations=NUMBER_OF_ROWS)
+    if isinstance(data, dict):
+        # generate_fake returns a bare dict (not a list) for a single row
+        data = [data]
+    # One single-row frame per record, concatenated into the output frame
+    frames: list[DataFrame] = [
+        DataFrame([list(_data.values())], columns=list(_data.keys()))
+        for _data in data
+    ]
+    # concat() raises on an empty list, so zero rows yields a header-only frame
+    df: DataFrame = concat(frames) if frames else DataFrame(columns=list(schema))
+    # Format output
+    df.reset_index(drop=True, inplace=True)
+    # Save
+    df.to_csv(DATA_OUTPUT_PATH, index=False)
+
+
+def main() -> None:
+    """Script entry point: generate the fake data CSV."""
+    create_fake_data()
+
+
+if __name__ == "__main__":
+    main()

From 5634b446aaff00a47e5330fc5e89be63f84eac31 Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:21:56 +0100
Subject: [PATCH 09/12] Script to load schema and data, dynamically build a
 SQLAlchemy table class and load the data into a SQLite database.

---
 scripts/create_sql_table.py | 108 ++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 scripts/create_sql_table.py

diff --git a/scripts/create_sql_table.py b/scripts/create_sql_table.py
new file mode 100644
index 0000000..87632e9
--- /dev/null
+++ b/scripts/create_sql_table.py
@@ -0,0 +1,108 @@
+# Imports
+from json import load
+from pandas import DataFrame, read_csv, Series
+from sqlalchemy import create_engine, sql
+from sqlalchemy import Column, MetaData
+from sqlalchemy.orm import declarative_base, sessionmaker
+
+from backend.constants import CONFIG_PATH, DIR_PATH, SQLTYPE_MAPPING
+
+# Variables
+# Config is read at import time; this module reads its 'table_name', 'input' and 'output' keys
+with open(CONFIG_PATH, 'r') as f:
+    CONFIG: dict[str] = load(f)
+TABLE_NAME: str = CONFIG['table_name']
+SQL_LITE_ENGINE_ADDRESS: str = f'sqlite:///data-store-15-9-23.db'
+
+
+# Functions
+def build_column(datatype: str, nullable: str, primary_key: bool = False, autoincrement: bool = False) -> Column:
+    """Return an unnamed SQLAlchemy Column for one schema row.
+
+    ``datatype`` must be a key of SQLTYPE_MAPPING (KeyError otherwise);
+    ``nullable`` is the schema flag, where only 'Y' means nullable.
+    """
+    is_nullable: bool = nullable == 'Y'
+    return Column(
+        SQLTYPE_MAPPING[datatype],
+        nullable=is_nullable,
+        primary_key=primary_key,
+        autoincrement=autoincrement,
+    )
+
+
+def build_table_inputs() -> dict:
+    """Return {column name: Column} built from the schema CSV named in CONFIG['input'].
+
+    The first schema row becomes the table's primary key.
+    """
+    # Load metadata
+    df_schema: DataFrame = read_csv(DIR_PATH / 'input' / CONFIG['input'])
+    # Construct table input; `index == 0` relies on read_csv's default 0-based index
+    class_inputs: dict = {}
+    index: int
+    row: Series
+    for index, row in df_schema.iterrows():
+        class_inputs[row['Source Fields']] = build_column(
+            row['Data Type'],
+            row['Nullable'],
+            primary_key=index == 0,
+        )
+    return class_inputs
+
+
+def create_sql_table() -> None:
+    """Create the SQLite table for the schema, load the fake-data CSV into it and print it."""
+    engine = create_engine(SQL_LITE_ENGINE_ADDRESS)
+    Session = sessionmaker(bind=engine)
+    session = Session()
+    Base = declarative_base()
+
+    # Build a declarative model class named TABLE_NAME from the schema columns
+    class_inputs: dict = build_table_inputs()
+    class_inputs['__tablename__'] = TABLE_NAME
+    Test = type(TABLE_NAME, (Base,), class_inputs)
+    # Create the table if it doesn't exist
+    Base.metadata.create_all(engine)
+    # NOTE(review): to_sql(if_exists='replace') below drops this table and
+    # recreates it from the CSV, so the column definitions created here
+    # presumably do not persist in the database - confirm whether
+    # if_exists='append' was intended.
+
+    # Query through the ORM model (result unused), then reflect the current tables
+    session.query(Test).all()
+    metadata = MetaData()  # bind=self.engine)
+    metadata.reflect(bind=engine)
+
+    # NOTE(review): this connection is never closed - consider `with engine.connect()`
+    connection = engine.connect()
+
+    # Add data to table
+    df_data: DataFrame = read_csv(DIR_PATH / 'output' / CONFIG['output'])
+    df_data.to_sql(TABLE_NAME, con=connection, if_exists='replace')
+
+    # Print queries; `columns` comes from the reflection taken before to_sql ran
+    columns: list[str] = [x.name for x in metadata.tables[TABLE_NAME].columns.values()]
+    analytics = connection.execute(
+        sql.text(
+            f"SELECT * FROM {TABLE_NAME}"
+        )
+    ).fetchall()  # Returns a list of rows without column names
+
+    print(columns)
+    print(analytics)
+
+    # Commit the changes and close the session
+    session.commit()
+    session.close()
+
+    pass
+
+
+def main() -> None:
+    """Script entry point: build the SQLite table and load the fake data."""
+    create_sql_table()
+
+
+if __name__ == "__main__":
+    main()

From 68ba75bd4e8f5aecbc4e658958af06bacceec527 Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:22:34 +0100
Subject: [PATCH 10/12] Updated to run create_fake_data and create_sql_table

---
 scripts/main.py | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/scripts/main.py b/scripts/main.py
index 5447643..a7df3f9 100644
--- a/scripts/main.py
+++ b/scripts/main.py
@@ -1,22 +1,13 @@
 # Imports
-from pathlib import Path
-
-from backend import TimeIt, get_logger
-
+from create_fake_data import create_fake_data
+from create_sql_table import create_sql_table
 # Variables
-logger = get_logger(Path(__file__).name)
 
 
-# Functions and classes
-@TimeIt
+# Functions
 def main() -> None:
-    logger.info("Started main!")
-    logger.debug("This is a debug message")
-    logger.info("This is an info message")
-    logger.warning("This is a warning message")
-    logger.error("This is an error message")
-    logger.critical("This is a critical message")
-    logger.info("Completed main!")
+    create_fake_data()
+    create_sql_table()
     pass
 
 

From c1bae08edaa84f8f17b987437a7524e489ab971c Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:24:30 +0100
Subject: [PATCH 11/12] init file for scripts folder as they are currently
 importing from each other.

---
 scripts/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 scripts/__init__.py

diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..e69de29

From b8712176328c4c1bff502c8a28ea34e2db1b0e00 Mon Sep 17 00:00:00 2001
From: Daljeet Gahle <Daljeet.Gahle@ukhsa.gov.uk>
Date: Tue, 17 Oct 2023 10:26:12 +0100
Subject: [PATCH 12/12] Added tmp folder to gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 2f9a36e..217e0ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ config.json
 # Folders
 input
 output
+tmp
 
 # Pycharm
 .idea