diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 0000000..e207cb4
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,30 @@
+name: pre-commit
+
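+# Run the hooks on every pull request and on every push to main.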
+on:
+ pull_request:
+ branches: ['*']
+ push:
+ branches: [main]
+
+jobs:
+ pre-commit:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+
+ - uses: actions/setup-python@v3
+ with:
+ python-version: '3.11'
+
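+      # The project's runtime and test dependencies are installed so the local
+      # pytest-coverage hook in .pre-commit-config.yaml can run the test suite.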
+      - name: Install project dependencies
+        run: |
+ python -m pip install --upgrade pip
+ pip install setuptools_scm wheel
+ pip install -r requirements.txt
+ pip install -r test-requirements.txt
+
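+      # Runs every hook defined in .pre-commit-config.yaml (all files by default).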
+ - uses: pre-commit/action@v3.0.0
diff --git a/.github/workflows/publish_package.yml b/.github/workflows/publish_package.yml
new file mode 100644
index 0000000..8670eb5
--- /dev/null
+++ b/.github/workflows/publish_package.yml
@@ -0,0 +1,32 @@
+name: Upload Release to PyPI
+
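+# Build and upload the package to PyPI whenever a GitHub release is published.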
+on:
+ release:
+ types: [published]
+
+jobs:
+ publish:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python
+ uses: actions/setup-python@v3
+ with:
+ python-version: '3.11'
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+          python -m pip install --upgrade setuptools build wheel twine
+ - name: Build package
+ run: |
+ python -m build
+ twine check dist/*
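+      # Uploads everything under dist/ to PyPI, authenticating with an API token.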
+ - name: Publish package
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ user: __token__
+ password: ${{ secrets.PG_BULK_LOADER_PYPI }}
+ verify_metadata: false
+ verbose: true
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..452c0b3
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,26 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v3.2.0
+ hooks:
+ - id: trailing-whitespace
+ - id: end-of-file-fixer
+ - id: check-yaml
+ - id: check-added-large-files
+
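+# autoflake strips unused imports in place before they reach a commit.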
+- repo: https://github.com/PyCQA/autoflake
+ rev: v2.2.1
+ hooks:
+ - id: autoflake
+ args: [--remove-all-unused-imports, --in-place]
+
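+# Local hook: runs the test suite and fails the commit if coverage of
+# src.pg_bulk_loader falls below 95%.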
+- repo: local
+ hooks:
+ - id: code-coverage-checker
+ name: pytest-coverage-checker
+ entry: pytest --cov=src.pg_bulk_loader --cov-fail-under=95
+ language: system
+ types: [python]
+ pass_filenames: false
diff --git a/README.md b/README.md
index 600aecf..8b7f269 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
Overview
 **pg-bulk-loader** is a utility package designed to facilitate faster bulk insertion of a DataFrame into a PostgreSQL database.
-Currently, it supports load from pandas DataFrame only.
+Currently, it supports load from pandas DataFrame only.
Purpose
@@ -17,12 +17,12 @@ This utility leverages the power of PostgreSQL in combination with Python to eff
 Package's Efficiency
-**Machine:**
-- Resource config - 5 core, 8GB
-- Azure hosted PostgreSQL Server
+**Machine:**
+- Resource config - 5 core, 8GB
+- Azure hosted PostgreSQL Server
- Azure hosted Python service (jupyter notebook)
-**Table info:**
+**Table info:**
- 12 columns (3 texts, 2 date, 7 double)
- Primary key: 3 columns (2 text and 1 date)
- Indexes: 2 b-tree. (1 on single column and another on three columns)
@@ -30,7 +30,7 @@ This utility leverages the power of PostgreSQL in combination with Python to eff
**Runtime:**
- Data Size: 20M
- without PK and Indexes: ~55s
- - with PK and indexes: ~150s (~85s to insert data with PK enabled and ~65 seconds to create indexes)
+ - with PK and indexes: ~150s (~85s to insert data with PK enabled and ~65 seconds to create indexes)
**Running with 1M records without having PK and Indexes with different approaches:**
@@ -58,7 +58,7 @@ The utility provides the following useful functions and classes:
**Note:** Provide input either in the form of DataFrame or DataFrame generator
-batch_insert_to_postgres_with_multi_process() function
+batch_insert_to_postgres_with_multi_process() function
- `pg_conn_details`: Instance of the PgConnectionDetail class containing PostgreSQL server connection details.
- `table_name`: Name of the table for bulk insertion.
@@ -101,8 +101,8 @@ from pg_bulk_loader import PgConnectionDetail, batch_insert_to_postgres
async def run():
# Read data. Let's suppose below DataFrame has 20M records
input_data_df = pd.DataFrame()
-
- # Create Postgres Connection Details object. This will help in creating and managing the database connections
+
+ # Create Postgres Connection Details object. This will help in creating and managing the database connections
pg_conn_details = PgConnectionDetail(
user="",
password="",
@@ -111,7 +111,7 @@ async def run():
port="",
schema=""
)
-
+
# Data will be inserted and committed in the batch of 2,50,000
await batch_insert_to_postgres(
pg_conn_details=pg_conn_details,
@@ -140,8 +140,8 @@ from pg_bulk_loader import PgConnectionDetail, batch_insert_to_postgres
async def run():
# Read data. Let's suppose below DataFrame has 20M records
input_data_df_generator = pd.read_csv("file.csv", chunksize=1000000)
-
- # Create Postgres Connection Details object. This will help in creating and managing the database connections
+
+ # Create Postgres Connection Details object. This will help in creating and managing the database connections
pg_conn_details = PgConnectionDetail(
user="",
password="",
@@ -150,7 +150,7 @@ async def run():
port="",
schema=""
)
-
+
# Data will be inserted and committed in the batch of 2,50,000
await batch_insert_to_postgres(
pg_conn_details=pg_conn_details,
@@ -181,7 +181,7 @@ from pg_bulk_loader import PgConnectionDetail, batch_insert_to_postgres_with_mul
async def run():
- # Create Postgres Connection Details object. This will help in creating and managing the database connections
+ # Create Postgres Connection Details object. This will help in creating and managing the database connections
pg_conn_details = PgConnectionDetail(
user="",
password="",
@@ -190,9 +190,9 @@ async def run():
port="",
schema=""
)
-
+
df_generator = pd.read_csv("20M-file.csv", chunksize=1000000)
-
+
# Data will be inserted and committed in the batch of 2,50,000
await batch_insert_to_postgres_with_multi_process(
pg_conn_details=pg_conn_details,
@@ -214,6 +214,5 @@ if __name__ == '__main__':
Development:
- Run this command to install the required development dependencies `pip install -r dev-requirements.txt`
+- Run `pre-commit install` to set up a git hook that runs basic sanity checks before every commit.
 - Run either of the below commands to run the unit tests: `pytest` or `coverage run --source=src.pg_bulk_loader --module pytest --verbose && coverage report --show-missing`
-
-
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 6972ed8..c588f84 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,5 +1,6 @@
-r requirements.txt
-r test-requirements.txt
+pre-commit
build
-twine
\ No newline at end of file
+twine