Initial commit

pjpena19 · Nov 22, 2024 · 8387fb6 · 8387fb6
commit 8387fb6
Show file tree

Hide file tree

Showing 16 changed files with 341 additions and 0 deletions.
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -0,0 +1,15 @@
+FROM mcr.microsoft.com/devcontainers/python:0-3.10
+
+# Disable Python output buffering so logs show up immediately in the terminal.
+ENV PYTHONUNBUFFERED=1
+
+# [Optional] If your requirements rarely change, uncomment this section to add them to the image.
+# COPY requirements.txt /tmp/pip-tmp/
+# RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
+#    && rm -rf /tmp/pip-tmp
+
+# [Optional] Uncomment this section to install additional OS packages.
+# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
+#     && apt-get -y install --no-install-recommends <your-package-list-here>
+
+# Install the PostgreSQL client tools, then remove the apt package lists in the
+# same layer so they are not stored in the image.
+RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
+    && apt-get -y install --no-install-recommends postgresql-client \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -0,0 +1,25 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/postgres
+{
+	"name": "Python 3 & PostgreSQL",
+	"dockerComposeFile": "docker-compose.yml",
+	"service": "app",
+	"workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}",
+
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	// "features": {},
+
+	// Use 'forwardPorts' to make a list of ports inside the container available locally.
+	// This can be used to network with other containers or the host.
+	"forwardPorts": [5432],
+
+	// Seed a local .env from .env.example when the container is created.
+	// 'cp -n' does not overwrite an existing .env.
+	"onCreateCommand": "cp -n .env.example .env",
+	// Use 'postCreateCommand' to run commands after the container is created.
+	// "postCreateCommand": "pip install --user -r requirements.txt",
+
+	// Configure tool-specific properties.
+	// "customizations": {},
+
+	// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+	// "remoteUser": "root"
+}
diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
@@ -0,0 +1,35 @@
+version: '3.8'
+
+services:
+  # Development container; built from the Dockerfile in this folder.
+  app:
+    build:
+      context: ..
+      dockerfile: .devcontainer/Dockerfile
+
+    volumes:
+      - ../..:/workspaces:cached
+
+    # Overrides default command so things don't shut down after the process ends.
+    command: sleep infinity
+
+    # Runs app on the same network as the database container, allows "forwardPorts" in devcontainer.json function.
+    network_mode: service:db
+
+    # Use "forwardPorts" in **devcontainer.json** to forward an app port locally.
+    # (Adding the "ports" property to this file will not forward from a Codespace.)
+
+  db:
+    # Pinned to a major version instead of "latest": a new PostgreSQL major
+    # cannot reuse a data directory initialised by an older one, so a floating
+    # tag can leave the persisted "postgres-data" volume unusable after a pull.
+    image: postgres:17
+    restart: unless-stopped
+    volumes:
+      - postgres-data:/var/lib/postgresql/data
+    # Development-only credentials; do not reuse outside the dev container.
+    environment:
+      POSTGRES_USER: gitpod
+      POSTGRES_DB: sample-db
+      POSTGRES_PASSWORD: postgres
+
+    # Add "forwardPorts": ["5432"] to **devcontainer.json** to forward PostgreSQL locally.
+    # (Adding the "ports" property to this file will not forward from a Codespace.)
+
+volumes:
+  postgres-data:
diff --git a/.env.example b/.env.example
@@ -0,0 +1 @@
+DATABASE_URL=postgresql://gitpod@localhost:5432/example
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__
+.env
+.learn
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "workbench.startupEditor": "readme",
+    "workbench.editorAssociations": {
+        "*.md": "vscode.markdown.preview.editor"
+    }
+}
diff --git a/README.es.md b/README.es.md
@@ -0,0 +1,88 @@
+# Plantilla de Proyecto de Ciencia de Datos
+
+Esta plantilla está diseñada para impulsar proyectos de ciencia de datos proporcionando una configuración básica para conexiones de base de datos, procesamiento de datos, y desarrollo de modelos de aprendizaje automático. Incluye una organización estructurada de carpetas para tus conjuntos de datos y un conjunto de paquetes de Python predefinidos necesarios para la mayoría de las tareas de ciencia de datos.
+
+## Estructura
+
+El proyecto está organizado de la siguiente manera:
+
+- `app.py` - El script principal de Python que ejecutas para tu proyecto.
+- `explore.ipynb` - Un notebook para que puedas hacer tus exploraciones; idealmente, el código de este notebook se migra hacia app.py para subir a producción.
+- `utils.py` - Este archivo contiene código de utilidad para operaciones como conexiones de base de datos.
+- `requirements.txt` - Este archivo contiene la lista de paquetes de Python necesarios.
+- `models/` - Este directorio debería contener tus clases de modelos SQLAlchemy.
+- `data/` - Este directorio contiene los siguientes subdirectorios:
+  - `interim/` - Para datos intermedios que han sido transformados.
+  - `processed/` - Para los datos finales a utilizar para el modelado.
+  - `raw/` - Para datos brutos sin ningún procesamiento.
+
+## Configuración
+
+**Prerrequisitos**
+
+Asegúrate de tener Python 3.11+ instalado en tu máquina. También necesitarás pip para instalar los paquetes de Python.
+
+**Instalación**
+
+Clona el repositorio del proyecto en tu máquina local.
+
+Navega hasta el directorio del proyecto e instala los paquetes de Python requeridos:
+
+```bash
+pip install -r requirements.txt
+```
+
+**Crear una base de datos (si es necesario)**
+
+Crea una nueva base de datos dentro del motor Postgres personalizando y ejecutando el siguiente comando: `$ createdb -h localhost -U <username> <db_name>`
+Conéctate al motor Postgres para usar tu base de datos, manipular tablas y datos: `$ psql -h localhost -U <username> <db_name>`
+NOTA: Recuerda revisar la información del archivo ./.env para obtener el nombre de usuario y db_name.
+
+¡Una vez que estés dentro de PSQL podrás crear tablas, hacer consultas, insertar, actualizar o eliminar datos y mucho más!
+
+**Variables de entorno**
+
+Crea un archivo .env en el directorio raíz del proyecto para almacenar tus variables de entorno, como tu cadena de conexión a la base de datos:
+
+```makefile
+DATABASE_URL="your_database_connection_url_here"
+```
+
+## Ejecutando la Aplicación
+
+Para ejecutar la aplicación, ejecuta el script app.py desde la raíz del directorio del proyecto:
+
+```bash
+python src/app.py
+```
+
+## Añadiendo Modelos
+
+Para añadir clases de modelos SQLAlchemy, crea nuevos archivos de script de Python dentro del directorio models/. Estas clases deben ser definidas de acuerdo a tu esquema de base de datos.
+
+Definición del modelo de ejemplo (`models/example_model.py`):
+
+```py
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Column, Integer, String
+
+Base = declarative_base()
+
+class ExampleModel(Base):
+    __tablename__ = 'example_table'
+    id = Column(Integer, primary_key=True)
+    name = Column(String)
+
+```
+
+## Trabajando con Datos
+
+Puedes colocar tus conjuntos de datos brutos en el directorio data/raw, conjuntos de datos intermedios en data/interim, y los conjuntos de datos procesados listos para el análisis en data/processed.
+
+Para procesar datos, puedes modificar el script app.py para incluir tus pasos de procesamiento de datos, utilizando pandas para la manipulación y análisis de datos.
+
+## Contribuyentes
+
+Esta plantilla fue construida como parte del [Data Science and Machine Learning Bootcamp](https://4geeksacademy.com/us/coding-bootcamps/datascience-machine-learning) de 4Geeks Academy por [Alejandro Sanchez](https://twitter.com/alesanchezr) y muchos otros contribuyentes. Descubre más sobre [los programas BootCamp de 4Geeks Academy](https://4geeksacademy.com/us/programs) aquí.
+
+Otras plantillas y recursos como este se pueden encontrar en la página de GitHub de la escuela.
diff --git a/README.md b/README.md
@@ -0,0 +1,89 @@
+# Data Science Project Boilerplate
+
+This boilerplate is designed to kickstart data science projects by providing a basic setup for database connections, data processing, and machine learning model development. It includes a structured folder organization for your datasets and a set of pre-defined Python packages necessary for most data science tasks.
+
+## Structure
+
+The project is organized as follows:
+
+- `app.py` - The main Python script that you run for your project.
+- `explore.ipynb` - A notebook to explore data, play around, visualize, clean, etc. Ideally the notebook code should be migrated to app.py when moving to production.
+- `utils.py` - This file contains utility code for operations like database connections.
+- `requirements.txt` - This file contains the list of necessary python packages.
+- `models/` - This directory should contain your SQLAlchemy model classes.
+- `data/` - This directory contains the following subdirectories:
+  - `interim/` - For intermediate data that has been transformed.
+  - `processed/` - For the final data to be used for modeling.
+  - `raw/` - For raw data without any processing.
+
+
+## Setup
+
+**Prerequisites**
+
+Make sure you have Python 3.11+ installed on your machine. You will also need pip for installing the Python packages.
+
+**Installation**
+
+Clone the project repository to your local machine.
+
+Navigate to the project directory and install the required Python packages:
+
+```bash
+pip install -r requirements.txt
+```
+
+**Create a database (if needed)**
+
+Create a new database within the Postgres engine by customizing and executing the following command: `$ createdb -h localhost -U <username> <db_name>`
+Connect to the Postgres engine to use your database, manipulate tables and data: `$ psql -h localhost -U <username> <db_name>`
+NOTE: Remember to check the ./.env file information to get the username and db_name.
+
+Once you are inside PSQL you will be able to create tables, make queries, insert, update or delete data and much more!
+
+**Environment Variables**
+
+Create a .env file in the project root directory to store your environment variables, such as your database connection string:
+
+```makefile
+DATABASE_URL="your_database_connection_url_here"
+```
+
+## Running the Application
+
+To run the application, execute the app.py script from the root of the project directory:
+
+```bash
+python src/app.py
+```
+
+## Adding Models
+
+To add SQLAlchemy model classes, create new Python script files inside the models/ directory. These classes should be defined according to your database schema.
+
+Example model definition (`models/example_model.py`):
+
+```py
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Column, Integer, String
+
+Base = declarative_base()
+
+class ExampleModel(Base):
+    __tablename__ = 'example_table'
+    id = Column(Integer, primary_key=True)
+    name = Column(String)
+
+```
+
+## Working with Data
+
+You can place your raw datasets in the data/raw directory, intermediate datasets in data/interim, and the processed datasets ready for analysis in data/processed.
+
+To process data, you can modify the app.py script to include your data processing steps, utilizing pandas for data manipulation and analysis.
+
+## Contributors
+
+This template was built as part of the 4Geeks Academy [Data Science and Machine Learning Bootcamp](https://4geeksacademy.com/us/coding-bootcamps/datascience-machine-learning) by [Alejandro Sanchez](https://twitter.com/alesanchezr) and many other contributors. Find out more about [4Geeks Academy's BootCamp programs](https://4geeksacademy.com/us/programs) here.
+
+Other templates and resources like this can be found on the school GitHub page.
diff --git a/data/interim/.gitkeep b/data/interim/.gitkeep
diff --git a/data/processed/.gitkeep b/data/processed/.gitkeep
diff --git a/data/raw/.gitkeep b/data/raw/.gitkeep
diff --git a/models/.gitkeep b/models/.gitkeep
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,15 @@
+ipyleaflet>=0.14.0
+ipywidgets>=7.7.1
+matplotlib>=3.7.0
+numpy>=1.24.2
+opencv-python>=4.1.2
+pandas>=1.5.3
+psycopg2-binary>=2.9.3
+pymysql>=1.0.2
+python-dotenv>=0.20.0
+requests>=2.27.1
+scikit-learn
+seaborn>=0.12.2
+sqlalchemy>=1.4.37
+sympy>=1.10.1
+xgboost
diff --git a/src/app.py b/src/app.py
@@ -0,0 +1,4 @@
+# Entry point of the project: obtains a database engine and leaves room for
+# the project's own code below.
+from utils import db_connect
+# db_connect() builds a SQLAlchemy engine from the DATABASE_URL environment
+# variable and opens a connection to the database while doing so.
+engine = db_connect()
+
+# your code here
diff --git a/src/explore.ipynb b/src/explore.ipynb
@@ -0,0 +1,47 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "# Explore here"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "# Your code here"
+            ]
+        }
+    ],
+    "metadata": {
+        "kernelspec": {
+            "display_name": "Python 3.8.13 64-bit ('3.8.13')",
+            "language": "python",
+            "name": "python3"
+        },
+        "language_info": {
+            "codemirror_mode": {
+                "name": "ipython",
+                "version": 3
+            },
+            "file_extension": ".py",
+            "mimetype": "text/x-python",
+            "name": "python",
+            "nbconvert_exporter": "python",
+            "pygments_lexer": "ipython3",
+            "version": "3.8.13"
+        },
+        "orig_nbformat": 4,
+        "vscode": {
+            "interpreter": {
+                "hash": "110cc1dee26208153f2972f08a2ad52b6a56238dc66d48e87fb757ef2996db56"
+            }
+        }
+    },
+    "nbformat": 4,
+    "nbformat_minor": 2
+}
diff --git a/src/utils.py b/src/utils.py
@@ -0,0 +1,13 @@
+from dotenv import load_dotenv
+from sqlalchemy import create_engine
+import pandas as pd
+
+# load the .env file variables
+load_dotenv()
+
+
+def db_connect():
+    import os
+    engine = create_engine(os.getenv('DATABASE_URL'))
+    engine.connect()
+    return engine
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		DATABASE_URL=postgresql://gitpod@localhost:5432/example