diff --git a/.github/workflows/artifact.yml b/.github/workflows/artifact.yml
index 61dbab28e..2aa634a7e 100644
--- a/.github/workflows/artifact.yml
+++ b/.github/workflows/artifact.yml
@@ -10,6 +10,7 @@ env:
   REGION: europe-west1
   GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev
   REPOSITORY: gentropy-app
+  PYTHON_VERSION_DEFAULT: "3.10.8"
 
 jobs:
   build-push-artifact:
@@ -67,3 +68,21 @@ jobs:
           tags: "${{ env.GAR_LOCATION }}/${{ env.REPOSITORY }}/custom_ensembl_vep:${{ github.ref_name }}"
           context: .
           file: "src/vep/Dockerfile"
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION_DEFAULT }}
+      - name: Install and configure Poetry
+        uses: snok/install-poetry@v1
+        with:
+          virtualenvs-create: true
+          virtualenvs-in-project: true
+          installer-parallel: true
+
+      - name: Build and push spark cluster dependencies
+        # Pass the ref explicitly: the Makefile default (`git rev-parse --abbrev-ref HEAD`) prints "HEAD" on detached checkouts.
+        env:
+          REF: ${{ github.ref_name }}
+        run: |
+          make build REF="${REF}"
diff --git a/Makefile b/Makefile
index b83075558..1d79d35fd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,10 @@
 PROJECT_ID ?= open-targets-genetics-dev
 REGION ?= europe-west1
-APP_NAME ?= $$(cat pyproject.toml| grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g')
-VERSION_NO ?= $$(poetry version --short)
-CLEAN_VERSION_NO := $(shell echo "$(VERSION_NO)" | tr -cd '[:alnum:]')
-BUCKET_NAME=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/
-BUCKET_COMPOSER_DAGS=gs://europe-west1-ot-workflows-fe147745-bucket/dags/
+APP_NAME ?= $$(cat pyproject.toml | grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g')
+REF ?= $$(git rev-parse --abbrev-ref HEAD)
+PACKAGE_VERSION ?= $$(poetry version --short)
+CLEAN_PACKAGE_VERSION := $(shell echo "$(PACKAGE_VERSION)" | tr -cd '[:alnum:]')
+BUCKET_NAME=gs://genetics_etl_python_playground/initialisation/${APP_NAME}/${REF}
 
 .PHONY: $(shell sed -n -e '/^$$/ { n ; /^[^ .\#][^ ]*:/ { s/:.*$$// ; p ; } ; }' $(MAKEFILE_LIST))
 
@@ -38,35 +38,33 @@ build-documentation: ## Create local server with documentation
 create-dev-cluster: build ## Spin up a simple dataproc cluster with all dependencies for development purposes
 	@echo "Creating Dataproc Dev Cluster"
 	@gcloud config set project ${PROJECT_ID}
-	@gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_VERSION_NO}-$(USER)" \
+	@gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER)" \
 		--image-version 2.1 \
 		--region ${REGION} \
 		--master-machine-type n1-standard-16 \
-		--initialization-actions=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/install_dependencies_on_cluster.sh \
-		--metadata="PACKAGE=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/gentropy-${VERSION_NO}-py3-none-any.whl,CONFIGTAR=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/config.tar.gz" \
+		--initialization-actions=$(BUCKET_NAME)/install_dependencies_on_cluster.sh \
+		--metadata="PACKAGE=$(BUCKET_NAME)/${APP_NAME}-${PACKAGE_VERSION}-py3-none-any.whl" \
 		--secondary-worker-type spot \
 		--worker-machine-type n1-standard-4 \
 		--worker-boot-disk-size 500 \
 		--autoscaling-policy="projects/${PROJECT_ID}/regions/${REGION}/autoscalingPolicies/otg-etl" \
 		--optional-components=JUPYTER \
 		--enable-component-gateway \
-		--max-idle=30m
+		--max-idle=60m
 
-make update-dev-cluster: build ## Reinstalls the package on the dev-cluster
+update-dev-cluster: build ## Reinstalls the package on the dev-cluster
 	@echo "Updating Dataproc Dev Cluster"
 	@gcloud config set project ${PROJECT_ID}
-	gcloud dataproc jobs submit pig --cluster="ot-genetics-dev-${CLEAN_VERSION_NO}" \
+	gcloud dataproc jobs submit pig --cluster="ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER)" \
 		--region ${REGION} \
 		--jars=${BUCKET_NAME}/install_dependencies_on_cluster.sh \
 		-e='sh chmod 750 $${PWD}/install_dependencies_on_cluster.sh; sh $${PWD}/install_dependencies_on_cluster.sh'
 
 build: clean ## Build Python package with dependencies
 	@gcloud config set project ${PROJECT_ID}
-	@echo "Packaging Code and Dependencies for ${APP_NAME}-${VERSION_NO}"
+	@echo "Packaging Code and Dependencies for ${APP_NAME}-${PACKAGE_VERSION}"
 	@poetry build
-	@tar -czf dist/config.tar.gz config/
-	@echo "Uploading to Dataproc"
-	@gsutil cp src/gentropy/cli.py ${BUCKET_NAME}
-	@gsutil cp ./dist/${APP_NAME}-${VERSION_NO}-py3-none-any.whl ${BUCKET_NAME}
-	@gsutil cp ./dist/config.tar.gz ${BUCKET_NAME}
-	@gsutil cp ./utils/install_dependencies_on_cluster.sh ${BUCKET_NAME}
+	@echo "Uploading to ${BUCKET_NAME}"
+	@gsutil cp src/${APP_NAME}/cli.py ${BUCKET_NAME}/
+	@gsutil cp ./dist/${APP_NAME}-${PACKAGE_VERSION}-py3-none-any.whl ${BUCKET_NAME}/
+	@gsutil cp ./utils/install_dependencies_on_cluster.sh ${BUCKET_NAME}/
diff --git a/docs/development/troubleshooting.md b/docs/development/troubleshooting.md
index a30f72be0..498ee3b86 100644
--- a/docs/development/troubleshooting.md
+++ b/docs/development/troubleshooting.md
@@ -49,3 +49,19 @@ Some functions on MacOS may throw a java error:
 This can be resolved by adding the follow line to your `~/.zshrc`:
 
 `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`
+
+## Creating development dataproc cluster (OT users only)
+
+To start a Dataproc cluster in development mode, run:
+
+```bash
+make create-dev-cluster
+```
+
+The command above will prepare 3 different resources:
+
+- gentropy package
+- cli script
+- cluster setup script
+
+and, based on the branch ref (for example `dev`), will create a namespaced folder under GCS (`gs://genetics_etl_python_playground/initialisation/gentropy/dev`) with the three files described above. These files will then be used to create the cluster environment.
diff --git a/pyproject.toml b/pyproject.toml
index 1343c7b50..8e5469c6a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ google = "^3.0.0"
 omegaconf = "^2.3.0"
 typing-extensions = "^4.9.0"
 scikit-learn = "^1.3.2"
-pandas = {extras = ["gcp", "parquet"], version = "^2.2.2"}
+pandas = { extras = ["gcp", "parquet"], version = "^2.2.2" }
 skops = ">=0.9,<0.11"
 google-cloud-secret-manager = "^2.20.0"
 
diff --git a/utils/install_dependencies_on_cluster.sh b/utils/install_dependencies_on_cluster.sh
index 9f26b9f17..6b76a7d60 100644
--- a/utils/install_dependencies_on_cluster.sh
+++ b/utils/install_dependencies_on_cluster.sh
@@ -3,7 +3,6 @@
 set -exo pipefail
 
 readonly PACKAGE=$(/usr/share/google/get_metadata_value attributes/PACKAGE || true)
-readonly CONFIGTAR=$(/usr/share/google/get_metadata_value attributes/CONFIGTAR || true)
 
 function err() {
     echo "[$(date +'%Y-%m-%dT%H:%M:%S%z')]: $*" >&2
@@ -63,9 +62,6 @@ function main() {
 
     echo "Install package..."
     run_with_retry pip install --upgrade ${PACKAGENAME}
-    echo "Downloading and uncompressing config..."
-    gsutil cp ${CONFIGTAR} . || err "Failed to download CONFIGTAR"
-    tar -xvf $(basename ${CONFIGTAR}) || err "Failed to extract CONFIGTAR"
 }
 
 main