From e4c1aef7740411e243b58f221893012c40f81078 Mon Sep 17 00:00:00 2001 From: "Wei-Chun, Chang" Date: Thu, 16 May 2024 11:03:11 +0800 Subject: [PATCH 01/26] Add dbt profile to the project Signed-off-by: Wei-Chun, Chang --- profiles.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 profiles.yml diff --git a/profiles.yml b/profiles.yml new file mode 100644 index 000000000..44f94cf68 --- /dev/null +++ b/profiles.yml @@ -0,0 +1,26 @@ +opensource_observer: + outputs: + playground: + type: bigquery + method: service-account + keyfile: "{{ env_var('DBT_GOOGLE_KEYFILE') }}" + project: "{{ env_var('DBT_GOOGLE_PROJECT') }}" + dataset: "{{ env_var('DBT_GOOGLE_DATASET') }}" + job_execution_time_seconds: 300 + job_retries: 1 + location: US + threads: 32 + dev: + type: bigquery + method: service-account + keyfile: "{{ env_var('DBT_GOOGLE_KEYFILE') }}" + project: "{{ env_var('DBT_GOOGLE_PROJECT') }}" + dataset: "{{ env_var('DBT_GOOGLE_DEV_DATASET') }}" + job_execution_time_seconds: 300 + job_retries: 1 + location: US + threads: 32 + # By default we target the playground. it's less costly and also safer to write + # there while developing + target: playground + From 5ec96001ae9757c9b57939b833e644dea0025d65 Mon Sep 17 00:00:00 2001 From: "Wei-Chun, Chang" Date: Mon, 20 May 2024 12:03:28 +0800 Subject: [PATCH 02/26] Add recce staging env preparation workflow Signed-off-by: Wei-Chun, Chang --- .github/workflows/recce-staging-daily.yml | 71 +++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 .github/workflows/recce-staging-daily.yml diff --git a/.github/workflows/recce-staging-daily.yml b/.github/workflows/recce-staging-daily.yml new file mode 100644 index 000000000..b246cef91 --- /dev/null +++ b/.github/workflows/recce-staging-daily.yml @@ -0,0 +1,71 @@ +name: OSO Recce Staging CI + +on: + workflow_dispatch: + schedule: + - cron: '0 18 * * *' # run at 2 AM (UTC + 8) everyday + pull_request: + types: [closed] + branches: [main, dev] + paths: + - warehouse/dbt/** + +env: + # dbt env variables used in your dbt profiles.yml + DBT_PROFILES_DIR: ./ + DBT_GOOGLE_PROJECT: ${{ vars.DBT_GOOGLE_PROJECT }} + DBT_GOOGLE_DATASET: ${{ vars.DBT_GOOGLE_DATASET }} + DBT_GOOGLE_DEV_DATASET: ${{ vars.DBT_GOOGLE_DEV_DATASET }} + DBT_GOOGLE_KEYFILE: /tmp/google/google-service-account.json + PACKAGES_YAML: ${{ vars.PACKAGES_YAML }} + KEYFILE_CONTENTS: ${{ secrets.KEYFILE_CONTENTS }} + +jobs: + prepare-recce-staging-env: + name: Prepare Recce staging env + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12.x" + + - name: Install dependencies + run: | + pipx install poetry==1.7.1 + poetry install + source $(poetry env info --path)/bin/activate + which dbt + + - name: Add packages.yml file + run: | + echo '${{ vars.PACKAGES_YAML }}' > packages.yml + + - name: Prep Google keyfile + run: | + mkdir -p "$(dirname $DBT_GOOGLE_KEYFILE)" + echo "$KEYFILE_CONTENTS" > $DBT_GOOGLE_KEYFILE + + - name: Prepare dbt Base environment + run: | + source $(poetry env info --path)/bin/activate + git checkout ${{ github.event.pull_request.base.sha }} + dbt deps + dbt seed --target ${{ env.DBT_BASE_TARGET }} + dbt run --target ${{ env.DBT_BASE_TARGET }} + dbt docs generate --target ${{ env.DBT_BASE_TARGET }} + env: + DBT_BASE_TARGET: "playground" + + - name: Archive DBT artifacts + uses: actions/upload-artifact@v4 + with: + name: dbt-artifacts + path: | + target/manifest.json + target/catalog.json From 8f3bf78fac6472d91b5d2bcc350fffa9b39a2503 Mon Sep 17 00:00:00 2001 From: "Wei-Chun, Chang" Date: Mon, 20 May 2024 14:52:40 +0800 Subject: [PATCH 03/26] Refine workflow Signed-off-by: Wei-Chun, Chang --- .github/workflows/recce-staging-daily.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/recce-staging-daily.yml b/.github/workflows/recce-staging-daily.yml index b246cef91..11a61026e 100644 --- a/.github/workflows/recce-staging-daily.yml +++ b/.github/workflows/recce-staging-daily.yml @@ -15,7 +15,6 @@ env: DBT_PROFILES_DIR: ./ DBT_GOOGLE_PROJECT: ${{ vars.DBT_GOOGLE_PROJECT }} DBT_GOOGLE_DATASET: ${{ vars.DBT_GOOGLE_DATASET }} - DBT_GOOGLE_DEV_DATASET: ${{ vars.DBT_GOOGLE_DEV_DATASET }} DBT_GOOGLE_KEYFILE: /tmp/google/google-service-account.json PACKAGES_YAML: ${{ vars.PACKAGES_YAML }} KEYFILE_CONTENTS: ${{ secrets.KEYFILE_CONTENTS }} @@ -27,8 +26,6 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v3 - with: - fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v4 @@ -54,10 +51,8 @@ jobs: - name: Prepare dbt Base environment run: | source $(poetry env info --path)/bin/activate - git checkout ${{ github.event.pull_request.base.sha }} dbt deps - dbt seed --target ${{ env.DBT_BASE_TARGET }} - dbt run --target ${{ env.DBT_BASE_TARGET }} + dbt build --target ${{ env.DBT_BASE_TARGET }} dbt docs generate --target ${{ env.DBT_BASE_TARGET }} env: DBT_BASE_TARGET: "playground" From 4cf3ac5bb31505a1c3c60093dd19feb2825dd01b Mon Sep 17 00:00:00 2001 From: "Wei-Chun, Chang" Date: Thu, 16 May 2024 10:14:10 +0800 Subject: [PATCH 04/26] Add recce ci workflow Signed-off-by: Wei-Chun, Chang --- .github/workflows/recce-ci.yml | 112 +++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 .github/workflows/recce-ci.yml diff --git a/.github/workflows/recce-ci.yml b/.github/workflows/recce-ci.yml new file mode 100644 index 000000000..b67eba5dd --- /dev/null +++ b/.github/workflows/recce-ci.yml @@ -0,0 +1,112 @@ +name: OSO Recce CI + +on: + pull_request: + branches: [main, dev] + +env: + # dbt env variables used in your dbt profiles.yml + DBT_PROFILES_DIR: ./ + DBT_GOOGLE_PROJECT: ${{ vars.DBT_GOOGLE_PROJECT }} + DBT_GOOGLE_DATASET: ${{ vars.DBT_GOOGLE_DATASET }} + DBT_GOOGLE_KEYFILE: /tmp/google/google-service-account.json + KEYFILE_CONTENTS: ${{ secrets.KEYFILE_CONTENTS }} + +jobs: + check-pull-request: + name: Check pull request by Recce CI + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12.x" + + - name: Install dependencies + run: | + pipx install poetry==1.7.1 + poetry install + poetry run which dbt + + - name: Install Recce + run: poetry run pip install recce-nightly + + - name: Add packages.yml file + run: | + echo '${{ vars.PACKAGES_YAML }}' > packages.yml + + - name: Prep Google keyfile + run: | + mkdir -p "$(dirname $DBT_GOOGLE_KEYFILE)" + echo "$KEYFILE_CONTENTS" > $DBT_GOOGLE_KEYFILE + + - name: Prepare dbt Base environment + run: | + run_id=$(gh run list --workflow "OSO Recce Staging CI" --repo DataRecce/oso --status success --limit 1 --json databaseId --jq '.[0].databaseId') + gh run download $run_id --repo DataRecce/oso + mv dbt-artifacts target-base + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Set PR Schema + run: echo "DBT_GOOGLE_DEV_DATASET=OSO_PR_${{ github.event.pull_request.number }}" >> $GITHUB_ENV + + - name: Prepare dbt Current environment + run: | + source $(poetry env info --path)/bin/activate + dbt deps + dbt build --target ${{ env.DBT_CURRENT_TARGET}} + dbt docs generate --target ${{ env.DBT_CURRENT_TARGET}} + env: + DBT_CURRENT_TARGET: "dev" + + - name: Run Recce CI + run: poetry run recce run + + - name: Archive Recce State File + uses: actions/upload-artifact@v4 + id: recce-artifact-uploader + with: + name: recce-state-file + path: recce_state.json + + - name: Prepare Recce Summary + id: recce-summary + run: | + source $(poetry env info --path)/bin/activate + recce summary recce_state.json > recce_summary.md + cat recce_summary.md >> $GITHUB_STEP_SUMMARY + echo '${{ env.NEXT_STEP_MESSAGE }}' >> recce_summary.md + + # Handle the case when the recce summary is too long to be displayed in the GitHub PR comment + if [[ `wc -c recce_summary.md | awk '{print $1}'` -ge '65535' ]]; then + echo '# Recce Summary + The recce summary is too long to be displayed in the GitHub PR comment. + Please check the summary detail in the [Job Summary](${{github.server_url}}/${{github.repository}}/actions/runs/${{github.run_id}}) page. + ${{ env.NEXT_STEP_MESSAGE }}' > recce_summary.md + fi + + env: + ARTIFACT_URL: ${{ steps.recce-artifact-uploader.outputs.artifact-url }} + NEXT_STEP_MESSAGE: | + ## Next Steps + If you want to check more detail inforamtion about the recce result, please download the [artifact](${{ steps.recce-artifact-uploader.outputs.artifact-url }}) file and open it by [Recce](https://pypi.org/project/recce/) CLI. + + ### How to check the recce result + ```bash + # Unzip the downloaded artifact file + tar -xf recce-state-file.zip + + # Launch the recce server based on the state file + recce server --review recce_state.json + + # Open the recce server http://localhost:8000 by your browser + ``` + + - name: Comment on pull request + uses: thollander/actions-comment-pull-request@v2 + with: + filePath: recce_summary.md From 29254a026807185dcca1cacd56276d3846ebcb11 Mon Sep 17 00:00:00 2001 From: "Wei-Chun, Chang" Date: Tue, 21 May 2024 01:54:08 +0800 Subject: [PATCH 05/26] Add triggering event Signed-off-by: Wei-Chun, Chang --- .github/workflows/recce-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/recce-ci.yml b/.github/workflows/recce-ci.yml index b67eba5dd..ca041350f 100644 --- a/.github/workflows/recce-ci.yml +++ b/.github/workflows/recce-ci.yml @@ -3,6 +3,8 @@ name: OSO Recce CI on: pull_request: branches: [main, dev] + paths: + - warehouse/dbt/** env: # dbt env variables used in your dbt profiles.yml From fe1d43877c38e8d7486d8ca0d4c645fa25410a30 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 16 May 2024 17:38:48 +0800 Subject: [PATCH 06/26] [Feature] DRC-443 Setup dev-container to integrate with GitHub Codespace - Provide a Docker image for dev-container - Setup google-cloud auth inside the GitHub Codespace - Parse `recce-state` and daily recce staging from existing workflow - Execute `dbt build` and `dbt docs generate` during the launch time - Exectue `recce server` in the end Signed-off-by: Kent Huang --- .devcontainer/Dockerfile | 35 +++++++++++++++ .devcontainer/devcontainer.json | 22 ++++++++++ .devcontainer/docker-build.sh | 3 ++ .devcontainer/github_codespace_env.sh | 30 +++++++++++++ .devcontainer/setup_required_env.sh | 61 +++++++++++++++++++++++++++ .gitignore | 7 ++- .vscode/tasks.json | 40 ++++++++++++++++++ 7 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/devcontainer.json create mode 100644 .devcontainer/docker-build.sh create mode 100644 .devcontainer/github_codespace_env.sh create mode 100644 .devcontainer/setup_required_env.sh create mode 100644 .vscode/tasks.json diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 000000000..f3698c063 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,35 @@ +FROM mcr.microsoft.com/vscode/devcontainers/python:3.11 + +# Add GitHub CLI +RUN (type -p wget >/dev/null || (sudo apt update && sudo apt-get install wget -y)) \ +&& sudo mkdir -p -m 755 /etc/apt/keyrings \ +&& wget -qO- https://cli.github.com/packages/githubcli-archive-keyring.gpg | sudo tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ +&& sudo chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ +&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null + +RUN apt-get update && apt-get install -y python3-dev gh && rm -rf /var/lib/apt/lists/* + +ARG USER_UID=1000 +ARG USER_GID=$USER_UID + +RUN if [ "$USER_GID" != "1000" ] || [ "$USER_UID" != "1000" ]; then groupmod --gid $USER_GID vscode && usermod --uid $USER_UID --gid $USER_GID vscode; fi + +RUN pip3 install --upgrade pip +RUN pip3 install pipx +RUN pipx install poetry + +# Downloading gcloud package +RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz + +# Installing the package +RUN mkdir -p /usr/local/gcloud \ + && tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \ + && /usr/local/gcloud/google-cloud-sdk/install.sh + +# Adding the package path to local +ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin + +# Install osos python dependencies +COPY . /tmp/oso +RUN cd /tmp/oso && pip3 install . +RUN pip3 install recce-nightly \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..484d5f99d --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,22 @@ +{ + "name": "Recce CodeSpace", + "image": "infuseai/oso-dev-container:3.11", + "containerEnv": { + "DBT_GOOGLE_PROJECT": "infuseai-dev", + "DBT_GOOGLE_DATASET": "oso_playground", + "DBT_GOOGLE_DEV_DATASET": "oso_playground_dev", + "DBT_GOOGLE_KEYFILE": "/home/vscode/.config/gcloud/google-service-account.json", + "RECCE_CI_WORKFLOW_NAME": "OSO Recce CI", + "RECCE_DAILY_CI_WORKFLOW_NAME": "OSO Recce Staging CI" + }, + "customizations": { + "vscode": { + "settings": { + "terminal.integrated.shell.linux": "/bin/bash", + "python.terminal.activateEnvironment": true + } + }, + "extensions": [] + }, + "forwardPorts": [8000] +} \ No newline at end of file diff --git a/.devcontainer/docker-build.sh b/.devcontainer/docker-build.sh new file mode 100644 index 000000000..cb7beff81 --- /dev/null +++ b/.devcontainer/docker-build.sh @@ -0,0 +1,3 @@ +#! /bin/bash + +docker buildx build --platform linux/amd64 -t infuseai/oso-dev-container:3.11 .. -f Dockerfile \ No newline at end of file diff --git a/.devcontainer/github_codespace_env.sh b/.devcontainer/github_codespace_env.sh new file mode 100644 index 000000000..540f52feb --- /dev/null +++ b/.devcontainer/github_codespace_env.sh @@ -0,0 +1,30 @@ +#! /bin/bash + +if [ "${CODESPACES}" == "true" ]; then + # Set the default git repository if running in GitHub Codespaces + echo "Setting the default git repository to $GITHUB_REPOSITORY" + gh repo set-default $GITHUB_REPOSITORY + + current_branch=$(git branch --show-current) + # Check if the current branch is under a pull request + if gh pr view > /dev/null ; then + # Check if the Recce state file is downloaded + run_id=$(gh run list -b ${current_branch} -s success --limit 1 -w "${RECCE_CI_WORKFLOW_NAME}" --json databaseId | jq .[].databaseId) + if [ -z "$run_id" ]; then + echo "No successful Recce run found for the current branch." + else + echo "Downloading the Recce state file for the last successful run." + gh run download $run_id --dir .recce + echo "The Recce state file is downloaded to '.recce/recce_state_file/recce_state.json'." + fi + fi + + # Check daily staging artifact files + default_branch=$(gh repo view --json defaultBranchRef --jq .defaultBranchRef.name) + daily_artifact_workflow_id=$(gh run list -w "${RECCE_DAILY_CI_WORKFLOW_NAME}" --status success -b dev --limit 1 --json databaseId | jq .[].databaseId) + gh run download $daily_artifact_workflow_id --dir .recce + if [ -d ".recce/dbt-artifacts" ]; then + mv .recce/dbt-artifacts target-base + echo "The daily staging artifact files are downloaded to 'target-base'." + fi +fi \ No newline at end of file diff --git a/.devcontainer/setup_required_env.sh b/.devcontainer/setup_required_env.sh new file mode 100644 index 000000000..e09fde804 --- /dev/null +++ b/.devcontainer/setup_required_env.sh @@ -0,0 +1,61 @@ +#! /bin/bash + +GREEN="\033[0;32m" +YELLOW="\033[1;33m" +ENDCOLOR="\033[0m" + +function show_env_hint() { + echo -e "[${YELLOW}Required ENV${ENDCOLOR}] $1 is not set." + cat << EOF + +Please set the following environment variables in your GitHub Codespaces Secrets. +You can set the secret in your GitHub Codespaces Secrets by going to: + GitHub Personal Account -> Settings -> Codespaces -> Codespaces secrets + +Then add the secret with the name $1 and the value of the secret. +After adding the secret, please restart the Codespaces to apply the changes. +EOF +} + +# Check the ENV variables should be provided by the user +if [ -z "$DBT_GOOGLE_PROJECT" ]; then + show_env_hint "DBT_GOOGLE_PROJECT" + exit 1 +fi + +if [ -z "$DBT_GOOGLE_DATASET" ]; then + show_env_hint "DBT_GOOGLE_DATASET" + exit 1 +fi + +if [ -z "$DBT_GOOGLE_DEV_DATASET" ]; then + show_env_hint "DBT_GOOGLE_DEV_DATASET" + exit 1 +fi + +mkdir -p $HOME/.config/gcloud +DBT_GOOGLE_KEYFILE=$HOME/.config/gcloud/google-service-account.json + +# Setup dbt profiles.yml +if [ "$DBT_PROFILES_YML_CONTENT" != '' ]; then + echo "$DBT_PROFILES_YML_CONTENT" > $HOME/.dbt/profiles.yml + echo "dbt profiles.yml is saved to $HOME/.dbt/profiles.yml" +fi + + +# Check if the user is already logged in +if [ -z "$GOOGLE_CLOUD_SERVICE_ACCOUNT_KEY_CONTENT" ]; then + # Change to use OAuth2 to login + if [ -f "${DBT_GOOGLE_KEYFILE}" ]; then + echo "User is already logged in Google cloud" + gcloud auth list + exit 0 + else + echo -e "[${GREEN}Action${ENDCOLOR}] Please login to Google cloud to continue." + gcloud auth application-default login + fi +else + # Use the service account key to login + echo "$GOOGLE_CLOUD_SERVICE_ACCOUNT_KEY_CONTENT" > ${DBT_GOOGLE_KEYFILE} + echo "Google cloud service account key is saved to ${DBT_GOOGLE_KEYFILE}" +fi diff --git a/.gitignore b/.gitignore index e78628001..01512cbde 100644 --- a/.gitignore +++ b/.gitignore @@ -63,4 +63,9 @@ dbt_packages/ supabase/.temp/ **/supabase/.temp/ -*/**/supabase/.temp/ \ No newline at end of file +*/**/supabase/.temp/ + +# Recce & DBT +.recce/ +target/ +target-base/ \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 000000000..715dda479 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,40 @@ +{ + // See https://go.microsoft.com/fwlink/?LinkId=733558 + // for the documentation about the tasks.json format + "version": "2.0.0", + "tasks": [ + { + "label": "Check GitHub CodeSpace environment", + "type": "shell", + "command": "bash .devcontainer/github_codespace_env.sh", + }, + { + "label": "Install Project dependencies", + "type": "shell", + "command": "poetry install && poetry env use python", + }, + { + "label": "Setup required environment variables", + "type": "shell", + "command": "bash .devcontainer/setup_required_env.sh", + }, + { + "label": "Run DBT", + "type": "shell", + "command": "poetry run dbt deps && poetry run dbt build && poetry run dbt docs generate", + "dependsOn": [ + "Login Google Cloud for BigQuery", + "Check GitHub CodeSpace environment" + ], + }, + { + "label": "Launch Recce server", + "type": "shell", + "command": "if [ -f '.recce/recce_state_file/recce_state.json' ]; then recce server --review .recce/recce_state_file/recce_state.json; else recce server; fi", + "dependsOn": ["Run DBT"], + "runOptions": { + "runOn": "folderOpen" + } + } + ] +} \ No newline at end of file From 9b75cd3b666c47e346434aa63edf12a9c133b492 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Tue, 21 May 2024 16:12:01 +0800 Subject: [PATCH 07/26] [Feature] Execute recce by bash script Signed-off-by: Kent Huang --- .devcontainer/launch_recce_server.sh | 12 ++++++++++++ .gitignore | 3 ++- .vscode/tasks.json | 6 +++--- 3 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 .devcontainer/launch_recce_server.sh diff --git a/.devcontainer/launch_recce_server.sh b/.devcontainer/launch_recce_server.sh new file mode 100644 index 000000000..c85c4dbfb --- /dev/null +++ b/.devcontainer/launch_recce_server.sh @@ -0,0 +1,12 @@ +#! /bin/bash + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +if [ -f "${DIR}/../.recce/recce_state_file/recce_state.json" ]; then + echo "Launching the Recce server in review mode. The Recce state file is found." + cp ${DIR}/../.recce/recce_state_file/recce_state.json recce_state.json + recce server --review recce_state.json +else + echo "Launching the Recce server." + recce server +fi \ No newline at end of file diff --git a/.gitignore b/.gitignore index 01512cbde..ab5471596 100644 --- a/.gitignore +++ b/.gitignore @@ -68,4 +68,5 @@ supabase/.temp/ # Recce & DBT .recce/ target/ -target-base/ \ No newline at end of file +target-base/ +recce_state.json \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json index 715dda479..d9494ac15 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -21,16 +21,16 @@ { "label": "Run DBT", "type": "shell", - "command": "poetry run dbt deps && poetry run dbt build && poetry run dbt docs generate", + "command": "dbt deps && dbt build && dbt docs generate", "dependsOn": [ - "Login Google Cloud for BigQuery", + "Setup required environment variables", "Check GitHub CodeSpace environment" ], }, { "label": "Launch Recce server", "type": "shell", - "command": "if [ -f '.recce/recce_state_file/recce_state.json' ]; then recce server --review .recce/recce_state_file/recce_state.json; else recce server; fi", + "command": "bash .devcontainer/launch_recce_server.sh", "dependsOn": ["Run DBT"], "runOptions": { "runOn": "folderOpen" From f9768ce703618862cb467bb475d0f5dbbf10f771 Mon Sep 17 00:00:00 2001 From: "Wei-Chun, Chang" Date: Wed, 22 May 2024 14:47:17 +0800 Subject: [PATCH 08/26] Add condition to staging env preparation workflow Signed-off-by: Wei-Chun, Chang --- .github/workflows/recce-staging-daily.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/recce-staging-daily.yml b/.github/workflows/recce-staging-daily.yml index 11a61026e..d6de11f7d 100644 --- a/.github/workflows/recce-staging-daily.yml +++ b/.github/workflows/recce-staging-daily.yml @@ -22,6 +22,7 @@ env: jobs: prepare-recce-staging-env: name: Prepare Recce staging env + if: github.event.pull_request.merged == true || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest steps: - name: Checkout repository From 8027ae6f93a95aad11851c833283218bb4ef31c0 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 23 May 2024 12:03:37 +0800 Subject: [PATCH 09/26] [Fix] Skip dbt build when recce state file is found Signed-off-by: Kent Huang --- .devcontainer/prepare_dbt_manifest.sh | 14 ++++++++++++++ .vscode/tasks.json | 7 +------ 2 files changed, 15 insertions(+), 6 deletions(-) create mode 100644 .devcontainer/prepare_dbt_manifest.sh diff --git a/.devcontainer/prepare_dbt_manifest.sh b/.devcontainer/prepare_dbt_manifest.sh new file mode 100644 index 000000000..b3648104c --- /dev/null +++ b/.devcontainer/prepare_dbt_manifest.sh @@ -0,0 +1,14 @@ +#! /bin/bash + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# Check if the dbt's target path is existed +if [ -d "${DIR}/../recce_state.json" ]; then + echo "The Recce state file is found. Skip preparing the dbt manifest." + exit 0 +fi + +echo "Preparing the dbt manifest." +pushd ${DIR}/../ +dbt deps && dbt build && dbt docs generate +popd diff --git a/.vscode/tasks.json b/.vscode/tasks.json index d9494ac15..b18018d02 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -8,11 +8,6 @@ "type": "shell", "command": "bash .devcontainer/github_codespace_env.sh", }, - { - "label": "Install Project dependencies", - "type": "shell", - "command": "poetry install && poetry env use python", - }, { "label": "Setup required environment variables", "type": "shell", @@ -21,7 +16,7 @@ { "label": "Run DBT", "type": "shell", - "command": "dbt deps && dbt build && dbt docs generate", + "command": "bash .devcontainer/prepare_dbt_manifest.sh", "dependsOn": [ "Setup required environment variables", "Check GitHub CodeSpace environment" From 78b057220c62da197dbcb123cd06f9cf9a4f6084 Mon Sep 17 00:00:00 2001 From: popcorny Date: Tue, 21 May 2024 17:53:32 +0800 Subject: [PATCH 10/26] Ad prsync Signed-off-by: popcorny --- .gitignore | 4 +- .prsync/foo | 1 + prsync.py | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 .prsync/foo create mode 100644 prsync.py diff --git a/.gitignore b/.gitignore index ab5471596..ee02bc517 100644 --- a/.gitignore +++ b/.gitignore @@ -69,4 +69,6 @@ supabase/.temp/ .recce/ target/ target-base/ -recce_state.json \ No newline at end of file +recce_state.json +venv/ +.envrc diff --git a/.prsync/foo b/.prsync/foo new file mode 100644 index 000000000..189899d22 --- /dev/null +++ b/.prsync/foo @@ -0,0 +1 @@ +barbar diff --git a/prsync.py b/prsync.py new file mode 100644 index 000000000..d2e12ebdd --- /dev/null +++ b/prsync.py @@ -0,0 +1,142 @@ +import os +import sys +import tempfile + +from github import Github, PullRequest +import git + + + +class GithubPrSyncer: + + def __init__( + self, + github_repo, + repo_path, + ): + self.github = Github(os.getenv('GITHUB_TOKEN')) + if self.github is None: + raise ValueError('GITHUB_TOKEN is required') + + self.github_repo = self.github.get_repo(github_repo) + if not self.github_repo.fork: + raise ValueError(f'{github_repo} is not a fork') + self.local_repo = git.Repo(repo_path) + self.remote_name = 'origin' + self.remote_parent_name = self.github_repo.owner.login + self.repo_path = repo_path + + def get_open_pull_requests(self, repository): + return repository.get_pulls(state='open') + + def sync_default_branch(self): + branch = self.github_repo.parent.default_branch + self.local_repo.git.push(self.remote_name, f'refs/remotes/{self.remote_parent_name}/{branch}:refs/heads/{branch}', force=True) + + def fetch_remotes(self): + print(f"Fetching {self.remote_name}...") + self.local_repo.remotes[self.remote_name].fetch() + print(f"Fetching {self.remote_parent_name}...") + self.local_repo.remotes[self.remote_parent_name].fetch() + + def sync_pull_request(self, pr: PullRequest, prsync_dir): + # format {owner}:{branch_name} + pr_from = f"{pr.head.user.login}:{pr.head.ref}" + pr_to = f"{pr.base.user.login}:{pr.base.ref}" + print(f"Syncing PR #{pr.number}: {pr.title}") + print(f"from {pr_from} to {pr_to}") + + # Remote details + repo_name = 'oso' + owner_name = pr.head.user.login + remote_name = f'remote_{owner_name}' + remote_url = f'https://github.com/{owner_name}/{repo_name}.git' + + # Check if the remote already exists, if not, add it + if remote_name not in [remote.name for remote in self.local_repo.remotes]: + print(f"Adding remote: {remote_name}") + self.local_repo.create_remote(remote_name, remote_url) + + # Fetch the branch from the remote + print(f"Fetch github repo: {owner_name}/{repo_name}") + self.local_repo.remotes[remote_name].fetch() + + # Create and checkout a local branch + branch_name = f'{owner_name}/{pr.head.ref}' + if branch_name not in self.local_repo.heads: + print(f"Creating local branch: {branch_name}") + self.local_repo.create_head(branch_name, f'{remote_name}/{pr.head.ref}') + self.local_repo.git.checkout(branch_name) + else: + print(f"Branch {branch_name} already exists. Force to use the latest changes.") + self.local_repo.git.checkout(branch_name) + self.local_repo.git.reset('--hard', f'{self.remote_name}/{branch_name}') + + # If no behind and ahead only 1, don't sync + behind, ahead = self.local_repo.git.rev_list(f'{remote_name}/{pr.head.ref}...{branch_name}', '--left-right', '--count').split() + if behind == '0' and ahead == '999': + print(f"Branch {branch_name} is up-to-date. Skip syncing.") + else: + print(f"Branch {branch_name} is {behind} behind and {ahead} ahead. Syncing...") + + # Copy all files in '.githubprsyncer/*' to the root of the repo and commit + print(f"Copying files from {prsync_dir}/ to the root of the {self.repo_path}/") + os.system(f'cp -r {prsync_dir}/* {self.repo_path}/') + self.local_repo.git.add('.') + # commit if there are changes + if self.local_repo.is_dirty(): + self.local_repo.git.commit('-m', f"Auto-sync by GithubPrSyncer") + else: + print("No changes to commit. Skip committing.") + + # Push the branch to the remote + print(f"Pushing branch {branch_name} to remote...") + self.local_repo.git.push(self.remote_name, branch_name, force=True) + + # Create pull request + pulls = self.github_repo.get_pulls(state='open', head=f"{self.github_repo.owner.login}:{branch_name}") + if pulls.totalCount > 0: + print(f"PR: {pulls[0].html_url}") + return pulls[0] + else: + body = f"{pr.body}\n\nsynced from {pr.number}" + pull_request = self.github_repo.create_pull(title=pr.title, body=body, head=f"{branch_name}", base=self.github_repo.parent.default_branch) + print(f"New PR created: {pull_request.html_url}") + return pull_request + + def sync(self): + # get the current branch + current_branch = self.local_repo.active_branch + + # create temp folder by python library + with tempfile.TemporaryDirectory() as temp_dir: + try: + # copy the files from .githubprsyncer/* to the temp dir + print(f"Copying files from .githubprsyncer/* to {temp_dir}/") + os.system(f'cp -r .prsync/* {temp_dir}/') + print() + + # sync the pull requests + open_pull_requests = self.github_repo.parent.get_pulls(state='open') + for pr in open_pull_requests: + if pr.number != 1144: + continue + + self.sync_pull_request(pr, temp_dir) + print() + + finally: + # checkout back to the original branch + self.local_repo.git.checkout(current_branch) + + +def main(): + repo_path = '.' + if len(sys.argv) > 1: + repo_path = sys.argv[1] + + syncer = GithubPrSyncer('DataRecce/oso', repo_path=repo_path) + syncer.sync() + +if __name__ == '__main__': + main() From 8bfc6798690301dddd75625c130b8d0e2c6316e3 Mon Sep 17 00:00:00 2001 From: popcorny Date: Tue, 21 May 2024 17:57:23 +0800 Subject: [PATCH 11/26] Add prsync Signed-off-by: popcorny --- .github/workflows/pysync.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/pysync.yml diff --git a/.github/workflows/pysync.yml b/.github/workflows/pysync.yml new file mode 100644 index 000000000..02bb85c6c --- /dev/null +++ b/.github/workflows/pysync.yml @@ -0,0 +1,27 @@ +name: PR Sync + +on: + workflow_dispatch: + +jobs: + run_pysync: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install gitpython PyGithub + + - name: Run pysync.py + run: | + python pysync.py From 4ad23831a540bb1db4182dba55dcae63a5126ef9 Mon Sep 17 00:00:00 2001 From: popcorny Date: Tue, 21 May 2024 18:13:50 +0800 Subject: [PATCH 12/26] Add prsync Signed-off-by: popcorny --- .github/workflows/pysync.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pysync.yml b/.github/workflows/pysync.yml index 02bb85c6c..dfb38b4a7 100644 --- a/.github/workflows/pysync.yml +++ b/.github/workflows/pysync.yml @@ -2,6 +2,9 @@ name: PR Sync on: workflow_dispatch: + pull_request: + branches: + - dev jobs: run_pysync: From fa2fcf3a24a5bd7dfe6958e60bfb076ad7f6fac3 Mon Sep 17 00:00:00 2001 From: popcorny Date: Wed, 22 May 2024 16:43:29 +0800 Subject: [PATCH 13/26] Add prsync Signed-off-by: popcorny --- .github/workflows/pysync.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pysync.yml b/.github/workflows/pysync.yml index dfb38b4a7..84887673a 100644 --- a/.github/workflows/pysync.yml +++ b/.github/workflows/pysync.yml @@ -23,8 +23,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install gitpython PyGithub + pip install git+https://github.com/DataRecce/github-pr-syncer.git - - name: Run pysync.py + - name: Run pysync + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - python pysync.py + prsync 'DataRecce/oso' From db3d54de96809887bfca7841f10a81839a86d536 Mon Sep 17 00:00:00 2001 From: popcorny Date: Wed, 22 May 2024 16:52:39 +0800 Subject: [PATCH 14/26] Add workflow Signed-off-by: popcorny --- .githubprsyncer/.github/workflows/recce-ci.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .githubprsyncer/.github/workflows/recce-ci.yml diff --git a/.githubprsyncer/.github/workflows/recce-ci.yml b/.githubprsyncer/.github/workflows/recce-ci.yml new file mode 100644 index 000000000..9d6cc1957 --- /dev/null +++ b/.githubprsyncer/.github/workflows/recce-ci.yml @@ -0,0 +1,14 @@ +name: OSO Recce CI + +on: + pull_request: + branches: + - main + +jobs: + print-hello-world: + runs-on: ubuntu-latest + + steps: + - name: Print Hello World + run: echo "Hello World" From d9b9163a7209994a314e942d619f28edac648909 Mon Sep 17 00:00:00 2001 From: popcorny Date: Wed, 22 May 2024 17:01:03 +0800 Subject: [PATCH 15/26] Add git username Signed-off-by: popcorny --- .github/workflows/{pysync.yml => prsync.yml} | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) rename .github/workflows/{pysync.yml => prsync.yml} (77%) diff --git a/.github/workflows/pysync.yml b/.github/workflows/prsync.yml similarity index 77% rename from .github/workflows/pysync.yml rename to .github/workflows/prsync.yml index 84887673a..8b4e6f49b 100644 --- a/.github/workflows/pysync.yml +++ b/.github/workflows/prsync.yml @@ -7,7 +7,7 @@ on: - dev jobs: - run_pysync: + prsync: runs-on: ubuntu-latest steps: @@ -25,6 +25,11 @@ jobs: python -m pip install --upgrade pip pip install git+https://github.com/DataRecce/github-pr-syncer.git + - name: Set up Git + run: | + git config --global user.name "prsync[bot]" + git config --global user.email "prsync[bot]@users.noreply.github.com" + - name: Run pysync env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 719e45345925a9ac47b2cd401ba2112f018a511d Mon Sep 17 00:00:00 2001 From: popcorny Date: Wed, 22 May 2024 18:04:59 +0800 Subject: [PATCH 16/26] Add workflow permission Signed-off-by: popcorny --- .github/workflows/prsync.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/prsync.yml b/.github/workflows/prsync.yml index 8b4e6f49b..fbc4b428a 100644 --- a/.github/workflows/prsync.yml +++ b/.github/workflows/prsync.yml @@ -6,6 +6,8 @@ on: branches: - dev +permissions: write-all + jobs: prsync: runs-on: ubuntu-latest From f53fce4ce60135047ab0a81c1cce551135faa2f5 Mon Sep 17 00:00:00 2001 From: popcorny Date: Thu, 23 May 2024 11:45:19 +0800 Subject: [PATCH 17/26] Add PAT Signed-off-by: popcorny --- .github/workflows/prsync.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/prsync.yml b/.github/workflows/prsync.yml index fbc4b428a..cc76e97fa 100644 --- a/.github/workflows/prsync.yml +++ b/.github/workflows/prsync.yml @@ -34,6 +34,8 @@ jobs: - name: Run pysync env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # We meed the "workflows" permission which teh github token for job cannot provide. Use the PersonalAccessToken instead. + GITHUB_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADVANCED_PERMISSIONS }} run: | prsync 'DataRecce/oso' From 1a131dc4448e218dc74055df1bf5436a3f5c1362 Mon Sep 17 00:00:00 2001 From: popcorny Date: Thu, 23 May 2024 14:07:13 +0800 Subject: [PATCH 18/26] Update the ci Signed-off-by: popcorny --- .../{prsync.yml => recce-prsync.yml} | 13 +- .../.github/workflows/recce-ci.yml | 112 +++++++++++++- .githubprsyncer/profiles.yml | 26 ++++ .prsync/foo | 1 - prsync.py | 142 ------------------ 5 files changed, 138 insertions(+), 156 deletions(-) rename .github/workflows/{prsync.yml => recce-prsync.yml} (74%) create mode 100644 .githubprsyncer/profiles.yml delete mode 100644 .prsync/foo delete mode 100644 prsync.py diff --git a/.github/workflows/prsync.yml b/.github/workflows/recce-prsync.yml similarity index 74% rename from .github/workflows/prsync.yml rename to .github/workflows/recce-prsync.yml index cc76e97fa..fe23a1060 100644 --- a/.github/workflows/prsync.yml +++ b/.github/workflows/recce-prsync.yml @@ -1,12 +1,11 @@ -name: PR Sync +# This workflow use the Github PR Syncer https://github.com/dataRecce/github-pr-syncer/ +# to sync the PRs from the upstream repository +name: OSO Recce PR Sync on: workflow_dispatch: - pull_request: - branches: - - dev - -permissions: write-all + schedule: + - cron: '30 17 * * *' # run at 1:30 AM (UTC + 8) everyday jobs: prsync: @@ -35,7 +34,7 @@ jobs: - name: Run pysync env: # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # We meed the "workflows" permission which teh github token for job cannot provide. Use the PersonalAccessToken instead. + # We need the "workflows" permission which the github token for job cannot provide. Use the PersonalAccessToken instead. GITHUB_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADVANCED_PERMISSIONS }} run: | prsync 'DataRecce/oso' diff --git a/.githubprsyncer/.github/workflows/recce-ci.yml b/.githubprsyncer/.github/workflows/recce-ci.yml index 9d6cc1957..ca041350f 100644 --- a/.githubprsyncer/.github/workflows/recce-ci.yml +++ b/.githubprsyncer/.github/workflows/recce-ci.yml @@ -2,13 +2,113 @@ name: OSO Recce CI on: pull_request: - branches: - - main + branches: [main, dev] + paths: + - warehouse/dbt/** + +env: + # dbt env variables used in your dbt profiles.yml + DBT_PROFILES_DIR: ./ + DBT_GOOGLE_PROJECT: ${{ vars.DBT_GOOGLE_PROJECT }} + DBT_GOOGLE_DATASET: ${{ vars.DBT_GOOGLE_DATASET }} + DBT_GOOGLE_KEYFILE: /tmp/google/google-service-account.json + KEYFILE_CONTENTS: ${{ secrets.KEYFILE_CONTENTS }} jobs: - print-hello-world: + check-pull-request: + name: Check pull request by Recce CI runs-on: ubuntu-latest - steps: - - name: Print Hello World - run: echo "Hello World" + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12.x" + + - name: Install dependencies + run: | + pipx install poetry==1.7.1 + poetry install + poetry run which dbt + + - name: Install Recce + run: poetry run pip install recce-nightly + + - name: Add packages.yml file + run: | + echo '${{ vars.PACKAGES_YAML }}' > packages.yml + + - name: Prep Google keyfile + run: | + mkdir -p "$(dirname $DBT_GOOGLE_KEYFILE)" + echo "$KEYFILE_CONTENTS" > $DBT_GOOGLE_KEYFILE + + - name: Prepare dbt Base environment + run: | + run_id=$(gh run list --workflow "OSO Recce Staging CI" --repo DataRecce/oso --status success --limit 1 --json databaseId --jq '.[0].databaseId') + gh run download $run_id --repo DataRecce/oso + mv dbt-artifacts target-base + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Set PR Schema + run: echo "DBT_GOOGLE_DEV_DATASET=OSO_PR_${{ github.event.pull_request.number }}" >> $GITHUB_ENV + + - name: Prepare dbt Current environment + run: | + source $(poetry env info --path)/bin/activate + dbt deps + dbt build --target ${{ env.DBT_CURRENT_TARGET}} + dbt docs generate --target ${{ env.DBT_CURRENT_TARGET}} + env: + DBT_CURRENT_TARGET: "dev" + + - name: Run Recce CI + run: poetry run recce run + + - name: Archive Recce State File + uses: actions/upload-artifact@v4 + id: recce-artifact-uploader + with: + name: recce-state-file + path: recce_state.json + + - name: Prepare Recce Summary + id: recce-summary + run: | + source $(poetry env info --path)/bin/activate + recce summary recce_state.json > recce_summary.md + cat recce_summary.md >> $GITHUB_STEP_SUMMARY + echo '${{ env.NEXT_STEP_MESSAGE }}' >> recce_summary.md + + # Handle the case when the recce summary is too long to be displayed in the GitHub PR comment + if [[ `wc -c recce_summary.md | awk '{print $1}'` -ge '65535' ]]; then + echo '# Recce Summary + The recce summary is too long to be displayed in the GitHub PR comment. + Please check the summary detail in the [Job Summary](${{github.server_url}}/${{github.repository}}/actions/runs/${{github.run_id}}) page. + ${{ env.NEXT_STEP_MESSAGE }}' > recce_summary.md + fi + + env: + ARTIFACT_URL: ${{ steps.recce-artifact-uploader.outputs.artifact-url }} + NEXT_STEP_MESSAGE: | + ## Next Steps + If you want to check more detail inforamtion about the recce result, please download the [artifact](${{ steps.recce-artifact-uploader.outputs.artifact-url }}) file and open it by [Recce](https://pypi.org/project/recce/) CLI. + + ### How to check the recce result + ```bash + # Unzip the downloaded artifact file + tar -xf recce-state-file.zip + + # Launch the recce server based on the state file + recce server --review recce_state.json + + # Open the recce server http://localhost:8000 by your browser + ``` + + - name: Comment on pull request + uses: thollander/actions-comment-pull-request@v2 + with: + filePath: recce_summary.md diff --git a/.githubprsyncer/profiles.yml b/.githubprsyncer/profiles.yml new file mode 100644 index 000000000..44f94cf68 --- /dev/null +++ b/.githubprsyncer/profiles.yml @@ -0,0 +1,26 @@ +opensource_observer: + outputs: + playground: + type: bigquery + method: service-account + keyfile: "{{ env_var('DBT_GOOGLE_KEYFILE') }}" + project: "{{ env_var('DBT_GOOGLE_PROJECT') }}" + dataset: "{{ env_var('DBT_GOOGLE_DATASET') }}" + job_execution_time_seconds: 300 + job_retries: 1 + location: US + threads: 32 + dev: + type: bigquery + method: service-account + keyfile: "{{ env_var('DBT_GOOGLE_KEYFILE') }}" + project: "{{ env_var('DBT_GOOGLE_PROJECT') }}" + dataset: "{{ env_var('DBT_GOOGLE_DEV_DATASET') }}" + job_execution_time_seconds: 300 + job_retries: 1 + location: US + threads: 32 + # By default we target the playground. it's less costly and also safer to write + # there while developing + target: playground + diff --git a/.prsync/foo b/.prsync/foo deleted file mode 100644 index 189899d22..000000000 --- a/.prsync/foo +++ /dev/null @@ -1 +0,0 @@ -barbar diff --git a/prsync.py b/prsync.py deleted file mode 100644 index d2e12ebdd..000000000 --- a/prsync.py +++ /dev/null @@ -1,142 +0,0 @@ -import os -import sys -import tempfile - -from github import Github, PullRequest -import git - - - -class GithubPrSyncer: - - def __init__( - self, - github_repo, - repo_path, - ): - self.github = Github(os.getenv('GITHUB_TOKEN')) - if self.github is None: - raise ValueError('GITHUB_TOKEN is required') - - self.github_repo = self.github.get_repo(github_repo) - if not self.github_repo.fork: - raise ValueError(f'{github_repo} is not a fork') - self.local_repo = git.Repo(repo_path) - self.remote_name = 'origin' - self.remote_parent_name = self.github_repo.owner.login - self.repo_path = repo_path - - def get_open_pull_requests(self, repository): - return repository.get_pulls(state='open') - - def sync_default_branch(self): - branch = self.github_repo.parent.default_branch - self.local_repo.git.push(self.remote_name, f'refs/remotes/{self.remote_parent_name}/{branch}:refs/heads/{branch}', force=True) - - def fetch_remotes(self): - print(f"Fetching {self.remote_name}...") - self.local_repo.remotes[self.remote_name].fetch() - print(f"Fetching {self.remote_parent_name}...") - self.local_repo.remotes[self.remote_parent_name].fetch() - - def sync_pull_request(self, pr: PullRequest, prsync_dir): - # format {owner}:{branch_name} - pr_from = f"{pr.head.user.login}:{pr.head.ref}" - pr_to = f"{pr.base.user.login}:{pr.base.ref}" - print(f"Syncing PR #{pr.number}: {pr.title}") - print(f"from {pr_from} to {pr_to}") - - # Remote details - repo_name = 'oso' - owner_name = pr.head.user.login - remote_name = f'remote_{owner_name}' - remote_url = f'https://github.com/{owner_name}/{repo_name}.git' - - # Check if the remote already exists, if not, add it - if remote_name not in [remote.name for remote in self.local_repo.remotes]: - print(f"Adding remote: {remote_name}") - self.local_repo.create_remote(remote_name, remote_url) - - # Fetch the branch from the remote - print(f"Fetch github repo: {owner_name}/{repo_name}") - self.local_repo.remotes[remote_name].fetch() - - # Create and checkout a local branch - branch_name = f'{owner_name}/{pr.head.ref}' - if branch_name not in self.local_repo.heads: - print(f"Creating local branch: {branch_name}") - self.local_repo.create_head(branch_name, f'{remote_name}/{pr.head.ref}') - self.local_repo.git.checkout(branch_name) - else: - print(f"Branch {branch_name} already exists. Force to use the latest changes.") - self.local_repo.git.checkout(branch_name) - self.local_repo.git.reset('--hard', f'{self.remote_name}/{branch_name}') - - # If no behind and ahead only 1, don't sync - behind, ahead = self.local_repo.git.rev_list(f'{remote_name}/{pr.head.ref}...{branch_name}', '--left-right', '--count').split() - if behind == '0' and ahead == '999': - print(f"Branch {branch_name} is up-to-date. Skip syncing.") - else: - print(f"Branch {branch_name} is {behind} behind and {ahead} ahead. Syncing...") - - # Copy all files in '.githubprsyncer/*' to the root of the repo and commit - print(f"Copying files from {prsync_dir}/ to the root of the {self.repo_path}/") - os.system(f'cp -r {prsync_dir}/* {self.repo_path}/') - self.local_repo.git.add('.') - # commit if there are changes - if self.local_repo.is_dirty(): - self.local_repo.git.commit('-m', f"Auto-sync by GithubPrSyncer") - else: - print("No changes to commit. Skip committing.") - - # Push the branch to the remote - print(f"Pushing branch {branch_name} to remote...") - self.local_repo.git.push(self.remote_name, branch_name, force=True) - - # Create pull request - pulls = self.github_repo.get_pulls(state='open', head=f"{self.github_repo.owner.login}:{branch_name}") - if pulls.totalCount > 0: - print(f"PR: {pulls[0].html_url}") - return pulls[0] - else: - body = f"{pr.body}\n\nsynced from {pr.number}" - pull_request = self.github_repo.create_pull(title=pr.title, body=body, head=f"{branch_name}", base=self.github_repo.parent.default_branch) - print(f"New PR created: {pull_request.html_url}") - return pull_request - - def sync(self): - # get the current branch - current_branch = self.local_repo.active_branch - - # create temp folder by python library - with tempfile.TemporaryDirectory() as temp_dir: - try: - # copy the files from .githubprsyncer/* to the temp dir - print(f"Copying files from .githubprsyncer/* to {temp_dir}/") - os.system(f'cp -r .prsync/* {temp_dir}/') - print() - - # sync the pull requests - open_pull_requests = self.github_repo.parent.get_pulls(state='open') - for pr in open_pull_requests: - if pr.number != 1144: - continue - - self.sync_pull_request(pr, temp_dir) - print() - - finally: - # checkout back to the original branch - self.local_repo.git.checkout(current_branch) - - -def main(): - repo_path = '.' - if len(sys.argv) > 1: - repo_path = sys.argv[1] - - syncer = GithubPrSyncer('DataRecce/oso', repo_path=repo_path) - syncer.sync() - -if __name__ == '__main__': - main() From 4be70f2378f2bcd315f631d3944ac0d3f9608c17 Mon Sep 17 00:00:00 2001 From: popcorny Date: Thu, 23 May 2024 14:30:36 +0800 Subject: [PATCH 19/26] Add manual trigger Signed-off-by: popcorny --- .githubprsyncer/.github/workflows/recce-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.githubprsyncer/.github/workflows/recce-ci.yml b/.githubprsyncer/.github/workflows/recce-ci.yml index ca041350f..73e8c34e9 100644 --- a/.githubprsyncer/.github/workflows/recce-ci.yml +++ b/.githubprsyncer/.github/workflows/recce-ci.yml @@ -1,8 +1,10 @@ name: OSO Recce CI on: + workflow_dispatch: pull_request: - branches: [main, dev] + branches: + - main paths: - warehouse/dbt/** From ad428aa9441b152df2d4747d4808c2df85ff761b Mon Sep 17 00:00:00 2001 From: popcorny Date: Thu, 23 May 2024 15:10:46 +0800 Subject: [PATCH 20/26] Add manual trigger Signed-off-by: popcorny --- .github/workflows/recce-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/recce-ci.yml b/.github/workflows/recce-ci.yml index ca041350f..335719034 100644 --- a/.github/workflows/recce-ci.yml +++ b/.github/workflows/recce-ci.yml @@ -1,6 +1,7 @@ name: OSO Recce CI on: + workflow_dispatch: pull_request: branches: [main, dev] paths: From 2ade5f08ec0b133f4b5f73545a131e363a2f297f Mon Sep 17 00:00:00 2001 From: popcorny Date: Thu, 23 May 2024 15:19:42 +0800 Subject: [PATCH 21/26] Dont put the workflow file in the prsync Signed-off-by: popcorny --- .github/workflows/recce-prsync.yml | 4 +- .../.github/workflows/recce-ci.yml | 116 ------------------ 2 files changed, 1 insertion(+), 119 deletions(-) delete mode 100644 .githubprsyncer/.github/workflows/recce-ci.yml diff --git a/.github/workflows/recce-prsync.yml b/.github/workflows/recce-prsync.yml index fe23a1060..867d55ede 100644 --- a/.github/workflows/recce-prsync.yml +++ b/.github/workflows/recce-prsync.yml @@ -33,8 +33,6 @@ jobs: - name: Run pysync env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # We need the "workflows" permission which the github token for job cannot provide. Use the PersonalAccessToken instead. - GITHUB_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADVANCED_PERMISSIONS }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | prsync 'DataRecce/oso' diff --git a/.githubprsyncer/.github/workflows/recce-ci.yml b/.githubprsyncer/.github/workflows/recce-ci.yml deleted file mode 100644 index 73e8c34e9..000000000 --- a/.githubprsyncer/.github/workflows/recce-ci.yml +++ /dev/null @@ -1,116 +0,0 @@ -name: OSO Recce CI - -on: - workflow_dispatch: - pull_request: - branches: - - main - paths: - - warehouse/dbt/** - -env: - # dbt env variables used in your dbt profiles.yml - DBT_PROFILES_DIR: ./ - DBT_GOOGLE_PROJECT: ${{ vars.DBT_GOOGLE_PROJECT }} - DBT_GOOGLE_DATASET: ${{ vars.DBT_GOOGLE_DATASET }} - DBT_GOOGLE_KEYFILE: /tmp/google/google-service-account.json - KEYFILE_CONTENTS: ${{ secrets.KEYFILE_CONTENTS }} - -jobs: - check-pull-request: - name: Check pull request by Recce CI - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.12.x" - - - name: Install dependencies - run: | - pipx install poetry==1.7.1 - poetry install - poetry run which dbt - - - name: Install Recce - run: poetry run pip install recce-nightly - - - name: Add packages.yml file - run: | - echo '${{ vars.PACKAGES_YAML }}' > packages.yml - - - name: Prep Google keyfile - run: | - mkdir -p "$(dirname $DBT_GOOGLE_KEYFILE)" - echo "$KEYFILE_CONTENTS" > $DBT_GOOGLE_KEYFILE - - - name: Prepare dbt Base environment - run: | - run_id=$(gh run list --workflow "OSO Recce Staging CI" --repo DataRecce/oso --status success --limit 1 --json databaseId --jq '.[0].databaseId') - gh run download $run_id --repo DataRecce/oso - mv dbt-artifacts target-base - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Set PR Schema - run: echo "DBT_GOOGLE_DEV_DATASET=OSO_PR_${{ github.event.pull_request.number }}" >> $GITHUB_ENV - - - name: Prepare dbt Current environment - run: | - source $(poetry env info --path)/bin/activate - dbt deps - dbt build --target ${{ env.DBT_CURRENT_TARGET}} - dbt docs generate --target ${{ env.DBT_CURRENT_TARGET}} - env: - DBT_CURRENT_TARGET: "dev" - - - name: Run Recce CI - run: poetry run recce run - - - name: Archive Recce State File - uses: actions/upload-artifact@v4 - id: recce-artifact-uploader - with: - name: recce-state-file - path: recce_state.json - - - name: Prepare Recce Summary - id: recce-summary - run: | - source $(poetry env info --path)/bin/activate - recce summary recce_state.json > recce_summary.md - cat recce_summary.md >> $GITHUB_STEP_SUMMARY - echo '${{ env.NEXT_STEP_MESSAGE }}' >> recce_summary.md - - # Handle the case when the recce summary is too long to be displayed in the GitHub PR comment - if [[ `wc -c recce_summary.md | awk '{print $1}'` -ge '65535' ]]; then - echo '# Recce Summary - The recce summary is too long to be displayed in the GitHub PR comment. - Please check the summary detail in the [Job Summary](${{github.server_url}}/${{github.repository}}/actions/runs/${{github.run_id}}) page. - ${{ env.NEXT_STEP_MESSAGE }}' > recce_summary.md - fi - - env: - ARTIFACT_URL: ${{ steps.recce-artifact-uploader.outputs.artifact-url }} - NEXT_STEP_MESSAGE: | - ## Next Steps - If you want to check more detail inforamtion about the recce result, please download the [artifact](${{ steps.recce-artifact-uploader.outputs.artifact-url }}) file and open it by [Recce](https://pypi.org/project/recce/) CLI. - - ### How to check the recce result - ```bash - # Unzip the downloaded artifact file - tar -xf recce-state-file.zip - - # Launch the recce server based on the state file - recce server --review recce_state.json - - # Open the recce server http://localhost:8000 by your browser - ``` - - - name: Comment on pull request - uses: thollander/actions-comment-pull-request@v2 - with: - filePath: recce_summary.md From e7b7bd3e268438468c4a683b46c77f74f98cac14 Mon Sep 17 00:00:00 2001 From: popcorny Date: Thu, 23 May 2024 16:10:50 +0800 Subject: [PATCH 22/26] Add dummy workflows Signed-off-by: popcorny --- .githubprsyncer/.github/workflows/hello.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .githubprsyncer/.github/workflows/hello.yml diff --git a/.githubprsyncer/.github/workflows/hello.yml b/.githubprsyncer/.github/workflows/hello.yml new file mode 100644 index 000000000..9105ebb83 --- /dev/null +++ b/.githubprsyncer/.github/workflows/hello.yml @@ -0,0 +1,16 @@ +name: Hello World Workflow + +on: + pull_request: + types: [opened, synchronize] + +jobs: + hello-world-job: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Print Hello, World! + run: echo "Hello, World!" From c70e0a95eeb73cc840e28e510b36ec2291a3f430 Mon Sep 17 00:00:00 2001 From: popcorny Date: Thu, 23 May 2024 16:14:10 +0800 Subject: [PATCH 23/26] test permission Signed-off-by: popcorny --- .github/workflows/recce-prsync.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/recce-prsync.yml b/.github/workflows/recce-prsync.yml index 867d55ede..d9caede84 100644 --- a/.github/workflows/recce-prsync.yml +++ b/.github/workflows/recce-prsync.yml @@ -33,6 +33,7 @@ jobs: - name: Run pysync env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADVANCED_PERMISSIONS }} run: | prsync 'DataRecce/oso' From e529c3763b68070fccf93e4bdf4f238f15d5bb11 Mon Sep 17 00:00:00 2001 From: popcorny Date: Thu, 23 May 2024 16:28:04 +0800 Subject: [PATCH 24/26] Test permmision issue Signed-off-by: popcorny --- .githubprsyncer/.github/workflows/hello.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.githubprsyncer/.github/workflows/hello.yml b/.githubprsyncer/.github/workflows/hello.yml index 9105ebb83..37594b996 100644 --- a/.githubprsyncer/.github/workflows/hello.yml +++ b/.githubprsyncer/.github/workflows/hello.yml @@ -13,4 +13,4 @@ jobs: uses: actions/checkout@v2 - name: Print Hello, World! - run: echo "Hello, World!" + run: echo "Hello, World! Test permissions." From 9336f2f7d864997782081843bd741a3c21916709 Mon Sep 17 00:00:00 2001 From: "Wei-Chun, Chang" Date: Fri, 24 May 2024 16:27:53 +0800 Subject: [PATCH 25/26] Cast from bool to string Signed-off-by: Wei-Chun, Chang --- .../dbt/models/marts/superchain/rf4_trusted_users.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql b/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql index a8b4dd53a..9909bff81 100644 --- a/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql +++ b/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql @@ -5,11 +5,11 @@ with user_model as ( artifacts_by_user.user_source_id, artifacts_by_user.artifact_name, CAST( - true as bool + true as string ) as eigentrust_verification, CAST( passport_scores.evidence_rawscore - >= passport_scores.evidence_threshold as bool + >= passport_scores.evidence_threshold as string ) as passport_verification from {{ ref('int_artifacts_by_user') }} as artifacts_by_user left join {{ ref('stg_passport__scores') }} as passport_scores @@ -23,5 +23,5 @@ select artifact_name from user_model where - passport_verification is true - or eigentrust_verification is true + passport_verification = "true" + or eigentrust_verification = "true" From f76ec9bfeb42a2b2b22b4d2851e0b2c9d9843a57 Mon Sep 17 00:00:00 2001 From: "Wei-Chun, Chang" Date: Fri, 24 May 2024 18:27:28 +0800 Subject: [PATCH 26/26] Adjust trusted user criteria Signed-off-by: Wei-Chun, Chang --- .../dbt/models/marts/superchain/rf4_trusted_users.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql b/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql index 9909bff81..1ab0cef45 100644 --- a/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql +++ b/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql @@ -5,11 +5,11 @@ with user_model as ( artifacts_by_user.user_source_id, artifacts_by_user.artifact_name, CAST( - true as string + true as bool ) as eigentrust_verification, CAST( passport_scores.evidence_rawscore - >= passport_scores.evidence_threshold as string + > passport_scores.evidence_threshold as bool ) as passport_verification from {{ ref('int_artifacts_by_user') }} as artifacts_by_user left join {{ ref('stg_passport__scores') }} as passport_scores @@ -23,5 +23,5 @@ select artifact_name from user_model where - passport_verification = "true" - or eigentrust_verification = "true" + passport_verification is true + or eigentrust_verification is true