diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 000000000..f3698c063
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,41 @@
+FROM mcr.microsoft.com/vscode/devcontainers/python:3.11
+
+# Add the GitHub CLI apt repository, then install gh and the Python headers.
+# The build steps below call apt-get/usermod directly, so they already run with
+# root privileges and sudo is unnecessary. Repository setup, `apt-get update`, and
+# the install share one layer so the package index used for the install is fresh.
+RUN (command -v wget >/dev/null || (apt-get update && apt-get install -y wget)) \
+    && mkdir -p -m 755 /etc/apt/keyrings \
+    && wget -qO- https://cli.github.com/packages/githubcli-archive-keyring.gpg > /etc/apt/keyrings/githubcli-archive-keyring.gpg \
+    && chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \
+    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" > /etc/apt/sources.list.d/github-cli.list \
+    && apt-get update \
+    && apt-get install -y python3-dev gh \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG USER_UID=1000
+ARG USER_GID=$USER_UID
+
+# Re-map the vscode user and group when either build arg is not 1000.
+RUN if [ "$USER_GID" != "1000" ] || [ "$USER_UID" != "1000" ]; then groupmod --gid "$USER_GID" vscode && usermod --uid "$USER_UID" --gid "$USER_GID" vscode; fi
+
+# Install the Python tooling without keeping pip's download cache in the layer.
+RUN pip3 install --no-cache-dir --upgrade pip \
+    && pip3 install --no-cache-dir pipx \
+    && pipx install poetry
+
+# Download, unpack, and install the gcloud SDK in one layer so the tarball is
+# removed before the layer is committed.
+RUN curl -fsSL https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz -o /tmp/google-cloud-sdk.tar.gz \
+    && mkdir -p /usr/local/gcloud \
+    && tar -C /usr/local/gcloud -xf /tmp/google-cloud-sdk.tar.gz \
+    && rm /tmp/google-cloud-sdk.tar.gz \
+    && /usr/local/gcloud/google-cloud-sdk/install.sh --quiet
+
+# Add the gcloud SDK binaries to PATH
+ENV PATH=$PATH:/usr/local/gcloud/google-cloud-sdk/bin
+
+# Install the oso Python dependencies
+COPY . /tmp/oso
+RUN pip3 install --no-cache-dir /tmp/oso
+RUN pip3 install --no-cache-dir recce-nightly
\ No newline at end of file
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 000000000..484d5f99d
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,22 @@
+{
+  "name": "Recce CodeSpace",
+  "image": "infuseai/oso-dev-container:3.11",
+  "containerEnv": {
+    "DBT_GOOGLE_PROJECT": "infuseai-dev",
+    "DBT_GOOGLE_DATASET": "oso_playground",
+    "DBT_GOOGLE_DEV_DATASET": "oso_playground_dev",
+    "DBT_GOOGLE_KEYFILE": "/home/vscode/.config/gcloud/google-service-account.json",
+    "RECCE_CI_WORKFLOW_NAME": "OSO Recce CI",
+    "RECCE_DAILY_CI_WORKFLOW_NAME": "OSO Recce Staging CI"
+  },
+  "customizations": {
+    "vscode": {
+      "settings": {
+        "terminal.integrated.shell.linux": "/bin/bash",
+        "python.terminal.activateEnvironment": true
+      },
+      "extensions": []
+    }
+  },
+  "forwardPorts": [8000]
+}
\ No newline at end of file
diff --git a/.devcontainer/docker-build.sh b/.devcontainer/docker-build.sh
new file mode 100644
index 000000000..cb7beff81
--- /dev/null
+++ b/.devcontainer/docker-build.sh
@@ -0,0 +1,3 @@
+#! /bin/bash
+
+docker buildx build --platform linux/amd64 -t infuseai/oso-dev-container:3.11 .. -f Dockerfile
\ No newline at end of file
diff --git a/.devcontainer/github_codespace_env.sh b/.devcontainer/github_codespace_env.sh
new file mode 100644
index 000000000..540f52feb
--- /dev/null
+++ b/.devcontainer/github_codespace_env.sh
@@ -0,0 +1,39 @@
+#! /bin/bash
+#
+# Prepare a GitHub Codespace for Recce: set the default gh repository, download the
+# Recce state file of the current pull request (if any), and download the daily
+# staging dbt artifacts into target-base. Does nothing outside of Codespaces.
+
+if [ "${CODESPACES}" == "true" ]; then
+    # Set the default git repository if running in GitHub Codespaces
+    echo "Setting the default git repository to $GITHUB_REPOSITORY"
+    gh repo set-default "$GITHUB_REPOSITORY"
+
+    current_branch=$(git branch --show-current)
+    # Check if the current branch is under a pull request
+    if gh pr view > /dev/null ; then
+        # Look up the latest successful Recce CI run for this branch
+        run_id=$(gh run list -b "${current_branch}" -s success --limit 1 -w "${RECCE_CI_WORKFLOW_NAME}" --json databaseId --jq '.[].databaseId')
+        if [ -z "$run_id" ]; then
+            echo "No successful Recce run found for the current branch."
+        else
+            echo "Downloading the Recce state file for the last successful run."
+            # The CI workflow uploads the artifact as recce-state-file; gh places it in a directory of that name
+            gh run download "$run_id" --dir .recce
+            echo "The Recce state file is downloaded to '.recce/recce-state-file/recce_state.json'."
+        fi
+    fi
+
+    # Check daily staging artifact files
+    # NOTE(review): the branch is hard-coded to dev here; confirm that is intended.
+    daily_artifact_workflow_id=$(gh run list -w "${RECCE_DAILY_CI_WORKFLOW_NAME}" --status success -b dev --limit 1 --json databaseId --jq '.[].databaseId')
+    if [ -z "$daily_artifact_workflow_id" ]; then
+        echo "No successful daily staging run found."
+    else
+        gh run download "$daily_artifact_workflow_id" --dir .recce
+    fi
+    if [ -d ".recce/dbt-artifacts" ]; then
+        mv .recce/dbt-artifacts target-base
+        echo "The daily staging artifact files are downloaded to 'target-base'."
+    fi
+fi
\ No newline at end of file
diff --git a/.devcontainer/launch_recce_server.sh b/.devcontainer/launch_recce_server.sh
new file mode 100644
index 000000000..c85c4dbfb
--- /dev/null
+++ b/.devcontainer/launch_recce_server.sh
@@ -0,0 +1,14 @@
+#! /bin/bash
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+# Where github_codespace_env.sh downloads the recce-state-file artifact.
+STATE_FILE="${DIR}/../.recce/recce-state-file/recce_state.json"
+
+if [ -f "${STATE_FILE}" ]; then
+    echo "Launching the Recce server in review mode. The Recce state file is found."
+    cp "${STATE_FILE}" recce_state.json
+    recce server --review recce_state.json
+else
+    echo "Launching the Recce server."
+    recce server
+fi
\ No newline at end of file
diff --git a/.devcontainer/prepare_dbt_manifest.sh b/.devcontainer/prepare_dbt_manifest.sh
new file mode 100644
index 000000000..b3648104c
--- /dev/null
+++ b/.devcontainer/prepare_dbt_manifest.sh
@@ -0,0 +1,14 @@
+#! /bin/bash
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# Skip the dbt run when the Recce state file (a regular file, hence -f) already exists
+if [ -f "${DIR}/../recce_state.json" ]; then
+    echo "The Recce state file is found. Skip preparing the dbt manifest."
+    exit 0
+fi
+
+echo "Preparing the dbt manifest."
+pushd "${DIR}/../"
+dbt deps && dbt build && dbt docs generate
+popd
diff --git a/.devcontainer/setup_required_env.sh b/.devcontainer/setup_required_env.sh
new file mode 100644
index 000000000..e09fde804
--- /dev/null
+++ b/.devcontainer/setup_required_env.sh
@@ -0,0 +1,61 @@
+#! /bin/bash
+#
+# Check the environment variables dbt needs, optionally write the dbt profiles.yml,
+# and set up Google Cloud credentials (service account key or interactive login).
+
+GREEN="\033[0;32m"
+YELLOW="\033[1;33m"
+ENDCOLOR="\033[0m"
+
+# Print how to add the Codespaces secret named by $1.
+function show_env_hint() {
+    echo -e "[${YELLOW}Required ENV${ENDCOLOR}] $1 is not set."
+    cat << EOF
+
+Please set the following environment variables in your GitHub Codespaces Secrets.
+You can set the secret in your GitHub Codespaces Secrets by going to:
+    GitHub Personal Account -> Settings -> Codespaces -> Codespaces secrets
+
+Then add the secret with the name $1 and the value of the secret.
+After adding the secret, please restart the Codespaces to apply the changes.
+EOF
+}
+
+# Check the ENV variables that must be provided by the user
+for required_env in DBT_GOOGLE_PROJECT DBT_GOOGLE_DATASET DBT_GOOGLE_DEV_DATASET; do
+    # ${!required_env} is the value of the variable whose name required_env holds
+    if [ -z "${!required_env}" ]; then
+        show_env_hint "$required_env"
+        exit 1
+    fi
+done
+
+mkdir -p "$HOME/.config/gcloud"
+DBT_GOOGLE_KEYFILE="$HOME/.config/gcloud/google-service-account.json"
+
+# Setup dbt profiles.yml
+if [ "$DBT_PROFILES_YML_CONTENT" != '' ]; then
+    # Create the directory first; the redirect fails if it does not exist
+    mkdir -p "$HOME/.dbt"
+    echo "$DBT_PROFILES_YML_CONTENT" > "$HOME/.dbt/profiles.yml"
+    echo "dbt profiles.yml is saved to $HOME/.dbt/profiles.yml"
+fi
+
+# Check if the user is already logged in
+if [ -z "$GOOGLE_CLOUD_SERVICE_ACCOUNT_KEY_CONTENT" ]; then
+    # Change to use OAuth2 to login
+    if [ -f "${DBT_GOOGLE_KEYFILE}" ]; then
+        echo "User is already logged in Google cloud"
+        gcloud auth list
+        exit 0
+    else
+        echo -e "[${GREEN}Action${ENDCOLOR}] Please login to Google cloud to continue."
+        gcloud auth application-default login
+    fi
+else
+    # Use the service account key to login
+    echo "$GOOGLE_CLOUD_SERVICE_ACCOUNT_KEY_CONTENT" > "${DBT_GOOGLE_KEYFILE}"
+    # The file holds a credential, so make it readable by the owner only
+    chmod 600 "${DBT_GOOGLE_KEYFILE}"
+    echo "Google cloud service account key is saved to ${DBT_GOOGLE_KEYFILE}"
+fi
diff --git a/.github/workflows/recce-ci.yml b/.github/workflows/recce-ci.yml
new file mode 100644
index 000000000..335719034
--- /dev/null
+++ b/.github/workflows/recce-ci.yml
@@ -0,0 +1,117 @@
+# Builds the PR's dbt models into a per-PR dataset, runs Recce against the latest
+# staging artifacts, and posts the Recce summary as a pull request comment.
+name: OSO Recce CI
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches: [main, dev]
+    paths:
+      - warehouse/dbt/**
+
+env:
+  # dbt env variables used in your dbt profiles.yml
+  DBT_PROFILES_DIR: ./
+  DBT_GOOGLE_PROJECT: ${{ vars.DBT_GOOGLE_PROJECT }}
+  DBT_GOOGLE_DATASET: ${{ vars.DBT_GOOGLE_DATASET }}
+  DBT_GOOGLE_KEYFILE: /tmp/google/google-service-account.json
+  KEYFILE_CONTENTS: ${{ secrets.KEYFILE_CONTENTS }}
+
+jobs:
+  check-pull-request:
+    name: Check pull request by Recce CI
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12.x"
+
+      - name: Install dependencies
+        run: |
+          pipx install poetry==1.7.1
+          poetry install
+          poetry run which dbt
+
+      - name: Install Recce
+        run: poetry run pip install recce-nightly
+
+      - name: Add packages.yml file
+        run: |
+          echo '${{ vars.PACKAGES_YAML }}' > packages.yml
+
+      - name: Prep Google keyfile
+        run: |
+          mkdir -p "$(dirname $DBT_GOOGLE_KEYFILE)"
+          echo "$KEYFILE_CONTENTS" > $DBT_GOOGLE_KEYFILE
+
+      - name: Prepare dbt Base environment
+        run: |
+          run_id=$(gh run list --workflow "OSO Recce Staging CI" --repo DataRecce/oso --status success --limit 1 --json databaseId --jq '.[0].databaseId')
+          gh run download $run_id --repo DataRecce/oso
+          mv dbt-artifacts target-base
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set PR Schema
+        run: echo "DBT_GOOGLE_DEV_DATASET=OSO_PR_${{ github.event.pull_request.number }}" >> $GITHUB_ENV
+
+      - name: Prepare dbt Current environment
+        run: |
+          source $(poetry env info --path)/bin/activate
+          dbt deps
+          dbt build --target ${{ env.DBT_CURRENT_TARGET}}
+          dbt docs generate --target ${{ env.DBT_CURRENT_TARGET}}
+        env:
+          DBT_CURRENT_TARGET: "dev"
+
+      - name: Run Recce CI
+        run: poetry run recce run
+
+      - name: Archive Recce State File
+        uses: actions/upload-artifact@v4
+        id: recce-artifact-uploader
+        with:
+          name: recce-state-file
+          path: recce_state.json
+
+      - name: Prepare Recce Summary
+        id: recce-summary
+        run: |
+          source $(poetry env info --path)/bin/activate
+          recce summary recce_state.json > recce_summary.md
+          cat recce_summary.md >> $GITHUB_STEP_SUMMARY
+          echo '${{ env.NEXT_STEP_MESSAGE }}' >> recce_summary.md
+
+          # Handle the case when the recce summary is too long to be displayed in the GitHub PR comment
+          if [[ `wc -c recce_summary.md | awk '{print $1}'` -ge '65535' ]]; then
+            echo '# Recce Summary
+          The recce summary is too long to be displayed in the GitHub PR comment.
+          Please check the summary detail in the [Job Summary](${{github.server_url}}/${{github.repository}}/actions/runs/${{github.run_id}}) page.
+          ${{ env.NEXT_STEP_MESSAGE }}' > recce_summary.md
+          fi
+
+        env:
+          ARTIFACT_URL: ${{ steps.recce-artifact-uploader.outputs.artifact-url }}
+          NEXT_STEP_MESSAGE: |
+            ## Next Steps
+            If you want to check more detail information about the recce result, please download the [artifact](${{ steps.recce-artifact-uploader.outputs.artifact-url }}) file and open it by [Recce](https://pypi.org/project/recce/) CLI.
+
+            ### How to check the recce result
+            ```bash
+            # Unzip the downloaded artifact file
+            unzip recce-state-file.zip
+
+            # Launch the recce server based on the state file
+            recce server --review recce_state.json
+
+            # Open the recce server http://localhost:8000 by your browser
+            ```
+
+      - name: Comment on pull request
+        uses: thollander/actions-comment-pull-request@v2
+        with:
+          filePath: recce_summary.md
diff --git a/.github/workflows/recce-prsync.yml b/.github/workflows/recce-prsync.yml
new file mode 100644
index 000000000..d9caede84
--- /dev/null
+++ b/.github/workflows/recce-prsync.yml
@@ -0,0 +1,39 @@
+# This workflow uses the GitHub PR Syncer https://github.com/dataRecce/github-pr-syncer/
+# to sync the PRs from the upstream repository
+name: OSO Recce PR Sync
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '30 17 * * *' # run at 1:30 AM (UTC + 8) everyday
+
+jobs:
+  prsync:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install git+https://github.com/DataRecce/github-pr-syncer.git
+
+      - name: Set up Git
+        run: |
+          git config --global user.name "prsync[bot]"
+          git config --global user.email "prsync[bot]@users.noreply.github.com"
+
+      - name: Run prsync
+        env:
+          # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADVANCED_PERMISSIONS }}
+        run: |
+          prsync 'DataRecce/oso'
diff --git a/.github/workflows/recce-staging-daily.yml b/.github/workflows/recce-staging-daily.yml
new file mode 100644
index 000000000..d6de11f7d
--- /dev/null
+++ b/.github/workflows/recce-staging-daily.yml
@@ -0,0 +1,68 @@
+name: OSO Recce Staging CI
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 18 * * *' # run at 2 AM (UTC + 8) everyday
+  pull_request:
+    types: [closed]
+    branches: [main, dev]
+    paths:
+      - warehouse/dbt/**
+
+env:
+  # dbt env variables used in your dbt profiles.yml
+  DBT_PROFILES_DIR: ./
+  DBT_GOOGLE_PROJECT: ${{ vars.DBT_GOOGLE_PROJECT }}
+  DBT_GOOGLE_DATASET: ${{ vars.DBT_GOOGLE_DATASET }}
+  DBT_GOOGLE_KEYFILE: /tmp/google/google-service-account.json
+  PACKAGES_YAML: ${{ vars.PACKAGES_YAML }}
+  KEYFILE_CONTENTS: ${{ secrets.KEYFILE_CONTENTS }}
+
+jobs:
+  prepare-recce-staging-env:
+    name: Prepare Recce staging env
+    if: github.event.pull_request.merged == true || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12.x"
+
+      - name: Install dependencies
+        run: |
+          pipx install poetry==1.7.1
+          poetry install
+          source $(poetry env info --path)/bin/activate
+          which dbt
+
+      - name: Add packages.yml file
+        run: |
+          # PACKAGES_YAML comes from the workflow-level env, so quotes in its value cannot break this script
+          echo "$PACKAGES_YAML" > packages.yml
+
+      - name: Prep Google keyfile
+        run: |
+          mkdir -p "$(dirname $DBT_GOOGLE_KEYFILE)"
+          echo "$KEYFILE_CONTENTS" > $DBT_GOOGLE_KEYFILE
+
+      - name: Prepare dbt Base environment
+        run: |
+          source $(poetry env info --path)/bin/activate
+          dbt deps
+          dbt build --target ${{ env.DBT_BASE_TARGET }}
+          dbt docs generate --target ${{ env.DBT_BASE_TARGET }}
+        env:
+          DBT_BASE_TARGET: "playground"
+
+      - name: Archive DBT artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: dbt-artifacts
+          path: |
+            target/manifest.json
+            target/catalog.json
diff --git a/.githubprsyncer/.github/workflows/hello.yml b/.githubprsyncer/.github/workflows/hello.yml
new file mode 100644
index 000000000..37594b996
--- /dev/null
+++ b/.githubprsyncer/.github/workflows/hello.yml
@@ -0,0 +1,16 @@
+name: Hello World Workflow
+
+on:
+  pull_request:
+    types: [opened, synchronize]
+
+jobs:
+  hello-world-job:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Print Hello, World!
+        run: echo "Hello, World! Test permissions."
diff --git a/.githubprsyncer/profiles.yml b/.githubprsyncer/profiles.yml
new file mode 100644
index 000000000..44f94cf68
--- /dev/null
+++ b/.githubprsyncer/profiles.yml
@@ -0,0 +1,26 @@
+opensource_observer:
+  outputs:
+    playground:
+      type: bigquery
+      method: service-account
+      keyfile: "{{ env_var('DBT_GOOGLE_KEYFILE') }}"
+      project: "{{ env_var('DBT_GOOGLE_PROJECT') }}"
+      dataset: "{{ env_var('DBT_GOOGLE_DATASET') }}"
+      job_execution_time_seconds: 300
+      job_retries: 1
+      location: US
+      threads: 32
+    dev:
+      type: bigquery
+      method: service-account
+      keyfile: "{{ env_var('DBT_GOOGLE_KEYFILE') }}"
+      project: "{{ env_var('DBT_GOOGLE_PROJECT') }}"
+      dataset: "{{ env_var('DBT_GOOGLE_DEV_DATASET') }}"
+      job_execution_time_seconds: 300
+      job_retries: 1
+      location: US
+      threads: 32
+  # By default we target the playground. it's less costly and also safer to write
+  # there while developing
+  target: playground
+
diff --git a/.gitignore b/.gitignore
index e78628001..ee02bc517 100644
--- a/.gitignore
+++ b/.gitignore
@@ -63,4 +63,12 @@ dbt_packages/
 
 supabase/.temp/
 **/supabase/.temp/
-*/**/supabase/.temp/
\ No newline at end of file
+*/**/supabase/.temp/
+
+# Recce & DBT
+.recce/
+target/
+target-base/
+recce_state.json
+venv/
+.envrc
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
new file mode 100644
index 000000000..b18018d02
--- /dev/null
+++ b/.vscode/tasks.json
@@ -0,0 +1,35 @@
+{
+    // See https://go.microsoft.com/fwlink/?LinkId=733558
+    // for the documentation about the tasks.json format
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "Check GitHub CodeSpace environment",
+            "type": "shell",
+            "command": "bash .devcontainer/github_codespace_env.sh",
+        },
+        {
+            "label": "Setup required environment variables",
+            "type": "shell",
+            "command": "bash .devcontainer/setup_required_env.sh",
+        },
+        {
+            "label": "Run DBT",
+            "type": "shell",
+            "command": "bash .devcontainer/prepare_dbt_manifest.sh",
+            "dependsOn": [
+                "Setup required environment variables",
+                "Check GitHub CodeSpace environment"
+            ],
+        },
+        {
+            "label": "Launch Recce server",
+            "type": "shell",
+            "command": "bash .devcontainer/launch_recce_server.sh",
+            "dependsOn": ["Run DBT"],
+            "runOptions": {
+                "runOn": "folderOpen"
+            }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/profiles.yml b/profiles.yml
new file mode 100644
index 000000000..44f94cf68
--- /dev/null
+++ b/profiles.yml
@@ -0,0 +1,26 @@
+opensource_observer:
+  outputs:
+    playground:
+      type: bigquery
+      method: service-account
+      keyfile: "{{ env_var('DBT_GOOGLE_KEYFILE') }}"
+      project: "{{ env_var('DBT_GOOGLE_PROJECT') }}"
+      dataset: "{{ env_var('DBT_GOOGLE_DATASET') }}"
+      job_execution_time_seconds: 300
+      job_retries: 1
+      location: US
+      threads: 32
+    dev:
+      type: bigquery
+      method: service-account
+      keyfile: "{{ env_var('DBT_GOOGLE_KEYFILE') }}"
+      project: "{{ env_var('DBT_GOOGLE_PROJECT') }}"
+      dataset: "{{ env_var('DBT_GOOGLE_DEV_DATASET') }}"
+      job_execution_time_seconds: 300
+      job_retries: 1
+      location: US
+      threads: 32
+  # By default we target the playground. it's less costly and also safer to write
+  # there while developing
+  target: playground
+
diff --git a/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql b/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql
index a8b4dd53a..1ab0cef45 100644
--- a/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql
+++ b/warehouse/dbt/models/marts/superchain/rf4_trusted_users.sql
@@ -9,7 +9,7 @@ with user_model as (
     ) as eigentrust_verification,
     CAST(
       passport_scores.evidence_rawscore
-      >= passport_scores.evidence_threshold as bool
+      > passport_scores.evidence_threshold as bool
     ) as passport_verification
   from {{ ref('int_artifacts_by_user') }} as artifacts_by_user
   left join {{ ref('stg_passport__scores') }} as passport_scores