Skip to content

Commit

Permalink
Add XL e2e nightly CI job
Browse files Browse the repository at this point in the history
Add a new XL e2e nightly CI job that triggers every day at 6am UTC. Also update the existing large CI job so that uploaded files are not overwritten.

Signed-off-by: Courtney Pacheco <[email protected]>
  • Loading branch information
courtneypacheco committed Jan 16, 2025
1 parent 8969740 commit bb8d66f
Show file tree
Hide file tree
Showing 3 changed files with 534 additions and 14 deletions.
70 changes: 70 additions & 0 deletions .github/actions/free-disk-space/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Composite action that reclaims disk space on a CI runner by deleting
# preinstalled toolchains, container images, and OS packages that the
# workflows using this action do not need. It prints `df -h` before and
# after so the amount reclaimed is visible in the job log.
name: 'Free Disk Space'
description: 'Frees disk space on the runner'
runs:
  using: "composite"
  steps:
    - name: Print disk space before cleanup
      run: |
        df -h
      shell: bash

    - name: Free Disk Space Linux
      if: runner.os == 'Linux'
      run: |
        # Determine if we have Ubuntu, CentOS, or other distro as our runner OS.
        # Some distros quote the value in /etc/os-release (e.g. ID="centos"),
        # so strip double quotes before comparing against bare names below.
        os_id=$(grep '^ID=' /etc/os-release | cut -d "=" -f2 | tr -d '"')
        echo "Detected OS distro as: ${os_id}"

        # Sometimes `docker` is not installed, so only remove images if we need to.
        if command -v docker >/dev/null 2>&1 ; then
          # Best effort: pass each image ID as its own argument and keep `-f`
          # as a real flag. `xargs -r` skips the call when there are no images.
          sudo docker image ls -aq | xargs -r sudo docker rmi -f >/dev/null 2>&1 || true
        fi

        # Remove Android, .NET, Haskell, PowerShell, Swift, and JVM runtimes
        sudo rm -rf \
          /usr/local/lib/android \
          /usr/share/dotnet \
          /opt/ghc \
          /usr/local/.ghcup \
          /usr/local/share/powershell \
          /usr/share/swift \
          /usr/lib/jvm || true

        # Logs a non-fatal warning when a package pattern could not be removed.
        #   $1 - the package name or pattern that failed
        printWarningMessage () {
          echo "[warning] Failed to remove '$1', perhaps because it doesn't exist. Ignoring..."
        }

        # Remove large packages we don't use.
        echo "Attempting to remove unused ${os_id} packages..."
        if [[ "${os_id}" == "ubuntu" ]]; then
          sudo apt-get remove -y '^mysql-.*' || printWarningMessage '^mysql-.*'
          sudo apt-get remove -y '^dotnet-.*' --fix-missing || printWarningMessage '^dotnet-.*'
          sudo apt-get remove -y 'php.*' --fix-missing || printWarningMessage 'php.*'
          sudo apt-get remove -y '^mongodb-.*' --fix-missing || printWarningMessage '^mongodb-.*'
          sudo apt-get remove -y '^llvm-.*' --fix-missing || printWarningMessage '^llvm-.*'
          sudo apt-get remove -y google-cloud-sdk --fix-missing || printWarningMessage 'google-cloud-sdk'
          sudo apt-get remove -y google-cloud-cli --fix-missing || printWarningMessage 'google-cloud-cli'
          sudo apt-get autoremove -y >/dev/null 2>&1
          sudo apt-get autoclean -y >/dev/null 2>&1
        elif [[ "${os_id}" == "centos" ]]; then
          sudo dnf -y remove 'mysql-*' || printWarningMessage 'mysql-*'
          sudo dnf -y remove 'dotnet-*' || printWarningMessage 'dotnet-*'
          sudo dnf -y remove 'php-*' || printWarningMessage 'php-*'
          sudo dnf -y remove 'mongodb-*' || printWarningMessage 'mongodb-*'
          sudo dnf -y remove 'llvm-*' || printWarningMessage 'llvm-*'
          sudo dnf -y remove google-cloud-sdk || printWarningMessage 'google-cloud-sdk'
          sudo dnf -y remove google-cloud-cli || printWarningMessage 'google-cloud-cli'
          sudo dnf clean all
          # Run with sudo like the other privileged commands in this script.
          sudo rm -rf /var/cache/dnf*
        else
          echo "Unrecognized OS '${os_id}'. Skipping large package cleanup, as this logic has not been implemented for ${os_id}."
        fi
      shell: bash

    - name: Free Disk Space MacOS
      if: runner.os == 'macOS'
      run: |
        # Delete the Xcode 15.x application bundles matched by this glob.
        sudo rm -rf /System/Volumes/Data/Applications/Xcode_15*
      shell: bash

    - name: Print disk space after cleanup
      run: |
        df -h
      shell: bash
34 changes: 20 additions & 14 deletions .github/workflows/e2e-nvidia-l40s-x4.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@
name: E2E (NVIDIA L40S x4)

on:
schedule:
- cron: '0 16 * * *' # Runs at 4PM UTC every day
workflow_dispatch:
inputs:
pr_or_branch:
description: 'pull request number or branch name'
required: true
default: 'main'
### WILL BE UNCOMMENTED BEFORE MERGE
# schedule:
# - cron: '0 16 * * *' # Runs at 4PM UTC every day
# workflow_dispatch:
# inputs:
# pr_or_branch:
# description: 'pull request number or branch name'
# required: true
# default: 'main'

# FOR TESTING ON GITHUB ONLY. WILL BE REMOVED.
push:
branches:
- courtneypacheco-add-xl-e2e-job

env:
TMPDIR: /home/tmp
Expand Down Expand Up @@ -206,15 +212,15 @@ jobs:
- name: Upload training logs Phase 1
uses: actions/upload-artifact@v4
with:
name: phase-1-training-log.jsonl
name: phase-1-training-log-large.jsonl
path: ./instructlab/phase-1-training-log.jsonl
retention-days: 1
overwrite: true

- name: Upload training logs Phase 2
uses: actions/upload-artifact@v4
with:
name: phase-2-training-log.jsonl
name: phase-2-training-log-large.jsonl
path: ./instructlab/phase-2-training-log.jsonl
retention-days: 1
overwrite: true
Expand Down Expand Up @@ -338,14 +344,14 @@ jobs:
id: phase-1-download-logs
uses: actions/download-artifact@v4
with:
name: phase-1-training-log.jsonl
name: phase-1-training-log-large.jsonl
path: downloaded-data

- name: Download loss data Phase 2
id: phase-2-download-logs
uses: actions/download-artifact@v4
with:
name: phase-2-training-log.jsonl
name: phase-2-training-log-large.jsonl
path: downloaded-data

- name: Checkout instructlab/training
Expand All @@ -366,7 +372,7 @@ jobs:
continue-on-error: true
run: |
python training/scripts/create-loss-graph.py \
--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log-large.jsonl" \
--output-file "./phase-1-test.md" \
--phase "1" \
--aws-region "${{ vars.AWS_REGION }}" \
Expand All @@ -381,7 +387,7 @@ jobs:
continue-on-error: true
run: |
python training/scripts/create-loss-graph.py \
--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log-large.jsonl" \
--output-file "./phase-2-test.md" \
--phase "2" \
--aws-region "${{ vars.AWS_REGION }}" \
Expand Down
Loading

0 comments on commit bb8d66f

Please sign in to comment.