Skip to content

Commit

Permalink
tests/gpu: Check that an instance with a CDI GPU device can be start…
Browse files Browse the repository at this point in the history
…ed even after its host abruptly crashes and reboots

Signed-off-by: Gabriel Mougard <[email protected]>
  • Loading branch information
gabrielmougard committed Jan 23, 2025
1 parent 34c9391 commit 74571d9
Showing 1 changed file with 57 additions and 1 deletion.
58 changes: 57 additions & 1 deletion tests/gpu-container
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,65 @@ lxc config device add c1 gpu0 gpu id="nvidia.com/gpu=0"
# Boot the container that carries the CDI GPU device and check the GPU from inside it.
lxc start c1
# Exactly one DRM card node is expected under /dev/dri/ in the container.
if [ "$(lxc exec c1 -- ls /dev/dri/ | grep -c '^card[0-9]')" != "1" ]; then
    false
fi
lxc exec c1 -- nvidia-smi
# Remove the container now that the GPU checks are done.
lxc delete --force c1

# Check that CDI device files are cleanly removed even if the host machine is abruptly rebooted.
#
# The VM v1 plays the role of the host: the physical GPU is passed through to it,
# LXD is installed inside it, and a nested container c1 gets a CDI GPU device.
# The VM is then force-stopped (simulated crash) and started again; the nested
# container must still start and see the GPU afterwards.
echo "==> Testing that CDI device files are cleanly removed after abrupt reboot"

# Poll until the container c1 nested inside VM v1 accepts `lxc exec`.
# Tries once per second, at most 60 times; returns 1 (with a message on stderr)
# if the container never becomes reachable.
waitNestedContainerReady() {
    for _ in $(seq 60); do
        if lxc exec v1 -- lxc exec c1 -- true; then
            return 0
        fi
        sleep 1
    done
    echo "Container c1 inside v1 did not become ready in time" >&2
    return 1
}

lxc init "${IMAGE}" v1 --vm
lxc config device add v1 gpu0 gpu pci="${first_card_pci_slot}"
lxc start v1
echo "==> Waiting for the VM agent to be ready"
waitInstanceReady v1

echo "==> Installing NVIDIA drivers inside the VM"
lxc exec v1 -- bash -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y ubuntu-drivers-common"
lxc exec v1 -- bash -c "DEBIAN_FRONTEND=noninteractive ubuntu-drivers autoinstall"

echo "==> Rebooting the VM to load NVIDIA drivers"
lxc restart v1

waitInstanceReady v1

echo "==> Verifying NVIDIA driver installation in the VM"
lxc exec v1 -- nvidia-smi

echo "==> Installing LXD inside the VM"
lxc exec v1 -- bash -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y snapd"
lxc exec v1 -- snap install lxd --channel="${LXD_SNAP_CHANNEL}"
# Make sure the freshly installed daemon answers before configuring it.
lxc exec v1 -- lxd waitready --timeout=300

echo "==> Initializing LXD inside the VM"
lxc exec v1 -- lxd init --auto

echo "==> Launching a container inside the VM"
lxc exec v1 -- lxc init ubuntu-daily:24.04 c1

echo "==> Adding GPU to the container inside the VM using CDI"
lxc exec v1 -- lxc config device add c1 gpu0 gpu id="nvidia.com/gpu=0"
lxc exec v1 -- lxc start c1
# Wait for the container to be ready (bounded poll instead of a fixed sleep).
waitNestedContainerReady

echo "==> Verifying GPU access inside the container"
lxc exec v1 -- lxc exec c1 -- nvidia-smi

echo "==> Simulating abrupt reboot by force-stopping the VM"
lxc stop v1 -f

echo "==> Starting the VM again"
lxc start v1

waitInstanceReady v1
# The nested LXD daemon has to be back up before the container can be started.
lxc exec v1 -- lxd waitready --timeout=300

echo "==> Starting the container inside the VM after reboot"
lxc exec v1 -- lxc start c1
# Same readiness wait as for the first start, so both GPU checks run under the same conditions.
waitNestedContainerReady

echo "==> Verifying GPU access inside the container after VM reboot"
lxc exec v1 -- lxc exec c1 -- nvidia-smi

echo "==> Cleaning up the VM"
lxc delete v1 -f

# Final cleanup of the host-side resources created for this test.
# The c1 container is already force-deleted right after its GPU checks, so it is
# not deleted again here: a second delete would fail on the missing instance.
echo "==> Cleaning up"
lxc profile device remove default root
lxc profile device remove default eth0
lxc storage delete default
Expand Down

0 comments on commit 74571d9

Please sign in to comment.