[WIP] Slurm agent #3005
@@ -0,0 +1,5 @@
# Flytekit Slurm Plugin

The Slurm agent is designed to integrate Flyte workflows with Slurm-managed high-performance computing (HPC) clusters, enabling users to leverage Slurm's capabilities for compute resource allocation, scheduling, and monitoring.

This [guide](https://github.com/JiangJiaWei1103/flytekit/blob/slurm-agent-dev/plugins/flytekit-slurm/demo.md) provides a concise overview of the design philosophy behind the Slurm agent and explains how to set up a local environment for testing the agent.
@@ -0,0 +1,113 @@
# Slurm Agent Demo

In this guide, we briefly introduce how to set up an environment to test the Slurm agent locally without running the backend service (e.g., the flyte agent gRPC server). It covers both basic and advanced use cases.

## Table of Contents
* [Overview](https://github.com/JiangJiaWei1103/flytekit/blob/slurm-agent-dev/plugins/flytekit-slurm/demo.md#overview)
* [Setup a Local Test Environment](https://github.com/JiangJiaWei1103/flytekit/blob/slurm-agent-dev/plugins/flytekit-slurm/demo.md#setup-a-local-test-environment)
  * [Flyte Client (Localhost)](https://github.com/JiangJiaWei1103/flytekit/blob/slurm-agent-dev/plugins/flytekit-slurm/demo.md#flyte-client-localhost)
  * [Remote Tiny Slurm Cluster](https://github.com/JiangJiaWei1103/flytekit/blob/slurm-agent-dev/plugins/flytekit-slurm/demo.md#remote-tiny-slurm-cluster)
  * [SSH Configuration](https://github.com/JiangJiaWei1103/flytekit/blob/slurm-agent-dev/plugins/flytekit-slurm/demo.md#ssh-configuration)
* [Run a Demo](https://github.com/JiangJiaWei1103/flytekit/blob/slurm-agent-dev/plugins/flytekit-slurm/demo.md#run-a-demo)
## Overview
At the highest level, the Slurm agent has three core methods to interact with a Slurm cluster:
1. `create`: Use `srun` or `sbatch` to run a job on a Slurm cluster
2. `get`: Use `scontrol show job <job_id>` to monitor the Slurm job state
3. `delete`: Use `scancel <job_id>` to cancel the Slurm job (this method is still under test)

In its simplest form, the Slurm agent supports directly running a batch script with `sbatch` on a Slurm cluster, as shown below:

![](https://github.com/JiangJiaWei1103/flytekit/blob/slurm-agent-dev/plugins/flytekit-slurm/assets/basic_arch.png)
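The `get` step above boils down to extracting the `JobState` field from `scontrol show job` output. A minimal sketch of that parsing, assuming the typical `key=value` field layout (the sample output below is illustrative, not captured from a real cluster):

```python
def parse_job_state(scontrol_stdout: str) -> str:
    """Extract the JobState value from `scontrol show job` output.

    Falls back to "running" when no JobState field is present,
    a conservative default for in-flight jobs.
    """
    for token in scontrol_stdout.split():
        if token.startswith("JobState="):
            return token.split("=", 1)[1].lower()
    return "running"


# Trimmed-down sample of scontrol output, for illustration only
sample = "JobId=77 JobName=tiny-slurm JobState=COMPLETED ExitCode=0:0"
print(parse_job_state(sample))  # -> completed
```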
## Setup a Local Test Environment
Without running the backend service, we can set up an environment to test the Slurm agent locally. The setup consists of two main components: a client (localhost) and a remote tiny Slurm cluster. We then configure an SSH connection, which relies on `asyncssh`, to facilitate communication between the two.

### Flyte Client (Localhost)
1. Set up a local Flyte cluster following this [official guide](https://docs.flyte.org/en/latest/community/contribute/contribute_code.html#how-to-setup-dev-environment-for-flytekit)
2. Build a virtual environment (e.g., conda) and activate it
3. Clone the Flytekit repo, check out the Slurm agent PR, and install Flytekit
```
git clone https://github.com/flyteorg/flytekit.git
cd flytekit
gh pr checkout 3005
make setup && pip install -e .
```
4. Install the Flytekit Slurm agent plugin
```
cd plugins/flytekit-slurm/
pip install -e .
```
### Remote Tiny Slurm Cluster
To simplify the setup process, we follow this [guide](https://github.com/JiangJiaWei1103/Slurm-101) to configure a single-host Slurm cluster, covering `slurmctld` (the central management daemon) and `slurmd` (the compute node daemon).

### SSH Configuration
To facilitate communication between the Flyte client and the remote Slurm cluster, we set up SSH on the Flyte client side as follows:
1. Create a new authentication key pair
```
ssh-keygen -t rsa -b 4096
```
2. Copy the public key to the remote Slurm cluster
```
ssh-copy-id <username>@<remote_server_ip>
```
3. Enable key-based authentication
```
# ~/.ssh/config
Host <host_alias>
    HostName <remote_server_ip>
    Port <ssh_port>
    User <username>
    IdentityFile <path_to_private_key>
```
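To sanity-check a config entry like the one above, the `Host` block format can be parsed with a few lines of Python. This is a toy parser for illustration only; real clients (including `asyncssh`) read `~/.ssh/config` themselves:

```python
from typing import Dict


def parse_ssh_config(text: str) -> Dict[str, Dict[str, str]]:
    """Parse Host blocks from OpenSSH-style config text into nested dicts."""
    hosts: Dict[str, Dict[str, str]] = {}
    current = None
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # skip blanks and comments
        key, _, value = line.partition(" ")
        if key == "Host":
            current = value
            hosts[current] = {}
        elif current is not None:
            hosts[current][key] = value
    return hosts


# Hypothetical host alias and address, matching the template above
sample = """\
Host tiny-slurm
    HostName 192.0.2.10
    Port 22
    User ubuntu
    IdentityFile ~/.ssh/id_rsa
"""
print(parse_ssh_config(sample)["tiny-slurm"]["HostName"])  # -> 192.0.2.10
```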
## Run a Demo
Suppose we have a batch script to run on the Slurm cluster:
```
#!/bin/bash

echo "Working!" >> ./remote_touch.txt
```

We use the following Python script to test the Slurm agent on the client side. A crucial part of the task configuration is specifying the target Slurm cluster and designating the batch script's path within the cluster.

```python
import os

from flytekit import workflow
from flytekitplugins.slurm import Slurm, SlurmTask


echo_job = SlurmTask(
    name="echo-job-name",
    task_config=Slurm(
        slurm_host="<host_alias>",
        batch_script_path="<path_to_batch_script_within_cluster>",
        sbatch_conf={
            "partition": "debug",
            "job-name": "tiny-slurm",
        }
    )
)


@workflow
def wf() -> None:
    echo_job()


if __name__ == "__main__":
    from click.testing import CliRunner

    from flytekit.clis.sdk_in_container import pyflyte

    runner = CliRunner()
    path = os.path.realpath(__file__)

    print(">>> LOCAL EXEC <<<")
    result = runner.invoke(pyflyte.main, ["run", path, "wf"])
    print(result.output)
```

After the Slurm job completes, we can find the following result on the Slurm cluster:

![](https://github.com/JiangJiaWei1103/flytekit/blob/slurm-agent-dev/plugins/flytekit-slurm/assets/slurm_basic_result.png)
@@ -0,0 +1,4 @@
```python
from .function.agent import SlurmFunctionAgent
from .function.task import SlurmFunction, SlurmFunctionTask
from .script.agent import SlurmScriptAgent
from .script.task import Slurm, SlurmRemoteScript, SlurmShellTask, SlurmTask
```
@@ -0,0 +1,115 @@
```python
from dataclasses import dataclass
from typing import Dict, Optional

import asyncssh
from asyncssh import SSHClientConnection

from flytekit.extend.backend.base_agent import AgentRegistry, AsyncAgentBase, Resource, ResourceMeta
from flytekit.extend.backend.utils import convert_to_flyte_phase
from flytekit.models.literals import LiteralMap
from flytekit.models.task import TaskTemplate


@dataclass
class SlurmJobMetadata(ResourceMeta):
    """Slurm job metadata.

    Args:
        job_id: Slurm job id.
        slurm_host: Hostname or SSH alias of the target Slurm cluster.
    """

    job_id: str
    slurm_host: str


class SlurmFunctionAgent(AsyncAgentBase):
    name = "Slurm Function Agent"

    # SSH connection pool for multi-host environment
    _conn: Optional[SSHClientConnection] = None

    def __init__(self) -> None:
        super(SlurmFunctionAgent, self).__init__(task_type_name="slurm_fn", metadata_type=SlurmJobMetadata)

    async def create(
        self,
        task_template: TaskTemplate,
        inputs: Optional[LiteralMap] = None,
        **kwargs,
    ) -> SlurmJobMetadata:
        # Retrieve task config
        slurm_host = task_template.custom["slurm_host"]
        srun_conf = task_template.custom["srun_conf"]

        # Construct srun command for Slurm cluster
        cmd = _get_srun_cmd(srun_conf=srun_conf, entrypoint=" ".join(task_template.container.args))

        # Run Slurm job
        if self._conn is None:
            await self._connect(slurm_host)
        res = await self._conn.run(cmd, check=True)

        # Direct return for sbatch
        # job_id = res.stdout.split()[-1]
        # Use echo trick for srun
        job_id = res.stdout.strip()

        return SlurmJobMetadata(job_id=job_id, slurm_host=slurm_host)

    async def get(self, resource_meta: SlurmJobMetadata, **kwargs) -> Resource:
        await self._connect(resource_meta.slurm_host)
        res = await self._conn.run(f"scontrol show job {resource_meta.job_id}", check=True)

        # Determine the current flyte phase from Slurm job state
        job_state = "running"
        for o in res.stdout.split(" "):
            if "JobState" in o:
                job_state = o.split("=")[1].strip().lower()
        cur_phase = convert_to_flyte_phase(job_state)

        return Resource(phase=cur_phase)

    async def delete(self, resource_meta: SlurmJobMetadata, **kwargs) -> None:
        await self._connect(resource_meta.slurm_host)
        _ = await self._conn.run(f"scancel {resource_meta.job_id}", check=True)

    async def _connect(self, slurm_host: str) -> None:
        """Make an SSH client connection."""
        self._conn = await asyncssh.connect(host=slurm_host)


def _get_srun_cmd(srun_conf: Dict[str, str], entrypoint: str) -> str:
    """Construct Slurm srun command.

    Flyte entrypoint, pyflyte-execute, is run within a bash shell process.

    Args:
        srun_conf: Options of srun command.
        entrypoint: Flyte entrypoint.

    Returns:
        cmd: Slurm srun command.
    """
    # Setup srun options
    cmd = ["srun"]
    for opt, val in srun_conf.items():
        cmd.extend([f"--{opt}", str(val)])

    cmd.extend(["bash", "-c"])
    cmd = " ".join(cmd)

    cmd += f""" '# Setup environment variables
    export PATH=$PATH:/opt/anaconda/anaconda3/bin;

    # Run pyflyte-execute in a pre-built conda env
    source activate dev;
    {entrypoint};

    # A trick to show Slurm job id on stdout
    echo $SLURM_JOB_ID;'
    """

    return cmd


AgentRegistry.register(SlurmFunctionAgent())
```

> **Review comment on lines +41 to +42:** can we use `task_template.custom.get("slurm_host")`?
>
> **Reply:** Let me know what you think. Thanks!
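Stripped of the SSH plumbing and the conda/PATH environment setup, the command construction in `_get_srun_cmd` reduces to a pure function, which makes it easy to unit-test in isolation. A simplified sketch (the environment-setup lines from the original are intentionally omitted):

```python
from typing import Dict


def build_srun_cmd(srun_conf: Dict[str, str], entrypoint: str) -> str:
    """Build an srun invocation that runs `entrypoint` in a bash shell
    and echoes the Slurm job id as the last line of stdout."""
    cmd = ["srun"]
    for opt, val in srun_conf.items():
        # Each srun option becomes a --key value pair
        cmd.extend([f"--{opt}", str(val)])
    cmd.extend(["bash", "-c"])
    # Single-quote the shell payload so srun passes it through intact
    return " ".join(cmd) + f" '{entrypoint}; echo $SLURM_JOB_ID;'"


print(build_srun_cmd({"partition": "debug"}, "pyflyte-execute ..."))
# -> srun --partition debug bash -c 'pyflyte-execute ...; echo $SLURM_JOB_ID;'
```

Keeping the `echo $SLURM_JOB_ID` trick inside the quoted payload is what lets `create` recover the job id from `res.stdout` without parsing `srun`'s own output.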
> **Review comment:** why do we need this? Is this for the shell task?
>
> **Reply:** If we define a `SlurmTask` without specifying `container_image` (as in the example Python script provided above), `ctx.serialization_settings` will be `None`. Then an error is raised which describes that `PythonAutoContainerTask` needs an image. I think this is just a temporary workaround for local testing, and I'm still pondering how to better handle this issue.