doc: explained the operating mechanism of the Operator

Signed-off-by: YouXam <[email protected]>
uplion · Jul 10, 2024 · 1b91b59 · 1b91b59
1 parent 371c44d
commit 1b91b59
Show file tree

Hide file tree

Showing 6 changed files with 192 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -1,10 +1,144 @@
-# aimodel-operator
-// TODO(user): Add simple overview of use/purpose
+# AI Model Operator
+
+AI Model Operator is a Kubernetes operator designed specifically for the uplion project, used to create and manage AI models.
 
 ## Description
-// TODO(user): An in-depth paragraph about your project and overview of use
 
-## Getting Started
+Create an AIModel resource similar to the following. The AIModel resource defines the deployment information for an AI model, including the model name, model type, model image, number of model replicas, etc.
+
+```yaml
+apiVersion: model.youxam.com/v1alpha1
+kind: AIModel
+metadata:
+  name: ai-model-sample
+spec:
+  type: local
+  model: TinyLlama-1.1B
+  replicas: 3
+  image: user/image:tag
+```
+
+You can also create a remote model:
+
+```yaml
+apiVersion: model.youxam.com/v1alpha1
+kind: AIModel
+metadata:
+  name: ai-model-sample
+spec:
+  type: remote
+  model: gpt-3.5-turbo
+  apiKey: xxxxxxx
+  baseURL: https://api.openai.com
+  replicas: 3
+  image: user/image:tag
+```
+
+Taking the local model as an example:
+
+```bash
+$ kubectl apply -f demo.yaml
+aimodel.model.youxam.com/ai-model-sample created
+```
+
+At this point, the AI Model Operator creates the corresponding Deployment and Pods based on the AIModel resource definition:
+
+```
+$ kubectl get pods,deploy,aimodel
+NAME                                              READY   STATUS    RESTARTS   AGE
+pod/ai-model-sample-deployment-8464ffcfff-6xh7g   1/1     Running   0          29s
+pod/ai-model-sample-deployment-8464ffcfff-ccf9q   1/1     Running   0          29s
+pod/ai-model-sample-deployment-8464ffcfff-rwzk8   1/1     Running   0          29s
+
+NAME                                         READY   UP-TO-DATE   AVAILABLE   AGE
+deployment.apps/ai-model-sample-deployment   3/3     3            3           29s
+
+NAME                                       TYPE    MODEL            REPLICAS   STATE
+aimodel.model.youxam.com/ai-model-sample   local   TinyLlama-1.1B   3          Running
+```
+
+If an irreversible error occurs, such as a configuration error, it will also be reflected in the status of the AIModel resource:
+
+```bash
+# Trigger an error
+$ kubectl get aimodel
+NAME              TYPE    MODEL            REPLICAS   STATE
+ai-model-sample   local   TinyLlama-1.1B   3          Failed
+$ kubectl describe aimodel
+Name:         ai-model-sample
+Namespace:    default
+Labels:       app=ai-model
+Annotations:  <none>
+API Version:  model.youxam.com/v1alpha1
+Kind:         AIModel
+Metadata:
+  Creation Timestamp:  2024-07-10T17:28:45Z
+  Generation:          2
+  Resource Version:    133511
+  UID:                 2337a795-c15a-4401-a545-be4750fdff42
+Spec:
+  Image:     youxam/uplion-aimodel-operator-test-worker:latest
+  Model:     TinyLlama-1.1B
+  Replicas:  3
+  Type:      local
+Status:
+  Message:  ConfigurationError: Local model "not-found-model" is not supported.
+  State:    Failed
+Events:     <none>
+```
+
+The status of `ai-model-sample` is `Failed`, and there is an error message `ConfigurationError: Local model "not-found-model" is not supported.`
+
+At this point, the status of the corresponding pod depends on the program logic. If the program continues to run after reporting the error, the pod will still be in the `Running` state, but service availability is not guaranteed. If the program exits after reporting the error, the pod will enter the `CrashLoopBackOff` state.
+
+## Environment Variables
+
+The Operator supports the following environment variables:
+
+1. `MAX_PROCESS_NUM`: An environment variable passed to the Pod, specifying the maximum number of threads within each Pod, default is 128;
+2. `PULSAR_URL`: The URL of Pulsar, default is `pulsar://pulsar:6650`;
+3. `PULSAR_TOKEN`: The Token of Pulsar, default is empty;
+4. `RES_TOPIC_NAME`: The Topic name of the result message queue, default is `res-topic`.
+
+All of the above environment variables will be passed to the Pod.
+
+In addition, the Operator will automatically inject the following environment variables:
+
+1. `NODE_TYPE`: Node type, value is either `local` or `remote`, corresponding to the `type` in the yaml file
+2. `MODEL_NAME`: Model name, corresponding to `model`
+3. `API_URL`: API URL for remote models, corresponding to `baseURL`
+4. `API_KEY`: API Key for remote models, corresponding to `apiKey`
+5. `AIMODEL_NAME`: The name of the AIModel resource, in the above example it's `ai-model-sample`
+6. `AIMODEL_NAMESPACE`: The namespace where the AIModel resource is located, in the above example it's `default`
+
+`AIMODEL_NAME` and `AIMODEL_NAMESPACE` are provided for the program inside the Pod to report status, as described below.
+
+## Reporting Irreversible Errors
+
+The program inside the Pod may encounter irreversible errors, such as:
+
+1. `ConfigurationError`: When the program inside the Pod discovers a configuration error, it should report a `ConfigurationError`, such as the `Local model "not-found-model" is not supported.` message shown in the example above;
+2. `AuthenticationError`: When the program inside the Pod believes that API Key authentication has failed, it should report an `AuthenticationError`;
+3. `MessageQueueConnectionError`: When the program inside the Pod cannot connect to the message queue, it should report a `MessageQueueConnectionError`;
+4. `APIQuotaExceededError`: When the program inside the Pod believes that the API quota has been consumed, it should report an `APIQuotaExceededError`;
+5. `GeneralError`: Other errors.
+
+When such an error occurs, the program inside the Pod should report it using the Kubernetes event mechanism. The event's `Reason` field should be one of the above error types, and its `Message` field should be a human-readable error message.
+
+The Operator automatically grants the Pod permission to create events, and listens for events that meet the following conditions:
+
+1. `Type` is `Warning`;
+2. `InvolvedObject.Kind` is `AIModel`;
+3. `InvolvedObject.Name` is the name of an existing AIModel resource;
+4. `InvolvedObject.Namespace` is the namespace of an existing AIModel resource.
+
+If the Pod reports an event that meets the above conditions, the Operator will update the status of the corresponding AIModel resource to `Failed` and record the error message in the `Status.Message` field.
+
+When the AIModel is modified, the status will be reset to `Running`. If the program inside the Pod continues to report errors, the status will change to `Failed` again.
+
+For example Go code, see the [createK8sEvent function](https://github.com/uplion/ai-model-operator/blob/371c44df88e85d336638f268f370d11805458899/worker/main.go#L54-L103).
+
+## Installation
 
 ### Prerequisites
 - go version v1.20.0+

diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml
@@ -1,2 +1,8 @@
 resources:
 - manager.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+images:
+- name: controller
+  newName: youxam/uplion-aimodel-operator
+  newTag: latest
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
@@ -1,7 +1,3 @@
-
-No resources found in default namespace.
-
-
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole

diff --git a/internal/controller/aimodel_controller.go b/internal/controller/aimodel_controller.go
@@ -106,44 +106,50 @@ func (r *AIModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
 	// Check if the Deployment already exists, if not create a new one
 	found := &appsv1.Deployment{}
 	err = r.Get(ctx, types.NamespacedName{Name: dep.Name, Namespace: dep.Namespace}, found)
-	if err != nil && errors.IsNotFound(err) {
 
-		// Create ServiceAccount, Role and RoleBinding for the Deployment to create events
-		err = r.createEventServiceAccountRoleAndBinding(ctx, aimodel, dep, logger)
-		if err != nil {
-			logger.Error(err, "Failed to create ServiceAccount, Role and RoleBinding", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name)
-			return ctrl.Result{}, err
-		}
-
-		logger.Info("Creating a new Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name)
-		// Set AIModel instance as the owner and controller
-		if err := controllerutil.SetControllerReference(aimodel, dep, r.Scheme); err != nil {
-			return ctrl.Result{}, err
-		}
-
-		err = r.Create(ctx, dep)
-		if err != nil {
-			logger.Error(err, "Failed to create new Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name)
-			return ctrl.Result{}, err
-		}
-
-		// Update the AIModel status
-		aimodel.Status.State = "Running"
-		aimodel.Status.Message = "AIModel is running successfully"
-		err = r.Status().Update(ctx, aimodel)
-		if err != nil {
-			logger.Error(err, "Failed to update AIModel status")
+	if err != nil {
+		// In most cases, err != nil means that Deployment does not exist.
+		if errors.IsNotFound(err) {
+			// If the Deployment does not exist, create a new Deployment
+
+			// Create ServiceAccount, Role and RoleBinding for the Deployment to create events
+			err = r.createEventServiceAccountRoleAndBinding(ctx, aimodel, dep, logger)
+			if err != nil {
+				logger.Error(err, "Failed to create ServiceAccount, Role and RoleBinding", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name)
+				return ctrl.Result{}, err
+			}
+
+			logger.Info("Creating a new Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name)
+			// Set AIModel instance as the owner and controller
+			if err := controllerutil.SetControllerReference(aimodel, dep, r.Scheme); err != nil {
+				return ctrl.Result{}, err
+			}
+
+			err = r.Create(ctx, dep)
+			if err != nil {
+				logger.Error(err, "Failed to create new Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name)
+				return ctrl.Result{}, err
+			}
+
+			// Update the AIModel status
+			aimodel.Status.State = "Running"
+			aimodel.Status.Message = "AIModel is running successfully"
+			err = r.Status().Update(ctx, aimodel)
+			if err != nil {
+				logger.Error(err, "Failed to update AIModel status")
+				return ctrl.Result{}, err
+			}
+
+			// Deployment created successfully - return and requeue
+			return ctrl.Result{Requeue: true}, nil
+		} else {
+			// Error reading the object - requeue the request.
+			logger.Error(err, "Failed to get Deployment")
 			return ctrl.Result{}, err
 		}
 
-		// Deployment created successfully - return and requeue
-		return ctrl.Result{Requeue: true}, nil
-	} else if err != nil {
-		logger.Error(err, "Failed to get Deployment")
-		return ctrl.Result{}, err
 	}
 
-	// Check for any changes in AIModel spec or labels
 	if !deploymentEqual(found, dep, logger) {
 		logger.Info("Updating existing Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name)
 		found.Spec = dep.Spec

diff --git a/internal/listener/events.go b/internal/listener/events.go
@@ -14,12 +14,16 @@ import (
 )
 
 func StartEventListener(kubeClient kubernetes.Interface, dynamicClient client.Client, namespace string) {
+	log.Println("Starting event listener")
+
 	watcher, err := kubeClient.CoreV1().Events(v1.NamespaceAll).Watch(context.Background(), metav1.ListOptions{})
 	if err != nil {
 		log.Printf("Failed to create watcher: %v", err)
 		return
 	}
 
+	log.Println("Watcher created")
+
 	startTime := time.Now()
 
 	for event := range watcher.ResultChan() {
@@ -31,6 +35,7 @@ func StartEventListener(kubeClient kubernetes.Interface, dynamicClient client.Cl
 
 func handleEvent(event *v1.Event, dynamicClient client.Client) {
 	if event.InvolvedObject.Kind == "AIModel" && event.Type == v1.EventTypeWarning {
+		log.Printf("Handling event: %s %s", event.InvolvedObject.Name, event.Message)
 		updateAIModelStatus(event, dynamicClient)
 	}
 }

diff --git a/worker/demo.yaml b/worker/demo.yaml
@@ -1,11 +1,11 @@
 apiVersion: model.youxam.com/v1alpha1
 kind: AIModel
 metadata:
-  name: aimodel-sample
+  name: ai-model-sample
   labels:
-    app: aimodel
+    app: ai-model
 spec:
-  type: remote
+  type: local
   model: TinyLlama-1.1B
   replicas: 3
   image: youxam/uplion-aimodel-operator-test-worker:latest
@@ -14,10 +14,10 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: aimodel-svc
+  name: ai-model-svc
 spec:
     selector:
-        app: aimodel
+        app: ai-model
     ports:
         - protocol: TCP
           port: 8080