From 1b91b5905ef2b9c1a3b6db11419684d54c3d20bc Mon Sep 17 00:00:00 2001 From: YouXam Date: Thu, 11 Jul 2024 02:42:57 +0800 Subject: [PATCH] doc: explained the operating mechanism of the Operator Signed-off-by: YouXam --- README.md | 142 +++++++++++++++++++++- config/manager/kustomization.yaml | 6 + config/rbac/role.yaml | 4 - internal/controller/aimodel_controller.go | 70 ++++++----- internal/listener/events.go | 5 + worker/demo.yaml | 10 +- 6 files changed, 192 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index aea5a1d..ca23c02 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,144 @@ -# aimodel-operator -// TODO(user): Add simple overview of use/purpose +# AI Model Operator + +AI Model Operator is a Kubernetes operator designed specifically for the uplion project, used to create and manage AI models. ## Description -// TODO(user): An in-depth paragraph about your project and overview of use -## Getting Started +Create an AIModel resource similar to the following. The AIModel resource defines the deployment information for an AI model, including the model name, model type, model image, number of model replicas, etc. 
+ +```yaml +apiVersion: model.youxam.com/v1alpha1 +kind: AIModel +metadata: + name: ai-model-sample +spec: + type: local + model: TinyLlama-1.1B + replicas: 3 + image: user/image:tag +``` + +You can also create a remote model: + +```yaml +apiVersion: model.youxam.com/v1alpha1 +kind: AIModel +metadata: + name: ai-model-sample +spec: + type: remote + model: gpt-3.5-turbo + apiKey: xxxxxxx + baseURL: https://api.openai.com + replicas: 3 + image: user/image:tag +``` + +Taking the local model as an example: + +```bash +$ kubectl apply -f demo.yaml +aimodel.model.youxam.com/ai-model-sample created +``` + +At this point, the AI model operator will create corresponding deployments and pods based on the AIModel resource definition: + +``` +$ kubectl get pods,deploy,aimodel +NAME READY STATUS RESTARTS AGE +pod/ai-model-sample-deployment-8464ffcfff-6xh7g 1/1 Running 0 29s +pod/ai-model-sample-deployment-8464ffcfff-ccf9q 1/1 Running 0 29s +pod/ai-model-sample-deployment-8464ffcfff-rwzk8 1/1 Running 0 29s + +NAME READY UP-TO-DATE AVAILABLE AGE +deployment.apps/ai-model-sample-deployment 3/3 3 3 29s + +NAME TYPE MODEL REPLICAS STATE +aimodel.model.youxam.com/ai-model-sample local TinyLlama-1.1B 3 Running +``` + +If an irreversible error occurs, such as a configuration error, it will also be reflected in the status of the AIModel resource: + +```bash +# Trigger an error +$ kubectl get aimodel +NAME TYPE MODEL REPLICAS STATE +ai-model-sample local TinyLlama-1.1B 3 Failed +$ kubectl describe aimodel +Name: ai-model-sample +Namespace: default +Labels: app=ai-model +Annotations: +API Version: model.youxam.com/v1alpha1 +Kind: AIModel +Metadata: + Creation Timestamp: 2024-07-10T17:28:45Z + Generation: 2 + Resource Version: 133511 + UID: 2337a795-c15a-4401-a545-be4750fdff42 +Spec: + Image: youxam/uplion-aimodel-operator-test-worker:latest + Model: TinyLlama-1.1B + Replicas: 3 + Type: local +Status: + Message: ConfigurationError: Local model "not-found-model" is not supported. 
+ State: Failed +Events: +``` + +The status of `ai-model-sample` is `Failed`, and there is an error message `ConfigurationError: Local model "not-found-model" is not supported.` + +At this point, the status of the corresponding pod depends on the program logic. If the program continues to run after reporting the error, the pod will still be in the `Running` state, but service availability is not guaranteed. If the program exits after reporting the error, the pod will enter the `CrashLoopBackOff` state. + +## Environment Variables + +The Operator supports the following environment variables: + +1. `MAX_PROCESS_NUM`: An environment variable passed to the Pod, specifying the maximum number of threads within each Pod, default is 128; +2. `PULSAR_URL`: The URL of Pulsar, default is `pulsar://pulsar:6650`; +3. `PULSAR_TOKEN`: The Token of Pulsar, default is empty; +4. `RES_TOPIC_NAME`: The Topic name of the result message queue, default is `res-topic`; + +All of the above environment variables will be passed to the Pod. + +In addition, the Operator will automatically inject the following environment variables: + +1. `NODE_TYPE`: Node type, value is either `local` or `remote`, corresponding to the `type` in the yaml file +2. `MODEL_NAME`: Model name, corresponding to `model` +3. `API_URL`: API URL for remote models, corresponding to `baseURL` +4. `API_KEY`: API Key for remote models, corresponding to `apiKey` +5. `AIMODEL_NAME`: The name of the AIModel resource, in the above example it's `ai-model-sample` +6. `AIMODEL_NAMESPACE`: The namespace where the AIModel resource is located, in the above example it's `default` + +`AIMODEL_NAME` and `AIMODEL_NAMESPACE` are provided for the program inside the Pod to report status, as described below. + +## Reporting Irreversible Errors + +The program inside the Pod may encounter irreversible errors, such as: + +1. 
`ConfigurationError`: When the program inside the Pod discovers a configuration error, it should report a `ConfigurationError`. For example, `Local model "not-found-model" is not supported.` in the above example; +2. `AuthenticationError`: When the program inside the Pod believes that API Key authentication has failed, it should report an `AuthenticationError`; +3. `MessageQueueConnectionError`: When the program inside the Pod cannot connect to the message queue, it should report a `MessageQueueConnectionError`; +4. `APIQuotaExceededError`: When the program inside the Pod believes that the API quota has been consumed, it should report an `APIQuotaExceededError`; +5. `GeneralError`: Other errors. + +When such an error occurs, the program inside the Pod should use Kubernetes' event mechanism to report it. The `Reason` field should be one of the above error types, and the `Message` field should be a human-readable error message. + +The Operator automatically grants the Pod permission to create events, and listens for events that meet the following conditions: + +1. `Type` is `Warning`; +2. `InvolvedObject.Kind` is `AIModel`; +3. `InvolvedObject.Name` is the name of an existing AIModel resource; +4. `InvolvedObject.Namespace` is the namespace of an existing AIModel resource. + +If the Pod reports an event that meets the above conditions, the Operator will update the status of the corresponding AIModel resource to `Failed` and record the error message in the `Status.Message` field. + +When the AIModel is modified, the status will be reset to `Running`. If the program inside the Pod continues to report errors, the status will change to `Failed` again. + +For example Go code, see the [createK8sEvent function](https://github.com/uplion/ai-model-operator/blob/371c44df88e85d336638f268f370d11805458899/worker/main.go#L54-L103). 
+ +## Installation ### Prerequisites - go version v1.20.0+ diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 5c5f0b8..9ff1d71 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -1,2 +1,8 @@ resources: - manager.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +images: +- name: controller + newName: youxam/uplion-aimodel-operator + newTag: latest diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index d749c34..b7820dd 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -1,7 +1,3 @@ - -No resources found in default namespace. - - --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/internal/controller/aimodel_controller.go b/internal/controller/aimodel_controller.go index ff18810..79f401a 100644 --- a/internal/controller/aimodel_controller.go +++ b/internal/controller/aimodel_controller.go @@ -106,44 +106,50 @@ func (r *AIModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // Check if the Deployment already exists, if not create a new one found := &appsv1.Deployment{} err = r.Get(ctx, types.NamespacedName{Name: dep.Name, Namespace: dep.Namespace}, found) - if err != nil && errors.IsNotFound(err) { - // Create ServiceAccount, Role and RoleBinding for the Deployment to create events - err = r.createEventServiceAccountRoleAndBinding(ctx, aimodel, dep, logger) - if err != nil { - logger.Error(err, "Failed to create ServiceAccount, Role and RoleBinding", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name) - return ctrl.Result{}, err - } - - logger.Info("Creating a new Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name) - // Set AIModel instance as the owner and controller - if err := controllerutil.SetControllerReference(aimodel, dep, r.Scheme); err != nil { - return ctrl.Result{}, err - } - - err = r.Create(ctx, dep) - if err != nil { - logger.Error(err, "Failed to 
create new Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name) - return ctrl.Result{}, err - } - - // Update the AIModel status - aimodel.Status.State = "Running" - aimodel.Status.Message = "AIModel is running successfully" - err = r.Status().Update(ctx, aimodel) - if err != nil { - logger.Error(err, "Failed to update AIModel status") + if err != nil { + // In most cases, err != nil means that Deployment does not exist. + if errors.IsNotFound(err) { + // If the Deployment does not exist, create a new Deployment + + // Create ServiceAccount, Role and RoleBinding for the Deployment to create events + err = r.createEventServiceAccountRoleAndBinding(ctx, aimodel, dep, logger) + if err != nil { + logger.Error(err, "Failed to create ServiceAccount, Role and RoleBinding", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name) + return ctrl.Result{}, err + } + + logger.Info("Creating a new Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name) + // Set AIModel instance as the owner and controller + if err := controllerutil.SetControllerReference(aimodel, dep, r.Scheme); err != nil { + return ctrl.Result{}, err + } + + err = r.Create(ctx, dep) + if err != nil { + logger.Error(err, "Failed to create new Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name) + return ctrl.Result{}, err + } + + // Update the AIModel status + aimodel.Status.State = "Running" + aimodel.Status.Message = "AIModel is running successfully" + err = r.Status().Update(ctx, aimodel) + if err != nil { + logger.Error(err, "Failed to update AIModel status") + return ctrl.Result{}, err + } + + // Deployment created successfully - return and requeue + return ctrl.Result{Requeue: true}, nil + } else { + // Error reading the object - requeue the request. 
+ logger.Error(err, "Failed to get Deployment") return ctrl.Result{}, err } - // Deployment created successfully - return and requeue - return ctrl.Result{Requeue: true}, nil - } else if err != nil { - logger.Error(err, "Failed to get Deployment") - return ctrl.Result{}, err } - // Check for any changes in AIModel spec or labels if !deploymentEqual(found, dep, logger) { logger.Info("Updating existing Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name) found.Spec = dep.Spec diff --git a/internal/listener/events.go b/internal/listener/events.go index 71be5d7..e6be85b 100644 --- a/internal/listener/events.go +++ b/internal/listener/events.go @@ -14,12 +14,16 @@ import ( ) func StartEventListener(kubeClient kubernetes.Interface, dynamicClient client.Client, namespace string) { + log.Println("Starting event listener") + watcher, err := kubeClient.CoreV1().Events(v1.NamespaceAll).Watch(context.Background(), metav1.ListOptions{}) if err != nil { log.Printf("Failed to create watcher: %v", err) return } + log.Println("Watcher created") + startTime := time.Now() for event := range watcher.ResultChan() { @@ -31,6 +35,7 @@ func StartEventListener(kubeClient kubernetes.Interface, dynamicClient client.Cl func handleEvent(event *v1.Event, dynamicClient client.Client) { if event.InvolvedObject.Kind == "AIModel" && event.Type == v1.EventTypeWarning { + log.Printf("Handling event: %s %s", event.InvolvedObject.Name, event.Message) updateAIModelStatus(event, dynamicClient) } } diff --git a/worker/demo.yaml b/worker/demo.yaml index 27627f3..18c6239 100644 --- a/worker/demo.yaml +++ b/worker/demo.yaml @@ -1,11 +1,11 @@ apiVersion: model.youxam.com/v1alpha1 kind: AIModel metadata: - name: aimodel-sample + name: ai-model-sample labels: - app: aimodel + app: ai-model spec: - type: remote + type: local model: TinyLlama-1.1B replicas: 3 image: youxam/uplion-aimodel-operator-test-worker:latest @@ -14,10 +14,10 @@ spec: apiVersion: v1 kind: Service metadata: - 
name: aimodel-svc + name: ai-model-svc spec: selector: - app: aimodel + app: ai-model ports: - protocol: TCP port: 8080