From 4410c1b9ece5baa4c04b161804128124446c2540 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 23 Jul 2024 03:19:11 +0000 Subject: [PATCH 01/30] chore: add condition branch in requeue logic. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller.go | 10 +++++++--- pkg/controller.v1beta1/trial/trial_controller_util.go | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index 77652331f56..16b975809f0 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -46,6 +46,7 @@ import ( "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" "github.com/kubeflow/katib/pkg/controller.v1beta1/trial/managerclient" trialutil "github.com/kubeflow/katib/pkg/controller.v1beta1/trial/util" + commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" ) const ( @@ -244,9 +245,12 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error { } } - // If observation is empty metrics collector doesn't finish. - // For early stopping metrics collector are reported logs before Trial status is changed to EarlyStopped. - if jobStatus.Condition == trialutil.JobSucceeded && instance.Status.Observation == nil { + // If observation is empty, metrics collector doesn't finish. + // For early stopping scenario, metrics collector will report logs before Trial status is changed to EarlyStopped. + // We need to requeue reconcile when the Trial is succeeded, metrics collector's type is not `Push`, and metrics are not reported. + if jobStatus.Condition == trialutil.JobSucceeded && + instance.Spec.MetricsCollector.Collector.Kind != commonapiv1beta1.PushCollector && + instance.Status.Observation == nil { logger.Info("Trial job is succeeded but metrics are not reported, reconcile requeued") return errMetricsNotReported } diff --git a/pkg/controller.v1beta1/trial/trial_controller_util.go b/pkg/controller.v1beta1/trial/trial_controller_util.go index 96341275a0e..a9b876fb70d 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_util.go +++ b/pkg/controller.v1beta1/trial/trial_controller_util.go @@ -70,6 +70,11 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria msg := "Metrics are not available" reason := TrialMetricsUnavailableReason + // If the type of metrics collector is Push, We should insert an `unavailable` value to DB + if instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector { + + } + // Get message and reason from deployed job if jobStatus.Message != "" { msg = fmt.Sprintf("%v. Job message: %v", msg, jobStatus.Message) From 8a9297722a5651e41eb29a88539a680419e26482 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 23 Jul 2024 03:20:07 +0000 Subject: [PATCH 02/30] chore: add ReportObservationLog in katib_manager_util.go. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/common/v1beta1/katib_manager_util.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pkg/common/v1beta1/katib_manager_util.go b/pkg/common/v1beta1/katib_manager_util.go index 9e6dd819e4c..3345f45b124 100644 --- a/pkg/common/v1beta1/katib_manager_util.go +++ b/pkg/common/v1beta1/katib_manager_util.go @@ -72,6 +72,17 @@ func GetObservationLog(request *api_pb.GetObservationLogRequest) (*api_pb.GetObs return kc.GetObservationLog(ctx, request) } +func ReportObservationLog(request *api_pb.ReportObservationLogRequest) (*api_pb.ReportObservationLogReply, error) { + ctx := context.Background() + kcc, err := getKatibDBManagerClientAndConn() + if err != nil { + return nil, err + } + defer closeKatibDBManagerConnection(kcc) + kc := kcc.KatibDBManagerClient + return kc.ReportObservationLog(ctx, request) +} + func DeleteObservationLog(request *api_pb.DeleteObservationLogRequest) (*api_pb.DeleteObservationLogReply, error) { ctx := context.Background() kcc, err := getKatibDBManagerClientAndConn() From 4fb810cffb686054b3f2858bd88936a467c04de0 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 23 Jul 2024 04:12:59 +0000 Subject: [PATCH 03/30] chore: add ReportTrialUnavailableMetrics func. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../trial/managerclient/managerclient.go | 28 +++++++++++++++++++ .../trial/managerclient/katibmanager.go | 15 ++++++++++ 2 files changed, 43 insertions(+) diff --git a/pkg/controller.v1beta1/trial/managerclient/managerclient.go b/pkg/controller.v1beta1/trial/managerclient/managerclient.go index 656ed8248d8..819285b43a7 100644 --- a/pkg/controller.v1beta1/trial/managerclient/managerclient.go +++ b/pkg/controller.v1beta1/trial/managerclient/managerclient.go @@ -17,9 +17,12 @@ limitations under the License. package managerclient import ( + "time" + trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1" api_pb "github.com/kubeflow/katib/pkg/apis/manager/v1beta1" common "github.com/kubeflow/katib/pkg/common/v1beta1" + "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" ) // ManagerClient is the interface for katib manager client in trial controller. @@ -28,6 +31,8 @@ type ManagerClient interface { instance *trialsv1beta1.Trial) (*api_pb.GetObservationLogReply, error) DeleteTrialObservationLog( instance *trialsv1beta1.Trial) (*api_pb.DeleteObservationLogReply, error) + ReportTrialUnavailableMetrics( + instance *trialsv1beta1.Trial) (*api_pb.ReportObservationLogReply, error) } // DefaultClient implements the Client interface. @@ -88,3 +93,26 @@ func (d *DefaultClient) DeleteTrialObservationLog( } return reply, nil } + +func (d *DefaultClient) ReportTrialUnavailableMetrics( + instance *trialsv1beta1.Trial) (*api_pb.ReportObservationLogReply, error) { + request := &api_pb.ReportObservationLogRequest{ + TrialName: instance.Name, + ObservationLog: &api_pb.ObservationLog{ + MetricLogs: []*api_pb.MetricLog{ + { + TimeStamp: time.Time{}.UTC().Format(time.RFC3339), + Metric: &api_pb.Metric{ + Name: instance.Spec.Objective.ObjectiveMetricName, + Value: consts.UnavailableMetricValue, + }, + }, + }, + }, + } + reply, err := common.ReportObservationLog(request) + if err != nil { + return nil, err + } + return reply, nil +} diff --git a/pkg/mock/v1beta1/trial/managerclient/katibmanager.go b/pkg/mock/v1beta1/trial/managerclient/katibmanager.go index 4706cc1d877..27a604b2b36 100644 --- a/pkg/mock/v1beta1/trial/managerclient/katibmanager.go +++ b/pkg/mock/v1beta1/trial/managerclient/katibmanager.go @@ -69,3 +69,18 @@ func (mr *MockManagerClientMockRecorder) GetTrialObservationLog(arg0 any) *gomoc mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTrialObservationLog", reflect.TypeOf((*MockManagerClient)(nil).GetTrialObservationLog), arg0) } + +// ReportTrialUnavailableMetrics mocks base method. +func (m *MockManagerClient) ReportTrialUnavailableMetrics(arg0 *v1beta1.Trial) (*api_v1_beta1.ReportObservationLogReply, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "DeleteTrialObservationLog", arg0) + ret0, _ := ret[0].(*api_v1_beta1.ReportObservationLogReply) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ReportTrialUnavailableMetrics indicated an expected call of ReportTrialUnavailableMetrics. +func (mr *MockManagerClientMockRecorder) ReportTrialUnavailableMetrics(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReportTrialUnavailableMetrics", reflect.TypeOf((*MockManagerClient)(nil).ReportTrialUnavailableMetrics), arg0) +} From 712af685b57d95fcc153fa32acd291d0e1c45904 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 23 Jul 2024 05:58:23 +0000 Subject: [PATCH 04/30] chore: insert unavailable value into Katib DB. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../trial/managerclient/managerclient.go | 25 ++++++------------- .../trial/trial_controller_util.go | 23 +++++++++++++++-- .../trial/managerclient/katibmanager.go | 12 ++++----- 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/pkg/controller.v1beta1/trial/managerclient/managerclient.go b/pkg/controller.v1beta1/trial/managerclient/managerclient.go index 819285b43a7..c14253c1207 100644 --- a/pkg/controller.v1beta1/trial/managerclient/managerclient.go +++ b/pkg/controller.v1beta1/trial/managerclient/managerclient.go @@ -17,12 +17,9 @@ limitations under the License. package managerclient import ( - "time" - trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1" api_pb "github.com/kubeflow/katib/pkg/apis/manager/v1beta1" common "github.com/kubeflow/katib/pkg/common/v1beta1" - "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" ) // ManagerClient is the interface for katib manager client in trial controller. @@ -31,8 +28,9 @@ type ManagerClient interface { instance *trialsv1beta1.Trial) (*api_pb.GetObservationLogReply, error) DeleteTrialObservationLog( instance *trialsv1beta1.Trial) (*api_pb.DeleteObservationLogReply, error) - ReportTrialUnavailableMetrics( - instance *trialsv1beta1.Trial) (*api_pb.ReportObservationLogReply, error) + ReportTrialObservationLog( + instance *trialsv1beta1.Trial, + observationLogs *api_pb.ObservationLog) (*api_pb.ReportObservationLogReply, error) } // DefaultClient implements the Client interface. @@ -94,21 +92,12 @@ func (d *DefaultClient) DeleteTrialObservationLog( return reply, nil } -func (d *DefaultClient) ReportTrialUnavailableMetrics( - instance *trialsv1beta1.Trial) (*api_pb.ReportObservationLogReply, error) { +func (d *DefaultClient) ReportTrialObservationLog( + instance *trialsv1beta1.Trial, + observationLog *api_pb.ObservationLog) (*api_pb.ReportObservationLogReply, error) { request := &api_pb.ReportObservationLogRequest{ TrialName: instance.Name, - ObservationLog: &api_pb.ObservationLog{ - MetricLogs: []*api_pb.MetricLog{ - { - TimeStamp: time.Time{}.UTC().Format(time.RFC3339), - Metric: &api_pb.Metric{ - Name: instance.Spec.Objective.ObjectiveMetricName, - Value: consts.UnavailableMetricValue, - }, - }, - }, - }, + ObservationLog: observationLog, } reply, err := common.ReportObservationLog(request) if err != nil { diff --git a/pkg/controller.v1beta1/trial/trial_controller_util.go b/pkg/controller.v1beta1/trial/trial_controller_util.go index a9b876fb70d..8de3e703d19 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_util.go +++ b/pkg/controller.v1beta1/trial/trial_controller_util.go @@ -70,9 +70,11 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria msg := "Metrics are not available" reason := TrialMetricsUnavailableReason - // If the type of metrics collector is Push, We should insert an `unavailable` value to DB + // If the type of metrics collector is Push, We should insert an unavailable value to Katib DB if instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector { - + if err := r.reportUnavailableMetrics(instance); err != nil { + logger.Error(err, "Failed to insert unavailable value to Katib DB") + } } // Get message and reason from deployed job @@ -167,6 +169,23 @@ func (r *ReconcileTrial) updateFinalizers(instance *trialsv1beta1.Trial, finaliz } } +func (r *ReconcileTrial) reportUnavailableMetrics(instance *trialsv1beta1.Trial) error { + observationLog := &api_pb.ObservationLog{ + MetricLogs: []*api_pb.MetricLog{ + { + TimeStamp: time.Time{}.UTC().Format(time.RFC3339), + Metric: &api_pb.Metric{ + Name: instance.Spec.Objective.ObjectiveMetricName, + Value: consts.UnavailableMetricValue, + }, + }, + }, + } + _, err := r.ReportTrialObservationLog(instance, observationLog) + + return err +} + func getMetrics(metricLogs []*api_pb.MetricLog, strategies []commonv1beta1.MetricStrategy) (*commonv1beta1.Observation, error) { metrics := make(map[string]*commonv1beta1.Metric) timestamps := make(map[string]*time.Time) diff --git a/pkg/mock/v1beta1/trial/managerclient/katibmanager.go b/pkg/mock/v1beta1/trial/managerclient/katibmanager.go index 27a604b2b36..159a5ade746 100644 --- a/pkg/mock/v1beta1/trial/managerclient/katibmanager.go +++ b/pkg/mock/v1beta1/trial/managerclient/katibmanager.go @@ -70,17 +70,17 @@ func (mr *MockManagerClientMockRecorder) GetTrialObservationLog(arg0 any) *gomoc return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTrialObservationLog", reflect.TypeOf((*MockManagerClient)(nil).GetTrialObservationLog), arg0) } -// ReportTrialUnavailableMetrics mocks base method. -func (m *MockManagerClient) ReportTrialUnavailableMetrics(arg0 *v1beta1.Trial) (*api_v1_beta1.ReportObservationLogReply, error) { +// ReportTrialObservationLog mocks base method. +func (m *MockManagerClient) ReportTrialObservationLog(arg0 *v1beta1.Trial, arg1 *api_v1_beta1.ObservationLog) (*api_v1_beta1.ReportObservationLogReply, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "DeleteTrialObservationLog", arg0) + ret := m.ctrl.Call(m, "ReportTrialObservationLog", arg0, arg1) ret0, _ := ret[0].(*api_v1_beta1.ReportObservationLogReply) ret1, _ := ret[1].(error) return ret0, ret1 } -// ReportTrialUnavailableMetrics indicated an expected call of ReportTrialUnavailableMetrics. -func (mr *MockManagerClientMockRecorder) ReportTrialUnavailableMetrics(arg0 any) *gomock.Call { +// ReportTrialObservationLog indicated an expected call of ReportTrialObservationLog. +func (mr *MockManagerClientMockRecorder) ReportTrialObservationLog(arg0 any, arg1 any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReportTrialUnavailableMetrics", reflect.TypeOf((*MockManagerClient)(nil).ReportTrialUnavailableMetrics), arg0) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReportTrialObservationLog", reflect.TypeOf((*MockManagerClient)(nil).ReportTrialObservationLog), arg0, arg1) } From 76e3c95b972c62b873fbb21d101cb07792766fd9 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 23 Jul 2024 06:00:28 +0000 Subject: [PATCH 05/30] fix: fix lint error. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/managerclient/managerclient.go | 4 ++-- pkg/controller.v1beta1/trial/trial_controller.go | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/controller.v1beta1/trial/managerclient/managerclient.go b/pkg/controller.v1beta1/trial/managerclient/managerclient.go index c14253c1207..8250aa5d6aa 100644 --- a/pkg/controller.v1beta1/trial/managerclient/managerclient.go +++ b/pkg/controller.v1beta1/trial/managerclient/managerclient.go @@ -29,7 +29,7 @@ type ManagerClient interface { DeleteTrialObservationLog( instance *trialsv1beta1.Trial) (*api_pb.DeleteObservationLogReply, error) ReportTrialObservationLog( - instance *trialsv1beta1.Trial, + instance *trialsv1beta1.Trial, observationLogs *api_pb.ObservationLog) (*api_pb.ReportObservationLogReply, error) } @@ -96,7 +96,7 @@ func (d *DefaultClient) ReportTrialObservationLog( instance *trialsv1beta1.Trial, observationLog *api_pb.ObservationLog) (*api_pb.ReportObservationLogReply, error) { request := &api_pb.ReportObservationLogRequest{ - TrialName: instance.Name, + TrialName: instance.Name, ObservationLog: observationLog, } reply, err := common.ReportObservationLog(request) diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index 16b975809f0..7efa7399f35 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -42,11 +42,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" + commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1" "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" "github.com/kubeflow/katib/pkg/controller.v1beta1/trial/managerclient" trialutil "github.com/kubeflow/katib/pkg/controller.v1beta1/trial/util" - commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" ) const ( @@ -248,8 +248,8 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error { // If observation is empty, metrics collector doesn't finish. // For early stopping scenario, metrics collector will report logs before Trial status is changed to EarlyStopped. // We need to requeue reconcile when the Trial is succeeded, metrics collector's type is not `Push`, and metrics are not reported. - if jobStatus.Condition == trialutil.JobSucceeded && - instance.Spec.MetricsCollector.Collector.Kind != commonapiv1beta1.PushCollector && + if jobStatus.Condition == trialutil.JobSucceeded && + instance.Spec.MetricsCollector.Collector.Kind != commonapiv1beta1.PushCollector && instance.Status.Observation == nil { logger.Info("Trial job is succeeded but metrics are not reported, reconcile requeued") return errMetricsNotReported From 7f6271b4d55b16199246c27d692cbf38b7d639be Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Fri, 26 Jul 2024 13:29:06 +0000 Subject: [PATCH 06/30] fix: add nil condition judgement. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index 7efa7399f35..e0c38077995 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -248,11 +248,12 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error { // If observation is empty, metrics collector doesn't finish. // For early stopping scenario, metrics collector will report logs before Trial status is changed to EarlyStopped. // We need to requeue reconcile when the Trial is succeeded, metrics collector's type is not `Push`, and metrics are not reported. - if jobStatus.Condition == trialutil.JobSucceeded && - instance.Spec.MetricsCollector.Collector.Kind != commonapiv1beta1.PushCollector && - instance.Status.Observation == nil { - logger.Info("Trial job is succeeded but metrics are not reported, reconcile requeued") - return errMetricsNotReported + if jobStatus.Condition == trialutil.JobSucceeded && instance.Status.Observation == nil { + if instance.Spec.MetricsCollector.Collector == nil || + instance.Spec.MetricsCollector.Collector.Kind != commonapiv1beta1.PushCollector { + logger.Info("Trial job is succeeded but metrics are not reported, reconcile requeued") + return errMetricsNotReported + } } // Update Trial job status only From 46a5afd736006f4bb7585c9d82481a195572494f Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Fri, 26 Jul 2024 13:33:49 +0000 Subject: [PATCH 07/30] fix: add nil condition judgement in trial_controller_util.go Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller_util.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_util.go b/pkg/controller.v1beta1/trial/trial_controller_util.go index 8de3e703d19..97f6fa7c14b 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_util.go +++ b/pkg/controller.v1beta1/trial/trial_controller_util.go @@ -71,7 +71,8 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria reason := TrialMetricsUnavailableReason // If the type of metrics collector is Push, We should insert an unavailable value to Katib DB - if instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector { + if instance.Spec.MetricsCollector.Collector != nil && + instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector { if err := r.reportUnavailableMetrics(instance); err != nil { logger.Error(err, "Failed to insert unavailable value to Katib DB") } From 59e98a3e8cc2a788d9f43dec8ff2bb81c66459bd Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Fri, 2 Aug 2024 15:39:55 +0000 Subject: [PATCH 08/30] chore(trial): delete nil check of MC kind in the Trial controller. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller.go | 11 +++++------ pkg/controller.v1beta1/trial/trial_controller_util.go | 3 +-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index e0c38077995..3d756fd1128 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -248,12 +248,11 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error { // If observation is empty, metrics collector doesn't finish. // For early stopping scenario, metrics collector will report logs before Trial status is changed to EarlyStopped. // We need to requeue reconcile when the Trial is succeeded, metrics collector's type is not `Push`, and metrics are not reported. - if jobStatus.Condition == trialutil.JobSucceeded && instance.Status.Observation == nil { - if instance.Spec.MetricsCollector.Collector == nil || - instance.Spec.MetricsCollector.Collector.Kind != commonapiv1beta1.PushCollector { - logger.Info("Trial job is succeeded but metrics are not reported, reconcile requeued") - return errMetricsNotReported - } + if jobStatus.Condition == trialutil.JobSucceeded && + instance.Status.Observation == nil && + instance.Spec.MetricsCollector.Collector.Kind != commonapiv1beta1.PushCollector { + logger.Info("Trial job is succeeded but metrics are not reported, reconcile requeued") + return errMetricsNotReported } // Update Trial job status only diff --git a/pkg/controller.v1beta1/trial/trial_controller_util.go b/pkg/controller.v1beta1/trial/trial_controller_util.go index 97f6fa7c14b..8de3e703d19 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_util.go +++ b/pkg/controller.v1beta1/trial/trial_controller_util.go @@ -71,8 +71,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria reason := TrialMetricsUnavailableReason // If the type of metrics collector is Push, We should insert an unavailable value to Katib DB - if instance.Spec.MetricsCollector.Collector != nil && - instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector { + if instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector { if err := r.reportUnavailableMetrics(instance); err != nil { logger.Error(err, "Failed to insert unavailable value to Katib DB") } From bc9106efa1bf35e7ce02706e1e577b6009d40751 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Fri, 2 Aug 2024 20:48:32 +0000 Subject: [PATCH 09/30] chore(trial): init MC in newFakeTrialBatchJob to avoid nil condition in trial reconcile loop. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 558cdba563e..c172585a407 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -429,6 +429,11 @@ func newFakeTrialBatchJob() *trialsv1beta1.Trial { }, Spec: trialsv1beta1.TrialSpec{ PrimaryContainerName: primaryContainer, + MetricsCollector: commonv1beta1.MetricsCollectorSpec{ + Collector: &commonv1beta1.CollectorSpec{ + Kind: commonv1beta1.StdOutCollector, + }, + }, SuccessCondition: experimentsv1beta1.DefaultJobSuccessCondition, FailureCondition: experimentsv1beta1.DefaultJobFailureCondition, Objective: &commonv1beta1.ObjectiveSpec{ From a8c9d900c7dea9b2a8070319edd25f289dd90ce7 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Fri, 2 Aug 2024 20:55:23 +0000 Subject: [PATCH 10/30] fix(trial): fix lint error. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index c172585a407..2c3bf8978e4 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -429,13 +429,13 @@ func newFakeTrialBatchJob() *trialsv1beta1.Trial { }, Spec: trialsv1beta1.TrialSpec{ PrimaryContainerName: primaryContainer, - MetricsCollector: commonv1beta1.MetricsCollectorSpec{ + MetricsCollector: commonv1beta1.MetricsCollectorSpec{ Collector: &commonv1beta1.CollectorSpec{ Kind: commonv1beta1.StdOutCollector, }, }, - SuccessCondition: experimentsv1beta1.DefaultJobSuccessCondition, - FailureCondition: experimentsv1beta1.DefaultJobFailureCondition, + SuccessCondition: experimentsv1beta1.DefaultJobSuccessCondition, + FailureCondition: experimentsv1beta1.DefaultJobFailureCondition, Objective: &commonv1beta1.ObjectiveSpec{ ObjectiveMetricName: objectiveMetric, MetricStrategies: []commonv1beta1.MetricStrategy{ From 4d1d010c6a606ab70bc4b78cbf355feed81b09da Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Fri, 2 Aug 2024 20:59:34 +0000 Subject: [PATCH 11/30] fix(trial): fix lint error in controller. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index 3d756fd1128..c63729cc71e 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -248,8 +248,8 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error { // If observation is empty, metrics collector doesn't finish. // For early stopping scenario, metrics collector will report logs before Trial status is changed to EarlyStopped. // We need to requeue reconcile when the Trial is succeeded, metrics collector's type is not `Push`, and metrics are not reported. - if jobStatus.Condition == trialutil.JobSucceeded && - instance.Status.Observation == nil && + if jobStatus.Condition == trialutil.JobSucceeded && + instance.Status.Observation == nil && instance.Spec.MetricsCollector.Collector.Kind != commonapiv1beta1.PushCollector { logger.Info("Trial job is succeeded but metrics are not reported, reconcile requeued") return errMetricsNotReported From 45e4446530b24a0dfeec9c24addb2f425d99efb4 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Fri, 2 Aug 2024 21:24:51 +0000 Subject: [PATCH 12/30] test(trial): add integration test for Push MC. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../trial/trial_controller_test.go | 51 +++++++++++++++---- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 2c3bf8978e4..cf5f38dfa4a 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -183,10 +183,10 @@ func TestReconcileBatchJob(t *testing.T) { g := gomega.NewGomegaWithT(t) mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil) - trial := newFakeTrialBatchJob() + trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector) batchJob := &batchv1.Job{} - // Create the Trial + // Create the Trial with StdOut MC g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) // Expect that BatchJob with appropriate name is created @@ -262,8 +262,8 @@ func TestReconcileBatchJob(t *testing.T) { } g.Expect(c.Status().Update(ctx, batchJob)).NotTo(gomega.HaveOccurred()) - // Create the Trial - trial := newFakeTrialBatchJob() + // Create the Trial with StdOut MC + trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector) g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) // Expect that Trial status is succeeded and metrics are properly populated @@ -290,14 +290,47 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) - t.Run(`Trail with "Complete" BatchJob and Unavailable metrics.`, func(t *testing.T) { + t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(StdOut MC).`, func(t *testing.T) { g := gomega.NewGomegaWithT(t) gomock.InOrder( mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), ) - // Create the Trial - trial := newFakeTrialBatchJob() + // Create the Trial with StdOut MC + trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector) + g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) + + // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. + // Metrics unavailable because GetTrialObservationLog returns "unavailable". + g.Eventually(func() bool { + if err = c.Get(ctx, trialKey, trial); err != nil { + return false + } + return trial.IsMetricsUnavailable() && + len(trial.Status.Observation.Metrics) > 0 && + trial.Status.Observation.Metrics[0].Min == consts.UnavailableMetricValue && + trial.Status.Observation.Metrics[0].Max == consts.UnavailableMetricValue && + trial.Status.Observation.Metrics[0].Latest == consts.UnavailableMetricValue + }, timeout).Should(gomega.BeTrue()) + + // Delete the Trial + g.Expect(c.Delete(ctx, trial)).NotTo(gomega.HaveOccurred()) + + // Expect that Trial is deleted + g.Eventually(func() bool { + return errors.IsNotFound(c.Get(ctx, trialKey, &trialsv1beta1.Trial{})) + }, timeout).Should(gomega.BeTrue()) + }) + + t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC).`, func(t *testing.T) { + g := gomega.NewGomegaWithT(t) + gomock.InOrder( + mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1), + mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), + ) + // Create the Trial with Push MC + trial := newFakeTrialBatchJob(commonv1beta1.PushCollector) g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. @@ -386,7 +419,7 @@ func TestGetObjectiveMetricValue(t *testing.T) { g.Expect(err).To(gomega.HaveOccurred()) } -func newFakeTrialBatchJob() *trialsv1beta1.Trial { +func newFakeTrialBatchJob(mcType commonv1beta1.CollectorKind) *trialsv1beta1.Trial { primaryContainer := "training-container" job := &batchv1.Job{ @@ -431,7 +464,7 @@ func newFakeTrialBatchJob() *trialsv1beta1.Trial { PrimaryContainerName: primaryContainer, MetricsCollector: commonv1beta1.MetricsCollectorSpec{ Collector: &commonv1beta1.CollectorSpec{ - Kind: commonv1beta1.StdOutCollector, + Kind: mcType, }, }, SuccessCondition: experimentsv1beta1.DefaultJobSuccessCondition, From e0bd76a93e7cffe7f5938c82b937dbefa25e96ff Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Sun, 25 Aug 2024 12:54:25 +0000 Subject: [PATCH 13/30] chore(trial): retry reconcilation when reporting unavailable metrics failed. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../trial/trial_controller.go | 2 +- .../trial/trial_controller_test.go | 34 +++++++++++++++++++ .../trial/trial_controller_util.go | 7 ++-- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index c63729cc71e..9b6ba9936e8 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -259,7 +259,7 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error { // if job has succeeded and if observation field is available. // if job has failed // This will ensure that trial is set to be complete only if metric is collected at least once - r.UpdateTrialStatusCondition(instance, deployedJob.GetName(), jobStatus) + return r.UpdateTrialStatusCondition(instance, deployedJob.GetName(), jobStatus) } return nil } diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index cf5f38dfa4a..1536d5690cd 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -355,6 +355,40 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) + t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC, failed once).`, func(t *testing.T) { + g := gomega.NewGomegaWithT(t) + gomock.InOrder( + mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(2), + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, errMetricsNotReported).MinTimes(1), + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1), + mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), + ) + // Create the Trial with Push MC + trial := newFakeTrialBatchJob(commonv1beta1.PushCollector) + g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) + + // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. + // Metrics unavailable because GetTrialObservationLog returns "unavailable". + g.Eventually(func() bool { + if err = c.Get(ctx, trialKey, trial); err != nil { + return false + } + return trial.IsMetricsUnavailable() && + len(trial.Status.Observation.Metrics) > 0 && + trial.Status.Observation.Metrics[0].Min == consts.UnavailableMetricValue && + trial.Status.Observation.Metrics[0].Max == consts.UnavailableMetricValue && + trial.Status.Observation.Metrics[0].Latest == consts.UnavailableMetricValue + }, timeout).Should(gomega.BeTrue()) + + // Delete the Trial + g.Expect(c.Delete(ctx, trial)).NotTo(gomega.HaveOccurred()) + + // Expect that Trial is deleted + g.Eventually(func() bool { + return errors.IsNotFound(c.Get(ctx, trialKey, &trialsv1beta1.Trial{})) + }, timeout).Should(gomega.BeTrue()) + }) + t.Run("Update status for empty Trial", func(t *testing.T) { g := gomega.NewGomegaWithT(t) g.Expect(r.updateStatus(&trialsv1beta1.Trial{})).To(gomega.HaveOccurred()) diff --git a/pkg/controller.v1beta1/trial/trial_controller_util.go b/pkg/controller.v1beta1/trial/trial_controller_util.go index 8de3e703d19..25913cc4d64 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_util.go +++ b/pkg/controller.v1beta1/trial/trial_controller_util.go @@ -39,7 +39,7 @@ const ( ) // UpdateTrialStatusCondition updates Trial status from current deployed Job status -func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Trial, deployedJobName string, jobStatus *trialutil.TrialJobStatus) { +func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Trial, deployedJobName string, jobStatus *trialutil.TrialJobStatus) error { logger := log.WithValues("Trial", types.NamespacedName{Name: instance.GetName(), Namespace: instance.GetNamespace()}) timeNow := metav1.Now() @@ -70,10 +70,12 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria msg := "Metrics are not available" reason := TrialMetricsUnavailableReason - // If the type of metrics collector is Push, We should insert an unavailable value to Katib DB + // If the type of metrics collector is Push, We should insert an unavailable value to Katib DB. + // We would retry reconcilation if some error occurs while we report unavailable metrics. if instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector { if err := r.reportUnavailableMetrics(instance); err != nil { logger.Error(err, "Failed to insert unavailable value to Katib DB") + return errMetricsNotReported } } @@ -126,6 +128,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria // TODO(gaocegege): Should we maintain a TrialsRunningCount? } // else nothing to do + return nil } func (r *ReconcileTrial) UpdateTrialStatusObservation(instance *trialsv1beta1.Trial) error { From 7971594065e0661bf416d90ea2205ca157fdf60a Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Sun, 25 Aug 2024 13:01:47 +0000 Subject: [PATCH 14/30] test(trial): fix EXPECT order. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 1536d5690cd..7aaf0a12d83 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -358,8 +358,9 @@ func TestReconcileBatchJob(t *testing.T) { t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC, failed once).`, func(t *testing.T) { g := gomega.NewGomegaWithT(t) gomock.InOrder( - mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(2), + mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, errMetricsNotReported).MinTimes(1), + mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1), mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), ) From 2b6b4c14f95a1570f566a5181276305da1e4473e Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Sun, 25 Aug 2024 14:30:26 +0000 Subject: [PATCH 15/30] test(trial): fix typo error. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller_util.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_util.go b/pkg/controller.v1beta1/trial/trial_controller_util.go index 25913cc4d64..1a63e945afc 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_util.go +++ b/pkg/controller.v1beta1/trial/trial_controller_util.go @@ -71,7 +71,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria reason := TrialMetricsUnavailableReason // If the type of metrics collector is Push, We should insert an unavailable value to Katib DB. - // We would retry reconcilation if some error occurs while we report unavailable metrics. + // We would retry reconciliation if some error occurs while we report unavailable metrics. if instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector { if err := r.reportUnavailableMetrics(instance); err != nil { logger.Error(err, "Failed to insert unavailable value to Katib DB") From b2872ec283f481e5f9760cb0f2a4b0f210028278 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Mon, 26 Aug 2024 17:08:20 +0000 Subject: [PATCH 16/30] chore(trial): add errReportMetricsFailed. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller.go | 4 +++- pkg/controller.v1beta1/trial/trial_controller_test.go | 2 +- pkg/controller.v1beta1/trial/trial_controller_util.go | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index 9b6ba9936e8..30f608b853e 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -58,6 +58,8 @@ var ( log = logf.Log.WithName(ControllerName) // errMetricsNotReported is the error when Trial job is succeeded but metrics are not reported yet errMetricsNotReported = fmt.Errorf("metrics are not reported yet") + // errReportMetricsFailed is the error when metrics are reported but not successful yet + errReportMetricsFailed = fmt.Errorf("failed to report metrics") ) // Add creates a new Trial Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller @@ -180,7 +182,7 @@ func (r *ReconcileTrial) Reconcile(ctx context.Context, request reconcile.Reques } else { err := r.reconcileTrial(instance) if err != nil { - if err == errMetricsNotReported { + if err == errMetricsNotReported || err == errReportMetricsFailed { return reconcile.Result{ RequeueAfter: time.Second * 1, }, nil diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 7aaf0a12d83..ffaedb8af8d 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -359,7 +359,7 @@ func TestReconcileBatchJob(t *testing.T) { g := gomega.NewGomegaWithT(t) gomock.InOrder( mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), - mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, errMetricsNotReported).MinTimes(1), + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, errReportMetricsFailed).MinTimes(1), mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1), mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), diff --git a/pkg/controller.v1beta1/trial/trial_controller_util.go b/pkg/controller.v1beta1/trial/trial_controller_util.go index 1a63e945afc..7a4eba99075 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_util.go +++ b/pkg/controller.v1beta1/trial/trial_controller_util.go @@ -75,7 +75,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria if instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector { if err := r.reportUnavailableMetrics(instance); err != nil { logger.Error(err, "Failed to insert unavailable value to Katib DB") - return errMetricsNotReported + return errReportMetricsFailed } } From 17d56101872d833be86be4f058754428fb85469e Mon Sep 17 00:00:00 2001 From: Shao Wang <77665902+Electronic-Waste@users.noreply.github.com> Date: Tue, 27 Aug 2024 10:04:42 +0800 Subject: [PATCH 17/30] Update pkg/controller.v1beta1/trial/trial_controller.go Co-authored-by: Andrey Velichkevich Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index 30f608b853e..0cb518eba34 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -58,8 +58,8 @@ var ( log = logf.Log.WithName(ControllerName) // errMetricsNotReported is the error when Trial job is succeeded but metrics are not reported yet errMetricsNotReported = fmt.Errorf("metrics are not reported yet") - // errReportMetricsFailed is the error when metrics are reported but not successful yet - errReportMetricsFailed = fmt.Errorf("failed to report metrics") + // errReportMetricsFailed is the error when `unavailable` metrics value can't be inserted to the Katib DB. + errReportMetricsFailed = fmt.Errorf("failed to report unavailable metrics") ) // Add creates a new Trial Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller From 11d71af7a946fd610ef31e22bced46a77854b487 Mon Sep 17 00:00:00 2001 From: Shao Wang <77665902+Electronic-Waste@users.noreply.github.com> Date: Wed, 28 Aug 2024 21:39:01 +0800 Subject: [PATCH 18/30] Update pkg/controller.v1beta1/trial/trial_controller_util.go Co-authored-by: Yuki Iwai Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller_util.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_util.go b/pkg/controller.v1beta1/trial/trial_controller_util.go index 7a4eba99075..ddfc0b11628 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_util.go +++ b/pkg/controller.v1beta1/trial/trial_controller_util.go @@ -75,7 +75,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria if instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector { if err := r.reportUnavailableMetrics(instance); err != nil { logger.Error(err, "Failed to insert unavailable value to Katib DB") - return errReportMetricsFailed + return fmt.Errorf("%w: %w", errReportMetricsFailed, err) } } From 303938b50669581435ef7ea0d9b1b045dddc33ee Mon Sep 17 00:00:00 2001 From: Shao Wang <77665902+Electronic-Waste@users.noreply.github.com> Date: Wed, 28 Aug 2024 21:39:18 +0800 Subject: [PATCH 19/30] Update pkg/controller.v1beta1/trial/trial_controller.go Co-authored-by: Yuki Iwai Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index 0cb518eba34..c22c9be806a 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -182,7 +182,7 @@ func (r *ReconcileTrial) Reconcile(ctx context.Context, request reconcile.Reques } else { err := r.reconcileTrial(instance) if err != nil { - if err == errMetricsNotReported || err == errReportMetricsFailed { + if errors.Is(err, errMetricsNotReported) || errors.Is(err, errReportMetricsFailed) { return reconcile.Result{ RequeueAfter: time.Second * 1, }, nil From 541ec26e77ebefb82b3a73732e1c60d41b1cb734 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Wed, 28 Aug 2024 13:47:11 +0000 Subject: [PATCH 20/30] fix(trial): rename errors pkg. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index c22c9be806a..1a8b5084dcf 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -18,13 +18,14 @@ package trial import ( "context" + "errors" "fmt" "time" "github.com/spf13/viper" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -153,7 +154,7 @@ func (r *ReconcileTrial) Reconcile(ctx context.Context, request reconcile.Reques original := &trialsv1beta1.Trial{} err := r.Get(ctx, request.NamespacedName, original) if err != nil { - if errors.IsNotFound(err) { + if apierrors.IsNotFound(err) { // Object not found, return. Created objects are automatically garbage collected. // For additional cleanup logic use finalizers. return reconcile.Result{}, nil @@ -277,7 +278,7 @@ func (r *ReconcileTrial) reconcileJob(instance *trialsv1beta1.Trial, desiredJob deployedJob.SetGroupVersionKind(gvk) err = r.Get(context.TODO(), types.NamespacedName{Name: desiredJob.GetName(), Namespace: desiredJob.GetNamespace()}, deployedJob) if err != nil { - if errors.IsNotFound(err) { + if apierrors.IsNotFound(err) { if instance.IsCompleted() { return nil, nil } From 2fe8c3388a763e3809130ff362d0a982b8beff13 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Sun, 1 Sep 2024 13:31:31 +0000 Subject: [PATCH 21/30] test(trial): update the order of UT. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../trial/trial_controller_test.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index ffaedb8af8d..a9390b8153b 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -290,14 +290,15 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) - t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(StdOut MC).`, func(t *testing.T) { + t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC).`, func(t *testing.T) { g := gomega.NewGomegaWithT(t) gomock.InOrder( mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1), mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), ) - // Create the Trial with StdOut MC - trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector) + // Create the Trial with Push MC + trial := newFakeTrialBatchJob(commonv1beta1.PushCollector) g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. @@ -322,15 +323,14 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) - t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC).`, func(t *testing.T) { + t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(StdOut MC).`, func(t *testing.T) { g := gomega.NewGomegaWithT(t) gomock.InOrder( mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), - mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1), mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), ) - // Create the Trial with Push MC - trial := newFakeTrialBatchJob(commonv1beta1.PushCollector) + // Create the Trial with StdOut MC + trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector) g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. From 849558968ab7a690ed123c45ec037f777162a471 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 3 Sep 2024 05:00:39 +0000 Subject: [PATCH 22/30] test(trial): use different names for UTs. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../trial/trial_controller_test.go | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index a9390b8153b..f463b3905be 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -48,13 +48,11 @@ import ( const ( namespace = "default" - trialName = "test-trial" batchJobName = "test-job" objectiveMetric = "accuracy" timeout = time.Second * 80 ) -var trialKey = types.NamespacedName{Name: trialName, Namespace: namespace} var batchJobKey = types.NamespacedName{Name: batchJobName, Namespace: namespace} func init() { @@ -112,6 +110,7 @@ func TestReconcileBatchJob(t *testing.T) { // Try to update status until it be succeeded for err != nil { updatedInstance := &trialsv1beta1.Trial{} + trialKey := types.NamespacedName{Name: instance.Name, Namespace: namespace} if err = c.Get(ctx, trialKey, updatedInstance); err != nil { continue } @@ -183,7 +182,8 @@ func TestReconcileBatchJob(t *testing.T) { g := gomega.NewGomegaWithT(t) mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil) - trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector) + trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector, "test-failed-batch-job") + trialKey := types.NamespacedName{Name: "test-failed-batch-job", Namespace: namespace} batchJob := &batchv1.Job{} // Create the Trial with StdOut MC @@ -263,7 +263,8 @@ func TestReconcileBatchJob(t *testing.T) { g.Expect(c.Status().Update(ctx, batchJob)).NotTo(gomega.HaveOccurred()) // Create the Trial with StdOut MC - trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector) + trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector, "test-available-stdout") + trialKey := types.NamespacedName{Name: "test-available-stdout", Namespace: namespace} g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) // Expect that Trial status is succeeded and metrics are properly populated @@ -290,15 +291,15 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) - t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC).`, func(t *testing.T) { + t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(StdOut MC).`, func(t *testing.T) { g := gomega.NewGomegaWithT(t) gomock.InOrder( mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), - mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1), mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), ) - // Create the Trial with Push MC - trial := newFakeTrialBatchJob(commonv1beta1.PushCollector) + // Create the Trial with StdOut MC + trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector, "test-unavailable-stdout") + trialKey := types.NamespacedName{Name: "test-unavailable-stdout", Namespace: namespace} g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. @@ -323,14 +324,16 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) - t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(StdOut MC).`, func(t *testing.T) { + t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC).`, func(t *testing.T) { g := gomega.NewGomegaWithT(t) gomock.InOrder( mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1), mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), ) - // Create the Trial with StdOut MC - trial := newFakeTrialBatchJob(commonv1beta1.StdOutCollector) + // Create the Trial with Push MC + trial := newFakeTrialBatchJob(commonv1beta1.PushCollector, "test-unavailable-push") + trialKey := types.NamespacedName{Name: "test-unavailable-push", Namespace: namespace} g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. @@ -365,7 +368,8 @@ func TestReconcileBatchJob(t *testing.T) { mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), ) // Create the Trial with Push MC - trial := newFakeTrialBatchJob(commonv1beta1.PushCollector) + trial := newFakeTrialBatchJob(commonv1beta1.PushCollector, "test-unavailable-push-failed-once") + trialKey := types.NamespacedName{Name: "test-unavailable-push-failed-once", Namespace: namespace} g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. @@ -454,7 +458,7 @@ func TestGetObjectiveMetricValue(t *testing.T) { g.Expect(err).To(gomega.HaveOccurred()) } -func newFakeTrialBatchJob(mcType commonv1beta1.CollectorKind) *trialsv1beta1.Trial { +func newFakeTrialBatchJob(mcType commonv1beta1.CollectorKind, trialName string) *trialsv1beta1.Trial { primaryContainer := "training-container" job := &batchv1.Job{ From af40d1fb5352fb94a492e0e46a25d120320e99a0 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Mon, 9 Sep 2024 10:21:35 +0000 Subject: [PATCH 23/30] test(trial): separate Push MC UTs with original UTs. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../trial/trial_controller_test.go | 268 ++++++++++++++---- 1 file changed, 214 insertions(+), 54 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index f463b3905be..00c315408ca 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -17,7 +17,7 @@ limitations under the License. package trial import ( - "sync" + "context" "testing" "time" @@ -50,10 +50,47 @@ const ( namespace = "default" batchJobName = "test-job" objectiveMetric = "accuracy" - timeout = time.Second * 80 + startSignal = "start" + closeSignal = "close" + timeout = time.Second * 10 ) -var batchJobKey = types.NamespacedName{Name: batchJobName, Namespace: namespace} +var ( + batchJobKey = types.NamespacedName{Name: batchJobName, Namespace: namespace} + observationLogAvailable = &api_pb.GetObservationLogReply{ + ObservationLog: &api_pb.ObservationLog{ + MetricLogs: []*api_pb.MetricLog{ + { + TimeStamp: "2020-08-10T14:47:38+08:00", + Metric: &api_pb.Metric{ + Name: objectiveMetric, + Value: "0.99", + }, + }, + { + TimeStamp: "2020-08-10T14:50:38+08:00", + Metric: &api_pb.Metric{ + Name: objectiveMetric, + Value: "0.11", + }, + }, + }, + }, + } + observationLogUnavailable = &api_pb.GetObservationLogReply{ + ObservationLog: &api_pb.ObservationLog{ + MetricLogs: []*api_pb.MetricLog{ + { + Metric: &api_pb.Metric{ + Name: objectiveMetric, + Value: consts.UnavailableMetricValue, + }, + TimeStamp: time.Time{}.UTC().Format(time.RFC3339), + }, + }, + }, + } +) func init() { logf.SetLogger(zap.New(zap.UseDevMode(true))) @@ -134,50 +171,11 @@ func TestReconcileBatchJob(t *testing.T) { g.Expect(add(mgr, recFn)).NotTo(gomega.HaveOccurred()) // Start test manager. - wg := &sync.WaitGroup{} - wg.Add(1) + mgrCtx, cancel := context.WithCancel(context.TODO()) go func() { - defer wg.Done() - g.Expect(mgr.Start(ctx)).NotTo(gomega.HaveOccurred()) + g.Expect(mgr.Start(mgrCtx)).NotTo(gomega.HaveOccurred()) }() - // Result for GetTrialObservationLog with some metrics. - observationLogAvailable := &api_pb.GetObservationLogReply{ - ObservationLog: &api_pb.ObservationLog{ - MetricLogs: []*api_pb.MetricLog{ - { - TimeStamp: "2020-08-10T14:47:38+08:00", - Metric: &api_pb.Metric{ - Name: objectiveMetric, - Value: "0.99", - }, - }, - { - TimeStamp: "2020-08-10T14:50:38+08:00", - Metric: &api_pb.Metric{ - Name: objectiveMetric, - Value: "0.11", - }, - }, - }, - }, - } - // Empty result for GetTrialObservationLog. - // If objective metrics are not parsed, metrics collector reports "unavailable" value to DB. - observationLogUnavailable := &api_pb.GetObservationLogReply{ - ObservationLog: &api_pb.ObservationLog{ - MetricLogs: []*api_pb.MetricLog{ - { - Metric: &api_pb.Metric{ - Name: objectiveMetric, - Value: consts.UnavailableMetricValue, - }, - TimeStamp: time.Time{}.UTC().Format(time.RFC3339), - }, - }, - }, - } - t.Run(`Trial run with "Failed" BatchJob.`, func(t *testing.T) { g := gomega.NewGomegaWithT(t) mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil) @@ -324,18 +322,102 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) + t.Run("Update status for empty Trial", func(t *testing.T) { + g := gomega.NewGomegaWithT(t) + g.Expect(r.updateStatus(&trialsv1beta1.Trial{})).To(gomega.HaveOccurred()) + }) + + // Stop the test manager + cancel() +} + +func TestUnavailablePushMC(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + mockManagerClient := managerclientmock.NewMockManagerClient(mockCtrl) + + // Setup the Manager and Controller. Wrap the Controller Reconcile function so it writes each request to a + // channel when it is finished. + mgr, err := manager.New(cfg, manager.Options{Metrics: metricsserver.Options{BindAddress: "0"}}) + g.Expect(err).NotTo(gomega.HaveOccurred()) + c := mgr.GetClient() + + r := &ReconcileTrial{ + Client: mgr.GetClient(), + scheme: mgr.GetScheme(), + ManagerClient: mockManagerClient, + recorder: mgr.GetEventRecorderFor(ControllerName), + collector: trialutil.NewTrialsCollector(mgr.GetCache(), prometheus.NewRegistry()), + } + + r.updateStatusHandler = func(instance *trialsv1beta1.Trial) error { + var err error = errors.NewBadRequest("fake-error") + // Try to update status until it be succeeded + for err != nil { + updatedInstance := &trialsv1beta1.Trial{} + trialKey := types.NamespacedName{Name: instance.Name, Namespace: namespace} + if err = c.Get(ctx, trialKey, updatedInstance); err != nil { + continue + } + updatedInstance.Status = instance.Status + err = r.updateStatus(updatedInstance) + } + return err + } + + recFn := SetupTestReconcile(r) + // Set Job resource + trialResources := []schema.GroupVersionKind{ + { + Group: "batch", + Version: "v1", + Kind: "Job", + }, + } + + viper.Set(consts.ConfigTrialResources, trialResources) + g.Expect(add(mgr, recFn)).NotTo(gomega.HaveOccurred()) + + // Start test manager. + mgrCtx, cancel := context.WithCancel(context.TODO()) + go func() { + g.Expect(mgr.Start(mgrCtx)).NotTo(gomega.HaveOccurred()) + }() + t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC).`, func(t *testing.T) { g := gomega.NewGomegaWithT(t) gomock.InOrder( - mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), - mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1), + mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil), + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil), mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), ) + mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).AnyTimes() + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() + // Create the Trial with Push MC trial := newFakeTrialBatchJob(commonv1beta1.PushCollector, "test-unavailable-push") trialKey := types.NamespacedName{Name: "test-unavailable-push", Namespace: namespace} g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) + // Update BatchJob status to Complete. + batchJob := &batchv1.Job{} + batchJobCompleteMessage := "BatchJob completed test message" + batchJobCompleteReason := "BatchJob completed test reason" + g.Expect(c.Get(ctx, batchJobKey, batchJob)).NotTo(gomega.HaveOccurred()) + batchJob.Status = batchv1.JobStatus{ + Conditions: []batchv1.JobCondition{ + { + Type: batchv1.JobComplete, + Status: corev1.ConditionTrue, + Message: batchJobCompleteMessage, + Reason: batchJobCompleteReason, + }, + }, + } + g.Expect(c.Status().Update(ctx, batchJob)).NotTo(gomega.HaveOccurred()) + // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. // Metrics unavailable because GetTrialObservationLog returns "unavailable". g.Eventually(func() bool { @@ -358,20 +440,100 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) + // Stop the test manager + cancel() +} + +func TestUnavailablePushMCFailedOnce(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + mockManagerClient := managerclientmock.NewMockManagerClient(mockCtrl) + + // Setup the Manager and Controller. Wrap the Controller Reconcile function so it writes each request to a + // channel when it is finished. + mgr, err := manager.New(cfg, manager.Options{Metrics: metricsserver.Options{BindAddress: "0"}}) + g.Expect(err).NotTo(gomega.HaveOccurred()) + c := mgr.GetClient() + + r := &ReconcileTrial{ + Client: mgr.GetClient(), + scheme: mgr.GetScheme(), + ManagerClient: mockManagerClient, + recorder: mgr.GetEventRecorderFor(ControllerName), + collector: trialutil.NewTrialsCollector(mgr.GetCache(), prometheus.NewRegistry()), + } + + r.updateStatusHandler = func(instance *trialsv1beta1.Trial) error { + var err error = errors.NewBadRequest("fake-error") + // Try to update status until it be succeeded + for err != nil { + updatedInstance := &trialsv1beta1.Trial{} + trialKey := types.NamespacedName{Name: instance.Name, Namespace: namespace} + if err = c.Get(ctx, trialKey, updatedInstance); err != nil { + continue + } + updatedInstance.Status = instance.Status + err = r.updateStatus(updatedInstance) + } + return err + } + + recFn := SetupTestReconcile(r) + // Set Job resource + trialResources := []schema.GroupVersionKind{ + { + Group: "batch", + Version: "v1", + Kind: "Job", + }, + } + + viper.Set(consts.ConfigTrialResources, trialResources) + g.Expect(add(mgr, recFn)).NotTo(gomega.HaveOccurred()) + + // Start test manager. + mgrCtx, cancel := context.WithCancel(context.TODO()) + go func() { + g.Expect(mgr.Start(mgrCtx)).NotTo(gomega.HaveOccurred()) + }() + t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC, failed once).`, func(t *testing.T) { + mockCtrl.Finish() g := gomega.NewGomegaWithT(t) gomock.InOrder( - mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), - mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, errReportMetricsFailed).MinTimes(1), - mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), - mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1), + mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil), + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, errReportMetricsFailed), + mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil), + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil), mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), ) + mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).AnyTimes() + mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() + // Create the Trial with Push MC trial := newFakeTrialBatchJob(commonv1beta1.PushCollector, "test-unavailable-push-failed-once") trialKey := types.NamespacedName{Name: "test-unavailable-push-failed-once", Namespace: namespace} g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) + // Update BatchJob status to Complete. + batchJob := &batchv1.Job{} + batchJobCompleteMessage := "BatchJob completed test message" + batchJobCompleteReason := "BatchJob completed test reason" + g.Expect(c.Get(ctx, batchJobKey, batchJob)).NotTo(gomega.HaveOccurred()) + batchJob.Status = batchv1.JobStatus{ + Conditions: []batchv1.JobCondition{ + { + Type: batchv1.JobComplete, + Status: corev1.ConditionTrue, + Message: batchJobCompleteMessage, + Reason: batchJobCompleteReason, + }, + }, + } + g.Expect(c.Status().Update(ctx, batchJob)).NotTo(gomega.HaveOccurred()) + // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. // Metrics unavailable because GetTrialObservationLog returns "unavailable". g.Eventually(func() bool { @@ -394,10 +556,8 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) - t.Run("Update status for empty Trial", func(t *testing.T) { - g := gomega.NewGomegaWithT(t) - g.Expect(r.updateStatus(&trialsv1beta1.Trial{})).To(gomega.HaveOccurred()) - }) + // Stop the test manager + cancel() } func TestGetObjectiveMetricValue(t *testing.T) { From 60214c850f4bc9fec23d28b0dbb02d656464fc7d Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Mon, 9 Sep 2024 10:27:12 +0000 Subject: [PATCH 24/30] test(trial): fix line error with gofmt. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 00c315408ca..8c580359d42 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -50,13 +50,11 @@ const ( namespace = "default" batchJobName = "test-job" objectiveMetric = "accuracy" - startSignal = "start" - closeSignal = "close" timeout = time.Second * 10 ) var ( - batchJobKey = types.NamespacedName{Name: batchJobName, Namespace: namespace} + batchJobKey = types.NamespacedName{Name: batchJobName, Namespace: namespace} observationLogAvailable = &api_pb.GetObservationLogReply{ ObservationLog: &api_pb.ObservationLog{ MetricLogs: []*api_pb.MetricLog{ From 008a574c17c142740bbff1a13513017d6e46a58f Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Mon, 9 Sep 2024 11:11:08 +0000 Subject: [PATCH 25/30] test(trial): reserve one UT for Push MC. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../trial/trial_controller_test.go | 199 +----------------- 1 file changed, 5 insertions(+), 194 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 8c580359d42..8be490698ed 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -320,183 +320,6 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) - t.Run("Update status for empty Trial", func(t *testing.T) { - g := gomega.NewGomegaWithT(t) - g.Expect(r.updateStatus(&trialsv1beta1.Trial{})).To(gomega.HaveOccurred()) - }) - - // Stop the test manager - cancel() -} - -func TestUnavailablePushMC(t *testing.T) { - g := gomega.NewGomegaWithT(t) - - mockCtrl := gomock.NewController(t) - defer mockCtrl.Finish() - mockManagerClient := managerclientmock.NewMockManagerClient(mockCtrl) - - // Setup the Manager and Controller. Wrap the Controller Reconcile function so it writes each request to a - // channel when it is finished. - mgr, err := manager.New(cfg, manager.Options{Metrics: metricsserver.Options{BindAddress: "0"}}) - g.Expect(err).NotTo(gomega.HaveOccurred()) - c := mgr.GetClient() - - r := &ReconcileTrial{ - Client: mgr.GetClient(), - scheme: mgr.GetScheme(), - ManagerClient: mockManagerClient, - recorder: mgr.GetEventRecorderFor(ControllerName), - collector: trialutil.NewTrialsCollector(mgr.GetCache(), prometheus.NewRegistry()), - } - - r.updateStatusHandler = func(instance *trialsv1beta1.Trial) error { - var err error = errors.NewBadRequest("fake-error") - // Try to update status until it be succeeded - for err != nil { - updatedInstance := &trialsv1beta1.Trial{} - trialKey := types.NamespacedName{Name: instance.Name, Namespace: namespace} - if err = c.Get(ctx, trialKey, updatedInstance); err != nil { - continue - } - updatedInstance.Status = instance.Status - err = r.updateStatus(updatedInstance) - } - return err - } - - recFn := SetupTestReconcile(r) - // Set Job resource - trialResources := []schema.GroupVersionKind{ - { - Group: "batch", - Version: "v1", - Kind: "Job", - }, - } - - viper.Set(consts.ConfigTrialResources, trialResources) - g.Expect(add(mgr, recFn)).NotTo(gomega.HaveOccurred()) - - // Start test manager. - mgrCtx, cancel := context.WithCancel(context.TODO()) - go func() { - g.Expect(mgr.Start(mgrCtx)).NotTo(gomega.HaveOccurred()) - }() - - t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC).`, func(t *testing.T) { - g := gomega.NewGomegaWithT(t) - gomock.InOrder( - mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil), - mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil), - mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil), - ) - mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).AnyTimes() - mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() - - // Create the Trial with Push MC - trial := newFakeTrialBatchJob(commonv1beta1.PushCollector, "test-unavailable-push") - trialKey := types.NamespacedName{Name: "test-unavailable-push", Namespace: namespace} - g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) - - // Update BatchJob status to Complete. - batchJob := &batchv1.Job{} - batchJobCompleteMessage := "BatchJob completed test message" - batchJobCompleteReason := "BatchJob completed test reason" - g.Expect(c.Get(ctx, batchJobKey, batchJob)).NotTo(gomega.HaveOccurred()) - batchJob.Status = batchv1.JobStatus{ - Conditions: []batchv1.JobCondition{ - { - Type: batchv1.JobComplete, - Status: corev1.ConditionTrue, - Message: batchJobCompleteMessage, - Reason: batchJobCompleteReason, - }, - }, - } - g.Expect(c.Status().Update(ctx, batchJob)).NotTo(gomega.HaveOccurred()) - - // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. - // Metrics unavailable because GetTrialObservationLog returns "unavailable". - g.Eventually(func() bool { - if err = c.Get(ctx, trialKey, trial); err != nil { - return false - } - return trial.IsMetricsUnavailable() && - len(trial.Status.Observation.Metrics) > 0 && - trial.Status.Observation.Metrics[0].Min == consts.UnavailableMetricValue && - trial.Status.Observation.Metrics[0].Max == consts.UnavailableMetricValue && - trial.Status.Observation.Metrics[0].Latest == consts.UnavailableMetricValue - }, timeout).Should(gomega.BeTrue()) - - // Delete the Trial - g.Expect(c.Delete(ctx, trial)).NotTo(gomega.HaveOccurred()) - - // Expect that Trial is deleted - g.Eventually(func() bool { - return errors.IsNotFound(c.Get(ctx, trialKey, &trialsv1beta1.Trial{})) - }, timeout).Should(gomega.BeTrue()) - }) - - // Stop the test manager - cancel() -} - -func TestUnavailablePushMCFailedOnce(t *testing.T) { - g := gomega.NewGomegaWithT(t) - - mockCtrl := gomock.NewController(t) - defer mockCtrl.Finish() - mockManagerClient := managerclientmock.NewMockManagerClient(mockCtrl) - - // Setup the Manager and Controller. Wrap the Controller Reconcile function so it writes each request to a - // channel when it is finished. - mgr, err := manager.New(cfg, manager.Options{Metrics: metricsserver.Options{BindAddress: "0"}}) - g.Expect(err).NotTo(gomega.HaveOccurred()) - c := mgr.GetClient() - - r := &ReconcileTrial{ - Client: mgr.GetClient(), - scheme: mgr.GetScheme(), - ManagerClient: mockManagerClient, - recorder: mgr.GetEventRecorderFor(ControllerName), - collector: trialutil.NewTrialsCollector(mgr.GetCache(), prometheus.NewRegistry()), - } - - r.updateStatusHandler = func(instance *trialsv1beta1.Trial) error { - var err error = errors.NewBadRequest("fake-error") - // Try to update status until it be succeeded - for err != nil { - updatedInstance := &trialsv1beta1.Trial{} - trialKey := types.NamespacedName{Name: instance.Name, Namespace: namespace} - if err = c.Get(ctx, trialKey, updatedInstance); err != nil { - continue - } - updatedInstance.Status = instance.Status - err = r.updateStatus(updatedInstance) - } - return err - } - - recFn := SetupTestReconcile(r) - // Set Job resource - trialResources := []schema.GroupVersionKind{ - { - Group: "batch", - Version: "v1", - Kind: "Job", - }, - } - - viper.Set(consts.ConfigTrialResources, trialResources) - g.Expect(add(mgr, recFn)).NotTo(gomega.HaveOccurred()) - - // Start test manager. - mgrCtx, cancel := context.WithCancel(context.TODO()) - go func() { - g.Expect(mgr.Start(mgrCtx)).NotTo(gomega.HaveOccurred()) - }() - t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC, failed once).`, func(t *testing.T) { mockCtrl.Finish() g := gomega.NewGomegaWithT(t) @@ -515,23 +338,6 @@ func TestUnavailablePushMCFailedOnce(t *testing.T) { trialKey := types.NamespacedName{Name: "test-unavailable-push-failed-once", Namespace: namespace} g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred()) - // Update BatchJob status to Complete. - batchJob := &batchv1.Job{} - batchJobCompleteMessage := "BatchJob completed test message" - batchJobCompleteReason := "BatchJob completed test reason" - g.Expect(c.Get(ctx, batchJobKey, batchJob)).NotTo(gomega.HaveOccurred()) - batchJob.Status = batchv1.JobStatus{ - Conditions: []batchv1.JobCondition{ - { - Type: batchv1.JobComplete, - Status: corev1.ConditionTrue, - Message: batchJobCompleteMessage, - Reason: batchJobCompleteReason, - }, - }, - } - g.Expect(c.Status().Update(ctx, batchJob)).NotTo(gomega.HaveOccurred()) - // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. // Metrics unavailable because GetTrialObservationLog returns "unavailable". g.Eventually(func() bool { @@ -554,6 +360,11 @@ func TestUnavailablePushMCFailedOnce(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) + t.Run("Update status for empty Trial", func(t *testing.T) { + g := gomega.NewGomegaWithT(t) + g.Expect(r.updateStatus(&trialsv1beta1.Trial{})).To(gomega.HaveOccurred()) + }) + // Stop the test manager cancel() } From ffe6089bf48dada01d396de0b27be791ec381ce9 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 10 Sep 2024 08:52:44 +0000 Subject: [PATCH 26/30] test(trial): fix typo error. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 8be490698ed..48116e6d20c 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -235,7 +235,7 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) - t.Run(`Trail with "Complete" BatchJob and Available metrics.`, func(t *testing.T) { + t.Run(`Trial with "Complete" BatchJob and Available metrics.`, func(t *testing.T) { g := gomega.NewGomegaWithT(t) gomock.InOrder( mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogAvailable, nil).MinTimes(1), @@ -287,7 +287,7 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) - t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(StdOut MC).`, func(t *testing.T) { + t.Run(`Trial with "Complete" BatchJob and Unavailable metrics(StdOut MC).`, func(t *testing.T) { g := gomega.NewGomegaWithT(t) gomock.InOrder( mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(1), @@ -320,7 +320,7 @@ func TestReconcileBatchJob(t *testing.T) { }, timeout).Should(gomega.BeTrue()) }) - t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC, failed once).`, func(t *testing.T) { + t.Run(`Trial with "Complete" BatchJob and Unavailable metrics(Push MC, failed once).`, func(t *testing.T) { mockCtrl.Finish() g := gomega.NewGomegaWithT(t) gomock.InOrder( From ddd2cb366e882c1596b1c4b8d25152b301f5e575 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Wed, 11 Sep 2024 16:07:20 +0000 Subject: [PATCH 27/30] test(trial): make some tiny changes. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 48116e6d20c..ab4c79708ea 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -168,7 +168,7 @@ func TestReconcileBatchJob(t *testing.T) { viper.Set(consts.ConfigTrialResources, trialResources) g.Expect(add(mgr, recFn)).NotTo(gomega.HaveOccurred()) - // Start test manager. + // Start test manager mgrCtx, cancel := context.WithCancel(context.TODO()) go func() { g.Expect(mgr.Start(mgrCtx)).NotTo(gomega.HaveOccurred()) From 6a7a528e15da31cd1f675927a0596eff5ef1023f Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Thu, 19 Sep 2024 02:53:52 +0000 Subject: [PATCH 28/30] fix(trial): move cancel func to t.Cleanup. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- pkg/controller.v1beta1/trial/trial_controller_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index ab4c79708ea..8258806f2eb 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -170,6 +170,7 @@ func TestReconcileBatchJob(t *testing.T) { // Start test manager mgrCtx, cancel := context.WithCancel(context.TODO()) + t.Cleanup(cancel) go func() { g.Expect(mgr.Start(mgrCtx)).NotTo(gomega.HaveOccurred()) }() @@ -364,9 +365,6 @@ func TestReconcileBatchJob(t *testing.T) { g := gomega.NewGomegaWithT(t) g.Expect(r.updateStatus(&trialsv1beta1.Trial{})).To(gomega.HaveOccurred()) }) - - // Stop the test manager - cancel() } func TestGetObjectiveMetricValue(t *testing.T) { From 7cb4e3e1624059eb461cf3e59f02e09920c2b9a0 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Thu, 19 Sep 2024 03:03:28 +0000 Subject: [PATCH 29/30] fix(trial): use the propagated gomega instance to improve debuggability. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../trial/trial_controller_test.go | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 8258806f2eb..3a7a3d527fc 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -301,16 +301,17 @@ func TestReconcileBatchJob(t *testing.T) { // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. // Metrics unavailable because GetTrialObservationLog returns "unavailable". - g.Eventually(func() bool { - if err = c.Get(ctx, trialKey, trial); err != nil { - return false - } - return trial.IsMetricsUnavailable() && - len(trial.Status.Observation.Metrics) > 0 && - trial.Status.Observation.Metrics[0].Min == consts.UnavailableMetricValue && - trial.Status.Observation.Metrics[0].Max == consts.UnavailableMetricValue && - trial.Status.Observation.Metrics[0].Latest == consts.UnavailableMetricValue - }, timeout).Should(gomega.BeTrue()) + g.Eventually(func(g gomega.Gomega) { + g.Expect(c.Get(ctx, trialKey, trial)).Should(gomega.Succeed()) + g.Expect(trial.IsMetricsUnavailable()).Should(gomega.BeTrue()) + g.Expect(trial.Status.Observation.Metrics).ShouldNot(gomega.HaveLen(0)) + g.Expect(trial.Status.Observation.Metrics[0]).Should(gomega.BeComparableTo(commonv1beta1.Metric{ + Name: objectiveMetric, + Min: consts.UnavailableMetricValue, + Max: consts.UnavailableMetricValue, + Latest: consts.UnavailableMetricValue, + })) + }, timeout).Should(gomega.Succeed()) // Delete the Trial g.Expect(c.Delete(ctx, trial)).NotTo(gomega.HaveOccurred()) @@ -341,16 +342,17 @@ func TestReconcileBatchJob(t *testing.T) { // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason. // Metrics unavailable because GetTrialObservationLog returns "unavailable". - g.Eventually(func() bool { - if err = c.Get(ctx, trialKey, trial); err != nil { - return false - } - return trial.IsMetricsUnavailable() && - len(trial.Status.Observation.Metrics) > 0 && - trial.Status.Observation.Metrics[0].Min == consts.UnavailableMetricValue && - trial.Status.Observation.Metrics[0].Max == consts.UnavailableMetricValue && - trial.Status.Observation.Metrics[0].Latest == consts.UnavailableMetricValue - }, timeout).Should(gomega.BeTrue()) + g.Eventually(func(g gomega.Gomega) { + g.Expect(c.Get(ctx, trialKey, trial)).Should(gomega.Succeed()) + g.Expect(trial.IsMetricsUnavailable()).Should(gomega.BeTrue()) + g.Expect(trial.Status.Observation.Metrics).ShouldNot(gomega.HaveLen(0)) + g.Expect(trial.Status.Observation.Metrics[0]).Should(gomega.BeComparableTo(commonv1beta1.Metric{ + Name: objectiveMetric, + Min: consts.UnavailableMetricValue, + Max: consts.UnavailableMetricValue, + Latest: consts.UnavailableMetricValue, + })) + }, timeout).Should(gomega.Succeed()) // Delete the Trial g.Expect(c.Delete(ctx, trial)).NotTo(gomega.HaveOccurred()) From 9604ed49b9bc3d98768e162f5489956fc832ac2f Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Thu, 19 Sep 2024 03:04:02 +0000 Subject: [PATCH 30/30] fix(trial): use gofmt to reformat code. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../trial/trial_controller_test.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 3a7a3d527fc..00ac7764b5e 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -306,10 +306,10 @@ func TestReconcileBatchJob(t *testing.T) { g.Expect(trial.IsMetricsUnavailable()).Should(gomega.BeTrue()) g.Expect(trial.Status.Observation.Metrics).ShouldNot(gomega.HaveLen(0)) g.Expect(trial.Status.Observation.Metrics[0]).Should(gomega.BeComparableTo(commonv1beta1.Metric{ - Name: objectiveMetric, - Min: consts.UnavailableMetricValue, - Max: consts.UnavailableMetricValue, - Latest: consts.UnavailableMetricValue, + Name: objectiveMetric, + Min: consts.UnavailableMetricValue, + Max: consts.UnavailableMetricValue, + Latest: consts.UnavailableMetricValue, })) }, timeout).Should(gomega.Succeed()) @@ -347,10 +347,10 @@ func TestReconcileBatchJob(t *testing.T) { g.Expect(trial.IsMetricsUnavailable()).Should(gomega.BeTrue()) g.Expect(trial.Status.Observation.Metrics).ShouldNot(gomega.HaveLen(0)) g.Expect(trial.Status.Observation.Metrics[0]).Should(gomega.BeComparableTo(commonv1beta1.Metric{ - Name: objectiveMetric, - Min: consts.UnavailableMetricValue, - Max: consts.UnavailableMetricValue, - Latest: consts.UnavailableMetricValue, + Name: objectiveMetric, + Min: consts.UnavailableMetricValue, + Max: consts.UnavailableMetricValue, + Latest: consts.UnavailableMetricValue, })) }, timeout).Should(gomega.Succeed())