From 6ec23fa247809506e20c608d05665c1f55b1822a Mon Sep 17 00:00:00 2001 From: Pauline Ribeyre <4224001+paulineribeyre@users.noreply.github.com> Date: Thu, 18 Jul 2024 13:45:30 -0500 Subject: [PATCH] MIDRC-536 MIDRC-653 Add expiration bucket lifecycle rule (#112) --- doc/howto/configuration.md | 2 ++ hatchery/authz.go | 13 +++--------- hatchery/authz_test.go | 4 ++-- hatchery/config.go | 1 + hatchery/helpers.go | 3 +++ hatchery/nextflow.go | 41 +++++++++++++++++++++++++++++++------- 6 files changed, 45 insertions(+), 19 deletions(-) diff --git a/doc/howto/configuration.md b/doc/howto/configuration.md index e53db294..b9112479 100644 --- a/doc/howto/configuration.md +++ b/doc/howto/configuration.md @@ -33,6 +33,7 @@ An example manifest entry may look like "lifecycle-pre-stop": ["su", "-c", "cd /data; for f in *; do fusermount -u $f; rm -rf $f; done", "-s", "/bin/sh", "jovyan"] }, "nextflow-global": { + "s3-objects-expiration-days": 30, "sample-config-public-image": "", "imagebuilder-reader-role-arn": "" }, @@ -126,6 +127,7 @@ An example manifest entry may look like * `command` a string array as the command to run in the container overriding the default. * `lifecycle-pre-stop` a string array as the container prestop command. * `nextflow-global` is for global configuration specific to Nextflow containers. + * `s3-objects-expiration-days` (int, default 30): objects created in S3 by Nextflow are deleted after the specified number of days. * `sample-config-public-image`: a publicly-accessible image that any user can pull to test Nextflow workflows. Will be mentioned in the auto-generated sample configuration and documentation when a user launches a Nextflow workspace. * `imagebuilder-reader-role-arn`: see the [nextflow-global.imagebuilder-reader-role-arn section](/doc/explanation/nextflow.md#nextflow-globalimagebuilder-reader-role-arn) of the Nextflow workspaces documentation. * `containers` is the list of workspaces available to be run by this instance of Hatchery. Each container must be a single image and expose a web server. diff --git a/hatchery/authz.go b/hatchery/authz.go index b7dedfaf..5421aa04 100644 --- a/hatchery/authz.go +++ b/hatchery/authz.go @@ -233,14 +233,14 @@ var isUserAuthorizedForPayModels = func(userName string, allowedPayModels []stri var isUserAuthorizedForResourcePaths = func(userName string, accessToken string, resourcePaths []string) (bool, error) { Config.Logger.Printf("DEBUG: Checking user '%s' access to resource paths %v (service 'jupyterhub', method 'launch')", userName, resourcePaths) - body := "{ \"requests\": [" + body := fmt.Sprintf("{\"user\": {\"token\": \"%s\"}, \"requests\": [", accessToken) for _, resource := range resourcePaths { body += fmt.Sprintf("{\"resource\": \"%s\", \"action\": {\"service\": \"jupyterhub\", \"method\": \"launch\"}},", resource) } body = body[:len(body)-1] // remove the last trailing comma body += "]}" - authorized, err := arboristAuthRequest(accessToken, body) + authorized, err := arboristAuthRequest(body) if err != nil { Config.Logger.Printf("something went wrong when making a call to arborist's `/auth/request` endpoint. Denying access. Details: %v", err.Error()) return false, nil @@ -249,20 +249,13 @@ var isUserAuthorizedForResourcePaths = func(userName string, accessToken string, return authorized, nil } -var arboristAuthRequest = func(accessToken string, body string) (bool, error) { +var arboristAuthRequest = func(body string) (bool, error) { arboristUrl := "http://arborist-service/auth/request" req, err := http.NewRequest("POST", arboristUrl, bytes.NewBufferString(body)) if err != nil { return false, errors.New("Error occurred while generating HTTP request: " + err.Error()) } - headers := map[string]string{ - "Authorization": fmt.Sprintf("Bearer %s", accessToken), - } - for k, v := range headers { - req.Header.Add(k, v) - } - client := &http.Client{Timeout: 10 * time.Second} resp, err := client.Do(req) if err != nil { diff --git a/hatchery/authz_test.go b/hatchery/authz_test.go index 888da8c9..501cad3f 100644 --- a/hatchery/authz_test.go +++ b/hatchery/authz_test.go @@ -256,7 +256,7 @@ func TestIsUserAuthorizedForResourcePaths(t *testing.T) { } resourcePaths := []string{"/workspace/abc", "/workspace/xyz"} - expectedRequestBody := "{ \"requests\": [{\"resource\": \"/workspace/abc\", \"action\": {\"service\": \"jupyterhub\", \"method\": \"launch\"}},{\"resource\": \"/workspace/xyz\", \"action\": {\"service\": \"jupyterhub\", \"method\": \"launch\"}}]}" + expectedRequestBody := "{\"user\": {\"token\": \"accessToken\"}, \"requests\": [{\"resource\": \"/workspace/abc\", \"action\": {\"service\": \"jupyterhub\", \"method\": \"launch\"}},{\"resource\": \"/workspace/xyz\", \"action\": {\"service\": \"jupyterhub\", \"method\": \"launch\"}}]}" originalArboristAuthRequest := arboristAuthRequest defer func() { @@ -267,7 +267,7 @@ func TestIsUserAuthorizedForResourcePaths(t *testing.T) { t.Logf("Running test case: '%s'", testCase.name) // mock the call to arborist - arboristAuthRequest = func(accessToken string, body string) (bool, error) { + arboristAuthRequest = func(body string) (bool, error) { if testCase.arboristError { return false, fmt.Errorf("mocking an error while making call to arborist") } diff --git a/hatchery/config.go b/hatchery/config.go index 5eb77622..64c27c88 100644 --- a/hatchery/config.go +++ b/hatchery/config.go @@ -13,6 +13,7 @@ import ( // Global configuration shared by all Nextflow containers type NextflowGlobalConfig struct { + S3ObjectsExpirationDays int `json:"s3-objects-expiration-days"` SampleConfigPublicImage string `json:"sample-config-public-image"` ImageBuilderReaderRoleArn string `json:"imagebuilder-reader-role-arn"` } diff --git a/hatchery/helpers.go b/hatchery/helpers.go index b506280a..4fff1bc7 100644 --- a/hatchery/helpers.go +++ b/hatchery/helpers.go @@ -218,6 +218,9 @@ func getAwsAccountId(sess *session.Session, awsConfig *aws.Config) (string, erro if err != nil { return "", err } + if *req.Account == "" { + return "", fmt.Errorf("unable to find AWS account ID: STS GetCallerIdentity returned: %v", *req) + } return *req.Account, nil } diff --git a/hatchery/nextflow.go b/hatchery/nextflow.go index 4cc6acc9..cfa34004 100644 --- a/hatchery/nextflow.go +++ b/hatchery/nextflow.go @@ -26,7 +26,6 @@ import ( General TODOS: - Make the AWS region configurable in the hatchery config (although ideally, the user should be able to choose) (MIDRC-743) - Make the `roleArn` configurable (MIDRC-744) -- The contents of `s3:///` are not deleted because researchers may need to keep the intermediary files. We should set bucket lifecycle rules to delete after X days. (MIDRC-653 and MIDRC-536) - Can we do this long setup as a separate workspace launch step, instead of in the launch() function? (MIDRC-745) */ @@ -118,14 +117,14 @@ func createNextflowResources(userName string, nextflowGlobalConfig NextflowGloba } // Create nextflow compute environment if it does not exist - batchComputeEnvArn, err := createBatchComputeEnvironment(nextflowGlobalConfig, nextflowConfig, userName, hostname, tagsMap, batchSvc, ec2Svc, iamSvc, *vpcid, *subnetids, payModel, awsAccountId) + batchComputeEnvArn, err := createBatchComputeEnvironment(nextflowGlobalConfig, nextflowConfig, userName, hostname, tagsMap, batchSvc, ec2Svc, iamSvc, *vpcid, *subnetids) if err != nil { Config.Logger.Printf("Error creating compute environment for user %s: %s", userName, err.Error()) return "", "", err } // Create S3 bucket - kmsKeyArn, err := createS3bucket(s3Svc, kmsSvc, bucketName, kmsTags) + kmsKeyArn, err := createS3bucket(nextflowGlobalConfig, s3Svc, kmsSvc, bucketName, kmsTags) if err != nil { Config.Logger.Printf("Error creating S3 bucket '%s': %v", bucketName, err) return "", "", err @@ -445,10 +444,11 @@ var getNextflowAwsSettings = func(sess *session.Session, payModel *PayModel, use Config.Logger.Printf("Info: pay model disabled for user '%s': %s Nextflow resources in main AWS account", userName, action) awsConfig = aws.Config{} Config.Logger.Printf("Debug: Getting AWS account ID...") - awsAccountId, err := getAwsAccountId(sess, &awsConfig) + var err error + awsAccountId, err = getAwsAccountId(sess, &awsConfig) if err != nil { Config.Logger.Printf("Error getting AWS account ID: %v", err) - return awsAccountId, awsConfig, err + return "", awsConfig, err } } return awsAccountId, awsConfig, nil @@ -605,7 +605,7 @@ func ensureLaunchTemplate(ec2Svc *ec2.EC2, userName string, hostname string, job } // Create AWS Batch compute environment -func createBatchComputeEnvironment(nextflowGlobalConfig NextflowGlobalConfig, nextflowConfig NextflowConfig, userName string, hostname string, tagsMap map[string]*string, batchSvc *batch.Batch, ec2Svc *ec2.EC2, iamSvc *iam.IAM, vpcid string, subnetids []string, payModel *PayModel, awsAccountId string) (string, error) { +func createBatchComputeEnvironment(nextflowGlobalConfig NextflowGlobalConfig, nextflowConfig NextflowConfig, userName string, hostname string, tagsMap map[string]*string, batchSvc *batch.Batch, ec2Svc *ec2.EC2, iamSvc *iam.IAM, vpcid string, subnetids []string) (string, error) { instanceProfileArn, err := createEcsInstanceProfile(iamSvc, fmt.Sprintf("%s-nf-ecsInstanceRole", hostname)) if err != nil { Config.Logger.Printf("Unable to create ECS instance profile: %s", err.Error()) @@ -692,6 +692,7 @@ func createBatchComputeEnvironment(nextflowGlobalConfig NextflowGlobalConfig, ne return "", err } } else { // compute environment does not exist, create it + Config.Logger.Printf("Debug: Batch compute environment '%s' does not exist, creating it", batchComputeEnvName) subnets := []*string{} for _, subnet := range subnetids { s := subnet @@ -861,7 +862,7 @@ func createEcsInstanceProfile(iamSvc *iam.IAM, name string) (*string, error) { return instanceProfile.InstanceProfile.Arn, nil } -func createS3bucket(s3Svc *s3.S3, kmsSvc *kms.KMS, bucketName string, kmsTags []*kms.Tag) (string, error) { +func createS3bucket(nextflowGlobalConfig NextflowGlobalConfig, s3Svc *s3.S3, kmsSvc *kms.KMS, bucketName string, kmsTags []*kms.Tag) (string, error) { // create S3 bucket for nextflow input, output and intermediate files _, err := s3Svc.CreateBucket(&s3.CreateBucketInput{ Bucket: &bucketName, @@ -964,6 +965,32 @@ func createS3bucket(s3Svc *s3.S3, kmsSvc *kms.KMS, bucketName string, kmsTags [] return "", err } + expirationDays := nextflowGlobalConfig.S3ObjectsExpirationDays + if expirationDays <= 0 { + expirationDays = 30 + } + Config.Logger.Printf("DEBUG: Setting bucket objects expiration to %d days", expirationDays) + _, err = s3Svc.PutBucketLifecycleConfiguration(&s3.PutBucketLifecycleConfigurationInput{ + Bucket: &bucketName, + LifecycleConfiguration: &s3.BucketLifecycleConfiguration{ + Rules: []*s3.LifecycleRule{ + { + Expiration: &s3.LifecycleExpiration{ + Days: aws.Int64(int64(expirationDays)), + }, + Status: aws.String("Enabled"), + Filter: &s3.LifecycleRuleFilter{ + Prefix: aws.String(""), // apply to all objects + }, + }, + }, + }, + }) + if err != nil { + Config.Logger.Printf("Unable to set lifecycle configuration: %v", err) + return "", err + } + Config.Logger.Printf("DEBUG: Done setting up S3 bucket!") return *kmsKeyArn, nil }