From e527df1d2e6f189804543592dde715c8dcd4d574 Mon Sep 17 00:00:00 2001
From: Jason Ng
Date: Mon, 30 Oct 2023 18:03:12 -0700
Subject: [PATCH] add metastore module (#528)

---
 databricks-metastore/README.md    |  60 ++++++++++++
 databricks-metastore/main.tf      |  75 +++++++++++++++
 databricks-metastore/outputs.tf   |   4 +
 databricks-metastore/provider.tf  |  10 ++
 databricks-metastore/s3.tf        | 148 ++++++++++++++++++++++++++++++
 databricks-metastore/variables.tf |  68 ++++++++++++++
 databricks-metastore/versions.tf  |  11 +++
 7 files changed, 376 insertions(+)
 create mode 100644 databricks-metastore/README.md
 create mode 100644 databricks-metastore/main.tf
 create mode 100644 databricks-metastore/outputs.tf
 create mode 100644 databricks-metastore/provider.tf
 create mode 100644 databricks-metastore/s3.tf
 create mode 100644 databricks-metastore/variables.tf
 create mode 100644 databricks-metastore/versions.tf

diff --git a/databricks-metastore/README.md b/databricks-metastore/README.md
new file mode 100644
index 00000000..b28d903e
--- /dev/null
+++ b/databricks-metastore/README.md
@@ -0,0 +1,60 @@
+
+## Requirements
+
+| Name | Version |
+|------|---------|
+| [terraform](#requirement\_terraform) | >= 1.3.0 |
+
+## Providers
+
+| Name | Version |
+|------|---------|
+| [aws](#provider\_aws) | n/a |
+| [databricks.workspace](#provider\_databricks.workspace) | n/a |
+
+## Modules
+
+No modules.
+
+## Resources
+
+| Name | Type |
+|------|------|
+| [aws_iam_policy.metastore_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
+| [aws_iam_role.metastore_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
+| [aws_iam_role_policy_attachment.metastore_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
+| [aws_kms_alias.metastore_key_alias](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_alias) | resource |
+| [aws_kms_key.metastore_key](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_key) | resource |
+| [aws_s3_bucket.metastore](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket) | resource |
+| [databricks_catalog.sandbox](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/catalog) | resource |
+| [databricks_grants.this](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/grants) | resource |
+| [databricks_metastore.this](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/metastore) | resource |
+| [databricks_metastore_assignment.this](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/metastore_assignment) | resource |
+| [databricks_metastore_data_access.metastore_data_access](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/metastore_data_access) | resource |
+| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
+| [aws_iam_policy_document.metastore_assumerole_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+| [aws_iam_policy_document.metastore_role_access_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| [admin\_groups](#input\_admin\_groups) | List of databricks groups to grant admin access for metastore; includes owner by default | `list(string)` | `[]` | no |
+| [databricks\_external\_id](#input\_databricks\_external\_id) | External ID for Databricks account | `string` | n/a | yes |
+| [deletion\_window\_in\_days](#input\_deletion\_window\_in\_days) | Deletion window in days for S3 encryption key | `number` | `7` | no |
+| [delta\_sharing\_recipient\_token\_lifetime\_in\_seconds](#input\_delta\_sharing\_recipient\_token\_lifetime\_in\_seconds) | Lifetime of delta sharing recipient token in seconds | `number` | `3600` | no |
+| [delta\_sharing\_scope](#input\_delta\_sharing\_scope) | Delta sharing scope | `string` | `"INTERNAL"` | no |
+| [enable\_key\_rotation](#input\_enable\_key\_rotation) | Enable key rotation for S3 encryption key | `bool` | `true` | no |
+| [force\_destroy](#input\_force\_destroy) | Force destroy metastore if data exists | `bool` | `false` | no |
+| [owner](#input\_owner) | Owner of the metastore; should be a group display name | `string` | `"data-infra-admin"` | no |
+| [powerusers](#input\_powerusers) | List of databricks groups to grant poweruser access for metastore | `list(string)` | `["powerusers"]` | no |
+| [tags](#input\_tags) | Fogg generated tags for the environment | `object({ project : string, env : string, service : string, owner : string })` | n/a | yes |
+| [workspace\_url](#input\_workspace\_url) | URL of the workspace to use to create this metastore | `string` | n/a | yes |
+| [workspaces](#input\_workspaces) | Map of workspace names to ids to associate with this metastore | `map(string)` | `{}` | no |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| [metastore\_id](#output\_metastore\_id) | ID of the metastore |
+
\ No newline at end of file
diff --git a/databricks-metastore/main.tf b/databricks-metastore/main.tf
new file mode 100644
index 00000000..964b2f37
--- /dev/null
+++ b/databricks-metastore/main.tf
@@ -0,0 +1,75 @@
+locals {
+  name          = "${var.tags.project}-${var.tags.env}-${var.tags.service}"
+  admins        = toset(concat(var.admin_groups, [var.owner]))
+  workspace_ids = values(var.workspaces)
+
+  admin_privileges     = ["CREATE_CATALOG", "CREATE_EXTERNAL_LOCATION", "CREATE_SHARE", "CREATE_RECIPIENT", "CREATE_PROVIDER"]
+  poweruser_privileges = ["CREATE_CATALOG", "CREATE_SHARE"]
+
+  # Privileges per principal for the metastore. Admin privileges are a superset of
+  # the poweruser privileges, so admins are merged last and win when a group is in
+  # both lists.
+  metastore_grants = merge(
+    { for group in toset(var.powerusers) : group => local.poweruser_privileges },
+    { for group in local.admins : group => local.admin_privileges },
+  )
+}
+
+resource "databricks_metastore" "this" {
+  provider                                          = databricks.workspace
+  name                                              = "${local.name}-metastore"
+  storage_root                                      = "s3://${aws_s3_bucket.metastore.id}/metastore"
+  owner                                             = var.owner
+  delta_sharing_scope                               = var.delta_sharing_scope
+  delta_sharing_recipient_token_lifetime_in_seconds = var.delta_sharing_recipient_token_lifetime_in_seconds
+  force_destroy                                     = var.force_destroy
+}
+
+resource "databricks_metastore_assignment" "this" {
+  provider     = databricks.workspace
+  for_each     = toset(local.workspace_ids)
+  metastore_id = databricks_metastore.this.id
+  workspace_id = each.value
+}
+
+# databricks_grants is authoritative for its securable: it overwrites grants it does
+# not declare. All metastore grants therefore live in this single resource instead of
+# one resource per principal, which would overwrite each other on every apply.
+resource "databricks_grants" "this" {
+  provider   = databricks.workspace
+  depends_on = [databricks_metastore_assignment.this]
+  metastore  = databricks_metastore.this.id
+
+  dynamic "grant" {
+    for_each = local.metastore_grants
+    content {
+      principal  = grant.key
+      privileges = grant.value
+    }
+  }
+}
+
+resource "databricks_metastore_data_access" "metastore_data_access" {
+  provider     = databricks.workspace
+  depends_on   = [databricks_metastore_assignment.this]
+  metastore_id = databricks_metastore.this.id
+  name         = aws_iam_role.metastore_access.name
+  aws_iam_role { role_arn = aws_iam_role.metastore_access.arn }
+  is_default   = true
+}
+
+resource "databricks_catalog" "sandbox" {
+  provider = databricks.workspace
+  # Created through the workspace API, so the workspace assignment and the default
+  # storage credential are ordered before the catalog.
+  depends_on = [
+    databricks_metastore_assignment.this,
+    databricks_metastore_data_access.metastore_data_access,
+  ]
+  metastore_id = databricks_metastore.this.id
+  name         = "sandbox"
+  comment      = "this catalog is managed by terraform"
+  properties = {
+    purpose = "testing"
+  }
+}
diff --git a/databricks-metastore/outputs.tf b/databricks-metastore/outputs.tf
new file mode 100644
index 00000000..ec468927
--- /dev/null
+++ b/databricks-metastore/outputs.tf
@@ -0,0 +1,4 @@
+output "metastore_id" {
+  description = "ID of the metastore"
+  value       = databricks_metastore.this.id
+}
diff --git a/databricks-metastore/provider.tf b/databricks-metastore/provider.tf
new file mode 100644
index 00000000..50107158
--- /dev/null
+++ b/databricks-metastore/provider.tf
@@ -0,0 +1,10 @@
+provider "databricks" {
+  alias      = "mws"
+  host       = "https://accounts.cloud.databricks.com"
+  account_id = var.databricks_external_id
+}
+
+provider "databricks" {
+  alias = "workspace"
+  host  = var.workspace_url
+}
diff --git a/databricks-metastore/s3.tf b/databricks-metastore/s3.tf
new file mode 100644
index 00000000..cb89ba40
--- /dev/null
+++ b/databricks-metastore/s3.tf
@@ -0,0 +1,148 @@
+## Sets up a metastore for use with Databricks Unity Catalog
+## https://docs.databricks.com/data-governance/unity-catalog/get-started.html
+
+locals {
+  metastore_access_role_name = "${local.name}-access"
+}
+
+## Bucket which will be used for the metastore, with KMS for encryption
+
+resource "aws_s3_bucket" "metastore" {
+  bucket = local.name
+  tags   = var.tags
+  server_side_encryption_configuration {
+    rule {
+      apply_server_side_encryption_by_default {
+        kms_master_key_id = aws_kms_key.metastore_key.arn
+        sse_algorithm     = "aws:kms"
+      }
+    }
+  }
+}
+
+resource "aws_kms_key" "metastore_key" {
+  description             = "KMS key for ${local.name}"
+  deletion_window_in_days = var.deletion_window_in_days
+  enable_key_rotation     = var.enable_key_rotation
+  tags                    = var.tags
+}
+
+resource "aws_kms_alias" "metastore_key_alias" {
+  name          = "alias/${local.name}-key"
+  target_key_id = aws_kms_key.metastore_key.id
+}
+
+## Allow Databricks role to assume our role
+
+data "aws_caller_identity" "current" {}
+
+data "aws_iam_policy_document" "metastore_assumerole_policy" {
+  statement {
+    effect = "Allow"
+    actions = [
+      "sts:AssumeRole"
+    ]
+    principals {
+      type = "AWS"
+      identifiers = [
+        # Default role for all databricks accounts https://docs.databricks.com/data-governance/unity-catalog/automate.html#configure-storage-for-a-metastore
+        "arn:aws:iam::414351767826:role/unity-catalog-prod-UCMasterRole-14S5ZJVKOTYTL"
+      ]
+    }
+    condition {
+      test     = "StringEquals"
+      variable = "sts:ExternalId"
+      values = [
+        # This is our non-education account number
+        var.databricks_external_id
+      ]
+    }
+  }
+  # AWS introduced a new change 6/30/23 that requires IAM policies to self-reference and allow the role to
+  # assume itself. We can't just use the arn as-is since the role might not exist yet
+  # https://docs.databricks.com/data-governance/unity-catalog/get-started.html#configure-a-storage-bucket-and-iam-role-in-aws
+  statement {
+    effect = "Allow"
+    actions = [
+      "sts:AssumeRole"
+    ]
+    principals {
+      type = "AWS"
+      identifiers = [
+        "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"
+      ]
+    }
+    condition {
+      test     = "ArnEquals"
+      variable = "aws:PrincipalArn"
+      values   = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${local.metastore_access_role_name}"]
+    }
+  }
+}
+
+
+## Create role which will be assumed by Databricks Unity Catalog
+
+resource "aws_iam_role" "metastore_access" {
+  name               = local.metastore_access_role_name
+  assume_role_policy = data.aws_iam_policy_document.metastore_assumerole_policy.json
+  tags               = var.tags
+}
+
+
+## Allow our role access S3 and KMS
+
+# aws_iam_role_policy_attachment is non-exclusive; aws_iam_policy_attachment would
+# detach this policy from any principal attached outside of that resource.
+resource "aws_iam_role_policy_attachment" "metastore_access" {
+  role       = aws_iam_role.metastore_access.name
+  policy_arn = aws_iam_policy.metastore_access.arn
+}
+
+resource "aws_iam_policy" "metastore_access" {
+  name        = "${local.name}-s3-kms-access"
+  description = "Allow access to the ${local.name} bucket"
+  policy      = data.aws_iam_policy_document.metastore_role_access_policy.json
+}
+
+data "aws_iam_policy_document" "metastore_role_access_policy" {
+  statement {
+    sid    = "S3RWBucketAccess"
+    effect = "Allow"
+    actions = [
+      "s3:GetObject",
+      "s3:PutObject",
+      "s3:DeleteObject",
+      "s3:ListBucket",
+      "s3:GetBucketLocation",
+      "s3:GetLifecycleConfiguration",
+      "s3:PutLifecycleConfiguration"
+    ]
+    resources = [
+      "arn:aws:s3:::${aws_s3_bucket.metastore.id}",
+      "arn:aws:s3:::${aws_s3_bucket.metastore.id}/*"
+    ]
+  }
+  statement {
+    sid    = "KMSAccess"
+    effect = "Allow"
+    actions = [
+      "kms:Decrypt",
+      "kms:Encrypt",
+      "kms:GenerateDataKey*"
+    ]
+    resources = [
+      aws_kms_key.metastore_key.arn
+    ]
+  }
+  statement {
+    sid    = "STSAssumeRoleAccess"
+    effect = "Allow"
+    actions = [
+      "sts:AssumeRole"
+    ]
+    resources = [
+      aws_iam_role.metastore_access.arn
+    ]
+  }
+}
diff --git a/databricks-metastore/variables.tf b/databricks-metastore/variables.tf
new file mode 100644
index 00000000..5ead2345
--- /dev/null
+++ b/databricks-metastore/variables.tf
@@ -0,0 +1,68 @@
+variable "databricks_external_id" {
+  type        = string
+  description = "External ID for Databricks account"
+}
+
+variable "tags" {
+  type        = object({ project : string, env : string, service : string, owner : string })
+  description = "Fogg generated tags for the environment"
+}
+
+variable "deletion_window_in_days" {
+  type        = number
+  description = "Deletion window in days for S3 encryption key"
+  default     = 7
+}
+
+variable "enable_key_rotation" {
+  type        = bool
+  description = "Enable key rotation for S3 encryption key"
+  default     = true
+}
+
+variable "delta_sharing_scope" {
+  type        = string
+  description = "Delta sharing scope"
+  default     = "INTERNAL"
+}
+
+variable "delta_sharing_recipient_token_lifetime_in_seconds" {
+  type        = number
+  description = "Lifetime of delta sharing recipient token in seconds"
+  default     = 3600
+}
+
+variable "force_destroy" {
+  type        = bool
+  description = "Force destroy metastore if data exists"
+  default     = false
+}
+
+variable "workspaces" {
+  type        = map(string)
+  description = "Map of workspace names to ids to associate with this metastore"
+  default     = {}
+}
+
+variable "admin_groups" {
+  type        = list(string)
+  description = "List of databricks groups to grant admin access for metastore; includes owner by default"
+  default     = []
+}
+
+variable "owner" {
+  type        = string
+  description = "Owner of the metastore; should be a group display name"
+  default     = "data-infra-admin"
+}
+
+variable "powerusers" {
+  type        = list(string)
+  description = "List of databricks groups to grant poweruser access for metastore"
+  default     = ["powerusers"]
+}
+
+variable "workspace_url" {
+  type        = string
+  description = "URL of the workspace to use to create this metastore"
+}
diff --git a/databricks-metastore/versions.tf b/databricks-metastore/versions.tf
new file mode 100644
index 00000000..159e8002
--- /dev/null
+++ b/databricks-metastore/versions.tf
@@ -0,0 +1,11 @@
+terraform {
+  required_providers {
+    aws = {
+      source = "hashicorp/aws"
+    }
+    databricks = {
+      source = "databricks/databricks"
+    }
+  }
+  required_version = ">= 1.3.0"
+}