diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml index 7a49f32729ec1f..dc770f7fc83a61 100644 --- a/.github/workflows/check-datahub-jars.yml +++ b/.github/workflows/check-datahub-jars.yml @@ -5,12 +5,12 @@ on: branches: - master paths: - - "metadata-integration" + - "metadata-integration/**" pull_request: branches: - "**" paths: - - "metadata-integration" + - "metadata-integration/**" release: types: [published] @@ -28,15 +28,22 @@ jobs: runs-on: ubuntu-latest steps: - uses: acryldata/sane-checkout-action@v3 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }} + - name: Install dependencies + run: ./metadata-ingestion/scripts/install_deps.sh - name: Set up JDK 17 uses: actions/setup-java@v4 with: distribution: "zulu" java-version: 17 - uses: gradle/actions/setup-gradle@v3 - - uses: actions/setup-python@v5 - with: - python-version: "3.10" - name: check ${{ matrix.command }} jar run: | ./gradlew :metadata-integration:java:${{ matrix.command }}:build --info diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index f512dcf8f3ffd4..bee1ec95e77747 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -31,9 +31,9 @@ jobs: DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: - python-version: ["3.8", "3.10"] + python-version: ["3.9", "3.10"] include: - - python-version: "3.8" + - python-version: "3.9" extraPythonRequirement: "dagster>=1.3.3" - python-version: "3.10" extraPythonRequirement: "dagster>=1.3.3" diff --git a/build.gradle b/build.gradle index 3df3ffe6abfbb8..a3d807a7333494 100644 --- a/build.gradle +++ b/build.gradle @@ -34,6 +34,7 @@ buildscript { // Releases: https://github.com/linkedin/rest.li/blob/master/CHANGELOG.md ext.pegasusVersion = '29.57.0' ext.mavenVersion = '3.6.3' + ext.versionGradle = '8.11.1' ext.springVersion = '6.1.13' ext.springBootVersion = '3.2.9' ext.springKafkaVersion = '3.1.6' @@ -78,7 +79,7 @@ buildscript { plugins { id 'com.gorylenko.gradle-git-properties' version '2.4.1' - id 'com.github.johnrengelman.shadow' version '8.1.1' apply false + id 'com.gradleup.shadow' version '8.3.5' apply false id 'com.palantir.docker' version '0.35.0' apply false id 'com.avast.gradle.docker-compose' version '0.17.6' id "com.diffplug.spotless" version "6.23.3" @@ -372,6 +373,7 @@ configure(subprojects.findAll {! 
it.name.startsWith('spark-lineage')}) { exclude group: "org.slf4j", module: "slf4j-log4j12" exclude group: "org.slf4j", module: "slf4j-nop" exclude group: "org.slf4j", module: "slf4j-ext" + exclude group: "org.codehaus.jackson", module: "jackson-mapper-asl" resolutionStrategy.force externalDependency.antlr4Runtime resolutionStrategy.force externalDependency.antlr4 @@ -499,3 +501,8 @@ subprojects { } } } + +wrapper { + gradleVersion = project.versionGradle + distributionType = Wrapper.DistributionType.ALL +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index 079a20619d1eab..94f0e8a055b701 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -1318,7 +1318,8 @@ private void configureMutationResolvers(final RuntimeWiring.Builder builder) { .dataFetcher("updateQuery", new UpdateQueryResolver(this.queryService)) .dataFetcher("deleteQuery", new DeleteQueryResolver(this.queryService)) .dataFetcher( - "createDataProduct", new CreateDataProductResolver(this.dataProductService)) + "createDataProduct", + new CreateDataProductResolver(this.dataProductService, this.entityService)) .dataFetcher( "updateDataProduct", new UpdateDataProductResolver(this.dataProductService)) .dataFetcher( diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/CreateDataProductResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/CreateDataProductResolver.java index 470267264f12f2..8bee544ca55c33 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/CreateDataProductResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/CreateDataProductResolver.java @@ -10,8 +10,11 @@ import com.linkedin.datahub.graphql.exception.AuthorizationException; import com.linkedin.datahub.graphql.generated.CreateDataProductInput; import com.linkedin.datahub.graphql.generated.DataProduct; +import com.linkedin.datahub.graphql.generated.OwnerEntityType; +import com.linkedin.datahub.graphql.resolvers.mutate.util.OwnerUtils; import com.linkedin.datahub.graphql.types.dataproduct.mappers.DataProductMapper; import com.linkedin.entity.EntityResponse; +import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.service.DataProductService; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; @@ -24,6 +27,7 @@ public class CreateDataProductResolver implements DataFetcher> { private final DataProductService _dataProductService; + private final EntityService _entityService; @Override public CompletableFuture get(final DataFetchingEnvironment environment) @@ -56,6 +60,8 @@ public CompletableFuture get(final DataFetchingEnvironment environm context.getOperationContext(), dataProductUrn, UrnUtils.getUrn(input.getDomainUrn())); + OwnerUtils.addCreatorAsOwner( + context, dataProductUrn.toString(), OwnerEntityType.CORP_USER, _entityService); EntityResponse response = _dataProductService.getDataProductEntityResponse( context.getOperationContext(), dataProductUrn); diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index 372b0eb0570b98..a3b2e9ad6b3e22 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -60,7 
+60,7 @@ dependencies { // mock internal schema registry implementation externalDependency.kafkaAvroSerde implementation externalDependency.kafkaAvroSerializer - implementation "org.apache.kafka:kafka_2.12:3.7.1" + implementation "org.apache.kafka:kafka_2.13:3.7.2" implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java index 55cdcae931ab5b..1bdea10123999a 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java @@ -2,6 +2,8 @@ import static com.linkedin.metadata.Constants.*; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.base.Throwables; import com.linkedin.common.urn.Urn; import com.linkedin.datahub.upgrade.UpgradeContext; @@ -23,8 +25,6 @@ import java.util.Set; import java.util.function.Function; import lombok.extern.slf4j.Slf4j; -import org.codehaus.jackson.node.JsonNodeFactory; -import org.codehaus.jackson.node.ObjectNode; import org.opensearch.action.search.SearchRequest; import org.opensearch.action.search.SearchResponse; import org.opensearch.client.RequestOptions; diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index a11f823f5efa55..324357b942e8e1 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -22,7 +22,7 @@ ARG ALPINE_REPO_URL ARG APACHE_DOWNLOAD_URL ARG GITHUB_REPO_URL -ENV KAFKA_VERSION=3.7.1 +ENV KAFKA_VERSION=3.7.2 ENV SCALA_VERSION=2.13 LABEL name="kafka" version=${KAFKA_VERSION} diff --git a/docker/profiles/docker-compose.gms.yml b/docker/profiles/docker-compose.gms.yml index e8b2d4cd1f29d3..64163ef970080a 100644 --- a/docker/profiles/docker-compose.gms.yml +++ b/docker/profiles/docker-compose.gms.yml @@ -99,7 +99,6 @@ x-datahub-gms-service: &datahub-gms-service - ${DATAHUB_LOCAL_GMS_ENV:-empty2.env} environment: &datahub-gms-env <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env] - ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-search_config.yaml} ALTERNATE_MCP_VALIDATION: ${ALTERNATE_MCP_VALIDATION:-true} STRICT_URN_VALIDATION_ENABLED: ${STRICT_URN_VALIDATION_ENABLED:-true} healthcheck: @@ -126,7 +125,6 @@ x-datahub-gms-service-dev: &datahub-gms-service-dev - ${DATAHUB_LOCAL_GMS_ENV:-empty2.env} environment: &datahub-gms-dev-env <<: [*datahub-dev-telemetry-env, *datahub-gms-env] - ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-search_config.yaml} SKIP_ELASTICSEARCH_CHECK: false JAVA_TOOL_OPTIONS: '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5001' BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE: false diff --git a/docs-website/vercel-setup.sh b/docs-website/vercel-setup.sh index 4bb40eaddf4775..e9ba87b75be779 100755 --- a/docs-website/vercel-setup.sh +++ b/docs-website/vercel-setup.sh @@ -5,8 +5,8 @@ set -euxo pipefail ./metadata-ingestion/scripts/install_deps.sh # Set up java version for gradle -yum install 
java-17-amazon-corretto -y -java --version +yum install java-17-amazon-corretto-devel -y +javac --version # Build python from source. # Amazon Linux 2 has Python 3.8, but it's version of OpenSSL is super old and hence it diff --git a/docs/advanced/mcp-mcl.md b/docs/advanced/mcp-mcl.md index 333891ba1a95d3..3a06b2abadc115 100644 --- a/docs/advanced/mcp-mcl.md +++ b/docs/advanced/mcp-mcl.md @@ -218,3 +218,6 @@ Another form of conditional writes which considers the existence of an aspect or `CREATE_ENTITY` - Create the aspect if no aspects exist for the entity. +By default, a validation exception is thrown if the `CREATE`/`CREATE_ENTITY` constraint is violated. If the write operation +should be dropped without considering it an exception, then add the following header: `If-None-Match: *` to the MCP. + diff --git a/docs/plugins.md b/docs/plugins.md index 12c192b5b5190e..7212c74dad3c08 100644 --- a/docs/plugins.md +++ b/docs/plugins.md @@ -65,14 +65,14 @@ The sample authenticator implementation can be found at [Authenticator Sample](. 3. Use `getResourceAsStream` to read files: If your plugin read any configuration file like properties or YAML or JSON or xml then use `this.getClass().getClassLoader().getResourceAsStream("")` to read that file from DataHub GMS plugin's class-path. For DataHub GMS resource look-up behavior please refer [Plugin Installation](#plugin-installation) section. Sample code of `getResourceAsStream` is available in sample Authenticator plugin [TestAuthenticator.java](../metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthenticator.java). -4. Bundle your Jar: Use `com.github.johnrengelman.shadow` gradle plugin to create an uber jar. +4. Bundle your Jar: Use `com.gradleup.shadow` gradle plugin to create an uber jar. To see an example of building an uber jar, check out the `build.gradle` file for the apache-ranger-plugin file of [Apache Ranger Plugin](https://github.com/acryldata/datahub-ranger-auth-plugin/tree/main/apache-ranger-plugin) for reference. Exclude signature files as shown in below `shadowJar` task. ```groovy - apply plugin: 'com.github.johnrengelman.shadow'; + apply plugin: 'com.gradleup.shadow'; shadowJar { // Exclude com.datahub.plugins package and files related to jar signature exclude "META-INF/*.RSA", "META-INF/*.SF","META-INF/*.DSA" @@ -152,14 +152,14 @@ The sample authorizer implementation can be found at [Authorizer Sample](https:/ 3. Use `getResourceAsStream` to read files: If your plugin read any configuration file like properties or YAML or JSON or xml then use `this.getClass().getClassLoader().getResourceAsStream("")` to read that file from DataHub GMS plugin's class-path. For DataHub GMS resource look-up behavior please refer [Plugin Installation](#plugin-installation) section. Sample code of `getResourceAsStream` is available in sample Authenticator plugin [TestAuthenticator.java](../metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthenticator.java). -4. Bundle your Jar: Use `com.github.johnrengelman.shadow` gradle plugin to create an uber jar. +4. Bundle your Jar: Use `com.gradleup.shadow` gradle plugin to create an uber jar. To see an example of building an uber jar, check out the `build.gradle` file for the apache-ranger-plugin file of [Apache Ranger Plugin](https://github.com/acryldata/datahub-ranger-auth-plugin/tree/main/apache-ranger-plugin) for reference. Exclude signature files as shown in below `shadowJar` task. 
```groovy - apply plugin: 'com.github.johnrengelman.shadow'; + apply plugin: 'com.gradleup.shadow'; shadowJar { // Exclude com.datahub.plugins package and files related to jar signature exclude "META-INF/*.RSA", "META-INF/*.SF","META-INF/*.DSA" diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/AspectValidationException.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/AspectValidationException.java index dd8798ee89ae6b..938cb2d5f99e62 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/AspectValidationException.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/AspectValidationException.java @@ -18,45 +18,39 @@ public static AspectValidationException forItem(BatchItem item, String msg) { } public static AspectValidationException forItem(BatchItem item, String msg, Exception e) { - return new AspectValidationException( - item.getChangeType(), item.getUrn(), item.getAspectName(), msg, SubType.VALIDATION, e); + return new AspectValidationException(item, msg, SubType.VALIDATION, e); } public static AspectValidationException forPrecondition(BatchItem item, String msg) { return forPrecondition(item, msg, null); } + public static AspectValidationException forFilter(BatchItem item, String msg) { + return new AspectValidationException(item, msg, SubType.FILTER); + } + public static AspectValidationException forPrecondition(BatchItem item, String msg, Exception e) { - return new AspectValidationException( - item.getChangeType(), item.getUrn(), item.getAspectName(), msg, SubType.PRECONDITION, e); + return new AspectValidationException(item, msg, SubType.PRECONDITION, e); } + @Nonnull BatchItem item; @Nonnull ChangeType changeType; @Nonnull Urn entityUrn; @Nonnull String aspectName; @Nonnull SubType subType; @Nullable String msg; - public AspectValidationException( - @Nonnull ChangeType changeType, - @Nonnull Urn entityUrn, - @Nonnull String aspectName, - String msg, - SubType subType) { - this(changeType, entityUrn, aspectName, msg, subType, null); + public AspectValidationException(@Nonnull BatchItem item, String msg, SubType subType) { + this(item, msg, subType, null); } public AspectValidationException( - @Nonnull ChangeType changeType, - @Nonnull Urn entityUrn, - @Nonnull String aspectName, - @Nonnull String msg, - @Nullable SubType subType, - Exception e) { + @Nonnull BatchItem item, @Nonnull String msg, @Nullable SubType subType, Exception e) { super(msg, e); - this.changeType = changeType; - this.entityUrn = entityUrn; - this.aspectName = aspectName; + this.item = item; + this.changeType = item.getChangeType(); + this.entityUrn = item.getUrn(); + this.aspectName = item.getAspectName(); this.msg = msg; this.subType = subType != null ? 
subType : SubType.VALIDATION; } @@ -65,8 +59,12 @@ public Pair getAspectGroup() { return Pair.of(entityUrn, aspectName); } - public static enum SubType { + public enum SubType { + // A validation exception is thrown VALIDATION, - PRECONDITION + // A failed precondition is thrown if the header constraints are not met + PRECONDITION, + // Exclude from processing further + FILTER } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/ValidationExceptionCollection.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/ValidationExceptionCollection.java index 007c196156b124..fc1fcb68029ce1 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/ValidationExceptionCollection.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/ValidationExceptionCollection.java @@ -15,12 +15,30 @@ public class ValidationExceptionCollection extends HashMap, Set> { + private final Set failedHashCodes; + private final Set filteredHashCodes; + + public ValidationExceptionCollection() { + super(); + this.failedHashCodes = new HashSet<>(); + this.filteredHashCodes = new HashSet<>(); + } + + public boolean hasFatalExceptions() { + return !failedHashCodes.isEmpty(); + } + public static ValidationExceptionCollection newCollection() { return new ValidationExceptionCollection(); } public void addException(AspectValidationException exception) { super.computeIfAbsent(exception.getAspectGroup(), key -> new HashSet<>()).add(exception); + if (!AspectValidationException.SubType.FILTER.equals(exception.getSubType())) { + failedHashCodes.add(exception.getItem().hashCode()); + } else { + filteredHashCodes.add(exception.getItem().hashCode()); + } } public void addException(BatchItem item, String message) { @@ -28,8 +46,7 @@ public void addException(BatchItem item, String message) { } public void addException(BatchItem item, String message, Exception ex) { - super.computeIfAbsent(Pair.of(item.getUrn(), item.getAspectName()), key -> new HashSet<>()) - .add(AspectValidationException.forItem(item, message, ex)); + addException(AspectValidationException.forItem(item, message, ex)); } public Stream streamAllExceptions() { @@ -41,7 +58,8 @@ public Collection successful(Collection items) { } public Stream streamSuccessful(Stream items) { - return items.filter(i -> !this.containsKey(Pair.of(i.getUrn(), i.getAspectName()))); + return items.filter( + i -> !failedHashCodes.contains(i.hashCode()) && !filteredHashCodes.contains(i.hashCode())); } public Collection exceptions(Collection items) { @@ -49,7 +67,7 @@ public Collection exceptions(Collection items) { } public Stream streamExceptions(Stream items) { - return items.filter(i -> this.containsKey(Pair.of(i.getUrn(), i.getAspectName()))); + return items.filter(i -> failedHashCodes.contains(i.hashCode())); } @Override diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/validation/CreateIfNotExistsValidator.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/validation/CreateIfNotExistsValidator.java index 2ad885dc9fdd2c..9b9d8f49d84627 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/validation/CreateIfNotExistsValidator.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/validation/CreateIfNotExistsValidator.java @@ -25,6 +25,8 @@ @Getter @Accessors(chain = true) public class CreateIfNotExistsValidator extends AspectPayloadValidator { + public static final String 
FILTER_EXCEPTION_HEADER = "If-None-Match"; + public static final String FILTER_EXCEPTION_VALUE = "*"; @Nonnull private AspectPluginConfig config; @@ -49,11 +51,17 @@ protected Stream validatePreCommitAspects( .filter(item -> ChangeType.CREATE_ENTITY.equals(item.getChangeType())) .collect(Collectors.toSet())) { // if the key aspect is missing in the batch, the entity exists and CREATE_ENTITY should be - // denied + // denied or dropped if (!entityKeyMap.containsKey(createEntityItem.getUrn())) { - exceptions.addException( - createEntityItem, - "Cannot perform CREATE_ENTITY if not exists since the entity key already exists."); + if (isPrecondition(createEntityItem)) { + exceptions.addException( + AspectValidationException.forFilter( + createEntityItem, "Dropping write per precondition header If-None-Match: *")); + } else { + exceptions.addException( + createEntityItem, + "Cannot perform CREATE_ENTITY if not exists since the entity key already exists."); + } } } @@ -61,10 +69,16 @@ protected Stream validatePreCommitAspects( changeMCPs.stream() .filter(item -> ChangeType.CREATE.equals(item.getChangeType())) .collect(Collectors.toSet())) { - // if a CREATE item has a previous value, should be denied + // if a CREATE item has a previous value, should be denied or dropped if (createItem.getPreviousRecordTemplate() != null) { - exceptions.addException( - createItem, "Cannot perform CREATE since the aspect already exists."); + if (isPrecondition(createItem)) { + exceptions.addException( + AspectValidationException.forFilter( + createItem, "Dropping write per precondition header If-None-Match: *")); + } else { + exceptions.addException( + createItem, "Cannot perform CREATE since the aspect already exists."); + } } } @@ -77,4 +91,10 @@ protected Stream validateProposedAspects( @Nonnull RetrieverContext retrieverContext) { return Stream.empty(); } + + private static boolean isPrecondition(ChangeMCP item) { + return item.getHeader(FILTER_EXCEPTION_HEADER) + .map(FILTER_EXCEPTION_VALUE::equals) + .orElse(false); + } } diff --git a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/batch/TestMCP.java b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/batch/TestMCP.java index 5b714bdbf0b478..d7dd1fab2b6acf 100644 --- a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/batch/TestMCP.java +++ b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/batch/TestMCP.java @@ -21,6 +21,7 @@ import com.linkedin.test.metadata.aspect.TestEntityRegistry; import java.net.URISyntaxException; import java.util.Collection; +import java.util.Collections; import java.util.Map; import java.util.Objects; import java.util.Optional; @@ -140,7 +141,7 @@ public Map getHeaders() { mcp -> mcp.getHeaders().entrySet().stream() .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))) - .orElse(headers); + .orElse(headers != null ? 
headers : Collections.emptyMap()); } @Override diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index ccebba7710deaf..a4b76b9530d66f 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index bdc9a83b1e6524..22286c90de3d10 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,8 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.0.2-bin.zip +distributionSha256Sum=89d4e70e4e84e2d2dfbb63e4daa53e21b25017cc70c37e4eea31ee51fb15098a +distributionUrl=https\://services.gradle.org/distributions/gradle-8.11.1-all.zip networkTimeout=10000 +validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew index 79a61d421cc4e2..f5feea6d6b116b 100755 --- a/gradlew +++ b/gradlew @@ -15,6 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +# SPDX-License-Identifier: Apache-2.0 +# ############################################################################## # @@ -55,7 +57,7 @@ # Darwin, MinGW, and NonStop. # # (3) This script is generated from the Groovy template -# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt # within the Gradle project. # # You can find Gradle at https://github.com/gradle/gradle/. @@ -83,10 +85,9 @@ done # This is normally unused # shellcheck disable=SC2034 APP_BASE_NAME=${0##*/} -APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s +' "$PWD" ) || exit # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD=maximum @@ -133,10 +134,13 @@ location of your Java installation." fi else JAVACMD=java - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." + fi fi # Increase the maximum file descriptors if we can. @@ -144,7 +148,7 @@ if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then case $MAX_FD in #( max*) # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC3045 + # shellcheck disable=SC2039,SC3045 MAX_FD=$( ulimit -H -n ) || warn "Could not query maximum file descriptor limit" esac @@ -152,7 +156,7 @@ if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then '' | soft) :;; #( *) # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. 
- # shellcheck disable=SC3045 + # shellcheck disable=SC2039,SC3045 ulimit -n "$MAX_FD" || warn "Could not set maximum file descriptor limit to $MAX_FD" esac @@ -197,11 +201,15 @@ if "$cygwin" || "$msys" ; then done fi -# Collect all arguments for the java command; -# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of -# shell script including quotes and variable substitutions, so put them in -# double quotes to make sure that they get re-expanded; and -# * put everything else in single quotes, so that it's not re-expanded. + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Collect all arguments for the java command: +# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, +# and any embedded shellness will be escaped. +# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be +# treated as '${Hostname}' itself on the command line. set -- \ "-Dorg.gradle.appname=$APP_BASE_NAME" \ diff --git a/gradlew.bat b/gradlew.bat index 6689b85beecde6..9b42019c7915b9 100644 --- a/gradlew.bat +++ b/gradlew.bat @@ -13,6 +13,8 @@ @rem See the License for the specific language governing permissions and @rem limitations under the License. @rem +@rem SPDX-License-Identifier: Apache-2.0 +@rem @if "%DEBUG%"=="" @echo off @rem ########################################################################## @@ -43,11 +45,11 @@ set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 if %ERRORLEVEL% equ 0 goto execute -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. +echo. 1>&2 +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 +echo. 1>&2 +echo Please set the JAVA_HOME variable in your environment to match the 1>&2 +echo location of your Java installation. 1>&2 goto fail @@ -57,11 +59,11 @@ set JAVA_EXE=%JAVA_HOME%/bin/java.exe if exist "%JAVA_EXE%" goto execute -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. +echo. 1>&2 +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 +echo. 1>&2 +echo Please set the JAVA_HOME variable in your environment to match the 1>&2 +echo location of your Java installation. 
1>&2 goto fail diff --git a/metadata-auth/auth-api/build.gradle b/metadata-auth/auth-api/build.gradle index 3aafaf45bc2cb8..9a833dacf7fb19 100644 --- a/metadata-auth/auth-api/build.gradle +++ b/metadata-auth/auth-api/build.gradle @@ -2,7 +2,7 @@ plugins { id("com.palantir.git-version") apply false } -apply plugin: 'com.github.johnrengelman.shadow' +apply plugin: 'com.gradleup.shadow' apply plugin: 'java-library' apply plugin: 'signing' apply plugin: 'maven-publish' diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py new file mode 100644 index 00000000000000..d72ba67c23cd72 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py @@ -0,0 +1,35 @@ +"""This example DAG demonstrates how to create and use a DataHubGraph client.""" + +from datetime import timedelta + +import pendulum +from airflow.decorators import dag, task +from datahub.ingestion.graph.client import DataHubGraph, RemovedStatusFilter + +from datahub_airflow_plugin.hooks.datahub import DatahubRestHook + + +@dag( + schedule_interval=timedelta(days=1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + catchup=False, +) +def datahub_graph_usage_sample_dag(): + @task() + def use_the_graph(): + graph: DataHubGraph = DatahubRestHook("my_datahub_rest_conn_id").make_graph() + graph.test_connection() + + # Example usage: Find all soft-deleted BigQuery DEV entities + # in DataHub, and hard delete them. + for urn in graph.get_urns_by_filter( + platform="bigquery", + env="DEV", + status=RemovedStatusFilter.ONLY_SOFT_DELETED, + ): + graph.hard_delete_entity(urn) + + use_the_graph() + + +datahub_graph_usage_sample_dag() diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py index b60f20c5bf8b28..5f4d787fb893d3 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py @@ -14,6 +14,7 @@ from datahub.emitter.kafka_emitter import DatahubKafkaEmitter from datahub.emitter.rest_emitter import DataHubRestEmitter from datahub.emitter.synchronized_file_emitter import SynchronizedFileEmitter + from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.sink.datahub_kafka import KafkaSinkConfig @@ -94,6 +95,9 @@ def make_emitter(self) -> "DataHubRestEmitter": host, token, **extra_args ) + def make_graph(self) -> "DataHubGraph": + return self.make_emitter().to_graph() + def emit( self, items: Sequence[ diff --git a/metadata-ingestion-modules/dagster-plugin/setup.py b/metadata-ingestion-modules/dagster-plugin/setup.py index 660dbb2981c516..0e0685cb378c1b 100644 --- a/metadata-ingestion-modules/dagster-plugin/setup.py +++ b/metadata-ingestion-modules/dagster-plugin/setup.py @@ -123,7 +123,7 @@ def get_long_description(): ], # Package info. 
zip_safe=False, - python_requires=">=3.8", + python_requires=">=3.9", package_dir={"": "src"}, packages=setuptools.find_namespace_packages(where="./src"), entry_points=entry_points, diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py index f6b0629b7ca7b9..bccdb4ac7922a5 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py @@ -28,10 +28,15 @@ from dagster._core.definitions.multi_asset_sensor_definition import ( AssetMaterializationFunctionReturn, ) -from dagster._core.definitions.sensor_definition import ( - DefaultSensorStatus, - RawSensorEvaluationFunctionReturn, -) +from dagster._core.definitions.sensor_definition import DefaultSensorStatus + +# This SensorReturnTypesUnion is from Dagster 1.9.1+ and is not available in older versions +# of Dagster. We need to import it conditionally to avoid breaking compatibility with older +try: + from dagster._core.definitions.sensor_definition import SensorReturnTypesUnion +except ImportError: + from dagster._core.definitions.sensor_definition import RawSensorEvaluationFunctionReturn as SensorReturnTypesUnion # type: ignore + from dagster._core.definitions.target import ExecutableDefinition from dagster._core.definitions.unresolved_asset_job_definition import ( UnresolvedAssetJobDefinition, @@ -689,9 +694,7 @@ def _emit_asset_metadata( return SkipReason("Asset metadata processed") - def _emit_metadata( - self, context: RunStatusSensorContext - ) -> RawSensorEvaluationFunctionReturn: + def _emit_metadata(self, context: RunStatusSensorContext) -> SensorReturnTypesUnion: """ Function to emit metadata for datahub rest. 
""" diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index 013efbdf6a2f6b..fd3fe7ca098ecb 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -1,8 +1,7 @@ import logging -from contextlib import contextmanager from enum import Enum from pathlib import Path -from typing import Generator, List, Optional +from typing import List, Optional import yaml from pydantic import validator @@ -10,6 +9,7 @@ from datahub.configuration.common import ConfigModel from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.global_context import get_graph_context, set_graph_context from datahub.ingestion.graph.client import DataHubGraph, get_default_graph from datahub.metadata.schema_classes import ( PropertyValueClass, @@ -24,23 +24,10 @@ class StructuredPropertiesConfig: """Configuration class to hold the graph client""" - _graph: Optional[DataHubGraph] = None - - @classmethod - @contextmanager - def use_graph(cls, graph: DataHubGraph) -> Generator[None, None, None]: - """Context manager to temporarily set a custom graph""" - previous_graph = cls._graph - cls._graph = graph - try: - yield - finally: - cls._graph = previous_graph - @classmethod - def get_graph(cls) -> DataHubGraph: + def get_graph_required(cls) -> DataHubGraph: """Get the current graph, falling back to default if none set""" - return cls._graph if cls._graph is not None else get_default_graph() + return get_graph_context() or get_default_graph() class AllowedTypes(Enum): @@ -79,7 +66,7 @@ class TypeQualifierAllowedTypes(ConfigModel): @validator("allowed_types", each_item=True) def validate_allowed_types(cls, v): if v: - graph = StructuredPropertiesConfig.get_graph() + graph = StructuredPropertiesConfig.get_graph_required() validated_urn = Urn.make_entity_type_urn(v) if not graph.exists(validated_urn): raise ValueError( @@ -106,7 +93,7 @@ class StructuredProperties(ConfigModel): @validator("entity_types", each_item=True) def validate_entity_types(cls, v): if v: - graph = StructuredPropertiesConfig.get_graph() + graph = StructuredPropertiesConfig.get_graph_required() validated_urn = Urn.make_entity_type_urn(v) if not graph.exists(validated_urn): raise ValueError( @@ -136,63 +123,64 @@ def urn_must_be_present(cls, v, values): @staticmethod def create(file: str, graph: Optional[DataHubGraph] = None) -> None: - emitter: DataHubGraph = graph if graph else get_default_graph() - with StructuredPropertiesConfig.use_graph(emitter): - print("Using graph") + with set_graph_context(graph): + graph = StructuredPropertiesConfig.get_graph_required() + with open(file) as fp: structuredproperties: List[dict] = yaml.safe_load(fp) - for structuredproperty_raw in structuredproperties: - structuredproperty = StructuredProperties.parse_obj( - structuredproperty_raw + for structuredproperty_raw in structuredproperties: + structuredproperty = StructuredProperties.parse_obj( + structuredproperty_raw + ) + + if not structuredproperty.type.islower(): + structuredproperty.type = structuredproperty.type.lower() + logger.warning( + f"Structured property type should be lowercase. 
Updated to {structuredproperty.type}" ) - if not structuredproperty.type.islower(): - structuredproperty.type = structuredproperty.type.lower() - logger.warn( - f"Structured property type should be lowercase. Updated to {structuredproperty.type}" - ) - if not AllowedTypes.check_allowed_type(structuredproperty.type): - raise ValueError( - f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}" - ) - mcp = MetadataChangeProposalWrapper( - entityUrn=structuredproperty.urn, - aspect=StructuredPropertyDefinitionClass( - qualifiedName=structuredproperty.fqn, - valueType=Urn.make_data_type_urn(structuredproperty.type), - displayName=structuredproperty.display_name, - description=structuredproperty.description, - entityTypes=[ - Urn.make_entity_type_urn(entity_type) - for entity_type in structuredproperty.entity_types or [] - ], - cardinality=structuredproperty.cardinality, - immutable=structuredproperty.immutable, - allowedValues=( - [ - PropertyValueClass( - value=v.value, description=v.description - ) - for v in structuredproperty.allowed_values - ] - if structuredproperty.allowed_values - else None - ), - typeQualifier=( - { - "allowedTypes": structuredproperty.type_qualifier.allowed_types - } - if structuredproperty.type_qualifier - else None - ), - ), + if not AllowedTypes.check_allowed_type(structuredproperty.type): + raise ValueError( + f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}" ) - emitter.emit_mcp(mcp) + mcp = MetadataChangeProposalWrapper( + entityUrn=structuredproperty.urn, + aspect=StructuredPropertyDefinitionClass( + qualifiedName=structuredproperty.fqn, + valueType=Urn.make_data_type_urn(structuredproperty.type), + displayName=structuredproperty.display_name, + description=structuredproperty.description, + entityTypes=[ + Urn.make_entity_type_urn(entity_type) + for entity_type in structuredproperty.entity_types or [] + ], + cardinality=structuredproperty.cardinality, + immutable=structuredproperty.immutable, + allowedValues=( + [ + PropertyValueClass( + value=v.value, description=v.description + ) + for v in structuredproperty.allowed_values + ] + if structuredproperty.allowed_values + else None + ), + typeQualifier=( + { + "allowedTypes": structuredproperty.type_qualifier.allowed_types + } + if structuredproperty.type_qualifier + else None + ), + ), + ) + graph.emit_mcp(mcp) - logger.info(f"Created structured property {structuredproperty.urn}") + logger.info(f"Created structured property {structuredproperty.urn}") @classmethod def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties": - with StructuredPropertiesConfig.use_graph(graph): + with set_graph_context(graph): structured_property: Optional[ StructuredPropertyDefinitionClass ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass) diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index ef2082b95330b4..e2bc14925ad383 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -46,8 +46,18 @@ os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4") ) -# The limit is 16mb. We will use a max of 15mb to have some space for overhead. -_MAX_BATCH_INGEST_PAYLOAD_SIZE = 15 * 1024 * 1024 +# The limit is 16mb. We will use a max of 15mb to have some space +# for overhead like request headers. +# This applies to pretty much all calls to GMS. 
+INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024 + +# This limit is somewhat arbitrary. All GMS endpoints will timeout +# and return a 500 if processing takes too long. To avoid sending +# too much to the backend and hitting a timeout, we try to limit +# the number of MCPs we send in a batch. +BATCH_INGEST_MAX_PAYLOAD_LENGTH = int( + os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200) +) class DataHubRestEmitter(Closeable, Emitter): @@ -290,11 +300,14 @@ def emit_mcps( # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS. # If we will exceed the limit, we need to break it up into chunks. mcp_obj_chunks: List[List[str]] = [] - current_chunk_size = _MAX_BATCH_INGEST_PAYLOAD_SIZE + current_chunk_size = INGEST_MAX_PAYLOAD_BYTES for mcp_obj in mcp_objs: mcp_obj_size = len(json.dumps(mcp_obj)) - if mcp_obj_size + current_chunk_size > _MAX_BATCH_INGEST_PAYLOAD_SIZE: + if ( + mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES + or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH + ): mcp_obj_chunks.append([]) current_chunk_size = 0 mcp_obj_chunks[-1].append(mcp_obj) diff --git a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py index 1bb07ea8462279..209efbbb90febc 100644 --- a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py +++ b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py @@ -18,7 +18,10 @@ ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import mcps_from_mce -from datahub.emitter.rest_emitter import DataHubRestEmitter +from datahub.emitter.rest_emitter import ( + BATCH_INGEST_MAX_PAYLOAD_LENGTH, + DataHubRestEmitter, +) from datahub.ingestion.api.common import RecordEnvelope, WorkUnit from datahub.ingestion.api.sink import ( NoopWriteCallback, @@ -71,6 +74,14 @@ class DatahubRestSinkConfig(DatahubClientConfig): # Only applies in async batch mode. 
max_per_batch: pydantic.PositiveInt = 100 + @pydantic.validator("max_per_batch", always=True) + def validate_max_per_batch(cls, v): + if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH: + raise ValueError( + f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}" + ) + return v + @dataclasses.dataclass class DataHubRestSinkReport(SinkReport): diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py index 0b201278142e3a..23a99ccb310e13 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py @@ -282,10 +282,6 @@ class JdbcParser: query: str transforms: list - def report_warning(self, key: str, reason: str) -> None: - logger.warning(f"{key}: {reason}") - self.report.report_warning(key, reason) - def get_parser( self, connector_manifest: ConnectorManifest, @@ -355,9 +351,9 @@ def default_get_lineages( source_table = f"{table_name_tuple[-2]}.{source_table}" else: include_source_dataset = False - self.report_warning( - self.connector_manifest.name, - f"could not find schema for table {source_table}", + self.report.warning( + "Could not find schema for table" + f"{self.connector_manifest.name} : {source_table}", ) dataset_name: str = get_dataset_name(database_name, source_table) lineage = KafkaConnectLineage( @@ -457,9 +453,9 @@ def _extract_lineages(self): target_platform=KAFKA, ) lineages.append(lineage) - self.report_warning( + self.report.warning( + "Could not find input dataset, the connector has query configuration set", self.connector_manifest.name, - "could not find input dataset, the connector has query configuration set", ) self.connector_manifest.lineages = lineages return @@ -535,24 +531,24 @@ def _extract_lineages(self): include_source_dataset=False, ) ) - self.report_warning( - self.connector_manifest.name, - f"could not find input dataset, for connector topics {topic_names}", + self.report.warning( + "Could not find input dataset for connector topics", + f"{self.connector_manifest.name} : {topic_names}", ) self.connector_manifest.lineages = lineages return else: include_source_dataset = True if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report_warning( - self.connector_manifest.name, - f"could not find input dataset, connector has unknown transform - {transforms[0]['type']}", + self.report.warning( + "Could not find input dataset, connector has unknown transform", + f"{self.connector_manifest.name} : {transforms[0]['type']}", ) include_source_dataset = False if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report_warning( + self.report.warning( + "Could not find input dataset, connector has one or more unknown transforms", self.connector_manifest.name, - "could not find input dataset, connector has one or more unknown transforms", ) include_source_dataset = False lineages = self.default_get_lineages( @@ -753,8 +749,10 @@ def _extract_lineages(self): lineages.append(lineage) self.connector_manifest.lineages = lineages except Exception as e: - self.report.report_warning( - self.connector_manifest.name, f"Error resolving lineage: {e}" + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, ) return @@ -783,10 +781,6 @@ class BQParser: defaultDataset: Optional[str] = None version: str = "v1" - def report_warning(self, key: str, reason: str) -> None: - logger.warning(f"{key}: {reason}") - 
self.report.report_warning(key, reason) - def get_parser( self, connector_manifest: ConnectorManifest, @@ -917,9 +911,9 @@ def _extract_lineages(self): transformed_topic = self.apply_transformations(topic, transforms) dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) if dataset_table is None: - self.report_warning( - self.connector_manifest.name, - f"could not find target dataset for topic {transformed_topic}, please check your connector configuration", + self.report.warning( + "Could not find target dataset for topic, please check your connector configuration" + f"{self.connector_manifest.name} : {transformed_topic} ", ) continue target_dataset = f"{project}.{dataset_table}" @@ -954,10 +948,6 @@ class SnowflakeParser: schema_name: str topics_to_tables: Dict[str, str] - def report_warning(self, key: str, reason: str) -> None: - logger.warning(f"{key}: {reason}") - self.report.report_warning(key, reason) - def get_table_name_from_topic_name(self, topic_name: str) -> str: """ This function converts the topic name to a valid Snowflake table name using some rules. @@ -1105,8 +1095,10 @@ def _extract_lineages(self): ) self.connector_manifest.lineages = lineages except Exception as e: - self.report.report_warning( - self.connector_manifest.name, f"Error resolving lineage: {e}" + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, ) return @@ -1155,7 +1147,7 @@ def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): ) self.session.auth = (self.config.username, self.config.password) - test_response = self.session.get(f"{self.config.connect_uri}") + test_response = self.session.get(f"{self.config.connect_uri}/connectors") test_response.raise_for_status() logger.info(f"Connection to {self.config.connect_uri} is ok") if not jpype.isJVMStarted(): @@ -1178,13 +1170,16 @@ def get_connectors_manifest(self) -> List[ConnectorManifest]: payload = connector_response.json() - for c in payload: - connector_url = f"{self.config.connect_uri}/connectors/{c}" - connector_response = self.session.get(connector_url) - manifest = connector_response.json() - connector_manifest = ConnectorManifest(**manifest) - if not self.config.connector_patterns.allowed(connector_manifest.name): - self.report.report_dropped(connector_manifest.name) + for connector_name in payload: + connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" + connector_manifest = self._get_connector_manifest( + connector_name, connector_url + ) + if ( + connector_manifest is None + or not self.config.connector_patterns.allowed(connector_manifest.name) + ): + self.report.report_dropped(connector_name) continue if self.config.provided_configs: @@ -1195,19 +1190,11 @@ def get_connectors_manifest(self) -> List[ConnectorManifest]: connector_manifest.lineages = list() connector_manifest.url = connector_url - topics = self.session.get( - f"{self.config.connect_uri}/connectors/{c}/topics", - ).json() - - connector_manifest.topic_names = topics[c]["topics"] + connector_manifest.topic_names = self._get_connector_topics(connector_name) # Populate Source Connector metadata if connector_manifest.type == SOURCE: - tasks = self.session.get( - f"{self.config.connect_uri}/connectors/{c}/tasks", - ).json() - - connector_manifest.tasks = tasks + connector_manifest.tasks = self._get_connector_tasks(connector_name) # JDBC source connector lineages if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( @@ -1246,7 +1233,7 @@ def get_connectors_manifest(self) 
-> List[ConnectorManifest]: ) continue - for topic in topics: + for topic in connector_manifest.topic_names: lineage = KafkaConnectLineage( source_dataset=target_connector.source_dataset, source_platform=target_connector.source_platform, @@ -1286,6 +1273,49 @@ def get_connectors_manifest(self) -> List[ConnectorManifest]: return connectors_manifest + def _get_connector_manifest( + self, connector_name: str, connector_url: str + ) -> Optional[ConnectorManifest]: + try: + connector_response = self.session.get(connector_url) + connector_response.raise_for_status() + except Exception as e: + self.report.warning( + "Failed to get connector details", connector_name, exc=e + ) + return None + manifest = connector_response.json() + connector_manifest = ConnectorManifest(**manifest) + return connector_manifest + + def _get_connector_tasks(self, connector_name: str) -> dict: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/tasks", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector tasks", context=connector_name, exc=e + ) + return {} + + return response.json() + + def _get_connector_topics(self, connector_name: str) -> List[str]: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/topics", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector topics", context=connector_name, exc=e + ) + return [] + + return response.json()[connector_name]["topics"] + def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: connector_name = connector.name connector_type = connector.type diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index e065e2f34bc66d..93d84d8b246e51 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -413,9 +413,10 @@ def _process_upstream_lineage_row( return UpstreamLineageEdge.parse_obj(db_row) except Exception as e: self.report.num_upstream_lineage_edge_parsing_failed += 1 + upstream_tables = db_row.get("UPSTREAM_TABLES") self.structured_reporter.warning( "Failed to parse lineage edge", - context=db_row.get("DOWNSTREAM_TABLE_NAME") or None, + context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}", exc=e, ) return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 99790de529ac3a..97c398c1962d6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -237,6 +237,19 @@ def show_views_for_database( LIMIT {limit} {from_clause}; """ + @staticmethod + def get_secure_view_definitions() -> str: + # https://docs.snowflake.com/en/sql-reference/account-usage/views + return """ + SELECT + TABLE_CATALOG as "TABLE_CATALOG", + TABLE_SCHEMA as "TABLE_SCHEMA", + TABLE_NAME as "TABLE_NAME", + VIEW_DEFINITION as "VIEW_DEFINITION" + FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS + WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL + """ + @staticmethod def columns_for_schema( schema_name: str, diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index 5a69b4bb779d72..780effc82b0163 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -266,6 +266,22 @@ def get_schemas_for_database(self, db_name: str) -> List[SnowflakeSchema]: snowflake_schemas.append(snowflake_schema) return snowflake_schemas + @serialized_lru_cache(maxsize=1) + def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]: + secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict( + lambda: defaultdict(lambda: defaultdict()) + ) + cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions()) + for view in cur: + db_name = view["TABLE_CATALOG"] + schema_name = view["TABLE_SCHEMA"] + view_name = view["TABLE_NAME"] + secure_view_definitions[db_name][schema_name][view_name] = view[ + "VIEW_DEFINITION" + ] + + return secure_view_definitions + @serialized_lru_cache(maxsize=1) def get_tables_for_database( self, db_name: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 4ceeb8560c1758..bc64693b6a1084 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -424,6 +424,10 @@ def _process_schema( view_identifier = self.identifiers.get_dataset_identifier( view.name, schema_name, db_name ) + if view.is_secure and not view.view_definition: + view.view_definition = self.fetch_secure_view_definition( + view.name, schema_name, db_name + ) if view.view_definition: self.aggregator.add_view_definition( view_urn=self.identifiers.gen_dataset_urn(view_identifier), @@ -449,6 +453,25 @@ def _process_schema( context=f"{db_name}.{schema_name}", ) + def fetch_secure_view_definition( + self, table_name: str, schema_name: str, db_name: str + ) -> Optional[str]: + try: + view_definitions = self.data_dictionary.get_secure_view_definitions() + return view_definitions[db_name][schema_name][table_name] + except Exception as e: + if isinstance(e, SnowflakePermissionError): + error_msg = ( + "Failed to get secure views definitions. Please check permissions." 
+ ) + else: + error_msg = "Failed to get secure views definitions" + self.structured_reporter.warning( + error_msg, + exc=e, + ) + return None + def fetch_views_for_schema( self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str ) -> List[SnowflakeView]: diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 9e4bb2f0eb634f..862d27186703a8 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -14,6 +14,11 @@ NUM_OPS = 10 NUM_USAGE = 0 + +def is_secure(view_idx): + return view_idx == 1 + + FROZEN_TIME = "2022-06-07 17:00:00" large_sql_query = """WITH object_access_history AS ( @@ -247,9 +252,25 @@ def default_query_results( # noqa: C901 "name": f"VIEW_{view_idx}", "created_on": datetime(2021, 6, 8, 0, 0, 0, 0), "comment": "Comment for View", - "text": f"create view view_{view_idx} as select * from table_{view_idx}", + "is_secure": "true" if is_secure(view_idx) else "false", + "text": ( + f"create view view_{view_idx} as select * from table_{view_idx}" + if not is_secure(view_idx) + else None + ), + } + for view_idx in range(1, num_views + 1) + ] + elif query == SnowflakeQuery.get_secure_view_definitions(): + return [ + { + "TABLE_CATALOG": "TEST_DB", + "TABLE_SCHEMA": "TEST_SCHEMA", + "TABLE_NAME": f"VIEW_{view_idx}", + "VIEW_DEFINITION": f"create view view_{view_idx} as select * from table_{view_idx}", } for view_idx in range(1, num_views + 1) + if is_secure(view_idx) ] elif query == SnowflakeQuery.columns_for_schema("TEST_SCHEMA", "TEST_DB"): return [ diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json index 4415b1ad3e5159..48ec46af069cef 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json @@ -490,7 +490,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/", "name": "TABLE_1", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_1", @@ -789,7 +791,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/", "name": "TABLE_2", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_2", @@ -1088,7 +1092,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/", "name": "TABLE_3", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_3", @@ -1387,7 +1393,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/", "name": "TABLE_4", 
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_4", @@ -1686,7 +1694,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_5/", "name": "TABLE_5", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_5", @@ -1985,7 +1995,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/", "name": "TABLE_6", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_6", @@ -2284,7 +2296,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/", "name": "TABLE_7", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_7", @@ -2583,7 +2597,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/", "name": "TABLE_8", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_8", @@ -2882,7 +2898,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/", "name": "TABLE_9", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_9", @@ -3181,7 +3199,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/", "name": "TABLE_10", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_10", @@ -3471,23 +3491,25 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {}, + "customProperties": { + "IS_SECURE": "true" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_1/", "name": "VIEW_1", "qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_1", "description": "Comment for View", "created": { - "time": 1623103200000 + "time": 1623090600000 }, "lastModified": { - "time": 1623103200000 + "time": 1623090600000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_12_18-10_16_09", + "runId": "snowflake-2024_12_16-15_30_20-649nax", "lastRunId": "no-run-id-provided" } }, diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json index 3040c6c4e9196f..f22cbd122361dc 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json @@ 
-621,12 +621,17 @@ "op": "add", "path": "/qualifiedName", "value": "TEST_DB.TEST_SCHEMA.VIEW_1" + }, + { + "op": "add", + "path": "/customProperties/IS_SECURE", + "value": "true" } ] }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00-ad3hnf", + "runId": "snowflake-2022_06_07-17_00_00-ivthci", "lastRunId": "no-run-id-provided" } }, diff --git a/metadata-integration/java/acryl-spark-lineage/build.gradle b/metadata-integration/java/acryl-spark-lineage/build.gradle index 940ebb98485367..8816264fbe50f7 100644 --- a/metadata-integration/java/acryl-spark-lineage/build.gradle +++ b/metadata-integration/java/acryl-spark-lineage/build.gradle @@ -2,7 +2,7 @@ plugins { id("com.palantir.git-version") apply false } apply plugin: 'java-library' -apply plugin: 'com.github.johnrengelman.shadow' +apply plugin: 'com.gradleup.shadow' apply plugin: 'signing' apply plugin: 'io.codearte.nexus-staging' apply plugin: 'maven-publish' diff --git a/metadata-integration/java/custom-plugin-lib/build.gradle b/metadata-integration/java/custom-plugin-lib/build.gradle index 305dbe9578fa04..11933a947487f4 100644 --- a/metadata-integration/java/custom-plugin-lib/build.gradle +++ b/metadata-integration/java/custom-plugin-lib/build.gradle @@ -1,6 +1,6 @@ plugins { id 'java-library' - id 'com.github.johnrengelman.shadow' + id 'com.gradleup.shadow' id 'signing' id 'io.codearte.nexus-staging' id 'maven-publish' diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index 3e940b0f32248f..42861cf235b56f 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -1,7 +1,7 @@ plugins { id("com.palantir.git-version") apply false id 'java-library' - id 'com.github.johnrengelman.shadow' + id 'com.gradleup.shadow' id 'signing' id 'io.codearte.nexus-staging' id 'maven-publish' @@ -23,7 +23,7 @@ dependencies { api project(':entity-registry') api project(':metadata-integration:java:datahub-event') implementation project(':metadata-integration:java:datahub-schematron:lib') - + implementation(externalDependency.kafkaAvroSerializer) { exclude group: "org.apache.avro" } @@ -70,14 +70,14 @@ task validatePythonEnv(dependsOn: [":metadata-ingestion:installDev"]) { def venvPath = System.getProperty('python.venv.path', '../../../metadata-ingestion/venv') def isWindows = System.getProperty('os.name').toLowerCase().contains('windows') def pythonExe = isWindows ? 
"${venvPath}/Scripts/python.exe" : "${venvPath}/bin/python" - + def result = exec { commandLine pythonExe, "-c", "import sys; print(sys.executable)" ignoreExitValue = true standardOutput = new ByteArrayOutputStream() errorOutput = new ByteArrayOutputStream() } - + if (result.exitValue != 0) { throw new GradleException("Python virtual environment not properly set up at ${venvPath}") } @@ -95,6 +95,11 @@ test { finalizedBy jacocoTestReport } +// no submodule depends on datahub-schematron:cli +// and tests there are the ones checking python-java compatibility +test.dependsOn tasks.getByPath(":metadata-integration:java:datahub-schematron:cli:test") +test.dependsOn tasks.getByPath(":metadata-integration:java:datahub-schematron:lib:test") + task checkShadowJar(type: Exec) { commandLine 'sh', '-c', 'scripts/check_jar.sh' } diff --git a/metadata-integration/java/datahub-event/build.gradle b/metadata-integration/java/datahub-event/build.gradle index 3dca2eb0a40c9f..752e95656bcf6c 100644 --- a/metadata-integration/java/datahub-event/build.gradle +++ b/metadata-integration/java/datahub-event/build.gradle @@ -1,7 +1,7 @@ plugins { id("com.palantir.git-version") apply false id 'java' - id 'com.github.johnrengelman.shadow' + id 'com.gradleup.shadow' id 'signing' id 'io.codearte.nexus-staging' id 'maven-publish' diff --git a/metadata-integration/java/datahub-protobuf/build.gradle b/metadata-integration/java/datahub-protobuf/build.gradle index 97595fd1345dc7..748ab2ef8afeb7 100644 --- a/metadata-integration/java/datahub-protobuf/build.gradle +++ b/metadata-integration/java/datahub-protobuf/build.gradle @@ -3,7 +3,7 @@ plugins { id "application" } apply plugin: 'java' -apply plugin: 'com.github.johnrengelman.shadow' +apply plugin: 'com.gradleup.shadow' apply plugin: 'signing' apply plugin: 'io.codearte.nexus-staging' apply plugin: 'maven-publish' diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverterTest.java b/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverterTest.java new file mode 100644 index 00000000000000..d6522c2d84670f --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverterTest.java @@ -0,0 +1,942 @@ +package io.datahubproject.schematron.converters.avro; + +import static org.testng.Assert.*; + +import com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.data.template.StringArray; +import com.linkedin.schema.*; +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; +import java.util.Collections; +import org.apache.avro.Schema; +import org.testng.annotations.*; + +@Test(groups = "unit") +class AvroSchemaConverterTest { + + private AvroSchemaConverter avroSchemaConverter = AvroSchemaConverter.builder().build(); + private DataPlatformUrn dataPlatformUrn = + DataPlatformUrn.createFromString("urn:li:dataPlatform:foo"); + + AvroSchemaConverterTest() throws URISyntaxException {} + + @Test(groups = "basic") + void testPrimitiveTypes() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("primitive_types.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 14); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=PrimitiveType].[type=int].intField", + "int", 
+ false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=PrimitiveType].[type=union].intFieldV2", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType() + .setNestedTypes(new StringArray(Collections.singletonList("union")))))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=PrimitiveType].[type=union].[type=int].intFieldV2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=PrimitiveType].[type=null].nullField", + "null", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NullType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=PrimitiveType].[type=union].nullFieldV2", + "union", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType() + .setNestedTypes(new StringArray(Collections.singletonList("union")))))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=PrimitiveType].[type=long].longField", + "long", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=PrimitiveType].[type=float].floatField", + "float", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=PrimitiveType].[type=double].doubleField", + "double", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=PrimitiveType].[type=string].stringField", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=PrimitiveType].[type=boolean].booleanField", + "boolean", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType()))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=PrimitiveType].[type=int].nullableIntField", + "int", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=PrimitiveType].[type=long].nullableLongField", + "long", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=PrimitiveType].[type=string].nullableStringField", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(13), + "[version=2.0].[type=PrimitiveType].[type=enum].status", + "Enum", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new EnumType()))); + } + + @Test(groups = "basic") + void testComplexMaps() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("complex_maps.avsc"), false, false, dataPlatformUrn, null); + + 
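[Editor's aside] As a reading aid before the map assertions: every test in this new file exercises the same three steps -- parse an .avsc, hand it to AvroSchemaConverter, inspect the resulting SchemaMetadata field paths. A standalone sketch of that pattern follows. The schema literal and class name are invented for illustration; the builder call, the toDataHubSchema argument order (the two booleans are passed exactly as in these tests, without asserting their meaning), and the urn:li:dataPlatform:foo platform come from the test code itself.

    import com.linkedin.common.urn.DataPlatformUrn;
    import com.linkedin.schema.SchemaMetadata;
    import io.datahubproject.schematron.converters.avro.AvroSchemaConverter;
    import org.apache.avro.Schema;

    public class AvroConverterExample {
      public static void main(String[] args) throws Exception {
        // Any Avro record works; this one is deliberately tiny.
        Schema avro =
            new Schema.Parser()
                .parse(
                    "{\"type\":\"record\",\"name\":\"Example\",\"fields\":["
                        + "{\"name\":\"id\",\"type\":\"int\"},"
                        + "{\"name\":\"tags\",\"type\":{\"type\":\"array\",\"items\":\"string\"}}]}");

        AvroSchemaConverter converter = AvroSchemaConverter.builder().build();
        SchemaMetadata metadata =
            converter.toDataHubSchema(
                avro, false, false,
                DataPlatformUrn.createFromString("urn:li:dataPlatform:foo"), null);

        // Field paths come back in DataHub's v2 notation, e.g.
        // [version=2.0].[type=Example].[type=int].id
        metadata.getFields().forEach(field -> System.out.println(field.getFieldPath()));
      }
    }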
schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 15); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=MapType].[type=map].mapOfString", + "map", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("string")))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=MapType].[type=map].[type=ComplexType].mapOfComplexType", + "ComplexType", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("ComplexType")))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=MapType].[type=map].[type=ComplexType].mapOfComplexType.[type=string].field1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=MapType].[type=map].[type=ComplexType].mapOfComplexType.[type=int].field2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=MapType].[type=map].[type=union].mapOfNullableString", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("union")))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=string].mapOfNullableString", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=MapType].[type=map].[type=union].mapOfNullableComplexType", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("union")))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=ComplexTypeNullable].mapOfNullableComplexType", + "ComplexTypeNullable", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=ComplexTypeNullable].mapOfNullableComplexType.[type=string].field1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=ComplexTypeNullable].mapOfNullableComplexType.[type=int].field2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=MapType].[type=map].[type=array].mapOfArray", + "array(string)", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("string"))))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=MapType].[type=map].[type=map].mapOfMap", + "map", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("int")))); + 
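[Editor's aside] Before the mapOfUnion assertions that follow: the reason a single map field fans out into several assertions is visible with nothing but the stock Avro API -- the map's value schema is itself a union, and the expected field paths around this point show the converter emitting the parent map-of-union field plus one child field per non-null member. A small probe, using an inlined copy of the mapOfUnion declaration from complex_maps.avsc:

    import java.util.List;
    import org.apache.avro.Schema;

    public class MapUnionProbe {
      public static void main(String[] args) {
        Schema record =
            new Schema.Parser()
                .parse(
                    "{\"type\":\"record\",\"name\":\"MapType\",\"fields\":["
                        + "{\"name\":\"mapOfUnion\",\"type\":{\"type\":\"map\","
                        + "\"values\":[\"null\",\"string\",\"int\"]}}]}");

        Schema mapSchema = record.getField("mapOfUnion").schema();  // a MAP schema
        List<Schema> members = mapSchema.getValueType().getTypes(); // the value union's members

        // Prints NULL, STRING, INT: one parent field plus a child per non-null member
        // matches the trio of mapOfUnion assertions in this test.
        members.forEach(member -> System.out.println(member.getType()));
      }
    }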
assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=MapType].[type=map].[type=union].mapOfUnion", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("union")))); + assertSchemaField( + schema.getFields().get(13), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=string].mapOfUnion", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(14), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=int].mapOfUnion", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + } + + @Test(groups = "basic") + void testComplexArrays() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("complex_arrays.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 16); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=ArrayType].[type=array].arrayOfString", + "array(string)", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("string"))))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=ArrayType].[type=array].[type=map].arrayOfMap", + "map", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("string")))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=ArrayType].[type=array].[type=ComplexType].arrayOfRecord", + "ComplexType", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("ComplexType"))))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=ArrayType].[type=array].[type=ComplexType].arrayOfRecord.[type=string].field1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=ArrayType].[type=array].[type=ComplexType].arrayOfRecord.[type=int].field2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=ArrayType].[type=array].[type=array].arrayOfArray", + "array(string)", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("string"))))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=ArrayType].[type=array].[type=union].arrayOfUnion", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=string].arrayOfUnion", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=int].arrayOfUnion", + "int", + false, + false, + new 
SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=boolean].arrayOfUnion", + "boolean", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType()))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=ArrayType].[type=array].[type=union].arrayOfNullableString", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=string].arrayOfNullableString", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=ArrayType].[type=array].[type=union].arrayOfNullableRecord", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(13), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=ComplexTypeNullable].arrayOfNullableRecord", + "ComplexTypeNullable", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(14), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=ComplexTypeNullable].arrayOfNullableRecord.[type=string].field1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(15), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=ComplexTypeNullable].arrayOfNullableRecord.[type=int].field2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + } + + @Test(groups = "basic") + void testComplexStructs() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("complex_structs.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 13); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField", + "ComplexStruct", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=string].fieldString", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=int].fieldInt", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=boolean].fieldBoolean", + "boolean", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType()))); + assertSchemaField( + schema.getFields().get(4), + 
"[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=map].fieldMap", + "map", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("string")))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=NestedRecord].fieldRecord", + "NestedRecord", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=NestedRecord].fieldRecord.[type=string].nestedField1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=NestedRecord].fieldRecord.[type=int].nestedField2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=array].fieldArray", + "array(string)", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("string"))))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=union].fieldUnion", + "union", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType().setNestedTypes(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=union].[type=string].fieldUnion", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=union].[type=int].fieldUnion", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=map].fieldNullableMap", + "map", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("string")))); + } + + @Test(groups = "basic") + void testComplexUnions() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("complex_unions.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 14); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=UnionType].[type=union].fieldUnionNullablePrimitives", + "union", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType().setNestedTypes(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=UnionType].[type=union].[type=string].fieldUnionNullablePrimitives", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(2), + 
"[version=2.0].[type=UnionType].[type=union].[type=int].fieldUnionNullablePrimitives", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=UnionType].[type=union].[type=boolean].fieldUnionNullablePrimitives", + "boolean", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=UnionType].[type=union].fieldUnionComplexTypes", + "union", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType().setNestedTypes(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=UnionType].[type=union].[type=NestedRecord].fieldUnionComplexTypes", + "NestedRecord", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=UnionType].[type=union].[type=NestedRecord].fieldUnionComplexTypes.[type=string].nestedField1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=UnionType].[type=union].[type=NestedRecord].fieldUnionComplexTypes.[type=int].nestedField2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=UnionType].[type=union].[type=map].fieldUnionComplexTypes", + "map", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("string")))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=UnionType].[type=union].fieldUnionPrimitiveAndComplex", + "union", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType().setNestedTypes(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=UnionType].[type=union].[type=string].fieldUnionPrimitiveAndComplex", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=UnionType].[type=union].[type=ComplexTypeRecord].fieldUnionPrimitiveAndComplex", + "ComplexTypeRecord", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=UnionType].[type=union].[type=ComplexTypeRecord].fieldUnionPrimitiveAndComplex.[type=string].complexField1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(13), + "[version=2.0].[type=UnionType].[type=union].[type=ComplexTypeRecord].fieldUnionPrimitiveAndComplex.[type=int].complexField2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + } + + @Test(groups = "basic") + void testLogicalTypes() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("logical_types.avsc"), false, false, dataPlatformUrn, null); 
+ + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 9); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=LogicalTypes].[type=bytes].decimalField", + "bytes(decimal)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())), + "{\"scale\":2,\"logicalType\":\"decimal\",\"precision\":9}"); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=LogicalTypes].[type=bytes].decimalFieldWithoutScale", + "bytes(decimal)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())), + "{\"logicalType\":\"decimal\",\"precision\":9}"); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=LogicalTypes].[type=bytes].decimalFieldWithoutPrecisionAndScale", + "bytes", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BytesType())), + "{\"logicalType\":\"decimal\"}"); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=LogicalTypes].[type=long].timestampMillisField", + "long(timestamp-millis)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"timestamp-millis\"}"); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=LogicalTypes].[type=long].timestampMicrosField", + "long(timestamp-micros)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"timestamp-micros\"}"); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=LogicalTypes].[type=int].dateField", + "int(date)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new DateType())), + "{\"logicalType\":\"date\"}"); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=LogicalTypes].[type=int].timeMillisField", + "int(time-millis)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"time-millis\"}"); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=LogicalTypes].[type=long].timeMicrosField", + "long(time-micros)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"time-micros\"}"); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=LogicalTypes].[type=string].uuidField", + "string(uuid)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())), + "{\"logicalType\":\"uuid\"}"); + } + + @Test(groups = "basic") + void testUsersRecord() throws IOException { + // this is a test case got during the Hudi integration + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("users_record.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 20); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=users_record].[type=string]._hoodie_commit_time", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=users_record].[type=string]._hoodie_commit_seqno", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new 
StringType()))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=users_record].[type=string]._hoodie_record_key", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=users_record].[type=string]._hoodie_partition_path", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=users_record].[type=string]._hoodie_file_name", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=users_record].[type=string].user_id", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=users_record].[type=string].name", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=users_record].[type=address].address", + "address", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=users_record].[type=address].address.[type=string].street", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=users_record].[type=address].address.[type=string].city", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=users_record].[type=address].address.[type=string].country", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=users_record].[type=address].address.[type=string].postal_code", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=users_record].[type=address].address.[type=long].created_at", + "long(timestamp-micros)", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"timestamp-micros\"}"); + assertSchemaField( + schema.getFields().get(13), + "[version=2.0].[type=users_record].[type=contact].contact", + "contact", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(14), + "[version=2.0].[type=users_record].[type=contact].contact.[type=string].email", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(15), + "[version=2.0].[type=users_record].[type=contact].contact.[type=string].phone", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(16), + 
"[version=2.0].[type=users_record].[type=long].created_at", + "long(timestamp-micros)", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"timestamp-micros\"}"); + assertSchemaField( + schema.getFields().get(17), + "[version=2.0].[type=users_record].[type=long].updated_at", + "long(timestamp-micros)", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"timestamp-micros\"}"); + assertSchemaField( + schema.getFields().get(18), + "[version=2.0].[type=users_record].[type=map].[type=int].props", + "int", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(19), + "[version=2.0].[type=users_record].[type=string].country", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + } + + private void assertSchemaField( + SchemaField field, + String expectedPath, + String expectedNativeType, + boolean expectedNullable, + boolean expectedIsPartOfKey, + SchemaFieldDataType expectedType) { + assertSchemaField( + field, + expectedPath, + expectedNativeType, + expectedNullable, + expectedIsPartOfKey, + expectedType, + null); + } + + private void assertSchemaField( + SchemaField field, + String expectedPath, + String expectedNativeType, + boolean expectedNullable, + boolean expectedIsPartOfKey, + SchemaFieldDataType expectedType, + String expectedJsonProps) { + assertEquals(field.getFieldPath(), expectedPath); + assertEquals(field.getNativeDataType(), expectedNativeType); + assertEquals(field.isNullable(), expectedNullable); + assertEquals(field.isIsPartOfKey(), expectedIsPartOfKey); + assertEquals(field.getType(), expectedType); + if (expectedJsonProps != null) { + assertEquals(field.getJsonProps(), expectedJsonProps); + } + } + + private Schema readAvroSchema(String schemaFileName) throws IOException { + String schemaPath = getClass().getClassLoader().getResource(schemaFileName).getPath(); + File schemaFile = new File(schemaPath); + return new Schema.Parser().parse(schemaFile); + } +} diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc deleted file mode 100644 index 81f8b0e54b11e0..00000000000000 --- a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc +++ /dev/null @@ -1,456 +0,0 @@ -{ - "type": "record", - "name": "CustomerProfile", - "namespace": "com.example.customer", - "doc": "A complex customer profile schema demonstrating various union types and optional fields", - "fields": [ - { - "name": "customerId", - "type": { - "type": "string", - "logicalType": "uuid" - }, - "doc": "Unique identifier for the customer" - }, - { - "name": "identificationDocument", - "type": [ - "null", - { - "type": "record", - "name": "Passport", - "fields": [ - { - "name": "passportNumber", - "type": "string" - }, - { - "name": "expiryDate", - "type": { - "type": "long", - "logicalType": "date" - } - } - ] - }, - { - "type": "record", - "name": "DriversLicense", - "fields": [ - { - "name": "licenseNumber", - "type": "string" - }, - { - "name": "state", - "type": "string" - }, - { - "name": "validUntil", - "type": { - "type": "long", - "logicalType": "date" - } - } - ] - }, - { - "type": "record", - "name": "NationalID", - "fields": [ - { - 
"name": "idNumber", - "type": "string" - }, - { - "name": "country", - "type": "string" - } - ] - } - ], - "default": null, - "doc": "Customer's identification document - can be passport, driver's license, or national ID" - }, - { - "name": "contactInfo", - "type": { - "type": "record", - "name": "ContactInformation", - "fields": [ - { - "name": "primaryContact", - "type": [ - { - "type": "record", - "name": "EmailContact", - "fields": [ - { - "name": "emailAddress", - "type": "string" - }, - { - "name": "isVerified", - "type": "boolean", - "default": false - } - ] - }, - { - "type": "record", - "name": "PhoneContact", - "fields": [ - { - "name": "countryCode", - "type": "string" - }, - { - "name": "number", - "type": "string" - }, - { - "name": "type", - "type": { - "type": "enum", - "name": "PhoneType", - "symbols": [ - "MOBILE", - "LANDLINE" - ] - } - } - ] - } - ], - "doc": "Primary contact method - either email or phone" - }, - { - "name": "alternativeContacts", - "type": { - "type": "array", - "items": [ - "null", - "EmailContact", - "PhoneContact" - ] - }, - "default": [], - "doc": "List of alternative contact methods" - } - ] - } - }, - { - "name": "addresses", - "type": { - "type": "array", - "items": { - "type": "record", - "name": "Address", - "fields": [ - { - "name": "type", - "type": { - "type": "enum", - "name": "AddressType", - "symbols": [ - "RESIDENTIAL", - "BUSINESS", - "SHIPPING" - ] - }, - "default": "RESIDENTIAL" - }, - { - "name": "street", - "type": "string" - }, - { - "name": "city", - "type": "string" - }, - { - "name": "state", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "country", - "type": "string" - }, - { - "name": "postalCode", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "validationStatus", - "type": [ - "null", - { - "type": "record", - "name": "AddressValidation", - "fields": [ - { - "name": "isValid", - "type": "boolean" - }, - { - "name": "verificationDate", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - } - }, - { - "name": "verificationMethod", - "type": { - "type": "enum", - "name": "VerificationMethod", - "symbols": [ - "MANUAL", - "AUTOMATED" - ] - } - } - ] - } - ], - "default": null - } - ] - } - }, - "doc": "Customer's addresses with validation information" - }, - { - "name": "preferences", - "type": { - "type": "map", - "values": [ - "null", - "string", - "boolean", - { - "type": "record", - "name": "FrequencyPreference", - "fields": [ - { - "name": "frequency", - "type": { - "type": "enum", - "name": "Frequency", - "symbols": [ - "DAILY", - "WEEKLY", - "MONTHLY" - ] - } - }, - { - "name": "enabled", - "type": "boolean", - "default": true - }, - { - "name": "lastUpdated", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - } - } - ] - } - ] - }, - "doc": "Customer preferences with various possible value types" - }, - { - "name": "subscriptionHistory", - "type": [ - "null", - { - "type": "array", - "items": { - "type": "record", - "name": "Subscription", - "fields": [ - { - "name": "planName", - "type": "string" - }, - { - "name": "startDate", - "type": { - "type": "long", - "logicalType": "date" - } - }, - { - "name": "endDate", - "type": [ - "null", - { - "type": "long", - "logicalType": "date" - } - ], - "default": null - }, - { - "name": "status", - "type": { - "type": "enum", - "name": "SubscriptionStatus", - "symbols": [ - "ACTIVE", - "CANCELLED", - "EXPIRED", - "SUSPENDED" - ] - } - }, - { - "name": "paymentMethod", - "type": [ - "null", - { 
- "type": "record", - "name": "PaymentMethod", - "fields": [ - { - "name": "type", - "type": { - "type": "enum", - "name": "PaymentType", - "symbols": [ - "CREDIT_CARD", - "DEBIT_CARD", - "BANK_TRANSFER", - "DIGITAL_WALLET" - ] - } - }, - { - "name": "lastFourDigits", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "expiryDate", - "type": [ - "null", - { - "type": "long", - "logicalType": "date" - } - ], - "default": null - } - ] - } - ], - "default": null - } - ] - } - } - ], - "default": null, - "doc": "Historical record of customer subscriptions" - }, - { - "name": "metadata", - "type": { - "type": "map", - "values": [ - "null", - "string", - "long", - "boolean", - { - "type": "record", - "name": "MetadataValue", - "fields": [ - { - "name": "value", - "type": [ - "null", - "string", - "long", - "boolean" - ], - "default": null - }, - { - "name": "timestamp", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - } - }, - { - "name": "source", - "type": "string" - } - ] - } - ] - }, - "doc": "Flexible metadata storage with various possible value types" - }, - { - "name": "tags", - "type": [ - "null", - { - "type": "array", - "items": { - "type": "record", - "name": "Tag", - "fields": [ - { - "name": "name", - "type": "string" - }, - { - "name": "value", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "score", - "type": [ - "null", - "double" - ], - "default": null - }, - { - "name": "addedAt", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - } - } - ] - } - } - ], - "default": null, - "doc": "Optional tags associated with the customer profile" - } - ] -} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc deleted file mode 100644 index b8c7654ea072a2..00000000000000 --- a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc +++ /dev/null @@ -1,244 +0,0 @@ -{ - "type": "record", - "name": "CustomerProfile2", - "namespace": "com.example.customer", - "doc": "A complex customer profile schema demonstrating various union types and optional fields", - "fields": [ - { - "name": "customerId", - "type": { - "type": "string", - "logicalType": "uuid" - }, - "doc": "Unique identifier for the customer" - }, - { - "name": "identificationDocument", - "type": [ - "null", - { - "type": "record", - "name": "Passport", - "fields": [ - { - "name": "passportNumber", - "type": "string" - }, - { - "name": "expiryDate", - "type": { - "type": "long", - "logicalType": "date" - } - } - ] - }, - { - "type": "record", - "name": "DriversLicense", - "fields": [ - { - "name": "licenseNumber", - "type": "string" - }, - { - "name": "state", - "type": "string" - }, - { - "name": "validUntil", - "type": { - "type": "long", - "logicalType": "date" - } - } - ] - }, - { - "type": "record", - "name": "NationalID", - "fields": [ - { - "name": "idNumber", - "type": "string" - }, - { - "name": "country", - "type": "string" - } - ] - } - ], - "default": null, - "doc": "Customer's identification document" - }, - { - "name": "contactInfo", - "type": { - "type": "record", - "name": "ContactInformation", - "fields": [ - { - "name": "primaryEmailContact", - "type": [ - "null", - { - "type": "record", - "name": "PrimaryEmailContact", - "fields": [ - { - "name": "emailAddress", - "type": "string" - }, - { - "name": "isVerified", - "type": "boolean", 
- "default": false - } - ] - } - ], - "default": null - }, - { - "name": "primaryPhoneContact", - "type": [ - "null", - { - "type": "record", - "name": "PrimaryPhoneContact", - "fields": [ - { - "name": "countryCode", - "type": "string" - }, - { - "name": "number", - "type": "string" - }, - { - "name": "type", - "type": { - "type": "enum", - "name": "PhoneType", - "symbols": [ - "MOBILE", - "LANDLINE" - ] - } - } - ] - } - ], - "default": null - }, - { - "name": "alternativeEmailContacts", - "type": { - "type": "array", - "items": { - "type": "record", - "name": "AlternativeEmailContact", - "fields": [ - { - "name": "emailAddress", - "type": "string" - }, - { - "name": "isVerified", - "type": "boolean", - "default": false - } - ] - } - }, - "default": [] - }, - { - "name": "alternativePhoneContacts", - "type": { - "type": "array", - "items": { - "type": "record", - "name": "AlternativePhoneContact", - "fields": [ - { - "name": "countryCode", - "type": "string" - }, - { - "name": "number", - "type": "string" - }, - { - "name": "type", - "type": "PhoneType" - } - ] - } - }, - "default": [] - } - ] - } - }, - { - "name": "preferences", - "type": { - "type": "record", - "name": "Preferences", - "fields": [ - { - "name": "simplePreferences", - "type": { - "type": "map", - "values": [ - "null", - "string", - "boolean" - ] - }, - "default": {} - }, - { - "name": "frequencyPreferences", - "type": { - "type": "map", - "values": { - "type": "record", - "name": "FrequencyPreference", - "fields": [ - { - "name": "frequency", - "type": { - "type": "enum", - "name": "Frequency", - "symbols": [ - "DAILY", - "WEEKLY", - "MONTHLY" - ] - } - }, - { - "name": "enabled", - "type": "boolean", - "default": true - }, - { - "name": "lastUpdated", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - } - } - ] - } - }, - "default": {} - } - ] - } - } - ] -} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc deleted file mode 100644 index c796878c32ae41..00000000000000 --- a/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc +++ /dev/null @@ -1,45 +0,0 @@ -{ - "type": "record", - "name": "FlatUser", - "namespace": "com.example", - "fields": [ - { - "name": "id", - "type": "int", - "doc": "The unique identifier for a user", - "default": -1, - "metadata": { - "key1": "value1", - "key2": "value2" - } - }, - { - "name": "username", - "type": "string", - "doc": "The username of the user" - }, - { - "name": "email", - "type": "string", - "doc": "The email of the user" - }, - { - "name": "age", - "type": "int", - "doc": "The age of the user" - }, - { - "name": "isActive", - "type": "boolean", - "doc": "Whether the user is active or not" - }, - { - "name": "registrationDate", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - }, - "doc": "The registration date of the user" - } - ] -} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_arrays.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_arrays.avsc new file mode 100644 index 00000000000000..8e8bcdaa0a7dce --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_arrays.avsc @@ -0,0 +1,87 @@ +{ + "type": "record", + "name": "ArrayType", + "fields": [ + { + "name": "arrayOfString", + "type": { + "type": "array", + "items": 
"string" + } + }, + { + "name": "arrayOfMap", + "type": { + "type": "array", + "items": { + "type": "map", + "values": "string" + } + } + }, + { + "name": "arrayOfRecord", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "ComplexType", + "fields": [ + { + "name": "field1", + "type": "string" + }, + { + "name": "field2", + "type": "int" + } + ] + } + } + }, + { + "name": "arrayOfArray", + "type": { + "type": "array", + "items": { + "type": "array", + "items": "string" + } + } + }, + { + "name": "arrayOfUnion", + "type": { + "type": "array", + "items": ["string", "int", "boolean"] + } + }, + { + "name": "arrayOfNullableString", + "type": { + "type": "array", + "items": ["null", "string"] + } + }, + { + "name": "arrayOfNullableRecord", + "type": { + "type": "array", + "items": ["null", { + "type": "record", + "name": "ComplexTypeNullable", + "fields": [ + { + "name": "field1", + "type": "string" + }, + { + "name": "field2", + "type": "int" + } + ] + }] + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_maps.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_maps.avsc new file mode 100644 index 00000000000000..baedae1b9dcc15 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_maps.avsc @@ -0,0 +1,87 @@ +{ + "type": "record", + "name": "MapType", + "fields": [ + { + "name": "mapOfString", + "type": { + "type": "map", + "values": "string" + } + }, + { + "name": "mapOfComplexType", + "type": { + "type": "map", + "values": { + "type": "record", + "name": "ComplexType", + "fields": [ + { + "name": "field1", + "type": "string" + }, + { + "name": "field2", + "type": "int" + } + ] + } + } + }, + { + "name": "mapOfNullableString", + "type": { + "type": "map", + "values": ["null", "string"] + } + }, + { + "name": "mapOfNullableComplexType", + "type": { + "type": "map", + "values": ["null", { + "type": "record", + "name": "ComplexTypeNullable", + "fields": [ + { + "name": "field1", + "type": "string" + }, + { + "name": "field2", + "type": "int" + } + ] + }] + } + }, + { + "name": "mapOfArray", + "type": { + "type": "map", + "values": { + "type": "array", + "items": "string" + } + } + }, + { + "name": "mapOfMap", + "type": { + "type": "map", + "values": { + "type": "map", + "values": "int" + } + } + }, + { + "name": "mapOfUnion", + "type": { + "type": "map", + "values": ["null", "string", "int"] + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_structs.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_structs.avsc new file mode 100644 index 00000000000000..7f5824192d3062 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_structs.avsc @@ -0,0 +1,76 @@ +{ + "type": "record", + "name": "StructType", + "fields": [ + { + "name": "structField", + "type": { + "type": "record", + "name": "ComplexStruct", + "fields": [ + { + "name": "fieldString", + "type": "string" + }, + { + "name": "fieldInt", + "type": "int" + }, + { + "name": "fieldBoolean", + "type": "boolean" + }, + { + "name": "fieldMap", + "type": { + "type": "map", + "values": "string" + } + }, + { + "name": "fieldRecord", + "type": { + "type": "record", + "name": "NestedRecord", + "fields": [ + { + "name": "nestedField1", + "type": "string" + }, + { + "name": "nestedField2", + "type": "int" + } + ] + } + }, + { + 
"name": "fieldArray", + "type": { + "type": "array", + "items": "string" + } + }, + { + "name": "fieldUnion", + "type": [ + "null", + "string", + "int" + ] + }, + { + "name": "fieldNullableMap", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ] + } + ] + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_unions.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_unions.avsc new file mode 100644 index 00000000000000..1a35f1cfa0e6d6 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_unions.avsc @@ -0,0 +1,60 @@ +{ + "type": "record", + "name": "UnionType", + "fields": [ + { + "name": "fieldUnionNullablePrimitives", + "type": [ + "null", + "string", + "int", + "boolean" + ] + }, + { + "name": "fieldUnionComplexTypes", + "type": [ + "null", + { + "type": "record", + "name": "NestedRecord", + "fields": [ + { + "name": "nestedField1", + "type": "string" + }, + { + "name": "nestedField2", + "type": "int" + } + ] + }, + { + "type": "map", + "values": "string" + } + ] + }, + { + "name": "fieldUnionPrimitiveAndComplex", + "type": [ + "null", + "string", + { + "type": "record", + "name": "ComplexTypeRecord", + "fields": [ + { + "name": "complexField1", + "type": "string" + }, + { + "name": "complexField2", + "type": "int" + } + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/logical_types.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/logical_types.avsc new file mode 100644 index 00000000000000..24919d82149653 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/logical_types.avsc @@ -0,0 +1,72 @@ +{ + "type": "record", + "name": "LogicalTypes", + "fields": [ + { + "name": "decimalField", + "type": { + "type": "bytes", + "logicalType": "decimal", + "precision": 9, + "scale": 2 + } + }, + { + "name": "decimalFieldWithoutScale", + "type": { + "type": "bytes", + "logicalType": "decimal", + "precision": 9 + } + }, + { + "name": "decimalFieldWithoutPrecisionAndScale", + "type": { + "type": "bytes", + "logicalType": "decimal" + } + }, + { + "name": "timestampMillisField", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + }, + { + "name": "timestampMicrosField", + "type": { + "type": "long", + "logicalType": "timestamp-micros" + } + }, + { + "name": "dateField", + "type": { + "type": "int", + "logicalType": "date" + } + }, + { + "name": "timeMillisField", + "type": { + "type": "int", + "logicalType": "time-millis" + } + }, + { + "name": "timeMicrosField", + "type": { + "type": "long", + "logicalType": "time-micros" + } + }, + { + "name": "uuidField", + "type": { + "type": "string", + "logicalType": "uuid" + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/primitive_types.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/primitive_types.avsc new file mode 100644 index 00000000000000..c618299748fab1 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/primitive_types.avsc @@ -0,0 +1,62 @@ +{ + "type": "record", + "name": "PrimitiveType", + "fields": [ + { + "name": "intField", + "type": "int" + }, + { + "name": "intFieldV2", + "type": ["int"] + }, + { + "name": "nullField", + "type": "null" + }, + { + "name": "nullFieldV2", + "type": 
["null"] + }, + { + "name": "longField", + "type": "long" + }, + { + "name": "floatField", + "type": "float" + }, + { + "name": "doubleField", + "type": "double" + }, + { + "name": "stringField", + "type": "string" + }, + { + "name": "booleanField", + "type": "boolean" + }, + { + "name": "nullableIntField", + "type": ["null", "int"] + }, + { + "name": "nullableLongField", + "type": ["null", "long"] + }, + { + "name": "nullableStringField", + "type": ["null", "string"] + }, + { + "name": "status", + "type": { + "type": "enum", + "name": "StatusEnum", + "symbols": ["ACTIVE", "INACTIVE", "PENDING"] + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/users_record.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/users_record.avsc new file mode 100644 index 00000000000000..bd46ae715a4810 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/users_record.avsc @@ -0,0 +1,195 @@ +{ + "type": "record", + "name": "users_record", + "namespace": "hoodie.users", + "fields": [ + { + "name": "_hoodie_commit_time", + "type": [ + "null", + "string" + ], + "doc": "", + "default": null + }, + { + "name": "_hoodie_commit_seqno", + "type": [ + "null", + "string" + ], + "doc": "", + "default": null + }, + { + "name": "_hoodie_record_key", + "type": [ + "null", + "string" + ], + "doc": "", + "default": null + }, + { + "name": "_hoodie_partition_path", + "type": [ + "null", + "string" + ], + "doc": "", + "default": null + }, + { + "name": "_hoodie_file_name", + "type": [ + "null", + "string" + ], + "doc": "", + "default": null + }, + { + "name": "user_id", + "type": "string" + }, + { + "name": "name", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "address", + "type": [ + "null", + { + "type": "record", + "name": "address", + "namespace": "hoodie.users.users_record", + "fields": [ + { + "name": "street", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "city", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "country", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "postal_code", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "created_at", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "contact", + "type": [ + "null", + { + "type": "record", + "name": "contact", + "namespace": "hoodie.users.users_record", + "fields": [ + { + "name": "email", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "phone", + "type": [ + "null", + "string" + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "created_at", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + } + ], + "default": null + }, + { + "name": "updated_at", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + } + ], + "default": null + }, + { + "name": "props", + "type": [ + "null", + { + "type": "map", + "values": [ + "null", + "int" + ] + } + ], + "default": null + }, + { + "name": "country", + "type": [ + "null", + "string" + ], + "default": null + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/openlineage-converter/build.gradle b/metadata-integration/java/openlineage-converter/build.gradle index 1bf4a3c0fadb6a..301d1e6dffdd84 100644 --- 
a/metadata-integration/java/openlineage-converter/build.gradle +++ b/metadata-integration/java/openlineage-converter/build.gradle @@ -1,5 +1,5 @@ apply plugin: 'java-library' -apply plugin: 'com.github.johnrengelman.shadow' +apply plugin: 'com.gradleup.shadow' apply plugin: 'signing' apply plugin: 'maven-publish' apply from: '../../../gradle/coverage/java-coverage.gradle' diff --git a/metadata-integration/java/spark-lineage-legacy/build.gradle b/metadata-integration/java/spark-lineage-legacy/build.gradle index d33290c16b3a56..f0281a685368ef 100644 --- a/metadata-integration/java/spark-lineage-legacy/build.gradle +++ b/metadata-integration/java/spark-lineage-legacy/build.gradle @@ -2,7 +2,7 @@ plugins { id("com.palantir.git-version") apply false } apply plugin: 'java' -apply plugin: 'com.github.johnrengelman.shadow' +apply plugin: 'com.gradleup.shadow' apply plugin: 'signing' apply plugin: 'io.codearte.nexus-staging' apply plugin: 'maven-publish' diff --git a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java index a4b2e991b6e1e0..99eadd223acd1a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java @@ -14,6 +14,7 @@ import com.linkedin.data.template.RecordTemplate; import com.linkedin.data.template.SetMode; import com.linkedin.data.template.StringArray; +import com.linkedin.data.template.StringMap; import com.linkedin.dataplatform.DataPlatformInfo; import com.linkedin.entity.EntityResponse; import com.linkedin.events.metadata.ChangeType; @@ -21,6 +22,7 @@ import com.linkedin.metadata.aspect.batch.AspectsBatch; import com.linkedin.metadata.aspect.batch.BatchItem; import com.linkedin.metadata.aspect.batch.MCPItem; +import com.linkedin.metadata.aspect.validation.CreateIfNotExistsValidator; import com.linkedin.metadata.entity.EntityApiUtils; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.ebean.batch.AspectsBatchImpl; @@ -98,7 +100,8 @@ public static List getAdditionalChanges( .filter(item -> SUPPORTED_TYPES.contains(item.getChangeType())) .collect(Collectors.groupingBy(BatchItem::getUrn)); - Set urnsWithExistingKeyAspects = entityService.exists(opContext, itemsByUrn.keySet()); + Set urnsWithExistingKeyAspects = + entityService.exists(opContext, itemsByUrn.keySet(), true, true); // create default aspects when key aspect is missing return itemsByUrn.entrySet().stream() @@ -126,7 +129,7 @@ public static List getAdditionalChanges( // pick the first item as a template (use entity information) MCPItem templateItem = aspectsEntry.getValue().get(0); - // generate default aspects (including key aspect, always upserts) + // generate default aspects (including key aspect) return defaultAspects.stream() .map( entry -> @@ -215,7 +218,7 @@ private static List> generateDefaultAspectsIfMissin if (!fetchAspects.isEmpty()) { Set latestAspects = - entityService.getLatestAspectsForUrn(opContext, urn, fetchAspects).keySet(); + entityService.getLatestAspectsForUrn(opContext, urn, fetchAspects, true).keySet(); return fetchAspects.stream() .filter(aspectName -> !latestAspects.contains(aspectName)) @@ -347,6 +350,11 @@ public static MetadataChangeProposal getProposalFromAspectForDefault( proposal.setAspectName(aspectName); // already checked existence, default aspects should be changeType CREATE 
proposal.setChangeType(ChangeType.CREATE); + proposal.setHeaders( + new StringMap( + Map.of( + CreateIfNotExistsValidator.FILTER_EXCEPTION_HEADER, + CreateIfNotExistsValidator.FILTER_EXCEPTION_VALUE))); // Set fields determined from original if (templateItem.getSystemMetadata() != null) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java index 3f0545b6f94a85..7a8c5c76c31c3a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java @@ -43,7 +43,7 @@ EntityAspect getAspect( @Nonnull Map batchGet( - @Nonnull final Set keys); + @Nonnull final Set keys, boolean forUpdate); @Nonnull List getAspectsInRange( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index 9a05f54cf04c29..6de7784bfbc0ec 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -238,7 +238,7 @@ public Map> getLatestAspects( boolean alwaysIncludeKeyAspect) { Map batchGetResults = - getLatestAspect(opContext, urns, aspectNames); + getLatestAspect(opContext, urns, aspectNames, false); // Fetch from db and populate urn -> aspect map. final Map> urnToAspects = new HashMap<>(); @@ -285,9 +285,10 @@ public Map> getLatestAspects( public Map getLatestAspectsForUrn( @Nonnull OperationContext opContext, @Nonnull final Urn urn, - @Nonnull final Set aspectNames) { + @Nonnull final Set aspectNames, + boolean forUpdate) { Map batchGetResults = - getLatestAspect(opContext, new HashSet<>(Arrays.asList(urn)), aspectNames); + getLatestAspect(opContext, new HashSet<>(Arrays.asList(urn)), aspectNames, forUpdate); return EntityUtils.toSystemAspects( opContext.getRetrieverContext().get(), batchGetResults.values()) @@ -868,7 +869,12 @@ private List ingestAspectsToLocalDB( // Read before write is unfortunate, however batch it final Map> urnAspects = batchWithDefaults.getUrnAspectsMap(); + // read #1 + // READ COMMITED is used in conjunction with SELECT FOR UPDATE (read lock) in order + // to ensure that the aspect's version is not modified outside the transaction. 
+ // We rely on the retry mechanism if the row is modified and will re-read (require the + // lock) Map> databaseAspects = aspectDao.getLatestAspects(urnAspects, true); @@ -936,19 +942,29 @@ private List ingestAspectsToLocalDB( // do final pre-commit checks with previous aspect value ValidationExceptionCollection exceptions = AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext().get()); - if (!exceptions.isEmpty()) { - MetricUtils.counter(EntityServiceImpl.class, "batch_validation_exception").inc(); - throw new ValidationException(collectMetrics(exceptions).toString()); + + if (exceptions.hasFatalExceptions()) { + // IF this is a client request/API request we fail the `transaction batch` + if (opContext.getRequestContext() != null) { + MetricUtils.counter(EntityServiceImpl.class, "batch_request_validation_exception") + .inc(); + throw new ValidationException(collectMetrics(exceptions).toString()); + } + + MetricUtils.counter(EntityServiceImpl.class, "batch_consumer_validation_exception") + .inc(); + log.error("mce-consumer batch exceptions: {}", collectMetrics(exceptions)); } - // Database Upsert results + // Database Upsert successfully validated results log.info( "Ingesting aspects batch to database: {}", AspectsBatch.toAbbreviatedString(changeMCPs, 2048)); Timer.Context ingestToLocalDBTimer = MetricUtils.timer(this.getClass(), "ingestAspectsToLocalDB").time(); List upsertResults = - changeMCPs.stream() + exceptions + .streamSuccessful(changeMCPs.stream()) .map( writeItem -> { @@ -1498,7 +1514,7 @@ public List restoreIndices( List systemAspects = EntityUtils.toSystemAspects( opContext.getRetrieverContext().get(), - getLatestAspect(opContext, entityBatch.getValue(), aspectNames).values()); + getLatestAspect(opContext, entityBatch.getValue(), aspectNames, false).values()); long timeSqlQueryMs = System.currentTimeMillis() - startTime; RestoreIndicesResult result = restoreIndices(opContext, systemAspects, s -> {}); @@ -2168,7 +2184,8 @@ public Set exists( @Nonnull OperationContext opContext, @Nonnull final Collection urns, @Nullable String aspectName, - boolean includeSoftDeleted) { + boolean includeSoftDeleted, + boolean forUpdate) { final Set dbKeys = urns.stream() .map( @@ -2184,11 +2201,11 @@ public Set exists( : aspectName, ASPECT_LATEST_VERSION)) .collect(Collectors.toSet()); - final Map aspects = aspectDao.batchGet(dbKeys); + final Map aspects = aspectDao.batchGet(dbKeys, forUpdate); final Set existingUrnStrings = aspects.values().stream() - .filter(aspect -> aspect != null) - .map(aspect -> aspect.getUrn()) + .filter(Objects::nonNull) + .map(EntityAspect::getUrn) .collect(Collectors.toSet()); Set existing = @@ -2444,7 +2461,8 @@ protected AuditStamp createSystemAuditStamp() { private Map getLatestAspect( @Nonnull OperationContext opContext, @Nonnull final Set urns, - @Nonnull final Set aspectNames) { + @Nonnull final Set aspectNames, + boolean forUpdate) { log.debug("Invoked getLatestAspects with urns: {}, aspectNames: {}", urns, aspectNames); @@ -2468,7 +2486,8 @@ private Map getLatestAspect( Map batchGetResults = new HashMap<>(); Iterators.partition(dbKeys.iterator(), MAX_KEYS_PER_QUERY) .forEachRemaining( - batch -> batchGetResults.putAll(aspectDao.batchGet(ImmutableSet.copyOf(batch)))); + batch -> + batchGetResults.putAll(aspectDao.batchGet(ImmutableSet.copyOf(batch), forUpdate))); return batchGetResults; } @@ -2487,7 +2506,7 @@ private long calculateVersionNumber( private Map getEnvelopedAspects( @Nonnull OperationContext opContext, final Set dbKeys) { - 
final Map dbEntries = aspectDao.batchGet(dbKeys); + final Map dbEntries = aspectDao.batchGet(dbKeys, false); List envelopedAspects = EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), dbEntries.values()); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java index a00482acda62e2..4d177d50ea44de 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java @@ -198,7 +198,7 @@ public void saveAspect( @Override @Nonnull public Map batchGet( - @Nonnull final Set keys) { + @Nonnull final Set keys, boolean forUpdate) { validateConnection(); return keys.stream() .map(this::getAspect) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java index 729d0e61cb2c00..bd6cc67561b883 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java @@ -68,7 +68,10 @@ @Slf4j public class EbeanAspectDao implements AspectDao, AspectMigrationsDao { - + // READ COMMITED is used in conjunction with SELECT FOR UPDATE (read lock) in order + // to ensure that the aspect's version is not modified outside the transaction. + // We rely on the retry mechanism if the row is modified and will re-read (require the lock) + public static final TxIsolation TX_ISOLATION = TxIsolation.READ_COMMITED; private final Database _server; private boolean _connectionValidated = false; private final Clock _clock = Clock.systemUTC(); @@ -329,7 +332,7 @@ public int deleteUrn(@Nullable TransactionContext txContext, @Nonnull final Stri @Override @Nonnull public Map batchGet( - @Nonnull final Set keys) { + @Nonnull final Set keys, boolean forUpdate) { validateConnection(); if (keys.isEmpty()) { return Collections.emptyMap(); @@ -341,9 +344,9 @@ public Map batchGet( .collect(Collectors.toSet()); final List records; if (_queryKeysCount == 0) { - records = batchGet(ebeanKeys, ebeanKeys.size()); + records = batchGet(ebeanKeys, ebeanKeys.size(), forUpdate); } else { - records = batchGet(ebeanKeys, _queryKeysCount); + records = batchGet(ebeanKeys, _queryKeysCount, forUpdate); } return records.stream() .collect( @@ -357,22 +360,23 @@ record -> record.getKey().toAspectIdentifier(), EbeanAspectV2::toEntityAspect)); * * @param keys a set of keys with urn, aspect and version * @param keysCount the max number of keys for each sub query + * @param forUpdate whether the operation is intending to write to this row in a tx */ @Nonnull private List batchGet( - @Nonnull final Set keys, final int keysCount) { + @Nonnull final Set keys, final int keysCount, boolean forUpdate) { validateConnection(); int position = 0; final int totalPageCount = QueryUtils.getTotalPageCount(keys.size(), keysCount); final List finalResult = - batchGetUnion(new ArrayList<>(keys), keysCount, position); + batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate); while (QueryUtils.hasMore(position, keysCount, totalPageCount)) { position += keysCount; final List oneStatementResult = - batchGetUnion(new ArrayList<>(keys), keysCount, position); + batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate); finalResult.addAll(oneStatementResult); } @@ 
-407,7 +411,10 @@ private String batchGetSelect( @Nonnull private List batchGetUnion( - @Nonnull final List keys, final int keysCount, final int position) { + @Nonnull final List keys, + final int keysCount, + final int position, + boolean forUpdate) { validateConnection(); // Build one SELECT per key and then UNION ALL the results. This can be much more performant @@ -439,6 +446,11 @@ private List batchGetUnion( } } + // Add FOR UPDATE clause only once at the end of the entire statement + if (forUpdate) { + sb.append(" FOR UPDATE"); + } + final RawSql rawSql = RawSqlBuilder.parse(sb.toString()) .columnMapping(EbeanAspectV2.URN_COLUMN, "key.urn") @@ -736,8 +748,7 @@ public T runInTransactionWithRetryUnlocked( T result = null; do { try (Transaction transaction = - _server.beginTransaction( - TxScope.requiresNew().setIsolation(TxIsolation.REPEATABLE_READ))) { + _server.beginTransaction(TxScope.requiresNew().setIsolation(TX_ISOLATION))) { transaction.setBatchMode(true); result = block.apply(transactionContext.tx(transaction)); transaction.commit(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/DeleteEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/DeleteEntityServiceTest.java index 0e8ee08e60739f..723cb7813769f4 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/DeleteEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/DeleteEntityServiceTest.java @@ -113,7 +113,7 @@ public void testDeleteUniqueRefGeneratesValidMCP() { dbValue.setCreatedOn(new Timestamp(auditStamp.getTime())); final Map dbEntries = Map.of(dbKey, dbValue); - Mockito.when(_aspectDao.batchGet(Mockito.any())).thenReturn(dbEntries); + Mockito.when(_aspectDao.batchGet(Mockito.any(), Mockito.anyBoolean())).thenReturn(dbEntries); RollbackResult result = new RollbackResult( diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java index aa42545fa0e46f..0386031cbcad86 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java @@ -2,6 +2,7 @@ import static com.linkedin.metadata.Constants.CORP_USER_ENTITY_NAME; import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; +import static com.linkedin.metadata.entity.ebean.EbeanAspectDao.TX_ISOLATION; import static org.mockito.Mockito.mock; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertNotNull; @@ -39,7 +40,6 @@ import io.ebean.Database; import io.ebean.Transaction; import io.ebean.TxScope; -import io.ebean.annotation.TxIsolation; import java.net.URISyntaxException; import java.sql.Timestamp; import java.time.Instant; @@ -281,12 +281,11 @@ public void testNestedTransactions() throws AssertionError { Database server = _aspectDao.getServer(); try (Transaction transaction = - server.beginTransaction(TxScope.requiresNew().setIsolation(TxIsolation.REPEATABLE_READ))) { + server.beginTransaction(TxScope.requiresNew().setIsolation(TX_ISOLATION))) { transaction.setBatchMode(true); // Work 1 try (Transaction transaction2 = - server.beginTransaction( - TxScope.requiresNew().setIsolation(TxIsolation.REPEATABLE_READ))) { + server.beginTransaction(TxScope.requiresNew().setIsolation(TX_ISOLATION))) { transaction2.setBatchMode(true); // Work 2 transaction2.commit(); @@ -337,7 +336,7 @@ public void 
testSystemMetadataDuplicateKey() throws Exception { try (Transaction transaction = ((EbeanAspectDao) _entityServiceImpl.aspectDao) .getServer() - .beginTransaction(TxScope.requiresNew().setIsolation(TxIsolation.REPEATABLE_READ))) { + .beginTransaction(TxScope.requiresNew().setIsolation(TX_ISOLATION))) { TransactionContext transactionContext = TransactionContext.empty(transaction, 3); _entityServiceImpl.aspectDao.saveAspect( transactionContext, @@ -417,7 +416,7 @@ public void multiThreadingTest() { List> testData = dataGenerator.generateMCPs("dataset", 25, aspects).collect(Collectors.toList()); - executeThreadingTest(opContext, _entityServiceImpl, testData, 15); + executeThreadingTest(userContext, _entityServiceImpl, testData, 15); // Expected aspects Set> generatedAspectIds = @@ -456,7 +455,9 @@ public void multiThreadingTest() { assertEquals( missing.size(), 0, - String.format("Expected all generated aspects to be inserted. Missing: %s", missing)); + String.format( + "Expected all generated aspects to be inserted. Missing Examples: %s", + missing.stream().limit(10).collect(Collectors.toSet()))); } /** @@ -473,7 +474,7 @@ public void singleThreadingTest() { List> testData = dataGenerator.generateMCPs("dataset", 25, aspects).collect(Collectors.toList()); - executeThreadingTest(opContext, _entityServiceImpl, testData, 1); + executeThreadingTest(userContext, _entityServiceImpl, testData, 1); // Expected aspects Set> generatedAspectIds = diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index 4c42815a80f3f1..2d59632e6f3c6d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -123,6 +123,8 @@ public abstract class EntityServiceTest latestAspects = _entityServiceImpl.getLatestAspectsForUrn( - opContext, entityUrn, new HashSet<>(Arrays.asList(aspectName1, aspectName2))); + opContext, entityUrn, new HashSet<>(Arrays.asList(aspectName1, aspectName2)), false); assertTrue(DataTemplateUtil.areEqual(writeAspect1, latestAspects.get(aspectName1))); assertTrue(DataTemplateUtil.areEqual(writeAspect2, latestAspects.get(aspectName2))); @@ -557,7 +559,7 @@ public void testReingestAspectsGetLatestAspects() throws Exception { Map latestAspects = _entityServiceImpl.getLatestAspectsForUrn( - opContext, entityUrn, new HashSet<>(List.of(aspectName1))); + opContext, entityUrn, new HashSet<>(List.of(aspectName1)), false); assertTrue(DataTemplateUtil.areEqual(writeAspect1, latestAspects.get(aspectName1))); verify(_mockProducer, times(1)) @@ -636,7 +638,7 @@ public void testReingestLineageAspect() throws Exception { Map latestAspects = _entityServiceImpl.getLatestAspectsForUrn( - opContext, entityUrn, new HashSet<>(List.of(aspectName1))); + opContext, entityUrn, new HashSet<>(List.of(aspectName1)), false); assertTrue(DataTemplateUtil.areEqual(upstreamLineage, latestAspects.get(aspectName1))); verify(_mockProducer, times(1)) @@ -709,7 +711,7 @@ public void testReingestLineageProposal() throws Exception { Map latestAspects = _entityServiceImpl.getLatestAspectsForUrn( - opContext, entityUrn, new HashSet<>(List.of(aspectName1))); + opContext, entityUrn, new HashSet<>(List.of(aspectName1)), false); assertTrue(DataTemplateUtil.areEqual(upstreamLineage, latestAspects.get(aspectName1))); verify(_mockProducer, times(1)) @@ -2156,7 +2158,7 @@ public void testCreateChangeTypeProposal() { 
ValidationException.class, () -> _entityServiceImpl.ingestProposal( - opContext, secondCreateProposal, TEST_AUDIT_STAMP, false)); + userContext, secondCreateProposal, TEST_AUDIT_STAMP, false)); } @Test diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java index 775770d28b4a2b..4915f897835966 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.entity.ebean; +import static com.linkedin.metadata.Constants.ASPECT_LATEST_VERSION; +import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME; import static org.mockito.Mockito.mock; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; @@ -9,6 +11,7 @@ import com.linkedin.metadata.EbeanTestUtils; import com.linkedin.metadata.aspect.batch.AspectsBatch; import com.linkedin.metadata.config.EbeanConfiguration; +import com.linkedin.metadata.entity.EntityAspectIdentifier; import io.ebean.Database; import io.ebean.test.LoggedSql; import java.util.List; @@ -73,4 +76,40 @@ public void testGetLatestAspectsForUpdate() throws JsonProcessingException { assertTrue( sql.get(0).contains("for update;"), String.format("Did not find `for update` in %s ", sql)); } + + @Test + public void testbatchGetForUpdate() throws JsonProcessingException { + LoggedSql.start(); + + testDao.runInTransactionWithRetryUnlocked( + (txContext) -> { + testDao.batchGet( + Set.of( + new EntityAspectIdentifier( + "urn:li:corpuser:testbatchGetForUpdate1", + DATA_PLATFORM_INSTANCE_ASPECT_NAME, + ASPECT_LATEST_VERSION), + new EntityAspectIdentifier( + "urn:li:corpuser:testbatchGetForUpdate2", + DATA_PLATFORM_INSTANCE_ASPECT_NAME, + ASPECT_LATEST_VERSION)), + true); + return ""; + }, + mock(AspectsBatch.class), + 0); + + // Get the captured SQL statements + List sql = + LoggedSql.stop().stream() + .filter( + str -> + str.contains("testbatchGetForUpdate1") + && str.contains("testbatchGetForUpdate2")) + .toList(); + assertEquals( + sql.size(), 1, String.format("Found: %s", new ObjectMapper().writeValueAsString(sql))); + assertTrue( + sql.get(0).contains("FOR UPDATE;"), String.format("Did not find `for update` in %s ", sql)); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeline/TimelineServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/timeline/TimelineServiceTest.java index e8154720a140db..2073f3f01ca903 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/timeline/TimelineServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeline/TimelineServiceTest.java @@ -99,7 +99,7 @@ public void testGetTimeline() throws Exception { Map latestAspects = _entityServiceImpl.getLatestAspectsForUrn( - opContext, entityUrn, new HashSet<>(Arrays.asList(aspectName))); + opContext, entityUrn, new HashSet<>(Arrays.asList(aspectName)), false); Set elements = new HashSet<>(); elements.add(ChangeCategory.TECHNICAL_SCHEMA); diff --git a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java index d5aa7e9c51983a..5e387d7d88292a 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java +++ 
b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java @@ -2,6 +2,7 @@ import static com.linkedin.metadata.Constants.*; import static io.datahubproject.test.search.config.SearchTestContainerConfiguration.REFRESH_INTERVAL_SECONDS; +import static org.mockito.ArgumentMatchers.anyBoolean; import static org.mockito.ArgumentMatchers.anySet; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -303,7 +304,7 @@ private EntityClient entityClientHelper( new ConcurrentMapCacheManager(), entitySearchService, 1, false); AspectDao mockAspectDao = mock(AspectDao.class); - when(mockAspectDao.batchGet(anySet())) + when(mockAspectDao.batchGet(anySet(), anyBoolean())) .thenAnswer( args -> { Set ids = args.getArgument(0); diff --git a/metadata-service/openapi-servlet/src/test/java/mock/MockEntityService.java b/metadata-service/openapi-servlet/src/test/java/mock/MockEntityService.java index b70b643b10f323..4073bff4a22f95 100644 --- a/metadata-service/openapi-servlet/src/test/java/mock/MockEntityService.java +++ b/metadata-service/openapi-servlet/src/test/java/mock/MockEntityService.java @@ -72,7 +72,10 @@ public Map> getLatestAspects( @Override public @NotNull Map getLatestAspectsForUrn( - @Nonnull OperationContext opContext, @Nonnull Urn urn, @Nonnull Set aspectNames) { + @Nonnull OperationContext opContext, + @Nonnull Urn urn, + @Nonnull Set aspectNames, + boolean forUpdate) { return Collections.emptyMap(); } diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java index 445724f0144e64..57af4aa05fff6f 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java @@ -41,13 +41,15 @@ public interface EntityService { * @param urns urns for the entities * @param aspectName aspect for the entity, if null, assumes key aspect * @param includeSoftDelete including soft deleted entities + * @param forUpdate whether the operation is intending to write to this row in a tx * @return set of urns with the specified aspect existing */ Set exists( @Nonnull OperationContext opContext, @Nonnull final Collection urns, @Nullable String aspectName, - boolean includeSoftDelete); + boolean includeSoftDelete, + boolean forUpdate); /** * Just whether the entity/aspect exists, prefer batched method. @@ -62,20 +64,37 @@ default boolean exists( @Nonnull Urn urn, @Nullable String aspectName, boolean includeSoftDelete) { - return exists(opContext, Set.of(urn), aspectName, includeSoftDelete).contains(urn); + return exists(opContext, Set.of(urn), aspectName, includeSoftDelete, false).contains(urn); } /** * Returns a set of urns of entities that exist (has materialized aspects). * * @param urns the list of urns of the entities to check + * @param includeSoftDelete including soft deleted entities * @return a set of urns of entities that exist. */ default Set exists( @Nonnull OperationContext opContext, @Nonnull final Collection urns, boolean includeSoftDelete) { - return exists(opContext, urns, null, includeSoftDelete); + return exists(opContext, urns, null, includeSoftDelete, false); + } + + /** + * Returns a set of urns of entities that exist (has materialized aspects). 
+ * + * @param urns the list of urns of the entities to check + * @param includeSoftDelete including soft deleted entities + * @param forUpdate whether the operation is intending to write to this row in a tx + * @return a set of urns of entities that exist. + */ + default Set exists( + @Nonnull OperationContext opContext, + @Nonnull final Collection urns, + boolean includeSoftDelete, + boolean forUpdate) { + return exists(opContext, urns, null, includeSoftDelete, forUpdate); } /** @@ -86,18 +105,33 @@ default Set exists( */ default Set exists( @Nonnull OperationContext opContext, @Nonnull final Collection urns) { - return exists(opContext, urns, true); + return exists(opContext, urns, true, false); } /** * Returns whether the urn of the entity exists (has materialized aspects). * * @param urn the urn of the entity to check + * @param includeSoftDelete including soft deleted entities * @return entities exists. */ default boolean exists( @Nonnull OperationContext opContext, @Nonnull Urn urn, boolean includeSoftDelete) { - return exists(opContext, List.of(urn), includeSoftDelete).contains(urn); + return exists(opContext, List.of(urn), includeSoftDelete, false).contains(urn); + } + + /** + * Returns whether the urn of the entity exists (has materialized aspects). + * + * @param urn the urn of the entity to check + * @return entities exists. + */ + default boolean exists( + @Nonnull OperationContext opContext, + @Nonnull Urn urn, + boolean includeSoftDelete, + boolean forUpdate) { + return exists(opContext, List.of(urn), includeSoftDelete, forUpdate).contains(urn); } /** @@ -107,7 +141,7 @@ default boolean exists( * @return entities exists. */ default boolean exists(@Nonnull OperationContext opContext, @Nonnull Urn urn) { - return exists(opContext, urn, true); + return exists(opContext, urn, true, false); } /** @@ -137,7 +171,8 @@ default Map> getLatestAspects( Map getLatestAspectsForUrn( @Nonnull OperationContext opContext, @Nonnull final Urn urn, - @Nonnull final Set aspectNames); + @Nonnull final Set aspectNames, + boolean forUpdate); /** * Retrieves an aspect having a specific {@link Urn}, name, & version. 
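
The Java changes above all serve one locking strategy: `AspectDao.batchGet`, `getLatestAspectsForUrn`, and the `exists` overloads now carry a `forUpdate` flag, `EbeanAspectDao` appends a single `FOR UPDATE` clause to the batched SELECT when that flag is set, and transactions drop from `REPEATABLE_READ` to `READ_COMMITED` isolation, relying on the existing retry loop when a row changes underneath a writer. Below is a minimal, illustrative Ebean sketch of that pattern; it is not the patched DataHub code, and the `metadata_aspect_v2` table/column names are assumptions about the underlying schema.

```java
// Illustrative only -- not the patched DataHub code. A minimal Ebean sketch of the
// read-lock pattern the changes above introduce: open a READ_COMMITED transaction,
// lock the latest aspect row with SELECT ... FOR UPDATE, then write before committing.
// Table/column names are assumptions based on DataHub's MySQL aspect schema.
import io.ebean.Database;
import io.ebean.SqlRow;
import io.ebean.Transaction;
import io.ebean.TxScope;
import io.ebean.annotation.TxIsolation;
import java.util.List;

public class ForUpdateSketch {

  public static void upsertWithRowLock(Database server, String urn, String aspect) {
    try (Transaction tx =
        server.beginTransaction(TxScope.requiresNew().setIsolation(TxIsolation.READ_COMMITED))) {
      // FOR UPDATE locks only the matched row(s); combined with READ_COMMITED this avoids the
      // gap/next-key locks REPEATABLE_READ would take, while still preventing another
      // transaction from bumping the aspect's version between this read and the write below.
      List<SqlRow> latest =
          server
              .sqlQuery(
                  "SELECT urn, aspect, version FROM metadata_aspect_v2"
                      + " WHERE urn = :urn AND aspect = :aspect AND version = 0 FOR UPDATE")
              .setParameter("urn", urn)
              .setParameter("aspect", aspect)
              .findList();

      // ... compute the next version from `latest` and insert/update the aspect here ...

      tx.commit();
      // If another writer touches the row, the caller's retry loop (the patch's
      // runInTransactionWithRetryUnlocked) re-runs the block and re-reads under the lock.
    }
  }
}
```

Row locks under READ COMMITTED sidestep InnoDB's gap/next-key locking, which is exactly the failure mode the MySQL gap-deadlock smoke tests added below are designed to exercise.
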
diff --git a/smoke-test/tests/database/__init__.py b/smoke-test/tests/database/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/smoke-test/tests/database/test_database.py b/smoke-test/tests/database/test_database.py new file mode 100644 index 00000000000000..656c9e98887bf0 --- /dev/null +++ b/smoke-test/tests/database/test_database.py @@ -0,0 +1,32 @@ +import logging + +import pytest +from datahub.emitter.mce_builder import make_dataset_urn + +from tests.utilities.concurrent_openapi import run_tests +from tests.utils import delete_urns, wait_for_writes_to_sync + +logger = logging.getLogger(__name__) + + +generated_urns = [make_dataset_urn("test", f"database_test_{i}") for i in range(0, 100)] + + +@pytest.fixture(scope="module") +def ingest_cleanup_data(graph_client, request): + print("removing test data before") + delete_urns(graph_client, generated_urns) + wait_for_writes_to_sync() + yield + print("removing test data after") + delete_urns(graph_client, generated_urns) + wait_for_writes_to_sync() + + +def test_mysql_deadlock_gap_locking(auth_session, ingest_cleanup_data): + # This generates concurrent batches with interleaved urn ids + run_tests( + auth_session, + fixture_globs=["tests/database/v3/mysql_gap_deadlock/*.json"], + num_workers=8, + ) diff --git a/smoke-test/tests/database/v3/__init__.py b/smoke-test/tests/database/v3/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/smoke-test/tests/database/v3/mysql_gap_deadlock/__init__.py b/smoke-test/tests/database/v3/mysql_gap_deadlock/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/smoke-test/tests/database/v3/mysql_gap_deadlock/batchA1.json b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchA1.json new file mode 100644 index 00000000000000..ef601dacc211c3 --- /dev/null +++ b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchA1.json @@ -0,0 +1,115 @@ +[{ + "request": { + "url": "/openapi/v3/entity/dataset", + "params": { + "async": "false" + }, + "description": "Create dataset batch, single transaction", + "json": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_0,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_4,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_8,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_12,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_16,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_20,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_24,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_28,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_32,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_36,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:test,database_test_40,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_44,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_48,PROD)", + "status": { + "value": { + "removed": false + } + } + } + ] + } +}] \ No newline at end of file diff --git a/smoke-test/tests/database/v3/mysql_gap_deadlock/batchA2.json b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchA2.json new file mode 100644 index 00000000000000..3f56f730e30f53 --- /dev/null +++ b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchA2.json @@ -0,0 +1,107 @@ +[{ + "request": { + "url": "/openapi/v3/entity/dataset", + "params": { + "async": "false" + }, + "description": "Create dataset batch, single transaction", + "json": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_52,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_56,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_60,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_64,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_68,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_72,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_76,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_80,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_84,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_88,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_92,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_96,PROD)", + "status": { + "value": { + "removed": false + } + } + } + ] + } +}] \ No newline at end of file diff --git a/smoke-test/tests/database/v3/mysql_gap_deadlock/batchB1.json b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchB1.json new file mode 100644 index 00000000000000..de807321e815bb --- /dev/null +++ b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchB1.json @@ -0,0 +1,115 @@ +[{ + "request": { + "url": "/openapi/v3/entity/dataset", + "params": { + "async": "false" + }, + "description": "Create dataset batch, single transaction", + "json": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_1,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_5,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_9,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:test,database_test_13,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_17,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_21,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_25,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_29,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_33,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_37,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_41,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_45,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_49,PROD)", + "status": { + "value": { + "removed": false + } + } + } + ] + } +}] \ No newline at end of file diff --git a/smoke-test/tests/database/v3/mysql_gap_deadlock/batchB2.json b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchB2.json new file mode 100644 index 00000000000000..7f35e6ac24e5e2 --- /dev/null +++ b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchB2.json @@ -0,0 +1,107 @@ +[{ + "request": { + "url": "/openapi/v3/entity/dataset", + "params": { + "async": "false" + }, + "description": "Create dataset batch, single transaction", + "json": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_53,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_57,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_61,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_65,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_69,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_73,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_77,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_81,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_85,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_89,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_93,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_97,PROD)", + "status": { + "value": { + "removed": false + } + } + } + ] + } 
+}] \ No newline at end of file diff --git a/smoke-test/tests/database/v3/mysql_gap_deadlock/batchC1.json b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchC1.json new file mode 100644 index 00000000000000..986c119dae29d9 --- /dev/null +++ b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchC1.json @@ -0,0 +1,115 @@ +[{ + "request": { + "url": "/openapi/v3/entity/dataset", + "params": { + "async": "false" + }, + "description": "Create dataset batch, single transaction", + "json": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_2,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_6,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_10,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_14,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_18,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_22,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_26,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_30,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_34,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_38,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_42,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_46,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_50,PROD)", + "status": { + "value": { + "removed": false + } + } + } + ] + } +}] \ No newline at end of file diff --git a/smoke-test/tests/database/v3/mysql_gap_deadlock/batchC2.json b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchC2.json new file mode 100644 index 00000000000000..861e7cb2d0ffce --- /dev/null +++ b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchC2.json @@ -0,0 +1,107 @@ +[{ + "request": { + "url": "/openapi/v3/entity/dataset", + "params": { + "async": "false" + }, + "description": "Create dataset batch, single transaction", + "json": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_54,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_58,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_62,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_66,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_70,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:test,database_test_74,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_78,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_82,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_86,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_90,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_94,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_98,PROD)", + "status": { + "value": { + "removed": false + } + } + } + ] + } +}] \ No newline at end of file diff --git a/smoke-test/tests/database/v3/mysql_gap_deadlock/batchD1.json b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchD1.json new file mode 100644 index 00000000000000..e4721aaeec1c32 --- /dev/null +++ b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchD1.json @@ -0,0 +1,115 @@ +[{ + "request": { + "url": "/openapi/v3/entity/dataset", + "params": { + "async": "false" + }, + "description": "Create dataset batch, single transaction", + "json": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_3,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_7,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_11,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_15,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_19,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_23,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_27,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_31,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_35,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_39,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_43,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_47,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_51,PROD)", + "status": { + "value": { + "removed": false + } + } + } + ] + } +}] \ No newline at end of file diff --git a/smoke-test/tests/database/v3/mysql_gap_deadlock/batchD2.json b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchD2.json new file mode 100644 index 00000000000000..5bb9bb3772c350 --- /dev/null +++ 
b/smoke-test/tests/database/v3/mysql_gap_deadlock/batchD2.json @@ -0,0 +1,107 @@ +[{ + "request": { + "url": "/openapi/v3/entity/dataset", + "params": { + "async": "false" + }, + "description": "Create dataset batch, single transaction", + "json": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_55,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_59,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_63,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_67,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_71,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_75,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_79,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_83,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_87,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_91,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_95,PROD)", + "status": { + "value": { + "removed": false + } + } + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:test,database_test_99,PROD)", + "status": { + "value": { + "removed": false + } + } + } + ] + } +}] \ No newline at end of file diff --git a/smoke-test/tests/openapi/test_openapi.py b/smoke-test/tests/openapi/test_openapi.py index dbb28fb9a2e319..9b753f2a06c46e 100644 --- a/smoke-test/tests/openapi/test_openapi.py +++ b/smoke-test/tests/openapi/test_openapi.py @@ -1,95 +1,10 @@ -import concurrent.futures -import glob -import json import logging -import time -from deepdiff import DeepDiff +from tests.utilities.concurrent_openapi import run_tests logger = logging.getLogger(__name__) -def load_tests(fixture_glob="tests/openapi/**/*.json"): - for test_fixture in glob.glob(fixture_glob): - with open(test_fixture) as f: - yield (test_fixture, json.load(f)) - - -def execute_request(auth_session, request): - if "method" in request: - method = request.pop("method") - else: - method = "post" - - url = auth_session.gms_url() + request.pop("url") - - return getattr(auth_session, method)(url, **request) - - -def evaluate_test(auth_session, test_name, test_data): - try: - for idx, req_resp in enumerate(test_data): - if "description" in req_resp["request"]: - description = req_resp["request"].pop("description") - else: - description = None - if "wait" in req_resp["request"]: - time.sleep(int(req_resp["request"]["wait"])) - continue - url = req_resp["request"]["url"] - actual_resp = execute_request(auth_session, req_resp["request"]) - try: - if "response" in req_resp and "status_codes" in req_resp["response"]: - assert ( - actual_resp.status_code in req_resp["response"]["status_codes"] - ) - else: - assert actual_resp.status_code in [200, 202, 204] - if "response" in req_resp: - if "json" in 
req_resp["response"]: - if "exclude_regex_paths" in req_resp["response"]: - exclude_regex_paths = req_resp["response"][ - "exclude_regex_paths" - ] - else: - exclude_regex_paths = [] - diff = DeepDiff( - actual_resp.json(), - req_resp["response"]["json"], - exclude_regex_paths=exclude_regex_paths, - ignore_order=True, - ) - assert not diff - else: - logger.warning("No expected response json found") - except Exception as e: - logger.error( - f"Error executing step: {idx}, url: {url}, test: {test_name}" - ) - if description: - logger.error(f"Step {idx} Description: {description}") - logger.error(f"Response content: {actual_resp.content}") - raise e - except Exception as e: - logger.error(f"Error executing test: {test_name}") - raise e - - -def run_tests(auth_session, fixture_globs, num_workers=3): - with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [] - for fixture_glob in fixture_globs: - for test_fixture, test_data in load_tests(fixture_glob=fixture_glob): - futures.append( - executor.submit( - evaluate_test, auth_session, test_fixture, test_data - ) - ) - - for future in concurrent.futures.as_completed(futures): - logger.info(future.result()) - - def test_openapi_all(auth_session): run_tests(auth_session, fixture_globs=["tests/openapi/*/*.json"], num_workers=10) diff --git a/smoke-test/tests/utilities/concurrent_openapi.py b/smoke-test/tests/utilities/concurrent_openapi.py new file mode 100644 index 00000000000000..076cd12c7a3b3a --- /dev/null +++ b/smoke-test/tests/utilities/concurrent_openapi.py @@ -0,0 +1,116 @@ +import concurrent.futures +import glob +import json +import logging +import time + +from deepdiff import DeepDiff + +logger = logging.getLogger(__name__) + + +def load_tests(fixture_glob): + """ + Scans a directory structure looking for json files which define expected tests/responses + :param fixture_glob: Glob path such as "tests/openapi/**/*.json" + :return: tuples of the filename and dictionary of the file content + """ + for test_fixture in glob.glob(fixture_glob): + with open(test_fixture) as f: + yield (test_fixture, json.load(f)) + + +def execute_request(auth_session, request): + """ + Based on the request dictionary execute the request against gms + :param auth_session: authentication + :param request: request dictionary + :return: output of the request + """ + if "method" in request: + method = request.pop("method") + else: + method = "post" + + url = auth_session.gms_url() + request.pop("url") + + return getattr(auth_session, method)(url, **request) + + +def evaluate_test(auth_session, test_name, test_data): + """ + For each test step, execute the request and assert the expected response + :param auth_session: authentication + :param test_name: name of the test + :param test_data: test steps as defined in the test file + :return: none + """ + try: + assert isinstance(test_data, list), "Expected test_data is a list of test steps" + for idx, req_resp in enumerate(test_data): + if "description" in req_resp["request"]: + description = req_resp["request"].pop("description") + else: + description = None + if "wait" in req_resp["request"]: + time.sleep(req_resp["request"]["wait"]) + continue + url = req_resp["request"]["url"] + actual_resp = execute_request(auth_session, req_resp["request"]) + try: + if "response" in req_resp and "status_codes" in req_resp["response"]: + assert ( + actual_resp.status_code in req_resp["response"]["status_codes"] + ) + else: + assert actual_resp.status_code in [200, 202, 204] + if "response" in 
req_resp: + if "json" in req_resp["response"]: + if "exclude_regex_paths" in req_resp["response"]: + exclude_regex_paths = req_resp["response"][ + "exclude_regex_paths" + ] + else: + exclude_regex_paths = [] + diff = DeepDiff( + actual_resp.json(), + req_resp["response"]["json"], + exclude_regex_paths=exclude_regex_paths, + ignore_order=True, + ) + assert not diff + else: + logger.warning("No expected response json found") + except Exception as e: + logger.error( + f"Error executing step: {idx}, url: {url}, test: {test_name}" + ) + if description: + logger.error(f"Step {idx} Description: {description}") + logger.error(f"Response content: {actual_resp.content}") + raise e + except Exception as e: + logger.error(f"Error executing test: {test_name}") + raise e + + +def run_tests(auth_session, fixture_globs, num_workers=3): + """ + Given a collection of test files, run them in parallel using N workers + :param auth_session: authentication + :param fixture_globs: test files + :param num_workers: concurrency + :return: none + """ + with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + for fixture_glob in fixture_globs: + for test_fixture, test_data in load_tests(fixture_glob=fixture_glob): + futures.append( + executor.submit( + evaluate_test, auth_session, test_fixture, test_data + ) + ) + + for future in concurrent.futures.as_completed(futures): + logger.info(future.result())
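
For reference, the `batchA1.json` through `batchD2.json` fixtures above all follow one pattern: four lanes of dataset URNs interleaved modulo 4 (lane A takes ids 0, 4, 8, ..., lane B takes 1, 5, 9, ...), with each lane split into two files, giving eight single-transaction batches whose target primary keys interleave. Run concurrently by `run_tests(..., num_workers=8)`, the batches insert into overlapping key gaps, the situation that produces InnoDB gap-lock deadlocks under REPEATABLE READ and that the `READ_COMMITED` + `FOR UPDATE` change above is meant to avoid. The checked-in fixtures are static JSON; the helper below is an illustrative sketch of how files with the same shape could be generated.

```python
import json


def make_batch(lane: int, start: int, count: int, stride: int = 4):
    """Build one request fixture in the same shape as the batchA1..batchD2 files.

    Illustrative only -- the smoke test ships static JSON; this just encodes the
    interleaving pattern those files follow.
    """
    datasets = [
        {
            "urn": f"urn:li:dataset:(urn:li:dataPlatform:test,database_test_{i},PROD)",
            "status": {"value": {"removed": False}},
        }
        for i in range(lane + start * stride, lane + (start + count) * stride, stride)
    ]
    return [
        {
            "request": {
                "url": "/openapi/v3/entity/dataset",
                "params": {"async": "false"},
                "description": "Create dataset batch, single transaction",
                "json": datasets,
            }
        }
    ]


# lane 0, ids 0, 4, ..., 48 -- the same sequence as batchA1.json
with open("batchA1.json", "w") as f:
    json.dump(make_batch(lane=0, start=0, count=13), f, indent=2)
```

With the default `stride=4`, `make_batch(lane=0, start=0, count=13)` reproduces the id sequence in `batchA1.json` (0, 4, ..., 48) and `make_batch(lane=0, start=13, count=12)` matches `batchA2.json` (52, ..., 96); the other lanes differ only in the `lane` offset.
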