
Commit

Merge branch 'datahub-project:master' into master
anshbansal authored Feb 17, 2025
2 parents 778fc39 + 26e7743 commit a6951bd
Showing 100 changed files with 48,898 additions and 2,898 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -154,6 +154,7 @@ Here are the companies that have officially adopted DataHub. Please feel free to
- [Udemy](https://www.udemy.com/)
- [Uphold](https://uphold.com)
- [Viasat](https://viasat.com)
+- [Wealthsimple](https://www.wealthsimple.com)
- [Wikimedia](https://www.wikimedia.org)
- [Wolt](https://wolt.com)
- [Zynga](https://www.zynga.com)
4 changes: 2 additions & 2 deletions build.gradle
@@ -186,7 +186,7 @@ project.ext.externalDependency = [
'jsonPatch': 'jakarta.json:jakarta.json-api:2.1.3',
'jsonPathImpl': 'org.eclipse.parsson:parsson:1.1.6',
'jsonSimple': 'com.googlecode.json-simple:json-simple:1.1.1',
-'jsonSmart': 'net.minidev:json-smart:2.4.9',
+'jsonSmart': 'net.minidev:json-smart:2.5.2',
'json': 'org.json:json:20231013',
'jsonSchemaValidator': 'com.github.java-json-tools:json-schema-validator:2.2.14',
'junit': 'junit:junit:4.13.2',
@@ -419,7 +419,7 @@ subprojects {
implementation externalDependency.annotationApi
constraints {
implementation("com.google.googlejavaformat:google-java-format:$googleJavaFormatVersion")
-implementation('io.netty:netty-all:4.1.116.Final')
+implementation('io.netty:netty-all:4.1.118.Final')
implementation('org.apache.commons:commons-compress:1.27.1')
implementation('org.apache.velocity:velocity-engine-core:2.4')
implementation('org.hibernate:hibernate-validator:6.0.20.Final')
2 changes: 1 addition & 1 deletion datahub-frontend/play.gradle
@@ -32,7 +32,7 @@ dependencies {
play(externalDependency.jacksonDataBind)
play("com.typesafe.akka:akka-actor_$playScalaVersion:2.6.20")
play(externalDependency.jsonSmart)
-play('io.netty:netty-all:4.1.114.Final')
+play('io.netty:netty-all:4.1.118.Final')

implementation(externalDependency.commonsText) {
because("previous versions are vulnerable to CVE-2022-42889")
2 changes: 1 addition & 1 deletion datahub-web-react/build.gradle
@@ -93,7 +93,7 @@ task yarnLintFix(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) {
}

task yarnBuild(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) {
-args = ['run', 'build']
+args = ['run', project.hasProperty('sourcemap') ? 'buildWithSourceMap' :'build']

outputs.cacheIf { true }
inputs.files(
1 change: 1 addition & 0 deletions datahub-web-react/package.json
@@ -103,6 +103,7 @@
"start": "yarn run generate && vite",
"ec2-dev": "yarn run generate && CI=true vite",
"build": "yarn run generate && CI=false NODE_OPTIONS='--max-old-space-size=5120 --openssl-legacy-provider' vite build",
"buildWithSourceMap": "yarn run generate && CI=false NODE_OPTIONS='--max-old-space-size=8192 --openssl-legacy-provider' vite build --sourcemap",
"test": "NODE_OPTIONS='--max-old-space-size=5120 --openssl-legacy-provider' vitest",
"test-coverage": "yarn test run --coverage",
"generate": "NODE_OPTIONS='--max-old-space-size=5120 --openssl-legacy-provider' graphql-codegen --config codegen.yml",
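Taken together, the `yarnBuild` change above and this new script let a developer opt into a sourcemapped bundle from Gradle, presumably via `./gradlew :datahub-web-react:yarnBuild -Psourcemap` (the property name is inferred from the `project.hasProperty('sourcemap')` check; the larger 8192 MB heap presumably covers the extra memory sourcemap generation needs).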
2 changes: 1 addition & 1 deletion docker/kafka-setup/Dockerfile
@@ -1,4 +1,4 @@
-ARG KAFKA_DOCKER_VERSION=7.7.1
+ARG KAFKA_DOCKER_VERSION=7.8.1

# Defining custom repo urls for use in enterprise environments. Re-used between stages below.
ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine
62 changes: 50 additions & 12 deletions docs/developers.md
@@ -77,38 +77,76 @@ We suggest partially compiling DataHub according to your needs:
./gradlew :docs-website:serve
```

-## Deploying Local Versions
+## Deploying Local Versions
+This guide explains how to set up and deploy DataHub locally for development purposes.

-Run just once to have the local `datahub` cli tool installed in your $PATH
+### Initial Setup
+Before you begin, you'll need to install the local `datahub` CLI tool:

```shell
-cd smoke-test/
+cd metadata-ingestion/
python3 -m venv venv
source venv/bin/activate
pip install --upgrade pip wheel setuptools
pip install -r requirements.txt
cd ../
```

-Once you have compiled & packaged the project or appropriate module you can deploy the entire system via docker-compose by running:
+### Deploying the Full Stack

+Deploy the entire system using docker-compose:
```shell
-./gradlew quickstart
+./gradlew quickstartDebug
```

-Replace whatever container you want in the existing deployment.
-I.e, replacing datahub's backend (GMS):
+Access the DataHub UI at `http://localhost:9002`

+### Refreshing the Frontend

+To run and update the frontend with local changes, open a new terminal and run:
```shell
-(cd docker && COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker compose -p datahub -f docker-compose-without-neo4j.yml -f docker-compose-without-neo4j.override.yml -f docker-compose.dev.yml up -d --no-deps --force-recreate --build datahub-gms)
+cd datahub-web-react
+yarn install && yarn start
```
+The frontend will be available at `http://localhost:3000` and will automatically update as you make changes to the code.

-Running the local version of the frontend
+### Refreshing GMS

+To refresh the GMS (Generalized Metadata Service) with local changes:
```shell
-(cd docker && COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker compose -p datahub -f docker-compose-without-neo4j.yml -f docker-compose-without-neo4j.override.yml -f docker-compose.dev.yml up -d --no-deps --force-recreate --build datahub-frontend-react)
+./gradlew :metadata-service:war:build -x test --parallel && docker restart datahub-datahub-gms-debug-1
```

+### Refreshing the CLI

+If you haven't set up the CLI for local development yet, run:

+```commandline
+./gradlew :metadata-ingestion:installDev
+cd metadata-ingestion
+source venv/bin/activate
+```

+Once you're in `venv`, your local changes will be reflected automatically.
+For example, you can run `datahub ingest -c <file>` to test local changes in ingestion connectors.

+To verify that you're using the local version, run:

+```commandline
+datahub --version
+```

+Expected Output:
+```commandline
+acryl-datahub, version unavailable (installed in develop mode)
+```

+### Refreshing Other Components

+To refresh other components with local changes, just run:
+```commandline
+./gradlew quickstartDebug
+```


## IDE Support

The recommended IDE for DataHub development is [IntelliJ IDEA](https://www.jetbrains.com/idea/).
22 changes: 22 additions & 0 deletions docs/modeling/extending-the-metadata-model.md
@@ -207,6 +207,7 @@ The Aspect has four key components: its properties, the @Aspect annotation, the
the case of DashboardInfo, the `charts` field is an Array of Urns. The @Relationship annotation cannot be applied
directly to an array of Urns. That’s why you see the use of an Annotation override (`"/*":`) to apply the @Relationship
annotation to the Urn directly. Read more about overrides in the annotation docs further down on this page.
+- **@UrnValidation**: This annotation can enforce constraints on Urn fields, including entity type restrictions and existence.

After you create your Aspect, you need to attach it to all the entities that it applies to.

@@ -496,6 +497,27 @@ This annotation says that when we ingest an Entity with an Ownership Aspect, Dat
between that entity and the CorpUser or CorpGroup who owns it. This will be queryable using the Relationships resource
in both the forward and inverse directions.

+#### @UrnValidation

+This annotation can be applied to Urn fields inside an aspect. The annotation can optionally perform one or more of the following:
+- Enforce that the URN exists
+- Enforce stricter URN validation
+- Restrict the URN to specific entity types

+##### Example

+Using this example from StructuredPropertyDefinition, we are enforcing that the valueType URN must exist,
+it must follow stricter Urn encoding logic, and it can only be of entity type `dataType`.

+```
+@UrnValidation = {
+"exist": true,
+"strict": true,
+"entityTypes": [ "dataType" ],
+}
+valueType: Urn
+```

#### Annotating Collections & Annotation Overrides

You will not always be able to apply annotations to a primitive field directly. This may be because the field is wrapped
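Conceptually, the `entityTypes` restriction boils down to a check on `Urn.getEntityType()`. Below is a minimal, hypothetical Java sketch of that check; the validator wiring itself is not shown in this commit, and only the long-standing `com.linkedin.common.urn.Urn` API is assumed:

```java
import com.linkedin.common.urn.Urn;
import java.net.URISyntaxException;
import java.util.Set;

public class UrnEntityTypeCheckSketch {
  public static void main(String[] args) throws URISyntaxException {
    // Mirrors the annotation above: valueType may only reference a dataType entity.
    Set<String> allowedEntityTypes = Set.of("dataType");

    Urn valueType = Urn.createFromString("urn:li:dataType:datahub.string");

    // Urn.getEntityType() returns the segment after "urn:li:", here "dataType".
    if (!allowedEntityTypes.contains(valueType.getEntityType())) {
      throw new IllegalArgumentException(
          "valueType must reference one of " + allowedEntityTypes + ", got " + valueType);
    }
    System.out.println("valueType passes the entityTypes restriction: " + valueType);
  }
}
```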
AspectSpec.java
@@ -24,6 +24,7 @@ public class AspectSpec {
private final Map<String, TimeseriesFieldSpec> _timeseriesFieldSpecs;
private final Map<String, TimeseriesFieldCollectionSpec> _timeseriesFieldCollectionSpecs;
private final Map<String, SearchableRefFieldSpec> _searchableRefFieldSpecs;
+private final Map<String, UrnValidationFieldSpec> _urnValidationFieldSpecs;

// Classpath & Pegasus-specific: Temporary.
private final RecordDataSchema _schema;
@@ -39,6 +40,7 @@ public AspectSpec(
@Nonnull final List<TimeseriesFieldSpec> timeseriesFieldSpecs,
@Nonnull final List<TimeseriesFieldCollectionSpec> timeseriesFieldCollectionSpecs,
@Nonnull final List<SearchableRefFieldSpec> searchableRefFieldSpecs,
+@Nonnull final List<UrnValidationFieldSpec> urnValidationFieldSpecs,
final RecordDataSchema schema,
final Class<RecordTemplate> aspectClass) {
_aspectAnnotation = aspectAnnotation;
@@ -76,6 +78,11 @@ public AspectSpec(
spec -> spec.getTimeseriesFieldCollectionAnnotation().getCollectionName(),
spec -> spec,
(val1, val2) -> val1));
+_urnValidationFieldSpecs =
+urnValidationFieldSpecs.stream()
+.collect(
+Collectors.toMap(
+spec -> spec.getPath().toString(), spec -> spec, (val1, val2) -> val1));
_schema = schema;
_aspectClass = aspectClass;
}
@@ -112,6 +119,10 @@ public Map<String, TimeseriesFieldSpec> getTimeseriesFieldSpecMap() {
return _timeseriesFieldSpecs;
}

+public Map<String, UrnValidationFieldSpec> getUrnValidationFieldSpecMap() {
+return _urnValidationFieldSpecs;
+}

public Map<String, TimeseriesFieldCollectionSpec> getTimeseriesFieldCollectionSpecMap() {
return _timeseriesFieldCollectionSpecs;
}
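Once an `AspectSpec` is built, consumers can look up the `@UrnValidation`-annotated URN fields by path through the new map. A hedged sketch of such a consumer follows; it assumes only the accessors visible in this diff plus the getters Lombok generates for the `@Value` spec class shown further below:

```java
import com.linkedin.metadata.models.AspectSpec;
import com.linkedin.metadata.models.UrnValidationFieldSpec;
import java.util.Map;

public final class UrnValidationSpecDump {
  /** Prints every @UrnValidation-annotated field of an aspect, keyed by its PathSpec string. */
  public static void dump(AspectSpec aspectSpec) {
    for (Map.Entry<String, UrnValidationFieldSpec> entry :
        aspectSpec.getUrnValidationFieldSpecMap().entrySet()) {
      // The key is spec.getPath().toString(), e.g. "/valueType" on
      // StructuredPropertyDefinition; the value carries the parsed annotation.
      System.out.printf(
          "field %s -> %s%n", entry.getKey(), entry.getValue().getUrnValidationAnnotation());
    }
  }
}
```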
EntitySpecBuilder.java
@@ -20,6 +20,7 @@
import com.linkedin.metadata.models.annotation.SearchableRefAnnotation;
import com.linkedin.metadata.models.annotation.TimeseriesFieldAnnotation;
import com.linkedin.metadata.models.annotation.TimeseriesFieldCollectionAnnotation;
+import com.linkedin.metadata.models.annotation.UrnValidationAnnotation;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
@@ -48,6 +49,8 @@ public class EntitySpecBuilder {
new PegasusSchemaAnnotationHandlerImpl(TimeseriesFieldAnnotation.ANNOTATION_NAME);
public static SchemaAnnotationHandler _timeseriesFieldCollectionHandler =
new PegasusSchemaAnnotationHandlerImpl(TimeseriesFieldCollectionAnnotation.ANNOTATION_NAME);
+public static SchemaAnnotationHandler _urnValidationAnnotationHandler =
+new PegasusSchemaAnnotationHandlerImpl(UrnValidationAnnotation.ANNOTATION_NAME);

private final AnnotationExtractionMode _extractionMode;
private final Set<String> _entityNames = new HashSet<>();
@@ -226,6 +229,7 @@ public AspectSpec buildAspectSpec(
Collections.emptyList(),
Collections.emptyList(),
Collections.emptyList(),
+Collections.emptyList(),
aspectRecordSchema,
aspectClass);
}
@@ -299,6 +303,18 @@ public AspectSpec buildAspectSpec(
new DataSchemaRichContextTraverser(timeseriesFieldSpecExtractor);
timeseriesFieldSpecTraverser.traverse(processedTimeseriesFieldResult.getResultSchema());

+// Extract UrnValidation aspects
+final SchemaAnnotationProcessor.SchemaAnnotationProcessResult processedTimestampResult =
+SchemaAnnotationProcessor.process(
+Collections.singletonList(_urnValidationAnnotationHandler),
+aspectRecordSchema,
+new SchemaAnnotationProcessor.AnnotationProcessOption());
+final UrnValidationFieldSpecExtractor urnValidationFieldSpecExtractor =
+new UrnValidationFieldSpecExtractor();
+final DataSchemaRichContextTraverser timestampFieldSpecTraverser =
+new DataSchemaRichContextTraverser(urnValidationFieldSpecExtractor);
+timestampFieldSpecTraverser.traverse(processedTimestampResult.getResultSchema());

return new AspectSpec(
aspectAnnotation,
searchableFieldSpecExtractor.getSpecs(),
@@ -307,6 +323,7 @@
timeseriesFieldSpecExtractor.getTimeseriesFieldSpecs(),
timeseriesFieldSpecExtractor.getTimeseriesFieldCollectionSpecs(),
searchableRefFieldSpecExtractor.getSpecs(),
+urnValidationFieldSpecExtractor.getUrnValidationFieldSpecs(),
aspectRecordSchema,
aspectClass);
}
@@ -176,7 +176,9 @@ private void extractSearchableAnnotation(
annotation.getNumValuesFieldName(),
annotation.getWeightsPerFieldValue(),
annotation.getFieldNameAliases(),
-annotation.isIncludeQueryEmptyAggregation());
+annotation.isIncludeQueryEmptyAggregation(),
+annotation.isIncludeSystemModifiedAt(),
+annotation.getSystemModifiedAtFieldName());
}
}
log.debug("Searchable annotation for field: {} : {}", schemaPathSpec, annotation);
UrnValidationFieldSpec.java (new file)
@@ -0,0 +1,14 @@
package com.linkedin.metadata.models;

import com.linkedin.data.schema.DataSchema;
import com.linkedin.data.schema.PathSpec;
import com.linkedin.metadata.models.annotation.UrnValidationAnnotation;
import javax.annotation.Nonnull;
import lombok.Value;

@Value
public class UrnValidationFieldSpec {
@Nonnull PathSpec path;
@Nonnull UrnValidationAnnotation urnValidationAnnotation;
@Nonnull DataSchema pegasusSchema;
}
UrnValidationFieldSpecExtractor.java (new file)
@@ -0,0 +1,57 @@
package com.linkedin.metadata.models;

import com.linkedin.data.schema.DataSchema;
import com.linkedin.data.schema.DataSchemaTraverse;
import com.linkedin.data.schema.PathSpec;
import com.linkedin.data.schema.annotation.SchemaVisitor;
import com.linkedin.data.schema.annotation.SchemaVisitorTraversalResult;
import com.linkedin.data.schema.annotation.TraverserContext;
import com.linkedin.metadata.models.annotation.UrnValidationAnnotation;
import java.util.ArrayList;
import java.util.List;
import lombok.Getter;

@Getter
public class UrnValidationFieldSpecExtractor implements SchemaVisitor {
private final List<UrnValidationFieldSpec> urnValidationFieldSpecs = new ArrayList<>();

@Override
public void callbackOnContext(TraverserContext context, DataSchemaTraverse.Order order) {
if (context.getEnclosingField() == null) {
return;
}

if (DataSchemaTraverse.Order.PRE_ORDER.equals(order)) {
final DataSchema currentSchema = context.getCurrentSchema().getDereferencedDataSchema();
final PathSpec path = new PathSpec(context.getSchemaPathSpec());

// Check for @UrnValidation annotation in primary properties
final Object urnValidationAnnotationObj =
context.getEnclosingField().getProperties().get(UrnValidationAnnotation.ANNOTATION_NAME);

// Check if it's either explicitly annotated with @UrnValidation
if (urnValidationAnnotationObj != null) {
addUrnValidationFieldSpec(currentSchema, path, urnValidationAnnotationObj);
}
}
}

private void addUrnValidationFieldSpec(
DataSchema currentSchema, PathSpec path, Object annotationObj) {
UrnValidationAnnotation annotation =
UrnValidationAnnotation.fromPegasusAnnotationObject(
annotationObj, FieldSpecUtils.getSchemaFieldName(path), path.toString());

urnValidationFieldSpecs.add(new UrnValidationFieldSpec(path, annotation, currentSchema));
}

@Override
public VisitorContext getInitialVisitorContext() {
return null;
}

@Override
public SchemaVisitorTraversalResult getSchemaVisitorTraversalResult() {
return new SchemaVisitorTraversalResult();
}
}
@@ -1,5 +1,8 @@
package com.linkedin.metadata.models.annotation;

+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
import java.util.Map;
import java.util.Optional;
import lombok.experimental.UtilityClass;
@@ -13,4 +16,23 @@ <T> Optional<T> getField(final Map fieldMap, final String fieldName, final Class
}
return Optional.empty();
}

+<T> List<T> getFieldList(
+final Map<String, ?> fieldMap, final String fieldName, final Class<T> itemType) {
+Object value = fieldMap.get(fieldName);
+if (!(value instanceof List<?>)) {
+return Collections.emptyList();
+}

+List<?> list = (List<?>) value;
+List<T> result = new ArrayList<>();

+for (Object item : list) {
+if (itemType.isInstance(item)) {
+result.add(itemType.cast(item));
+}
+}

+return Collections.unmodifiableList(result);
+}
}
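A quick usage sketch for the new `getFieldList` helper. The enclosing `@UtilityClass` is not named in this capture, so `AnnotationUtils` is assumed below; since the method is package-private, the caller must live in `com.linkedin.metadata.models.annotation`. The map mirrors the `@UrnValidation` annotation shape documented earlier:

```java
package com.linkedin.metadata.models.annotation;

import java.util.List;
import java.util.Map;

public class GetFieldListSketch {
  public static void main(String[] args) {
    // Raw Pegasus annotation properties arrive as an untyped map; getFieldList
    // narrows a list-valued entry to the requested item type.
    Map<String, Object> props =
        Map.of("exist", true, "strict", true, "entityTypes", List.of("dataType"));

    // Assumed enclosing class name (not visible in this capture): AnnotationUtils.
    List<String> entityTypes = AnnotationUtils.getFieldList(props, "entityTypes", String.class);

    // Prints [dataType]; a missing key or non-List value yields an empty list,
    // and items of the wrong element type are skipped rather than throwing.
    System.out.println(entityTypes);
  }
}
```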
