diff --git a/.github/workflows/build_and_deploy.yaml b/.github/workflows/build_and_deploy.yaml
index 601c8f5e..41801ec8 100644
--- a/.github/workflows/build_and_deploy.yaml
+++ b/.github/workflows/build_and_deploy.yaml
@@ -27,17 +27,16 @@ jobs:
target: "transformer-image"
- image: "druid-router"
target: "router-image"
- - image: "merged-pipeline"
- target: "merged-image"
+ - image: "unified-pipeline"
+ target: "unified-image"
- image: "master-data-processor"
target: "master-data-processor-image"
- image: "lakehouse-connector"
target: "lakehouse-connector-image"
+ - image: "cache-indexer"
+ target: "cache-indexer-image"
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
-
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@@ -97,7 +96,7 @@ jobs:
run: |
cd deploy/terraform/aws
terragrunt init
- terragrunt apply -auto-approve -var merged_pipeline_enabled={{ vars.MERGED_PIPELINE || 'true' }} --replace='module.flink.helm_release.flink' \
+ terragrunt apply -auto-approve -var unified_pipeline_enabled=${{ vars.MERGED_PIPELINE || 'true' }} --replace='module.flink.helm_release.flink' \
-var flink_image_tag=${{ github.ref_name }}
azure-deploy:
diff --git a/.github/workflows/upload_artifact.yaml b/.github/workflows/upload_artifact.yaml
index 38cb7ec8..07943fd1 100644
--- a/.github/workflows/upload_artifact.yaml
+++ b/.github/workflows/upload_artifact.yaml
@@ -56,7 +56,7 @@ jobs:
- image: "denormalizer"
- image: "transformer"
- image: "druid-router"
- - image: "pipeline-merged"
+ - image: "unified-pipeline"
- image: "master-data-processor"
steps:
- name: Get Tag Name
diff --git a/Dockerfile b/Dockerfile
index fd4002be..1d5ea6c6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,6 +2,7 @@ FROM --platform=linux/x86_64 maven:3.9.4-eclipse-temurin-11-focal AS build-core
COPY . /app
RUN mvn clean install -DskipTests -f /app/framework/pom.xml
RUN mvn clean install -DskipTests -f /app/dataset-registry/pom.xml
+RUN mvn clean install -DskipTests -f /app/transformation-sdk/pom.xml
FROM --platform=linux/x86_64 maven:3.9.4-eclipse-temurin-11-focal AS build-pipeline
COPY --from=build-core /root/.m2 /root/.m2
@@ -28,9 +29,9 @@ FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as rout
USER flink
COPY --from=build-pipeline /app/pipeline/druid-router/target/druid-router-1.0.0.jar $FLINK_HOME/lib/
-FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as merged-image
+FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as unified-image
USER flink
-COPY --from=build-pipeline /app/pipeline/pipeline-merged/target/pipeline-merged-1.0.0.jar $FLINK_HOME/lib/
+COPY --from=build-pipeline /app/pipeline/unified-pipeline/target/unified-pipeline-1.0.0.jar $FLINK_HOME/lib/
FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as master-data-processor-image
USER flink
@@ -40,3 +41,7 @@ FROM --platform=linux/x86_64 sanketikahub/flink:1.15.0-scala_2.12-lakehouse as l
USER flink
RUN mkdir $FLINK_HOME/custom-lib
COPY ./pipeline/hudi-connector/target/hudi-connector-1.0.0.jar $FLINK_HOME/custom-lib
+
+FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as cache-indexer-image
+USER flink
+COPY --from=build-pipeline /app/pipeline/cache-indexer/target/cache-indexer-1.0.0.jar $FLINK_HOME/lib/
diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala
index 22729aa0..781b916a 100644
--- a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala
+++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala
@@ -69,7 +69,8 @@ object MasterDataProcessorIndexer {
logger.info(s"createDataFile() | START | dataset=${dataset.id} ")
import spark.implicits._
val readWriteConf = ReadWriteConfig(scanCount = config.getInt("redis.scan.count"), maxPipelineSize = config.getInt("redis.max.pipeline.size"))
- val redisConfig = new RedisConfig(initialHost = RedisEndpoint(host = dataset.datasetConfig.redisDBHost.get, port = dataset.datasetConfig.redisDBPort.get, dbNum = dataset.datasetConfig.redisDB.get))
+ val cacheConfig = dataset.datasetConfig.cacheConfig.get
+ val redisConfig = new RedisConfig(initialHost = RedisEndpoint(host = cacheConfig.redisDBHost.get, port = cacheConfig.redisDBPort.get, dbNum = cacheConfig.redisDB.get))
val ts: Long = new DateTime(DateTimeZone.UTC).withTimeAtStartOfDay().getMillis
val rdd = spark.sparkContext.fromRedisKV("*")(redisConfig = redisConfig, readWriteConfig = readWriteConf).map(
f => CommonUtil.processEvent(f._2, ts)
@@ -83,9 +84,9 @@ object MasterDataProcessorIndexer {
}
private def getDatasets(): List[Dataset] = {
- val datasets: List[Dataset] = DatasetRegistry.getAllDatasets("master-dataset")
+ val datasets: List[Dataset] = DatasetRegistry.getAllDatasets(Some("master"))
datasets.filter(dataset => {
- dataset.datasetConfig.indexData.nonEmpty && dataset.datasetConfig.indexData.get && dataset.status == DatasetStatus.Live
+ dataset.datasetConfig.indexingConfig.olapStoreEnabled && dataset.status == DatasetStatus.Live
})
}
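
The indexer above now reads Redis connection details from the nested cache_config block instead of the flat dataset_config fields. A minimal sketch of that lookup, assuming spark-redis's RedisEndpoint (already used by createDataFile) and DatasetModels are on the classpath; the object name and the localhost/6379/0 fallbacks are illustrative, not defaults defined by this change:

    import com.redislabs.provider.redis.RedisEndpoint // spark-redis, as already used above (package assumed)
    import org.sunbird.obsrv.model.DatasetModels.{CacheConfig, Dataset}

    object RedisEndpointSketch {
      // Resolve the Redis endpoint for a dataset from the new CacheConfig block.
      def resolve(dataset: Dataset): RedisEndpoint = {
        val cacheConfig = dataset.datasetConfig.cacheConfig.getOrElse(CacheConfig(None, None, None))
        RedisEndpoint(
          host = cacheConfig.redisDBHost.getOrElse("localhost"), // fallback values are illustrative only
          port = cacheConfig.redisDBPort.getOrElse(6379),
          dbNum = cacheConfig.redisDB.getOrElse(0)
        )
      }
    }
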
diff --git a/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala b/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala
index 0d54050e..6ef1458e 100644
--- a/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala
+++ b/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala
@@ -108,16 +108,16 @@ class MasterDataIndexerSpec extends FlatSpec with BeforeAndAfterAll with Matcher
}
private def insertTestData(postgresConnect: PostgresConnect) = {
- postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md1','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');")
+ postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md1','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');")
postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md1_md1.1_DAY', 'md1', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md1.1_DAY', 'md1.1_DAY');")
postgresConnect.execute("insert into dataset_transformations values('tf1', 'md1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Live', 'Strict', 'System', 'System', now(), now());")
- postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md2','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\":6340, \"index_data\": true, \"redis_db\": 5}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', 'now()', 'now()');")
+ postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md2','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\":6340, \"index_data\": true, \"redis_db\": 5}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', 'now()', 'now()');")
postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md2_md1.1_DAY', 'md2', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md2.1_DAY', 'md2.1_DAY');")
- postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md3','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 6}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');")
+ postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md3','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 6}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');")
postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md3_md3.1_DAY', 'md3', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md3.1_DAY', 'md3.1_DAY');")
postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md3_md3.2_DAY', 'md3', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md3.2_DAY', 'md3.2_DAY');")
- postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md5','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 9}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');")
- postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md4','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');")
+ postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md5','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 9}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');")
+ postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md4','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');")
postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md4_md4.1_DAY', 'md4', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md4.1_DAY', 'md4.1_DAY');")
}
diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala
index 3aebe8bd..8bc1623b 100644
--- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala
+++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala
@@ -4,7 +4,7 @@ import com.fasterxml.jackson.annotation.JsonProperty
import com.fasterxml.jackson.core.`type`.TypeReference
import com.fasterxml.jackson.module.scala.JsonScalaEnumeration
import org.sunbird.obsrv.core.model.SystemConfig
-import org.sunbird.obsrv.model.DatasetStatus.DatasetStatus
+import org.sunbird.obsrv.model.DatasetStatus.{DatasetStatus, Value}
import org.sunbird.obsrv.model.TransformMode.TransformMode
import org.sunbird.obsrv.model.ValidationMode.ValidationMode
@@ -25,25 +25,42 @@ object DatasetModels {
case class ValidationConfig(@JsonProperty("validate") validate: Option[Boolean] = Some(true),
@JsonProperty("mode") @JsonScalaEnumeration(classOf[ValidationModeType]) mode: Option[ValidationMode])
- case class DenormFieldConfig(@JsonProperty("denorm_key") denormKey: String, @JsonProperty("redis_db") redisDB: Int,
- @JsonProperty("denorm_out_field") denormOutField: String)
+ case class DenormFieldConfig(@JsonProperty("denorm_key") denormKey: Option[String], @JsonProperty("redis_db") redisDB: Int,
+ @JsonProperty("denorm_out_field") denormOutField: String, @JsonProperty("jsonata_expr") jsonAtaExpr: Option[String])
case class DenormConfig(@JsonProperty("redis_db_host") redisDBHost: String, @JsonProperty("redis_db_port") redisDBPort: Int,
@JsonProperty("denorm_fields") denormFields: List[DenormFieldConfig])
case class RouterConfig(@JsonProperty("topic") topic: String)
- case class DatasetConfig(@JsonProperty("data_key") key: String, @JsonProperty("timestamp_key") tsKey: String, @JsonProperty("entry_topic") entryTopic: String,
- @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None, @JsonProperty("redis_db_host") redisDBHost: Option[String] = None,
- @JsonProperty("redis_db_port") redisDBPort: Option[Int] = None, @JsonProperty("redis_db") redisDB: Option[Int] = None,
- @JsonProperty("index_data") indexData: Option[Boolean] = None, @JsonProperty("timestamp_format") tsFormat: Option[String] = None,
- @JsonProperty("dataset_tz") datasetTimezone: Option[String] = None)
+ case class IndexingConfig(@JsonProperty("olap_store_enabled") olapStoreEnabled: Boolean, @JsonProperty("lakehouse_enabled") lakehouseEnabled: Boolean,
+ @JsonProperty("cache_enabled") cacheEnabled: Boolean)
+
+ case class KeysConfig(@JsonProperty("data_key") dataKey: Option[String], @JsonProperty("partition_key") partitionKey: Option[String],
+ @JsonProperty("timestamp_key") tsKey: Option[String], @JsonProperty("timestamp_format") tsFormat: Option[String])
+
+ case class CacheConfig(@JsonProperty("redis_db_host") redisDBHost: Option[String], @JsonProperty("redis_db_port") redisDBPort: Option[Int],
+ @JsonProperty("redis_db") redisDB: Option[Int])
+
+ case class DatasetConfigV1(@JsonProperty("data_key") key: String, @JsonProperty("timestamp_key") tsKey: String, @JsonProperty("entry_topic") entryTopic: String,
+ @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None, @JsonProperty("redis_db_host") redisDBHost: Option[String] = None,
+ @JsonProperty("redis_db_port") redisDBPort: Option[Int] = None, @JsonProperty("redis_db") redisDB: Option[Int] = None,
+ @JsonProperty("index_data") indexData: Option[Boolean] = None, @JsonProperty("timestamp_format") tsFormat: Option[String] = None,
+ @JsonProperty("dataset_tz") datasetTimezone: Option[String] = None)
+
+ case class DatasetConfig(@JsonProperty("indexing_config") indexingConfig: IndexingConfig,
+ @JsonProperty("keys_config") keysConfig: KeysConfig,
+ @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None,
+ @JsonProperty("dataset_tz") datasetTimezone: Option[String] = None,
+ @JsonProperty("cache_config") cacheConfig: Option[CacheConfig] = None)
case class Dataset(@JsonProperty("id") id: String, @JsonProperty("type") datasetType: String, @JsonProperty("extraction_config") extractionConfig: Option[ExtractionConfig],
@JsonProperty("dedup_config") dedupConfig: Option[DedupConfig], @JsonProperty("validation_config") validationConfig: Option[ValidationConfig],
@JsonProperty("data_schema") jsonSchema: Option[String], @JsonProperty("denorm_config") denormConfig: Option[DenormConfig],
- @JsonProperty("router_config") routerConfig: RouterConfig, datasetConfig: DatasetConfig, @JsonProperty("status") @JsonScalaEnumeration(classOf[DatasetStatusType]) status: DatasetStatus,
- @JsonProperty("tags") tags: Option[Array[String]] = None, @JsonProperty("data_version") dataVersion: Option[Int] = None)
+ @JsonProperty("router_config") routerConfig: RouterConfig, datasetConfig: DatasetConfig,
+ @JsonProperty("status") @JsonScalaEnumeration(classOf[DatasetStatusType]) status: DatasetStatus,
+ @JsonProperty("entry_topic") entryTopic: String, @JsonProperty("tags") tags: Option[Array[String]] = None,
+ @JsonProperty("data_version") dataVersion: Option[Int] = None, @JsonProperty("api_version") apiVersion: Option[String] = None)
case class Condition(@JsonProperty("type") `type`: String, @JsonProperty("expr") expr: String)
@@ -51,7 +68,7 @@ object DatasetModels {
case class DatasetTransformation(@JsonProperty("id") id: String, @JsonProperty("dataset_id") datasetId: String,
@JsonProperty("field_key") fieldKey: String, @JsonProperty("transformation_function") transformationFunction: TransformationFunction,
- @JsonProperty("status") status: String, @JsonProperty("mode") @JsonScalaEnumeration(classOf[TransformModeType]) mode: Option[TransformMode] = Some(TransformMode.Strict))
+ @JsonProperty("mode") @JsonScalaEnumeration(classOf[TransformModeType]) mode: Option[TransformMode] = Some(TransformMode.Strict))
case class ConnectorConfig(@JsonProperty("kafkaBrokers") kafkaBrokers: String, @JsonProperty("topic") topic: String, @JsonProperty("type") databaseType: String,
@JsonProperty("connection") connection: Connection, @JsonProperty("tableName") tableName: String, @JsonProperty("databaseName") databaseName: String,
@@ -94,4 +111,10 @@ class DatasetStatusType extends TypeReference[DatasetStatus.type]
object DatasetStatus extends Enumeration {
type DatasetStatus = Value
val Draft, Publish, Live, Retired, Purged = Value
+}
+
+class DatasetTypeType extends TypeReference[DatasetType.type]
+object DatasetType extends Enumeration {
+ type DatasetType = Value
+ val event, transaction, master = Value
}
\ No newline at end of file
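
The new DatasetConfig shape maps one-to-one onto the @JsonProperty names above. A minimal sketch, with illustrative values, of a v2 dataset_config document and its deserialization through the same JSONUtil helper the registry service below relies on (the JSONUtil import path is assumed from the framework module):

    import org.sunbird.obsrv.core.util.JSONUtil // existing Jackson helper; import path assumed
    import org.sunbird.obsrv.model.DatasetModels.DatasetConfig

    object DatasetConfigV2Sketch {
      // Example payload only; field names follow the case classes above.
      val v2ConfigJson: String =
        """{
          |  "indexing_config": {"olap_store_enabled": true, "lakehouse_enabled": false, "cache_enabled": false},
          |  "keys_config": {"data_key": "id", "partition_key": "id", "timestamp_key": "date", "timestamp_format": "yyyy-MM-dd"},
          |  "exclude_fields": [],
          |  "dataset_tz": "UTC",
          |  "cache_config": {"redis_db_host": "localhost", "redis_db_port": 6379, "redis_db": 2}
          |}""".stripMargin

      val parsed: DatasetConfig = JSONUtil.deserialize[DatasetConfig](v2ConfigJson)
    }
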
diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala
index 08921adc..0945fa58 100644
--- a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala
+++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala
@@ -12,9 +12,14 @@ object DatasetRegistry {
datasets ++= DatasetRegistryService.readAllDatasets()
lazy private val datasetTransformations: Map[String, List[DatasetTransformation]] = DatasetRegistryService.readAllDatasetTransformations()
- def getAllDatasets(datasetType: String): List[Dataset] = {
+ def getAllDatasets(datasetType: Option[String]): List[Dataset] = {
val datasetList = DatasetRegistryService.readAllDatasets()
- datasetList.filter(f => f._2.datasetType.equals(datasetType)).values.toList
+ if(datasetType.isDefined) {
+ datasetList.filter(f => f._2.datasetType.equals(datasetType.get)).values.toList
+ } else {
+ datasetList.values.toList
+ }
+
}
def getDataset(id: String): Option[Dataset] = {
@@ -47,8 +52,8 @@ object DatasetRegistry {
datasourceList.getOrElse(List())
}
- def getDataSetIds(datasetType: String): List[String] = {
- datasets.filter(f => f._2.datasetType.equals(datasetType)).keySet.toList
+ def getDataSetIds(): List[String] = {
+ datasets.keySet.toList
}
def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = {
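
With the datasetType parameter now optional, callers pick between a typed subset and the whole registry, and metric setup no longer filters by type at all. A small usage sketch; the wrapping object name is illustrative, and the type string follows the new DatasetType enumeration:

    import org.sunbird.obsrv.registry.DatasetRegistry

    object RegistryLookupSketch {
      // Only master datasets (the same call MasterDataProcessorIndexer.getDatasets now makes).
      val masterDatasets = DatasetRegistry.getAllDatasets(Some("master"))
      // Every dataset, regardless of type.
      val allDatasets = DatasetRegistry.getAllDatasets(None)
      // Metric initialisation is no longer scoped by dataset type.
      val datasetIds: List[String] = DatasetRegistry.getDataSetIds()
    }
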
diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala
index 0b0abe23..a0a90655 100644
--- a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala
+++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala
@@ -6,7 +6,7 @@ import org.sunbird.obsrv.model.DatasetModels._
import org.sunbird.obsrv.model.{DatasetStatus, TransformMode}
import java.io.File
-import java.sql.{PreparedStatement, ResultSet, Timestamp}
+import java.sql.{ResultSet, Timestamp}
object DatasetRegistryService {
private val configFile = new File("/data/flink/conf/baseconfig.conf")
@@ -41,6 +41,21 @@ object DatasetRegistryService {
}
}
+ def readDataset(id: String): Option[Dataset] = {
+
+ val postgresConnect = new PostgresConnect(postgresConfig)
+ try {
+ val rs = postgresConnect.executeQuery(s"SELECT * FROM datasets where id='$id'")
+ if (rs.next()) {
+ Some(parseDataset(rs))
+ } else {
+ None
+ }
+ } finally {
+ postgresConnect.closeConnection()
+ }
+ }
+
def readDataset(id: String): Option[Dataset] = {
val postgresConnect = new PostgresConnect(postgresConfig)
var preparedStatement: PreparedStatement = null
@@ -76,6 +91,20 @@ object DatasetRegistryService {
}
}
+ def readDatasetSourceConfig(datasetId: String): Option[List[DatasetSourceConfig]] = {
+
+ val postgresConnect = new PostgresConnect(postgresConfig)
+ try {
+ val rs = postgresConnect.executeQuery(s"SELECT * FROM dataset_source_config where dataset_id='$datasetId'")
+ Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => {
+ val datasetSourceConfig = parseDatasetSourceConfig(result)
+ datasetSourceConfig
+ }).toList)
+ } finally {
+ postgresConnect.closeConnection()
+ }
+ }
+
def readDatasetSourceConfig(datasetId: String): Option[List[DatasetSourceConfig]] = {
val postgresConnect = new PostgresConnect(postgresConfig)
@@ -112,20 +141,14 @@ object DatasetRegistryService {
}
def readDatasources(datasetId: String): Option[List[DataSource]] = {
+
val postgresConnect = new PostgresConnect(postgresConfig)
- var preparedStatement: PreparedStatement = null
- var resultSet: ResultSet = null
try {
- val query = "SELECT * FROM datasources WHERE dataset_id = ?"
- preparedStatement = postgresConnect.prepareStatement(query)
- preparedStatement.setString(1, datasetId)
- resultSet = postgresConnect.executeQuery(preparedStatement = preparedStatement)
- Option(Iterator.continually((resultSet, resultSet.next)).takeWhile(f => f._2).map(f => f._1).map(result => {
+ val rs = postgresConnect.executeQuery(s"SELECT * FROM datasources where dataset_id='$datasetId'")
+ Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => {
parseDatasource(result)
}).toList)
} finally {
- if (resultSet != null) resultSet.close()
- if (preparedStatement != null) preparedStatement.close()
postgresConnect.closeConnection()
}
}
@@ -133,24 +156,13 @@ object DatasetRegistryService {
def readAllDatasources(): Option[List[DataSource]] = {
val postgresConnect = new PostgresConnect(postgresConfig)
try {
val rs = postgresConnect.executeQuery(s"SELECT * FROM datasources")
Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => {
parseDatasource(result)
}).toList)
- }
- }
-
- def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = {
- val postgresConnect = new PostgresConnect(postgresConfig)
- var preparedStatement: PreparedStatement = null
- val query = "UPDATE datasources SET datasource_ref = ? WHERE datasource = ? AND dataset_id = ?"
- try {
- preparedStatement = postgresConnect.prepareStatement(query)
- preparedStatement.setString(1, datasourceRef)
- preparedStatement.setString(2, datasource.datasource)
- preparedStatement.setString(3, datasource.datasetId)
- postgresConnect.executeUpdate(preparedStatement)
} finally {
- if (preparedStatement != null) preparedStatement.close()
postgresConnect.closeConnection()
@@ -174,6 +186,37 @@ object DatasetRegistryService {
}
}
+ def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = {
+ val query = s"UPDATE datasources set datasource_ref = '$datasourceRef' where datasource='${datasource.datasource}' and dataset_id='${datasource.datasetId}'"
+ updateRegistry(query)
+ }
+
+ def updateConnectorStats(id: String, lastFetchTimestamp: Timestamp, records: Long): Int = {
+ val query = s"UPDATE dataset_source_config SET connector_stats = coalesce(connector_stats, '{}')::jsonb || " +
+ s"jsonb_build_object('records', COALESCE(connector_stats->>'records', '0')::int + '$records'::int) || " +
+ s"jsonb_build_object('last_fetch_timestamp', '$lastFetchTimestamp'::timestamp) || " +
+ s"jsonb_build_object('last_run_timestamp', '${new Timestamp(System.currentTimeMillis())}'::timestamp) WHERE id = '$id';"
+ updateRegistry(query)
+ }
+
+ def updateConnectorDisconnections(id: String, disconnections: Int): Int = {
+ val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(coalesce(connector_stats, '{}')::jsonb, '{disconnections}','$disconnections') WHERE id = '$id'"
+ updateRegistry(query)
+ }
+
+ def updateConnectorAvgBatchReadTime(id: String, avgReadTime: Long): Int = {
+ val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(coalesce(connector_stats, '{}')::jsonb, '{avg_batch_read_time}','$avgReadTime') WHERE id = '$id'"
+ updateRegistry(query)
+ }
+
+ private def updateRegistry(query: String): Int = {
+ val postgresConnect = new PostgresConnect(postgresConfig)
+ try {
+ postgresConnect.executeUpdate(query)
+ } finally {
+ postgresConnect.closeConnection()
+ }
+ }
def updateConnectorDisconnections(id: String, disconnections: Int): Int = {
val postgresConnect = new PostgresConnect(postgresConfig)
@@ -214,11 +257,25 @@ object DatasetRegistryService {
val jsonSchema = rs.getString("data_schema")
val denormConfig = rs.getString("denorm_config")
val routerConfig = rs.getString("router_config")
- val datasetConfig = rs.getString("dataset_config")
+ val datasetConfigStr = rs.getString("dataset_config")
val status = rs.getString("status")
val tagArray = rs.getArray("tags")
val tags = if (tagArray != null) tagArray.getArray.asInstanceOf[Array[String]] else null
val dataVersion = rs.getInt("data_version")
+ val apiVersion = rs.getString("api_version")
+ val entryTopic = rs.getString("entry_topic")
+
+ val datasetConfig: DatasetConfig = if ("v2".equalsIgnoreCase(apiVersion)) {
+ JSONUtil.deserialize[DatasetConfig](datasetConfigStr)
+ } else {
+ val v1Config = JSONUtil.deserialize[DatasetConfigV1](datasetConfigStr)
+ DatasetConfig(
+ indexingConfig = IndexingConfig(olapStoreEnabled = true, lakehouseEnabled = false, cacheEnabled = "master".equalsIgnoreCase(datasetType)),
+ keysConfig = KeysConfig(dataKey = Some(v1Config.key), partitionKey = None, tsKey = Some(v1Config.tsKey), tsFormat = None),
+ excludeFields = v1Config.excludeFields, datasetTimezone = v1Config.datasetTimezone,
+ cacheConfig = Some(CacheConfig(redisDBHost = v1Config.redisDBHost, redisDBPort = v1Config.redisDBPort, redisDB = v1Config.redisDB))
+ )
+ }
Dataset(datasetId, datasetType,
if (extractionConfig == null) None else Some(JSONUtil.deserialize[ExtractionConfig](extractionConfig)),
@@ -227,10 +284,12 @@ object DatasetRegistryService {
Option(jsonSchema),
if (denormConfig == null) None else Some(JSONUtil.deserialize[DenormConfig](denormConfig)),
JSONUtil.deserialize[RouterConfig](routerConfig),
- JSONUtil.deserialize[DatasetConfig](datasetConfig),
+ datasetConfig,
DatasetStatus.withName(status),
+ entryTopic,
Option(tags),
- Option(dataVersion)
+ Option(dataVersion),
+ Option(apiVersion)
)
}
@@ -265,10 +324,9 @@ object DatasetRegistryService {
val datasetId = rs.getString("dataset_id")
val fieldKey = rs.getString("field_key")
val transformationFunction = rs.getString("transformation_function")
- val status = rs.getString("status")
val mode = rs.getString("mode")
- DatasetTransformation(id, datasetId, fieldKey, JSONUtil.deserialize[TransformationFunction](transformationFunction), status, Some(if (mode != null) TransformMode.withName(mode) else TransformMode.Strict))
+ DatasetTransformation(id, datasetId, fieldKey, JSONUtil.deserialize[TransformationFunction](transformationFunction), Some(if (mode != null) TransformMode.withName(mode) else TransformMode.Strict))
}
}
\ No newline at end of file
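
All of the new single-statement update helpers funnel through updateRegistry, which opens a PostgresConnect per call and always closes it in the finally block. A usage sketch; the "jdbc-c1" id is a hypothetical dataset_source_config row used only for illustration:

    import java.sql.Timestamp
    import org.sunbird.obsrv.service.DatasetRegistryService

    object ConnectorStatsSketch {
      // Increment the records counter and stamp the fetch/run times for one connector row.
      val rowsUpdated: Int = DatasetRegistryService.updateConnectorStats(
        id = "jdbc-c1", // hypothetical id
        lastFetchTimestamp = new Timestamp(System.currentTimeMillis()),
        records = 125L
      )
      // Each helper below overwrites a single jsonb field inside connector_stats.
      DatasetRegistryService.updateConnectorDisconnections("jdbc-c1", disconnections = 0)
      DatasetRegistryService.updateConnectorAvgBatchReadTime("jdbc-c1", avgReadTime = 180L)
    }
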
diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala
index 4e992eba..9c454ec0 100644
--- a/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala
+++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala
@@ -28,11 +28,11 @@ trait SystemEventHandler {
}
private def getTime(timespans: Map[String, AnyRef], producer: Producer): Option[Long] = {
- timespans.get(producer.toString).map(f => f.asInstanceOf[Long])
+ timespans.get(producer.toString).map(f => f.asInstanceOf[Number].longValue())
}
private def getStat(obsrvMeta: Map[String, AnyRef], stat: Stats): Option[Long] = {
- obsrvMeta.get(stat.toString).map(f => f.asInstanceOf[Long])
+ obsrvMeta.get(stat.toString).map(f => f.asInstanceOf[Number].longValue())
}
def getError(error: ErrorConstants.Error, producer: Producer, functionalError: FunctionalError): Option[ErrorLog] = {
@@ -74,7 +74,7 @@ abstract class BaseDatasetProcessFunction(config: BaseJobConfig[mutable.Map[Stri
override def getMetricsList(): MetricsList = {
val metrics = getMetrics() ++ List(config.eventFailedMetricsCount)
- MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics)
+ MetricsList(DatasetRegistry.getDataSetIds(), metrics)
}
private def initMetrics(datasetId: String): Unit = {
@@ -138,7 +138,7 @@ abstract class BaseDatasetWindowProcessFunction(config: BaseJobConfig[mutable.Ma
override def getMetricsList(): MetricsList = {
val metrics = getMetrics() ++ List(config.eventFailedMetricsCount)
- MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics)
+ MetricsList(DatasetRegistry.getDataSetIds(), metrics)
}
private def initMetrics(datasetId: String): Unit = {
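
The asInstanceOf change above matters because values pulled out of a Map[String, AnyRef] built from event JSON can arrive as java.lang.Integer for small numbers, and unboxing a boxed Integer directly to Long fails at runtime. A self-contained illustration of the failure mode the Number cast avoids:

    object TimespanCastSketch {
      val fromJson: AnyRef = Integer.valueOf(42)   // small numeric JSON values are often boxed as Integer
      // fromJson.asInstanceOf[Long]               // would throw ClassCastException: Integer cannot be cast to Long
      val safe: Long = fromJson.asInstanceOf[Number].longValue() // works for Integer, Long, Double alike
    }
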
diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala
index 1b3edea0..53a40ddd 100644
--- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala
+++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala
@@ -35,18 +35,18 @@ class BaseSpecWithDatasetRegistry extends BaseSpecWithPostgres {
private def createSchema(postgresConnect: PostgresConnect): Unit = {
- postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasets ( id text PRIMARY KEY, type text NOT NULL, validation_config json, extraction_config json, dedup_config json, data_schema json, denorm_config json, router_config json NOT NULL, dataset_config json NOT NULL, status text NOT NULL, tags text[], data_version INT, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL );")
+ postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasets ( id text PRIMARY KEY, type text NOT NULL, validation_config json, extraction_config json, dedup_config json, data_schema json, denorm_config json, router_config json NOT NULL, dataset_config json NOT NULL, status text NOT NULL, tags text[], data_version INT, api_version VARCHAR(255) NOT NULL, entry_topic TEXT NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL );")
postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), type text NOT NULL, ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );")
- postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function json NOT NULL, status text NOT NULL, mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );")
+ postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function json NOT NULL, mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );")
postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_source_config ( id text PRIMARY KEY, dataset_id text NOT NULL REFERENCES datasets (id), connector_type text NOT NULL, connector_config json NOT NULL, status text NOT NULL, connector_stats json, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(connector_type, dataset_id) );")
}
private def insertTestData(postgresConnect: PostgresConnect): Unit = {
- postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d1', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d1', 'event', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'v1', 'ingest', 'System', 'System', now(), now());")
postgresConnect.execute("update datasets set denorm_config = '{\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"denorm_fields\":[{\"denorm_key\":\"vehicleCode\",\"redis_db\":2,\"denorm_out_field\":\"vehicleData\"}]}' where id='d1';")
- postgresConnect.execute("insert into dataset_transformations values('tf1', 'd1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Live', 'Strict', 'System', 'System', now(), now());")
- postgresConnect.execute("insert into dataset_transformations values('tf2', 'd1', 'dealer.maskedPhone', '{\"type\":\"mask\",\"expr\": \"dealer.phone\"}', 'Live', null, 'System', 'System', now(), now());")
- postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d2', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);")
+ postgresConnect.execute("insert into dataset_transformations values('tf1', 'd1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Strict', 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into dataset_transformations values('tf2', 'd1', 'dealer.maskedPhone', '{\"type\":\"mask\",\"expr\": \"dealer.phone\"}', null, 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d2', 'event', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);")
}
def getPrintableMetrics(metricsMap: mutable.Map[String, Long]): Map[String, Map[String, Map[String, Long]]] = {
diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala
index 3d83552d..dcdcf402 100644
--- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala
+++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala
@@ -23,7 +23,7 @@ class TestDatasetRegistrySpec extends BaseSpecWithDatasetRegistry with Matchers
d2Opt.get.denormConfig should be(None)
val postgresConnect = new PostgresConnect(postgresConfig)
- postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'event', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);")
postgresConnect.closeConnection()
val d3Opt = DatasetRegistry.getDataset("d3")
@@ -34,14 +34,14 @@ class TestDatasetRegistrySpec extends BaseSpecWithDatasetRegistry with Matchers
val d4Opt = DatasetRegistry.getDataset("d4")
d4Opt should be (None)
- val allDatasets = DatasetRegistry.getAllDatasets("dataset")
+ val allDatasets = DatasetRegistry.getAllDatasets(Some("event"))
allDatasets.size should be(3)
val d1Tfs = DatasetRegistry.getDatasetTransformations("d1")
d1Tfs should not be None
d1Tfs.get.size should be(2)
- val ids = DatasetRegistry.getDataSetIds("dataset").sortBy(f => f)
+ val ids = DatasetRegistry.getDataSetIds().sortBy(f => f)
ids.head should be("d1")
ids.apply(1) should be("d2")
ids.apply(2) should be("d3")
diff --git a/framework/pom.xml b/framework/pom.xml
index 52ced63f..263a52a7 100644
--- a/framework/pom.xml
+++ b/framework/pom.xml
@@ -44,7 +44,7 @@
         <dependency>
             <groupId>org.apache.httpcomponents</groupId>
             <artifactId>httpclient</artifactId>
-            <version>4.5.1</version>
+            <version>4.5.13</version>
         </dependency>
         <dependency>
             <groupId>com.google.code.gson</groupId>
@@ -98,7 +98,7 @@
         <dependency>
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
-            <version>4.12</version>
+            <version>4.13.1</version>
             <scope>test</scope>
         </dependency>
@@ -144,12 +144,6 @@
             <version>1.0.0</version>
             <scope>test</scope>
         </dependency>
-        <dependency>
-            <groupId>org.cassandraunit</groupId>
-            <artifactId>cassandra-unit</artifactId>
-            <version>3.11.2.0</version>
-            <scope>test</scope>
-        </dependency>
diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala
index 466552dd..c20bb925 100644
--- a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala
+++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala
@@ -16,5 +16,4 @@ object Constants {
val TOPIC = "topic"
val MESSAGE = "message"
val DATALAKE_TYPE = "datalake"
- val MASTER_DATASET_TYPE = "master-dataset"
}
diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala
index b5e57d87..6b9fcc08 100644
--- a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala
+++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala
@@ -24,7 +24,7 @@ object ErrorConstants extends Enumeration {
val JSON_SCHEMA_NOT_FOUND = ErrorInternalValue("ERR_PP_1011", "Json schema not found for the dataset")
val INVALID_JSON_SCHEMA = ErrorInternalValue("ERR_PP_1012", "Invalid json schema")
val SCHEMA_VALIDATION_FAILED = ErrorInternalValue("ERR_PP_1013", "Event failed the schema validation")
- val DENORM_KEY_MISSING = ErrorInternalValue("ERR_DENORM_1014", "No denorm key found or missing data for the specified key")
+ val DENORM_KEY_MISSING = ErrorInternalValue("ERR_DENORM_1014", "No denorm key or transformation expr found or missing data for the specified key")
val DENORM_KEY_NOT_A_STRING_OR_NUMBER = ErrorInternalValue("ERR_DENORM_1015", "Denorm key value is not a String or Number")
val DENORM_DATA_NOT_FOUND = ErrorInternalValue("ERR_DENORM_1016", "Denorm data not found for the given key")
val MISSING_DATASET_CONFIG_KEY = ErrorInternalValue("ERR_MASTER_DATA_1017", "Master dataset configuration key is missing")
diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala
index e4c05e4c..0adb1098 100644
--- a/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala
+++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala
@@ -6,7 +6,7 @@ import org.sunbird.obsrv.core.model.Models.SystemSetting
import org.sunbird.obsrv.core.util.{PostgresConnect, PostgresConnectionConfig}
import java.io.File
-import java.sql.{PreparedStatement, ResultSet}
+import java.sql.ResultSet
object SystemConfig {
@@ -102,17 +102,10 @@ object SystemConfigService {
@throws[Exception]
def getSystemSetting(key: String): Option[SystemSetting] = {
val postgresConnect = new PostgresConnect(postgresConfig)
- var preparedStatement: PreparedStatement = null
- var rs: ResultSet = null
- val query = "SELECT * FROM system_settings WHERE key = ?"
- preparedStatement = postgresConnect.prepareStatement(query)
- preparedStatement.setString(1, key)
try {
- rs = postgresConnect.executeQuery(preparedStatement = preparedStatement)
+ val rs = postgresConnect.executeQuery(s"SELECT * FROM system_settings WHERE key = '$key'")
if (rs.next) Option(parseSystemSetting(rs)) else None
} finally {
- if (rs != null) rs.close()
- if (preparedStatement != null) preparedStatement.close()
postgresConnect.closeConnection()
}
}
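
getSystemSetting now builds the lookup as a single interpolated query string. A parameterized variant of the same lookup remains possible with plain JDBC; a rough sketch under that assumption (java.sql only, not the project's PostgresConnect API, connection handling simplified):

    import java.sql.{Connection, PreparedStatement, ResultSet}

    object SystemSettingLookupSketch {

      // Looks up a system_settings row by key with a bound parameter instead of string interpolation
      def hasSetting(connection: Connection, key: String): Boolean = {
        val ps: PreparedStatement = connection.prepareStatement("SELECT * FROM system_settings WHERE key = ?")
        try {
          ps.setString(1, key) // the JDBC driver handles quoting/escaping of the key
          val rs: ResultSet = ps.executeQuery()
          try rs.next() finally rs.close()
        } finally {
          ps.close()
        }
      }
    }
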
diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala
index 370353c7..d68b924a 100644
--- a/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala
+++ b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala
@@ -46,6 +46,40 @@ class MapDeserializationSchema extends KafkaRecordDeserializationSchema[mutable.
}
+class TopicDeserializationSchema extends KafkaRecordDeserializationSchema[mutable.Map[String, AnyRef]] {
+
+ private val serialVersionUID = -3224825136576915426L
+
+ override def getProducedType: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]])
+
+ override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]], out: Collector[mutable.Map[String, AnyRef]]): Unit = {
+ val msg = try {
+ val event = JSONUtil.deserialize[Map[String, AnyRef]](record.value())
+ mutable.Map[String, AnyRef](
+ "dataset" -> record.topic(),
+ "event" -> event
+ )
+ } catch {
+ case _: Exception =>
+ mutable.Map[String, AnyRef](Constants.INVALID_JSON -> new String(record.value, "UTF-8"))
+ }
+ initObsrvMeta(msg, record)
+ out.collect(msg)
+ }
+
+ private def initObsrvMeta(msg: mutable.Map[String, AnyRef], record: ConsumerRecord[Array[Byte], Array[Byte]]): Unit = {
+ if (!msg.contains("obsrv_meta")) {
+ msg.put("obsrv_meta", Map(
+ "syncts" -> record.timestamp(),
+ "processingStartTime" -> System.currentTimeMillis(),
+ "flags" -> Map(),
+ "timespans" -> Map(),
+ "error" -> Map()
+ ))
+ }
+ }
+}
+
class StringDeserializationSchema extends KafkaRecordDeserializationSchema[String] {
private val serialVersionUID = -3224825136576915426L
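
The new TopicDeserializationSchema above uses the source topic name as the dataset identifier for each record; the same wrapping logic, stripped of the Flink/Kafka plumbing, looks roughly like this (Jackson with the Scala module is assumed for parsing, and the literal "invalid_json" key stands in for Constants.INVALID_JSON):

    import com.fasterxml.jackson.databind.ObjectMapper
    import com.fasterxml.jackson.module.scala.DefaultScalaModule

    import scala.collection.mutable

    object TopicWrapSketch {

      private val mapper = new ObjectMapper().registerModule(DefaultScalaModule)

      // Mirrors deserialize(): the topic becomes "dataset", the payload becomes "event",
      // and unparseable payloads are preserved under an invalid-json key instead of being dropped
      def wrap(topic: String, payload: Array[Byte]): mutable.Map[String, AnyRef] = {
        try {
          val event = mapper.readValue(payload, classOf[Map[String, AnyRef]])
          mutable.Map[String, AnyRef]("dataset" -> topic, "event" -> event)
        } catch {
          case _: Exception =>
            mutable.Map[String, AnyRef]("invalid_json" -> new String(payload, "UTF-8"))
        }
      }

      def main(args: Array[String]): Unit = {
        println(wrap("dataset3", """{"code":"HYUN-CRE-D6"}""".getBytes("UTF-8")))
      }
    }
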
diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala
index cb4657c3..51753f75 100644
--- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala
+++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala
@@ -53,8 +53,6 @@ abstract class BaseJobConfig[T](val config: Config, val jobName: String) extends
val checkpointingBaseUrl: Option[String] = if (config.hasPath("job.statebackend.base.url")) Option(config.getString("job.statebackend.base.url")) else None
// Base Methods
- def datasetType(): String = if (config.hasPath("dataset.type")) config.getString("dataset.type") else "dataset"
-
def inputTopic(): String
def inputConsumer(): String
diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala
index 8ebdb8a7..bdc897da 100644
--- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala
+++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala
@@ -38,6 +38,13 @@ abstract class BaseStreamTask[T] extends BaseStreamTaskSink[T] {
.rebalance()
}
+ def getTopicMapDataStream(env: StreamExecutionEnvironment, config: BaseJobConfig[T], kafkaTopics: List[String],
+ consumerSourceName: String, kafkaConnector: FlinkKafkaConnector): DataStream[mutable.Map[String, AnyRef]] = {
+ env.fromSource(kafkaConnector.kafkaTopicMapSource(kafkaTopics), WatermarkStrategy.noWatermarks[mutable.Map[String, AnyRef]](), consumerSourceName)
+ .uid(consumerSourceName).setParallelism(config.kafkaConsumerParallelism)
+ .rebalance()
+ }
+
def getStringDataStream(env: StreamExecutionEnvironment, config: BaseJobConfig[T], kafkaConnector: FlinkKafkaConnector): DataStream[String] = {
env.fromSource(kafkaConnector.kafkaStringSource(config.inputTopic()), WatermarkStrategy.noWatermarks[String](), config.inputConsumer())
.uid(config.inputConsumer()).setParallelism(config.kafkaConsumerParallelism)
diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala
index 508e1e7c..39552dd7 100644
--- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala
+++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala
@@ -47,6 +47,15 @@ class FlinkKafkaConnector(config: BaseJobConfig[_]) extends Serializable {
.build()
}
+ def kafkaTopicMapSource(kafkaTopics: List[String]): KafkaSource[mutable.Map[String, AnyRef]] = {
+ KafkaSource.builder[mutable.Map[String, AnyRef]]()
+ .setTopics(kafkaTopics.asJava)
+ .setDeserializer(new TopicDeserializationSchema)
+ .setProperties(config.kafkaConsumerProperties())
+ .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.EARLIEST))
+ .build()
+ }
+
def kafkaMapDynamicSink(): KafkaSink[mutable.Map[String, AnyRef]] = {
KafkaSink.builder[mutable.Map[String, AnyRef]]()
.setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE)
diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala b/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala
index 67156256..550e99d8 100644
--- a/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala
+++ b/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala
@@ -57,6 +57,10 @@ object JSONUtil {
root.at(path);
}
+ def getJsonNode(json: String): JsonNode = {
+ mapper.readTree(json);
+ }
+
private[this] def typeReference[T: Manifest] = new TypeReference[T] {
override def getType: Type = typeFromManifest(manifest[T])
}
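
getJsonNode is a thin wrapper over Jackson's readTree; combined with JSON Pointer navigation it gives tree-style access into raw JSON. A quick usage sketch (sample JSON is hypothetical):

    import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper}

    object JsonNodeSketch {

      def main(args: Array[String]): Unit = {
        val mapper = new ObjectMapper()
        // Equivalent in spirit to JSONUtil.getJsonNode: parse raw JSON into a traversable tree
        val node: JsonNode = mapper.readTree("""{"dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}""")
        // JSON Pointer navigation; a missing path yields a "missing node" rather than null
        println(node.at("/dealer/email").asText())      // john.doe@example.com
        println(node.at("/dealer/phone").isMissingNode) // true
      }
    }
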
diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala b/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala
index a1a23df9..64469882 100644
--- a/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala
+++ b/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala
@@ -71,41 +71,6 @@ class PostgresConnect(config: PostgresConnectionConfig) {
// $COVERAGE-ON$
}
- def prepareStatement(query: String): PreparedStatement = {
- try {
- connection.prepareStatement(query)
- } catch {
- case ex: SQLException =>
- ex.printStackTrace()
- logger.error("PostgresConnect:prepareStatement() - Exception", ex)
- reset()
- connection.prepareStatement(query)
- }
- }
-
- def executeUpdate(preparedStatement: PreparedStatement): Int = {
- try {
- preparedStatement.executeUpdate()
- } catch {
- case ex: SQLException =>
- ex.printStackTrace()
- logger.error("PostgresConnect:executeUpdate():PreparedStatement - Exception", ex)
- reset()
- preparedStatement.executeUpdate()
- }
- }
-
- def executeQuery(preparedStatement: PreparedStatement): ResultSet = {
- try {
- preparedStatement.executeQuery()
- } catch {
- case ex: SQLException =>
- logger.error("PostgresConnect:execute():PreparedStatement - Exception", ex)
- reset()
- preparedStatement.executeQuery()
- }
- }
-
def executeQuery(query:String):ResultSet = statement.executeQuery(query)
}
diff --git a/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala
index bac2b0ae..cdffa023 100644
--- a/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala
+++ b/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala
@@ -142,7 +142,6 @@ class BaseProcessFunctionTestSpec extends BaseSpecWithPostgres with Matchers {
val metrics = Metrics(mutable.Map("test" -> new ConcurrentHashMap[String, AtomicLong]()))
metrics.reset("test1", "m1")
- bsConfig.datasetType() should be ("dataset")
}
"TestBaseStreamTask" should "validate the getMapDataStream method" in {
diff --git a/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala
index 4ca0ad5e..f85347dd 100644
--- a/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala
+++ b/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala
@@ -104,7 +104,6 @@ class ModelsTestSpec extends FlatSpec with Matchers {
bsMapConfig.kafkaConsumerProperties()
bsMapConfig.enableDistributedCheckpointing should be (None)
bsMapConfig.checkpointingBaseUrl should be (None)
- bsMapConfig.datasetType() should be ("master-dataset")
val dsk = new DatasetKeySelector()
dsk.getKey(mutable.Map("dataset" -> "d1".asInstanceOf[AnyRef])) should be ("d1")
diff --git a/pipeline/cache-indexer/pom.xml b/pipeline/cache-indexer/pom.xml
new file mode 100644
index 00000000..36d76208
--- /dev/null
+++ b/pipeline/cache-indexer/pom.xml
@@ -0,0 +1,248 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <artifactId>pipeline</artifactId>
+        <groupId>org.sunbird.obsrv</groupId>
+        <version>1.0</version>
+    </parent>
+    <artifactId>cache-indexer</artifactId>
+    <version>1.0.0</version>
+    <name>Cache Indexer</name>
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <scoverage.plugin.version>1.4.0</scoverage.plugin.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-streaming-scala_${scala.maj.version}</artifactId>
+            <version>${flink.version}</version>
+            <scope>provided</scope>
+            <exclusions>
+                <exclusion>
+                    <groupId>com.fasterxml.jackson.core</groupId>
+                    <artifactId>jackson-databind</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.sunbird.obsrv</groupId>
+            <artifactId>framework</artifactId>
+            <version>1.0.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.sunbird.obsrv</groupId>
+            <artifactId>dataset-registry</artifactId>
+            <version>1.0.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.json4s</groupId>
+            <artifactId>json4s-native_${scala.maj.version}</artifactId>
+            <version>4.0.6</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.kafka</groupId>
+            <artifactId>kafka_${scala.maj.version}</artifactId>
+            <version>${kafka.version}</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.sunbird.obsrv</groupId>
+            <artifactId>framework</artifactId>
+            <version>1.0.0</version>
+            <type>test-jar</type>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.sunbird.obsrv</groupId>
+            <artifactId>dataset-registry</artifactId>
+            <version>1.0.0</version>
+            <type>test-jar</type>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-test-utils</artifactId>
+            <version>${flink.version}</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-runtime</artifactId>
+            <version>${flink.version}</version>
+            <scope>test</scope>
+            <classifier>tests</classifier>
+        </dependency>
+        <dependency>
+            <groupId>com.github.codemonstur</groupId>
+            <artifactId>embedded-redis</artifactId>
+            <version>1.0.0</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-streaming-java</artifactId>
+            <version>${flink.version}</version>
+            <scope>test</scope>
+            <classifier>tests</classifier>
+        </dependency>
+        <dependency>
+            <groupId>org.scalatest</groupId>
+            <artifactId>scalatest_2.12</artifactId>
+            <version>3.0.6</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-core</artifactId>
+            <version>3.3.3</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.fiftyonred</groupId>
+            <artifactId>mock-jedis</artifactId>
+            <version>0.4.0</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>io.github.embeddedkafka</groupId>
+            <artifactId>embedded-kafka_2.12</artifactId>
+            <version>3.4.0</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>io.zonky.test</groupId>
+            <artifactId>embedded-postgres</artifactId>
+            <version>2.0.3</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <sourceDirectory>src/main/scala</sourceDirectory>
+        <testSourceDirectory>src/test/scala</testSourceDirectory>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.8.1</version>
+                <configuration>
+                    <release>11</release>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>3.2.1</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <minimizeJar>false</minimizeJar>
+                            <artifactSet>
+                                <excludes>
+                                    <exclude>com.google.code.findbugs:jsr305</exclude>
+                                </excludes>
+                            </artifactSet>
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
+                                        <exclude>core-site.xml</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>org.sunbird.obsrv.streaming.CacheIndexerStreamTask</mainClass>
+                                </transformer>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+                                    <resource>reference.conf</resource>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+                <version>4.4.0</version>
+                <configuration>
+                    <source>${java.target.runtime}</source>
+                    <scalaVersion>${scala.version}</scalaVersion>
+                    <checkMultipleScalaVersions>false</checkMultipleScalaVersions>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>scala-compile-first</id>
+                        <phase>process-resources</phase>
+                        <goals>
+                            <goal>add-source</goal>
+                            <goal>compile</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>scala-test-compile</id>
+                        <phase>process-test-resources</phase>
+                        <goals>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>2.22.2</version>
+                <configuration>
+                    <skipTests>true</skipTests>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.scalatest</groupId>
+                <artifactId>scalatest-maven-plugin</artifactId>
+                <version>1.0</version>
+                <configuration>
+                    <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
+                    <junitxml>.</junitxml>
+                    <filereports>cache-indexer-testsuite.txt</filereports>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>test</id>
+                        <goals>
+                            <goal>test</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.scoverage</groupId>
+                <artifactId>scoverage-maven-plugin</artifactId>
+                <version>${scoverage.plugin.version}</version>
+                <configuration>
+                    <scalaVersion>${scala.version}</scalaVersion>
+                    <aggregate>true</aggregate>
+                    <highlighting>true</highlighting>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/pipeline/cache-indexer/src/main/resources/cache-indexer.conf b/pipeline/cache-indexer/src/main/resources/cache-indexer.conf
new file mode 100644
index 00000000..58a9c9d1
--- /dev/null
+++ b/pipeline/cache-indexer/src/main/resources/cache-indexer.conf
@@ -0,0 +1,15 @@
+include "baseconfig.conf"
+
+kafka {
+ output.failed.topic = ${job.env}".masterdata.failed"
+ groupId = ${job.env}"-cache-indexer-group"
+ producer {
+ max-request-size = 5242880
+ }
+}
+
+task {
+ window.time.in.seconds = 5
+ window.count = 30
+ window.shards = 1400
+}
\ No newline at end of file
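
The task block above is plain HOCON that the job reads through Typesafe Config; a self-contained sketch of how those keys resolve, with job.env supplied inline since baseconfig.conf is not part of the snippet (the surrounding project wiring is assumed):

    import com.typesafe.config.ConfigFactory

    object CacheIndexerConfSketch {

      def main(args: Array[String]): Unit = {
        // Mirrors the structure of cache-indexer.conf (job.env is hard-coded for the sketch)
        val hocon =
          """
            |job.env = "local"
            |kafka.groupId = ${job.env}"-cache-indexer-group"
            |task {
            |  window.time.in.seconds = 5
            |  window.count = 30
            |  window.shards = 1400
            |}
          """.stripMargin

        val config = ConfigFactory.parseString(hocon).resolve()
        println(config.getInt("task.window.count"))  // 30
        println(config.getString("kafka.groupId"))   // local-cache-indexer-group
      }
    }
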
diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/function/MasterDataProcessorFunction.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/function/MasterDataProcessorFunction.scala
new file mode 100644
index 00000000..bbab5307
--- /dev/null
+++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/function/MasterDataProcessorFunction.scala
@@ -0,0 +1,56 @@
+package org.sunbird.obsrv.function
+
+import org.apache.flink.configuration.Configuration
+import org.apache.flink.streaming.api.functions.ProcessFunction
+import org.json4s.native.JsonMethods._
+import org.slf4j.LoggerFactory
+import org.sunbird.obsrv.core.model.{ErrorConstants, FunctionalError, Producer}
+import org.sunbird.obsrv.core.streaming.Metrics
+import org.sunbird.obsrv.core.util.JSONUtil
+import org.sunbird.obsrv.model.DatasetModels.Dataset
+import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig
+import org.sunbird.obsrv.registry.DatasetRegistry
+import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction
+import org.sunbird.obsrv.util.MasterDataCache
+
+import scala.collection.mutable
+
+class MasterDataProcessorFunction(config: CacheIndexerConfig) extends BaseDatasetProcessFunction(config) {
+
+ private[this] val logger = LoggerFactory.getLogger(classOf[MasterDataProcessorFunction])
+ private[this] var masterDataCache: MasterDataCache = _
+
+ override def open(parameters: Configuration): Unit = {
+ super.open(parameters)
+ masterDataCache = new MasterDataCache(config)
+ masterDataCache.open(DatasetRegistry.getAllDatasets(Some("master")))
+ }
+
+ override def close(): Unit = {
+ super.close()
+ masterDataCache.close()
+ }
+
+ override def getMetrics(): List[String] = {
+ List(config.successEventCount, config.systemEventCount, config.totalEventCount, config.successInsertCount, config.successUpdateCount)
+ }
+
+ override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = {
+
+ metrics.incCounter(dataset.id, config.totalEventCount)
+ masterDataCache.open(dataset)
+ val event = JSONUtil.serialize(msg(config.CONST_EVENT))
+ val json = parse(event, useBigIntForLong = false)
+ val node = JSONUtil.getKey(dataset.datasetConfig.keysConfig.dataKey.get, event)
+ if (node.isMissingNode) {
+ markFailure(Some(dataset.id), msg, context, metrics, ErrorConstants.MISSING_DATASET_CONFIG_KEY, Producer.masterdataprocessor, FunctionalError.MissingMasterDatasetKey, datasetType = Some(dataset.datasetType))
+ } else {
+ val result = masterDataCache.process(dataset, node.asText(), json)
+ metrics.incCounter(dataset.id, config.successInsertCount, result._1)
+ metrics.incCounter(dataset.id, config.successUpdateCount, result._2)
+ metrics.incCounter(dataset.id, config.successEventCount)
+ }
+
+ }
+
+}
\ No newline at end of file
diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerConfig.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerConfig.scala
new file mode 100644
index 00000000..c6a49f57
--- /dev/null
+++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerConfig.scala
@@ -0,0 +1,33 @@
+package org.sunbird.obsrv.pipeline.task
+
+import com.typesafe.config.Config
+import org.apache.flink.api.common.typeinfo.TypeInformation
+import org.apache.flink.api.java.typeutils.TypeExtractor
+import org.apache.flink.streaming.api.scala.OutputTag
+import org.sunbird.obsrv.core.streaming.BaseJobConfig
+
+import scala.collection.mutable
+
+class CacheIndexerConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "CacheIndexerJob") {
+
+ private val serialVersionUID = 2905979434303791379L
+ implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]])
+
+ // Metric List
+ val totalEventCount = "total-event-count"
+ val successEventCount = "success-event-count"
+ val successInsertCount = "success-insert-count"
+ val successUpdateCount = "success-update-count"
+
+ val windowTime: Int = config.getInt("task.window.time.in.seconds")
+ val windowCount: Int = config.getInt("task.window.count")
+
+ // Functions
+ val cacheIndexerFunction = "CacheIndexerFunction"
+
+ override def inputTopic(): String = null
+ override def inputConsumer(): String = "cache-indexer"
+ override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("processing_stats")
+
+ override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events")
+}
diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala
new file mode 100644
index 00000000..735440b7
--- /dev/null
+++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala
@@ -0,0 +1,61 @@
+package org.sunbird.obsrv.streaming
+
+import com.typesafe.config.ConfigFactory
+import org.apache.flink.api.common.typeinfo.TypeInformation
+import org.apache.flink.api.java.typeutils.TypeExtractor
+import org.apache.flink.api.java.utils.ParameterTool
+import org.apache.flink.streaming.api.datastream.DataStream
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
+import org.slf4j.LoggerFactory
+import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector}
+import org.sunbird.obsrv.core.util.FlinkUtil
+import org.sunbird.obsrv.function.MasterDataProcessorFunction
+import org.sunbird.obsrv.model.DatasetType
+import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig
+import org.sunbird.obsrv.registry.DatasetRegistry
+
+import java.io.File
+import scala.collection.mutable
+
+class CacheIndexerStreamTask(config: CacheIndexerConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] {
+
+ implicit val mutableMapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]])
+ private val logger = LoggerFactory.getLogger(classOf[CacheIndexerStreamTask])
+
+ def process(): Unit = {
+ implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config)
+ process(env)
+ env.execute(config.jobName)
+ }
+
+ def process(env: StreamExecutionEnvironment): Unit = {
+
+ val datasets = DatasetRegistry.getAllDatasets(Some(DatasetType.master.toString))
+ val datasetIds = datasets.map(f => f.id)
+ val dataStream = getTopicMapDataStream(env, config, datasetIds, consumerSourceName = s"cache-indexer-consumer", kafkaConnector)
+ processStream(dataStream)
+ }
+
+ override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = {
+ val processedStream = dataStream.process(new MasterDataProcessorFunction(config)).name(config.cacheIndexerFunction)
+ .uid(config.cacheIndexerFunction).setParallelism(config.downstreamOperatorsParallelism)
+ addDefaultSinks(processedStream, config, kafkaConnector)
+ processedStream.getSideOutput(config.successTag())
+ }
+
+}
+
+object CacheIndexerStreamTask {
+
+ def main(args: Array[String]): Unit = {
+ val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path"))
+ val config = configFilePath.map {
+ path => ConfigFactory.parseFile(new File(path)).resolve()
+ }.getOrElse(ConfigFactory.load("cache-indexer.conf").withFallback(ConfigFactory.systemEnvironment()))
+ val cacheConfig = new CacheIndexerConfig(config)
+ val kafkaUtil = new FlinkKafkaConnector(cacheConfig)
+ val task = new CacheIndexerStreamTask(cacheConfig, kafkaUtil)
+ task.process()
+ }
+
+}
\ No newline at end of file
diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala
new file mode 100644
index 00000000..c5f95f32
--- /dev/null
+++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala
@@ -0,0 +1,63 @@
+package org.sunbird.obsrv.util
+
+import org.json4s.native.JsonMethods._
+import org.json4s.{JField, JNothing, JValue}
+import org.slf4j.LoggerFactory
+import org.sunbird.obsrv.core.cache.RedisConnect
+import org.sunbird.obsrv.core.model.Constants.OBSRV_META
+import org.sunbird.obsrv.model.DatasetModels.Dataset
+import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig
+import redis.clients.jedis.Jedis
+
+import scala.collection.mutable
+
+class MasterDataCache(val config: CacheIndexerConfig) {
+
+ private[this] val logger = LoggerFactory.getLogger(classOf[MasterDataCache])
+ private val datasetPipelineMap: mutable.Map[String, Jedis] = mutable.Map[String, Jedis]()
+
+ def close(): Unit = {
+ datasetPipelineMap.values.foreach(pipeline => pipeline.close())
+ }
+
+ def open(datasets: List[Dataset]): Unit = {
+ datasets.foreach(dataset => {
+ open(dataset)
+ })
+ }
+
+ def open(dataset: Dataset): Unit = {
+ if (!datasetPipelineMap.contains(dataset.id)) {
+ val redisConfig = dataset.datasetConfig.cacheConfig.get
+ val redisConnect = new RedisConnect(redisConfig.redisDBHost.get, redisConfig.redisDBPort.get, config.redisConnectionTimeout)
+ val jedis: Jedis = redisConnect.getConnection(0)
+ datasetPipelineMap.put(dataset.id, jedis)
+ }
+ }
+
+ def process(dataset: Dataset, key: String, event: JValue): (Int, Int) = {
+ val jedis = this.datasetPipelineMap(dataset.id)
+ val dataFromCache = getDataFromCache(dataset, key, jedis)
+ val updatedEvent = event.removeField {
+ case JField(OBSRV_META, _) => true
+ case _ => false
+ }
+ updateCache(dataset, dataFromCache, key, updatedEvent, jedis)
+ (if (dataFromCache == null) 1 else 0, if (dataFromCache == null) 0 else 1)
+ }
+
+ private def getDataFromCache(dataset: Dataset, key: String, jedis: Jedis): String = {
+
+ jedis.select(dataset.datasetConfig.cacheConfig.get.redisDB.get)
+ jedis.get(key)
+ }
+
+ private def updateCache(dataset: Dataset, dataFromCache: String, key: String, event: JValue, jedis: Jedis): Unit = {
+
+ jedis.select(dataset.datasetConfig.cacheConfig.get.redisDB.get)
+ val existingJson = if (dataFromCache != null) parse(dataFromCache) else JNothing
+ val mergedJson = existingJson merge event
+ jedis.set(key, compact(render(mergedJson)))
+ }
+
+}
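
MasterDataCache.process counts an insert when the key is absent in Redis and an update otherwise, and updateCache folds the incoming record into whatever is already cached via json4s' merge; a small sketch of that merge behaviour (json4s-native assumed, sample records hypothetical):

    import org.json4s._
    import org.json4s.native.JsonMethods._

    object CacheMergeSketch {

      def main(args: Array[String]): Unit = {
        // What is already cached for the key (would be JNothing on first insert)
        val cached = parse("""{"code":"HYUN-CRE-D6","price":"2200000"}""")
        // The incoming master-data event for the same key
        val update = parse("""{"code":"HYUN-CRE-D6","safety":"3 Star (Global NCAP)"}""")

        // New fields are added; fields present in both are taken from the incoming event
        val merged = cached merge update
        println(compact(render(merged)))

        // Merging into JNothing just yields the incoming event, i.e. a plain insert
        println(compact(render(JNothing merge update)))
      }
    }
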
diff --git a/pipeline/pipeline-merged/src/test/resources/base-config.conf b/pipeline/cache-indexer/src/test/resources/base-config.conf
similarity index 100%
rename from pipeline/pipeline-merged/src/test/resources/base-config.conf
rename to pipeline/cache-indexer/src/test/resources/base-config.conf
diff --git a/pipeline/cache-indexer/src/test/resources/test.conf b/pipeline/cache-indexer/src/test/resources/test.conf
new file mode 100644
index 00000000..7861c8d0
--- /dev/null
+++ b/pipeline/cache-indexer/src/test/resources/test.conf
@@ -0,0 +1,20 @@
+include "base-test.conf"
+
+kafka {
+
+ output.failed.topic = ${job.env}".masterdata.failed"
+ groupId = ${job.env}"-cache-indexer-group"
+ producer {
+ max-request-size = 5242880
+ }
+}
+
+task {
+ window.time.in.seconds = 2
+ window.count = 2
+ window.shards = 1400
+}
+
+redis {
+ port = 6340
+}
\ No newline at end of file
diff --git a/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala
new file mode 100644
index 00000000..078cde33
--- /dev/null
+++ b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala
@@ -0,0 +1,10 @@
+package org.sunbird.obsrv.fixture
+
+object EventFixture {
+
+ val VALID_BATCH_EVENT_D3_INSERT = """{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}"""
+ val VALID_BATCH_EVENT_D3_INSERT_2 = """{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"}}"""
+ val VALID_BATCH_EVENT_D3_UPDATE = """{"code":"HYUN-CRE-D6","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"safety":"3 Star (Global NCAP)","seatingCapacity":5}"""
+ val VALID_BATCH_EVENT_D4 = """{"code":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}"""
+ val INVALID_BATCH_EVENT_D4 = """{"code1":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}"""
+}
diff --git a/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/pipeline/CacheIndexerStreamTaskTestSpec.scala b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/pipeline/CacheIndexerStreamTaskTestSpec.scala
new file mode 100644
index 00000000..b95d754d
--- /dev/null
+++ b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/pipeline/CacheIndexerStreamTaskTestSpec.scala
@@ -0,0 +1,142 @@
+package org.sunbird.obsrv.pipeline
+
+import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig}
+import org.apache.flink.configuration.Configuration
+import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
+import org.apache.flink.test.util.MiniClusterWithClientResource
+import org.apache.kafka.common.serialization.StringDeserializer
+import org.scalatest.Matchers._
+import org.sunbird.obsrv.BaseMetricsReporter
+import org.sunbird.obsrv.core.cache.RedisConnect
+import org.sunbird.obsrv.core.model.ErrorConstants
+import org.sunbird.obsrv.core.model.Models.SystemEvent
+import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector
+import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect}
+import org.sunbird.obsrv.fixture.EventFixture
+import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig
+import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry
+import org.sunbird.obsrv.streaming.CacheIndexerStreamTask
+
+import scala.collection.mutable
+import scala.concurrent.ExecutionContext.Implicits.global
+import scala.concurrent.Future
+import scala.concurrent.duration._
+
+class CacheIndexerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
+
+ val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder()
+ .setConfiguration(testConfiguration())
+ .setNumberSlotsPerTaskManager(1)
+ .setNumberTaskManagers(1)
+ .build)
+
+ val cacheIndexerConfig = new CacheIndexerConfig(config)
+ val kafkaConnector = new FlinkKafkaConnector(cacheIndexerConfig)
+ val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group")
+ implicit val embeddedKafkaConfig: EmbeddedKafkaConfig =
+ EmbeddedKafkaConfig(
+ kafkaPort = 9093,
+ zooKeeperPort = 2183,
+ customConsumerProperties = customKafkaConsumerProperties
+ )
+ implicit val deserializer: StringDeserializer = new StringDeserializer()
+
+ def testConfiguration(): Configuration = {
+ val config = new Configuration()
+ config.setString("metrics.reporter", "job_metrics_reporter")
+ config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName)
+ config
+ }
+
+ override def beforeAll(): Unit = {
+ super.beforeAll()
+ BaseMetricsReporter.gaugeMetrics.clear()
+ EmbeddedKafka.start()(embeddedKafkaConfig)
+ val postgresConnect = new PostgresConnect(postgresConfig)
+ insertTestData(postgresConnect)
+ createTestTopics()
+ EmbeddedKafka.publishStringMessageToKafka("dataset3", EventFixture.VALID_BATCH_EVENT_D3_INSERT)
+ EmbeddedKafka.publishStringMessageToKafka("dataset3", EventFixture.VALID_BATCH_EVENT_D3_INSERT_2)
+ EmbeddedKafka.publishStringMessageToKafka("dataset4", EventFixture.VALID_BATCH_EVENT_D4)
+ EmbeddedKafka.publishStringMessageToKafka("dataset3", EventFixture.VALID_BATCH_EVENT_D3_UPDATE)
+ EmbeddedKafka.publishStringMessageToKafka("dataset4", EventFixture.INVALID_BATCH_EVENT_D4)
+ flinkCluster.before()
+ }
+
+ private def insertTestData(postgresConnect: PostgresConnect): Unit = {
+ postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, extraction_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('dataset3', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"is_batch_event\":true,\"extraction_key\":\"events\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"}, \"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":3,\"redis_db_host\":\"localhost\",\"redis_db_port\":" + cacheIndexerConfig.redisPort + "}', 'Live', 'v1', 'local.masterdata.ingest', 'System', 'System', now(), now());")
+ postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('dataset4', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"},\"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"seatingCapacity\": {\"type\": \"integer\"}, \"safety\": {\"type\": \"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d34-events\"}', '{\"indexing_config\":{\"olap_store_enabled\":false,\"lakehouse_enabled\":false,\"cache_enabled\":true},\"keys_config\":{\"data_key\":\"code\",\"timestamp_key\":\"date\"},\"cache_config\":{\"redis_db\":4,\"redis_db_host\":\"localhost\",\"redis_db_port\":" + cacheIndexerConfig.redisPort + "}}', 'Live', 'v2', 'local.masterdata.ingest', 'System', 'System', now(), now());")
+ }
+
+ override def afterAll(): Unit = {
+
+ super.afterAll()
+ flinkCluster.after()
+ EmbeddedKafka.stop()
+ }
+
+ def createTestTopics(): Unit = {
+ List(config.getString("kafka.output.system.event.topic"), "dataset3", "dataset4").foreach(EmbeddedKafka.createCustomTopic(_))
+ }
+
+ "CacheIndexerStreamTaskTestSpec" should "validate the cache indexer job for master datasets" in {
+
+ implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(cacheIndexerConfig)
+ val task = new CacheIndexerStreamTask(cacheIndexerConfig, kafkaConnector)
+ task.process(env)
+ Future {
+ env.execute(cacheIndexerConfig.jobName)
+ }
+
+ val input = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 1, timeout = 30.seconds)
+ input.size should be(1)
+
+ input.foreach(se => {
+ val event = JSONUtil.deserialize[SystemEvent](se)
+ val error = event.data.error
+ if (event.ctx.dataset.getOrElse("ALL").equals("ALL"))
+ event.ctx.dataset_type should be(None)
+ else if (error.isDefined) {
+ val errorCode = error.get.error_code
+ if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) ||
+ errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) ||
+ errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) {
+ event.ctx.dataset_type should be(None)
+ }
+ }
+ else
+ event.ctx.dataset_type should be(Some("master"))
+ })
+
+ val mutableMetricsMap = mutable.Map[String, Long]();
+ BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2))
+
+ cacheIndexerConfig.successTag().getId should be("processing_stats")
+
+ mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.totalEventCount}") should be(3)
+ mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.successEventCount}") should be(3)
+ mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.successInsertCount}") should be(2)
+ mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.successUpdateCount}") should be(1)
+
+ mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.totalEventCount}") should be(2)
+ mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.successEventCount}") should be(1)
+ mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.successInsertCount}") should be(1)
+ mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.eventFailedMetricsCount}") should be(1)
+
+ val redisConnection = new RedisConnect(cacheIndexerConfig.redisHost, cacheIndexerConfig.redisPort, cacheIndexerConfig.redisConnectionTimeout)
+ val jedis1 = redisConnection.getConnection(3)
+ val event1 = jedis1.get("HYUN-CRE-D6")
+ event1 should be("""{"dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic","safety":"3 Star (Global NCAP)","seatingCapacity":5}""")
+ val event3 = jedis1.get("HYUN-TUC-D6")
+ event3 should be("""{"dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"},"model":"Tucson","price":"4000000","variant":"Signature","fuel":"Diesel","code":"HYUN-TUC-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"}""")
+ jedis1.close()
+
+ val jedis2 = redisConnection.getConnection(4)
+ val event2 = jedis2.get("JEEP-CP-D3")
+ event2 should be("""{"model":"Compass","price":"3800000","variant":"Model S (O) Diesel 4x4 AT","fuel":"Diesel","seatingCapacity":5,"code":"JEEP-CP-D3","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Jeep","safety":"5 Star (Euro NCAP)","modelYear":"2023","transmission":"automatic"}""")
+ jedis2.close()
+ }
+
+
+}
diff --git a/pipeline/druid-router/pom.xml b/pipeline/dataset-router/pom.xml
similarity index 97%
rename from pipeline/druid-router/pom.xml
rename to pipeline/dataset-router/pom.xml
index 41e2e390..5c6b5d23 100644
--- a/pipeline/druid-router/pom.xml
+++ b/pipeline/dataset-router/pom.xml
@@ -12,12 +12,12 @@
     <groupId>org.sunbird.obsrv.pipeline</groupId>
-    <artifactId>druid-router</artifactId>
+    <artifactId>dataset-router</artifactId>
     <version>1.0.0</version>
     <packaging>jar</packaging>
-    <name>Druid Events Router</name>
+    <name>Dataset Events Router</name>
-    <description>Validate and Route Datasets for Druid Indexing</description>
+    <description>Validate and Route Datasets for Indexing into OLAP Store or a Lakehouse</description>
@@ -198,7 +198,7 @@
-                                    <resource>reference.conf</resource>
+                                    <resource>dataset-router.conf</resource>
diff --git a/pipeline/druid-router/src/main/resources/druid-router.conf b/pipeline/dataset-router/src/main/resources/dataset-router.conf
similarity index 100%
rename from pipeline/druid-router/src/main/resources/druid-router.conf
rename to pipeline/dataset-router/src/main/resources/dataset-router.conf
diff --git a/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala
new file mode 100644
index 00000000..9f2c7907
--- /dev/null
+++ b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala
@@ -0,0 +1,117 @@
+package org.sunbird.obsrv.router.functions
+
+import com.fasterxml.jackson.databind.JsonNode
+import com.fasterxml.jackson.databind.node.JsonNodeType
+import org.apache.flink.configuration.Configuration
+import org.apache.flink.streaming.api.functions.ProcessFunction
+import org.joda.time.format.DateTimeFormat
+import org.joda.time.{DateTime, DateTimeZone}
+import org.slf4j.LoggerFactory
+import org.sunbird.obsrv.core.model.{Constants, ErrorConstants, FunctionalError, Producer}
+import org.sunbird.obsrv.core.streaming.Metrics
+import org.sunbird.obsrv.core.util.{JSONUtil, Util}
+import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig}
+import org.sunbird.obsrv.model.DatasetType
+import org.sunbird.obsrv.router.task.DynamicRouterConfig
+import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction
+
+import java.util.TimeZone
+import scala.collection.mutable
+
+case class TimestampKey(isValid: Boolean, value: AnyRef)
+
+class DynamicRouterFunction(config: DynamicRouterConfig) extends BaseDatasetProcessFunction(config) {
+
+ private[this] val logger = LoggerFactory.getLogger(classOf[DynamicRouterFunction])
+
+ override def open(parameters: Configuration): Unit = {
+ super.open(parameters)
+ }
+
+ override def close(): Unit = {
+ super.close()
+ }
+
+ override def getMetrics(): List[String] = {
+ List(config.routerTotalCount, config.routerSuccessCount)
+ }
+
+ override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef],
+ ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context,
+ metrics: Metrics): Unit = {
+
+ metrics.incCounter(dataset.id, config.routerTotalCount)
+ val event = Util.getMutableMap(msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]])
+ event.put(config.CONST_OBSRV_META, msg(config.CONST_OBSRV_META).asInstanceOf[Map[String, AnyRef]])
+ val tsKeyData = TimestampKeyParser.parseTimestampKey(dataset.datasetConfig, event)
+ event.put("indexTS", tsKeyData.value)
+ if (tsKeyData.isValid || dataset.datasetType.equalsIgnoreCase(DatasetType.master.toString)) {
+ val routerConfig = dataset.routerConfig
+ val topicEventMap = mutable.Map(Constants.TOPIC -> routerConfig.topic, Constants.MESSAGE -> event)
+ ctx.output(config.routerOutputTag, topicEventMap)
+ metrics.incCounter(dataset.id, config.routerSuccessCount)
+ markCompletion(dataset, super.markComplete(event, dataset.dataVersion), ctx, Producer.router)
+ } else {
+ markFailure(Some(dataset.id), msg, ctx, metrics, ErrorConstants.INDEX_KEY_MISSING_OR_BLANK, Producer.router, FunctionalError.MissingTimestampKey, datasetType = Some(dataset.datasetType))
+ }
+ }
+
+}
+
+object TimestampKeyParser {
+
+ def parseTimestampKey(datasetConfig: DatasetConfig, event: mutable.Map[String, AnyRef]): TimestampKey = {
+ val indexKey = datasetConfig.keysConfig.tsKey.get
+ val node = JSONUtil.getKey(indexKey, JSONUtil.serialize(event))
+ node.getNodeType match {
+ case JsonNodeType.NUMBER => onNumber(datasetConfig, node)
+ case JsonNodeType.STRING => onText(datasetConfig, node)
+ case _ => TimestampKey(isValid = false, null)
+ }
+ }
+
+ private def onNumber(datasetConfig: DatasetConfig, node: JsonNode): TimestampKey = {
+ val length = node.asText().length
+ val value = node.numberValue().longValue()
+    // TODO: [P3] Crude implementation. Checking if the epoch timestamp format is one of seconds, milli-seconds, micro-seconds or nano-seconds. Find an elegant approach
+ if (length == 10 || length == 13 || length == 16 || length == 19) {
+ val tfValue:Long = if (length == 10) (value * 1000).longValue() else if (length == 16) (value / 1000).longValue() else if (length == 19) (value / 1000000).longValue() else value
+ TimestampKey(isValid = true, addTimeZone(datasetConfig, new DateTime(tfValue)).asInstanceOf[AnyRef])
+ } else {
+ TimestampKey(isValid = false, 0.asInstanceOf[AnyRef])
+ }
+ }
+
+ private def onText(datasetConfig: DatasetConfig, node: JsonNode): TimestampKey = {
+ val value = node.textValue()
+ if (datasetConfig.keysConfig.tsFormat.isDefined) {
+ parseDateTime(datasetConfig, value)
+ } else {
+ TimestampKey(isValid = true, value)
+ }
+ }
+
+ private def parseDateTime(datasetConfig: DatasetConfig, value: String): TimestampKey = {
+ try {
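+      // "epoch" treats the text as epoch millis; any other value is used as a date-time pattern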
+ datasetConfig.keysConfig.tsFormat.get match {
+ case "epoch" => TimestampKey(isValid = true, addTimeZone(datasetConfig, new DateTime(value.toLong)).asInstanceOf[AnyRef])
+ case _ =>
+ val dtf = DateTimeFormat.forPattern(datasetConfig.keysConfig.tsFormat.get)
+ TimestampKey(isValid = true, addTimeZone(datasetConfig, dtf.parseDateTime(value)).asInstanceOf[AnyRef])
+ }
+ } catch {
+ case _: Exception => TimestampKey(isValid = false, null)
+ }
+ }
+
+ private def addTimeZone(datasetConfig: DatasetConfig, dateTime: DateTime): Long = {
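+    // Shift the epoch by the dataset timezone offset so the indexed time reflects the dataset's timezone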
+ if (datasetConfig.datasetTimezone.isDefined) {
+ val tz = DateTimeZone.forTimeZone(TimeZone.getTimeZone(datasetConfig.datasetTimezone.get))
+ val offsetInMilliseconds = tz.getOffset(dateTime)
+ dateTime.plusMillis(offsetInMilliseconds).getMillis
+ } else {
+ dateTime.getMillis
+ }
+ }
+
+}
diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterConfig.scala
similarity index 92%
rename from pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala
rename to pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterConfig.scala
index 31106b00..a9309016 100644
--- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala
+++ b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterConfig.scala
@@ -8,7 +8,7 @@ import org.sunbird.obsrv.core.streaming.BaseJobConfig
import scala.collection.mutable
-class DruidRouterConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "DruidRouterJob") {
+class DynamicRouterConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "DruidRouterJob") {
private val serialVersionUID = 2905979434303791379L
implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]])
diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala
similarity index 89%
rename from pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala
rename to pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala
index 9e17a974..5ac1067f 100644
--- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala
+++ b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala
@@ -17,7 +17,7 @@ import scala.collection.mutable
* Druid Router stream task routes every event into its respective topic configured at dataset level
*/
-class DynamicRouterStreamTask(config: DruidRouterConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] {
+class DynamicRouterStreamTask(config: DynamicRouterConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] {
private val serialVersionUID = 146697324640926024L
@@ -56,8 +56,8 @@ object DynamicRouterStreamTask {
val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path"))
val config = configFilePath.map {
path => ConfigFactory.parseFile(new File(path)).resolve()
- }.getOrElse(ConfigFactory.load("druid-router.conf").withFallback(ConfigFactory.systemEnvironment()))
- val druidRouterConfig = new DruidRouterConfig(config)
+ }.getOrElse(ConfigFactory.load("dataset-router.conf").withFallback(ConfigFactory.systemEnvironment()))
+ val druidRouterConfig = new DynamicRouterConfig(config)
val kafkaUtil = new FlinkKafkaConnector(druidRouterConfig)
val task = new DynamicRouterStreamTask(druidRouterConfig, kafkaUtil)
task.process()
diff --git a/pipeline/druid-router/src/test/resources/test.conf b/pipeline/dataset-router/src/test/resources/test.conf
similarity index 100%
rename from pipeline/druid-router/src/test/resources/test.conf
rename to pipeline/dataset-router/src/test/resources/test.conf
diff --git a/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala
new file mode 100644
index 00000000..98370128
--- /dev/null
+++ b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala
@@ -0,0 +1,171 @@
+package org.sunbird.obsrv.router
+
+import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig}
+import org.apache.flink.configuration.Configuration
+import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
+import org.apache.flink.test.util.MiniClusterWithClientResource
+import org.apache.kafka.common.serialization.StringDeserializer
+import org.scalatest.Matchers._
+import org.sunbird.obsrv.BaseMetricsReporter
+import org.sunbird.obsrv.core.model.Models.SystemEvent
+import org.sunbird.obsrv.core.model._
+import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector
+import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect}
+import org.sunbird.obsrv.router.task.{DynamicRouterConfig, DynamicRouterStreamTask}
+import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry
+
+import scala.collection.mutable
+import scala.concurrent.ExecutionContext.Implicits.global
+import scala.concurrent.Future
+import scala.concurrent.duration._
+
+class DynamicRouterStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
+
+ val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder()
+ .setConfiguration(testConfiguration())
+ .setNumberSlotsPerTaskManager(1)
+ .setNumberTaskManagers(1)
+ .build)
+
+ val routerConfig = new DynamicRouterConfig(config)
+ val kafkaConnector = new FlinkKafkaConnector(routerConfig)
+ val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group")
+ implicit val embeddedKafkaConfig: EmbeddedKafkaConfig =
+ EmbeddedKafkaConfig(
+ kafkaPort = 9093,
+ zooKeeperPort = 2183,
+ customConsumerProperties = customKafkaConsumerProperties
+ )
+ implicit val deserializer: StringDeserializer = new StringDeserializer()
+
+ def testConfiguration(): Configuration = {
+ val config = new Configuration()
+ config.setString("metrics.reporter", "job_metrics_reporter")
+ config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName)
+ config
+ }
+
+ override def beforeAll(): Unit = {
+ super.beforeAll()
+ BaseMetricsReporter.gaugeMetrics.clear()
+ EmbeddedKafka.start()(embeddedKafkaConfig)
+ val postgresConnect = new PostgresConnect(postgresConfig)
+ insertTestData(postgresConnect)
+ postgresConnect.closeConnection()
+ createTestTopics()
+ publishMessagesToKafka()
+ flinkCluster.before()
+ }
+
+ private def publishMessagesToKafka(): Unit = {
+ EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.SUCCESS_EVENT)
+ EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.FAILED_EVENT)
+ }
+
+ private def insertTestData(postgresConnect: PostgresConnect): Unit = {
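+    // Point d2's timestamp key to a field that does not exist in the event so its routing fails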
+ postgresConnect.execute("update datasets set dataset_config = '" + """{"data_key":"id","timestamp_key":"date1","entry_topic":"ingest"}""" + "' where id='d2';")
+
+ }
+
+ override def afterAll(): Unit = {
+
+ super.afterAll()
+ flinkCluster.after()
+ EmbeddedKafka.stop()
+ }
+
+ def createTestTopics(): Unit = {
+ List(
+ routerConfig.kafkaSystemTopic, routerConfig.kafkaInputTopic, "d1-events", routerConfig.kafkaFailedTopic
+ ).foreach(EmbeddedKafka.createCustomTopic(_))
+ }
+
+ "DynamicRouterStreamTaskTestSpec" should "validate the router stream task" in {
+
+ implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(routerConfig)
+ val task = new DynamicRouterStreamTask(routerConfig, kafkaConnector)
+ task.process(env)
+ Future {
+ env.execute(routerConfig.jobName)
+ }
+
+ val outputs = EmbeddedKafka.consumeNumberMessagesFrom[String]("d1-events", 1, timeout = 30.seconds)
+ validateOutputs(outputs)
+
+ val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](routerConfig.kafkaFailedTopic, 1, timeout = 30.seconds)
+ validateFailedEvents(failedEvents)
+
+ val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](routerConfig.kafkaSystemTopic, 2, timeout = 30.seconds)
+ validateSystemEvents(systemEvents)
+
+ val mutableMetricsMap = mutable.Map[String, Long]()
+ BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2))
+ Console.println("### DynamicRouterStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap)))
+ validateMetrics(mutableMetricsMap)
+ }
+
+ private def validateOutputs(outputs: List[String]): Unit = {
+ outputs.size should be(1)
+ Console.println("Output", outputs.head)
+ }
+
+ private def validateFailedEvents(failedEvents: List[String]): Unit = {
+ failedEvents.size should be(1)
+ Console.println("Output", failedEvents.head)
+ }
+
+ private def validateSystemEvents(systemEvents: List[String]): Unit = {
+ systemEvents.size should be(2)
+
+ systemEvents.foreach(se => {
+ val event = JSONUtil.deserialize[SystemEvent](se)
+ val error = event.data.error
+ if (event.ctx.dataset.getOrElse("ALL").equals("ALL"))
+ event.ctx.dataset_type should be(None)
+ else if (error.isDefined) {
+ val errorCode = error.get.error_code
+ if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) ||
+ errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) ||
+ errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) {
+ event.ctx.dataset_type should be(None)
+ }
+ }
+ else
+ event.ctx.dataset_type should be(Some("event"))
+ })
+
+ systemEvents.foreach(f => {
+ val event = JSONUtil.deserialize[SystemEvent](f)
+ event.etype should be(EventID.METRIC)
+ event.ctx.module should be(ModuleID.processing)
+ event.ctx.pdata.id should be(routerConfig.jobName)
+ event.ctx.pdata.`type` should be(PDataType.flink)
+ event.ctx.pdata.pid.get should be(Producer.router)
+ if(event.data.error.isDefined) {
+ val errorLog = event.data.error.get
+ errorLog.error_level should be(ErrorLevel.critical)
+ errorLog.pdata_id should be(Producer.router)
+ errorLog.pdata_status should be(StatusCode.failed)
+ errorLog.error_count.get should be(1)
+ errorLog.error_code should be(ErrorConstants.INDEX_KEY_MISSING_OR_BLANK.errorCode)
+ errorLog.error_message should be(ErrorConstants.INDEX_KEY_MISSING_OR_BLANK.errorMsg)
+ errorLog.error_type should be(FunctionalError.MissingTimestampKey)
+ } else {
+ event.data.pipeline_stats.isDefined should be (true)
+ event.data.pipeline_stats.get.latency_time.isDefined should be (true)
+ event.data.pipeline_stats.get.processing_time.isDefined should be (true)
+ event.data.pipeline_stats.get.total_processing_time.isDefined should be (true)
+ }
+
+ })
+ }
+
+ private def validateMetrics(mutableMetricsMap: mutable.Map[String, Long]): Unit = {
+ mutableMetricsMap(s"${routerConfig.jobName}.d1.${routerConfig.routerTotalCount}") should be(1)
+ mutableMetricsMap(s"${routerConfig.jobName}.d1.${routerConfig.routerSuccessCount}") should be(1)
+ mutableMetricsMap(s"${routerConfig.jobName}.d2.${routerConfig.routerTotalCount}") should be(1)
+ mutableMetricsMap(s"${routerConfig.jobName}.d2.${routerConfig.eventFailedMetricsCount}") should be(1)
+ }
+
+}
diff --git a/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala
new file mode 100644
index 00000000..7856b0cc
--- /dev/null
+++ b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala
@@ -0,0 +1,7 @@
+package org.sunbird.obsrv.router
+
+object EventFixture {
+
+ val SUCCESS_EVENT = """{"dataset":"d1","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}"""
+ val FAILED_EVENT = """{"dataset":"d2","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}"""
+}
diff --git a/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala
new file mode 100644
index 00000000..7bf5dfa6
--- /dev/null
+++ b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala
@@ -0,0 +1,125 @@
+package org.sunbird.obsrv.router
+
+import org.scalatest.{FlatSpec, Matchers}
+import org.sunbird.obsrv.core.util.JSONUtil
+import org.sunbird.obsrv.model.DatasetModels.{DatasetConfig, IndexingConfig, KeysConfig}
+import org.sunbird.obsrv.router.functions.TimestampKeyParser
+
+import scala.collection.mutable
+
+class TestTimestampKeyParser extends FlatSpec with Matchers {
+
+ "TimestampKeyParser" should "validate all scenarios of timestamp key in number format" in {
+
+    // Validate text date field without providing date format and timezone
+ val result1 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}"""))
+ result1.isValid should be(true)
+ result1.value.asInstanceOf[String] should be("2023-03-01")
+
+ // Validate missing timestamp key scenario
+ val result2 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date1"), None)),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}"""))
+ result2.isValid should be(false)
+ result2.value should be(null)
+
+ // Validate number date field which is not epoch
+ val result3 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":20232201}"""))
+ result3.isValid should be(false)
+ result3.value.asInstanceOf[Int] should be(0)
+
+ // Validate number date field which is epoch in seconds
+ val result4 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165}"""))
+ result4.isValid should be(true)
+ result4.value.asInstanceOf[Long] should be(1701373165000L)
+
+ // Validate number date field which is epoch in milli-seconds
+ val result5 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123}"""))
+ result5.isValid should be(true)
+ result5.value.asInstanceOf[Long] should be(1701373165123L)
+
+ // Validate number date field which is epoch in micro-seconds
+ val result6 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123111}"""))
+ result6.isValid should be(true)
+ result6.value.asInstanceOf[Long] should be(1701373165123L)
+
+ // Validate number date field which is epoch in nano-seconds
+ val result7 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123111000}"""))
+ result7.isValid should be(true)
+ result7.value.asInstanceOf[Long] should be(1701373165123L)
+
+ // Validate number date field which is not an epoch in milli, micro or nano seconds
+ val result8 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":170137316512}"""))
+ result8.isValid should be(false)
+ result8.value.asInstanceOf[Int] should be(0)
+
+ // Validate number date field which is an epoch with timezone present
+ val result9 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None), datasetTimezone = Some("GMT+05:30")),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123}"""))
+ result9.isValid should be(true)
+ result9.value.asInstanceOf[Long] should be(1701392965123L)
+ }
+
+ it should "validate all scenarios of timestamp key in text format" in {
+
+ // Validate epoch data in text format
+ val result1 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("epoch")), datasetTimezone = Some("GMT+05:30")),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"1701373165123"}"""))
+ result1.isValid should be(true)
+ result1.value.asInstanceOf[Long] should be(1701392965123L)
+
+    // Validate invalid epoch data in text format (the value is still interpreted as epoch millis from 1970-01-01 even when it is not actually in millis)
+ val result2 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("epoch")), datasetTimezone = Some("GMT+05:30")),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"170137316512"}"""))
+ result2.isValid should be(true)
+ result2.value.asInstanceOf[Long] should be(170157116512L)
+
+ // Validate date parser without timezone
+ val result3 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd")), datasetTimezone = None),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}"""))
+ result3.isValid should be(true)
+ result3.value.asInstanceOf[Long] should be(1677609000000L)
+
+ // Validate date parser with timezone
+ val result4 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd")), datasetTimezone = Some("GMT+05:30")),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}"""))
+ result4.isValid should be(true)
+ result4.value.asInstanceOf[Long] should be(1677628800000L)
+
+ // Validate date parser with date time in nano seconds
+ val result5 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS")), datasetTimezone = Some("GMT+05:30")),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01T12:45:32.123456789"}"""))
+ result5.isValid should be(true)
+ result5.value.asInstanceOf[Long] should be(1677674732123L)
+
+ // Validate date parser with data in invalid format
+ val result6 = TimestampKeyParser.parseTimestampKey(
+ DatasetConfig(
+ IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd'T'HH:mm:ss.SSS")), datasetTimezone = Some("GMT+05:30")),
+ JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01T12:45:32.123456"}"""))
+ result6.isValid should be(false)
+ result6.value should be(null)
+ }
+
+}
\ No newline at end of file
diff --git a/pipeline/denormalizer/pom.xml b/pipeline/denormalizer/pom.xml
index 2df98cd3..484a81a5 100644
--- a/pipeline/denormalizer/pom.xml
+++ b/pipeline/denormalizer/pom.xml
@@ -54,6 +54,23 @@
            <version>${kafka.version}</version>
            <scope>test</scope>
        </dependency>
+        <dependency>
+            <groupId>org.sunbird.obsrv</groupId>
+            <artifactId>transformation-sdk</artifactId>
+            <version>1.0.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.kafka</groupId>
+            <artifactId>kafka-clients</artifactId>
+            <version>${kafka.version}</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.kafka</groupId>
+            <artifactId>kafka_${scala.maj.version}</artifactId>
+            <version>${kafka.version}</version>
+            <scope>test</scope>
+        </dependency>
        <dependency>
            <groupId>org.sunbird.obsrv</groupId>
            <artifactId>framework</artifactId>
diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala
index 45a41c67..699ba75e 100644
--- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala
+++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala
@@ -30,7 +30,7 @@ class DenormalizerFunction(config: DenormalizerConfig) extends BaseDatasetProces
override def open(parameters: Configuration): Unit = {
super.open(parameters)
denormCache = new DenormCache(config)
- denormCache.open(DatasetRegistry.getAllDatasets(config.datasetType()))
+ denormCache.open(DatasetRegistry.getAllDatasets(None))
}
override def close(): Unit = {
diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala
index ce603520..8d188838 100644
--- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala
+++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala
@@ -33,7 +33,7 @@ class DenormalizerWindowFunction(config: DenormalizerConfig)(implicit val eventT
override def open(parameters: Configuration): Unit = {
super.open(parameters)
denormCache = new DenormCache(config)
- denormCache.open(DatasetRegistry.getAllDatasets(config.datasetType()))
+ denormCache.open(DatasetRegistry.getAllDatasets(None))
}
override def close(): Unit = {
diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala
index 118c0307..1fe24d68 100644
--- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala
+++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala
@@ -16,7 +16,6 @@ class DenormalizerConfig(override val config: Config) extends BaseJobConfig[muta
implicit val anyTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String])
// Kafka Topics Configuration
- val kafkaInputTopic: String = config.getString("kafka.input.topic")
val denormOutputTopic: String = config.getString("kafka.output.denorm.topic")
// Windows
@@ -41,7 +40,7 @@ class DenormalizerConfig(override val config: Config) extends BaseJobConfig[muta
// Functions
val denormalizationFunction = "DenormalizationFunction"
- override def inputTopic(): String = kafkaInputTopic
+ override def inputTopic(): String = config.getString("kafka.input.topic")
override def inputConsumer(): String = denormalizationConsumer
override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = denormEventsTag
override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events")
diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala
index db0da7d5..5550748a 100644
--- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala
+++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala
@@ -1,11 +1,14 @@
package org.sunbird.obsrv.denormalizer.util
+import com.fasterxml.jackson.databind.JsonNode
+import com.fasterxml.jackson.databind.node.MissingNode
import org.sunbird.obsrv.core.cache.RedisConnect
import org.sunbird.obsrv.core.model.ErrorConstants
import org.sunbird.obsrv.core.model.ErrorConstants.Error
import org.sunbird.obsrv.core.util.{JSONUtil, Util}
import org.sunbird.obsrv.denormalizer.task.DenormalizerConfig
-import org.sunbird.obsrv.model.DatasetModels.{Dataset, DenormFieldConfig}
+import org.sunbird.obsrv.model.DatasetModels.{Dataset, DenormFieldConfig, TransformationFunction}
+import org.sunbird.obsrv.transformer.types.JSONAtaTransformer
import redis.clients.jedis.{Pipeline, Response}
import scala.collection.mutable
@@ -75,7 +78,7 @@ class DenormCache(val config: DenormalizerConfig) {
}
private def extractField(fieldConfig: DenormFieldConfig, eventStr: String): DenormFieldStatus = {
- val denormFieldNode = JSONUtil.getKey(fieldConfig.denormKey, eventStr)
+ val denormFieldNode = getDenormFieldValue(fieldConfig, eventStr)
if (denormFieldNode.isMissingNode) {
DenormFieldStatus("", success = false, Some(ErrorConstants.DENORM_KEY_MISSING))
} else {
@@ -87,6 +90,16 @@ class DenormCache(val config: DenormalizerConfig) {
}
}
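+  // Resolve the denorm field value from either a direct denorm_key lookup or a JSONata expression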
+ private def getDenormFieldValue(fieldConfig: DenormFieldConfig, eventStr: String): JsonNode = {
+ if(fieldConfig.denormKey.isDefined) {
+ JSONUtil.getKey(fieldConfig.denormKey.get, eventStr)
+ } else if(fieldConfig.jsonAtaExpr.isDefined) {
+ JSONAtaTransformer.evaluate(JSONUtil.getJsonNode(eventStr), TransformationFunction("jsonata", None, fieldConfig.jsonAtaExpr.get))
+ } else {
+ MissingNode.getInstance()
+ }
+ }
+
private def getFromCache(pipeline: Pipeline, denormField: String, fieldConfig: DenormFieldConfig): Response[String] = {
pipeline.select(fieldConfig.redisDB)
pipeline.get(denormField)
diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala
index bd9658eb..89256390 100644
--- a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala
+++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala
@@ -71,7 +71,7 @@ class DenormalizerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
}
private def insertTestData(postgresConnect: PostgresConnect): Unit = {
- postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"denorm_key":"dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';")
+ postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"jsonata_expr":"$$.dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';")
val redisConnection = new RedisConnect(denormConfig.redisHost, denormConfig.redisPort, denormConfig.redisConnectionTimeout)
redisConnection.getConnection(3).set("HYUN-CRE-D6", EventFixture.DENORM_DATA_1)
redisConnection.getConnection(4).set("D123", EventFixture.DENORM_DATA_2)
@@ -118,8 +118,8 @@ class DenormalizerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
val denormCache = new DenormCache(denormConfig)
noException should be thrownBy {
denormCache.open(Dataset(id = "d123", datasetType = "dataset", extractionConfig = None, dedupConfig = None, validationConfig = None, jsonSchema = None,
- denormConfig = Some(DenormConfig(redisDBHost = "localhost", redisDBPort = redisPort, denormFields = List(DenormFieldConfig(denormKey = "vehicleCode", redisDB = 3, denormOutField = "vehicle_data")))), routerConfig = RouterConfig(""),
- datasetConfig = DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest"), status = DatasetStatus.Live))
+ denormConfig = Some(DenormConfig(redisDBHost = "localhost", redisDBPort = redisPort, denormFields = List(DenormFieldConfig(denormKey = Some("vehicleCode"), redisDB = 3, denormOutField = "vehicle_data", jsonAtaExpr = None)))), routerConfig = RouterConfig(""),
+ datasetConfig = DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), status = DatasetStatus.Live, "ingest"))
}
}
diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala
index 52d06e8b..e5bbaa24 100644
--- a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala
+++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala
@@ -70,7 +70,7 @@ class DenormalizerWindowStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
}
private def insertTestData(postgresConnect: PostgresConnect): Unit = {
- postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'v1', 'ingest', 'System', 'System', now(), now());")
postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"denorm_key":"dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';")
val redisConnection = new RedisConnect(denormConfig.redisHost, denormConfig.redisPort, denormConfig.redisConnectionTimeout)
redisConnection.getConnection(3).set("HYUN-CRE-D6", EventFixture.DENORM_DATA_1)
diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala
deleted file mode 100644
index b77e110a..00000000
--- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-package org.sunbird.obsrv.router.task
-
-import com.typesafe.config.ConfigFactory
-import org.apache.flink.api.common.typeinfo.TypeInformation
-import org.apache.flink.api.java.typeutils.TypeExtractor
-import org.apache.flink.api.java.utils.ParameterTool
-import org.apache.flink.streaming.api.datastream.DataStream
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
-import org.apache.flink.streaming.api.scala.OutputTag
-import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector}
-import org.sunbird.obsrv.core.util.FlinkUtil
-import org.sunbird.obsrv.registry.DatasetRegistry
-import org.sunbird.obsrv.router.functions.DruidRouterFunction
-
-import java.io.File
-import scala.collection.mutable
-
-/**
- * Druid Router stream task routes every event into its respective topic configured at dataset level
- */
-// $COVERAGE-OFF$ Disabling scoverage as this stream task is deprecated
-@Deprecated
-class DruidRouterStreamTask(config: DruidRouterConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] {
-
- private val serialVersionUID = 146697324640926024L
-
- def process(): Unit = {
- implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config)
- val dataStream = getMapDataStream(env, config, kafkaConnector)
- processStream(dataStream)
- env.execute(config.jobName)
- }
-
- override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = {
-
- implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]])
- val datasets = DatasetRegistry.getAllDatasets(config.datasetType())
-
- val routerStream = dataStream.process(new DruidRouterFunction(config)).name(config.druidRouterFunction).uid(config.druidRouterFunction)
- .setParallelism(config.downstreamOperatorsParallelism)
- datasets.map(dataset => {
- routerStream.getSideOutput(OutputTag[mutable.Map[String, AnyRef]](dataset.routerConfig.topic))
- .sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](dataset.routerConfig.topic))
- .name(dataset.id + "-" + config.druidRouterProducer).uid(dataset.id + "-" + config.druidRouterProducer)
- .setParallelism(config.downstreamOperatorsParallelism)
- })
-
- routerStream.getSideOutput(config.statsOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaStatsTopic))
- .name(config.processingStatsProducer).uid(config.processingStatsProducer).setParallelism(config.downstreamOperatorsParallelism)
-
- addDefaultSinks(routerStream, config, kafkaConnector)
- routerStream.getSideOutput(config.successTag())
-
- }
-}
-// $COVERAGE-ON$
-// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster
-@Deprecated
-object DruidRouterStreamTask {
-
- def main(args: Array[String]): Unit = {
- val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path"))
- val config = configFilePath.map {
- path => ConfigFactory.parseFile(new File(path)).resolve()
- }.getOrElse(ConfigFactory.load("druid-router.conf").withFallback(ConfigFactory.systemEnvironment()))
- val druidRouterConfig = new DruidRouterConfig(config)
- val kafkaUtil = new FlinkKafkaConnector(druidRouterConfig)
- val task = new DruidRouterStreamTask(druidRouterConfig, kafkaUtil)
- task.process()
- }
-}
-// $COVERAGE-ON$
\ No newline at end of file
diff --git a/pipeline/extractor/pom.xml b/pipeline/extractor/pom.xml
index 4cc11c58..2ec8b803 100644
--- a/pipeline/extractor/pom.xml
+++ b/pipeline/extractor/pom.xml
@@ -110,6 +110,18 @@
            <version>3.4.0</version>
            <scope>test</scope>
        </dependency>
+        <dependency>
+            <groupId>io.zonky.test</groupId>
+            <artifactId>embedded-postgres</artifactId>
+            <version>2.0.3</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>io.github.embeddedkafka</groupId>
+            <artifactId>embedded-kafka_2.12</artifactId>
+            <version>3.4.0</version>
+            <scope>test</scope>
+        </dependency>
        <dependency>
            <groupId>io.zonky.test</groupId>
            <artifactId>embedded-postgres</artifactId>
diff --git a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala
index 0e79b08c..46c1e68c 100644
--- a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala
+++ b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala
@@ -27,7 +27,7 @@ class ExtractionFunction(config: ExtractorConfig)
override def getMetricsList(): MetricsList = {
val metrics = List(config.successEventCount, config.systemEventCount, config.eventFailedMetricsCount, config.failedExtractionCount,
config.skippedExtractionCount, config.duplicateExtractionCount, config.totalEventCount, config.successExtractionCount)
- MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics)
+ MetricsList(DatasetRegistry.getDataSetIds(), metrics)
}
override def open(parameters: Configuration): Unit = {
diff --git a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala
index 6ada824b..3574249a 100644
--- a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala
+++ b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala
@@ -140,7 +140,7 @@ class ExtractorStreamTestSpec extends BaseSpecWithDatasetRegistry {
if(event.ctx.dataset.getOrElse("ALL").equals("ALL"))
event.ctx.dataset_type should be(None)
else
- event.ctx.dataset_type should be(Some("dataset"))
+ event.ctx.dataset_type should be(Some("event"))
})
//TODO: Add assertions for all 6 events
diff --git a/pipeline/hudi-connector/pom.xml b/pipeline/hudi-connector/pom.xml
index 5230d8eb..b47b58d3 100644
--- a/pipeline/hudi-connector/pom.xml
+++ b/pipeline/hudi-connector/pom.xml
@@ -124,6 +124,12 @@
+        <dependency>
+            <groupId>org.scalatest</groupId>
+            <artifactId>scalatest_2.12</artifactId>
+            <version>3.0.6</version>
+            <scope>test</scope>
+        </dependency>
diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/function/RowDataConverterFunction.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/function/RowDataConverterFunction.scala
new file mode 100644
index 00000000..4aadb60a
--- /dev/null
+++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/function/RowDataConverterFunction.scala
@@ -0,0 +1,43 @@
+package org.sunbird.obsrv.function
+
+import org.apache.flink.api.common.functions.RichMapFunction
+import org.apache.flink.configuration.Configuration
+import org.apache.flink.formats.common.TimestampFormat
+import org.apache.flink.formats.json.JsonToRowDataConverters
+import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper
+import org.sunbird.obsrv.util.{HudiSchemaParser, HudiSchemaSpec}
+import org.apache.flink.table.data.RowData
+import org.slf4j.LoggerFactory
+import org.sunbird.obsrv.core.util.{JSONUtil, Util}
+import org.sunbird.obsrv.streaming.HudiConnectorConfig
+import scala.collection.mutable.{Map => MMap}
+
+class RowDataConverterFunction(config: HudiConnectorConfig, datasetId: String) extends RichMapFunction[MMap[String, AnyRef], RowData] {
+
+ var jsonToRowDataConverters: JsonToRowDataConverters = _
+ var objectMapper: ObjectMapper = _
+ var hudiSchemaParser: HudiSchemaParser = _
+
+ private val logger = LoggerFactory.getLogger(classOf[RowDataConverterFunction])
+
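+  // Converters and the Hudi schema parser are initialized once per task in open()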
+ override def open(parameters: Configuration): Unit = {
+ super.open(parameters)
+ jsonToRowDataConverters = new JsonToRowDataConverters(false, true, TimestampFormat.SQL)
+ objectMapper = new ObjectMapper()
+ hudiSchemaParser = new HudiSchemaParser()
+ }
+
+ override def map(event: MMap[String, AnyRef]): RowData = {
+ convertToRowData(event)
+ }
+
+ def convertToRowData(data: MMap[String, AnyRef]): RowData = {
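+    // Flatten the event as per the dataset's Hudi schema and convert the flattened JSON into Flink RowData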
+ val eventJson = JSONUtil.serialize(data)
+ val flattenedData = hudiSchemaParser.parseJson(datasetId, eventJson)
+ val rowType = hudiSchemaParser.rowTypeMap(datasetId)
+ val converter: JsonToRowDataConverters.JsonToRowDataConverter = jsonToRowDataConverters.createRowConverter(rowType)
+ val rowData = converter.convert(objectMapper.readTree(JSONUtil.serialize(flattenedData))).asInstanceOf[RowData]
+ rowData
+ }
+
+}
diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala
index fd160820..3bde66bd 100644
--- a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala
+++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala
@@ -16,7 +16,7 @@ import org.slf4j.LoggerFactory
import org.sunbird.obsrv.core.model.Constants
import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector}
import org.sunbird.obsrv.core.util.FlinkUtil
-import org.sunbird.obsrv.functions.RowDataConverterFunction
+import org.sunbird.obsrv.function.RowDataConverterFunction
import org.sunbird.obsrv.registry.DatasetRegistry
import org.sunbird.obsrv.util.HudiSchemaParser
import org.apache.hudi.config.HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP
diff --git a/pipeline/master-data-processor/pom.xml b/pipeline/master-data-processor/pom.xml
index 0dc1cc60..f97287af 100644
--- a/pipeline/master-data-processor/pom.xml
+++ b/pipeline/master-data-processor/pom.xml
@@ -64,7 +64,7 @@
            <groupId>org.sunbird.obsrv.pipeline</groupId>
-            <artifactId>druid-router</artifactId>
+            <artifactId>dataset-router</artifactId>
            <version>1.0.0</version>
diff --git a/pipeline/master-data-processor/src/main/resources/master-data-processor.conf b/pipeline/master-data-processor/src/main/resources/master-data-processor.conf
index 149e795b..0d3f2d89 100644
--- a/pipeline/master-data-processor/src/main/resources/master-data-processor.conf
+++ b/pipeline/master-data-processor/src/main/resources/master-data-processor.conf
@@ -5,16 +5,16 @@ kafka {
output.raw.topic = ${job.env}".masterdata.raw"
output.extractor.duplicate.topic = ${job.env}".masterdata.failed"
output.failed.topic = ${job.env}".masterdata.failed"
- output.batch.failed.topic = ${job.env}".masterdata.extractor.failed"
+ output.batch.failed.topic = ${job.env}".masterdata.failed"
event.max.size = "1048576" # Max is only 1MB
output.invalid.topic = ${job.env}".masterdata.failed"
output.unique.topic = ${job.env}".masterdata.unique"
output.duplicate.topic = ${job.env}".masterdata.failed"
output.denorm.topic = ${job.env}".masterdata.denorm"
output.transform.topic = ${job.env}".masterdata.transform"
+ output.transform.failed.topic = ${job.env}".masterdata.transform.failed"
stats.topic = ${job.env}".masterdata.stats"
groupId = ${job.env}"-masterdata-pipeline-group"
-
producer {
max-request-size = 5242880
}
@@ -36,4 +36,4 @@ redis {
}
}
-dataset.type = "master-dataset"
\ No newline at end of file
+dataset.type = "master-dataset"
diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala
index a7ca7471..dcf96a0f 100644
--- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala
+++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala
@@ -9,6 +9,7 @@ import org.sunbird.obsrv.core.model.{ErrorConstants, FunctionalError, Producer}
import org.sunbird.obsrv.core.streaming.Metrics
import org.sunbird.obsrv.core.util.JSONUtil
import org.sunbird.obsrv.model.DatasetModels.Dataset
+import org.sunbird.obsrv.model.DatasetType
import org.sunbird.obsrv.pipeline.task.MasterDataProcessorConfig
import org.sunbird.obsrv.pipeline.util.MasterDataCache
import org.sunbird.obsrv.registry.DatasetRegistry
@@ -24,7 +25,7 @@ class MasterDataProcessorFunction(config: MasterDataProcessorConfig) extends Bas
override def open(parameters: Configuration): Unit = {
super.open(parameters)
masterDataCache = new MasterDataCache(config)
- masterDataCache.open(DatasetRegistry.getAllDatasets(config.datasetType()))
+ masterDataCache.open(DatasetRegistry.getAllDatasets(Some(DatasetType.master.toString)))
}
override def close(): Unit = {
@@ -37,13 +38,13 @@ class MasterDataProcessorFunction(config: MasterDataProcessorConfig) extends Bas
}
override def processWindow(dataset: Dataset, context: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context, elements: List[mutable.Map[String, AnyRef]], metrics: Metrics): Unit = {
-
+ Console.println("dataset.id", dataset.id, dataset.datasetConfig.cacheConfig)
metrics.incCounter(dataset.id, config.totalEventCount, elements.size.toLong)
masterDataCache.open(dataset)
val eventsMap = elements.map(msg => {
val event = JSONUtil.serialize(msg(config.CONST_EVENT))
val json = parse(event, useBigIntForLong = false)
- val node = JSONUtil.getKey(dataset.datasetConfig.key, event)
+ val node = JSONUtil.getKey(dataset.datasetConfig.keysConfig.dataKey.get, event)
if (node.isMissingNode) {
markFailure(Some(dataset.id), msg, context, metrics, ErrorConstants.MISSING_DATASET_CONFIG_KEY, Producer.masterdataprocessor, FunctionalError.MissingMasterDatasetKey, datasetType = Some(dataset.datasetType))
}
diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala
index 65847bbd..b5cfebef 100644
--- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala
+++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala
@@ -12,7 +12,7 @@ import org.sunbird.obsrv.extractor.task.{ExtractorConfig, ExtractorStreamTask}
import org.sunbird.obsrv.pipeline.function.MasterDataProcessorFunction
import org.sunbird.obsrv.preprocessor.task.{PipelinePreprocessorConfig, PipelinePreprocessorStreamTask}
import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask}
-import org.sunbird.obsrv.router.task.{DruidRouterConfig, DynamicRouterStreamTask}
+import org.sunbird.obsrv.router.task.{DynamicRouterConfig, DynamicRouterStreamTask}
import java.io.File
import scala.collection.mutable
@@ -51,7 +51,7 @@ class MasterDataProcessorStreamTask(config: Config, masterDataConfig: MasterData
val preprocessorTask = new PipelinePreprocessorStreamTask(new PipelinePreprocessorConfig(config), kafkaConnector)
val denormalizerTask = new DenormalizerStreamTask(new DenormalizerConfig(config), kafkaConnector)
val transformerTask = new TransformerStreamTask(new TransformerConfig(config), kafkaConnector)
- val routerTask = new DynamicRouterStreamTask(new DruidRouterConfig(config), kafkaConnector)
+ val routerTask = new DynamicRouterStreamTask(new DynamicRouterConfig(config), kafkaConnector)
val transformedStream = transformerTask.processStream(
denormalizerTask.processStream(
diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala
index e07f4399..930595d3 100644
--- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala
+++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala
@@ -20,15 +20,15 @@ class MasterDataCache(val config: MasterDataProcessorConfig) {
}
def open(datasets: List[Dataset]): Unit = {
- datasets.map(dataset => {
+ datasets.foreach(dataset => {
open(dataset)
})
}
def open(dataset: Dataset): Unit = {
if (!datasetPipelineMap.contains(dataset.id)) {
- val datasetConfig = dataset.datasetConfig
- val redisConnect = new RedisConnect(datasetConfig.redisDBHost.get, datasetConfig.redisDBPort.get, config.redisConnectionTimeout)
+ val redisConfig = dataset.datasetConfig.cacheConfig.get
+ val redisConnect = new RedisConnect(redisConfig.redisDBHost.get, redisConfig.redisDBPort.get, config.redisConnectionTimeout)
val pipeline: Pipeline = redisConnect.getConnection(0).pipelined()
datasetPipelineMap.put(dataset.id, pipeline)
}
@@ -37,7 +37,7 @@ class MasterDataCache(val config: MasterDataProcessorConfig) {
def process(dataset: Dataset, eventMap: Map[String, JValue]): (Int, Int) = {
val pipeline = this.datasetPipelineMap(dataset.id)
val dataFromCache = getDataFromCache(dataset, eventMap.keySet, pipeline)
- val insertCount = dataFromCache.filter(f => f._2 == null).size
+ val insertCount = dataFromCache.count(f => f._2 == null)
val updCount = dataFromCache.size - insertCount
updateCache(dataset, dataFromCache, eventMap, pipeline)
(insertCount, updCount)
@@ -45,7 +45,7 @@ class MasterDataCache(val config: MasterDataProcessorConfig) {
private def getDataFromCache(dataset: Dataset, keys: Set[String], pipeline: Pipeline): mutable.Map[String, String] = {
pipeline.clear()
- pipeline.select(dataset.datasetConfig.redisDB.get)
+ pipeline.select(dataset.datasetConfig.cacheConfig.get.redisDB.get)
val responses: mutable.Map[String, Response[String]] = mutable.Map[String, Response[String]]()
keys.foreach(key => {
responses.put(key, pipeline.get(key))
@@ -56,7 +56,7 @@ class MasterDataCache(val config: MasterDataProcessorConfig) {
private def updateCache(dataset: Dataset, dataFromCache: mutable.Map[String, String], eventMap: Map[String, JValue], pipeline: Pipeline): Unit = {
pipeline.clear()
- pipeline.select(dataset.datasetConfig.redisDB.get)
+ pipeline.select(dataset.datasetConfig.cacheConfig.get.redisDB.get)
eventMap.foreach(f => {
val key = f._1
val newJson = f._2
diff --git a/pipeline/master-data-processor/src/test/resources/test.conf b/pipeline/master-data-processor/src/test/resources/test.conf
index 2c8f0236..3533006c 100644
--- a/pipeline/master-data-processor/src/test/resources/test.conf
+++ b/pipeline/master-data-processor/src/test/resources/test.conf
@@ -5,6 +5,7 @@ job {
}
kafka {
+
input.topic = ${job.env}".masterdata.ingest"
output.raw.topic = ${job.env}".masterdata.raw"
output.extractor.duplicate.topic = ${job.env}".masterdata.failed"
@@ -16,6 +17,7 @@ kafka {
output.duplicate.topic = ${job.env}".masterdata.failed"
output.denorm.topic = ${job.env}".masterdata.denorm"
output.transform.topic = ${job.env}".masterdata.transform"
+ output.transform.failed.topic = ${job.env}".masterdata.transform.failed"
stats.topic = ${job.env}".masterdata.stats"
groupId = ${job.env}"-masterdata-pipeline-group"
producer {
@@ -24,7 +26,7 @@ kafka {
}
task {
- window.time.in.seconds = 5
+ window.time.in.seconds = 2
window.count = 2
window.shards = 1400
consumer.parallelism = 1
@@ -40,4 +42,4 @@ redis {
}
}
-dataset.type = "master-dataset"
\ No newline at end of file
+dataset.type = "master"
diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala
index e48f8120..cb5ece83 100644
--- a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala
+++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala
@@ -2,9 +2,8 @@ package org.sunbird.obsrv.fixture
object EventFixture {
- val VALID_BATCH_EVENT_D3_INSERT = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel"}]}"""
- val VALID_BATCH_EVENT_D3_INSERT_2 = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel"}]}"""
- val VALID_BATCH_EVENT_D3_UPDATE = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","safety":"3 Star (Global NCAP)","seatingCapacity":5}]}"""
+ val VALID_BATCH_EVENT_D3_INSERT = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}]}"""
+ val VALID_BATCH_EVENT_D3_INSERT_2 = """{"dataset":"d3","id":"event2","events":[{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"}}]}"""
+ val VALID_BATCH_EVENT_D3_UPDATE = """{"dataset":"d3","id":"event3","events":[{"code":"HYUN-CRE-D6","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"safety":"3 Star (Global NCAP)","seatingCapacity":5}]}"""
val VALID_BATCH_EVENT_D4 = """{"dataset":"d4","event":{"code":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}"""
- val MISSING_DATA_KEY_EVENT_D4 = """{"dataset":"d5","event":{"code1":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}"""
}
diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala
index 575e2228..07c69965 100644
--- a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala
+++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala
@@ -64,9 +64,10 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry
}
private def insertTestData(postgresConnect: PostgresConnect): Unit = {
- postgresConnect.execute("insert into datasets(id, type, extraction_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d3', 'master-dataset', '{\"is_batch_event\": true, \"extraction_key\": \"events\"}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata.ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":3}', 'Live', 'System', 'System', now(), now());")
- postgresConnect.execute("insert into datasets(id, type, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d4', 'master-dataset', '{\"topic\":\"d4-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata-ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":4}', 'Live', 'System', 'System', now(), now());")
- postgresConnect.execute("insert into datasets(id, type, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d5', 'master-dataset', '{\"topic\":\"d4-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata-ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":4}', 'Live', 'System', 'System', now(), now());")
+ postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, extraction_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('d3', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"is_batch_event\":true,\"extraction_key\":\"events\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"}, \"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":3,\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+"}', 'Live', 'v1', 'local.masterdata.ingest', 'System', 'System', now(), now());")
+ postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('d4', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"},\"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"seatingCapacity\": {\"type\": \"integer\"}, \"safety\": {\"type\": \"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d34-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":4,\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+"}', 'Live', 'v1', 'local.masterdata.ingest', 'System', 'System', now(), now());")
+ postgresConnect.execute("INSERT INTO dataset_transformations (id, dataset_id, field_key, transformation_function, created_by, updated_by, created_date, updated_date) VALUES ('tf3', 'd3', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'System', 'System', now(), now());")
+ postgresConnect.execute("INSERT INTO dataset_transformations (id, dataset_id, field_key, transformation_function, created_by, updated_by, created_date, updated_date) VALUES ('tf4', 'd3', 'dealer.locationId', '{\"type\":\"encrypt\",\"expr\":\"dealer.locationId\"}', 'System', 'System', now(), now());")
}
override def afterAll(): Unit = {
@@ -90,18 +91,19 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry
}
"MasterDataProcessorStreamTaskTestSpec" should "validate the entire master data pipeline" in {
-
+
implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(masterDataConfig)
val task = new MasterDataProcessorStreamTask(config, masterDataConfig, kafkaConnector)
task.process(env)
Future {
env.execute(masterDataConfig.jobName)
+ Thread.sleep(5000)
}
- val sysEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 8, timeout = 30.seconds)
- sysEvents.size should be(8)
+ val input = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 7, timeout = 30.seconds)
+ input.size should be (7)
- sysEvents.foreach(se => {
+ input.foreach(se => {
val event = JSONUtil.deserialize[SystemEvent](se)
val error = event.data.error
if (event.ctx.dataset.getOrElse("ALL").equals("ALL"))
@@ -115,12 +117,9 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry
}
}
else
- event.ctx.dataset_type should be(Some("master-dataset"))
+ event.ctx.dataset_type should be(Some("master"))
})
- val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](masterDataConfig.kafkaFailedTopic, 1, timeout = 30.seconds)
- failedEvents.size should be(1)
-
val mutableMetricsMap = mutable.Map[String, Long]();
BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2))
Console.println("### MasterDataProcessorStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap)))
@@ -143,16 +142,15 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry
val redisConnection = new RedisConnect(masterDataConfig.redisHost, masterDataConfig.redisPort, masterDataConfig.redisConnectionTimeout)
val jedis1 = redisConnection.getConnection(3)
val event1 = jedis1.get("HYUN-CRE-D6")
- event1 should be ("""{"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic","seatingCapacity":5,"safety":"3 Star (Global NCAP)"}""")
+ event1 should be ("""{"dealer":{"email":"jo*****e@example.com","locationId":"ym4iT6lWXt+Y2gEdBldeiw=="},"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic","seatingCapacity":5,"safety":"3 Star (Global NCAP)"}""")
val event3 = jedis1.get("HYUN-TUC-D6")
- event3 should be ("""{"model":"Tucson","price":"4000000","variant":"Signature","fuel":"Diesel","code":"HYUN-TUC-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"}""")
+ event3 should be ("""{"dealer":{"email":"ad*******n@gmail.com","locationId":"kJ7mH49gjWHeoM1w+ex9kQ=="},"model":"Tucson","price":"4000000","variant":"Signature","fuel":"Diesel","code":"HYUN-TUC-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"}""")
jedis1.close()
val jedis2 = redisConnection.getConnection(4)
val event2 = jedis2.get("JEEP-CP-D3")
event2 should be ("""{"model":"Compass","price":"3800000","variant":"Model S (O) Diesel 4x4 AT","fuel":"Diesel","seatingCapacity":5,"code":"JEEP-CP-D3","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Jeep","safety":"5 Star (Euro NCAP)","modelYear":"2023","transmission":"automatic"}""")
jedis2.close()
-
}
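
Note: the two dataset_transformations rows inserted above are what produce the masked dealer.email and encrypted dealer.locationId values asserted against Redis further down. As a minimal sketch, they correspond to the following model objects; the constructor shapes mirror the fixtures in the new TestTransformerFunctionHelper spec later in this diff, and TransformMode defaults to Strict when omitted:

    import org.sunbird.obsrv.model.DatasetModels.{DatasetTransformation, TransformationFunction}

    // TransformationFunction(type, condition, expr): 'tf3' masks dealer.email in place,
    // 'tf4' encrypts dealer.locationId in place; no condition is attached to either.
    val d3Transformations = List(
      DatasetTransformation("tf3", "d3", "dealer.email", TransformationFunction("mask", None, "dealer.email")),
      DatasetTransformation("tf4", "d3", "dealer.locationId", TransformationFunction("encrypt", None, "dealer.locationId"))
    )
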
diff --git a/pipeline/pom.xml b/pipeline/pom.xml
index 220ebff4..d2128647 100644
--- a/pipeline/pom.xml
+++ b/pipeline/pom.xml
@@ -20,10 +20,11 @@
         <module>preprocessor</module>
         <module>denormalizer</module>
         <module>transformer</module>
-        <module>druid-router</module>
-        <module>pipeline-merged</module>
+        <module>dataset-router</module>
+        <module>unified-pipeline</module>
         <module>master-data-processor</module>
         <module>hudi-connector</module>
+        <module>cache-indexer</module>
diff --git a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala
index 93cfefef..f4f34789 100644
--- a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala
+++ b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala
@@ -32,7 +32,7 @@ class EventValidationFunction(config: PipelinePreprocessorConfig)(implicit val e
override def open(parameters: Configuration): Unit = {
super.open(parameters)
schemaValidator = new SchemaValidator()
- schemaValidator.loadDataSchemas(DatasetRegistry.getAllDatasets(config.datasetType()))
+ schemaValidator.loadDataSchemas(DatasetRegistry.getAllDatasets(None))
}
override def close(): Unit = {
diff --git a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala
index d111543b..226d87ec 100644
--- a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala
+++ b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala
@@ -75,12 +75,12 @@ class PipelinePreprocessorStreamTestSpec extends BaseSpecWithDatasetRegistry {
private def prepareTestData(): Unit = {
val postgresConnect = new PostgresConnect(postgresConfig)
- postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Draft', 'System', 'System', now(), now());")
- postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d4', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());")
- postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d5', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"IgnoreNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());")
- postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d6', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string","maxLength":5},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"DiscardNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());")
- postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d7', 'dataset', '"+EventFixtures.INVALID_SCHEMA+"', '{\"validate\": true, \"mode\": \"Strict\"}','{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());")
- postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d8', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": false, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Draft', 'v1', 'ingest', 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d4', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d5', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"IgnoreNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d6', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string","maxLength":5},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"DiscardNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d7', 'dataset', '"+EventFixtures.INVALID_SCHEMA+"', '{\"validate\": true, \"mode\": \"Strict\"}','{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d8', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": false, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'v1', 'ingest', 'System', 'System', now(), now());")
postgresConnect.closeConnection()
}
diff --git a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala
index 0ba13d65..c05c185d 100644
--- a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala
+++ b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala
@@ -3,7 +3,7 @@ package org.sunbird.obsrv.preprocessor
import com.typesafe.config.{Config, ConfigFactory}
import org.scalatest.{FlatSpec, Matchers}
import org.sunbird.obsrv.core.util.JSONUtil
-import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig, RouterConfig}
+import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig, IndexingConfig, KeysConfig, RouterConfig}
import org.sunbird.obsrv.model.DatasetStatus
import org.sunbird.obsrv.preprocessor.fixture.EventFixtures
import org.sunbird.obsrv.preprocessor.task.PipelinePreprocessorConfig
@@ -17,7 +17,7 @@ class TestSchemaValidator extends FlatSpec with Matchers {
"SchemaValidator" should "return a success report for a valid event" in {
- val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live)
+ val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest")
schemaValidator.loadDataSchema(dataset)
val event = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.VALID_SCHEMA_EVENT)
@@ -27,7 +27,7 @@ class TestSchemaValidator extends FlatSpec with Matchers {
it should "return a failed validation report for a invalid event" in {
- val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live)
+ val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest")
schemaValidator.loadDataSchema(dataset)
val event1 = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.INVALID_SCHEMA_EVENT)
@@ -37,7 +37,7 @@ class TestSchemaValidator extends FlatSpec with Matchers {
assert(messages1.size == 1)
messages1.head.message should be("object has missing required properties ([\"vehicleCode\"])")
messages1.head.keyword should be("required")
- messages1.head.missing.get.head should be ("vehicleCode")
+ messages1.head.missing.get.head should be("vehicleCode")
val event2 = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.INVALID_SCHEMA_EVENT2)
val report2 = schemaValidator.validate("d1", event2)
@@ -51,7 +51,7 @@ class TestSchemaValidator extends FlatSpec with Matchers {
f.instance.pointer should be("/id")
case "array" =>
f.message should be("instance type (array) does not match any allowed primitive type (allowed: [\"string\"])")
- f.instance.pointer should be ("/vehicleCode")
+ f.instance.pointer should be("/vehicleCode")
}
})
@@ -65,7 +65,7 @@ class TestSchemaValidator extends FlatSpec with Matchers {
case "type" =>
f.message should be("instance type (integer) does not match any allowed primitive type (allowed: [\"string\"])")
f.instance.pointer should be("/id")
- f.found.get should be ("integer")
+ f.found.get should be("integer")
f.expected.get.head should be("string")
case "additionalProperties" =>
f.message should be("object instance has properties which are not allowed by the schema: [\"deliveriesRejected\"]")
@@ -76,24 +76,24 @@ class TestSchemaValidator extends FlatSpec with Matchers {
}
it should "validate the negative and missing scenarios" in {
- val dataset = Dataset("d4", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA_JSON), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live)
+ val dataset = Dataset("d4", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA_JSON), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest")
schemaValidator.loadDataSchema(dataset)
- schemaValidator.schemaFileExists(dataset) should be (false)
+ schemaValidator.schemaFileExists(dataset) should be(false)
schemaValidator.loadDataSchema(dataset)
schemaValidator.schemaFileExists(dataset) should be(false)
- val dataset2 = Dataset("d5", "dataset", None, None, None, None, None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live)
+ val dataset2 = Dataset("d5", "dataset", None, None, None, None, None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest")
schemaValidator.loadDataSchemas(List[Dataset](dataset2))
- schemaValidator.schemaFileExists(dataset2) should be (false)
+ schemaValidator.schemaFileExists(dataset2) should be(false)
- val dataset3 = Dataset("d6", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id", "date", "ingest"), DatasetStatus.Live)
+ val dataset3 = Dataset("d6", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest")
schemaValidator.loadDataSchemas(List[Dataset](dataset3))
schemaValidator.schemaFileExists(dataset3) should be(false)
- val dataset4 = Dataset("d7", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id", "date", "ingest"), DatasetStatus.Live)
- schemaValidator.schemaFileExists(dataset4) should be (false)
+ val dataset4 = Dataset("d7", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest")
+ schemaValidator.schemaFileExists(dataset4) should be(false)
}
}
diff --git a/pipeline/transformer/pom.xml b/pipeline/transformer/pom.xml
index b695a812..959e549e 100644
--- a/pipeline/transformer/pom.xml
+++ b/pipeline/transformer/pom.xml
@@ -41,6 +41,11 @@
             <artifactId>dataset-registry</artifactId>
             <version>1.0.0</version>
         </dependency>
+        <dependency>
+            <groupId>org.sunbird.obsrv</groupId>
+            <artifactId>transformation-sdk</artifactId>
+            <version>1.0.0</version>
+        </dependency>
         <dependency>
             <groupId>org.sunbird.obsrv</groupId>
             <artifactId>framework</artifactId>
@@ -48,6 +53,25 @@
             <type>test-jar</type>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>org.sunbird.obsrv</groupId>
+            <artifactId>dataset-registry</artifactId>
+            <version>1.0.0</version>
+            <type>test-jar</type>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.kafka</groupId>
+            <artifactId>kafka-clients</artifactId>
+            <version>${kafka.version}</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.kafka</groupId>
+            <artifactId>kafka_${scala.maj.version}</artifactId>
+            <version>${kafka.version}</version>
+            <scope>test</scope>
+        </dependency>
         <dependency>
             <groupId>org.apache.flink</groupId>
             <artifactId>flink-test-utils</artifactId>
@@ -61,6 +85,18 @@
             <scope>test</scope>
             <classifier>tests</classifier>
         </dependency>
+        <dependency>
+            <groupId>io.github.embeddedkafka</groupId>
+            <artifactId>embedded-kafka_2.12</artifactId>
+            <version>3.4.0</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>io.zonky.test</groupId>
+            <artifactId>embedded-postgres</artifactId>
+            <version>2.0.3</version>
+            <scope>test</scope>
+        </dependency>
         <dependency>
             <groupId>com.github.codemonstur</groupId>
             <artifactId>embedded-redis</artifactId>
@@ -143,7 +179,7 @@
-                        <resource>reference.conf</resource>
+                        <resource>transformer.conf</resource>
diff --git a/pipeline/transformer/src/main/resources/transformer.conf b/pipeline/transformer/src/main/resources/transformer.conf
index b7adb850..42fbb22f 100644
--- a/pipeline/transformer/src/main/resources/transformer.conf
+++ b/pipeline/transformer/src/main/resources/transformer.conf
@@ -3,6 +3,7 @@ include "baseconfig.conf"
kafka {
input.topic = ${job.env}".denorm"
output.transform.topic = ${job.env}".transform"
+ output.transform.failed.topic = ${job.env}".transform.failed"
groupId = ${job.env}"-transformer-group"
producer {
max-request-size = 5242880
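
Note: the new failed-events topic uses the same ${job.env} substitution as the existing keys, so it resolves alongside kafka.output.transform.topic. A self-contained sketch of the resolution with an assumed environment value of "local" (the real jobs pull job.env in via the included baseconfig.conf):

    import com.typesafe.config.ConfigFactory

    // Inline HOCON mirroring the keys above; "local" is an assumed job.env value.
    val conf = ConfigFactory.parseString(
      """
        |job.env = "local"
        |kafka.output.transform.topic = ${job.env}".transform"
        |kafka.output.transform.failed.topic = ${job.env}".transform.failed"
        |""".stripMargin).resolve()

    println(conf.getString("kafka.output.transform.failed.topic")) // local.transform.failed
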
diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala
index fb0da96c..94a8c80f 100644
--- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala
+++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala
@@ -1,41 +1,142 @@
package org.sunbird.obsrv.transformer.functions
-import org.apache.flink.api.common.typeinfo.TypeInformation
+import com.fasterxml.jackson.databind.ObjectMapper
+import org.sunbird.obsrv.transformer.task.TransformerConfig
+import org.sunbird.obsrv.transformer.types._
import org.apache.flink.streaming.api.functions.ProcessFunction
-import org.sunbird.obsrv.core.model.Producer
+import org.json4s._
+import org.json4s.native.JsonMethods._
+import org.slf4j.LoggerFactory
+import org.sunbird.obsrv.core.exception.ObsrvException
+import org.sunbird.obsrv.core.model.Models._
+import org.sunbird.obsrv.core.model.StatusCode.StatusCode
+import org.sunbird.obsrv.core.model._
import org.sunbird.obsrv.core.streaming.Metrics
-import org.sunbird.obsrv.model.DatasetModels.Dataset
+import org.sunbird.obsrv.core.util.JSONUtil
+import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetTransformation}
+import org.sunbird.obsrv.model.TransformMode
import org.sunbird.obsrv.registry.DatasetRegistry
import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction
-import org.sunbird.obsrv.transformer.task.TransformerConfig
import scala.collection.mutable
-class TransformerFunction(config: TransformerConfig)(implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]])
- extends BaseDatasetProcessFunction(config) {
+case class TransformationStatus(resultJson: JValue, status: StatusCode, fieldStatus: List[TransformFieldStatus])
+
+class TransformerFunction(config: TransformerConfig) extends BaseDatasetProcessFunction(config) {
+
+ private[this] val logger = LoggerFactory.getLogger(classOf[TransformerFunction])
override def getMetrics(): List[String] = {
- List(config.totalEventCount, config.transformSuccessCount, config.transformFailedCount, config.transformSkippedCount)
+ List(config.totalEventCount, config.transformSuccessCount, config.transformPartialCount, config.transformFailedCount, config.transformSkippedCount)
}
-
/**
* Method to process the event transformations
*/
- override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef],
- context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context,
- metrics: Metrics): Unit = {
+ override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = {
+ implicit val jsonFormats: Formats = DefaultFormats.withLong
+ val result = TransformerFunctionHelper.processTransformation(dataset, msg, config)
metrics.incCounter(dataset.id, config.totalEventCount)
+ msg.put(config.CONST_EVENT, result.resultJson.extract[Map[String, AnyRef]])
+ result.status match {
+ case StatusCode.skipped =>
+ metrics.incCounter(dataset.id, config.transformSkippedCount)
+ context.output(config.transformerOutputTag, markSkipped(msg, Producer.transformer))
+ case StatusCode.failed =>
+ metrics.incCounter(dataset.id, config.transformFailedCount)
+ context.output(config.transformerFailedOutputTag, markFailed(msg, ErrorConstants.ERR_TRANSFORMATION_FAILED, Producer.transformer))
+ logSystemEvents(dataset, result, context)
+ case StatusCode.partial =>
+ metrics.incCounter(dataset.id, config.transformPartialCount)
+ context.output(config.transformerOutputTag, markPartial(msg, Producer.transformer))
+ logSystemEvents(dataset, result, context)
+ case StatusCode.success =>
+ metrics.incCounter(dataset.id, config.transformSuccessCount)
+ context.output(config.transformerOutputTag, markSuccess(msg, Producer.transformer))
+ }
+ }
+
+ private def logSystemEvents(dataset: Dataset, result: TransformationStatus, ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context): Unit = {
+ result.fieldStatus.filter(p => !p.success).groupBy(f => f.error.get).map(f => (f._1, f._2.size))
+ .foreach(errCount => {
+ val err = errCount._1
+ val functionalError = err match {
+ case ErrorConstants.INVALID_EXPR_FUNCTION => FunctionalError.TransformParseError
+ case ErrorConstants.ERR_EVAL_EXPR_FUNCTION => FunctionalError.TransformEvalError
+ case ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION => FunctionalError.TransformFailedError
+ case ErrorConstants.TRANSFORMATION_FIELD_MISSING => FunctionalError.TransformFieldMissing
+ }
+
+ ctx.output(config.systemEventsOutputTag, JSONUtil.serialize(SystemEvent(
+ EventID.METRIC,
+ ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(Producer.denorm)), dataset = Some(dataset.id), dataset_type = Some(dataset.datasetType)),
+ data = EData(error = Some(ErrorLog(pdata_id = Producer.denorm, pdata_status = StatusCode.failed, error_type = functionalError, error_code = err.errorCode, error_message = err.errorMsg, error_level = ErrorLevel.critical, error_count = Some(errCount._2))))
+ )))
+ })
+
+ logger.warn(s"Transformer | Transform operation is not successful | dataset=${dataset.id} | TransformStatusData=${JSONUtil.serialize(result.fieldStatus)}")
+ }
+
+}
+
+object TransformerFunctionHelper {
+
+ implicit val jsonFormats: Formats = DefaultFormats.withLong
+ private val mapper = new ObjectMapper()
+
+ implicit class JsonHelper(json: JValue) {
+ def customExtract[T](path: String)(implicit mf: Manifest[T]): T = {
+ path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T]
+ }
+ }
+
+ @throws[ObsrvException]
+ def processTransformation(dataset: Dataset, msg: mutable.Map[String, AnyRef], config: TransformerConfig): TransformationStatus = {
+
+ val event = JSONUtil.serialize(msg(config.CONST_EVENT))
+ val json = parse(event, useBigIntForLong = false)
val datasetTransformations = DatasetRegistry.getDatasetTransformations(dataset.id)
+ processTransformations(json, datasetTransformations)
+ }
+
+ def processTransformations(json: JValue, datasetTransformations: Option[List[DatasetTransformation]]): TransformationStatus = {
if (datasetTransformations.isDefined) {
- // TODO: Perform transformations
- metrics.incCounter(dataset.id, config.transformSuccessCount)
- context.output(config.transformerOutputTag, markSuccess(msg, Producer.transformer))
+ val result = applyTransformations(json, datasetTransformations.get)
+ TransformationStatus(json merge result.json, getStatus(result.fieldStatus), result.fieldStatus)
} else {
- metrics.incCounter(dataset.id, config.transformSkippedCount)
- context.output(config.transformerOutputTag, markSkipped(msg, Producer.transformer))
+ TransformationStatus(json, StatusCode.skipped, List[TransformFieldStatus]())
}
}
+ private def getStatus(fieldStatus: List[TransformFieldStatus]): StatusCode = {
+ val failedCount = fieldStatus.count(p => p.mode == TransformMode.Strict && !p.success)
+ val partialCount = fieldStatus.count(p => p.mode == TransformMode.Lenient && !p.success)
+ if (failedCount > 0) StatusCode.failed else if (partialCount > 0) StatusCode.partial else StatusCode.success
+
+ }
+
+ private def applyTransformations(json: JValue, datasetTransformations: List[DatasetTransformation]): TransformationResult = {
+ datasetTransformations.groupBy(f => f.transformationFunction.`type`).mapValues(f => {
+ applyTransformation(f.head.transformationFunction.`type`, json, f)
+ }).values.reduceLeft((a, b) => TransformationResult(mergeJson(a, b), mergeStatus(a, b)))
+ }
+
+ private def mergeJson(a: TransformationResult, b: TransformationResult): JValue = {
+ a.json merge b.json
+ }
+
+ private def mergeStatus(a: TransformationResult, b: TransformationResult): List[TransformFieldStatus] = {
+ a.fieldStatus ++ b.fieldStatus
+ }
+
+ private def applyTransformation(tfType: String, json: JValue, dt: List[DatasetTransformation]): TransformationResult = {
+ val jsonNode = mapper.readTree(compact(render(json)))
+ tfType match {
+ case "mask" => MaskTransformer.transform(json, jsonNode, dt)
+ case "jsonata" => JSONAtaTransformer.transform(json, jsonNode, dt)
+ case "encrypt" => EncryptTransformer.transform(json, jsonNode, dt)
+ case _ => TransformationResult(json, List[TransformFieldStatus]())
+ }
+ }
}
\ No newline at end of file
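
Note: TransformerFunctionHelper can be exercised outside the Flink operator as well. A minimal usage sketch (the event and the single mask transformation are illustrative; the call shape follows the new TestTransformerFunctionHelper spec below):

    import org.json4s.native.JsonMethods.parse
    import org.sunbird.obsrv.core.model.StatusCode
    import org.sunbird.obsrv.model.DatasetModels.{DatasetTransformation, TransformationFunction}
    import org.sunbird.obsrv.transformer.functions.TransformerFunctionHelper

    // Mask one field and inspect the aggregate status plus the per-field report.
    val json = parse("""{"dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}""")
    val dtList = Option(List(
      DatasetTransformation("tf1", "d1", "dealer.email", TransformationFunction("mask", None, "dealer.email"))
    ))
    val result = TransformerFunctionHelper.processTransformations(json, dtList)
    assert(result.status == StatusCode.success)   // success | partial | failed | skipped
    assert(result.fieldStatus.size == 1)          // one entry per configured transformation
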
diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala
index 797b3e56..c943702d 100644
--- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala
+++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala
@@ -17,16 +17,22 @@ class TransformerConfig(override val config: Config) extends BaseJobConfig[mutab
// Metric List
val totalEventCount = "transform-total-count"
val transformSuccessCount = "transform-success-count"
+ val transformPartialCount = "transform-partial-count"
val transformFailedCount = "transform-failed-count"
val transformSkippedCount = "transform-skipped-count"
+ private val kafkaInputTopic: String = config.getString("kafka.input.topic")
val kafkaTransformTopic: String = config.getString("kafka.output.transform.topic")
+ val kafkaTransformFailedTopic: String = config.getString("kafka.output.transform.failed.topic")
val transformerFunction = "transformer-function"
val transformerProducer = "transformer-producer"
+ val transformerFailedProducer = "transformer-failed-producer"
- private val TRANSFORMER_OUTPUT_TAG = "transformed-events"
- val transformerOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](TRANSFORMER_OUTPUT_TAG)
+ private val TRANSFORMER_EVENTS = "transformed-events"
+ private val TRANSFORMER_FAILED_EVENTS = "transformed_failed-events"
+ val transformerOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](TRANSFORMER_EVENTS)
+ val transformerFailedOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](TRANSFORMER_FAILED_EVENTS)
override def inputTopic(): String = config.getString("kafka.input.topic")
diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala
index 71e86581..eee8cee2 100644
--- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala
+++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala
@@ -1,8 +1,6 @@
package org.sunbird.obsrv.transformer.task
import com.typesafe.config.ConfigFactory
-import org.apache.flink.api.common.typeinfo.TypeInformation
-import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.datastream.DataStream
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
@@ -13,47 +11,48 @@ import org.sunbird.obsrv.transformer.functions.TransformerFunction
import java.io.File
import scala.collection.mutable
-/**
- *
- */
class TransformerStreamTask(config: TransformerConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] {
private val serialVersionUID = -7729362727131516112L
- implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]])
// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster
def process(): Unit = {
-
implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config)
- val dataStream = getMapDataStream(env, config, kafkaConnector)
- processStream(dataStream)
+ process(env)
env.execute(config.jobName)
}
// $COVERAGE-ON$
+ def process(env: StreamExecutionEnvironment): Unit = {
+ val dataStream = getMapDataStream(env, config, kafkaConnector)
+ processStream(dataStream)
+ }
+
override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = {
+
val transformedStream = dataStream.process(new TransformerFunction(config)).name(config.transformerFunction).uid(config.transformerFunction)
.setParallelism(config.downstreamOperatorsParallelism)
transformedStream.getSideOutput(config.transformerOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaTransformTopic))
.name(config.transformerProducer).uid(config.transformerProducer).setParallelism(config.downstreamOperatorsParallelism)
+ transformedStream.getSideOutput(config.transformerFailedOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaTransformFailedTopic))
+ .name(config.transformerFailedProducer).uid(config.transformerFailedProducer).setParallelism(config.downstreamOperatorsParallelism)
addDefaultSinks(transformedStream, config, kafkaConnector)
- transformedStream.getSideOutput(config.successTag())
+ transformedStream.getSideOutput(config.transformerOutputTag)
}
}
// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster
object TransformerStreamTask {
-
def main(args: Array[String]): Unit = {
val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path"))
val config = configFilePath.map {
path => ConfigFactory.parseFile(new File(path)).resolve()
}.getOrElse(ConfigFactory.load("transformer.conf").withFallback(ConfigFactory.systemEnvironment()))
- val extractorConfig = new TransformerConfig(config)
- val kafkaUtil = new FlinkKafkaConnector(extractorConfig)
- val task = new TransformerStreamTask(extractorConfig, kafkaUtil)
+ val transformerConfig = new TransformerConfig(config)
+ val kafkaUtil = new FlinkKafkaConnector(transformerConfig)
+ val task = new TransformerStreamTask(transformerConfig, kafkaUtil)
task.process()
}
}
diff --git a/pipeline/transformer/src/test/resources/test.conf b/pipeline/transformer/src/test/resources/test.conf
index f1091415..1098ba64 100644
--- a/pipeline/transformer/src/test/resources/test.conf
+++ b/pipeline/transformer/src/test/resources/test.conf
@@ -1,8 +1,11 @@
include "base-test.conf"
kafka {
+ producer.broker-servers = "localhost:9093"
+ consumer.broker-servers = "localhost:9093"
input.topic = "flink.denorm"
output.transform.topic = "flink.transform"
+ output.transform.failed.topic = "flink.transform.failed"
groupId = "flink-transformer-group"
producer {
max-request-size = 5242880
diff --git a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/EventFixture.scala b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/EventFixture.scala
new file mode 100644
index 00000000..a4f48246
--- /dev/null
+++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/EventFixture.scala
@@ -0,0 +1,11 @@
+package org.sunbird.obsrv.transformer
+
+object EventFixture {
+
+ val SUCCESS_TRANSFORM = """{"dataset":"d1","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}"""
+ val FAILED_TRANSFORM = """{"dataset":"d1","event":{"id":"1235","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}"""
+ val PARTIAL_TRANSFORM = """{"dataset":"d2","event":{"id":"1235","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}"""
+ val SKIPPED_TRANSFORM = """{"dataset":"d3","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}"""
+ val FAILED_TRANSFORM_2 = """{"dataset":"d4","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}"""
+
+}
\ No newline at end of file
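
Note: these fixtures are published to the transformer's input topic (flink.denorm per test.conf above) through the embedded Kafka broker on port 9093. A rough sketch using the standard io.github.embeddedkafka API (the zookeeper port and the explicit start()/stop() calls are assumptions; the base test spec may already manage the broker lifecycle):

    import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig}

    // Broker port mirrors test.conf; the zookeeper port is an arbitrary assumption.
    implicit val kafkaConfig: EmbeddedKafkaConfig = EmbeddedKafkaConfig(kafkaPort = 9093, zooKeeperPort = 2183)
    EmbeddedKafka.start()
    EmbeddedKafka.publishStringMessageToKafka("flink.denorm", EventFixture.SUCCESS_TRANSFORM)
    EmbeddedKafka.stop()
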
diff --git a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala
new file mode 100644
index 00000000..13bd1b40
--- /dev/null
+++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala
@@ -0,0 +1,211 @@
+package org.sunbird.obsrv.transformer
+
+import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper}
+import org.json4s._
+import org.json4s.native.JsonMethods._
+import org.scalatest.Matchers
+import org.sunbird.obsrv.core.model.{ErrorConstants, StatusCode}
+import org.sunbird.obsrv.core.util.JSONUtil
+import org.sunbird.obsrv.model.DatasetModels.{Condition, DatasetTransformation, TransformationFunction}
+import org.sunbird.obsrv.model.TransformMode
+import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry
+import org.sunbird.obsrv.transformer.functions.TransformerFunctionHelper
+import org.sunbird.obsrv.transformer.util.{CipherUtil, ConditionEvaluator}
+import org.sunbird.obsrv.transformer.types._
+
+class TestTransformerFunctionHelper extends BaseSpecWithDatasetRegistry with Matchers {
+
+ implicit val jsonFormats: DefaultFormats.type = DefaultFormats
+
+ implicit class JsonHelper(json: JValue) {
+ def customExtract[T](path: String)(implicit mf: Manifest[T]): T = {
+ path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T]
+ }
+ }
+
+ val jsonStr = """{"obsCode":"M_BATTERY_CHARGE","accountEmail":"firstname.lastname@gmail.com","accountPhone":"123456","codeComponents":[{"componentCode":"CC_METADATA_DEVICE_FIRMWARE_VER","componentType":"METADATA_DEVICE","selector":"FIRMWARE_VERSION","value":"2.3"}],"phenTime":"2022-06-17T07:12:02Z","valueUoM":"prcnt","value":"100","id":"df4c7aa4-65df-4463-b92a-7a29835f9c4d","parentCollectionRef":"41e9b7a4-5b6f-11ed-8fd5-a6a5696c2aaa","created":"2022-11-03T12:01:32Z","modified":1667476892000,"integrationAccountRef":"zzz11120-f0c8-4064-8d00-a73e58939ce0_mtgc203d-2478-4679-a0ef-d736a7a406fd","assetRef":"9422f7ac-c6e9-5c72-b605-5a7655863866","assetRef2":"","assetRef4":123124,"testBool":false,"contextItems":[{"code":"SYN_SYSTEM","value":"VALENCO"}],"status":"ACTIVE","xMin":3.356701,"xMax":3.356701,"yMin":51.01653,"yMax":51.01653,"spatialExtent":"{\"type\": \"Point\", \"coordinates\": [3.356701, 51.016530]}","phenEndTime":"2022-06-17T07:12:02Z","value_double_type":100.0}"""
+ val mapper = new ObjectMapper()
+ val jsonNode: JsonNode = mapper.readTree(jsonStr)
+
+ "TransformerFunctionHelper" should "mask the events for the given transformation config" in {
+
+ val json = parse(jsonStr)
+ val dtList = Option(List(
+ DatasetTransformation("tf1", "obs2.0", "spatialExtent", TransformationFunction("mask", None, "spatialExtent")),
+ DatasetTransformation("tf1", "obs2.0", "assetRef", TransformationFunction("mask", None, "assetRef")),
+ DatasetTransformation("tf1", "obs2.0", "accountEmail", TransformationFunction("mask", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "accountEmail")),
+ DatasetTransformation("tf1", "obs2.0", "accountPhone2", TransformationFunction("mask", None, "accountPhone")),
+ DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponents)")),
+ DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "$number(value)")),
+ DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]")),
+ DatasetTransformation("tf1", "obs2.0", "optionalValue", TransformationFunction("jsonata", None, "$number(optionValue)"))
+ ))
+
+ val result = TransformerFunctionHelper.processTransformations(json, dtList)
+ result.status should be(StatusCode.success)
+ result.fieldStatus.size should be(8)
+ assert(result.resultJson.customExtract[String]("spatialExtent").equals("{type: ***********************************1.016530]}"))
+ assert(result.resultJson.customExtract[String]("assetRef").equals("9422f7***********************5863866"))
+ assert(result.resultJson.customExtract[String]("accountEmail").equals("fi***************e@gmail.com"))
+ assert(result.resultJson.customExtract[String]("accountPhone2").equals("1***56"))
+ assert(JSONUtil.getKey("optionalValue", JSONUtil.serialize(result.resultJson)).isMissingNode.equals(true))
+
+ val dtList2 = Option(List(
+ DatasetTransformation("tf1", "obs2.0", "accountPhone", TransformationFunction("mask", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE1'")), "accountPhone"), Some(TransformMode.Lenient)),
+ DatasetTransformation("tf4", "obs2.0", "asset.assetRef2", TransformationFunction("mask", None, "assetRef2"), Some(TransformMode.Lenient)),
+ DatasetTransformation("tf5", "obs2.0", "asset.assetRef3", TransformationFunction("mask", None, "assetRef3"), Some(TransformMode.Lenient)),
+ DatasetTransformation("tf6", "obs2.0", "asset.assetRef4", TransformationFunction("mask", None, "assetRef4"), Some(TransformMode.Lenient)),
+ DatasetTransformation("tf7", "obs2.0", "asset.assetRef5", TransformationFunction("custom", None, "join(d2.assetRef4)"), Some(TransformMode.Lenient))
+ ))
+ val result2 = TransformerFunctionHelper.processTransformations(json, dtList2)
+ result2.status should be(StatusCode.partial)
+ result2.fieldStatus.size should be(4)
+ result2.resultJson.customExtract[String]("asset.assetRef2") should be("")
+ result2.resultJson.customExtract[String]("asset.assetRef3") should be(null)
+ result2.resultJson.customExtract[String]("asset.assetRef4") should be("1***24")
+ result.resultJson.customExtract[String]("accountPhone") should be ("123456")
+
+ val result3 = TransformerFunctionHelper.processTransformations(json, None)
+ result3.status should be (StatusCode.skipped)
+ result3.fieldStatus.size should be(0)
+ }
+
+ it should "validate the jsonata expressions" in {
+
+ val json = parse(jsonStr)
+ val dtList = Option(List(
+ DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponents).length"), Some(TransformMode.Lenient)),
+ DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "$number(value)")),
+ DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]"))
+ ))
+ val result = TransformerFunctionHelper.processTransformations(json, dtList)
+ result.status should be(StatusCode.partial)
+ result.fieldStatus.size should be(3)
+ assert(result.resultJson.customExtract[String]("firmwareComponent.componentCode").equals("CC_METADATA_DEVICE_FIRMWARE_VER"))
+ assert(result.resultJson.customExtract[Int]("valueAsInt").equals(100))
+ }
+
+ it should "handle the jsonata parse and eval exceptions including transformation modes" in {
+
+ val json = parse(jsonStr)
+ val dtList = Option(List(
+ DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponent).length")),
+ DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "number(value)")),
+ DatasetTransformation("tf1", "obs2.0", "valueAsInt2", TransformationFunction("jsonata", None, null), Some(TransformMode.Lenient)),
+ DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]"))
+ ))
+ val result = TransformerFunctionHelper.processTransformations(json, dtList)
+ result.status should be(StatusCode.failed)
+ result.fieldStatus.size should be(4)
+ result.fieldStatus.count(f => f.error.isDefined && f.error.get.equals(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION)) should be(1)
+ result.fieldStatus.count(f => f.error.isDefined && f.error.get.equals(ErrorConstants.ERR_EVAL_EXPR_FUNCTION)) should be(1)
+ result.fieldStatus.count(f => f.error.isDefined && f.error.get.equals(ErrorConstants.INVALID_EXPR_FUNCTION)) should be(1)
+ result.fieldStatus.foreach { status: TransformFieldStatus => {
+ status.fieldKey match {
+ case "codeComponentsList" =>
+ status.expr should be("$keys(codeComponent).length")
+ status.success should be(false)
+ status.mode should be(TransformMode.Strict)
+ status.error.get should be(ErrorConstants.ERR_EVAL_EXPR_FUNCTION)
+ case "valueAsInt" =>
+ status.expr should be("number(value)")
+ status.success should be(false)
+ status.mode should be(TransformMode.Strict)
+ status.error.get should be(ErrorConstants.INVALID_EXPR_FUNCTION)
+ case "firmwareComponent" =>
+ status.expr should be("codeComponents[0]")
+ status.success should be(true)
+ status.mode should be(TransformMode.Strict)
+ status.error should be(None)
+ case "valueAsInt2" =>
+ status.expr should be(null)
+ status.success should be(false)
+ status.mode should be(TransformMode.Lenient)
+ status.error.get should be(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION)
+ }
+ }
+ }
+ }
+
+ it should "encrypt the fields in the event" in {
+ val json = parse(jsonStr)
+ val dtList = Option(List(
+ DatasetTransformation("tf1", "obs2.0", "accountEmail", TransformationFunction("encrypt", None, "accountEmail")),
+ DatasetTransformation("tf2", "obs2.0", "accountPhone", TransformationFunction("encrypt", None, "accountPhone")),
+ DatasetTransformation("tf3", "obs2.0", "assetRef", TransformationFunction("encrypt", None, "assetRef")),
+ DatasetTransformation("tf4", "obs2.0", "asset.assetRef2", TransformationFunction("encrypt", None, "assetRef2")),
+ DatasetTransformation("tf5", "obs2.0", "asset.assetRef3", TransformationFunction("encrypt", None, "assetRef3")),
+ DatasetTransformation("tf6", "obs2.0", "asset.assetRef4", TransformationFunction("encrypt", None, "assetRef4"))
+ ))
+ val result = TransformerFunctionHelper.processTransformations(json, dtList)
+ val jsonData = compact(render(result.resultJson))
+ result.status should be(StatusCode.failed)
+ result.fieldStatus.size should be(6)
+ assert(result.resultJson.customExtract[String]("accountEmail").equals("jyx7+dUfzHgODno2jcp67/rfCvOecaLLWICRnSCNvzY="))
+ assert(result.resultJson.customExtract[String]("accountPhone").equals("qqyhkaWkPR3t1k0swyQ7Ow=="))
+ assert(result.resultJson.customExtract[String]("assetRef").equals("e+YNIi1FebmPPI7D8k3/idlQ8XX0AIhuplwcRLbPb3nkS25gt/HyUQkWeuj6KPxf"))
+ result.resultJson.customExtract[String]("asset.assetRef2") should be("")
+ result.resultJson.customExtract[String]("asset.assetRef4") should be("D2ySyi1WGqJsM4mbIjbtJA==")
+ result.resultJson.customExtract[String]("asset.assetRef3") should be(null)
+
+ JSONUtil.getKey("asset.assetRef3", jsonData).isEmpty should be(true)
+
+ assert(CipherUtil.decrypt(result.resultJson.customExtract[String]("accountEmail")).equals("firstname.lastname@gmail.com"))
+ assert(CipherUtil.decrypt(result.resultJson.customExtract[String]("accountPhone")).equals("123456"))
+ assert(CipherUtil.decrypt(result.resultJson.customExtract[String]("assetRef")).equals("9422f7ac-c6e9-5c72-b605-5a7655863866"))
+ }
+
+ it should "validate all scenarios of condition evaluator" in {
+ val status1 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("custom", "testExpr")), Some(TransformMode.Strict))
+ status1.expr should be("")
+ status1.success should be(false)
+ status1.mode.get should be(TransformMode.Strict)
+ status1.error.get should be(ErrorConstants.NO_IMPLEMENTATION_FOUND)
+
+ val status2 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("jsonata", "number(value)")), Some(TransformMode.Strict))
+ status2.expr should be("number(value)")
+ status2.success should be(false)
+ status2.mode.get should be(TransformMode.Strict)
+ status2.error.get should be(ErrorConstants.INVALID_EXPR_FUNCTION)
+
+ val status3 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("jsonata", "$keys(codeComponent).length")), Some(TransformMode.Strict))
+ status3.expr should be("$keys(codeComponent).length")
+ status3.success should be(false)
+ status3.mode.get should be(TransformMode.Strict)
+ status3.error.get should be(ErrorConstants.ERR_EVAL_EXPR_FUNCTION)
+
+ val status4 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("jsonata", null)), Some(TransformMode.Strict))
+ status4.expr should be(null)
+ status4.success should be(false)
+ status4.mode.get should be(TransformMode.Strict)
+ status4.error.get should be(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION)
+
+ val status5 = ConditionEvaluator.evalCondition("d1", null, Some(Condition("jsonata", "$number(value)")), Some(TransformMode.Lenient))
+ status5.expr should be("$number(value)")
+ status5.success should be(false)
+ status5.mode.get should be(TransformMode.Lenient)
+ status5.error.get should be(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION)
+ }
+
+ it should "cover the unreachable code block in ITransformer" in {
+ val testTransformer = new TestTransformer()
+ val res1 = testTransformer.getJSON("event.key", null.asInstanceOf[JsonNode])
+ compact(render(res1)) should be("""{"event":{"key":null}}""")
+ val res2 = testTransformer.getJSON("event.key.x", JSONUtil.getKey("obsCode", jsonStr))
+ compact(render(res2)) should be("""{"event":{"key":{"x":"M_BATTERY_CHARGE"}}}""")
+ val res3 = testTransformer.getJSON("event.key.y", JSONUtil.getKey("testBool", jsonStr))
+ compact(render(res3)) should be("""{"event":{"key":{"y":false}}}""")
+
+ val res4 = testTransformer.transform(parse(jsonStr), jsonNode, List[DatasetTransformation]())
+ res4.json should be(JNothing)
+ res4.fieldStatus.size should be(0)
+ }
+
+}
+
+class TestTransformer extends ITransformer[String] {
+ override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) = {
+ (JNothing, TransformFieldStatus("", "", success = false, TransformMode.Lenient))
+ }
+
+}
\ No newline at end of file
diff --git a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala
new file mode 100644
index 00000000..76500f19
--- /dev/null
+++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala
@@ -0,0 +1,229 @@
+package org.sunbird.obsrv.transformer
+
+import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig}
+import org.apache.flink.configuration.Configuration
+import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
+import org.apache.flink.test.util.MiniClusterWithClientResource
+import org.apache.kafka.common.serialization.StringDeserializer
+import org.scalatest.Matchers._
+import org.sunbird.obsrv.BaseMetricsReporter
+import org.sunbird.obsrv.core.model.Models.SystemEvent
+import org.sunbird.obsrv.core.model._
+import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector
+import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect}
+import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry
+import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask}
+
+import scala.collection.mutable
+import scala.concurrent.ExecutionContext.Implicits.global
+import scala.concurrent.Future
+import scala.concurrent.duration._
+
+class TransformerStreamTestSpec extends BaseSpecWithDatasetRegistry {
+
+ val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder()
+ .setConfiguration(testConfiguration())
+ .setNumberSlotsPerTaskManager(1)
+ .setNumberTaskManagers(1)
+ .build)
+
+ val transformerConfig = new TransformerConfig(config)
+ val redisPort: Int = transformerConfig.redisPort
+ val kafkaConnector = new FlinkKafkaConnector(transformerConfig)
+ val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group")
+ implicit val embeddedKafkaConfig: EmbeddedKafkaConfig =
+ EmbeddedKafkaConfig(
+ kafkaPort = 9093,
+ zooKeeperPort = 2183,
+ customConsumerProperties = customKafkaConsumerProperties
+ )
+ implicit val deserializer: StringDeserializer = new StringDeserializer()
+
+ def testConfiguration(): Configuration = {
+ val config = new Configuration()
+ config.setString("metrics.reporter", "job_metrics_reporter")
+ config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName)
+ config
+ }
+
+ override def beforeAll(): Unit = {
+ super.beforeAll()
+ BaseMetricsReporter.gaugeMetrics.clear()
+ EmbeddedKafka.start()(embeddedKafkaConfig)
+ insertTestData()
+ createTestTopics()
+ publishMessagesToKafka()
+ flinkCluster.before()
+ }
+
+ private def publishMessagesToKafka(): Unit = {
+ EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.SUCCESS_TRANSFORM)
+ EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.FAILED_TRANSFORM)
+ EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.SKIPPED_TRANSFORM)
+ EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.PARTIAL_TRANSFORM)
+ EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.FAILED_TRANSFORM_2)
+ }
+
+ private def insertTestData(): Unit = {
+ val postgresConnect = new PostgresConnect(postgresConfig)
+ postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);")
+ postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d4', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);")
+ postgresConnect.execute("insert into dataset_transformations values('tf3', 'd2', 'tfdata.valueAsInt', '{\"type\":\"jsonata\",\"expr\":\"$number(id)\"}', null, 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into dataset_transformations values('tf4', 'd2', 'tfdata.encryptEmail', '{\"type\":\"encrypt\",\"expr\": \"dealer.email\"}', 'Lenient', 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into dataset_transformations values('tf5', 'd4', 'tfdata.expr1', '{\"type\":\"jsonata\",\"expr\":null}', null, 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into dataset_transformations values('tf6', 'd4', 'tfdata.expr2', '{\"type\":\"jsonata\",\"expr\":\"$keys(dealer).length\"}', null, 'System', 'System', now(), now());")
+ postgresConnect.execute("insert into dataset_transformations values('tf7', 'd4', 'tfdata.expr3', '{\"type\":\"jsonata\",\"expr\":\"number(id)\"}', null, 'System', 'System', now(), now());")
+ postgresConnect.closeConnection()
+ }
+
+ override def afterAll(): Unit = {
+ super.afterAll()
+ flinkCluster.after()
+ EmbeddedKafka.stop()
+ }
+
+ def createTestTopics(): Unit = {
+ List(
+ transformerConfig.inputTopic(), transformerConfig.kafkaFailedTopic, transformerConfig.kafkaSystemTopic, transformerConfig.kafkaTransformTopic, transformerConfig.kafkaTransformFailedTopic
+ ).foreach(EmbeddedKafka.createCustomTopic(_))
+ }
+
+ "TransformerStreamTestSpec" should "validate the transform stream task" in {
+
+ implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(transformerConfig)
+ val task = new TransformerStreamTask(transformerConfig, kafkaConnector)
+ task.process(env)
+ Future {
+ env.execute(transformerConfig.jobName)
+ }
+
+ val outputs = EmbeddedKafka.consumeNumberMessagesFrom[String](transformerConfig.kafkaTransformTopic, 3, timeout = 30.seconds)
+ validateOutputs(outputs)
+
+ val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](transformerConfig.kafkaTransformFailedTopic, 2, timeout = 30.seconds)
+ validateFailedEvents(failedEvents)
+
+ val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](transformerConfig.kafkaSystemTopic, 5, timeout = 30.seconds)
+ validateSystemEvents(systemEvents)
+
+ val mutableMetricsMap = mutable.Map[String, Long]()
+ BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2))
+ Console.println("### DenormalizerStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap)))
+ validateMetrics(mutableMetricsMap)
+
+ transformerConfig.successTag().getId should be("transformed-events")
+ }
+
+ private def validateOutputs(outputs: List[String]): Unit = {
+ outputs.size should be(3)
+ outputs.zipWithIndex.foreach {
+ case (elem, idx) =>
+ val msg = JSONUtil.deserialize[Map[String, AnyRef]](elem)
+ val event = JSONUtil.serialize(msg(Constants.EVENT))
+ val obsrvMeta = msg(Constants.OBSRV_META).asInstanceOf[Map[String, AnyRef]]
+ obsrvMeta("timespans").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[Int] should be > 0
+ idx match {
+ case 0 =>
+ event should be("""{"dealer":{"email":"de****1@gmail.com","maskedPhone":"98******45","locationId":"KUN1","dealerCode":"D123","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""")
+ obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be("success")
+ case 1 =>
+ event should be("""{"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""")
+ obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be("skipped")
+ case 2 =>
+ event should be("""{"tfdata":{"valueAsInt":1235},"dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1235","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""")
+ obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be("partial")
+ }
+ }
+ /*
+ (Output Event,{"obsrv_meta":{"flags":{"transformer":"success"},"syncts":1701863209956,"prevProcessingTime":1701863215734,"error":{},"processingStartTime":1701863215322,"timespans":{"transformer":412}},"event":{"dealer":{"email":"de****1@gmail.com","maskedPhone":"98******45","locationId":"KUN1","dealerCode":"D123","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"dataset":"d1"},0)
+ (Output Event,{"obsrv_meta":{"flags":{"transformer":"skipped"},"syncts":1701863210084,"prevProcessingTime":1701863216141,"error":{},"processingStartTime":1701863215476,"timespans":{"transformer":665}},"event":{"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"dataset":"d3"},1)
+ (Output Event,{"obsrv_meta":{"flags":{"transformer":"partial"},"syncts":1701863210111,"prevProcessingTime":1701863216378,"error":{},"processingStartTime":1701863215477,"timespans":{"transformer":901}},"event":{"tfdata":{"valueAsInt":1235},"dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1235","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"dataset":"d2"},2)
+ */
+ }
+
+ private def validateFailedEvents(failedEvents: List[String]): Unit = {
+ failedEvents.size should be(2)
+ failedEvents.zipWithIndex.foreach {
+ case (elem, idx) =>
+ val msg = JSONUtil.deserialize[Map[String, AnyRef]](elem)
+ val event = msg(Constants.EVENT).asInstanceOf[String]
+ val obsrvMeta = msg(Constants.OBSRV_META).asInstanceOf[Map[String, AnyRef]]
+ obsrvMeta("timespans").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[Int] should be > 0
+ obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be (StatusCode.failed.toString)
+ obsrvMeta("error").asInstanceOf[Map[String, AnyRef]]("src").asInstanceOf[String] should be (Producer.transformer.toString)
+ obsrvMeta("error").asInstanceOf[Map[String, AnyRef]]("error_code").asInstanceOf[String] should be (ErrorConstants.ERR_TRANSFORMATION_FAILED.errorCode)
+ idx match {
+ case 0 =>
+ event should be("{\"event\":{\"dealer\":{\"maskedPhone\":\"98******45\",\"locationId\":\"KUN1\",\"dealerCode\":\"D123\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1235\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d1\"}")
+ case 1 =>
+ event should be("{\"event\":{\"tfdata\":{},\"dealer\":{\"dealerCode\":\"D123\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d4\"}")
+ }
+ }
+ /*
+ (Failed Event,{"event":"{\"event\":{\"dealer\":{\"maskedPhone\":\"98******45\",\"locationId\":\"KUN1\",\"dealerCode\":\"D123\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1235\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d1\"}","obsrv_meta":{"flags":{"transformer":"failed"},"syncts":1701863210058,"prevProcessingTime":1701863215948,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"transformer"},"error_code":"ERR_TRANSFORM_1023","error_msg":"Atleast one mandatory transformation has failed"},"processingStartTime":1701863215475,"timespans":{"transformer":473}},"dataset":"d1"},0)
+ (Failed Event,{"event":"{\"event\":{\"tfdata\":{},\"dealer\":{\"dealerCode\":\"D123\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d4\"}","obsrv_meta":{"flags":{"transformer":"failed"},"syncts":1701863210150,"prevProcessingTime":1701863216421,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"transformer"},"error_code":"ERR_TRANSFORM_1023","error_msg":"Atleast one mandatory transformation has failed"},"processingStartTime":1701863215477,"timespans":{"transformer":944}},"dataset":"d4"},1)
+ */
+ }
+
+ private def validateSystemEvents(systemEvents: List[String]): Unit = {
+ systemEvents.size should be(5)
+ systemEvents.count(f => {
+ val event = JSONUtil.deserialize[SystemEvent](f)
+ FunctionalError.TransformFieldMissing.equals(event.data.error.get.error_type)
+ }) should be(2)
+ systemEvents.count(f => {
+ val event = JSONUtil.deserialize[SystemEvent](f)
+ FunctionalError.TransformFailedError.equals(event.data.error.get.error_type)
+ }) should be(1)
+ systemEvents.count(f => {
+ val event = JSONUtil.deserialize[SystemEvent](f)
+ FunctionalError.TransformEvalError.equals(event.data.error.get.error_type)
+ }) should be(1)
+ systemEvents.count(f => {
+ val event = JSONUtil.deserialize[SystemEvent](f)
+ FunctionalError.TransformParseError.equals(event.data.error.get.error_type)
+ }) should be(1)
+
+ systemEvents.foreach(se => {
+ val event = JSONUtil.deserialize[SystemEvent](se)
+ val error = event.data.error
+ if (event.ctx.dataset.getOrElse("ALL").equals("ALL"))
+ event.ctx.dataset_type should be(None)
+ else if (error.isDefined) {
+ val errorCode = error.get.error_code
+ if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) ||
+ errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) ||
+ errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) {
+ event.ctx.dataset_type should be(None)
+ }
+ }
+ else
+ event.ctx.dataset_type should be(Some("dataset"))
+ })
+ // TODO: Add more assertions
+ /*
+ (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d1"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformFieldMissing","error_code":"ERR_TRANSFORM_1023","error_message":"Transformation field is either missing or blank","error_level":"critical","error_count":1}},"ets":1701863215985},0)
+ (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d2"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformFieldMissing","error_code":"ERR_TRANSFORM_1023","error_message":"Transformation field is either missing or blank","error_level":"critical","error_count":1}},"ets":1701863216391},1)
+ (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d4"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformFailedError","error_code":"ERR_TRANSFORM_1022","error_message":"Unable to evaluate the transformation expression function","error_level":"critical","error_count":1}},"ets":1701863216431},2)
+ (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d4"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformEvalError","error_code":"ERR_TRANSFORM_1021","error_message":"Unable to evaluate the transformation expression function","error_level":"critical","error_count":1}},"ets":1701863216433},3)
+ (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d4"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformParseError","error_code":"ERR_TRANSFORM_1020","error_message":"Transformation expression function is not valid","error_level":"critical","error_count":1}},"ets":1701863216433},4)
+ */
+ }
+
+ private def validateMetrics(mutableMetricsMap: mutable.Map[String, Long]): Unit = {
+ mutableMetricsMap(s"${transformerConfig.jobName}.d1.${transformerConfig.totalEventCount}") should be(2)
+ mutableMetricsMap(s"${transformerConfig.jobName}.d1.${transformerConfig.transformSuccessCount}") should be(1)
+ mutableMetricsMap(s"${transformerConfig.jobName}.d1.${transformerConfig.transformFailedCount}") should be(1)
+
+ mutableMetricsMap(s"${transformerConfig.jobName}.d2.${transformerConfig.totalEventCount}") should be(1)
+ mutableMetricsMap(s"${transformerConfig.jobName}.d2.${transformerConfig.transformPartialCount}") should be(1)
+
+ mutableMetricsMap(s"${transformerConfig.jobName}.d3.${transformerConfig.totalEventCount}") should be(1)
+ mutableMetricsMap(s"${transformerConfig.jobName}.d3.${transformerConfig.transformSkippedCount}") should be(1)
+
+ mutableMetricsMap(s"${transformerConfig.jobName}.d4.${transformerConfig.totalEventCount}") should be(1)
+ mutableMetricsMap(s"${transformerConfig.jobName}.d4.${transformerConfig.transformFailedCount}") should be(1)
+ }
+
+}
\ No newline at end of file
diff --git a/pipeline/pipeline-merged/pom.xml b/pipeline/unified-pipeline/pom.xml
similarity index 87%
rename from pipeline/pipeline-merged/pom.xml
rename to pipeline/unified-pipeline/pom.xml
index f3db71fe..33ef14b9 100644
--- a/pipeline/pipeline-merged/pom.xml
+++ b/pipeline/unified-pipeline/pom.xml
@@ -12,12 +12,12 @@
org.sunbird.obsrv.pipeline
- pipeline-merged
+ unified-pipeline
1.0.0
jar
- Merged Pipeline
+ Unified Pipeline
- Entire pipeline merged into a single processing job
+ Entire pipeline merged into a single processing job
@@ -64,39 +64,9 @@
org.sunbird.obsrv.pipeline
- druid-router
+ dataset-router
1.0.0
-
- com.github.java-json-tools
- json-schema-validator
- 2.2.14
-
-
- joda-time
- joda-time
-
-
- com.fasterxml.jackson.core
- jackson-databind
-
-
- com.google.guava
- guava
-
-
-
-
- com.google.guava
- guava
- 32.1.2-jre
-
-
- org.apache.kafka
- kafka-clients
- ${kafka.version}
- test
-
org.apache.kafka
kafka_${scala.maj.version}
@@ -173,7 +143,6 @@
2.0.3
test
-
@@ -220,7 +189,7 @@
- org.sunbird.obsrv.pipeline.task.MergedPipelineStreamTask
+ org.sunbird.obsrv.pipeline.task.UnifiedPipelineStreamTask
diff --git a/pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf b/pipeline/unified-pipeline/src/main/resources/unified-pipeline.conf
similarity index 89%
rename from pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf
rename to pipeline/unified-pipeline/src/main/resources/unified-pipeline.conf
index 75f43376..9b1e1bdf 100644
--- a/pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf
+++ b/pipeline/unified-pipeline/src/main/resources/unified-pipeline.conf
@@ -12,8 +12,9 @@ kafka {
output.denorm.topic = ${job.env}".denorm"
output.denorm.failed.topic = ${job.env}".failed"
output.transform.topic = ${job.env}".transform"
+ output.transform.failed.topic = ${job.env}".failed"
stats.topic = ${job.env}".stats"
- groupId = ${job.env}"-single-pipeline-group"
+ groupId = ${job.env}"-unified-pipeline-group"
producer {
max-request-size = 5242880
}
diff --git a/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineConfig.scala b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineConfig.scala
new file mode 100644
index 00000000..75322bc2
--- /dev/null
+++ b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineConfig.scala
@@ -0,0 +1,20 @@
+package org.sunbird.obsrv.pipeline.task
+
+import com.typesafe.config.Config
+import org.apache.flink.api.common.typeinfo.TypeInformation
+import org.apache.flink.api.java.typeutils.TypeExtractor
+import org.apache.flink.streaming.api.scala.OutputTag
+import org.sunbird.obsrv.core.streaming.BaseJobConfig
+
+import scala.collection.mutable
+
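+// Flink job configuration for the unified pipeline: exposes the input Kafka topic,
+// the consumer id and the success/failed output tags used by the job.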
+class UnifiedPipelineConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "UnifiedPipelineJob") {
+
+ private val serialVersionUID = 2905979434303791379L
+ implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]])
+
+ override def inputTopic(): String = config.getString("kafka.input.topic")
+ override def inputConsumer(): String = "pipeline-consumer"
+ override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("processing_stats")
+ override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events")
+}
diff --git a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala
similarity index 70%
rename from pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala
rename to pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala
index f7d8dce9..f24bb256 100644
--- a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala
+++ b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala
@@ -9,7 +9,7 @@ import org.sunbird.obsrv.core.util.FlinkUtil
import org.sunbird.obsrv.denormalizer.task.{DenormalizerConfig, DenormalizerStreamTask}
import org.sunbird.obsrv.extractor.task.{ExtractorConfig, ExtractorStreamTask}
import org.sunbird.obsrv.preprocessor.task.{PipelinePreprocessorConfig, PipelinePreprocessorStreamTask}
-import org.sunbird.obsrv.router.task.{DruidRouterConfig, DynamicRouterStreamTask}
+import org.sunbird.obsrv.router.task.{DynamicRouterConfig, DynamicRouterStreamTask}
import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask}
import java.io.File
@@ -19,26 +19,21 @@ import scala.collection.mutable
* Druid Router stream task routes every event into its respective topic configured at dataset level
*/
-class MergedPipelineStreamTask(config: Config, mergedPipelineConfig: MergedPipelineConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] {
+class UnifiedPipelineStreamTask(config: Config, pipelineConfig: UnifiedPipelineConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] {
private val serialVersionUID = 146697324640926024L
// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster
def process(): Unit = {
- implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(mergedPipelineConfig)
+ implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(pipelineConfig)
process(env)
- env.execute(mergedPipelineConfig.jobName)
+ env.execute(pipelineConfig.jobName)
}
// $COVERAGE-ON$
- /**
- * Created an overloaded process function to enable unit testing
- * @param env StreamExecutionEnvironment
- */
def process(env: StreamExecutionEnvironment): Unit = {
-
- val dataStream = getMapDataStream(env, mergedPipelineConfig, kafkaConnector)
+ val dataStream = getMapDataStream(env, pipelineConfig, kafkaConnector)
processStream(dataStream)
}
@@ -48,7 +43,7 @@ class MergedPipelineStreamTask(config: Config, mergedPipelineConfig: MergedPipel
val preprocessorTask = new PipelinePreprocessorStreamTask(new PipelinePreprocessorConfig(config), kafkaConnector)
val denormalizerTask = new DenormalizerStreamTask(new DenormalizerConfig(config), kafkaConnector)
val transformerTask = new TransformerStreamTask(new TransformerConfig(config), kafkaConnector)
- val routerTask = new DynamicRouterStreamTask(new DruidRouterConfig(config), kafkaConnector)
+ val routerTask = new DynamicRouterStreamTask(new DynamicRouterConfig(config), kafkaConnector)
routerTask.processStream(
transformerTask.processStream(
@@ -63,18 +58,17 @@ class MergedPipelineStreamTask(config: Config, mergedPipelineConfig: MergedPipel
}
// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster
-object MergedPipelineStreamTask {
+object UnifiedPipelineStreamTask {
def main(args: Array[String]): Unit = {
val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path"))
val config = configFilePath.map {
path => ConfigFactory.parseFile(new File(path)).resolve()
- }.getOrElse(ConfigFactory.load("merged-pipeline.conf").withFallback(ConfigFactory.systemEnvironment()))
- val mergedPipelineConfig = new MergedPipelineConfig(config)
- val kafkaUtil = new FlinkKafkaConnector(mergedPipelineConfig)
- val task = new MergedPipelineStreamTask(config, mergedPipelineConfig, kafkaUtil)
+ }.getOrElse(ConfigFactory.load("unified-pipeline.conf").withFallback(ConfigFactory.systemEnvironment()))
+ val pipelineConfig = new UnifiedPipelineConfig(config)
+ val kafkaUtil = new FlinkKafkaConnector(pipelineConfig)
+ val task = new UnifiedPipelineStreamTask(config, pipelineConfig, kafkaUtil)
task.process()
}
}
-
// $COVERAGE-ON$
\ No newline at end of file
diff --git a/pipeline/unified-pipeline/src/test/resources/base-config.conf b/pipeline/unified-pipeline/src/test/resources/base-config.conf
new file mode 100644
index 00000000..3ade36f7
--- /dev/null
+++ b/pipeline/unified-pipeline/src/test/resources/base-config.conf
@@ -0,0 +1,8 @@
+postgres {
+ host = localhost
+ port = 5432
+ maxConnections = 2
+ user = "postgres"
+ password = "postgres"
+ database="postgres"
+}
\ No newline at end of file
diff --git a/pipeline/pipeline-merged/src/test/resources/test.conf b/pipeline/unified-pipeline/src/test/resources/test.conf
similarity index 93%
rename from pipeline/pipeline-merged/src/test/resources/test.conf
rename to pipeline/unified-pipeline/src/test/resources/test.conf
index d2b959c3..aa514d54 100644
--- a/pipeline/pipeline-merged/src/test/resources/test.conf
+++ b/pipeline/unified-pipeline/src/test/resources/test.conf
@@ -16,6 +16,7 @@ kafka {
output.denorm.topic = ${job.env}".denorm"
output.denorm.failed.topic = ${job.env}".failed"
output.transform.topic = ${job.env}".transform"
+ output.transform.failed.topic = ${job.env}".transform.failed"
stats.topic = ${job.env}".stats"
groupId = ${job.env}"-single-pipeline-group"
producer {
@@ -38,4 +39,4 @@ redis {
preprocessor.duplication.store.id = 2
key.expiry.seconds = 3600
}
-}
+}
\ No newline at end of file
diff --git a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/EventFixture.scala
similarity index 98%
rename from pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala
rename to pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/EventFixture.scala
index dee90323..a5e623b6 100644
--- a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala
+++ b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/EventFixture.scala
@@ -1,4 +1,4 @@
-package org.sunbird.obsrv.fixture
+package org.sunbird.obsrv.pipeline
object EventFixture {
@@ -11,6 +11,4 @@ object EventFixture {
val VALID_BATCH_EVENT_D2 = """{"dataset":"d2","id":"event4","event":{"id":"4567","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}"""
val INVALID_BATCH_EVENT_D2 = """{"dataset":"d2","id":"event5","event1":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}"""
-
-
-}
+}
\ No newline at end of file
diff --git a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala
similarity index 81%
rename from pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala
rename to pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala
index f3cf86b2..879abeec 100644
--- a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala
+++ b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala
@@ -11,18 +11,15 @@ import org.sunbird.obsrv.BaseMetricsReporter
import org.sunbird.obsrv.core.cache.RedisConnect
import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector
import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil}
-import org.sunbird.obsrv.extractor.task.ExtractorConfig
-import org.sunbird.obsrv.fixture.EventFixture
-import org.sunbird.obsrv.pipeline.task.{MergedPipelineConfig, MergedPipelineStreamTask}
+import org.sunbird.obsrv.pipeline.task.{UnifiedPipelineConfig, UnifiedPipelineStreamTask}
import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry
-import org.sunbird.obsrv.transformer.task.TransformerConfig
import scala.collection.mutable
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future
import scala.concurrent.duration._
-class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
+class UnifiedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder()
.setConfiguration(testConfiguration())
@@ -30,8 +27,8 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
.setNumberTaskManagers(1)
.build)
- val mergedPipelineConfig = new MergedPipelineConfig(config)
- val kafkaConnector = new FlinkKafkaConnector(mergedPipelineConfig)
+ val unifiedPipelineConfig = new UnifiedPipelineConfig(config)
+ val kafkaConnector = new FlinkKafkaConnector(unifiedPipelineConfig)
val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group")
implicit val embeddedKafkaConfig: EmbeddedKafkaConfig =
EmbeddedKafkaConfig(
@@ -65,7 +62,7 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
}
override def afterAll(): Unit = {
- val redisConnection = new RedisConnect(mergedPipelineConfig.redisHost, mergedPipelineConfig.redisPort, mergedPipelineConfig.redisConnectionTimeout)
+ val redisConnection = new RedisConnect(unifiedPipelineConfig.redisHost, unifiedPipelineConfig.redisPort, unifiedPipelineConfig.redisConnectionTimeout)
redisConnection.getConnection(config.getInt("redis.database.extractor.duplication.store.id")).flushAll()
redisConnection.getConnection(config.getInt("redis.database.preprocessor.duplication.store.id")).flushAll()
super.afterAll()
@@ -83,20 +80,20 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
).foreach(EmbeddedKafka.createCustomTopic(_))
}
- "MergedPipelineStreamTaskTestSpec" should "validate the entire pipeline" in {
+ "UnifiedPipelineStreamTaskTestSpec" should "validate the entire pipeline" in {
- implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(mergedPipelineConfig)
- val task = new MergedPipelineStreamTask(config, mergedPipelineConfig, kafkaConnector)
+ implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(unifiedPipelineConfig)
+ val task = new UnifiedPipelineStreamTask(config, unifiedPipelineConfig, kafkaConnector)
task.process(env)
Future {
- env.execute(mergedPipelineConfig.jobName)
+ env.execute(unifiedPipelineConfig.jobName)
}
try {
val d1Events = EmbeddedKafka.consumeNumberMessagesFrom[String]("d1-events", 1, timeout = 30.seconds)
- d1Events.size should be (1)
+ d1Events.size should be(1)
val d2Events = EmbeddedKafka.consumeNumberMessagesFrom[String]("d2-events", 1, timeout = 30.seconds)
- d2Events.size should be (1)
+ d2Events.size should be(1)
} catch {
case ex: Exception => ex.printStackTrace()
}
@@ -109,7 +106,7 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
val mutableMetricsMap = mutable.Map[String, Long]();
BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2))
- Console.println("### MergedPipelineStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap)))
+ Console.println("### UnifiedPipelineStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap)))
mutableMetricsMap("ExtractorJob.d1.extractor-total-count") should be(4)
mutableMetricsMap("ExtractorJob.d1.extractor-duplicate-count") should be(1)
@@ -144,16 +141,8 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry {
mutableMetricsMap("DruidRouterJob.d2.router-total-count") should be(1)
mutableMetricsMap("DruidRouterJob.d2.router-success-count") should be(1)
- val extractorConfig = new ExtractorConfig(config)
- extractorConfig.inputTopic() should be (config.getString("kafka.input.topic"))
- extractorConfig.inputConsumer() should be ("extractor-consumer")
-
- val transformerConfig = new TransformerConfig(config)
- transformerConfig.inputTopic() should be(config.getString("kafka.input.topic"))
- transformerConfig.inputConsumer() should be("transformer-consumer")
-
- mergedPipelineConfig.successTag().getId should be ("processing_stats")
- mergedPipelineConfig.failedEventsOutputTag().getId should be ("failed-events")
+ unifiedPipelineConfig.successTag().getId should be("processing_stats")
+ unifiedPipelineConfig.failedEventsOutputTag().getId should be("failed-events")
}
}
diff --git a/pom.xml b/pom.xml
index c8f53bd8..4ecdc676 100644
--- a/pom.xml
+++ b/pom.xml
@@ -18,6 +18,7 @@
framework
dataset-registry
+ transformation-sdk
pipeline
data-products
diff --git a/transformation-sdk/pom.xml b/transformation-sdk/pom.xml
new file mode 100644
index 00000000..10d393ce
--- /dev/null
+++ b/transformation-sdk/pom.xml
@@ -0,0 +1,180 @@
+
+
+ 4.0.0
+ transformation-sdk
+ org.sunbird.obsrv
+ 1.0.0
+ jar
+ Obsrv Transformation Library as an SDK
+
+ UTF-8
+ UTF-8
+ 2.12
+ 2.12.11
+ 1.15.2
+ 2.8.1
+ 11
+ 1.9.13
+ 1.4.0
+ 2.14.1
+
+
+
+ org.sunbird.obsrv
+ dataset-registry
+ 1.0.0
+
+
+ org.json4s
+ json4s-native_${scala.maj.version}
+ 4.0.6
+
+
+ com.ibm.jsonata4java
+ JSONata4Java
+ 2.2.6
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+
+
+
+
+ com.github.bancolombia
+ data-mask-core
+ 1.0.1
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+
+
+
+
+ org.scalatest
+ scalatest_2.12
+ 3.0.6
+ test
+
+
+ org.mockito
+ mockito-core
+ 3.3.3
+ test
+
+
+ org.sunbird.obsrv
+ framework
+ 1.0.0
+ test-jar
+ test
+
+
+ org.sunbird.obsrv
+ dataset-registry
+ 1.0.0
+ test-jar
+ test
+
+
+
+
+ src/main/scala
+ src/test/scala
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.8.1
+
+ 11
+
+
+
+ maven-surefire-plugin
+ 2.20
+
+ true
+
+
+
+
+ org.scalatest
+ scalatest-maven-plugin
+ 1.0
+
+ ${project.build.directory}/surefire-reports
+ .
+ dp-core-testsuite.txt
+
+
+
+ test
+
+ test
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+ 3.2.0
+
+
+
+ test-jar
+
+
+
+
+
+
+ org.scoverage
+ scoverage-maven-plugin
+ ${scoverage.plugin.version}
+
+ ${scala.version}
+ true
+ true
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+ 4.4.0
+
+
+ ${java.target.runtime}
+ ${scala.version}
+ false
+
+
+
+ scala-compile-first
+ process-resources
+
+ add-source
+ compile
+
+
+
+ scala-test-compile
+ process-test-resources
+
+ testCompile
+
+
+
+
+
+
+
+
+
diff --git a/transformation-sdk/src/main/resources/transformation-sdk.conf b/transformation-sdk/src/main/resources/transformation-sdk.conf
new file mode 100644
index 00000000..e69de29b
diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala
new file mode 100644
index 00000000..125872d0
--- /dev/null
+++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala
@@ -0,0 +1,46 @@
+package org.sunbird.obsrv.transformer.types
+
+import com.fasterxml.jackson.databind.JsonNode
+import com.fasterxml.jackson.databind.node.MissingNode
+import org.json4s.{DefaultFormats, Formats, JValue, MappingException}
+import org.slf4j.LoggerFactory
+import org.sunbird.obsrv.core.model.ErrorConstants
+import org.sunbird.obsrv.core.util.JSONUtil
+import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation
+import org.sunbird.obsrv.transformer.util.CipherUtil
+
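+// Encrypts the value at the configured source path using CipherUtil (AES) and writes it to the
+// target field key; a missing source field is reported as TRANSFORMATION_FIELD_MISSING.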
+class EncryptTransformer extends ITransformer[String] {
+
+ implicit val jsonFormats: Formats = DefaultFormats.withLong
+ private val logger = LoggerFactory.getLogger(classOf[EncryptTransformer])
+
+ implicit class JsonHelper(json: JValue) {
+ def customExtract[T](path: String)(implicit mf: Manifest[T]): T = {
+ path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T]
+ }
+ }
+
+ override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) = {
+ val emptyNode = getJSON(dt.fieldKey, MissingNode.getInstance())
+ try {
+ val currentValue = json.customExtract[String](dt.transformationFunction.expr)
+ val encryptedValue = CipherUtil.encrypt(currentValue)
+ (getJSON(dt.fieldKey, encryptedValue), TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = true, dt.mode.get))
+ } catch {
+ case ex: MappingException =>
+ logger.error(s"Transformer(Encrypt) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex.getMessage}", ex)
+ (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.TRANSFORMATION_FIELD_MISSING)))
+ }
+ }
+
+}
+
+object EncryptTransformer {
+
+ private val encryptTransformer = new EncryptTransformer()
+
+ def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetTransformation]): TransformationResult = {
+ encryptTransformer.transform(json, jsonNode, dtList)
+ }
+
+}
\ No newline at end of file
diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala
new file mode 100644
index 00000000..7fee60ca
--- /dev/null
+++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala
@@ -0,0 +1,62 @@
+package org.sunbird.obsrv.transformer.types
+
+import com.fasterxml.jackson.databind.JsonNode
+import com.fasterxml.jackson.databind.node.JsonNodeType
+import org.json4s.native.JsonMethods.parse
+import org.json4s.{JNothing, JObject, JValue}
+import org.sunbird.obsrv.core.model.ErrorConstants.Error
+import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation
+import org.sunbird.obsrv.model.TransformMode.TransformMode
+import org.sunbird.obsrv.transformer.util.ConditionEvaluator
+
+import scala.collection.mutable.ListBuffer
+
+case class TransformFieldStatus(fieldKey: String, expr: String, success: Boolean, mode: TransformMode, error: Option[Error] = None)
+case class TransformationResult(json: JValue, fieldStatus: List[TransformFieldStatus])
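+// Template for all field transformers: evaluates the optional pre-condition for each
+// transformation, delegates to transformField for passing fields and merges the resulting JSON fragments.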
+abstract class ITransformer[T] {
+
+ def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus)
+
+ def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetTransformation]): TransformationResult = {
+ val resultBuffer = ListBuffer[TransformFieldStatus]()
+ val evalList = dtList.map(dt => {
+ val conditionStatus = ConditionEvaluator.evalCondition(dt.datasetId, jsonNode, dt.transformationFunction.condition, dt.mode)
+ if (!conditionStatus.success) {
+ resultBuffer.append(TransformFieldStatus(dt.fieldKey, conditionStatus.expr, success = false, dt.mode.get, conditionStatus.error))
+ JObject(dt.fieldKey -> JNothing)
+ } else {
+ val result = transformField(json, jsonNode, dt)
+ resultBuffer.append(result._2)
+ result._1
+ }
+ })
+ val transformedJson = evalList.reduceLeftOption((a, b) => a merge b).getOrElse(JNothing)
+ TransformationResult(transformedJson, resultBuffer.toList)
+ }
+
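+ // Builds a nested JSON object from a dot-separated key path,
+ // e.g. getJSON("event.key", "x") yields {"event":{"key":"x"}}.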
+ def getJSON(key: String, value: String): JValue = {
+ val path = key.split('.').toList ++ List(s""""$value"""")
+ val outPath = path.reduceRight((a, b) => s"""{"$a":$b}""")
+ parse(outPath, useBigIntForLong = false)
+ }
+
+ def getJSON(key: String, value: AnyRef): JValue = {
+ val path = key.split('.').toList ++ List(s"""$value""")
+ val outPath = path.reduceRight((a, b) => s"""{"$a":$b}""")
+ parse(outPath, useBigIntForLong = false)
+ }
+
+ def getJSON(key: String, value: JsonNode): JValue = {
+ Option(value).map { jsonNodeValue =>
+ jsonNodeValue.getNodeType match {
+ case JsonNodeType.STRING => getJSON(key, jsonNodeValue.textValue())
+ case JsonNodeType.NUMBER => getJSON(key, jsonNodeValue.numberValue().asInstanceOf[AnyRef])
+ case JsonNodeType.BOOLEAN => getJSON(key, jsonNodeValue.booleanValue().asInstanceOf[AnyRef])
+ case JsonNodeType.ARRAY => getJSON(key, jsonNodeValue.toString.asInstanceOf[AnyRef])
+ case JsonNodeType.OBJECT => getJSON(key, jsonNodeValue.toString.asInstanceOf[AnyRef])
+ case _ => getJSON(key, null.asInstanceOf[AnyRef])
+ }
+ }.getOrElse(getJSON(key, null.asInstanceOf[AnyRef]))
+ }
+
+}
\ No newline at end of file
diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala
new file mode 100644
index 00000000..d6a55b54
--- /dev/null
+++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala
@@ -0,0 +1,67 @@
+package org.sunbird.obsrv.transformer.types
+
+import com.api.jsonata4java.expressions.{EvaluateException, Expressions, ParseException}
+import com.fasterxml.jackson.databind.JsonNode
+import com.fasterxml.jackson.databind.node.MissingNode
+import org.json4s.JValue
+import org.slf4j.LoggerFactory
+import org.sunbird.obsrv.core.model.ErrorConstants
+import org.sunbird.obsrv.core.util.JSONUtil
+import org.sunbird.obsrv.model.DatasetModels
+import org.sunbird.obsrv.model.DatasetModels.TransformationFunction
+
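+// Evaluates a JSONata expression against the event and writes the result to the target field key.
+// Parse and evaluation failures map to INVALID_EXPR_FUNCTION and ERR_EVAL_EXPR_FUNCTION respectively.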
+class JSONAtaTransformer extends ITransformer[String] {
+
+ private val logger = LoggerFactory.getLogger(classOf[JSONAtaTransformer])
+
+ override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetModels.DatasetTransformation): (JValue, TransformFieldStatus) = {
+ val emptyNode = getJSON(dt.fieldKey, MissingNode.getInstance())
+ try {
+ val expr = Expressions.parse(dt.transformationFunction.expr)
+ val resNode = expr.evaluate(jsonNode)
+ (Option(resNode).map { node => getJSON(dt.fieldKey, node) }.getOrElse(emptyNode), TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = true, dt.mode.get))
+ } catch {
+ case ex1: ParseException =>
+ logger.error(s"Transformer(JSONATA) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex1.getMessage}", ex1)
+ (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.INVALID_EXPR_FUNCTION)))
+ case ex2: EvaluateException =>
+ logger.error(s"Transformer(JSONATA) | Exception evaluating transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex2.getMessage}", ex2)
+ (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.ERR_EVAL_EXPR_FUNCTION)))
+ case ex3: Exception =>
+ logger.error(s"Transformer(JSONATA) | Unknown error | Data=${JSONUtil.serialize(dt)} | error=${ex3.getMessage}", ex3)
+ (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION)))
+ }
+ }
+
+ def evaluate(jsonNode: JsonNode, tf: TransformationFunction): JsonNode = {
+
+ try {
+ val expr = Expressions.parse(tf.expr)
+ expr.evaluate(jsonNode)
+ } catch {
+ case ex1: ParseException =>
+ logger.error(s"Transformer(JSONATA) | Exception parsing transformation expression | Data=${JSONUtil.serialize(tf)} | error=${ex1.getMessage}", ex1)
+ MissingNode.getInstance()
+ case ex2: EvaluateException =>
+ logger.error(s"Transformer(JSONATA) | Exception evaluating transformation expression | Data=${JSONUtil.serialize(tf)} | error=${ex2.getMessage}", ex2)
+ MissingNode.getInstance()
+ case ex3: Exception =>
+ logger.error(s"Transformer(JSONATA) | Unknown error | Data=${JSONUtil.serialize(tf)} | error=${ex3.getMessage}", ex3)
+ MissingNode.getInstance()
+ }
+ }
+}
+
+object JSONAtaTransformer {
+
+ private val jsonAtaTransformer = new JSONAtaTransformer()
+
+ def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetModels.DatasetTransformation]): TransformationResult = {
+ jsonAtaTransformer.transform(json, jsonNode, dtList)
+ }
+
+ def evaluate(jsonNode: JsonNode, transformation: TransformationFunction): JsonNode = {
+ jsonAtaTransformer.evaluate(jsonNode, transformation)
+ }
+
+}
\ No newline at end of file
diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala
new file mode 100644
index 00000000..045e224f
--- /dev/null
+++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala
@@ -0,0 +1,63 @@
+package org.sunbird.obsrv.transformer.types
+
+import co.com.bancolombia.datamask.{MaskUtils => CustomMaskUtils}
+import com.fasterxml.jackson.databind.JsonNode
+import com.fasterxml.jackson.databind.node.MissingNode
+import org.json4s.{DefaultFormats, Formats, JValue, MappingException}
+import org.slf4j.LoggerFactory
+import org.sunbird.obsrv.core.model.ErrorConstants
+import org.sunbird.obsrv.core.util.JSONUtil
+import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation
+
+import java.util.regex.Pattern
+
+class MaskTransformer extends ITransformer[String] {
+
+ implicit val jsonFormats: Formats = DefaultFormats.withLong
+ private val logger = LoggerFactory.getLogger(classOf[MaskTransformer])
+
+ implicit class JsonHelper(json: JValue) {
+ def customExtract[T](path: String)(implicit mf: Manifest[T]): T = {
+ path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T]
+ }
+ }
+
+ private val maskRatio = 0.35 // TODO: Move it to a config
+ private val emailPattern = Pattern.compile("^(.+)@(\\S+)$") // TODO: Read the pattern from config
+
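+ // Emails are masked via maskAsEmail; for other values roughly `maskRatio` of the characters
+ // stay visible, split between the leading and trailing ends of the string.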
+ private def mask(value: String): String = {
+ if (value.isEmpty) return value
+ if (emailPattern.matcher(value).matches()) {
+ CustomMaskUtils.maskAsEmail(value)
+ } else {
+ val openDigits = (value.length * maskRatio).ceil
+ val firstDigitCount = (openDigits / 2).floor
+ val lastDigitCount = openDigits - firstDigitCount
+ CustomMaskUtils.mask(value, firstDigitCount.intValue(), lastDigitCount.intValue())
+ }
+ }
+
+ override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) = {
+ val emptyNode = getJSON(dt.fieldKey, MissingNode.getInstance())
+ try {
+ val currentValue = json.customExtract[String](dt.transformationFunction.expr)
+ val maskedValue = mask(currentValue).replaceAll("\"", "")
+ (getJSON(dt.fieldKey, maskedValue), TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = true, dt.mode.get))
+ } catch {
+ case ex: MappingException =>
+ logger.error(s"Transformer(Mask) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex.getMessage}", ex)
+ (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.TRANSFORMATION_FIELD_MISSING)))
+ }
+ }
+
+}
+
+object MaskTransformer {
+
+ private val maskingTransformer = new MaskTransformer()
+
+ def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetTransformation]): TransformationResult = {
+ maskingTransformer.transform(json, jsonNode, dtList)
+ }
+
+}
\ No newline at end of file
diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala
new file mode 100644
index 00000000..58d489f6
--- /dev/null
+++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala
@@ -0,0 +1,36 @@
+package org.sunbird.obsrv.transformer.util
+
+import org.sunbird.obsrv.core.model.SystemConfig
+
+import java.util.Base64
+import javax.crypto.Cipher
+import javax.crypto.spec.SecretKeySpec
+
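+// AES encrypt/decrypt helper keyed by the "encryptionSecretKey" system configuration;
+// ciphertext is Base64-encoded so it can be carried safely inside JSON events.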
+object CipherUtil {
+
+ private val algorithm = "AES"
+
+ private val encryptInstance = getInstance(Cipher.ENCRYPT_MODE)
+
+ private val decryptInstance = getInstance(Cipher.DECRYPT_MODE)
+
+ def encrypt(value: String): String = {
+ if (value.isEmpty) return value
+ val encryptedByteValue = encryptInstance.doFinal(value.getBytes("utf-8"))
+ Base64.getEncoder.encodeToString(encryptedByteValue)
+ }
+
+ def decrypt(value: String): String = {
+ val decryptedValue64 = Base64.getDecoder.decode(value)
+ val decryptedByteValue = decryptInstance.doFinal(decryptedValue64)
+ new String(decryptedByteValue, "utf-8")
+ }
+
+ private def getInstance(mode: Int): Cipher = {
+ val cipher = Cipher.getInstance(algorithm)
+ val key = new SecretKeySpec(SystemConfig.getString("encryptionSecretKey").getBytes("utf-8"), algorithm)
+ cipher.init(mode, key)
+ cipher
+ }
+
+}
\ No newline at end of file
diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala
new file mode 100644
index 00000000..0a892b8f
--- /dev/null
+++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala
@@ -0,0 +1,47 @@
+package org.sunbird.obsrv.transformer.util
+
+import com.api.jsonata4java.expressions.{EvaluateException, Expressions, ParseException}
+import com.fasterxml.jackson.databind.JsonNode
+import org.slf4j.LoggerFactory
+import org.sunbird.obsrv.core.model.ErrorConstants.Error
+import org.sunbird.obsrv.core.model.ErrorConstants
+import org.sunbird.obsrv.core.util.JSONUtil
+import org.sunbird.obsrv.model.DatasetModels.Condition
+import org.sunbird.obsrv.model.TransformMode.TransformMode
+
+case class ConditionStatus(expr: String, success: Boolean, mode: Option[TransformMode] = None, error: Option[Error] = None)
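+// Evaluates the optional transformation pre-condition. Only "jsonata" conditions are supported;
+// any other type resolves to NO_IMPLEMENTATION_FOUND, and an absent condition passes by default.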
+object ConditionEvaluator {
+
+ private val logger = LoggerFactory.getLogger(ConditionEvaluator.getClass)
+
+ def evalCondition(datasetId: String, json: JsonNode, condition: Option[Condition], mode: Option[TransformMode]): ConditionStatus = {
+ if(condition.isDefined) {
+ condition.get.`type` match {
+ case "jsonata" => evalJSONAtaCondition(datasetId, json, condition.get, mode)
+ case _ => ConditionStatus("", success = false, mode, Some(ErrorConstants.NO_IMPLEMENTATION_FOUND))
+ }
+ } else {
+ ConditionStatus("", success = true, mode)
+ }
+ }
+
+ private def evalJSONAtaCondition(datasetId: String, json: JsonNode, condition: Condition, mode: Option[TransformMode]): ConditionStatus = {
+ try {
+ val expr = Expressions.parse(condition.expr)
+ val resultNode = expr.evaluate(json)
+ val result = resultNode.isBoolean && resultNode.asBoolean()
+ ConditionStatus(condition.expr, result, mode)
+ } catch {
+ case ex1: ParseException =>
+ logger.error(s"Transformer(ConditionEvaluator) | Exception parsing condition expression | dataset=$datasetId | ConditionData=${JSONUtil.serialize(condition)} | error=${ex1.getMessage}", ex1)
+ ConditionStatus(condition.expr, success = false, mode, Some(ErrorConstants.INVALID_EXPR_FUNCTION))
+ case ex2: EvaluateException =>
+ logger.error(s"Transformer(ConditionEvaluator) | Exception evaluating condition expression | dataset=$datasetId | ConditionData=${JSONUtil.serialize(condition)} | error=${ex2.getMessage}", ex2)
+ ConditionStatus(condition.expr, success = false, mode, Some(ErrorConstants.ERR_EVAL_EXPR_FUNCTION))
+ case ex3: Exception =>
+ logger.error(s"Transformer(ConditionEvaluator) | Unknown error during condition evaluation | dataset=$datasetId | ConditionData=${JSONUtil.serialize(condition)} | error=${ex3.getMessage}", ex3)
+ ConditionStatus(condition.expr, success = false, mode, Some(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION))
+ }
+ }
+
+}
diff --git a/transformation-sdk/src/test/resources/test.conf b/transformation-sdk/src/test/resources/test.conf
new file mode 100644
index 00000000..e69de29b