diff --git a/.github/workflows/build_and_deploy.yaml b/.github/workflows/build_and_deploy.yaml
index 601c8f5e..41801ec8 100644
--- a/.github/workflows/build_and_deploy.yaml
+++ b/.github/workflows/build_and_deploy.yaml
@@ -27,17 +27,16 @@ jobs:
             target: "transformer-image"
           - image: "druid-router"
             target: "router-image"
-          - image: "merged-pipeline"
-            target: "merged-image"
+          - image: "unified-pipeline"
+            target: "unified-image"
           - image: "master-data-processor"
             target: "master-data-processor-image"
           - image: "lakehouse-connector"
             target: "lakehouse-connector-image"
+          - image: "cache-indexer"
+            target: "cache-indexer-image"
     steps:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
-
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -97,7 +96,7 @@ jobs:
         run: |
           cd deploy/terraform/aws
           terragrunt init
-          terragrunt apply -auto-approve -var merged_pipeline_enabled={{ vars.MERGED_PIPELINE || 'true' }} --replace='module.flink.helm_release.flink' \
+          terragrunt apply -auto-approve -var unified_pipeline_enabled={{ vars.MERGED_PIPELINE || 'true' }} --replace='module.flink.helm_release.flink' \
            -var flink_image_tag=${{ github.ref_name }}

  azure-deploy:
diff --git a/.github/workflows/upload_artifact.yaml b/.github/workflows/upload_artifact.yaml
index 38cb7ec8..07943fd1 100644
--- a/.github/workflows/upload_artifact.yaml
+++ b/.github/workflows/upload_artifact.yaml
@@ -56,7 +56,7 @@ jobs:
           - image: "denormalizer"
           - image: "transformer"
           - image: "druid-router"
-          - image: "pipeline-merged"
+          - image: "unified-pipeline"
           - image: "master-data-processor"
     steps:
       - name: Get Tag Name
diff --git a/Dockerfile b/Dockerfile
index fd4002be..1d5ea6c6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,6 +2,7 @@ FROM --platform=linux/x86_64 maven:3.9.4-eclipse-temurin-11-focal AS build-core
 COPY . /app
 RUN mvn clean install -DskipTests -f /app/framework/pom.xml
 RUN mvn clean install -DskipTests -f /app/dataset-registry/pom.xml
+RUN mvn clean install -DskipTests -f /app/transformation-sdk/pom.xml

 FROM --platform=linux/x86_64 maven:3.9.4-eclipse-temurin-11-focal AS build-pipeline
 COPY --from=build-core /root/.m2 /root/.m2
@@ -28,9 +29,9 @@ FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as rout
 USER flink
 COPY --from=build-pipeline /app/pipeline/druid-router/target/druid-router-1.0.0.jar $FLINK_HOME/lib/

-FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as merged-image
+FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as unified-image
 USER flink
-COPY --from=build-pipeline /app/pipeline/pipeline-merged/target/pipeline-merged-1.0.0.jar $FLINK_HOME/lib/
+COPY --from=build-pipeline /app/pipeline/unified-pipeline/target/unified-pipeline-1.0.0.jar $FLINK_HOME/lib/

 FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as master-data-processor-image
 USER flink
@@ -40,3 +41,7 @@ FROM --platform=linux/x86_64 sanketikahub/flink:1.15.0-scala_2.12-lakehouse as l
 USER flink
 RUN mkdir $FLINK_HOME/custom-lib
 COPY ./pipeline/hudi-connector/target/hudi-connector-1.0.0.jar $FLINK_HOME/custom-lib
+
+FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as cache-indexer-image
+USER flink
+COPY --from=build-pipeline /app/pipeline/cache-indexer/target/cache-indexer-1.0.0.jar $FLINK_HOME/lib
diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala
index 22729aa0..781b916a 100644
--- a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala
+++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala
@@ -69,7 +69,8 @@ object MasterDataProcessorIndexer {
     logger.info(s"createDataFile() | START | dataset=${dataset.id} ")
     import spark.implicits._
     val readWriteConf = ReadWriteConfig(scanCount = config.getInt("redis.scan.count"), maxPipelineSize = config.getInt("redis.max.pipeline.size"))
-    val redisConfig = new RedisConfig(initialHost = RedisEndpoint(host = dataset.datasetConfig.redisDBHost.get, port = dataset.datasetConfig.redisDBPort.get, dbNum = dataset.datasetConfig.redisDB.get))
+    val cacheConfig = dataset.datasetConfig.cacheConfig.get
+    val redisConfig = new RedisConfig(initialHost = RedisEndpoint(host = cacheConfig.redisDBHost.get, port = cacheConfig.redisDBPort.get, dbNum = cacheConfig.redisDB.get))
     val ts: Long = new DateTime(DateTimeZone.UTC).withTimeAtStartOfDay().getMillis
     val rdd = spark.sparkContext.fromRedisKV("*")(redisConfig = redisConfig, readWriteConfig = readWriteConf).map(
       f => CommonUtil.processEvent(f._2, ts)
@@ -83,9 +84,9 @@ object MasterDataProcessorIndexer {
   }

   private def getDatasets(): List[Dataset] = {
-    val datasets: List[Dataset] = DatasetRegistry.getAllDatasets("master-dataset")
+    val datasets: List[Dataset] = DatasetRegistry.getAllDatasets(Some("master"))
     datasets.filter(dataset => {
-      dataset.datasetConfig.indexData.nonEmpty && dataset.datasetConfig.indexData.get && dataset.status == DatasetStatus.Live
+      dataset.datasetConfig.indexingConfig.olapStoreEnabled && dataset.status == DatasetStatus.Live
     })
   }

diff --git a/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala
b/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala index 0d54050e..6ef1458e 100644 --- a/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala +++ b/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala @@ -108,16 +108,16 @@ class MasterDataIndexerSpec extends FlatSpec with BeforeAndAfterAll with Matcher } private def insertTestData(postgresConnect: PostgresConnect) = { - postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md1','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md1','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": 
\"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md1_md1.1_DAY', 'md1', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": 
\"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md1.1_DAY', 'md1.1_DAY');") postgresConnect.execute("insert into dataset_transformations values('tf1', 'md1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Live', 'Strict', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md2','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\":6340, \"index_data\": true, \"redis_db\": 5}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', 'now()', 'now()');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md2','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\":6340, \"index_data\": true, \"redis_db\": 5}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', 'now()', 'now()');") postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md2_md1.1_DAY', 'md2', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": 
\"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md2.1_DAY', 'md2.1_DAY');") - postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md3','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 6}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 
06:44:11.600');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md3','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 6}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md3_md3.1_DAY', 'md3', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": 
\"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md3.1_DAY', 'md3.1_DAY');") postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md3_md3.2_DAY', 'md3', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md3.2_DAY', 'md3.2_DAY');") - postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md5','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, 
\"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 9}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") - postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md4','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": 
\"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md5','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 9}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 
06:44:11.600');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md4','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md4_md4.1_DAY', 'md4', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": 
\"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md4.1_DAY', 'md4.1_DAY');") } diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index 3aebe8bd..8bc1623b 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -4,7 +4,7 @@ import com.fasterxml.jackson.annotation.JsonProperty import com.fasterxml.jackson.core.`type`.TypeReference import com.fasterxml.jackson.module.scala.JsonScalaEnumeration import org.sunbird.obsrv.core.model.SystemConfig -import org.sunbird.obsrv.model.DatasetStatus.DatasetStatus +import org.sunbird.obsrv.model.DatasetStatus.{DatasetStatus, Value} import org.sunbird.obsrv.model.TransformMode.TransformMode import org.sunbird.obsrv.model.ValidationMode.ValidationMode @@ -25,25 +25,42 @@ object DatasetModels { case class ValidationConfig(@JsonProperty("validate") validate: Option[Boolean] = Some(true), @JsonProperty("mode") @JsonScalaEnumeration(classOf[ValidationModeType]) mode: Option[ValidationMode]) - case class DenormFieldConfig(@JsonProperty("denorm_key") denormKey: String, @JsonProperty("redis_db") redisDB: Int, - @JsonProperty("denorm_out_field") denormOutField: String) + case class DenormFieldConfig(@JsonProperty("denorm_key") denormKey: Option[String], @JsonProperty("redis_db") redisDB: Int, + @JsonProperty("denorm_out_field") denormOutField: String, @JsonProperty("jsonata_expr") jsonAtaExpr: Option[String]) case class DenormConfig(@JsonProperty("redis_db_host") redisDBHost: String, @JsonProperty("redis_db_port") redisDBPort: Int, @JsonProperty("denorm_fields") denormFields: List[DenormFieldConfig]) case class RouterConfig(@JsonProperty("topic") topic: String) - case class DatasetConfig(@JsonProperty("data_key") key: String, @JsonProperty("timestamp_key") tsKey: String, @JsonProperty("entry_topic") entryTopic: String, - @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None, @JsonProperty("redis_db_host") redisDBHost: Option[String] = None, - @JsonProperty("redis_db_port") redisDBPort: Option[Int] = None, @JsonProperty("redis_db") redisDB: Option[Int] = None, - @JsonProperty("index_data") indexData: Option[Boolean] = None, @JsonProperty("timestamp_format") tsFormat: Option[String] = None, - @JsonProperty("dataset_tz") datasetTimezone: Option[String] = None) + case class IndexingConfig(@JsonProperty("olap_store_enabled") olapStoreEnabled: Boolean, @JsonProperty("lakehouse_enabled") lakehouseEnabled: Boolean, + @JsonProperty("cache_enabled") cacheEnabled: Boolean) + + 
case class KeysConfig(@JsonProperty("data_key") dataKey: Option[String], @JsonProperty("partition_key") partitionKey: Option[String], + @JsonProperty("timestamp_key") tsKey: Option[String], @JsonProperty("timestamp_format") tsFormat: Option[String]) + + case class CacheConfig(@JsonProperty("redis_db_host") redisDBHost: Option[String], @JsonProperty("redis_db_port") redisDBPort: Option[Int], + @JsonProperty("redis_db") redisDB: Option[Int]) + + case class DatasetConfigV1(@JsonProperty("data_key") key: String, @JsonProperty("timestamp_key") tsKey: String, @JsonProperty("entry_topic") entryTopic: String, + @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None, @JsonProperty("redis_db_host") redisDBHost: Option[String] = None, + @JsonProperty("redis_db_port") redisDBPort: Option[Int] = None, @JsonProperty("redis_db") redisDB: Option[Int] = None, + @JsonProperty("index_data") indexData: Option[Boolean] = None, @JsonProperty("timestamp_format") tsFormat: Option[String] = None, + @JsonProperty("dataset_tz") datasetTimezone: Option[String] = None) + + case class DatasetConfig(@JsonProperty("indexing_config") indexingConfig: IndexingConfig, + @JsonProperty("keys_config") keysConfig: KeysConfig, + @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None, + @JsonProperty("dataset_tz") datasetTimezone: Option[String] = None, + @JsonProperty("cache_config") cacheConfig: Option[CacheConfig] = None) case class Dataset(@JsonProperty("id") id: String, @JsonProperty("type") datasetType: String, @JsonProperty("extraction_config") extractionConfig: Option[ExtractionConfig], @JsonProperty("dedup_config") dedupConfig: Option[DedupConfig], @JsonProperty("validation_config") validationConfig: Option[ValidationConfig], @JsonProperty("data_schema") jsonSchema: Option[String], @JsonProperty("denorm_config") denormConfig: Option[DenormConfig], - @JsonProperty("router_config") routerConfig: RouterConfig, datasetConfig: DatasetConfig, @JsonProperty("status") @JsonScalaEnumeration(classOf[DatasetStatusType]) status: DatasetStatus, - @JsonProperty("tags") tags: Option[Array[String]] = None, @JsonProperty("data_version") dataVersion: Option[Int] = None) + @JsonProperty("router_config") routerConfig: RouterConfig, datasetConfig: DatasetConfig, + @JsonProperty("status") @JsonScalaEnumeration(classOf[DatasetStatusType]) status: DatasetStatus, + @JsonProperty("entry_topic") entryTopic: String, @JsonProperty("tags") tags: Option[Array[String]] = None, + @JsonProperty("data_version") dataVersion: Option[Int] = None, @JsonProperty("api_version") apiVersion: Option[String] = None) case class Condition(@JsonProperty("type") `type`: String, @JsonProperty("expr") expr: String) @@ -51,7 +68,7 @@ object DatasetModels { case class DatasetTransformation(@JsonProperty("id") id: String, @JsonProperty("dataset_id") datasetId: String, @JsonProperty("field_key") fieldKey: String, @JsonProperty("transformation_function") transformationFunction: TransformationFunction, - @JsonProperty("status") status: String, @JsonProperty("mode") @JsonScalaEnumeration(classOf[TransformModeType]) mode: Option[TransformMode] = Some(TransformMode.Strict)) + @JsonProperty("mode") @JsonScalaEnumeration(classOf[TransformModeType]) mode: Option[TransformMode] = Some(TransformMode.Strict)) case class ConnectorConfig(@JsonProperty("kafkaBrokers") kafkaBrokers: String, @JsonProperty("topic") topic: String, @JsonProperty("type") databaseType: String, @JsonProperty("connection") connection: Connection, @JsonProperty("tableName") 
tableName: String, @JsonProperty("databaseName") databaseName: String, @@ -94,4 +111,10 @@ class DatasetStatusType extends TypeReference[DatasetStatus.type] object DatasetStatus extends Enumeration { type DatasetStatus = Value val Draft, Publish, Live, Retired, Purged = Value +} + +class DatasetTypeType extends TypeReference[DatasetType.type] +object DatasetType extends Enumeration { + type DatasetType = Value + val event, transaction, master = Value } \ No newline at end of file diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala index 08921adc..0945fa58 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala @@ -12,9 +12,14 @@ object DatasetRegistry { datasets ++= DatasetRegistryService.readAllDatasets() lazy private val datasetTransformations: Map[String, List[DatasetTransformation]] = DatasetRegistryService.readAllDatasetTransformations() - def getAllDatasets(datasetType: String): List[Dataset] = { + def getAllDatasets(datasetType: Option[String]): List[Dataset] = { val datasetList = DatasetRegistryService.readAllDatasets() - datasetList.filter(f => f._2.datasetType.equals(datasetType)).values.toList + if(datasetType.isDefined) { + datasetList.filter(f => f._2.datasetType.equals(datasetType.get)).values.toList + } else { + datasetList.values.toList + } + } def getDataset(id: String): Option[Dataset] = { @@ -47,8 +52,8 @@ object DatasetRegistry { datasourceList.getOrElse(List()) } - def getDataSetIds(datasetType: String): List[String] = { - datasets.filter(f => f._2.datasetType.equals(datasetType)).keySet.toList + def getDataSetIds(): List[String] = { + datasets.keySet.toList } def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = { diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala index 0b0abe23..a0a90655 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala @@ -6,7 +6,7 @@ import org.sunbird.obsrv.model.DatasetModels._ import org.sunbird.obsrv.model.{DatasetStatus, TransformMode} import java.io.File -import java.sql.{PreparedStatement, ResultSet, Timestamp} +import java.sql.{ResultSet, Timestamp} object DatasetRegistryService { private val configFile = new File("/data/flink/conf/baseconfig.conf") @@ -41,6 +41,21 @@ object DatasetRegistryService { } } + def readDataset(id: String): Option[Dataset] = { + + val postgresConnect = new PostgresConnect(postgresConfig) + try { + val rs = postgresConnect.executeQuery(s"SELECT * FROM datasets where id='$id'") + if (rs.next()) { + Some(parseDataset(rs)) + } else { + None + } + } finally { + postgresConnect.closeConnection() + } + } + def readDataset(id: String): Option[Dataset] = { val postgresConnect = new PostgresConnect(postgresConfig) var preparedStatement: PreparedStatement = null @@ -76,6 +91,20 @@ object DatasetRegistryService { } } + def readDatasetSourceConfig(datasetId: String): Option[List[DatasetSourceConfig]] = { + + val postgresConnect = new PostgresConnect(postgresConfig) + try { + val rs = postgresConnect.executeQuery(s"SELECT * FROM dataset_source_config where dataset_id='$datasetId'") + 
Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { + val datasetSourceConfig = parseDatasetSourceConfig(result) + datasetSourceConfig + }).toList) + } finally { + postgresConnect.closeConnection() + } + } + def readDatasetSourceConfig(datasetId: String): Option[List[DatasetSourceConfig]] = { val postgresConnect = new PostgresConnect(postgresConfig) @@ -112,20 +141,14 @@ object DatasetRegistryService { } def readDatasources(datasetId: String): Option[List[DataSource]] = { + val postgresConnect = new PostgresConnect(postgresConfig) - var preparedStatement: PreparedStatement = null - var resultSet: ResultSet = null try { - val query = "SELECT * FROM datasources WHERE dataset_id = ?" - preparedStatement = postgresConnect.prepareStatement(query) - preparedStatement.setString(1, datasetId) - resultSet = postgresConnect.executeQuery(preparedStatement = preparedStatement) - Option(Iterator.continually((resultSet, resultSet.next)).takeWhile(f => f._2).map(f => f._1).map(result => { + val rs = postgresConnect.executeQuery(s"SELECT * FROM datasources where dataset_id='$datasetId'") + Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { parseDatasource(result) }).toList) } finally { - if (resultSet != null) resultSet.close() - if (preparedStatement != null) preparedStatement.close() postgresConnect.closeConnection() } } @@ -133,24 +156,13 @@ object DatasetRegistryService { def readAllDatasources(): Option[List[DataSource]] = { val postgresConnect = new PostgresConnect(postgresConfig) + var preparedStatement: PreparedStatement = null + val query = "UPDATE datasources SET datasource_ref = ? WHERE datasource = ? AND dataset_id = ?" try { val rs = postgresConnect.executeQuery(s"SELECT * FROM datasources") Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { parseDatasource(result) }).toList) - } - } - - def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = { - val postgresConnect = new PostgresConnect(postgresConfig) - var preparedStatement: PreparedStatement = null - val query = "UPDATE datasources SET datasource_ref = ? WHERE datasource = ? AND dataset_id = ?" 
- try { - preparedStatement = postgresConnect.prepareStatement(query) - preparedStatement.setString(1, datasourceRef) - preparedStatement.setString(2, datasource.datasource) - preparedStatement.setString(3, datasource.datasetId) - postgresConnect.executeUpdate(preparedStatement) } finally { if (preparedStatement != null) preparedStatement.close() postgresConnect.closeConnection() @@ -174,6 +186,37 @@ object DatasetRegistryService { } } + def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = { + val query = s"UPDATE datasources set datasource_ref = '$datasourceRef' where datasource='${datasource.datasource}' and dataset_id='${datasource.datasetId}'" + updateRegistry(query) + } + + def updateConnectorStats(id: String, lastFetchTimestamp: Timestamp, records: Long): Int = { + val query = s"UPDATE dataset_source_config SET connector_stats = coalesce(connector_stats, '{}')::jsonb || " + + s"jsonb_build_object('records', COALESCE(connector_stats->>'records', '0')::int + '$records'::int) || " + + s"jsonb_build_object('last_fetch_timestamp', '$lastFetchTimestamp'::timestamp) || " + + s"jsonb_build_object('last_run_timestamp', '${new Timestamp(System.currentTimeMillis())}'::timestamp) WHERE id = '$id';" + updateRegistry(query) + } + + def updateConnectorDisconnections(id: String, disconnections: Int): Int = { + val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(coalesce(connector_stats, '{}')::jsonb, '{disconnections}','$disconnections') WHERE id = '$id'" + updateRegistry(query) + } + + def updateConnectorAvgBatchReadTime(id: String, avgReadTime: Long): Int = { + val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(coalesce(connector_stats, '{}')::jsonb, '{avg_batch_read_time}','$avgReadTime') WHERE id = '$id'" + updateRegistry(query) + } + + private def updateRegistry(query: String): Int = { + val postgresConnect = new PostgresConnect(postgresConfig) + try { + postgresConnect.executeUpdate(query) + } finally { + postgresConnect.closeConnection() + } + } def updateConnectorDisconnections(id: String, disconnections: Int): Int = { val postgresConnect = new PostgresConnect(postgresConfig) @@ -214,11 +257,25 @@ object DatasetRegistryService { val jsonSchema = rs.getString("data_schema") val denormConfig = rs.getString("denorm_config") val routerConfig = rs.getString("router_config") - val datasetConfig = rs.getString("dataset_config") + val datasetConfigStr = rs.getString("dataset_config") val status = rs.getString("status") val tagArray = rs.getArray("tags") val tags = if (tagArray != null) tagArray.getArray.asInstanceOf[Array[String]] else null val dataVersion = rs.getInt("data_version") + val apiVersion = rs.getString("api_version") + val entryTopic = rs.getString("entry_topic") + + val datasetConfig: DatasetConfig = if ("v2".equalsIgnoreCase(apiVersion)) { + JSONUtil.deserialize[DatasetConfig](datasetConfigStr) + } else { + val v1Config = JSONUtil.deserialize[DatasetConfigV1](datasetConfigStr) + DatasetConfig( + indexingConfig = IndexingConfig(olapStoreEnabled = true, lakehouseEnabled = false, cacheEnabled = if ("master".equalsIgnoreCase(datasetType)) true else false), + keysConfig = KeysConfig(dataKey = Some(v1Config.key), None, tsKey = Some(v1Config.tsKey), None), + excludeFields = v1Config.excludeFields, datasetTimezone = v1Config.datasetTimezone, + cacheConfig = Some(CacheConfig(redisDBHost = v1Config.redisDBHost, redisDBPort = v1Config.redisDBPort, redisDB = v1Config.redisDB)) + ) + } Dataset(datasetId, datasetType, if 
(extractionConfig == null) None else Some(JSONUtil.deserialize[ExtractionConfig](extractionConfig)), @@ -227,10 +284,12 @@ object DatasetRegistryService { Option(jsonSchema), if (denormConfig == null) None else Some(JSONUtil.deserialize[DenormConfig](denormConfig)), JSONUtil.deserialize[RouterConfig](routerConfig), - JSONUtil.deserialize[DatasetConfig](datasetConfig), + datasetConfig, DatasetStatus.withName(status), + entryTopic, Option(tags), - Option(dataVersion) + Option(dataVersion), + Option(apiVersion) ) } @@ -265,10 +324,9 @@ object DatasetRegistryService { val datasetId = rs.getString("dataset_id") val fieldKey = rs.getString("field_key") val transformationFunction = rs.getString("transformation_function") - val status = rs.getString("status") val mode = rs.getString("mode") - DatasetTransformation(id, datasetId, fieldKey, JSONUtil.deserialize[TransformationFunction](transformationFunction), status, Some(if (mode != null) TransformMode.withName(mode) else TransformMode.Strict)) + DatasetTransformation(id, datasetId, fieldKey, JSONUtil.deserialize[TransformationFunction](transformationFunction), Some(if (mode != null) TransformMode.withName(mode) else TransformMode.Strict)) } } \ No newline at end of file diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala index 4e992eba..9c454ec0 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala @@ -28,11 +28,11 @@ trait SystemEventHandler { } private def getTime(timespans: Map[String, AnyRef], producer: Producer): Option[Long] = { - timespans.get(producer.toString).map(f => f.asInstanceOf[Long]) + timespans.get(producer.toString).map(f => f.asInstanceOf[Number].longValue()) } private def getStat(obsrvMeta: Map[String, AnyRef], stat: Stats): Option[Long] = { - obsrvMeta.get(stat.toString).map(f => f.asInstanceOf[Long]) + obsrvMeta.get(stat.toString).map(f => f.asInstanceOf[Number].longValue()) } def getError(error: ErrorConstants.Error, producer: Producer, functionalError: FunctionalError): Option[ErrorLog] = { @@ -74,7 +74,7 @@ abstract class BaseDatasetProcessFunction(config: BaseJobConfig[mutable.Map[Stri override def getMetricsList(): MetricsList = { val metrics = getMetrics() ++ List(config.eventFailedMetricsCount) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + MetricsList(DatasetRegistry.getDataSetIds(), metrics) } private def initMetrics(datasetId: String): Unit = { @@ -138,7 +138,7 @@ abstract class BaseDatasetWindowProcessFunction(config: BaseJobConfig[mutable.Ma override def getMetricsList(): MetricsList = { val metrics = getMetrics() ++ List(config.eventFailedMetricsCount) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + MetricsList(DatasetRegistry.getDataSetIds(), metrics) } private def initMetrics(datasetId: String): Unit = { diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala index 1b3edea0..53a40ddd 100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala @@ -35,18 +35,18 @@ class 
BaseSpecWithDatasetRegistry extends BaseSpecWithPostgres { private def createSchema(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasets ( id text PRIMARY KEY, type text NOT NULL, validation_config json, extraction_config json, dedup_config json, data_schema json, denorm_config json, router_config json NOT NULL, dataset_config json NOT NULL, status text NOT NULL, tags text[], data_version INT, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL );") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasets ( id text PRIMARY KEY, type text NOT NULL, validation_config json, extraction_config json, dedup_config json, data_schema json, denorm_config json, router_config json NOT NULL, dataset_config json NOT NULL, status text NOT NULL, tags text[], data_version INT, api_version VARCHAR(255) NOT NULL, entry_topic TEXT NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL );") postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), type text NOT NULL, ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") - postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function json NOT NULL, status text NOT NULL, mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function json NOT NULL, mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );") postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_source_config ( id text PRIMARY KEY, dataset_id text NOT NULL REFERENCES datasets (id), connector_type text NOT NULL, connector_config json NOT NULL, status text NOT NULL, connector_stats json, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(connector_type, dataset_id) );") } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d1', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test 
Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d1', 'event', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'v1', 'ingest', 'System', 'System', now(), now());") postgresConnect.execute("update datasets set denorm_config = '{\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"denorm_fields\":[{\"denorm_key\":\"vehicleCode\",\"redis_db\":2,\"denorm_out_field\":\"vehicleData\"}]}' where id='d1';") - postgresConnect.execute("insert into dataset_transformations values('tf1', 'd1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Live', 'Strict', 'System', 'System', now(), now());") - postgresConnect.execute("insert into dataset_transformations values('tf2', 'd1', 'dealer.maskedPhone', '{\"type\":\"mask\",\"expr\": \"dealer.phone\"}', 'Live', null, 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, 
created_by, updated_by, created_date, updated_date, tags) values ('d2', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into dataset_transformations values('tf1', 'd1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Strict', 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf2', 'd1', 'dealer.maskedPhone', '{\"type\":\"mask\",\"expr\": \"dealer.phone\"}', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d2', 'event', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") } def getPrintableMetrics(metricsMap: mutable.Map[String, Long]): Map[String, Map[String, Map[String, Long]]] = { diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala index 3d83552d..dcdcf402 100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala @@ -23,7 +23,7 @@ class TestDatasetRegistrySpec extends BaseSpecWithDatasetRegistry with Matchers d2Opt.get.denormConfig should be(None) val postgresConnect = new PostgresConnect(postgresConfig) - postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', 
'{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'event', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") postgresConnect.closeConnection() val d3Opt = DatasetRegistry.getDataset("d3") @@ -34,14 +34,14 @@ class TestDatasetRegistrySpec extends BaseSpecWithDatasetRegistry with Matchers val d4Opt = DatasetRegistry.getDataset("d4") d4Opt should be (None) - val allDatasets = DatasetRegistry.getAllDatasets("dataset") + val allDatasets = DatasetRegistry.getAllDatasets(Some("event")) allDatasets.size should be(3) val d1Tfs = DatasetRegistry.getDatasetTransformations("d1") d1Tfs should not be None d1Tfs.get.size should be(2) - val ids = DatasetRegistry.getDataSetIds("dataset").sortBy(f => f) + val ids = DatasetRegistry.getDataSetIds().sortBy(f => f) ids.head should be("d1") ids.apply(1) should be("d2") ids.apply(2) should be("d3") diff --git a/framework/pom.xml b/framework/pom.xml index 52ced63f..263a52a7 100644 --- a/framework/pom.xml +++ b/framework/pom.xml @@ -44,7 +44,7 @@ org.apache.httpcomponents httpclient - 4.5.1 + 4.5.13 com.google.code.gson @@ -98,7 +98,7 @@ junit junit - 4.12 + 4.13.1 test @@ -144,12 +144,6 @@ 1.0.0 test - - org.cassandraunit - cassandra-unit - 3.11.2.0 - test - diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala index 466552dd..c20bb925 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala +++ 
b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala @@ -16,5 +16,4 @@ object Constants { val TOPIC = "topic" val MESSAGE = "message" val DATALAKE_TYPE = "datalake" - val MASTER_DATASET_TYPE = "master-dataset" } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala index b5e57d87..6b9fcc08 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala @@ -24,7 +24,7 @@ object ErrorConstants extends Enumeration { val JSON_SCHEMA_NOT_FOUND = ErrorInternalValue("ERR_PP_1011", "Json schema not found for the dataset") val INVALID_JSON_SCHEMA = ErrorInternalValue("ERR_PP_1012", "Invalid json schema") val SCHEMA_VALIDATION_FAILED = ErrorInternalValue("ERR_PP_1013", "Event failed the schema validation") - val DENORM_KEY_MISSING = ErrorInternalValue("ERR_DENORM_1014", "No denorm key found or missing data for the specified key") + val DENORM_KEY_MISSING = ErrorInternalValue("ERR_DENORM_1014", "No denorm key or transformation expr found or missing data for the specified key") val DENORM_KEY_NOT_A_STRING_OR_NUMBER = ErrorInternalValue("ERR_DENORM_1015", "Denorm key value is not a String or Number") val DENORM_DATA_NOT_FOUND = ErrorInternalValue("ERR_DENORM_1016", "Denorm data not found for the given key") val MISSING_DATASET_CONFIG_KEY = ErrorInternalValue("ERR_MASTER_DATA_1017", "Master dataset configuration key is missing") diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala index e4c05e4c..0adb1098 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala @@ -6,7 +6,7 @@ import org.sunbird.obsrv.core.model.Models.SystemSetting import org.sunbird.obsrv.core.util.{PostgresConnect, PostgresConnectionConfig} import java.io.File -import java.sql.{PreparedStatement, ResultSet} +import java.sql.ResultSet object SystemConfig { @@ -102,17 +102,10 @@ object SystemConfigService { @throws[Exception] def getSystemSetting(key: String): Option[SystemSetting] = { val postgresConnect = new PostgresConnect(postgresConfig) - var preparedStatement: PreparedStatement = null - var rs: ResultSet = null - val query = "SELECT * FROM system_settings WHERE key = ?" - preparedStatement = postgresConnect.prepareStatement(query) - preparedStatement.setString(1, key) try { - rs = postgresConnect.executeQuery(preparedStatement = preparedStatement) + val rs = postgresConnect.executeQuery(s"SELECT * FROM system_settings WHERE key = '$key'") if (rs.next) Option(parseSystemSetting(rs)) else None } finally { - if (rs != null) rs.close() - if (preparedStatement != null) preparedStatement.close() postgresConnect.closeConnection() } } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala index 370353c7..d68b924a 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala @@ -46,6 +46,40 @@ class MapDeserializationSchema extends KafkaRecordDeserializationSchema[mutable. 
} +class TopicDeserializationSchema extends KafkaRecordDeserializationSchema[mutable.Map[String, AnyRef]] { + + private val serialVersionUID = -3224825136576915426L + + override def getProducedType: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + + override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]], out: Collector[mutable.Map[String, AnyRef]]): Unit = { + val msg = try { + val event = JSONUtil.deserialize[Map[String, AnyRef]](record.value()) + mutable.Map[String, AnyRef]( + "dataset" -> record.topic(), + "event" -> event + ) + } catch { + case _: Exception => + mutable.Map[String, AnyRef](Constants.INVALID_JSON -> new String(record.value, "UTF-8")) + } + initObsrvMeta(msg, record) + out.collect(msg) + } + + private def initObsrvMeta(msg: mutable.Map[String, AnyRef], record: ConsumerRecord[Array[Byte], Array[Byte]]): Unit = { + if (!msg.contains("obsrv_meta")) { + msg.put("obsrv_meta", Map( + "syncts" -> record.timestamp(), + "processingStartTime" -> System.currentTimeMillis(), + "flags" -> Map(), + "timespans" -> Map(), + "error" -> Map() + )) + } + } +} + class StringDeserializationSchema extends KafkaRecordDeserializationSchema[String] { private val serialVersionUID = -3224825136576915426L diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala index cb4657c3..51753f75 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala @@ -53,8 +53,6 @@ abstract class BaseJobConfig[T](val config: Config, val jobName: String) extends val checkpointingBaseUrl: Option[String] = if (config.hasPath("job.statebackend.base.url")) Option(config.getString("job.statebackend.base.url")) else None // Base Methods - def datasetType(): String = if (config.hasPath("dataset.type")) config.getString("dataset.type") else "dataset" - def inputTopic(): String def inputConsumer(): String diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala index 8ebdb8a7..bdc897da 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala @@ -38,6 +38,13 @@ abstract class BaseStreamTask[T] extends BaseStreamTaskSink[T] { .rebalance() } + def getTopicMapDataStream(env: StreamExecutionEnvironment, config: BaseJobConfig[T], kafkaTopics: List[String], + consumerSourceName: String, kafkaConnector: FlinkKafkaConnector): DataStream[mutable.Map[String, AnyRef]] = { + env.fromSource(kafkaConnector.kafkaTopicMapSource(kafkaTopics), WatermarkStrategy.noWatermarks[mutable.Map[String, AnyRef]](), consumerSourceName) + .uid(consumerSourceName).setParallelism(config.kafkaConsumerParallelism) + .rebalance() + } + def getStringDataStream(env: StreamExecutionEnvironment, config: BaseJobConfig[T], kafkaConnector: FlinkKafkaConnector): DataStream[String] = { env.fromSource(kafkaConnector.kafkaStringSource(config.inputTopic()), WatermarkStrategy.noWatermarks[String](), config.inputConsumer()) .uid(config.inputConsumer()).setParallelism(config.kafkaConsumerParallelism) diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala 
b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala index 508e1e7c..39552dd7 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala @@ -47,6 +47,15 @@ class FlinkKafkaConnector(config: BaseJobConfig[_]) extends Serializable { .build() } + def kafkaTopicMapSource(kafkaTopics: List[String]): KafkaSource[mutable.Map[String, AnyRef]] = { + KafkaSource.builder[mutable.Map[String, AnyRef]]() + .setTopics(kafkaTopics.asJava) + .setDeserializer(new TopicDeserializationSchema) + .setProperties(config.kafkaConsumerProperties()) + .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.EARLIEST)) + .build() + } + def kafkaMapDynamicSink(): KafkaSink[mutable.Map[String, AnyRef]] = { KafkaSink.builder[mutable.Map[String, AnyRef]]() .setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE) diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala b/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala index 67156256..550e99d8 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala @@ -57,6 +57,10 @@ object JSONUtil { root.at(path); } + def getJsonNode(json: String): JsonNode = { + mapper.readTree(json); + } + private[this] def typeReference[T: Manifest] = new TypeReference[T] { override def getType: Type = typeFromManifest(manifest[T]) } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala b/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala index a1a23df9..64469882 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala @@ -71,41 +71,6 @@ class PostgresConnect(config: PostgresConnectionConfig) { // $COVERAGE-ON$ } - def prepareStatement(query: String): PreparedStatement = { - try { - connection.prepareStatement(query) - } catch { - case ex: SQLException => - ex.printStackTrace() - logger.error("PostgresConnect:prepareStatement() - Exception", ex) - reset() - connection.prepareStatement(query) - } - } - - def executeUpdate(preparedStatement: PreparedStatement): Int = { - try { - preparedStatement.executeUpdate() - } catch { - case ex: SQLException => - ex.printStackTrace() - logger.error("PostgresConnect:executeUpdate():PreparedStatement - Exception", ex) - reset() - preparedStatement.executeUpdate() - } - } - - def executeQuery(preparedStatement: PreparedStatement): ResultSet = { - try { - preparedStatement.executeQuery() - } catch { - case ex: SQLException => - logger.error("PostgresConnect:execute():PreparedStatement - Exception", ex) - reset() - preparedStatement.executeQuery() - } - } - def executeQuery(query:String):ResultSet = statement.executeQuery(query) } diff --git a/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala index bac2b0ae..cdffa023 100644 --- a/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala +++ b/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala @@ -142,7 +142,6 @@ class BaseProcessFunctionTestSpec extends BaseSpecWithPostgres with Matchers { val metrics = Metrics(mutable.Map("test" -> new ConcurrentHashMap[String, AtomicLong]())) 
metrics.reset("test1", "m1") - bsConfig.datasetType() should be ("dataset") } "TestBaseStreamTask" should "validate the getMapDataStream method" in { diff --git a/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala index 4ca0ad5e..f85347dd 100644 --- a/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala +++ b/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala @@ -104,7 +104,6 @@ class ModelsTestSpec extends FlatSpec with Matchers { bsMapConfig.kafkaConsumerProperties() bsMapConfig.enableDistributedCheckpointing should be (None) bsMapConfig.checkpointingBaseUrl should be (None) - bsMapConfig.datasetType() should be ("master-dataset") val dsk = new DatasetKeySelector() dsk.getKey(mutable.Map("dataset" -> "d1".asInstanceOf[AnyRef])) should be ("d1") diff --git a/pipeline/cache-indexer/pom.xml b/pipeline/cache-indexer/pom.xml new file mode 100644 index 00000000..36d76208 --- /dev/null +++ b/pipeline/cache-indexer/pom.xml @@ -0,0 +1,248 @@ + + + 4.0.0 + + pipeline + org.sunbird.obsrv + 1.0 + + cache-indexer + 1.0.0 + Cache Indexer + + UTF-8 + 1.4.0 + + + + + org.apache.flink + flink-streaming-scala_${scala.maj.version} + ${flink.version} + provided + + + com.fasterxml.jackson.core + jackson-databind + + + + + org.sunbird.obsrv + framework + 1.0.0 + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + + + org.json4s + json4s-native_${scala.maj.version} + 4.0.6 + + + org.apache.kafka + kafka_${scala.maj.version} + ${kafka.version} + test + + + org.sunbird.obsrv + framework + 1.0.0 + test-jar + test + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + + + org.apache.flink + flink-test-utils + ${flink.version} + test + + + org.apache.flink + flink-runtime + ${flink.version} + test + tests + + + com.github.codemonstur + embedded-redis + 1.0.0 + test + + + org.apache.flink + flink-streaming-java + ${flink.version} + test + tests + + + org.scalatest + scalatest_2.12 + 3.0.6 + test + + + org.mockito + mockito-core + 3.3.3 + test + + + com.fiftyonred + mock-jedis + 0.4.0 + test + + + io.github.embeddedkafka + embedded-kafka_2.12 + 3.4.0 + test + + + io.zonky.test + embedded-postgres + 2.0.3 + test + + + + + src/main/scala + src/test/scala + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 11 + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.1 + + + + package + + shade + + + false + + + com.google.code.findbugs:jsr305 + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + core-site.xml + + + + + + org.sunbird.obsrv.streaming.CacheIndexerStreamTask + + + + reference.conf + + + + + + + + + net.alchim31.maven + scala-maven-plugin + 4.4.0 + + ${java.target.runtime} + ${java.target.runtime} + ${scala.version} + false + + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + + maven-surefire-plugin + 2.22.2 + + true + + + + + org.scalatest + scalatest-maven-plugin + 1.0 + + ${project.build.directory}/surefire-reports + . 
+ cache-indexer-testsuite.txt + + + + test + + test + + + + + + org.scoverage + scoverage-maven-plugin + ${scoverage.plugin.version} + + ${scala.version} + true + true + + + + + diff --git a/pipeline/cache-indexer/src/main/resources/cache-indexer.conf b/pipeline/cache-indexer/src/main/resources/cache-indexer.conf new file mode 100644 index 00000000..58a9c9d1 --- /dev/null +++ b/pipeline/cache-indexer/src/main/resources/cache-indexer.conf @@ -0,0 +1,15 @@ +include "baseconfig.conf" + +kafka { + output.failed.topic = ${job.env}".masterdata.failed" + groupId = ${job.env}"-cache-indexer-group" + producer { + max-request-size = 5242880 + } +} + +task { + window.time.in.seconds = 5 + window.count = 30 + window.shards = 1400 +} \ No newline at end of file diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/function/MasterDataProcessorFunction.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/function/MasterDataProcessorFunction.scala new file mode 100644 index 00000000..bbab5307 --- /dev/null +++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/function/MasterDataProcessorFunction.scala @@ -0,0 +1,56 @@ +package org.sunbird.obsrv.function + +import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.functions.ProcessFunction +import org.json4s.native.JsonMethods._ +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.{ErrorConstants, FunctionalError, Producer} +import org.sunbird.obsrv.core.streaming.Metrics +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.Dataset +import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig +import org.sunbird.obsrv.registry.DatasetRegistry +import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction +import org.sunbird.obsrv.util.MasterDataCache + +import scala.collection.mutable + +class MasterDataProcessorFunction(config: CacheIndexerConfig) extends BaseDatasetProcessFunction(config) { + + private[this] val logger = LoggerFactory.getLogger(classOf[MasterDataProcessorFunction]) + private[this] var masterDataCache: MasterDataCache = _ + + override def open(parameters: Configuration): Unit = { + super.open(parameters) + masterDataCache = new MasterDataCache(config) + masterDataCache.open(DatasetRegistry.getAllDatasets(Some("master"))) + } + + override def close(): Unit = { + super.close() + masterDataCache.close() + } + + override def getMetrics(): List[String] = { + List(config.successEventCount, config.systemEventCount, config.totalEventCount, config.successInsertCount, config.successUpdateCount) + } + + override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { + + metrics.incCounter(dataset.id, config.totalEventCount) + masterDataCache.open(dataset) + val event = JSONUtil.serialize(msg(config.CONST_EVENT)) + val json = parse(event, useBigIntForLong = false) + val node = JSONUtil.getKey(dataset.datasetConfig.keysConfig.dataKey.get, event) + if (node.isMissingNode) { + markFailure(Some(dataset.id), msg, context, metrics, ErrorConstants.MISSING_DATASET_CONFIG_KEY, Producer.masterdataprocessor, FunctionalError.MissingMasterDatasetKey, datasetType = Some(dataset.datasetType)) + } else { + val result = masterDataCache.process(dataset, node.asText(), json) + metrics.incCounter(dataset.id, config.successInsertCount, result._1) + metrics.incCounter(dataset.id, config.successUpdateCount, result._2) + 
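+ // Note (descriptive comment, grounded in MasterDataCache.process below): result is an (insertCount, updateCount) pair, (1, 0) when the key was absent in Redis and (0, 1) when an existing record was merged.
+ // Every event that reaches this point is additionally counted once as an overall success.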
metrics.incCounter(dataset.id, config.successEventCount) + } + + } + +} \ No newline at end of file diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerConfig.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerConfig.scala new file mode 100644 index 00000000..c6a49f57 --- /dev/null +++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerConfig.scala @@ -0,0 +1,33 @@ +package org.sunbird.obsrv.pipeline.task + +import com.typesafe.config.Config +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.streaming.api.scala.OutputTag +import org.sunbird.obsrv.core.streaming.BaseJobConfig + +import scala.collection.mutable + +class CacheIndexerConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "CacheIndexerJob") { + + private val serialVersionUID = 2905979434303791379L + implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + + // Metric List + val totalEventCount = "total-event-count" + val successEventCount = "success-event-count" + val successInsertCount = "success-insert-count" + val successUpdateCount = "success-update-count" + + val windowTime: Int = config.getInt("task.window.time.in.seconds") + val windowCount: Int = config.getInt("task.window.count") + + // Functions + val cacheIndexerFunction = "CacheIndexerFunction" + + override def inputTopic(): String = null + override def inputConsumer(): String = "cache-indexer" + override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("processing_stats") + + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") +} diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala new file mode 100644 index 00000000..735440b7 --- /dev/null +++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala @@ -0,0 +1,61 @@ +package org.sunbird.obsrv.streaming + +import com.typesafe.config.ConfigFactory +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.api.java.utils.ParameterTool +import org.apache.flink.streaming.api.datastream.DataStream +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} +import org.sunbird.obsrv.core.util.FlinkUtil +import org.sunbird.obsrv.function.MasterDataProcessorFunction +import org.sunbird.obsrv.model.DatasetType +import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig +import org.sunbird.obsrv.registry.DatasetRegistry + +import java.io.File +import scala.collection.mutable + +class CacheIndexerStreamTask(config: CacheIndexerConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { + + implicit val mutableMapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + private val logger = LoggerFactory.getLogger(classOf[CacheIndexerStreamTask]) + + def process(): Unit = { + implicit 
val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) + process(env) + env.execute(config.jobName) + } + + def process(env: StreamExecutionEnvironment): Unit = { + + val datasets = DatasetRegistry.getAllDatasets(Some(DatasetType.master.toString)) + val datasetIds = datasets.map(f => f.id) + val dataStream = getTopicMapDataStream(env, config, datasetIds, consumerSourceName = s"cache-indexer-consumer", kafkaConnector) + processStream(dataStream) + } + + override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { + val processedStream = dataStream.process(new MasterDataProcessorFunction(config)).name(config.cacheIndexerFunction) + .uid(config.cacheIndexerFunction).setParallelism(config.downstreamOperatorsParallelism) + addDefaultSinks(processedStream, config, kafkaConnector) + processedStream.getSideOutput(config.successTag()) + } + +} + +object CacheIndexerStreamTask { + + def main(args: Array[String]): Unit = { + val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) + val config = configFilePath.map { + path => ConfigFactory.parseFile(new File(path)).resolve() + }.getOrElse(ConfigFactory.load("cache-indexer.conf").withFallback(ConfigFactory.systemEnvironment())) + val cacheConfig = new CacheIndexerConfig(config) + val kafkaUtil = new FlinkKafkaConnector(cacheConfig) + val task = new CacheIndexerStreamTask(cacheConfig, kafkaUtil) + task.process() + } + +} \ No newline at end of file diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala new file mode 100644 index 00000000..c5f95f32 --- /dev/null +++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala @@ -0,0 +1,63 @@ +package org.sunbird.obsrv.util + +import org.json4s.native.JsonMethods._ +import org.json4s.{JField, JNothing, JValue} +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.Constants.OBSRV_META +import org.sunbird.obsrv.model.DatasetModels.Dataset +import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig +import redis.clients.jedis.Jedis + +import scala.collection.mutable + +class MasterDataCache(val config: CacheIndexerConfig) { + + private[this] val logger = LoggerFactory.getLogger(classOf[MasterDataCache]) + private val datasetPipelineMap: mutable.Map[String, Jedis] = mutable.Map[String, Jedis]() + + def close(): Unit = { + datasetPipelineMap.values.foreach(pipeline => pipeline.close()) + } + + def open(datasets: List[Dataset]): Unit = { + datasets.foreach(dataset => { + open(dataset) + }) + } + + def open(dataset: Dataset): Unit = { + if (!datasetPipelineMap.contains(dataset.id)) { + val redisConfig = dataset.datasetConfig.cacheConfig.get + val redisConnect = new RedisConnect(redisConfig.redisDBHost.get, redisConfig.redisDBPort.get, config.redisConnectionTimeout) + val jedis: Jedis = redisConnect.getConnection(0) + datasetPipelineMap.put(dataset.id, jedis) + } + } + + def process(dataset: Dataset, key: String, event: JValue): (Int, Int) = { + val jedis = this.datasetPipelineMap(dataset.id) + val dataFromCache = getDataFromCache(dataset, key, jedis) + val updatedEvent = event.removeField { + case JField(OBSRV_META, _) => true + case _ => false + } + updateCache(dataset, dataFromCache, key, updatedEvent, jedis) + (if (dataFromCache == null) 1 else 0, if (dataFromCache == null) 0 else 1) + } + 
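+ // Illustrative note based on the test fixtures below: the first event for a key (e.g. "HYUN-CRE-D6") is stored as a new record,
+ // while a later partial event for the same key is deep-merged into the cached JSON, so repeated fields are overwritten and new fields are added.
+ // getDataFromCache selects the dataset's configured Redis DB and returns the existing value for the key, or null when no entry exists yet.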
+ private def getDataFromCache(dataset: Dataset, key: String, jedis: Jedis): String = { + + jedis.select(dataset.datasetConfig.cacheConfig.get.redisDB.get) + jedis.get(key) + } + + private def updateCache(dataset: Dataset, dataFromCache: String, key: String, event: JValue, jedis: Jedis): Unit = { + + jedis.select(dataset.datasetConfig.cacheConfig.get.redisDB.get) + val existingJson = if (dataFromCache != null) parse(dataFromCache) else JNothing + val mergedJson = existingJson merge event + jedis.set(key, compact(render(mergedJson))) + } + +} diff --git a/pipeline/pipeline-merged/src/test/resources/base-config.conf b/pipeline/cache-indexer/src/test/resources/base-config.conf similarity index 100% rename from pipeline/pipeline-merged/src/test/resources/base-config.conf rename to pipeline/cache-indexer/src/test/resources/base-config.conf diff --git a/pipeline/cache-indexer/src/test/resources/test.conf b/pipeline/cache-indexer/src/test/resources/test.conf new file mode 100644 index 00000000..7861c8d0 --- /dev/null +++ b/pipeline/cache-indexer/src/test/resources/test.conf @@ -0,0 +1,20 @@ +include "base-test.conf" + +kafka { + + output.failed.topic = ${job.env}".masterdata.failed" + groupId = ${job.env}"-cache-indexer-group" + producer { + max-request-size = 5242880 + } +} + +task { + window.time.in.seconds = 2 + window.count = 2 + window.shards = 1400 +} + +redis { + port = 6340 +} \ No newline at end of file diff --git a/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala new file mode 100644 index 00000000..078cde33 --- /dev/null +++ b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala @@ -0,0 +1,10 @@ +package org.sunbird.obsrv.fixture + +object EventFixture { + + val VALID_BATCH_EVENT_D3_INSERT = """{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}""" + val VALID_BATCH_EVENT_D3_INSERT_2 = """{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"}}""" + val VALID_BATCH_EVENT_D3_UPDATE = """{"code":"HYUN-CRE-D6","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"safety":"3 Star (Global NCAP)","seatingCapacity":5}""" + val VALID_BATCH_EVENT_D4 = """{"code":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}""" + val INVALID_BATCH_EVENT_D4 = """{"code1":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}""" +} diff --git a/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/pipeline/CacheIndexerStreamTaskTestSpec.scala b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/pipeline/CacheIndexerStreamTaskTestSpec.scala new file mode 100644 index 
00000000..b95d754d --- /dev/null +++ b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/pipeline/CacheIndexerStreamTaskTestSpec.scala @@ -0,0 +1,142 @@ +package org.sunbird.obsrv.pipeline + +import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} +import org.apache.flink.configuration.Configuration +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.test.util.MiniClusterWithClientResource +import org.apache.kafka.common.serialization.StringDeserializer +import org.scalatest.Matchers._ +import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.model.Models.SystemEvent +import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} +import org.sunbird.obsrv.fixture.EventFixture +import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry +import org.sunbird.obsrv.streaming.CacheIndexerStreamTask + +import scala.collection.mutable +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ + +class CacheIndexerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { + + val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() + .setConfiguration(testConfiguration()) + .setNumberSlotsPerTaskManager(1) + .setNumberTaskManagers(1) + .build) + + val cacheIndexerConfig = new CacheIndexerConfig(config) + val kafkaConnector = new FlinkKafkaConnector(cacheIndexerConfig) + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9093, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + + def testConfiguration(): Configuration = { + val config = new Configuration() + config.setString("metrics.reporter", "job_metrics_reporter") + config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) + config + } + + override def beforeAll(): Unit = { + super.beforeAll() + BaseMetricsReporter.gaugeMetrics.clear() + EmbeddedKafka.start()(embeddedKafkaConfig) + val postgresConnect = new PostgresConnect(postgresConfig) + insertTestData(postgresConnect) + createTestTopics() + EmbeddedKafka.publishStringMessageToKafka("dataset3", EventFixture.VALID_BATCH_EVENT_D3_INSERT) + EmbeddedKafka.publishStringMessageToKafka("dataset3", EventFixture.VALID_BATCH_EVENT_D3_INSERT_2) + EmbeddedKafka.publishStringMessageToKafka("dataset4", EventFixture.VALID_BATCH_EVENT_D4) + EmbeddedKafka.publishStringMessageToKafka("dataset3", EventFixture.VALID_BATCH_EVENT_D3_UPDATE) + EmbeddedKafka.publishStringMessageToKafka("dataset4", EventFixture.INVALID_BATCH_EVENT_D4) + flinkCluster.before() + } + + private def insertTestData(postgresConnect: PostgresConnect): Unit = { + postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, extraction_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES 
('dataset3', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"is_batch_event\":true,\"extraction_key\":\"events\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"}, \"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":3,\"redis_db_host\":\"localhost\",\"redis_db_port\":" + cacheIndexerConfig.redisPort + "}', 'Live', 'v1', 'local.masterdata.ingest', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('dataset4', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"},\"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"seatingCapacity\": {\"type\": \"integer\"}, \"safety\": {\"type\": \"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d34-events\"}', '{\"indexing_config\":{\"olap_store_enabled\":false,\"lakehouse_enabled\":false,\"cache_enabled\":true},\"keys_config\":{\"data_key\":\"code\",\"timestamp_key\":\"date\"},\"cache_config\":{\"redis_db\":4,\"redis_db_host\":\"localhost\",\"redis_db_port\":" + cacheIndexerConfig.redisPort + "}}', 'Live', 'v2', 'local.masterdata.ingest', 'System', 'System', now(), now());") + } + + override def afterAll(): Unit = { + + super.afterAll() + flinkCluster.after() + EmbeddedKafka.stop() + } + + def createTestTopics(): Unit = { + List(config.getString("kafka.output.system.event.topic"), "dataset3", "dataset4").foreach(EmbeddedKafka.createCustomTopic(_)) + } + + "CacheIndexerStreamTaskTestSpec" should "validate the cache indexer job for master datasets" in { + + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(cacheIndexerConfig) + val task = new CacheIndexerStreamTask(cacheIndexerConfig, kafkaConnector) + task.process(env) + Future { + env.execute(cacheIndexerConfig.jobName) + } + + val input = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 1, timeout = 30.seconds) + input.size should be(1) + + input.foreach(se => { + val event = JSONUtil.deserialize[SystemEvent](se) + val error = event.data.error + if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) + event.ctx.dataset_type should be(None) + else if (error.isDefined) { + val errorCode = error.get.error_code + if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) || + errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) 
|| + errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) { + event.ctx.dataset_type should be(None) + } + } + else + event.ctx.dataset_type should be(Some("master")) + }) + + val mutableMetricsMap = mutable.Map[String, Long](); + BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + + cacheIndexerConfig.successTag().getId should be("processing_stats") + + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.totalEventCount}") should be(3) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.successEventCount}") should be(3) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.successInsertCount}") should be(2) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.successUpdateCount}") should be(1) + + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.totalEventCount}") should be(2) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.successEventCount}") should be(1) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.successInsertCount}") should be(1) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.eventFailedMetricsCount}") should be(1) + + val redisConnection = new RedisConnect(cacheIndexerConfig.redisHost, cacheIndexerConfig.redisPort, cacheIndexerConfig.redisConnectionTimeout) + val jedis1 = redisConnection.getConnection(3) + val event1 = jedis1.get("HYUN-CRE-D6") + event1 should be("""{"dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic","safety":"3 Star (Global NCAP)","seatingCapacity":5}""") + val event3 = jedis1.get("HYUN-TUC-D6") + event3 should be("""{"dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"},"model":"Tucson","price":"4000000","variant":"Signature","fuel":"Diesel","code":"HYUN-TUC-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"}""") + jedis1.close() + + val jedis2 = redisConnection.getConnection(4) + val event2 = jedis2.get("JEEP-CP-D3") + event2 should be("""{"model":"Compass","price":"3800000","variant":"Model S (O) Diesel 4x4 AT","fuel":"Diesel","seatingCapacity":5,"code":"JEEP-CP-D3","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Jeep","safety":"5 Star (Euro NCAP)","modelYear":"2023","transmission":"automatic"}""") + jedis2.close() + } + + +} diff --git a/pipeline/druid-router/pom.xml b/pipeline/dataset-router/pom.xml similarity index 97% rename from pipeline/druid-router/pom.xml rename to pipeline/dataset-router/pom.xml index 41e2e390..5c6b5d23 100644 --- a/pipeline/druid-router/pom.xml +++ b/pipeline/dataset-router/pom.xml @@ -12,12 +12,12 @@ org.sunbird.obsrv.pipeline - druid-router + dataset-router 1.0.0 jar - Druid Events Router + Dataset Events Router - Validate and Route Datasets for Druid Indexing + Validate and Route Datasets for Indexing into OLAP Store or a Lakehouse @@ -198,7 +198,7 @@ - reference.conf + dataset-router.conf diff --git a/pipeline/druid-router/src/main/resources/druid-router.conf b/pipeline/dataset-router/src/main/resources/dataset-router.conf similarity index 100% rename from 
pipeline/druid-router/src/main/resources/druid-router.conf rename to pipeline/dataset-router/src/main/resources/dataset-router.conf diff --git a/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala new file mode 100644 index 00000000..9f2c7907 --- /dev/null +++ b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala @@ -0,0 +1,117 @@ +package org.sunbird.obsrv.router.functions + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.JsonNodeType +import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.functions.ProcessFunction +import org.joda.time.format.DateTimeFormat +import org.joda.time.{DateTime, DateTimeZone} +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.{Constants, ErrorConstants, FunctionalError, Producer} +import org.sunbird.obsrv.core.streaming.Metrics +import org.sunbird.obsrv.core.util.{JSONUtil, Util} +import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig} +import org.sunbird.obsrv.model.DatasetType +import org.sunbird.obsrv.router.task.DynamicRouterConfig +import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction + +import java.util.TimeZone +import scala.collection.mutable + +case class TimestampKey(isValid: Boolean, value: AnyRef) + +class DynamicRouterFunction(config: DynamicRouterConfig) extends BaseDatasetProcessFunction(config) { + + private[this] val logger = LoggerFactory.getLogger(classOf[DynamicRouterFunction]) + + override def open(parameters: Configuration): Unit = { + super.open(parameters) + } + + override def close(): Unit = { + super.close() + } + + override def getMetrics(): List[String] = { + List(config.routerTotalCount, config.routerSuccessCount) + } + + override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], + ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, + metrics: Metrics): Unit = { + + metrics.incCounter(dataset.id, config.routerTotalCount) + val event = Util.getMutableMap(msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) + event.put(config.CONST_OBSRV_META, msg(config.CONST_OBSRV_META).asInstanceOf[Map[String, AnyRef]]) + val tsKeyData = TimestampKeyParser.parseTimestampKey(dataset.datasetConfig, event) + event.put("indexTS", tsKeyData.value) + if (tsKeyData.isValid || dataset.datasetType.equalsIgnoreCase(DatasetType.master.toString)) { + val routerConfig = dataset.routerConfig + val topicEventMap = mutable.Map(Constants.TOPIC -> routerConfig.topic, Constants.MESSAGE -> event) + ctx.output(config.routerOutputTag, topicEventMap) + metrics.incCounter(dataset.id, config.routerSuccessCount) + markCompletion(dataset, super.markComplete(event, dataset.dataVersion), ctx, Producer.router) + } else { + markFailure(Some(dataset.id), msg, ctx, metrics, ErrorConstants.INDEX_KEY_MISSING_OR_BLANK, Producer.router, FunctionalError.MissingTimestampKey, datasetType = Some(dataset.datasetType)) + } + } + +} + +object TimestampKeyParser { + + def parseTimestampKey(datasetConfig: DatasetConfig, event: mutable.Map[String, AnyRef]): TimestampKey = { + val indexKey = datasetConfig.keysConfig.tsKey.get + val node = JSONUtil.getKey(indexKey, JSONUtil.serialize(event)) + node.getNodeType match { + case JsonNodeType.NUMBER => onNumber(datasetConfig, node) + case JsonNodeType.STRING => 
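+ // String values are handled by onText below: when the dataset declares a ts format ("epoch" or a Joda-Time pattern) the value is parsed via parseDateTime, otherwise the raw string is passed through unchanged.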
onText(datasetConfig, node) + case _ => TimestampKey(isValid = false, null) + } + } + + private def onNumber(datasetConfig: DatasetConfig, node: JsonNode): TimestampKey = { + val length = node.asText().length + val value = node.numberValue().longValue() + // TODO: [P3] Crude implementation. Checking if the epoch timestamp format is one of seconds, milli-seconds, micro-second and nano-seconds. Find a elegant approach + if (length == 10 || length == 13 || length == 16 || length == 19) { + val tfValue:Long = if (length == 10) (value * 1000).longValue() else if (length == 16) (value / 1000).longValue() else if (length == 19) (value / 1000000).longValue() else value + TimestampKey(isValid = true, addTimeZone(datasetConfig, new DateTime(tfValue)).asInstanceOf[AnyRef]) + } else { + TimestampKey(isValid = false, 0.asInstanceOf[AnyRef]) + } + } + + private def onText(datasetConfig: DatasetConfig, node: JsonNode): TimestampKey = { + val value = node.textValue() + if (datasetConfig.keysConfig.tsFormat.isDefined) { + parseDateTime(datasetConfig, value) + } else { + TimestampKey(isValid = true, value) + } + } + + private def parseDateTime(datasetConfig: DatasetConfig, value: String): TimestampKey = { + try { + datasetConfig.keysConfig.tsFormat.get match { + case "epoch" => TimestampKey(isValid = true, addTimeZone(datasetConfig, new DateTime(value.toLong)).asInstanceOf[AnyRef]) + case _ => + val dtf = DateTimeFormat.forPattern(datasetConfig.keysConfig.tsFormat.get) + TimestampKey(isValid = true, addTimeZone(datasetConfig, dtf.parseDateTime(value)).asInstanceOf[AnyRef]) + } + } catch { + case _: Exception => TimestampKey(isValid = false, null) + } + } + + private def addTimeZone(datasetConfig: DatasetConfig, dateTime: DateTime): Long = { + if (datasetConfig.datasetTimezone.isDefined) { + val tz = DateTimeZone.forTimeZone(TimeZone.getTimeZone(datasetConfig.datasetTimezone.get)) + val offsetInMilliseconds = tz.getOffset(dateTime) + dateTime.plusMillis(offsetInMilliseconds).getMillis + } else { + dateTime.getMillis + } + } + +} diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterConfig.scala similarity index 92% rename from pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala rename to pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterConfig.scala index 31106b00..a9309016 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala +++ b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterConfig.scala @@ -8,7 +8,7 @@ import org.sunbird.obsrv.core.streaming.BaseJobConfig import scala.collection.mutable -class DruidRouterConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "DruidRouterJob") { +class DynamicRouterConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "DruidRouterJob") { private val serialVersionUID = 2905979434303791379L implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala similarity index 89% rename from 
pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala rename to pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala index 9e17a974..5ac1067f 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala +++ b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala @@ -17,7 +17,7 @@ import scala.collection.mutable * Druid Router stream task routes every event into its respective topic configured at dataset level */ -class DynamicRouterStreamTask(config: DruidRouterConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { +class DynamicRouterStreamTask(config: DynamicRouterConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { private val serialVersionUID = 146697324640926024L @@ -56,8 +56,8 @@ object DynamicRouterStreamTask { val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) val config = configFilePath.map { path => ConfigFactory.parseFile(new File(path)).resolve() - }.getOrElse(ConfigFactory.load("druid-router.conf").withFallback(ConfigFactory.systemEnvironment())) - val druidRouterConfig = new DruidRouterConfig(config) + }.getOrElse(ConfigFactory.load("dataset-router.conf").withFallback(ConfigFactory.systemEnvironment())) + val druidRouterConfig = new DynamicRouterConfig(config) val kafkaUtil = new FlinkKafkaConnector(druidRouterConfig) val task = new DynamicRouterStreamTask(druidRouterConfig, kafkaUtil) task.process() diff --git a/pipeline/druid-router/src/test/resources/test.conf b/pipeline/dataset-router/src/test/resources/test.conf similarity index 100% rename from pipeline/druid-router/src/test/resources/test.conf rename to pipeline/dataset-router/src/test/resources/test.conf diff --git a/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala new file mode 100644 index 00000000..98370128 --- /dev/null +++ b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala @@ -0,0 +1,171 @@ +package org.sunbird.obsrv.router + +import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} +import org.apache.flink.configuration.Configuration +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.test.util.MiniClusterWithClientResource +import org.apache.kafka.common.serialization.StringDeserializer +import org.scalatest.Matchers._ +import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.model.Models.SystemEvent +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} +import org.sunbird.obsrv.router.task.{DynamicRouterConfig, DynamicRouterStreamTask} +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry + +import scala.collection.mutable +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ + +class DynamicRouterStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { + + val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() + 
.setConfiguration(testConfiguration()) + .setNumberSlotsPerTaskManager(1) + .setNumberTaskManagers(1) + .build) + + val routerConfig = new DynamicRouterConfig(config) + val kafkaConnector = new FlinkKafkaConnector(routerConfig) + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9093, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + + def testConfiguration(): Configuration = { + val config = new Configuration() + config.setString("metrics.reporter", "job_metrics_reporter") + config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) + config + } + + override def beforeAll(): Unit = { + super.beforeAll() + BaseMetricsReporter.gaugeMetrics.clear() + EmbeddedKafka.start()(embeddedKafkaConfig) + val postgresConnect = new PostgresConnect(postgresConfig) + insertTestData(postgresConnect) + postgresConnect.closeConnection() + createTestTopics() + publishMessagesToKafka() + flinkCluster.before() + } + + private def publishMessagesToKafka(): Unit = { + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.SUCCESS_EVENT) + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.FAILED_EVENT) + } + + private def insertTestData(postgresConnect: PostgresConnect): Unit = { + postgresConnect.execute("update datasets set dataset_config = '" + """{"data_key":"id","timestamp_key":"date1","entry_topic":"ingest"}""" + "' where id='d2';") + + } + + override def afterAll(): Unit = { + + super.afterAll() + flinkCluster.after() + EmbeddedKafka.stop() + } + + def createTestTopics(): Unit = { + List( + routerConfig.kafkaSystemTopic, routerConfig.kafkaInputTopic, "d1-events", routerConfig.kafkaFailedTopic + ).foreach(EmbeddedKafka.createCustomTopic(_)) + } + + "DynamicRouterStreamTaskTestSpec" should "validate the router stream task" in { + + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(routerConfig) + val task = new DynamicRouterStreamTask(routerConfig, kafkaConnector) + task.process(env) + Future { + env.execute(routerConfig.jobName) + } + + val outputs = EmbeddedKafka.consumeNumberMessagesFrom[String]("d1-events", 1, timeout = 30.seconds) + validateOutputs(outputs) + + val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](routerConfig.kafkaFailedTopic, 1, timeout = 30.seconds) + validateFailedEvents(failedEvents) + + val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](routerConfig.kafkaSystemTopic, 2, timeout = 30.seconds) + validateSystemEvents(systemEvents) + + val mutableMetricsMap = mutable.Map[String, Long]() + BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + Console.println("### DynamicRouterStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + validateMetrics(mutableMetricsMap) + } + + private def validateOutputs(outputs: List[String]): Unit = { + outputs.size should be(1) + Console.println("Output", outputs.head) + } + + private def validateFailedEvents(failedEvents: List[String]): Unit = { + failedEvents.size should be(1) + Console.println("Output", failedEvents.head) + } + + private def 
validateSystemEvents(systemEvents: List[String]): Unit = { + systemEvents.size should be(2) + + systemEvents.foreach(se => { + val event = JSONUtil.deserialize[SystemEvent](se) + val error = event.data.error + if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) + event.ctx.dataset_type should be(None) + else if (error.isDefined) { + val errorCode = error.get.error_code + if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) || + errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) || + errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) { + event.ctx.dataset_type should be(None) + } + } + else + event.ctx.dataset_type should be(Some("event")) + }) + + systemEvents.foreach(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + event.etype should be(EventID.METRIC) + event.ctx.module should be(ModuleID.processing) + event.ctx.pdata.id should be(routerConfig.jobName) + event.ctx.pdata.`type` should be(PDataType.flink) + event.ctx.pdata.pid.get should be(Producer.router) + if(event.data.error.isDefined) { + val errorLog = event.data.error.get + errorLog.error_level should be(ErrorLevel.critical) + errorLog.pdata_id should be(Producer.router) + errorLog.pdata_status should be(StatusCode.failed) + errorLog.error_count.get should be(1) + errorLog.error_code should be(ErrorConstants.INDEX_KEY_MISSING_OR_BLANK.errorCode) + errorLog.error_message should be(ErrorConstants.INDEX_KEY_MISSING_OR_BLANK.errorMsg) + errorLog.error_type should be(FunctionalError.MissingTimestampKey) + } else { + event.data.pipeline_stats.isDefined should be (true) + event.data.pipeline_stats.get.latency_time.isDefined should be (true) + event.data.pipeline_stats.get.processing_time.isDefined should be (true) + event.data.pipeline_stats.get.total_processing_time.isDefined should be (true) + } + + }) + } + + private def validateMetrics(mutableMetricsMap: mutable.Map[String, Long]): Unit = { + mutableMetricsMap(s"${routerConfig.jobName}.d1.${routerConfig.routerTotalCount}") should be(1) + mutableMetricsMap(s"${routerConfig.jobName}.d1.${routerConfig.routerSuccessCount}") should be(1) + mutableMetricsMap(s"${routerConfig.jobName}.d2.${routerConfig.routerTotalCount}") should be(1) + mutableMetricsMap(s"${routerConfig.jobName}.d2.${routerConfig.eventFailedMetricsCount}") should be(1) + } + +} diff --git a/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala new file mode 100644 index 00000000..7856b0cc --- /dev/null +++ b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala @@ -0,0 +1,7 @@ +package org.sunbird.obsrv.router + +object EventFixture { + + val SUCCESS_EVENT = """{"dataset":"d1","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val FAILED_EVENT = """{"dataset":"d2","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" +} diff --git a/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala new file mode 100644 index 
00000000..7bf5dfa6 --- /dev/null +++ b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala @@ -0,0 +1,125 @@ +package org.sunbird.obsrv.router + +import org.scalatest.{FlatSpec, Matchers} +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.{DatasetConfig, IndexingConfig, KeysConfig} +import org.sunbird.obsrv.router.functions.TimestampKeyParser + +import scala.collection.mutable + +class TestTimestampKeyParser extends FlatSpec with Matchers { + + "TimestampKeyParser" should "validate all scenarios of timestamp key in number format" in { + + + // Validate text date field without providing dateformat and timezone + val result1 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) + result1.isValid should be(true) + result1.value.asInstanceOf[String] should be("2023-03-01") + + // Validate missing timestamp key scenario + val result2 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date1"), None)), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) + result2.isValid should be(false) + result2.value should be(null) + + // Validate number date field which is not epoch + val result3 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":20232201}""")) + result3.isValid should be(false) + result3.value.asInstanceOf[Int] should be(0) + + // Validate number date field which is epoch in seconds + val result4 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165}""")) + result4.isValid should be(true) + result4.value.asInstanceOf[Long] should be(1701373165000L) + + // Validate number date field which is epoch in milli-seconds + val result5 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123}""")) + result5.isValid should be(true) + result5.value.asInstanceOf[Long] should be(1701373165123L) + + // Validate number date field which is epoch in micro-seconds + val result6 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123111}""")) + result6.isValid should be(true) + result6.value.asInstanceOf[Long] should be(1701373165123L) + + // Validate number date field which is epoch in nano-seconds + val result7 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, 
cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123111000}""")) + result7.isValid should be(true) + result7.value.asInstanceOf[Long] should be(1701373165123L) + + // Validate number date field which is not an epoch in milli, micro or nano seconds + val result8 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":170137316512}""")) + result8.isValid should be(false) + result8.value.asInstanceOf[Int] should be(0) + + // Validate number date field which is an epoch with timezone present + val result9 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123}""")) + result9.isValid should be(true) + result9.value.asInstanceOf[Long] should be(1701392965123L) + } + + it should "validate all scenarios of timestamp key in text format" in { + + // Validate epoch data in text format + val result1 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("epoch")), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"1701373165123"}""")) + result1.isValid should be(true) + result1.value.asInstanceOf[Long] should be(1701392965123L) + + // Validate invalid epoch data in text format (would reset to millis from 1970-01-01 if not epoch in millis) + val result2 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("epoch")), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"170137316512"}""")) + result2.isValid should be(true) + result2.value.asInstanceOf[Long] should be(170157116512L) + + // Validate date parser without timezone + val result3 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd")), datasetTimezone = None), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) + result3.isValid should be(true) + result3.value.asInstanceOf[Long] should be(1677609000000L) + + // Validate date parser with timezone + val result4 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd")), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) + result4.isValid should be(true) + result4.value.asInstanceOf[Long] should be(1677628800000L) + + // Validate date parser with date time in nano seconds + val result5 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, 
cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS")), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01T12:45:32.123456789"}""")) + result5.isValid should be(true) + result5.value.asInstanceOf[Long] should be(1677674732123L) + + // Validate date parser with data in invalid format + val result6 = TimestampKeyParser.parseTimestampKey( + DatasetConfig( + IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd'T'HH:mm:ss.SSS")), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01T12:45:32.123456"}""")) + result6.isValid should be(false) + result6.value should be(null) + } + +} \ No newline at end of file diff --git a/pipeline/denormalizer/pom.xml b/pipeline/denormalizer/pom.xml index 2df98cd3..484a81a5 100644 --- a/pipeline/denormalizer/pom.xml +++ b/pipeline/denormalizer/pom.xml @@ -54,6 +54,23 @@ ${kafka.version} test + + org.sunbird.obsrv + transformation-sdk + 1.0.0 + + + org.apache.kafka + kafka-clients + ${kafka.version} + test + + + org.apache.kafka + kafka_${scala.maj.version} + ${kafka.version} + test + org.sunbird.obsrv framework diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala index 45a41c67..699ba75e 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala @@ -30,7 +30,7 @@ class DenormalizerFunction(config: DenormalizerConfig) extends BaseDatasetProces override def open(parameters: Configuration): Unit = { super.open(parameters) denormCache = new DenormCache(config) - denormCache.open(DatasetRegistry.getAllDatasets(config.datasetType())) + denormCache.open(DatasetRegistry.getAllDatasets(None)) } override def close(): Unit = { diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala index ce603520..8d188838 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala @@ -33,7 +33,7 @@ class DenormalizerWindowFunction(config: DenormalizerConfig)(implicit val eventT override def open(parameters: Configuration): Unit = { super.open(parameters) denormCache = new DenormCache(config) - denormCache.open(DatasetRegistry.getAllDatasets(config.datasetType())) + denormCache.open(DatasetRegistry.getAllDatasets(None)) } override def close(): Unit = { diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala index 118c0307..1fe24d68 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala @@ -16,7 +16,6 @@ 
class DenormalizerConfig(override val config: Config) extends BaseJobConfig[muta implicit val anyTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) // Kafka Topics Configuration - val kafkaInputTopic: String = config.getString("kafka.input.topic") val denormOutputTopic: String = config.getString("kafka.output.denorm.topic") // Windows @@ -41,7 +40,7 @@ class DenormalizerConfig(override val config: Config) extends BaseJobConfig[muta // Functions val denormalizationFunction = "DenormalizationFunction" - override def inputTopic(): String = kafkaInputTopic + override def inputTopic(): String = config.getString("kafka.input.topic") override def inputConsumer(): String = denormalizationConsumer override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = denormEventsTag override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala index db0da7d5..5550748a 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala @@ -1,11 +1,14 @@ package org.sunbird.obsrv.denormalizer.util +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.MissingNode import org.sunbird.obsrv.core.cache.RedisConnect import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.model.ErrorConstants.Error import org.sunbird.obsrv.core.util.{JSONUtil, Util} import org.sunbird.obsrv.denormalizer.task.DenormalizerConfig -import org.sunbird.obsrv.model.DatasetModels.{Dataset, DenormFieldConfig} +import org.sunbird.obsrv.model.DatasetModels.{Dataset, DenormFieldConfig, TransformationFunction} +import org.sunbird.obsrv.transformer.types.JSONAtaTransformer import redis.clients.jedis.{Pipeline, Response} import scala.collection.mutable @@ -75,7 +78,7 @@ class DenormCache(val config: DenormalizerConfig) { } private def extractField(fieldConfig: DenormFieldConfig, eventStr: String): DenormFieldStatus = { - val denormFieldNode = JSONUtil.getKey(fieldConfig.denormKey, eventStr) + val denormFieldNode = getDenormFieldValue(fieldConfig, eventStr) if (denormFieldNode.isMissingNode) { DenormFieldStatus("", success = false, Some(ErrorConstants.DENORM_KEY_MISSING)) } else { @@ -87,6 +90,16 @@ class DenormCache(val config: DenormalizerConfig) { } } + private def getDenormFieldValue(fieldConfig: DenormFieldConfig, eventStr: String): JsonNode = { + if(fieldConfig.denormKey.isDefined) { + JSONUtil.getKey(fieldConfig.denormKey.get, eventStr) + } else if(fieldConfig.jsonAtaExpr.isDefined) { + JSONAtaTransformer.evaluate(JSONUtil.getJsonNode(eventStr), TransformationFunction("jsonata", None, fieldConfig.jsonAtaExpr.get)) + } else { + MissingNode.getInstance() + } + } + private def getFromCache(pipeline: Pipeline, denormField: String, fieldConfig: DenormFieldConfig): Response[String] = { pipeline.select(fieldConfig.redisDB) pipeline.get(denormField) diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala index bd9658eb..89256390 100644 --- 
a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala +++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala @@ -71,7 +71,7 @@ class DenormalizerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"denorm_key":"dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';") + postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"jsonata_expr":"$$.dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';") val redisConnection = new RedisConnect(denormConfig.redisHost, denormConfig.redisPort, denormConfig.redisConnectionTimeout) redisConnection.getConnection(3).set("HYUN-CRE-D6", EventFixture.DENORM_DATA_1) redisConnection.getConnection(4).set("D123", EventFixture.DENORM_DATA_2) @@ -118,8 +118,8 @@ class DenormalizerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { val denormCache = new DenormCache(denormConfig) noException should be thrownBy { denormCache.open(Dataset(id = "d123", datasetType = "dataset", extractionConfig = None, dedupConfig = None, validationConfig = None, jsonSchema = None, - denormConfig = Some(DenormConfig(redisDBHost = "localhost", redisDBPort = redisPort, denormFields = List(DenormFieldConfig(denormKey = "vehicleCode", redisDB = 3, denormOutField = "vehicle_data")))), routerConfig = RouterConfig(""), - datasetConfig = DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest"), status = DatasetStatus.Live)) + denormConfig = Some(DenormConfig(redisDBHost = "localhost", redisDBPort = redisPort, denormFields = List(DenormFieldConfig(denormKey = Some("vehicleCode"), redisDB = 3, denormOutField = "vehicle_data", jsonAtaExpr = None)))), routerConfig = RouterConfig(""), + datasetConfig = DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), status = DatasetStatus.Live, "ingest")) } } diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala index 52d06e8b..e5bbaa24 100644 --- a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala +++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala @@ -70,7 +70,7 @@ class DenormalizerWindowStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test 
Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'v1', 'ingest', 'System', 'System', now(), now());") postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"denorm_key":"dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';") val redisConnection = new RedisConnect(denormConfig.redisHost, denormConfig.redisPort, denormConfig.redisConnectionTimeout) redisConnection.getConnection(3).set("HYUN-CRE-D6", EventFixture.DENORM_DATA_1) diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala deleted file mode 100644 
index b77e110a..00000000 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala +++ /dev/null @@ -1,72 +0,0 @@ -package org.sunbird.obsrv.router.task - -import com.typesafe.config.ConfigFactory -import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.api.java.typeutils.TypeExtractor -import org.apache.flink.api.java.utils.ParameterTool -import org.apache.flink.streaming.api.datastream.DataStream -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment -import org.apache.flink.streaming.api.scala.OutputTag -import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} -import org.sunbird.obsrv.core.util.FlinkUtil -import org.sunbird.obsrv.registry.DatasetRegistry -import org.sunbird.obsrv.router.functions.DruidRouterFunction - -import java.io.File -import scala.collection.mutable - -/** - * Druid Router stream task routes every event into its respective topic configured at dataset level - */ -// $COVERAGE-OFF$ Disabling scoverage as this stream task is deprecated -@Deprecated -class DruidRouterStreamTask(config: DruidRouterConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { - - private val serialVersionUID = 146697324640926024L - - def process(): Unit = { - implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) - val dataStream = getMapDataStream(env, config, kafkaConnector) - processStream(dataStream) - env.execute(config.jobName) - } - - override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { - - implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) - val datasets = DatasetRegistry.getAllDatasets(config.datasetType()) - - val routerStream = dataStream.process(new DruidRouterFunction(config)).name(config.druidRouterFunction).uid(config.druidRouterFunction) - .setParallelism(config.downstreamOperatorsParallelism) - datasets.map(dataset => { - routerStream.getSideOutput(OutputTag[mutable.Map[String, AnyRef]](dataset.routerConfig.topic)) - .sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](dataset.routerConfig.topic)) - .name(dataset.id + "-" + config.druidRouterProducer).uid(dataset.id + "-" + config.druidRouterProducer) - .setParallelism(config.downstreamOperatorsParallelism) - }) - - routerStream.getSideOutput(config.statsOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaStatsTopic)) - .name(config.processingStatsProducer).uid(config.processingStatsProducer).setParallelism(config.downstreamOperatorsParallelism) - - addDefaultSinks(routerStream, config, kafkaConnector) - routerStream.getSideOutput(config.successTag()) - - } -} -// $COVERAGE-ON$ -// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster -@Deprecated -object DruidRouterStreamTask { - - def main(args: Array[String]): Unit = { - val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) - val config = configFilePath.map { - path => ConfigFactory.parseFile(new File(path)).resolve() - }.getOrElse(ConfigFactory.load("druid-router.conf").withFallback(ConfigFactory.systemEnvironment())) - val druidRouterConfig = new DruidRouterConfig(config) - val kafkaUtil = new FlinkKafkaConnector(druidRouterConfig) - val task = new DruidRouterStreamTask(druidRouterConfig, kafkaUtil) - task.process() - } -} 
-// $COVERAGE-ON$ \ No newline at end of file diff --git a/pipeline/extractor/pom.xml b/pipeline/extractor/pom.xml index 4cc11c58..2ec8b803 100644 --- a/pipeline/extractor/pom.xml +++ b/pipeline/extractor/pom.xml @@ -110,6 +110,18 @@ 3.4.0 test + + io.zonky.test + embedded-postgres + 2.0.3 + test + + + io.github.embeddedkafka + embedded-kafka_2.12 + 3.4.0 + test + io.zonky.test embedded-postgres diff --git a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala index 0e79b08c..46c1e68c 100644 --- a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala +++ b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala @@ -27,7 +27,7 @@ class ExtractionFunction(config: ExtractorConfig) override def getMetricsList(): MetricsList = { val metrics = List(config.successEventCount, config.systemEventCount, config.eventFailedMetricsCount, config.failedExtractionCount, config.skippedExtractionCount, config.duplicateExtractionCount, config.totalEventCount, config.successExtractionCount) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + MetricsList(DatasetRegistry.getDataSetIds(), metrics) } override def open(parameters: Configuration): Unit = { diff --git a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala index 6ada824b..3574249a 100644 --- a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala +++ b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala @@ -140,7 +140,7 @@ class ExtractorStreamTestSpec extends BaseSpecWithDatasetRegistry { if(event.ctx.dataset.getOrElse("ALL").equals("ALL")) event.ctx.dataset_type should be(None) else - event.ctx.dataset_type should be(Some("dataset")) + event.ctx.dataset_type should be(Some("event")) }) //TODO: Add assertions for all 6 events diff --git a/pipeline/hudi-connector/pom.xml b/pipeline/hudi-connector/pom.xml index 5230d8eb..b47b58d3 100644 --- a/pipeline/hudi-connector/pom.xml +++ b/pipeline/hudi-connector/pom.xml @@ -124,6 +124,12 @@ + + org.scalatest + scalatest_2.12 + 3.0.6 + test + diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/function/RowDataConverterFunction.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/function/RowDataConverterFunction.scala new file mode 100644 index 00000000..4aadb60a --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/function/RowDataConverterFunction.scala @@ -0,0 +1,43 @@ +package org.sunbird.obsrv.function + +import org.apache.flink.api.common.functions.RichMapFunction +import org.apache.flink.configuration.Configuration +import org.apache.flink.formats.common.TimestampFormat +import org.apache.flink.formats.json.JsonToRowDataConverters +import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper +import org.sunbird.obsrv.util.{HudiSchemaParser, HudiSchemaSpec} +import org.apache.flink.table.data.RowData +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.util.{JSONUtil, Util} +import org.sunbird.obsrv.streaming.HudiConnectorConfig +import scala.collection.mutable.{Map => MMap} + +class RowDataConverterFunction(config: HudiConnectorConfig, datasetId: String) extends 
RichMapFunction[MMap[String, AnyRef], RowData] { + + var jsonToRowDataConverters: JsonToRowDataConverters = _ + var objectMapper: ObjectMapper = _ + var hudiSchemaParser: HudiSchemaParser = _ + + private val logger = LoggerFactory.getLogger(classOf[RowDataConverterFunction]) + + override def open(parameters: Configuration): Unit = { + super.open(parameters) + jsonToRowDataConverters = new JsonToRowDataConverters(false, true, TimestampFormat.SQL) + objectMapper = new ObjectMapper() + hudiSchemaParser = new HudiSchemaParser() + } + + override def map(event: MMap[String, AnyRef]): RowData = { + convertToRowData(event) + } + + def convertToRowData(data: MMap[String, AnyRef]): RowData = { + val eventJson = JSONUtil.serialize(data) + val flattenedData = hudiSchemaParser.parseJson(datasetId, eventJson) + val rowType = hudiSchemaParser.rowTypeMap(datasetId) + val converter: JsonToRowDataConverters.JsonToRowDataConverter = jsonToRowDataConverters.createRowConverter(rowType) + val rowData = converter.convert(objectMapper.readTree(JSONUtil.serialize(flattenedData))).asInstanceOf[RowData] + rowData + } + +} diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala index fd160820..3bde66bd 100644 --- a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala @@ -16,7 +16,7 @@ import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.model.Constants import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} import org.sunbird.obsrv.core.util.FlinkUtil -import org.sunbird.obsrv.functions.RowDataConverterFunction +import org.sunbird.obsrv.function.RowDataConverterFunction import org.sunbird.obsrv.registry.DatasetRegistry import org.sunbird.obsrv.util.HudiSchemaParser import org.apache.hudi.config.HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP diff --git a/pipeline/master-data-processor/pom.xml b/pipeline/master-data-processor/pom.xml index 0dc1cc60..f97287af 100644 --- a/pipeline/master-data-processor/pom.xml +++ b/pipeline/master-data-processor/pom.xml @@ -64,7 +64,7 @@ org.sunbird.obsrv.pipeline - druid-router + dataset-router 1.0.0 diff --git a/pipeline/master-data-processor/src/main/resources/master-data-processor.conf b/pipeline/master-data-processor/src/main/resources/master-data-processor.conf index 149e795b..0d3f2d89 100644 --- a/pipeline/master-data-processor/src/main/resources/master-data-processor.conf +++ b/pipeline/master-data-processor/src/main/resources/master-data-processor.conf @@ -5,16 +5,16 @@ kafka { output.raw.topic = ${job.env}".masterdata.raw" output.extractor.duplicate.topic = ${job.env}".masterdata.failed" output.failed.topic = ${job.env}".masterdata.failed" - output.batch.failed.topic = ${job.env}".masterdata.extractor.failed" + output.batch.failed.topic = ${job.env}".masterdata.failed" event.max.size = "1048576" # Max is only 1MB output.invalid.topic = ${job.env}".masterdata.failed" output.unique.topic = ${job.env}".masterdata.unique" output.duplicate.topic = ${job.env}".masterdata.failed" output.denorm.topic = ${job.env}".masterdata.denorm" output.transform.topic = ${job.env}".masterdata.transform" + output.transform.failed.topic = ${job.env}".masterdata.transform.failed" stats.topic = ${job.env}".masterdata.stats" groupId = 
${job.env}"-masterdata-pipeline-group" - producer { max-request-size = 5242880 } @@ -36,4 +36,4 @@ redis { } } -dataset.type = "master-dataset" \ No newline at end of file +dataset.type = "master-dataset" diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala index a7ca7471..dcf96a0f 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala @@ -9,6 +9,7 @@ import org.sunbird.obsrv.core.model.{ErrorConstants, FunctionalError, Producer} import org.sunbird.obsrv.core.streaming.Metrics import org.sunbird.obsrv.core.util.JSONUtil import org.sunbird.obsrv.model.DatasetModels.Dataset +import org.sunbird.obsrv.model.DatasetType import org.sunbird.obsrv.pipeline.task.MasterDataProcessorConfig import org.sunbird.obsrv.pipeline.util.MasterDataCache import org.sunbird.obsrv.registry.DatasetRegistry @@ -24,7 +25,7 @@ class MasterDataProcessorFunction(config: MasterDataProcessorConfig) extends Bas override def open(parameters: Configuration): Unit = { super.open(parameters) masterDataCache = new MasterDataCache(config) - masterDataCache.open(DatasetRegistry.getAllDatasets(config.datasetType())) + masterDataCache.open(DatasetRegistry.getAllDatasets(Some(DatasetType.master.toString))) } override def close(): Unit = { @@ -37,13 +38,13 @@ class MasterDataProcessorFunction(config: MasterDataProcessorConfig) extends Bas } override def processWindow(dataset: Dataset, context: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context, elements: List[mutable.Map[String, AnyRef]], metrics: Metrics): Unit = { - + Console.println("dataset.id", dataset.id, dataset.datasetConfig.cacheConfig) metrics.incCounter(dataset.id, config.totalEventCount, elements.size.toLong) masterDataCache.open(dataset) val eventsMap = elements.map(msg => { val event = JSONUtil.serialize(msg(config.CONST_EVENT)) val json = parse(event, useBigIntForLong = false) - val node = JSONUtil.getKey(dataset.datasetConfig.key, event) + val node = JSONUtil.getKey(dataset.datasetConfig.keysConfig.dataKey.get, event) if (node.isMissingNode) { markFailure(Some(dataset.id), msg, context, metrics, ErrorConstants.MISSING_DATASET_CONFIG_KEY, Producer.masterdataprocessor, FunctionalError.MissingMasterDatasetKey, datasetType = Some(dataset.datasetType)) } diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala index 65847bbd..b5cfebef 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala @@ -12,7 +12,7 @@ import org.sunbird.obsrv.extractor.task.{ExtractorConfig, ExtractorStreamTask} import org.sunbird.obsrv.pipeline.function.MasterDataProcessorFunction import org.sunbird.obsrv.preprocessor.task.{PipelinePreprocessorConfig, PipelinePreprocessorStreamTask} import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask} -import 
org.sunbird.obsrv.router.task.{DruidRouterConfig, DynamicRouterStreamTask} +import org.sunbird.obsrv.router.task.{DynamicRouterConfig, DynamicRouterStreamTask} import java.io.File import scala.collection.mutable @@ -51,7 +51,7 @@ class MasterDataProcessorStreamTask(config: Config, masterDataConfig: MasterData val preprocessorTask = new PipelinePreprocessorStreamTask(new PipelinePreprocessorConfig(config), kafkaConnector) val denormalizerTask = new DenormalizerStreamTask(new DenormalizerConfig(config), kafkaConnector) val transformerTask = new TransformerStreamTask(new TransformerConfig(config), kafkaConnector) - val routerTask = new DynamicRouterStreamTask(new DruidRouterConfig(config), kafkaConnector) + val routerTask = new DynamicRouterStreamTask(new DynamicRouterConfig(config), kafkaConnector) val transformedStream = transformerTask.processStream( denormalizerTask.processStream( diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala index e07f4399..930595d3 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala @@ -20,15 +20,15 @@ class MasterDataCache(val config: MasterDataProcessorConfig) { } def open(datasets: List[Dataset]): Unit = { - datasets.map(dataset => { + datasets.foreach(dataset => { open(dataset) }) } def open(dataset: Dataset): Unit = { if (!datasetPipelineMap.contains(dataset.id)) { - val datasetConfig = dataset.datasetConfig - val redisConnect = new RedisConnect(datasetConfig.redisDBHost.get, datasetConfig.redisDBPort.get, config.redisConnectionTimeout) + val redisConfig = dataset.datasetConfig.cacheConfig.get + val redisConnect = new RedisConnect(redisConfig.redisDBHost.get, redisConfig.redisDBPort.get, config.redisConnectionTimeout) val pipeline: Pipeline = redisConnect.getConnection(0).pipelined() datasetPipelineMap.put(dataset.id, pipeline) } @@ -37,7 +37,7 @@ class MasterDataCache(val config: MasterDataProcessorConfig) { def process(dataset: Dataset, eventMap: Map[String, JValue]): (Int, Int) = { val pipeline = this.datasetPipelineMap(dataset.id) val dataFromCache = getDataFromCache(dataset, eventMap.keySet, pipeline) - val insertCount = dataFromCache.filter(f => f._2 == null).size + val insertCount = dataFromCache.count(f => f._2 == null) val updCount = dataFromCache.size - insertCount updateCache(dataset, dataFromCache, eventMap, pipeline) (insertCount, updCount) @@ -45,7 +45,7 @@ class MasterDataCache(val config: MasterDataProcessorConfig) { private def getDataFromCache(dataset: Dataset, keys: Set[String], pipeline: Pipeline): mutable.Map[String, String] = { pipeline.clear() - pipeline.select(dataset.datasetConfig.redisDB.get) + pipeline.select(dataset.datasetConfig.cacheConfig.get.redisDB.get) val responses: mutable.Map[String, Response[String]] = mutable.Map[String, Response[String]]() keys.foreach(key => { responses.put(key, pipeline.get(key)) @@ -56,7 +56,7 @@ class MasterDataCache(val config: MasterDataProcessorConfig) { private def updateCache(dataset: Dataset, dataFromCache: mutable.Map[String, String], eventMap: Map[String, JValue], pipeline: Pipeline): Unit = { pipeline.clear() - pipeline.select(dataset.datasetConfig.redisDB.get) + pipeline.select(dataset.datasetConfig.cacheConfig.get.redisDB.get) eventMap.foreach(f => { val key = f._1 
val newJson = f._2 diff --git a/pipeline/master-data-processor/src/test/resources/test.conf b/pipeline/master-data-processor/src/test/resources/test.conf index 2c8f0236..3533006c 100644 --- a/pipeline/master-data-processor/src/test/resources/test.conf +++ b/pipeline/master-data-processor/src/test/resources/test.conf @@ -5,6 +5,7 @@ job { } kafka { + input.topic = ${job.env}".masterdata.ingest" output.raw.topic = ${job.env}".masterdata.raw" output.extractor.duplicate.topic = ${job.env}".masterdata.failed" @@ -16,6 +17,7 @@ kafka { output.duplicate.topic = ${job.env}".masterdata.failed" output.denorm.topic = ${job.env}".masterdata.denorm" output.transform.topic = ${job.env}".masterdata.transform" + output.transform.failed.topic = ${job.env}".masterdata.transform.failed" stats.topic = ${job.env}".masterdata.stats" groupId = ${job.env}"-masterdata-pipeline-group" producer { @@ -24,7 +26,7 @@ kafka { } task { - window.time.in.seconds = 5 + window.time.in.seconds = 2 window.count = 2 window.shards = 1400 consumer.parallelism = 1 @@ -40,4 +42,4 @@ redis { } } -dataset.type = "master-dataset" \ No newline at end of file +dataset.type = "master" diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala index e48f8120..cb5ece83 100644 --- a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala +++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala @@ -2,9 +2,8 @@ package org.sunbird.obsrv.fixture object EventFixture { - val VALID_BATCH_EVENT_D3_INSERT = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel"}]}""" - val VALID_BATCH_EVENT_D3_INSERT_2 = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel"}]}""" - val VALID_BATCH_EVENT_D3_UPDATE = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","safety":"3 Star (Global NCAP)","seatingCapacity":5}]}""" + val VALID_BATCH_EVENT_D3_INSERT = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}]}""" + val VALID_BATCH_EVENT_D3_INSERT_2 = """{"dataset":"d3","id":"event2","events":[{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"}}]}""" + val VALID_BATCH_EVENT_D3_UPDATE = """{"dataset":"d3","id":"event3","events":[{"code":"HYUN-CRE-D6","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"safety":"3 Star (Global NCAP)","seatingCapacity":5}]}""" val VALID_BATCH_EVENT_D4 = """{"dataset":"d4","event":{"code":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 
AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}""" - val MISSING_DATA_KEY_EVENT_D4 = """{"dataset":"d5","event":{"code1":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}""" } diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala index 575e2228..07c69965 100644 --- a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala +++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala @@ -64,9 +64,10 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("insert into datasets(id, type, extraction_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d3', 'master-dataset', '{\"is_batch_event\": true, \"extraction_key\": \"events\"}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata.ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":3}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d4', 'master-dataset', '{\"topic\":\"d4-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata-ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":4}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d5', 'master-dataset', '{\"topic\":\"d4-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata-ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":4}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, extraction_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('d3', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"is_batch_event\":true,\"extraction_key\":\"events\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"}, 
\"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":3,\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+"}', 'Live', 'v1', 'local.masterdata.ingest', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('d4', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"},\"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"seatingCapacity\": {\"type\": \"integer\"}, \"safety\": {\"type\": \"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d34-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":4,\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+"}', 'Live', 'v1', 'local.masterdata.ingest', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO dataset_transformations (id, dataset_id, field_key, transformation_function, created_by, updated_by, created_date, updated_date) VALUES ('tf3', 'd3', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO dataset_transformations (id, dataset_id, field_key, transformation_function, created_by, updated_by, created_date, updated_date) VALUES ('tf4', 'd3', 'dealer.locationId', '{\"type\":\"encrypt\",\"expr\":\"dealer.locationId\"}', 'System', 'System', now(), now());") } override def afterAll(): Unit = { @@ -90,18 +91,19 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry } "MasterDataProcessorStreamTaskTestSpec" should "validate the entire master data pipeline" in { - + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(masterDataConfig) val task = new MasterDataProcessorStreamTask(config, masterDataConfig, kafkaConnector) task.process(env) Future { env.execute(masterDataConfig.jobName) + Thread.sleep(5000) } - val sysEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 8, timeout = 30.seconds) - sysEvents.size should be(8) + val input = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 7, timeout = 30.seconds) + input.size should be (7) - sysEvents.foreach(se => { + input.foreach(se => { val event = JSONUtil.deserialize[SystemEvent](se) val error = event.data.error if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) 
@@ -115,12 +117,9 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry } } else - event.ctx.dataset_type should be(Some("master-dataset")) + event.ctx.dataset_type should be(Some("master")) }) - val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](masterDataConfig.kafkaFailedTopic, 1, timeout = 30.seconds) - failedEvents.size should be(1) - val mutableMetricsMap = mutable.Map[String, Long](); BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) Console.println("### MasterDataProcessorStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) @@ -143,16 +142,15 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry val redisConnection = new RedisConnect(masterDataConfig.redisHost, masterDataConfig.redisPort, masterDataConfig.redisConnectionTimeout) val jedis1 = redisConnection.getConnection(3) val event1 = jedis1.get("HYUN-CRE-D6") - event1 should be ("""{"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic","seatingCapacity":5,"safety":"3 Star (Global NCAP)"}""") + event1 should be ("""{"dealer":{"email":"jo*****e@example.com","locationId":"ym4iT6lWXt+Y2gEdBldeiw=="},"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic","seatingCapacity":5,"safety":"3 Star (Global NCAP)"}""") val event3 = jedis1.get("HYUN-TUC-D6") - event3 should be ("""{"model":"Tucson","price":"4000000","variant":"Signature","fuel":"Diesel","code":"HYUN-TUC-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"}""") + event3 should be ("""{"dealer":{"email":"ad*******n@gmail.com","locationId":"kJ7mH49gjWHeoM1w+ex9kQ=="},"model":"Tucson","price":"4000000","variant":"Signature","fuel":"Diesel","code":"HYUN-TUC-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"}""") jedis1.close() val jedis2 = redisConnection.getConnection(4) val event2 = jedis2.get("JEEP-CP-D3") event2 should be ("""{"model":"Compass","price":"3800000","variant":"Model S (O) Diesel 4x4 AT","fuel":"Diesel","seatingCapacity":5,"code":"JEEP-CP-D3","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Jeep","safety":"5 Star (Euro NCAP)","modelYear":"2023","transmission":"automatic"}""") jedis2.close() - } diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 220ebff4..d2128647 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -20,10 +20,11 @@ preprocessor denormalizer transformer - druid-router - pipeline-merged + dataset-router + unified-pipeline master-data-processor hudi-connector + cache-indexer diff --git a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala index 93cfefef..f4f34789 100644 --- a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala +++ b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala @@ -32,7 +32,7 @@ class EventValidationFunction(config: 
PipelinePreprocessorConfig)(implicit val e override def open(parameters: Configuration): Unit = { super.open(parameters) schemaValidator = new SchemaValidator() - schemaValidator.loadDataSchemas(DatasetRegistry.getAllDatasets(config.datasetType())) + schemaValidator.loadDataSchemas(DatasetRegistry.getAllDatasets(None)) } override def close(): Unit = { diff --git a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala index d111543b..226d87ec 100644 --- a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala +++ b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala @@ -75,12 +75,12 @@ class PipelinePreprocessorStreamTestSpec extends BaseSpecWithDatasetRegistry { private def prepareTestData(): Unit = { val postgresConnect = new PostgresConnect(postgresConfig) - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Draft', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d4', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d5', 'dataset', '" + 
"""{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"IgnoreNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d6', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string","maxLength":5},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"DiscardNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d7', 'dataset', '"+EventFixtures.INVALID_SCHEMA+"', '{\"validate\": true, \"mode\": \"Strict\"}','{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d8', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": false, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", 
\"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Draft', 'v1', 'ingest', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d4', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d5', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": 
true, \"mode\": \"IgnoreNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d6', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string","maxLength":5},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"DiscardNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d7', 'dataset', '"+EventFixtures.INVALID_SCHEMA+"', '{\"validate\": true, \"mode\": \"Strict\"}','{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d8', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": false, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'v1', 'ingest', 'System', 'System', now(), now());") postgresConnect.closeConnection() } diff --git 
a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala index 0ba13d65..c05c185d 100644 --- a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala +++ b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala @@ -3,7 +3,7 @@ package org.sunbird.obsrv.preprocessor import com.typesafe.config.{Config, ConfigFactory} import org.scalatest.{FlatSpec, Matchers} import org.sunbird.obsrv.core.util.JSONUtil -import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig, RouterConfig} +import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig, IndexingConfig, KeysConfig, RouterConfig} import org.sunbird.obsrv.model.DatasetStatus import org.sunbird.obsrv.preprocessor.fixture.EventFixtures import org.sunbird.obsrv.preprocessor.task.PipelinePreprocessorConfig @@ -17,7 +17,7 @@ class TestSchemaValidator extends FlatSpec with Matchers { "SchemaValidator" should "return a success report for a valid event" in { - val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") schemaValidator.loadDataSchema(dataset) val event = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.VALID_SCHEMA_EVENT) @@ -27,7 +27,7 @@ class TestSchemaValidator extends FlatSpec with Matchers { it should "return a failed validation report for a invalid event" in { - val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") schemaValidator.loadDataSchema(dataset) val event1 = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.INVALID_SCHEMA_EVENT) @@ -37,7 +37,7 @@ class TestSchemaValidator extends FlatSpec with Matchers { assert(messages1.size == 1) messages1.head.message should be("object has missing required properties ([\"vehicleCode\"])") messages1.head.keyword should be("required") - messages1.head.missing.get.head should be ("vehicleCode") + messages1.head.missing.get.head should be("vehicleCode") val event2 = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.INVALID_SCHEMA_EVENT2) val report2 = schemaValidator.validate("d1", event2) @@ -51,7 +51,7 @@ class TestSchemaValidator extends FlatSpec with Matchers { f.instance.pointer should be("/id") case "array" => f.message should be("instance type (array) does not match any allowed primitive type (allowed: [\"string\"])") - f.instance.pointer should be ("/vehicleCode") + f.instance.pointer should be("/vehicleCode") } }) @@ -65,7 +65,7 @@ class TestSchemaValidator extends FlatSpec with Matchers { case "type" => f.message should be("instance type (integer) does not match any allowed primitive type (allowed: 
[\"string\"])") f.instance.pointer should be("/id") - f.found.get should be ("integer") + f.found.get should be("integer") f.expected.get.head should be("string") case "additionalProperties" => f.message should be("object instance has properties which are not allowed by the schema: [\"deliveriesRejected\"]") @@ -76,24 +76,24 @@ class TestSchemaValidator extends FlatSpec with Matchers { } it should "validate the negative and missing scenarios" in { - val dataset = Dataset("d4", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA_JSON), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + val dataset = Dataset("d4", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA_JSON), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") schemaValidator.loadDataSchema(dataset) - schemaValidator.schemaFileExists(dataset) should be (false) + schemaValidator.schemaFileExists(dataset) should be(false) schemaValidator.loadDataSchema(dataset) schemaValidator.schemaFileExists(dataset) should be(false) - val dataset2 = Dataset("d5", "dataset", None, None, None, None, None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + val dataset2 = Dataset("d5", "dataset", None, None, None, None, None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") schemaValidator.loadDataSchemas(List[Dataset](dataset2)) - schemaValidator.schemaFileExists(dataset2) should be (false) + schemaValidator.schemaFileExists(dataset2) should be(false) - val dataset3 = Dataset("d6", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id", "date", "ingest"), DatasetStatus.Live) + val dataset3 = Dataset("d6", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") schemaValidator.loadDataSchemas(List[Dataset](dataset3)) schemaValidator.schemaFileExists(dataset3) should be(false) - val dataset4 = Dataset("d7", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id", "date", "ingest"), DatasetStatus.Live) - schemaValidator.schemaFileExists(dataset4) should be (false) + val dataset4 = Dataset("d7", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") + schemaValidator.schemaFileExists(dataset4) should be(false) } } diff --git a/pipeline/transformer/pom.xml b/pipeline/transformer/pom.xml index b695a812..959e549e 100644 --- a/pipeline/transformer/pom.xml +++ b/pipeline/transformer/pom.xml @@ -41,6 +41,11 @@ dataset-registry 1.0.0 + + org.sunbird.obsrv + transformation-sdk + 1.0.0 + org.sunbird.obsrv framework @@ -48,6 +53,25 @@ test-jar test + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + + + org.apache.kafka + kafka-clients + ${kafka.version} + test + + + org.apache.kafka + kafka_${scala.maj.version} + 
${kafka.version} + test + org.apache.flink flink-test-utils @@ -61,6 +85,18 @@ test tests + + io.github.embeddedkafka + embedded-kafka_2.12 + 3.4.0 + test + + + io.zonky.test + embedded-postgres + 2.0.3 + test + com.github.codemonstur embedded-redis @@ -143,7 +179,7 @@ - reference.conf + transformer.conf diff --git a/pipeline/transformer/src/main/resources/transformer.conf b/pipeline/transformer/src/main/resources/transformer.conf index b7adb850..42fbb22f 100644 --- a/pipeline/transformer/src/main/resources/transformer.conf +++ b/pipeline/transformer/src/main/resources/transformer.conf @@ -3,6 +3,7 @@ include "baseconfig.conf" kafka { input.topic = ${job.env}".denorm" output.transform.topic = ${job.env}".transform" + output.transform.failed.topic = ${job.env}".transform.failed" groupId = ${job.env}"-transformer-group" producer { max-request-size = 5242880 diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala index fb0da96c..94a8c80f 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala @@ -1,41 +1,142 @@ package org.sunbird.obsrv.transformer.functions -import org.apache.flink.api.common.typeinfo.TypeInformation +import com.fasterxml.jackson.databind.ObjectMapper +import org.sunbird.obsrv.transformer.task.TransformerConfig +import org.sunbird.obsrv.transformer.types._ import org.apache.flink.streaming.api.functions.ProcessFunction -import org.sunbird.obsrv.core.model.Producer +import org.json4s._ +import org.json4s.native.JsonMethods._ +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.exception.ObsrvException +import org.sunbird.obsrv.core.model.Models._ +import org.sunbird.obsrv.core.model.StatusCode.StatusCode +import org.sunbird.obsrv.core.model._ import org.sunbird.obsrv.core.streaming.Metrics -import org.sunbird.obsrv.model.DatasetModels.Dataset +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetTransformation} +import org.sunbird.obsrv.model.TransformMode import org.sunbird.obsrv.registry.DatasetRegistry import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction -import org.sunbird.obsrv.transformer.task.TransformerConfig import scala.collection.mutable -class TransformerFunction(config: TransformerConfig)(implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]]) - extends BaseDatasetProcessFunction(config) { +case class TransformationStatus(resultJson: JValue, status: StatusCode, fieldStatus: List[TransformFieldStatus]) + +class TransformerFunction(config: TransformerConfig) extends BaseDatasetProcessFunction(config) { + + private[this] val logger = LoggerFactory.getLogger(classOf[TransformerFunction]) override def getMetrics(): List[String] = { - List(config.totalEventCount, config.transformSuccessCount, config.transformFailedCount, config.transformSkippedCount) + List(config.totalEventCount, config.transformSuccessCount, config.transformPartialCount, config.transformFailedCount, config.transformSkippedCount) } - /** * Method to process the event transformations */ - override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], - context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, - metrics: Metrics): Unit = { 
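// The reworked processElement below hands the event to TransformerFunctionHelper.processTransformation
// and routes on the returned TransformationStatus: success and skipped events go to transformerOutputTag
// marked accordingly; partial events go there too but, like failed events, also emit a METRIC system
// event via logSystemEvents; failed events alone are routed to the new transformerFailedOutputTag with
// ErrorConstants.ERR_TRANSFORMATION_FAILED. Each branch increments its matching metric counter.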
+ override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { + implicit val jsonFormats: Formats = DefaultFormats.withLong + val result = TransformerFunctionHelper.processTransformation(dataset, msg, config) metrics.incCounter(dataset.id, config.totalEventCount) + msg.put(config.CONST_EVENT, result.resultJson.extract[Map[String, AnyRef]]) + result.status match { + case StatusCode.skipped => + metrics.incCounter(dataset.id, config.transformSkippedCount) + context.output(config.transformerOutputTag, markSkipped(msg, Producer.transformer)) + case StatusCode.failed => + metrics.incCounter(dataset.id, config.transformFailedCount) + context.output(config.transformerFailedOutputTag, markFailed(msg, ErrorConstants.ERR_TRANSFORMATION_FAILED, Producer.transformer)) + logSystemEvents(dataset, result, context) + case StatusCode.partial => + metrics.incCounter(dataset.id, config.transformPartialCount) + context.output(config.transformerOutputTag, markPartial(msg, Producer.transformer)) + logSystemEvents(dataset, result, context) + case StatusCode.success => + metrics.incCounter(dataset.id, config.transformSuccessCount) + context.output(config.transformerOutputTag, markSuccess(msg, Producer.transformer)) + } + } + + private def logSystemEvents(dataset: Dataset, result: TransformationStatus, ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context): Unit = { + result.fieldStatus.filter(p => !p.success).groupBy(f => f.error.get).map(f => (f._1, f._2.size)) + .foreach(errCount => { + val err = errCount._1 + val functionalError = err match { + case ErrorConstants.INVALID_EXPR_FUNCTION => FunctionalError.TransformParseError + case ErrorConstants.ERR_EVAL_EXPR_FUNCTION => FunctionalError.TransformEvalError + case ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION => FunctionalError.TransformFailedError + case ErrorConstants.TRANSFORMATION_FIELD_MISSING => FunctionalError.TransformFieldMissing + } + + ctx.output(config.systemEventsOutputTag, JSONUtil.serialize(SystemEvent( + EventID.METRIC, + ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(Producer.denorm)), dataset = Some(dataset.id), dataset_type = Some(dataset.datasetType)), + data = EData(error = Some(ErrorLog(pdata_id = Producer.denorm, pdata_status = StatusCode.failed, error_type = functionalError, error_code = err.errorCode, error_message = err.errorMsg, error_level = ErrorLevel.critical, error_count = Some(errCount._2)))) + ))) + }) + + logger.warn(s"Transformer | Transform operation is not successful | dataset=${dataset.id} | TransformStatusData=${JSONUtil.serialize(result.fieldStatus)}") + } + +} + +object TransformerFunctionHelper { + + implicit val jsonFormats: Formats = DefaultFormats.withLong + private val mapper = new ObjectMapper() + + implicit class JsonHelper(json: JValue) { + def customExtract[T](path: String)(implicit mf: Manifest[T]): T = { + path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T] + } + } + + @throws[ObsrvException] + def processTransformation(dataset: Dataset, msg: mutable.Map[String, AnyRef], config: TransformerConfig): TransformationStatus = { + + val event = JSONUtil.serialize(msg(config.CONST_EVENT)) + val json = parse(event, useBigIntForLong = false) val datasetTransformations = DatasetRegistry.getDatasetTransformations(dataset.id) + processTransformations(json, 
datasetTransformations) + } + + def processTransformations(json: JValue, datasetTransformations: Option[List[DatasetTransformation]]): TransformationStatus = { if (datasetTransformations.isDefined) { - // TODO: Perform transformations - metrics.incCounter(dataset.id, config.transformSuccessCount) - context.output(config.transformerOutputTag, markSuccess(msg, Producer.transformer)) + val result = applyTransformations(json, datasetTransformations.get) + TransformationStatus(json merge result.json, getStatus(result.fieldStatus), result.fieldStatus) } else { - metrics.incCounter(dataset.id, config.transformSkippedCount) - context.output(config.transformerOutputTag, markSkipped(msg, Producer.transformer)) + TransformationStatus(json, StatusCode.skipped, List[TransformFieldStatus]()) } } + private def getStatus(fieldStatus: List[TransformFieldStatus]): StatusCode = { + val failedCount = fieldStatus.count(p => p.mode == TransformMode.Strict && !p.success) + val partialCount = fieldStatus.count(p => p.mode == TransformMode.Lenient && !p.success) + if (failedCount > 0) StatusCode.failed else if (partialCount > 0) StatusCode.partial else StatusCode.success + + } + + private def applyTransformations(json: JValue, datasetTransformations: List[DatasetTransformation]): TransformationResult = { + datasetTransformations.groupBy(f => f.transformationFunction.`type`).mapValues(f => { + applyTransformation(f.head.transformationFunction.`type`, json, f) + }).values.reduceLeft((a, b) => TransformationResult(mergeJson(a, b), mergeStatus(a, b))) + } + + private def mergeJson(a: TransformationResult, b: TransformationResult): JValue = { + a.json merge b.json + } + + private def mergeStatus(a: TransformationResult, b: TransformationResult): List[TransformFieldStatus] = { + a.fieldStatus ++ b.fieldStatus + } + + private def applyTransformation(tfType: String, json: JValue, dt: List[DatasetTransformation]): TransformationResult = { + val jsonNode = mapper.readTree(compact(render(json))) + tfType match { + case "mask" => MaskTransformer.transform(json, jsonNode, dt) + case "jsonata" => JSONAtaTransformer.transform(json, jsonNode, dt) + case "encrypt" => EncryptTransformer.transform(json, jsonNode, dt) + case _ => TransformationResult(json, List[TransformFieldStatus]()) + } + } } \ No newline at end of file diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala index 797b3e56..c943702d 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala @@ -17,16 +17,22 @@ class TransformerConfig(override val config: Config) extends BaseJobConfig[mutab // Metric List val totalEventCount = "transform-total-count" val transformSuccessCount = "transform-success-count" + val transformPartialCount = "transform-partial-count" val transformFailedCount = "transform-failed-count" val transformSkippedCount = "transform-skipped-count" + private val kafkaInputTopic: String = config.getString("kafka.input.topic") val kafkaTransformTopic: String = config.getString("kafka.output.transform.topic") + val kafkaTransformFailedTopic: String = config.getString("kafka.output.transform.failed.topic") val transformerFunction = "transformer-function" val transformerProducer = "transformer-producer" + val transformerFailedProducer = 
"transformer-failed-producer" - private val TRANSFORMER_OUTPUT_TAG = "transformed-events" - val transformerOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](TRANSFORMER_OUTPUT_TAG) + private val TRANSFORMER_EVENTS = "transformed-events" + private val TRANSFORMER_FAILED_EVENTS = "transformed_failed-events" + val transformerOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](TRANSFORMER_EVENTS) + val transformerFailedOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](TRANSFORMER_FAILED_EVENTS) override def inputTopic(): String = config.getString("kafka.input.topic") diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala index 71e86581..eee8cee2 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala @@ -1,8 +1,6 @@ package org.sunbird.obsrv.transformer.task import com.typesafe.config.ConfigFactory -import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.api.java.typeutils.TypeExtractor import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.streaming.api.datastream.DataStream import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment @@ -13,47 +11,48 @@ import org.sunbird.obsrv.transformer.functions.TransformerFunction import java.io.File import scala.collection.mutable -/** - * - */ class TransformerStreamTask(config: TransformerConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { private val serialVersionUID = -7729362727131516112L - implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster def process(): Unit = { - implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) - val dataStream = getMapDataStream(env, config, kafkaConnector) - processStream(dataStream) + process(env) env.execute(config.jobName) } // $COVERAGE-ON$ + def process(env: StreamExecutionEnvironment): Unit = { + val dataStream = getMapDataStream(env, config, kafkaConnector) + processStream(dataStream) + } + override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { + val transformedStream = dataStream.process(new TransformerFunction(config)).name(config.transformerFunction).uid(config.transformerFunction) .setParallelism(config.downstreamOperatorsParallelism) transformedStream.getSideOutput(config.transformerOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaTransformTopic)) .name(config.transformerProducer).uid(config.transformerProducer).setParallelism(config.downstreamOperatorsParallelism) + transformedStream.getSideOutput(config.transformerFailedOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaTransformFailedTopic)) + .name(config.transformerFailedProducer).uid(config.transformerFailedProducer).setParallelism(config.downstreamOperatorsParallelism) addDefaultSinks(transformedStream, config, kafkaConnector) - 
transformedStream.getSideOutput(config.successTag()) + transformedStream.getSideOutput(config.transformerOutputTag) } } // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster object TransformerStreamTask { - def main(args: Array[String]): Unit = { val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) val config = configFilePath.map { path => ConfigFactory.parseFile(new File(path)).resolve() }.getOrElse(ConfigFactory.load("transformer.conf").withFallback(ConfigFactory.systemEnvironment())) - val extractorConfig = new TransformerConfig(config) - val kafkaUtil = new FlinkKafkaConnector(extractorConfig) - val task = new TransformerStreamTask(extractorConfig, kafkaUtil) + val transformerConfig = new TransformerConfig(config) + val kafkaUtil = new FlinkKafkaConnector(transformerConfig) + val task = new TransformerStreamTask(transformerConfig, kafkaUtil) task.process() } } diff --git a/pipeline/transformer/src/test/resources/test.conf b/pipeline/transformer/src/test/resources/test.conf index f1091415..1098ba64 100644 --- a/pipeline/transformer/src/test/resources/test.conf +++ b/pipeline/transformer/src/test/resources/test.conf @@ -1,8 +1,11 @@ include "base-test.conf" kafka { + producer.broker-servers = "localhost:9093" + consumer.broker-servers = "localhost:9093" input.topic = "flink.denorm" output.transform.topic = "flink.transform" + output.transform.failed.topic = "flink.transform.failed" groupId = "flink-transformer-group" producer { max-request-size = 5242880 diff --git a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/EventFixture.scala b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/EventFixture.scala new file mode 100644 index 00000000..a4f48246 --- /dev/null +++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/EventFixture.scala @@ -0,0 +1,11 @@ +package org.sunbird.obsrv.transformer + +object EventFixture { + + val SUCCESS_TRANSFORM = """{"dataset":"d1","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val FAILED_TRANSFORM = """{"dataset":"d1","event":{"id":"1235","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val PARTIAL_TRANSFORM = """{"dataset":"d2","event":{"id":"1235","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val SKIPPED_TRANSFORM = """{"dataset":"d3","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val FAILED_TRANSFORM_2 = """{"dataset":"d4","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + +} \ No newline at end of file diff --git a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala 
b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala new file mode 100644 index 00000000..13bd1b40 --- /dev/null +++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala @@ -0,0 +1,211 @@ +package org.sunbird.obsrv.transformer + +import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper} +import org.json4s._ +import org.json4s.native.JsonMethods._ +import org.scalatest.Matchers +import org.sunbird.obsrv.core.model.{ErrorConstants, StatusCode} +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.{Condition, DatasetTransformation, TransformationFunction} +import org.sunbird.obsrv.model.TransformMode +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry +import org.sunbird.obsrv.transformer.functions.TransformerFunctionHelper +import org.sunbird.obsrv.transformer.util.{CipherUtil, ConditionEvaluator} +import org.sunbird.obsrv.transformer.types._ + +class TestTransformerFunctionHelper extends BaseSpecWithDatasetRegistry with Matchers { + + implicit val jsonFormats: DefaultFormats.type = DefaultFormats + + implicit class JsonHelper(json: JValue) { + def customExtract[T](path: String)(implicit mf: Manifest[T]): T = { + path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T] + } + } + + val jsonStr = """{"obsCode":"M_BATTERY_CHARGE","accountEmail":"firstname.lastname@gmail.com","accountPhone":"123456","codeComponents":[{"componentCode":"CC_METADATA_DEVICE_FIRMWARE_VER","componentType":"METADATA_DEVICE","selector":"FIRMWARE_VERSION","value":"2.3"}],"phenTime":"2022-06-17T07:12:02Z","valueUoM":"prcnt","value":"100","id":"df4c7aa4-65df-4463-b92a-7a29835f9c4d","parentCollectionRef":"41e9b7a4-5b6f-11ed-8fd5-a6a5696c2aaa","created":"2022-11-03T12:01:32Z","modified":1667476892000,"integrationAccountRef":"zzz11120-f0c8-4064-8d00-a73e58939ce0_mtgc203d-2478-4679-a0ef-d736a7a406fd","assetRef":"9422f7ac-c6e9-5c72-b605-5a7655863866","assetRef2":"","assetRef4":123124,"testBool":false,"contextItems":[{"code":"SYN_SYSTEM","value":"VALENCO"}],"status":"ACTIVE","xMin":3.356701,"xMax":3.356701,"yMin":51.01653,"yMax":51.01653,"spatialExtent":"{\"type\": \"Point\", \"coordinates\": [3.356701, 51.016530]}","phenEndTime":"2022-06-17T07:12:02Z","value_double_type":100.0}""" + val mapper = new ObjectMapper() + val jsonNode: JsonNode = mapper.readTree(jsonStr) + + "TransformerFunctionHelper" should "mask the events for the given transformation config" in { + + val json = parse(jsonStr) + val dtList = Option(List( + DatasetTransformation("tf1", "obs2.0", "spatialExtent", TransformationFunction("mask", None, "spatialExtent")), + DatasetTransformation("tf1", "obs2.0", "assetRef", TransformationFunction("mask", None, "assetRef")), + DatasetTransformation("tf1", "obs2.0", "accountEmail", TransformationFunction("mask", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "accountEmail")), + DatasetTransformation("tf1", "obs2.0", "accountPhone2", TransformationFunction("mask", None, "accountPhone")), + DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponents)")), + DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "$number(value)")), + 
DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]")), + DatasetTransformation("tf1", "obs2.0", "optionalValue", TransformationFunction("jsonata", None, "$number(optionValue)")) + )) + + val result = TransformerFunctionHelper.processTransformations(json, dtList) + result.status should be(StatusCode.success) + result.fieldStatus.size should be(8) + assert(result.resultJson.customExtract[String]("spatialExtent").equals("{type: ***********************************1.016530]}")) + assert(result.resultJson.customExtract[String]("assetRef").equals("9422f7***********************5863866")) + assert(result.resultJson.customExtract[String]("accountEmail").equals("fi***************e@gmail.com")) + assert(result.resultJson.customExtract[String]("accountPhone2").equals("1***56")) + assert(JSONUtil.getKey("optionalValue", JSONUtil.serialize(result.resultJson)).isMissingNode.equals(true)) + + val dtList2 = Option(List( + DatasetTransformation("tf1", "obs2.0", "accountPhone", TransformationFunction("mask", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE1'")), "accountPhone"), Some(TransformMode.Lenient)), + DatasetTransformation("tf4", "obs2.0", "asset.assetRef2", TransformationFunction("mask", None, "assetRef2"), Some(TransformMode.Lenient)), + DatasetTransformation("tf5", "obs2.0", "asset.assetRef3", TransformationFunction("mask", None, "assetRef3"), Some(TransformMode.Lenient)), + DatasetTransformation("tf6", "obs2.0", "asset.assetRef4", TransformationFunction("mask", None, "assetRef4"), Some(TransformMode.Lenient)), + DatasetTransformation("tf7", "obs2.0", "asset.assetRef5", TransformationFunction("custom", None, "join(d2.assetRef4)"), Some(TransformMode.Lenient)) + )) + val result2 = TransformerFunctionHelper.processTransformations(json, dtList2) + result2.status should be(StatusCode.partial) + result2.fieldStatus.size should be(4) + result2.resultJson.customExtract[String]("asset.assetRef2") should be("") + result2.resultJson.customExtract[String]("asset.assetRef3") should be(null) + result2.resultJson.customExtract[String]("asset.assetRef4") should be("1***24") + result.resultJson.customExtract[String]("accountPhone") should be ("123456") + + val result3 = TransformerFunctionHelper.processTransformations(json, None) + result3.status should be (StatusCode.skipped) + result3.fieldStatus.size should be(0) + } + + it should "validate the jsonata expressions" in { + + val json = parse(jsonStr) + val dtList = Option(List( + DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponents).length"), Some(TransformMode.Lenient)), + DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "$number(value)")), + DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]")) + )) + val result = TransformerFunctionHelper.processTransformations(json, dtList) + result.status should be(StatusCode.partial) + result.fieldStatus.size should be(3) + assert(result.resultJson.customExtract[String]("firmwareComponent.componentCode").equals("CC_METADATA_DEVICE_FIRMWARE_VER")) + assert(result.resultJson.customExtract[Int]("valueAsInt").equals(100)) + } + + it should "handle the jsonata parse and eval exceptions including transformation modes" in { + + val json = parse(jsonStr) + 
val dtList = Option(List( + DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponent).length")), + DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "number(value)")), + DatasetTransformation("tf1", "obs2.0", "valueAsInt2", TransformationFunction("jsonata", None, null), Some(TransformMode.Lenient)), + DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]")) + )) + val result = TransformerFunctionHelper.processTransformations(json, dtList) + result.status should be(StatusCode.failed) + result.fieldStatus.size should be(4) + result.fieldStatus.count(f => f.error.isDefined && f.error.get.equals(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION)) should be(1) + result.fieldStatus.count(f => f.error.isDefined && f.error.get.equals(ErrorConstants.ERR_EVAL_EXPR_FUNCTION)) should be(1) + result.fieldStatus.count(f => f.error.isDefined && f.error.get.equals(ErrorConstants.INVALID_EXPR_FUNCTION)) should be(1) + result.fieldStatus.foreach { status: TransformFieldStatus => { + status.fieldKey match { + case "codeComponentsList" => + status.expr should be("$keys(codeComponent).length") + status.success should be(false) + status.mode should be(TransformMode.Strict) + status.error.get should be(ErrorConstants.ERR_EVAL_EXPR_FUNCTION) + case "valueAsInt" => + status.expr should be("number(value)") + status.success should be(false) + status.mode should be(TransformMode.Strict) + status.error.get should be(ErrorConstants.INVALID_EXPR_FUNCTION) + case "firmwareComponent" => + status.expr should be("codeComponents[0]") + status.success should be(true) + status.mode should be(TransformMode.Strict) + status.error should be(None) + case "valueAsInt2" => + status.expr should be(null) + status.success should be(false) + status.mode should be(TransformMode.Lenient) + status.error.get should be(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION) + } + } + } + } + + it should "encrypt the fields in the event" in { + val json = parse(jsonStr) + val dtList = Option(List( + DatasetTransformation("tf1", "obs2.0", "accountEmail", TransformationFunction("encrypt", None, "accountEmail")), + DatasetTransformation("tf2", "obs2.0", "accountPhone", TransformationFunction("encrypt", None, "accountPhone")), + DatasetTransformation("tf3", "obs2.0", "assetRef", TransformationFunction("encrypt", None, "assetRef")), + DatasetTransformation("tf4", "obs2.0", "asset.assetRef2", TransformationFunction("encrypt", None, "assetRef2")), + DatasetTransformation("tf5", "obs2.0", "asset.assetRef3", TransformationFunction("encrypt", None, "assetRef3")), + DatasetTransformation("tf6", "obs2.0", "asset.assetRef4", TransformationFunction("encrypt", None, "assetRef4")) + )) + val result = TransformerFunctionHelper.processTransformations(json, dtList) + val jsonData = compact(render(result.resultJson)) + result.status should be(StatusCode.failed) + result.fieldStatus.size should be(6) + assert(result.resultJson.customExtract[String]("accountEmail").equals("jyx7+dUfzHgODno2jcp67/rfCvOecaLLWICRnSCNvzY=")) + assert(result.resultJson.customExtract[String]("accountPhone").equals("qqyhkaWkPR3t1k0swyQ7Ow==")) + assert(result.resultJson.customExtract[String]("assetRef").equals("e+YNIi1FebmPPI7D8k3/idlQ8XX0AIhuplwcRLbPb3nkS25gt/HyUQkWeuj6KPxf")) + 
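// CipherUtil backs the encrypt transform; the decrypt() assertions further below round-trip the
// ciphertexts back to the original accountEmail, accountPhone and assetRef values. assetRef3 is
// absent from the source event, so that Strict-mode field cannot be transformed and the overall
// status asserted above is StatusCode.failed even though fields like accountEmail and assetRef
// encrypt successfully.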
result.resultJson.customExtract[String]("asset.assetRef2") should be("") + result.resultJson.customExtract[String]("asset.assetRef4") should be("D2ySyi1WGqJsM4mbIjbtJA==") + result.resultJson.customExtract[String]("asset.assetRef3") should be(null) + + JSONUtil.getKey("asset.assetRef3", jsonData).isEmpty should be(true) + + assert(CipherUtil.decrypt(result.resultJson.customExtract[String]("accountEmail")).equals("firstname.lastname@gmail.com")) + assert(CipherUtil.decrypt(result.resultJson.customExtract[String]("accountPhone")).equals("123456")) + assert(CipherUtil.decrypt(result.resultJson.customExtract[String]("assetRef")).equals("9422f7ac-c6e9-5c72-b605-5a7655863866")) + } + + it should "validate all scenarios of condition evaluator" in { + val status1 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("custom", "testExpr")), Some(TransformMode.Strict)) + status1.expr should be("") + status1.success should be(false) + status1.mode.get should be(TransformMode.Strict) + status1.error.get should be(ErrorConstants.NO_IMPLEMENTATION_FOUND) + + val status2 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("jsonata", "number(value)")), Some(TransformMode.Strict)) + status2.expr should be("number(value)") + status2.success should be(false) + status2.mode.get should be(TransformMode.Strict) + status2.error.get should be(ErrorConstants.INVALID_EXPR_FUNCTION) + + val status3 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("jsonata", "$keys(codeComponent).length")), Some(TransformMode.Strict)) + status3.expr should be("$keys(codeComponent).length") + status3.success should be(false) + status3.mode.get should be(TransformMode.Strict) + status3.error.get should be(ErrorConstants.ERR_EVAL_EXPR_FUNCTION) + + val status4 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("jsonata", null)), Some(TransformMode.Strict)) + status4.expr should be(null) + status4.success should be(false) + status4.mode.get should be(TransformMode.Strict) + status4.error.get should be(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION) + + val status5 = ConditionEvaluator.evalCondition("d1", null, Some(Condition("jsonata", "$number(value)")), Some(TransformMode.Lenient)) + status5.expr should be("$number(value)") + status5.success should be(false) + status5.mode.get should be(TransformMode.Lenient) + status5.error.get should be(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION) + } + + it should "cover the unreachable code block in ITransformer" in { + val testTransformer = new TestTransformer() + val res1 = testTransformer.getJSON("event.key", null.asInstanceOf[JsonNode]) + compact(render(res1)) should be("""{"event":{"key":null}}""") + val res2 = testTransformer.getJSON("event.key.x", JSONUtil.getKey("obsCode", jsonStr)) + compact(render(res2)) should be("""{"event":{"key":{"x":"M_BATTERY_CHARGE"}}}""") + val res3 = testTransformer.getJSON("event.key.y", JSONUtil.getKey("testBool", jsonStr)) + compact(render(res3)) should be("""{"event":{"key":{"y":false}}}""") + + val res4 = testTransformer.transform(parse(jsonStr), jsonNode, List[DatasetTransformation]()) + res4.json should be(JNothing) + res4.fieldStatus.size should be(0) + } + +} + +class TestTransformer extends ITransformer { + override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) = { + (JNothing, TransformFieldStatus("", "", success = false, TransformMode.Lenient)) + } + +} \ No newline at end of file diff --git 
a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala new file mode 100644 index 00000000..76500f19 --- /dev/null +++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala @@ -0,0 +1,229 @@ +package org.sunbird.obsrv.transformer + +import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} +import org.apache.flink.configuration.Configuration +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.test.util.MiniClusterWithClientResource +import org.apache.kafka.common.serialization.StringDeserializer +import org.scalatest.Matchers._ +import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.model.Models.SystemEvent +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry +import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask} + +import scala.collection.mutable +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ + +class TransformerStreamTestSpec extends BaseSpecWithDatasetRegistry { + + val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() + .setConfiguration(testConfiguration()) + .setNumberSlotsPerTaskManager(1) + .setNumberTaskManagers(1) + .build) + + val transformerConfig = new TransformerConfig(config) + val redisPort: Int = transformerConfig.redisPort + val kafkaConnector = new FlinkKafkaConnector(transformerConfig) + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9093, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + + def testConfiguration(): Configuration = { + val config = new Configuration() + config.setString("metrics.reporter", "job_metrics_reporter") + config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) + config + } + + override def beforeAll(): Unit = { + super.beforeAll() + BaseMetricsReporter.gaugeMetrics.clear() + EmbeddedKafka.start()(embeddedKafkaConfig) + insertTestData() + createTestTopics() + publishMessagesToKafka() + flinkCluster.before() + } + + private def publishMessagesToKafka(): Unit = { + EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.SUCCESS_TRANSFORM) + EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.FAILED_TRANSFORM) + EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.SKIPPED_TRANSFORM) + EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.PARTIAL_TRANSFORM) + EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.FAILED_TRANSFORM_2) + } + + private def insertTestData(): Unit = { + val postgresConnect = new PostgresConnect(postgresConfig) + 
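// Seed data for the stream test: datasets d3 and d4 are registered with api_version 'v1' and entry
// topic 'ingest' (d3 carries no transformations, so its events take the skipped route). The
// dataset_transformations rows add a jsonata and a Lenient encrypt transform for d2 (which is
// presumably seeded by BaseSpecWithDatasetRegistry) and three edge-case jsonata expressions for d4:
// a null expr, "$keys(dealer).length", and "number(id)" without the leading $, so the task can
// exercise the success, partial, failed and skipped paths end to end.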
postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d4', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into dataset_transformations values('tf3', 'd2', 'tfdata.valueAsInt', '{\"type\":\"jsonata\",\"expr\":\"$number(id)\"}', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf4', 'd2', 'tfdata.encryptEmail', '{\"type\":\"encrypt\",\"expr\": \"dealer.email\"}', 'Lenient', 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf5', 'd4', 'tfdata.expr1', '{\"type\":\"jsonata\",\"expr\":null}', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf6', 'd4', 'tfdata.expr2', '{\"type\":\"jsonata\",\"expr\":\"$keys(dealer).length\"}', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf7', 'd4', 'tfdata.expr3', '{\"type\":\"jsonata\",\"expr\":\"number(id)\"}', null, 'System', 'System', now(), now());") + postgresConnect.closeConnection() + } + + override def afterAll(): Unit = { + super.afterAll() + flinkCluster.after() + EmbeddedKafka.stop() + } + + def createTestTopics(): Unit = { 
+ List( + transformerConfig.inputTopic(), transformerConfig.kafkaFailedTopic, transformerConfig.kafkaSystemTopic, transformerConfig.kafkaTransformTopic, transformerConfig.kafkaTransformFailedTopic + ).foreach(EmbeddedKafka.createCustomTopic(_)) + } + + "TransformerStreamTestSpec" should "validate the transform stream task" in { + + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(transformerConfig) + val task = new TransformerStreamTask(transformerConfig, kafkaConnector) + task.process(env) + Future { + env.execute(transformerConfig.jobName) + } + + val outputs = EmbeddedKafka.consumeNumberMessagesFrom[String](transformerConfig.kafkaTransformTopic, 3, timeout = 30.seconds) + validateOutputs(outputs) + + val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](transformerConfig.kafkaTransformFailedTopic, 2, timeout = 30.seconds) + validateFailedEvents(failedEvents) + + val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](transformerConfig.kafkaSystemTopic, 5, timeout = 30.seconds) + validateSystemEvents(systemEvents) + + val mutableMetricsMap = mutable.Map[String, Long]() + BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + Console.println("### DenormalizerStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + validateMetrics(mutableMetricsMap) + + transformerConfig.successTag().getId should be("transformed-events") + } + + private def validateOutputs(outputs: List[String]): Unit = { + outputs.size should be(3) + outputs.zipWithIndex.foreach { + case (elem, idx) => + val msg = JSONUtil.deserialize[Map[String, AnyRef]](elem) + val event = JSONUtil.serialize(msg(Constants.EVENT)) + val obsrvMeta = msg(Constants.OBSRV_META).asInstanceOf[Map[String, AnyRef]] + obsrvMeta("timespans").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[Int] should be > 0 + idx match { + case 0 => + event should be("""{"dealer":{"email":"de****1@gmail.com","maskedPhone":"98******45","locationId":"KUN1","dealerCode":"D123","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be("success") + case 1 => + event should be("""{"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be("skipped") + case 2 => + event should be("""{"tfdata":{"valueAsInt":1235},"dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1235","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be("partial") + } + } + /* + (Output 
Event,{"obsrv_meta":{"flags":{"transformer":"success"},"syncts":1701863209956,"prevProcessingTime":1701863215734,"error":{},"processingStartTime":1701863215322,"timespans":{"transformer":412}},"event":{"dealer":{"email":"de****1@gmail.com","maskedPhone":"98******45","locationId":"KUN1","dealerCode":"D123","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"dataset":"d1"},0) + (Output Event,{"obsrv_meta":{"flags":{"transformer":"skipped"},"syncts":1701863210084,"prevProcessingTime":1701863216141,"error":{},"processingStartTime":1701863215476,"timespans":{"transformer":665}},"event":{"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"dataset":"d3"},1) + (Output Event,{"obsrv_meta":{"flags":{"transformer":"partial"},"syncts":1701863210111,"prevProcessingTime":1701863216378,"error":{},"processingStartTime":1701863215477,"timespans":{"transformer":901}},"event":{"tfdata":{"valueAsInt":1235},"dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1235","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"dataset":"d2"},2) + */ + } + + private def validateFailedEvents(failedEvents: List[String]): Unit = { + failedEvents.size should be(2) + failedEvents.zipWithIndex.foreach { + case (elem, idx) => + val msg = JSONUtil.deserialize[Map[String, AnyRef]](elem) + val event = msg(Constants.EVENT).asInstanceOf[String] + val obsrvMeta = msg(Constants.OBSRV_META).asInstanceOf[Map[String, AnyRef]] + obsrvMeta("timespans").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[Int] should be > 0 + obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be (StatusCode.failed.toString) + obsrvMeta("error").asInstanceOf[Map[String, AnyRef]]("src").asInstanceOf[String] should be (Producer.transformer.toString) + obsrvMeta("error").asInstanceOf[Map[String, AnyRef]]("error_code").asInstanceOf[String] should be (ErrorConstants.ERR_TRANSFORMATION_FAILED.errorCode) + idx match { + case 0 => + event should be("{\"event\":{\"dealer\":{\"maskedPhone\":\"98******45\",\"locationId\":\"KUN1\",\"dealerCode\":\"D123\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1235\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d1\"}") + case 1 => + event should be("{\"event\":{\"tfdata\":{},\"dealer\":{\"dealerCode\":\"D123\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d4\"}") + } + } + /* + (Failed 
Event,{"event":"{\"event\":{\"dealer\":{\"maskedPhone\":\"98******45\",\"locationId\":\"KUN1\",\"dealerCode\":\"D123\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1235\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d1\"}","obsrv_meta":{"flags":{"transformer":"failed"},"syncts":1701863210058,"prevProcessingTime":1701863215948,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"transformer"},"error_code":"ERR_TRANSFORM_1023","error_msg":"Atleast one mandatory transformation has failed"},"processingStartTime":1701863215475,"timespans":{"transformer":473}},"dataset":"d1"},0) + (Failed Event,{"event":"{\"event\":{\"tfdata\":{},\"dealer\":{\"dealerCode\":\"D123\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d4\"}","obsrv_meta":{"flags":{"transformer":"failed"},"syncts":1701863210150,"prevProcessingTime":1701863216421,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"transformer"},"error_code":"ERR_TRANSFORM_1023","error_msg":"Atleast one mandatory transformation has failed"},"processingStartTime":1701863215477,"timespans":{"transformer":944}},"dataset":"d4"},1) + */ + } + + private def validateSystemEvents(systemEvents: List[String]): Unit = { + systemEvents.size should be(5) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + FunctionalError.TransformFieldMissing.equals(event.data.error.get.error_type) + }) should be(2) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + FunctionalError.TransformFailedError.equals(event.data.error.get.error_type) + }) should be(1) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + FunctionalError.TransformEvalError.equals(event.data.error.get.error_type) + }) should be(1) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + FunctionalError.TransformParseError.equals(event.data.error.get.error_type) + }) should be(1) + + systemEvents.foreach(se => { + val event = JSONUtil.deserialize[SystemEvent](se) + val error = event.data.error + if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) + event.ctx.dataset_type should be(None) + else if (error.isDefined) { + val errorCode = error.get.error_code + if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) || + errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) || + errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) { + event.ctx.dataset_type should be(None) + } + } + else + event.ctx.dataset_type should be(Some("dataset")) + }) + // TODO: Add more assertions + /* + (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d1"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformFieldMissing","error_code":"ERR_TRANSFORM_1023","error_message":"Transformation field is either missing or blank","error_level":"critical","error_count":1}},"ets":1701863215985},0) + (Sys 
Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d2"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformFieldMissing","error_code":"ERR_TRANSFORM_1023","error_message":"Transformation field is either missing or blank","error_level":"critical","error_count":1}},"ets":1701863216391},1) + (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d4"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformFailedError","error_code":"ERR_TRANSFORM_1022","error_message":"Unable to evaluate the transformation expression function","error_level":"critical","error_count":1}},"ets":1701863216431},2) + (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d4"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformEvalError","error_code":"ERR_TRANSFORM_1021","error_message":"Unable to evaluate the transformation expression function","error_level":"critical","error_count":1}},"ets":1701863216433},3) + (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d4"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformParseError","error_code":"ERR_TRANSFORM_1020","error_message":"Transformation expression function is not valid","error_level":"critical","error_count":1}},"ets":1701863216433},4) + */ + } + + private def validateMetrics(mutableMetricsMap: mutable.Map[String, Long]): Unit = { + mutableMetricsMap(s"${transformerConfig.jobName}.d1.${transformerConfig.totalEventCount}") should be(2) + mutableMetricsMap(s"${transformerConfig.jobName}.d1.${transformerConfig.transformSuccessCount}") should be(1) + mutableMetricsMap(s"${transformerConfig.jobName}.d1.${transformerConfig.transformFailedCount}") should be(1) + + mutableMetricsMap(s"${transformerConfig.jobName}.d2.${transformerConfig.totalEventCount}") should be(1) + mutableMetricsMap(s"${transformerConfig.jobName}.d2.${transformerConfig.transformPartialCount}") should be(1) + + mutableMetricsMap(s"${transformerConfig.jobName}.d3.${transformerConfig.totalEventCount}") should be(1) + mutableMetricsMap(s"${transformerConfig.jobName}.d3.${transformerConfig.transformSkippedCount}") should be(1) + + mutableMetricsMap(s"${transformerConfig.jobName}.d4.${transformerConfig.totalEventCount}") should be(1) + mutableMetricsMap(s"${transformerConfig.jobName}.d4.${transformerConfig.transformFailedCount}") should be(1) + } + +} \ No newline at end of file diff --git a/pipeline/pipeline-merged/pom.xml b/pipeline/unified-pipeline/pom.xml similarity index 87% rename from pipeline/pipeline-merged/pom.xml rename to pipeline/unified-pipeline/pom.xml index f3db71fe..33ef14b9 100644 --- a/pipeline/pipeline-merged/pom.xml +++ b/pipeline/unified-pipeline/pom.xml @@ -12,12 +12,12 @@ org.sunbird.obsrv.pipeline - pipeline-merged + unified-pipeline 1.0.0 jar - Merged Pipeline + Unified Pipeline - Entire pipeline merged into a single processing job + Entire pipeline merged into a single processing job @@ -64,39 +64,9 @@ org.sunbird.obsrv.pipeline - druid-router + dataset-router 1.0.0 - - com.github.java-json-tools - json-schema-validator - 2.2.14 - - - joda-time - joda-time - - - com.fasterxml.jackson.core - jackson-databind - - - com.google.guava - guava - - - - - 
com.google.guava - guava - 32.1.2-jre - - - org.apache.kafka - kafka-clients - ${kafka.version} - test - org.apache.kafka kafka_${scala.maj.version} @@ -173,7 +143,6 @@ 2.0.3 test - @@ -220,7 +189,7 @@ - org.sunbird.obsrv.pipeline.task.MergedPipelineStreamTask + in.sanketika.obsrv.pipeline.task.UnifiedPipelineStreamTask diff --git a/pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf b/pipeline/unified-pipeline/src/main/resources/unified-pipeline.conf similarity index 89% rename from pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf rename to pipeline/unified-pipeline/src/main/resources/unified-pipeline.conf index 75f43376..9b1e1bdf 100644 --- a/pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf +++ b/pipeline/unified-pipeline/src/main/resources/unified-pipeline.conf @@ -12,8 +12,9 @@ kafka { output.denorm.topic = ${job.env}".denorm" output.denorm.failed.topic = ${job.env}".failed" output.transform.topic = ${job.env}".transform" + output.transform.failed.topic = ${job.env}".failed" stats.topic = ${job.env}".stats" - groupId = ${job.env}"-single-pipeline-group" + groupId = ${job.env}"-unified-pipeline-group" producer { max-request-size = 5242880 } diff --git a/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineConfig.scala b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineConfig.scala new file mode 100644 index 00000000..75322bc2 --- /dev/null +++ b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineConfig.scala @@ -0,0 +1,20 @@ +package org.sunbird.obsrv.pipeline.task + +import com.typesafe.config.Config +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.streaming.api.scala.OutputTag +import org.sunbird.obsrv.core.streaming.BaseJobConfig + +import scala.collection.mutable + +class UnifiedPipelineConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "UnifiedPipelineJob") { + + private val serialVersionUID = 2905979434303791379L + implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + + override def inputTopic(): String = config.getString("kafka.input.topic") + override def inputConsumer(): String = "pipeline-consumer" + override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("processing_stats") + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") +} diff --git a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala similarity index 70% rename from pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala rename to pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala index f7d8dce9..f24bb256 100644 --- a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala +++ b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala @@ -9,7 +9,7 @@ import org.sunbird.obsrv.core.util.FlinkUtil import org.sunbird.obsrv.denormalizer.task.{DenormalizerConfig, 
DenormalizerStreamTask} import org.sunbird.obsrv.extractor.task.{ExtractorConfig, ExtractorStreamTask} import org.sunbird.obsrv.preprocessor.task.{PipelinePreprocessorConfig, PipelinePreprocessorStreamTask} -import org.sunbird.obsrv.router.task.{DruidRouterConfig, DynamicRouterStreamTask} +import org.sunbird.obsrv.router.task.{DynamicRouterConfig, DynamicRouterStreamTask} import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask} import java.io.File @@ -19,26 +19,21 @@ import scala.collection.mutable * Druid Router stream task routes every event into its respective topic configured at dataset level */ -class MergedPipelineStreamTask(config: Config, mergedPipelineConfig: MergedPipelineConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { +class UnifiedPipelineStreamTask(config: Config, pipelineConfig: UnifiedPipelineConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { private val serialVersionUID = 146697324640926024L // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster def process(): Unit = { - implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(mergedPipelineConfig) + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(pipelineConfig) process(env) - env.execute(mergedPipelineConfig.jobName) + env.execute(pipelineConfig.jobName) } // $COVERAGE-ON$ - /** - * Created an overloaded process function to enable unit testing - * @param env StreamExecutionEnvironment - */ def process(env: StreamExecutionEnvironment): Unit = { - - val dataStream = getMapDataStream(env, mergedPipelineConfig, kafkaConnector) + val dataStream = getMapDataStream(env, pipelineConfig, kafkaConnector) processStream(dataStream) } @@ -48,7 +43,7 @@ class MergedPipelineStreamTask(config: Config, mergedPipelineConfig: MergedPipel val preprocessorTask = new PipelinePreprocessorStreamTask(new PipelinePreprocessorConfig(config), kafkaConnector) val denormalizerTask = new DenormalizerStreamTask(new DenormalizerConfig(config), kafkaConnector) val transformerTask = new TransformerStreamTask(new TransformerConfig(config), kafkaConnector) - val routerTask = new DynamicRouterStreamTask(new DruidRouterConfig(config), kafkaConnector) + val routerTask = new DynamicRouterStreamTask(new DynamicRouterConfig(config), kafkaConnector) routerTask.processStream( transformerTask.processStream( @@ -63,18 +58,17 @@ class MergedPipelineStreamTask(config: Config, mergedPipelineConfig: MergedPipel } // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster -object MergedPipelineStreamTask { +object UnifiedPipelineStreamTask { def main(args: Array[String]): Unit = { val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) val config = configFilePath.map { path => ConfigFactory.parseFile(new File(path)).resolve() - }.getOrElse(ConfigFactory.load("merged-pipeline.conf").withFallback(ConfigFactory.systemEnvironment())) - val mergedPipelineConfig = new MergedPipelineConfig(config) - val kafkaUtil = new FlinkKafkaConnector(mergedPipelineConfig) - val task = new MergedPipelineStreamTask(config, mergedPipelineConfig, kafkaUtil) + }.getOrElse(ConfigFactory.load("unified-pipeline.conf").withFallback(ConfigFactory.systemEnvironment())) + val pipelineConfig = new UnifiedPipelineConfig(config) + val kafkaUtil = new FlinkKafkaConnector(pipelineConfig) + val task = new 
UnifiedPipelineStreamTask(config, pipelineConfig, kafkaUtil) task.process() } } - // $COVERAGE-ON$ \ No newline at end of file diff --git a/pipeline/unified-pipeline/src/test/resources/base-config.conf b/pipeline/unified-pipeline/src/test/resources/base-config.conf new file mode 100644 index 00000000..3ade36f7 --- /dev/null +++ b/pipeline/unified-pipeline/src/test/resources/base-config.conf @@ -0,0 +1,8 @@ +postgres { + host = localhost + port = 5432 + maxConnections = 2 + user = "postgres" + password = "postgres" + database="postgres" +} \ No newline at end of file diff --git a/pipeline/pipeline-merged/src/test/resources/test.conf b/pipeline/unified-pipeline/src/test/resources/test.conf similarity index 93% rename from pipeline/pipeline-merged/src/test/resources/test.conf rename to pipeline/unified-pipeline/src/test/resources/test.conf index d2b959c3..aa514d54 100644 --- a/pipeline/pipeline-merged/src/test/resources/test.conf +++ b/pipeline/unified-pipeline/src/test/resources/test.conf @@ -16,6 +16,7 @@ kafka { output.denorm.topic = ${job.env}".denorm" output.denorm.failed.topic = ${job.env}".failed" output.transform.topic = ${job.env}".transform" + output.transform.failed.topic = ${job.env}".transform.failed" stats.topic = ${job.env}".stats" groupId = ${job.env}"-single-pipeline-group" producer { @@ -38,4 +39,4 @@ redis { preprocessor.duplication.store.id = 2 key.expiry.seconds = 3600 } -} +} \ No newline at end of file diff --git a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/EventFixture.scala similarity index 98% rename from pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala rename to pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/EventFixture.scala index dee90323..a5e623b6 100644 --- a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala +++ b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/EventFixture.scala @@ -1,4 +1,4 @@ -package org.sunbird.obsrv.fixture +package org.sunbird.obsrv.pipeline object EventFixture { @@ -11,6 +11,4 @@ object EventFixture { val VALID_BATCH_EVENT_D2 = """{"dataset":"d2","id":"event4","event":{"id":"4567","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" val INVALID_BATCH_EVENT_D2 = """{"dataset":"d2","id":"event5","event1":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" - - -} +} \ No newline at end of file diff --git a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala similarity index 81% rename from pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala rename to pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala index f3cf86b2..879abeec 100644 --- a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala +++ 
b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala @@ -11,18 +11,15 @@ import org.sunbird.obsrv.BaseMetricsReporter import org.sunbird.obsrv.core.cache.RedisConnect import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil} -import org.sunbird.obsrv.extractor.task.ExtractorConfig -import org.sunbird.obsrv.fixture.EventFixture -import org.sunbird.obsrv.pipeline.task.{MergedPipelineConfig, MergedPipelineStreamTask} +import org.sunbird.obsrv.pipeline.task.{UnifiedPipelineConfig, UnifiedPipelineStreamTask} import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry -import org.sunbird.obsrv.transformer.task.TransformerConfig import scala.collection.mutable import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.Future import scala.concurrent.duration._ -class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { +class UnifiedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() .setConfiguration(testConfiguration()) @@ -30,8 +27,8 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { .setNumberTaskManagers(1) .build) - val mergedPipelineConfig = new MergedPipelineConfig(config) - val kafkaConnector = new FlinkKafkaConnector(mergedPipelineConfig) + val unifiedPipelineConfig = new UnifiedPipelineConfig(config) + val kafkaConnector = new FlinkKafkaConnector(unifiedPipelineConfig) val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = EmbeddedKafkaConfig( @@ -65,7 +62,7 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { } override def afterAll(): Unit = { - val redisConnection = new RedisConnect(mergedPipelineConfig.redisHost, mergedPipelineConfig.redisPort, mergedPipelineConfig.redisConnectionTimeout) + val redisConnection = new RedisConnect(unifiedPipelineConfig.redisHost, unifiedPipelineConfig.redisPort, unifiedPipelineConfig.redisConnectionTimeout) redisConnection.getConnection(config.getInt("redis.database.extractor.duplication.store.id")).flushAll() redisConnection.getConnection(config.getInt("redis.database.preprocessor.duplication.store.id")).flushAll() super.afterAll() @@ -83,20 +80,20 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { ).foreach(EmbeddedKafka.createCustomTopic(_)) } - "MergedPipelineStreamTaskTestSpec" should "validate the entire pipeline" in { + "UnifiedPipelineStreamTaskTestSpec" should "validate the entire pipeline" in { - implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(mergedPipelineConfig) - val task = new MergedPipelineStreamTask(config, mergedPipelineConfig, kafkaConnector) + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(unifiedPipelineConfig) + val task = new UnifiedPipelineStreamTask(config, unifiedPipelineConfig, kafkaConnector) task.process(env) Future { - env.execute(mergedPipelineConfig.jobName) + env.execute(unifiedPipelineConfig.jobName) } try { val d1Events = EmbeddedKafka.consumeNumberMessagesFrom[String]("d1-events", 1, timeout = 30.seconds) - d1Events.size should be (1) + d1Events.size should be(1) val d2Events = EmbeddedKafka.consumeNumberMessagesFrom[String]("d2-events", 1, timeout = 
30.seconds) - d2Events.size should be (1) + d2Events.size should be(1) } catch { case ex: Exception => ex.printStackTrace() } @@ -109,7 +106,7 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { val mutableMetricsMap = mutable.Map[String, Long](); BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) - Console.println("### MergedPipelineStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + Console.println("### UnifiedPipelineStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) mutableMetricsMap("ExtractorJob.d1.extractor-total-count") should be(4) mutableMetricsMap("ExtractorJob.d1.extractor-duplicate-count") should be(1) @@ -144,16 +141,8 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { mutableMetricsMap("DruidRouterJob.d2.router-total-count") should be(1) mutableMetricsMap("DruidRouterJob.d2.router-success-count") should be(1) - val extractorConfig = new ExtractorConfig(config) - extractorConfig.inputTopic() should be (config.getString("kafka.input.topic")) - extractorConfig.inputConsumer() should be ("extractor-consumer") - - val transformerConfig = new TransformerConfig(config) - transformerConfig.inputTopic() should be(config.getString("kafka.input.topic")) - transformerConfig.inputConsumer() should be("transformer-consumer") - - mergedPipelineConfig.successTag().getId should be ("processing_stats") - mergedPipelineConfig.failedEventsOutputTag().getId should be ("failed-events") + unifiedPipelineConfig.successTag().getId should be("processing_stats") + unifiedPipelineConfig.failedEventsOutputTag().getId should be("failed-events") } } diff --git a/pom.xml b/pom.xml index c8f53bd8..4ecdc676 100644 --- a/pom.xml +++ b/pom.xml @@ -18,6 +18,7 @@ framework dataset-registry + transformation-sdk pipeline data-products diff --git a/transformation-sdk/pom.xml b/transformation-sdk/pom.xml new file mode 100644 index 00000000..10d393ce --- /dev/null +++ b/transformation-sdk/pom.xml @@ -0,0 +1,180 @@ + + + 4.0.0 + transformation-sdk + org.sunbird.obsrv + 1.0.0 + jar + Obsrv Transformation Library as a SDK + + UTF-8 + UTF-8 + 2.12 + 2.12.11 + 1.15.2 + 2.8.1 + 11 + 1.9.13 + 1.4.0 + 2.14.1 + + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + + + org.json4s + json4s-native_${scala.maj.version} + 4.0.6 + + + com.ibm.jsonata4java + JSONata4Java + 2.2.6 + + + com.fasterxml.jackson.core + jackson-databind + + + + + com.github.bancolombia + data-mask-core + 1.0.1 + + + com.fasterxml.jackson.core + jackson-databind + + + + + org.scalatest + scalatest_2.12 + 3.0.6 + test + + + org.mockito + mockito-core + 3.3.3 + test + + + org.sunbird.obsrv + framework + 1.0.0 + test-jar + test + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + + + + + src/main/scala + src/test/scala + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 11 + + + + maven-surefire-plugin + 2.20 + + true + + + + + org.scalatest + scalatest-maven-plugin + 1.0 + + ${project.build.directory}/surefire-reports + . 
+ dp-core-testsuite.txt + + + + test + + test + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.2.0 + + + + test-jar + + + + + + + org.scoverage + scoverage-maven-plugin + ${scoverage.plugin.version} + + ${scala.version} + true + true + + + + + net.alchim31.maven + scala-maven-plugin + 4.4.0 + + ${java.target.runtime} + ${java.target.runtime} + ${scala.version} + false + + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + + + + diff --git a/transformation-sdk/src/main/resources/transformation-sdk.conf b/transformation-sdk/src/main/resources/transformation-sdk.conf new file mode 100644 index 00000000..e69de29b diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala new file mode 100644 index 00000000..125872d0 --- /dev/null +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala @@ -0,0 +1,46 @@ +package org.sunbird.obsrv.transformer.types + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.MissingNode +import org.json4s.{DefaultFormats, Formats, JValue, MappingException} +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation +import org.sunbird.obsrv.transformer.util.CipherUtil + +class EncryptTransformer extends ITransformer { + + implicit val jsonFormats: Formats = DefaultFormats.withLong + private val logger = LoggerFactory.getLogger(classOf[EncryptTransformer]) + + implicit class JsonHelper(json: JValue) { + def customExtract[T](path: String)(implicit mf: Manifest[T]): T = { + path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T] + } + } + + override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) = { + val emptyNode = getJSON(dt.fieldKey, MissingNode.getInstance()) + try { + val currentValue = json.customExtract[String](dt.transformationFunction.expr) + val encryptedValue = CipherUtil.encrypt(currentValue) + (getJSON(dt.fieldKey, encryptedValue), TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = true, dt.mode.get)) + } catch { + case ex: MappingException => + logger.error(s"Transformer(Encrypt) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex.getMessage}", ex) + (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.TRANSFORMATION_FIELD_MISSING))) + } + } + +} + +object EncryptTransformer { + + private val encryptTransformer = new EncryptTransformer() + + def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetTransformation]): TransformationResult = { + encryptTransformer.transform(json, jsonNode, dtList) + } + +} \ No newline at end of file diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala new file mode 100644 index 00000000..7fee60ca --- /dev/null +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala @@ -0,0 +1,62 @@ +package org.sunbird.obsrv.transformer.types + 
+import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.JsonNodeType +import org.json4s.native.JsonMethods.parse +import org.json4s.{JNothing, JObject, JValue} +import org.sunbird.obsrv.core.model.ErrorConstants.Error +import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation +import org.sunbird.obsrv.model.TransformMode.TransformMode +import org.sunbird.obsrv.transformer.util.ConditionEvaluator + +import scala.collection.mutable.ListBuffer + +case class TransformFieldStatus(fieldKey: String, expr: String, success: Boolean, mode: TransformMode, error: Option[Error] = None) +case class TransformationResult(json: JValue, fieldStatus: List[TransformFieldStatus]) +abstract class ITransformer[T] { + + def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) + + def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetTransformation]): TransformationResult = { + val resultBuffer = ListBuffer[TransformFieldStatus]() + val evalList = dtList.map(dt => { + val conditionStatus = ConditionEvaluator.evalCondition(dt.datasetId, jsonNode, dt.transformationFunction.condition, dt.mode) + if (!conditionStatus.success) { + resultBuffer.append(TransformFieldStatus(dt.fieldKey, conditionStatus.expr, success = false, dt.mode.get, conditionStatus.error)) + JObject(dt.fieldKey -> JNothing) + } else { + val result = transformField(json, jsonNode, dt) + resultBuffer.append(result._2) + result._1 + } + }) + val transformedJson = evalList.reduceLeftOption((a, b) => a merge b).getOrElse(JNothing) + TransformationResult(transformedJson, resultBuffer.toList) + } + + def getJSON(key: String, value: String): JValue = { + val path = key.split('.').toList ++ List(s""""$value"""") + val outPath = path.reduceRight((a, b) => s"""{"$a":$b}""") + parse(outPath, useBigIntForLong = false) + } + + def getJSON(key: String, value: AnyRef): JValue = { + val path = key.split('.').toList ++ List(s"""$value""") + val outPath = path.reduceRight((a, b) => s"""{"$a":$b}""") + parse(outPath, useBigIntForLong = false) + } + + def getJSON(key: String, value: JsonNode): JValue = { + Option(value).map { jsonNodeValue => + jsonNodeValue.getNodeType match { + case JsonNodeType.STRING => getJSON(key, jsonNodeValue.textValue()) + case JsonNodeType.NUMBER => getJSON(key, jsonNodeValue.numberValue().asInstanceOf[AnyRef]) + case JsonNodeType.BOOLEAN => getJSON(key, jsonNodeValue.booleanValue().asInstanceOf[AnyRef]) + case JsonNodeType.ARRAY => getJSON(key, jsonNodeValue.toString.asInstanceOf[AnyRef]) + case JsonNodeType.OBJECT => getJSON(key, jsonNodeValue.toString.asInstanceOf[AnyRef]) + case _ => getJSON(key, null.asInstanceOf[AnyRef]) + } + }.getOrElse(getJSON(key, null.asInstanceOf[AnyRef])) + } + +} \ No newline at end of file diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala new file mode 100644 index 00000000..d6a55b54 --- /dev/null +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala @@ -0,0 +1,67 @@ +package org.sunbird.obsrv.transformer.types + +import com.api.jsonata4java.expressions.{EvaluateException, Expressions, ParseException} +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.MissingNode +import org.json4s.JValue +import org.slf4j.LoggerFactory +import 
org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels +import org.sunbird.obsrv.model.DatasetModels.TransformationFunction + +class JSONAtaTransformer extends ITransformer { + + private val logger = LoggerFactory.getLogger(classOf[JSONAtaTransformer]) + + override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetModels.DatasetTransformation): (JValue, TransformFieldStatus) = { + val emptyNode = getJSON(dt.fieldKey, MissingNode.getInstance()) + try { + val expr = Expressions.parse(dt.transformationFunction.expr) + val resNode = expr.evaluate(jsonNode) + (Option(resNode).map { node => getJSON(dt.fieldKey, node) }.getOrElse(emptyNode), TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = true, dt.mode.get)) + } catch { + case ex1: ParseException => + logger.error(s"Transformer(JSONATA) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex1.getMessage}", ex1) + (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.INVALID_EXPR_FUNCTION))) + case ex2: EvaluateException => + logger.error(s"Transformer(JSONATA) | Exception evaluating transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex2.getMessage}", ex2) + (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.ERR_EVAL_EXPR_FUNCTION))) + case ex3: Exception => + logger.error(s"Transformer(JSONATA) | Unknown error | Data=${JSONUtil.serialize(dt)} | error=${ex3.getMessage}", ex3) + (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION))) + } + } + + def evaluate(jsonNode: JsonNode, tf: TransformationFunction): JsonNode = { + + try { + val expr = Expressions.parse(tf.expr) + expr.evaluate(jsonNode) + } catch { + case ex1: ParseException => + logger.error(s"Transformer(JSONATA) | Exception parsing transformation expression | Data=${JSONUtil.serialize(tf)} | error=${ex1.getMessage}", ex1) + MissingNode.getInstance() + case ex2: EvaluateException => + logger.error(s"Transformer(JSONATA) | Exception evaluating transformation expression | Data=${JSONUtil.serialize(tf)} | error=${ex2.getMessage}", ex2) + MissingNode.getInstance() + case ex3: Exception => + logger.error(s"Transformer(JSONATA) | Unknown error | Data=${JSONUtil.serialize(tf)} | error=${ex3.getMessage}", ex3) + MissingNode.getInstance() + } + } +} + +object JSONAtaTransformer { + + private val jsonAtaTransformer = new JSONAtaTransformer() + + def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetModels.DatasetTransformation]): TransformationResult = { + jsonAtaTransformer.transform(json, jsonNode, dtList) + } + + def evaluate(jsonNode: JsonNode, transformation: TransformationFunction): JsonNode = { + jsonAtaTransformer.evaluate(jsonNode, transformation) + } + +} \ No newline at end of file diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala new file mode 100644 index 00000000..045e224f --- /dev/null +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala @@ -0,0 +1,63 @@ +package org.sunbird.obsrv.transformer.types + +import co.com.bancolombia.datamask.{MaskUtils => 
CustomMaskUtils} +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.MissingNode +import org.json4s.{DefaultFormats, Formats, JValue, MappingException} +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation + +import java.util.regex.Pattern + +class MaskTransformer extends ITransformer[String] { + + implicit val jsonFormats: Formats = DefaultFormats.withLong + private val logger = LoggerFactory.getLogger(classOf[EncryptTransformer]) + + implicit class JsonHelper(json: JValue) { + def customExtract[T](path: String)(implicit mf: Manifest[T]): T = { + path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T] + } + } + + private val maskRatio = 0.35 // TODO: Move it to a config + private val emailPattern = Pattern.compile("^(.+)@(\\S+)$") // TODO: Read the pattern from config + + private def mask(value: String): String = { + if (value.isEmpty) return value + if (emailPattern.matcher(value).matches()) { + CustomMaskUtils.maskAsEmail(value) + } else { + val openDigits = (value.length * maskRatio).ceil + val firstDigitCount = (openDigits / 2).floor + val lastDigitCount = openDigits - firstDigitCount + CustomMaskUtils.mask(value, firstDigitCount.intValue(), lastDigitCount.intValue()) + } + } + + override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) = { + val emptyNode = getJSON(dt.fieldKey, MissingNode.getInstance()) + try { + val currentValue = json.customExtract[String](dt.transformationFunction.expr) + val maskedValue = mask(currentValue).replaceAll("\"", "") + (getJSON(dt.fieldKey, maskedValue), TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = true, dt.mode.get)) + } catch { + case ex: MappingException => + logger.error(s"Transformer(Mask) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex.getMessage}", ex) + (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.TRANSFORMATION_FIELD_MISSING))) + } + } + +} + +object MaskTransformer { + + private val maskingTransformer = new MaskTransformer() + + def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetTransformation]): TransformationResult = { + maskingTransformer.transform(json, jsonNode, dtList) + } + +} \ No newline at end of file diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala new file mode 100644 index 00000000..58d489f6 --- /dev/null +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala @@ -0,0 +1,36 @@ +package org.sunbird.obsrv.transformer.util + +import org.sunbird.obsrv.core.model.SystemConfig + +import java.util.Base64 +import javax.crypto.Cipher +import javax.crypto.spec.SecretKeySpec + +object CipherUtil { + + private val algorithm = "AES" + + private val encryptInstance = getInstance(Cipher.ENCRYPT_MODE) + + private val decryptInstance = getInstance(Cipher.DECRYPT_MODE) + + def encrypt(value: String): String = { + if (value.isEmpty) return value + val encryptedByteValue = encryptInstance.doFinal(value.getBytes("utf-8")) + Base64.getEncoder.encodeToString(encryptedByteValue) + } + + def decrypt(value: String): String = { + val 
decryptedValue64 = Base64.getDecoder.decode(value) + val decryptedByteValue = decryptInstance.doFinal(decryptedValue64) + new String(decryptedByteValue, "utf-8") + } + + private def getInstance(mode: Int): Cipher = { + val cipher = Cipher.getInstance(algorithm) + val key = new SecretKeySpec(SystemConfig.getString("encryptionSecretKey").getBytes("utf-8"), algorithm) + cipher.init(mode, key) + cipher + } + +} \ No newline at end of file diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala new file mode 100644 index 00000000..0a892b8f --- /dev/null +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala @@ -0,0 +1,47 @@ +package org.sunbird.obsrv.transformer.util + +import com.api.jsonata4java.expressions.{EvaluateException, Expressions, ParseException} +import com.fasterxml.jackson.databind.JsonNode +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.ErrorConstants.Error +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.Condition +import org.sunbird.obsrv.model.TransformMode.TransformMode + +case class ConditionStatus(expr: String, success: Boolean, mode: Option[TransformMode] = None, error: Option[Error] = None) +object ConditionEvaluator { + + private val logger = LoggerFactory.getLogger(ConditionEvaluator.getClass) + + def evalCondition(datasetId: String, json: JsonNode, condition: Option[Condition], mode: Option[TransformMode]): ConditionStatus = { + if(condition.isDefined) { + condition.get.`type` match { + case "jsonata" => evalJSONAtaCondition(datasetId, json, condition.get, mode) + case _ => ConditionStatus("", success = false, mode, Some(ErrorConstants.NO_IMPLEMENTATION_FOUND)) + } + } else { + ConditionStatus("", success = true, mode) + } + } + + private def evalJSONAtaCondition(datasetId: String, json: JsonNode, condition: Condition, mode: Option[TransformMode]): ConditionStatus = { + try { + val expr = Expressions.parse(condition.expr) + val resultNode = expr.evaluate(json) + val result = resultNode.isBoolean && resultNode.asBoolean() + ConditionStatus(condition.expr, result, mode) + } catch { + case ex1: ParseException => + logger.error(s"Transformer(ConditionEvaluator) | Exception parsing condition expression | dataset=$datasetId | ConditionData=${JSONUtil.serialize(condition)} | error=${ex1.getMessage}", ex1) + ConditionStatus(condition.expr, success = false, mode, Some(ErrorConstants.INVALID_EXPR_FUNCTION)) + case ex2: EvaluateException => + logger.error(s"Transformer(ConditionEvaluator) | Exception evaluating condition expression | dataset=$datasetId | ConditionData=${JSONUtil.serialize(condition)} | error=${ex2.getMessage}", ex2) + ConditionStatus(condition.expr, success = false, mode, Some(ErrorConstants.ERR_EVAL_EXPR_FUNCTION)) + case ex3: Exception => + logger.error(s"Transformer(ConditionEvaluator) | Unknown error during condition evaluation | dataset=$datasetId | ConditionData=${JSONUtil.serialize(condition)} | error=${ex3.getMessage}", ex3) + ConditionStatus(condition.expr, success = false, mode, Some(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION)) + } + } + +} diff --git a/transformation-sdk/src/test/resources/test.conf b/transformation-sdk/src/test/resources/test.conf new file mode 100644 index 00000000..e69de29b
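
The renamed UnifiedPipelineStreamTask above chains the extractor, preprocessor, denormalizer, transformer and dynamic-router tasks over a single Kafka-sourced stream, and its companion object is the job's entry point (the class sits in org.sunbird.obsrv.pipeline.task, per the file path above). A minimal launch sketch follows; the config path is illustrative, and when no --config.file.path argument is passed the task falls back to unified-pipeline.conf from the classpath:

import org.sunbird.obsrv.pipeline.task.UnifiedPipelineStreamTask

object UnifiedPipelineLauncher {
  def main(args: Array[String]): Unit = {
    // ParameterTool inside UnifiedPipelineStreamTask.main picks up --config.file.path;
    // omit the argument to load unified-pipeline.conf (plus system environment) instead.
    UnifiedPipelineStreamTask.main(Array("--config.file.path", "/path/to/unified-pipeline.conf"))
  }
}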
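
ITransformer.getJSON above expands a dotted field key into nested JSON, which is how transformed fields such as tfdata.valueAsInt end up as {"tfdata":{"valueAsInt":1235}} in the output events. A small json4s sketch of the same expansion; nest below is an illustrative stand-alone helper, not part of the SDK:

import org.json4s._
import org.json4s.native.JsonMethods.{compact, render}

object NestedKeySketch {
  // Expand "a.b.c" into {"a":{"b":{"c": value}}}, mirroring what ITransformer.getJSON produces.
  def nest(key: String, value: JValue): JValue =
    key.split('.').foldRight(value)((k, acc) => JObject(k -> acc))

  def main(args: Array[String]): Unit = {
    println(compact(render(nest("tfdata.valueAsInt", JInt(1235)))))
    // {"tfdata":{"valueAsInt":1235}}
    println(compact(render(nest("event.key.x", JString("M_BATTERY_CHARGE")))))
    // {"event":{"key":{"x":"M_BATTERY_CHARGE"}}} -- the same shape the ITransformer test asserts
  }
}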
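
JSONAtaTransformer and ConditionEvaluator both build on JSONata4Java's Expressions API: parse the expression once, then evaluate it against the event's JsonNode. A direct sketch of those two calls with an illustrative payload; parse and evaluation failures are what the SDK maps to INVALID_EXPR_FUNCTION and ERR_EVAL_EXPR_FUNCTION:

import com.api.jsonata4java.expressions.Expressions
import com.fasterxml.jackson.databind.ObjectMapper

object JsonataSketch {
  def main(args: Array[String]): Unit = {
    val node = new ObjectMapper().readTree("""{"id":"1235","dealer":{"dealerCode":"D123"}}""")
    // Same calls the SDK makes internally: parse once, evaluate against the JsonNode.
    val expr = Expressions.parse("$number(id)")
    println(expr.evaluate(node)) // 1235 -- the value the tfdata.valueAsInt transformation produces
    // An expression such as "number(id)" (missing the '$') fails at parse/evaluate time and is
    // surfaced by the SDK as INVALID_EXPR_FUNCTION or ERR_EVAL_EXPR_FUNCTION respectively.
  }
}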
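
MaskTransformer keeps roughly 35% of a value's characters visible, split between head and tail (emails go through maskAsEmail instead). The arithmetic below reproduces that split with plain string operations as an illustration; the SDK itself delegates the actual masking to data-mask-core's MaskUtils.mask(value, first, last):

object MaskRatioSketch {
  private val maskRatio = 0.35 // same ratio as MaskTransformer above

  // Illustrative re-implementation of the head/tail split.
  def mask(value: String): String = {
    val openDigits = (value.length * maskRatio).ceil
    val first = (openDigits / 2).floor.toInt
    val last = (openDigits - first).toInt
    value.take(first) + "*" * ((value.length - first - last) max 0) + value.takeRight(last)
  }

  def main(args: Array[String]): Unit = {
    println(mask("9849012345")) // 98******45 -- matches the dealer.maskedPhone fixture above
  }
}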
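
CipherUtil wraps javax.crypto AES with a key read from SystemConfig's encryptionSecretKey; the transformer tests above only assert that decrypt(encrypt(x)) == x. A self-contained round-trip sketch with a hard-coded 32-byte key standing in for the configured secret (illustrative only; AES keys must be 16, 24 or 32 bytes):

import java.util.Base64
import javax.crypto.Cipher
import javax.crypto.spec.SecretKeySpec

object CipherRoundTripSketch {
  private val algorithm = "AES"
  // Stand-in for SystemConfig.getString("encryptionSecretKey").
  private val key = new SecretKeySpec("0123456789abcdef0123456789abcdef".getBytes("utf-8"), algorithm)

  // A fresh Cipher per call; CipherUtil instead caches one instance per mode.
  private def cipher(mode: Int): Cipher = {
    val c = Cipher.getInstance(algorithm)
    c.init(mode, key)
    c
  }

  def encrypt(value: String): String =
    Base64.getEncoder.encodeToString(cipher(Cipher.ENCRYPT_MODE).doFinal(value.getBytes("utf-8")))

  def decrypt(value: String): String =
    new String(cipher(Cipher.DECRYPT_MODE).doFinal(Base64.getDecoder.decode(value)), "utf-8")

  def main(args: Array[String]): Unit = {
    val enc = encrypt("firstname.lastname@gmail.com")
    println(decrypt(enc)) // firstname.lastname@gmail.com -- round-trips as the tests expect
  }
}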
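
ConditionEvaluator treats a transformation's condition as satisfied only when the JSONata expression evaluates to boolean true; unknown condition types come back as NO_IMPLEMENTATION_FOUND, as the condition-evaluator test above exercises. A usage sketch, assuming the transformation-sdk and dataset-registry modules are on the classpath and using an illustrative event payload:

import com.fasterxml.jackson.databind.ObjectMapper
import org.sunbird.obsrv.model.DatasetModels.Condition
import org.sunbird.obsrv.model.TransformMode
import org.sunbird.obsrv.transformer.util.ConditionEvaluator

object ConditionEvaluatorSketch {
  def main(args: Array[String]): Unit = {
    val event = new ObjectMapper().readTree("""{"dealer":{"dealerCode":"D123","locationId":"KUN1"}}""")

    // Satisfied only when the JSONata expression returns boolean true.
    val ok = ConditionEvaluator.evalCondition("d1", event,
      Some(Condition("jsonata", "$exists(dealer.dealerCode)")), Some(TransformMode.Strict))
    println(ok.success) // true

    // Unsupported condition types are rejected.
    val unsupported = ConditionEvaluator.evalCondition("d1", event,
      Some(Condition("custom", "testExpr")), Some(TransformMode.Strict))
    println(unsupported.success) // false; unsupported.error carries ErrorConstants.NO_IMPLEMENTATION_FOUND
  }
}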