From 5c9d3f2bc8c60c5699860d752debf82630bc25a8 Mon Sep 17 00:00:00 2001 From: shiva-rakshith Date: Tue, 6 Jun 2023 18:50:58 +0530 Subject: [PATCH 01/37] feat: add connector config and connector stats update functions --- .../sunbird/obsrv/model/DatasetModels.scala | 10 ++++-- .../obsrv/registry/DatasetRegistry.scala | 18 ++++++++++ .../service/DatasetRegistryService.scala | 35 ++++++++++++++++--- 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index 42feb3ae..48791415 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -48,10 +48,16 @@ object DatasetModels { @JsonProperty("field_key") fieldKey: String, @JsonProperty("transformation_function") transformationFunction: TransformationFunction, @JsonProperty("status") status: String) - case class ConnectorConfig(@JsonProperty("kafkaBrokers") kafkaBrokers: String, @JsonProperty("topic") topic: String) + case class ConnectorConfig(@JsonProperty("kafkaBrokers") kafkaBrokers: String, @JsonProperty("topic") topic: String, @JsonProperty("jdbc_user") jdbcUser: String, + @JsonProperty("jdbc_password") jdbcPassword: String, @JsonProperty("jdbc_host") jdbcHost: String, @JsonProperty("jdbc_port") jdbcPort: Int, + @JsonProperty("jdbc_database") jdbcDatabase: String, @JsonProperty("jdbc_database_table") jdbcDatabaseTable: String, @JsonProperty("jdbc_batch_size") jdbcBatchSize: Int, + @JsonProperty("jdbc_batches_per_minute") jdbcBatchesPerMinute: Int, @JsonProperty("jdbc_database_type") jdbcDatabaseType: String) + + case class ConnectorStats(@JsonProperty("last_fetch_timestamp") lastFetchTimestamp: String, @JsonProperty("records") records: Long, @JsonProperty("avg_batch_read_time") avgBatchReadTime: Long, @JsonProperty("disconnections") disconnections: Int) + case class DatasetSourceConfig(@JsonProperty("id") id: String, @JsonProperty("dataset_id") datasetId: String, @JsonProperty("connector_type") connectorType: String, @JsonProperty("connector_config") connectorConfig: ConnectorConfig, - @JsonProperty("status") status: String) + @JsonProperty("connector_stats") connectorStats: ConnectorStats, @JsonProperty("status") status: String) case class DataSource(@JsonProperty("datasource") datasource: String, @JsonProperty("dataset_id") datasetId: String, @JsonProperty("ingestion_spec") ingestionSpec: String, @JsonProperty("datasource_ref") datasourceRef: String) diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala index e7265e23..e71a0915 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala @@ -3,6 +3,8 @@ package org.sunbird.obsrv.registry import org.sunbird.obsrv.model.DatasetModels.{DataSource, Dataset, DatasetSourceConfig, DatasetTransformation} import org.sunbird.obsrv.service.DatasetRegistryService +import java.sql.Timestamp + object DatasetRegistry { private val datasets: Map[String, Dataset] = DatasetRegistryService.readAllDatasets() @@ -22,6 +24,10 @@ object DatasetRegistry { datasetSourceConfig } + def getDatasetSourceConfigById(datasetId: String): DatasetSourceConfig = { + datasetSourceConfig.map(configList => 
configList.filter(_.datasetId.equalsIgnoreCase(datasetId))).get.head + } + def getDatasetTransformations(id: String): Option[List[DatasetTransformation]] = { datasetTransformations.get(id) } @@ -38,4 +44,16 @@ object DatasetRegistry { DatasetRegistryService.updateDatasourceRef(datasource, datasourceRef) } + def updateConnectorStats(datasetId: String, lastFetchTimestamp: Timestamp, records: Long): Unit = { + DatasetRegistryService.updateConnectorStats(datasetId, lastFetchTimestamp, records) + } + + def updateConnectorDisconnections(datasetId: String, disconnections: Int): Unit = { + DatasetRegistryService.updateConnectorDisconnections(datasetId, disconnections) + } + + def updateConnectorAvgBatchReadTime(datasetId: String, avgReadTime: Long): Unit = { + DatasetRegistryService.updateConnectorAvgBatchReadTime(datasetId, avgReadTime) + } + } \ No newline at end of file diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala index b7ba4ff4..05c80ddc 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala @@ -4,10 +4,10 @@ import com.typesafe.config.{Config, ConfigFactory} import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.streaming.BaseDeduplication import org.sunbird.obsrv.core.util.{JSONUtil, PostgresConnect, PostgresConnectionConfig} -import org.sunbird.obsrv.model.DatasetModels.{ConnectorConfig, DataSource, Dataset, DatasetConfig, DatasetSourceConfig, DatasetTransformation, DedupConfig, DenormConfig, ExtractionConfig, RouterConfig, TransformationFunction, ValidationConfig} +import org.sunbird.obsrv.model.DatasetModels.{ConnectorConfig, ConnectorStats, DataSource, Dataset, DatasetConfig, DatasetSourceConfig, DatasetTransformation, DedupConfig, DenormConfig, ExtractionConfig, RouterConfig, TransformationFunction, ValidationConfig} import java.io.File -import java.sql.ResultSet +import java.sql.{ResultSet, Timestamp} object DatasetRegistryService { @@ -102,20 +102,43 @@ object DatasetRegistryService { } def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Unit = { + val query = s"UPDATE datasources set datasource_ref = '$datasourceRef' where datasource='${datasource.datasource}' and dataset_id='${datasource.datasetId}'" + updateRegistry(query, "Exception while updating data source reference in Postgres") + } + + def updateConnectorStats(datasetId: String, lastFetchTimestamp: Timestamp, records: Long): Unit = { + val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(jsonb_set(connector_stats::jsonb, '{records}'," + + s" ((COALESCE(connector_stats->>'records', '0')::int + $records)::text)::jsonb, true), '{last_fetch_timestamp}', " + + s"to_jsonb('$lastFetchTimestamp'::timestamp), true) WHERE dataset_id = '$datasetId'" + updateRegistry(query, "Exception while updating connector stats in Postgres") + } + + def updateConnectorDisconnections(datasetId: String, disconnections: Int): Unit = { + val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(connector_stats::jsonb, " + + s"'{disconnections}','$disconnections') WHERE dataset_id = '$datasetId'" + updateRegistry(query, "Exception while updating connector disconnections in Postgres") + } + + def updateConnectorAvgBatchReadTime(datasetId: String, avgReadTime: Long): Unit = { + val query = s"UPDATE 
dataset_source_config SET connector_stats = jsonb_set(connector_stats::jsonb, " + + s"'{avg_batch_read_time}','$avgReadTime') WHERE dataset_id = '$datasetId'" + updateRegistry(query, "Exception while updating connector average batch read time in Postgres") + } + + def updateRegistry(query: String, errorMsg: String): Unit = { val postgresConnect = new PostgresConnect(postgresConfig) try { // TODO: Check if the udpate is successful. Else throw an Exception - postgresConnect.executeQuery(s"UPDATE datasources set datasource_ref = '$datasourceRef' where datasource='${datasource.datasource}' and dataset_id='${datasource.datasetId}'") + postgresConnect.execute(query) } catch { case ex: Exception => - logger.error("Exception while reading dataset transformations from Postgres", ex) + logger.error(errorMsg, ex) Map() } finally { postgresConnect.closeConnection() } } - private def parseDataset(rs: ResultSet): Dataset = { val datasetId = rs.getString("id") val datasetType = rs.getString("type") @@ -145,10 +168,12 @@ object DatasetRegistryService { val datasetId = rs.getString("dataset_id") val connectorType = rs.getString("connector_type") val connectorConfig = rs.getString("connector_config") + val connectorStats = rs.getString("connector_stats") val status = rs.getString("status") DatasetSourceConfig(id = id, datasetId = datasetId, connectorType = connectorType, JSONUtil.deserialize[ConnectorConfig](connectorConfig), + JSONUtil.deserialize[ConnectorStats](connectorStats), status ) } From a64f8395014e3f88981ae694dae7e01891977871 Mon Sep 17 00:00:00 2001 From: Aniket Sakinala Date: Wed, 7 Jun 2023 19:32:28 +0530 Subject: [PATCH 02/37] Issue #33 feat: add documentation for Dataset, Datasources, Data In and Query APIs --- README.md | 516 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 515 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f7f654b6..7e0fbccf 100644 --- a/README.md +++ b/README.md @@ -1 +1,515 @@ -# obsrv-core \ No newline at end of file +# Obsrv Core Service +To enable creation, configuration, ingestion and querying of data over OBSRV, following APIs are made available. The folowing concepts are used: + +## Dataset +A dataset is an entity that stores the data. There are two types of Datasets: +1. Dataset: This entity holds your main data. This entity will be reularly updated from it's source and you can run your analytics on top of it. +2. Master Dataset: A Master Dataset holds your denorm data. This entity is not updated as regularly and not indexed into the analytical store. +Both types of Datasets will have a DataSource. + +## Datasource +A datasource is an entity which holds information regarding the source of truth for your data. 
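Before diving into the individual APIs, here is an illustrative minimal request body for creating a dataset via `POST /obsrv/v1/datasets`. It is only a sketch with placeholder values and shows just the required fields from the Create schema below; the full set of supported fields is described in the schemas that follow.
```
{
  "dataset_id": "sb-telemetry",
  "type": "dataset",
  "router_config": {
    "topic": "sb-telemetry"
  },
  "published_date": "2023-06-01 00:00:00"
}
```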
+ +## Dataset APIs +The following CRUL APIs are enabled for Dataset: +### Create +**End Point**: `/obsrv/v1/datasets` +**Method**: `POST` +**Body Schema**: +``` +description: dataset_id, type, router_config, published_date are required +type: object +properties: + id: + type: string + dataset_id: + type: string + name: + type: string + type: + type: string + enum: + - dataset + - master-dataset + extraction_config: + type: object + properties: + is_batch_event: + type: boolean + extraction_key: + type: string + validation_config: + type: object + properties: + validate: + type: boolean + mode: + type: string + dedup_config: + type: object + properties: + drop_duplicates: + type: boolean + dedup_key: + type: string + dedup_period: + type: integer + data_schema: + type: object + properties: + type: + type: string + denorm_config: + type: object + properties: + redis_db_host: + type: string + redis_db_port: + type: string + denorm_fields: + type: array + items: + type: string + properties: + denorm_key: + type: string + redis_db: + type: integer + denorm_out_field: + type: string + router_config: + type: object + properties: + topic: + type: string + required: + - topic + tags: + type: array + items: + type: string + status: + type: string + enum: + - ACTIVE + - DISABLED + created_by: + type: string + updated_by: + type: string + published_date: + type: string +``` +### Read +**End Point**: `/obsrv/v1/datasets/{datasetId}` +**Method**: `GET` +**Params**: +``` +name: datasetId +in: path +required: true +schema: + type: string + format: uuid +``` +### Update +**End Point**: `/obsrv/v1/datasets` +**Method**: `PATCH` +**Body Schema**: +``` +description: dataset_id is required +type: object +properties: + id: + type: string + dataset_id: + type: string + name: + type: string + type: + type: string + enum: + - dataset + - master-dataset + extraction_config: + type: object + properties: + is_batch_event: + type: boolean + extraction_key: + type: string + validation_config: + type: object + properties: + validate: + type: boolean + mode: + type: string + dedup_config: + type: object + properties: + drop_duplicates: + type: boolean + dedup_key: + type: string + dedup_period: + type: integer + data_schema: + type: object + properties: + type: + type: string + denorm_config: + type: object + properties: + redis_db_host: + type: string + redis_db_port: + type: string + denorm_fields: + type: array + items: + type: string + properties: + denorm_key: + type: string + redis_db: + type: integer + denorm_out_field: + type: string + router_config: + type: object + properties: + topic: + type: string + required: + - topic + tags: + type: array + items: + type: string + status: + type: string + enum: + - ACTIVE + - DISABLED + created_by: + type: string + updated_by: + type: string + published_date: + type: string +``` +### List +**End Point**: `/obsrv/v1/datasets/list` +**Method**: `POST` +**Body Schema**: +``` +description: filters are required +type: object +properties: + filters: + type: object + properties: + status: + oneOf: + - type: string + - type: array + items: + type: string + enum: + - ACTIVE + - DISABLED +``` +## Data In(Ingestion) +### +**End Point**: `/obsrv/v1/data/{datasetId}` +**Method**: `POST` +**Body Schema**: +``` +description: datasetId in request params is required +type: object +properties: + data: + type: object +``` +## Data Query +### Native Query +**End Point**: `/obsrv/v1/query` +**Method**: `POST` +**Body Schema**: +``` +description: context parameter is required +type: object 
+properties: + context: + type: object + properties: + dataSource: + type: string + query: + type: object + properties: + queryType: + type: string + enum: + - scan + - groupBy + - topN + - timeBoundary + - search + - timeseries + dataSource: + type: string + dimensions: + type: array + items: + type: string + granularity: + type: string + intervals: + oneOf: + - type: string + - type: array + items: + type: string + filter: + type: object + properties: + type: + type: string + dimension: + type: string + value: + type: string + aggregations: + type: array + items: + properties: + type: + type: string + name: + type: string + fieldName: + type: string +``` +### SQL Query +**End Point**: `/obsrv/v1/sql-query` +**Method**: `POST` +**Body Schema**: +``` +description: context parameter is required +type: object +properties: + context: + type: object + properties: + dataSource: + type: string + querySql: + type: object + properties: + query: + type: string +``` +## DataSource APIs +The following CRUL APIs are enabled for Datasources: +### Create +**End Point**: `/obsrv/v1/datasources` +**Method**: `POST` +**Body Schema**: +``` +description: dataset_id, datasource parameters are required +type: object +properties: + id: + type: string + dataset_id: + type: string + ingestion_spec: + type: object + datasource: + type: string + datasource_ref: + type: string + retention_period: + type: object + archival_policy: + type: object + purge_policy: + type: object + backup_config: + type: object + status: + type: string + enum: + - ACTIVE + - DISABLED + created_by: + type: string + updated_by: + type: string + published_date: + type: string +``` +### Read +**End Point**: `/obsrv/v1/datasources/{datasourceId}` +**Method**: `GET` +**Params**: +``` +name: datasourceId +in: path +required: true +schema: + type: string + format: uuid +``` +### Update +**End Point**: `/obsrv/v1/datasources` +**Method**: `PATCH` +**Body Schema**: +``` +description: dataset_id, datasource parameters are required +type: object +properties: + id: + type: string + dataset_id: + type: string + ingestion_spec: + type: object + datasource: + type: string + datasource_ref: + type: string + retention_period: + type: object + archival_policy: + type: object + purge_policy: + type: object + backup_config: + type: object + status: + type: string + enum: + - ACTIVE + - DISABLED + created_by: + type: string + updated_by: + type: string + published_date: + type: string +``` +### List +**End Point**: `/obsrv/v1/datasources/list` +**Method**: `POST` +**Body Schema**: +``` +description: filters are required +type: object +properties: + filters: + type: object + properties: + status: + oneOf: + - type: string + - type: array + items: + type: string + enum: + - ACTIVE + - DISABLED +``` +## Dataset Config APIs +The following CRUL APIs are enabled to interact with Dataset Source Configurations: +### Create +**End Point**: `/obsrv/v1/datasets/source/config` +**Method**: `POST` +**Body Schema**: +``` +description: dataset_id, connector_type are required +type: object +properties: + id: + type: string + dataset_id: + type: string + connector_type: + type: string + connector_config: + type: object + status: + type: string + connector_stats: + type: object + created_by: + type: string + updated_by: + type: string + published_date: + type: string + +``` +### Read +**End Point**: `/obsrv/v1/datasets/source/config` +**Method**: `GET` +**Params**: +``` +name: datasetId +in: path +required: true +schema: + type: string + format: uuid +``` +### Update +**End 
Point**: `/obsrv/v1/datasets/source/config` +**Method**: `PATCH` +**Body Schema**: +``` +description: dataset_id, connector_type are required +type: object +properties: + id: + type: string + dataset_id: + type: string + connector_type: + type: string + connector_config: + type: object + status: + type: string + connector_stats: + type: object + created_by: + type: string + updated_by: + type: string + published_date: + type: string +``` +### List +**End Point**: `/obsrv/v1/datasets/source/config/list` +**Method**: `POST` +**Body Schema**: +``` +description: filters are required +type: object +properties: + filters: + type: object + properties: + status: + oneOf: + - type: string + - type: array + items: + type: string + enum: + - ACTIVE + - DISABLED +``` \ No newline at end of file From ede1567c658e8a1e97466d413f760e2b9c68b117 Mon Sep 17 00:00:00 2001 From: GayathriSrividya Date: Thu, 8 Jun 2023 13:08:29 +0530 Subject: [PATCH 03/37] feat: added descriptions for default configurations --- README.md | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f7f654b6..c608c249 100644 --- a/README.md +++ b/README.md @@ -1 +1,22 @@ -# obsrv-core \ No newline at end of file +# obsrv-core + +Default Configurations in flink job and dataset registry Settings: + +These configurations can be modified as needed to customize the behavior of the pipeline. + +## Dataset Registry + +Configuration for the Dataset Registry: + +| Configuration | Description | Default Value | +|-----------------------|------------------------------|-----------------| +| postgres.host | Hostname or IP address | localhost | +| postgres.port | Port number | 5432 | +| postgres.maxConnections | Maximum number of connections | 2 | +| postgres.user | PostgreSQL username | obsrv | +| postgres.password | PostgreSQL password | obsrv123 | +| postgres.database | Database name | obsrv-registry | + + +## Extractor job + From 18e55fed845681a494077bfd71e0649e2c4bef64 Mon Sep 17 00:00:00 2001 From: GayathriSrividya Date: Thu, 8 Jun 2023 13:10:42 +0530 Subject: [PATCH 04/37] feat: added descriptions for default configurations --- README.md | 154 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 141 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index c608c249..d5c116a4 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,150 @@ -# obsrv-core +# Obsrv core -Default Configurations in flink job and dataset registry Settings: -These configurations can be modified as needed to customize the behavior of the pipeline. +## Overview +Obsrv-core is a framework consisting of Flink jobs designed to handle data extraction and processing tasks efficiently. It provides a flexible and customizable pipeline for various data-related operations. These jobs have been designed to process, enrich, and validate data from various sources, making them highly adaptable to a wide range of datasets. The data streaming jobs are built with a generic approach that makes them robust and able to handle diverse datasets without requiring significant changes to the underlying code. +## Default Configurations in flink job and dataset registry Settings: + +Please note that these configurations can be modified as needed to customize the behavior of the pipeline. 
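As an illustration, a few of these settings could be overridden in a Typesafe Config (HOCON) style file supplied to the jobs. The snippet below is only a sketch, assuming the standard Typesafe Config override mechanism; the keys and default values are taken from the tables that follow, and the overridden values are placeholders.
```
kafka.consumer.broker-servers = "localhost:9092"
kafka.producer.broker-servers = "localhost:9092"
task.parallelism = 2
redis.host = "localhost"
redis.port = 6379
postgres.host = "localhost"
postgres.port = 5432
postgres.user = "obsrv"
postgres.password = "obsrv123"
postgres.database = "obsrv-registry"
```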
+ +### Indices + + * [Common config](#common-config) + * [Dataset Registry config](#dataset-registry) + * [Extraction Job config](#extractor-job) + * [Preprocessor Job config](#preprocessor-job) + * [Denorm Job config](#denormalizer-job) + * [Transformer Job config](#transformer-job) + * [Router Job config](#router-job) + * [Kafka Connector Job config](#kafka-connector-job) + +## Common Config + +| Configuration | Description |Data Type| Default Value | +|----------------------------------------------------|-------|----------------------------------------------------------------------------------|-------------------------------| +| kafka.consumer.broker-servers | Kafka broker servers for the consumer | string | localhost:9092 | +| kafka.producer.broker-servers | Kafka broker servers for the producer| string | localhost:9092 | +| kafka.producer.max-request-size | Maximum request size for the Kafka producer in bytes | number | 1572864 | +| kafka.producer.batch.size | Batch size for the Kafka producer in bytes | number | 98304 | +| kafka.producer.linger.ms | Linger time in milliseconds for the Kafka producer | number | 10 | +| kafka.producer.compression | Compression type for the Kafka producer | string | snappy | +| kafka.output.system.event.topic | Output Kafka topic for system events | string | local.system.events | +| job.env | Environment for the Flink job | string | local | +| job.enable.distributed.checkpointing | Flag indicating whether distributed checkpointing is enabled for the job |boolean | false | +| job.statebackend.blob.storage.account | Blob storage account for the state backend | string | blob.storage.account | +| job.statebackend.blob.storage.container | Blob storage container for the state backend | string | obsrv-container | +| job.statebackend.blob.storage.checkpointing.dir | Directory for checkpointing in the blob storage | string | flink-jobs | +| job.statebackend.base.url | Base URL for the state backend |string url | wasbs://obsrv-container@blob.storage.account/flink-jobs | +| task.checkpointing.compressed | Flag indicating whether checkpointing is compressed |boolean | true | +| task.checkpointing.interval | Interval between checkpoints in milliseconds |number | 60000 | +| task.checkpointing.pause.between.seconds | Pause between checkpoints in seconds |number | 30000 | +| task.restart-strategy.attempts | Number of restart attempts for the job|number | 3 | +| task.restart-strategy.delay | Delay between restart attempts in milliseconds |number | 30000 | +| task.parallelism | Parallelism for the Flink job tasks|number | 1 | +| task.consumer.parallelism | Parallelism for the task consumers |number | 1 | +| task.downstream.operators.parallelism | Parallelism for downstream operators |number | 1 | +| redis.host | Hostname of the Redis server| string | localhost | +| redis.port | Port number of the Redis server| number | 6379 | +| redis.connection.timeout | Connection timeout for Redis in milliseconds |number | 30000 | +| redis-meta.host | Hostname of the Redis server for metadata |string | localhost | +| redis-meta.port | Port number of the Redis server for metadata |number | 6379 | +| postgres.host | Hostname or IP address of the PostgreSQL server |string | localhost | +| postgres.port | Port number of the PostgreSQL server |number | 5432 | +| postgres.maxConnections | Maximum number of connections to the PostgreSQL server|number | 2 | +| postgres.user | PostgreSQL username | string | postgres | +| postgres.password | PostgreSQL password |string | postgres | +| 
postgres.database | Name of the PostgreSQL database |string | postgres | +| lms-cassandra.host | Hostname or IP address of the Cassandra server|string | localhost | +| lms-cassandra.port | Port number of the Cassandra server|number | 9042 | + + ## Dataset Registry -Configuration for the Dataset Registry: +| Configuration | Description |Data type| Default Value | +|-----------------------|-----------------------------|----------|-----------------| +| postgres.host | Hostname or IP address |string| localhost | +| postgres.port | Port number |number| 5432 | +| postgres.maxConnections | Maximum number of connections |number| 2 | +| postgres.user | PostgreSQL username |string | obsrv | +| postgres.password | PostgreSQL password |string| obsrv123 | +| postgres.database | Database name |string| obsrv-registry | + +## Extractor Job -| Configuration | Description | Default Value | -|-----------------------|------------------------------|-----------------| -| postgres.host | Hostname or IP address | localhost | -| postgres.port | Port number | 5432 | -| postgres.maxConnections | Maximum number of connections | 2 | -| postgres.user | PostgreSQL username | obsrv | -| postgres.password | PostgreSQL password | obsrv123 | -| postgres.database | Database name | obsrv-registry | +| Configuration | Description |Data type| Default Value | +|---------------------------|----------------------------------|---------|-----------------| +| kafka.input.topic | Input Kafka topic |string| local.ingest | +| kafka.output.raw.topic | Output Kafka topic for raw data |string| local.raw | +| kafka.output.extractor.duplicate.topic | Output Kafka topic for duplicate data in extractor |string| local.extractor.duplicate | +| kafka.output.failed.topic | Output Kafka topic for failed data |string| local.failed | +| kafka.output.batch.failed.topic | Output Kafka topic for failed extractor batches |string| local.extractor.failed | +| kafka.event.max.size | Maximum size of a Kafka event |string| "1048576" (1MB) | +| kafka.groupId | Kafka consumer group ID |string| local-extractor-group | +| kafka.producer.max-request-size | Maximum request size for Kafka producer |number| 5242880 | +| task.consumer.parallelism | Parallelism for task consumers |number| 1 | +| task.downstream.operators.parallelism | Parallelism for downstream operators |number| 1 | +| redis.database.extractor.duplication.store.id | Redis database ID for extractor duplication store |number| 1 | +| redis.database.key.expiry.seconds | Expiry time for Redis keys (in seconds) |number| 3600 | +## Preprocessor Job + +| Configuration | Description |Data type| Default Value | +|---------------------------|----------------------------------|----------|-----------------| +| kafka.input.topic | Input Kafka topic |string| local.raw | +| kafka.output.failed.topic | Output Kafka topic for failed data |string| local.failed | +| kafka.output.invalid.topic | Output Kafka topic for invalid data |string| local.invalid | +| kafka.output.unique.topic | Output Kafka topic for unique data |string| local.unique | +| kafka.output.duplicate.topic | Output Kafka topic for duplicate data |string| local.duplicate | +| kafka.groupId | Kafka consumer group ID |string| local-pipeline-preprocessor-group | +| task.consumer.parallelism | Parallelism for task consumers |number| 1 | +| task.downstream.operators.parallelism | Parallelism for downstream operators |number| 1 | +| redis.database.preprocessor.duplication.store.id | Redis database ID for preprocessor duplication store |number| 2 | +| 
redis.database.key.expiry.seconds | Expiry time for Redis keys (in seconds) |number| 3600 | + +## Denormalizer Job -## Extractor job +| Configuration | Description |Data type| Default Value | +|------------------------------------|------------------------------------------------------|----|------------------------| +| kafka.input.topic | Input Kafka topic |string| local.unique | +| kafka.output.denorm.topic | Output Kafka topic for denormalized data |string| local.denorm | +| kafka.output.denorm.failed.topic | Output Kafka topic for failed denormalization |string| local.denorm.failed | +| kafka.groupId | Kafka consumer group ID |string| local-denormalizer-group | +| task.window.time.in.seconds | Time duration for window in seconds |number| 5 | +| task.window.count | configuration specifies the number of events (elements) that will be included in each window. It determines the size of each window for processing. |number| 30 | +| task.window.shards | determines the number of parallel shards (instances) used for processing windows. It enables parallel processing of windows for improved scalability and performance. |number| 1400 | +| task.consumer.parallelism | Parallelism for task consumers |number| 1 | +| task.downstream.operators.parallelism | Parallelism for downstream operators |number| 1 | + +## Transformer Job + +| Configuration | Description |Data type| Default Value | +|------------------------------|--------------------------------------------|----|----------------------------| +| kafka.input.topic | Input Kafka topic |string| local.denorm | +| kafka.output.transform.topic | Output Kafka topic for transformed data |string| local.transform | +| kafka.groupId | Kafka consumer group ID |string| local-transformer-group | +| kafka.producer.max-request-size | Maximum request size for Kafka producer |number| 5242880 | +| task.consumer.parallelism | Parallelism for task consumers |number | 1 | +| task.downstream.operators.parallelism | Parallelism for downstream operators |number| 1 | + +## Router Job + +| Configuration | Description |Data type| Default Value | +|------------------------|----------------------------------------------|----|--------------------------| +| kafka.input.topic | Input Kafka topic |string| local.transform | +| kafka.stats.topic | Kafka topic for storing statistics |string| local.stats | +| kafka.groupId | Kafka consumer group ID |string| local-druid-router-group | +| task.consumer.parallelism | Parallelism for task consumers |number| 1 | +| task.downstream.operators.parallelism | Parallelism for downstream operators |number| 1 | + +## Kafka connector Job +| Configuration | Description |Data type| Default Value | +|------------------------------------|----------------------------------------------------|----|--------------------------------| +| kafka.input.topic | Input Kafka topic |string| local.test | +| kafka.output.failed.topic | Output Kafka topic for failed data |string| local.failed | +| kafka.event.max.size | Maximum size of events in bytes |number| 1048576 (1MB) | +| kafka.groupId | Kafka consumer group ID |string| local-kafkaconnector-group | +| kafka.producer.max-request-size | Maximum request size for Kafka producer in bytes |number| 5242880 (5MB) | +| task.consumer.parallelism | Parallelism for task consumers |number| 1 | +| task.downstream.operators.parallelism | Parallelism for downstream operators |number|1 | \ No newline at end of file From d3f4c9ca74f0073d55ab95ccf0327bd6ec6ea14d Mon Sep 17 00:00:00 2001 From: GayathriSrividya Date: Thu, 8 Jun 2023 
13:29:26 +0530 Subject: [PATCH 05/37] feat: modified kafka connector input topic --- README.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d5c116a4..83ca9d19 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Please note that these configurations can be modified as needed to customize the ### Indices - * [Common config](#common-config) + * [Common config](#common-configuration) * [Dataset Registry config](#dataset-registry) * [Extraction Job config](#extractor-job) * [Preprocessor Job config](#preprocessor-job) @@ -19,10 +19,10 @@ Please note that these configurations can be modified as needed to customize the * [Router Job config](#router-job) * [Kafka Connector Job config](#kafka-connector-job) -## Common Config +## Common Configuration -| Configuration | Description |Data Type| Default Value | -|----------------------------------------------------|-------|----------------------------------------------------------------------------------|-------------------------------| +| Configuration |Description |Data Type| Default Value | +|--------------------------------------------|-------|---------------------------------------------------------------------|-------------------------------| | kafka.consumer.broker-servers | Kafka broker servers for the consumer | string | localhost:9092 | | kafka.producer.broker-servers | Kafka broker servers for the producer| string | localhost:9092 | | kafka.producer.max-request-size | Maximum request size for the Kafka producer in bytes | number | 1572864 | @@ -55,8 +55,6 @@ Please note that these configurations can be modified as needed to customize the | postgres.user | PostgreSQL username | string | postgres | | postgres.password | PostgreSQL password |string | postgres | | postgres.database | Name of the PostgreSQL database |string | postgres | -| lms-cassandra.host | Hostname or IP address of the Cassandra server|string | localhost | -| lms-cassandra.port | Port number of the Cassandra server|number | 9042 | ## Dataset Registry @@ -141,7 +139,7 @@ Please note that these configurations can be modified as needed to customize the | Configuration | Description |Data type| Default Value | |------------------------------------|----------------------------------------------------|----|--------------------------------| -| kafka.input.topic | Input Kafka topic |string| local.test | +| kafka.input.topic | Input Kafka topic |string| local.input | | kafka.output.failed.topic | Output Kafka topic for failed data |string| local.failed | | kafka.event.max.size | Maximum size of events in bytes |number| 1048576 (1MB) | | kafka.groupId | Kafka consumer group ID |string| local-kafkaconnector-group | From 4c64e12a0b38a21d5c277d9e337740d0286a017f Mon Sep 17 00:00:00 2001 From: GayathriSrividya Date: Thu, 8 Jun 2023 20:07:12 +0530 Subject: [PATCH 06/37] feat: obsrv setup instructions --- INSTALLATION.md | 172 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 INSTALLATION.md diff --git a/INSTALLATION.md b/INSTALLATION.md new file mode 100644 index 00000000..ce52afd4 --- /dev/null +++ b/INSTALLATION.md @@ -0,0 +1,172 @@ +# Obsrv + +## Overview + +Obsrv comprises several pluggable tools and microservices that come together to enable observability features on any platform/solution. 
This includes the ability to capture granular events via telemetry, create measures, and observe various events/actions carried out by the system/users/devices (like IoT devices) on any platform/solution. Obsrv comes with a set of microservices, APIs, and some utility SDKs to make it easy for adopters to rapidly enable powerful data processing and aggregation infrastructure to process telemetry data, validate telemetry stream data, as well as aggregate and generate actionable insights via APIs. It also has built-in open data cataloging and publishing capability. It is built keeping extensibility in mind, so that adopters have the flexibility to adapt the telemetry and tools to their specific use-cases. + +## Keywords + +- Dataset: +In event-driven applications, a dataset is a structured collection of raw data representing specific events. Each event has attributes like timestamp, type, and metadata. Datasets are vital for collecting, transforming, and analyzing data in real-time for various purposes. +- Master Dataset: +A master dataset is a consolidated collection of relevant data from various sources, serving as a unified reference for analysis, decision-making, and reporting. It combines and integrates data from multiple datasets to provide a complete and consistent view. The master dataset is denormalized for improved performance and simplified data access. +- Datasource: +A datasource refers to a specific subset or portion of a dataset that is selected or derived for further processing, analysis, or presentation. It represents a specific source or view of data within the larger dataset. Datasources are created by extracting and manipulating data from the original dataset based on specific criteria, such as filtering, aggregating, or transforming the data. Datasources allow for focused analysis and interpretation of the data within a specific context or for a particular purpose. + +## How to setup the obsrv? + +The Obsrv Automation repository provides a set of tools and scripts for setting up and configuring Obsrv. Clone the obsrv automation repository from [here](https://github.com/Sunbird-Obsrv/obsrv-automation). + +### **Key Words:** + +- Terraform: Terraform is an open-source infrastructure provisioning tool that allows for declarative configuration and automation of cloud infrastructure resources. +- S3 Cloud Storage: Amazon S3 (Simple Storage Service) is a scalable and secure cloud storage service offered by AWS, allowing users to store and retrieve data in the form of objects within buckets. + +### Prerequisites: + +- Install terragrunt. Please see [**Install Terragrunt**](https://terragrunt.gruntwork.io/docs/getting-started/install/) for reference. + +**(for aws)** + +- You will need key-secret pair to access AWS. Learn how to create or manage these at [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html). Please export these variables in terminal session + + `export AWS_ACCESS_KEY_ID=mykey` + + `export AWS_SECRET_ACCESS_KEY=mysecret` + +- You will require an S3 bucket to store tf-state. Learn how to create or manage these at [Create an Amazon S3 bucket](https://docs.aws.amazon.com/transfer/latest/userguide/requirements-S3.html). Please export this variable at + + `export AWS_TERRAFORM_BACKEND_BUCKET_NAME=mybucket` + + `export AWS_TERRAFORM_BACKEND_BUCKET_REGION=myregion` + + +### Steps: + +* In order to complete the installation, please run the below steps in the same terminal. 
+ + `cd terraform/aws` + + `terragrunt init` + + `terragrunt plan` + + `terragrunt apply` + +Please refer to the repository's README file for specific instructions on configuring OBSRV on AWS and other cloud providers like GCP and Azure. + +## How to create a dataset? + +- Assuming that the Obsrv API service is running on localhost:3000 within a cluster, to access the API, you would need to perform a port forwarding operation. This can be achieved using the command: `kubectl port-forward 3000:3000` . Once done, you can access the Obsrv API service on your local machine at **`localhost:3000`**. +- **Dataset Configurations** + - **`extraction_config`**: defines how the data is extracted from the source. `is_batch_event` determines whether the extraction is done in batches or not. The `extraction_key` specifies the key used for extraction. + - **`validation_config`**: defines the validation rules applied to the dataset. It includes parameters like whether validation is enabled (**`validate`**) and the validation mode (**`mode`**). + - **`dedup_config`**: handles duplicate records in the dataset. It includes parameters like whether to drop duplicates (**`drop_duplicates`**), the key used for deduplication (**`dedup_key`**), and the deduplication period (**`dedup_period`**) in seconds. + - **`data_schema`**: Json schema of the data in the dataset. + - **`denorm_config`**: By denormalizing the user information, the telemetry dataset can become more self-contained and easier to analyze. It eliminates the need for additional queries or joins to retrieve user information when analyzing telemetry data. It has redis config and denorm_fields + - **`router_config`**: It includes (**`topic`**) to which the dataset is published. + +- **Create a master dataset** + + **End Point**:`/obsrv/v1/datasets` + + **Method**:`POST` + + **Request Body:** + + ```json + {"id":"sb-telemetry-user","dataset_id":"sb-telemetry-user","type":"master-dataset","name":"sb-telemetry-user","validation_config":{"validate":true,"mode":"Strict"},"extraction_config":{"is_batch_event":false,"extraction_key":"","dedup_config":{"drop_duplicates":false,"dedup_key":"id","dedup_period":1036800}},"dedup_config":{"drop_duplicates":true,"dedup_key":"id","dedup_period":1036800},"data_schema":{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"subject":{"type":"array","items":{"type":"string"}},"channel":{"type":"string"},"language":{"type":"array","items":{"type":"string"}},"id":{"type":"string"},"firstName":{"type":"string"},"lastName":{"type":"string"},"mobile":{"type":"string"},"email":{"type":"string"},"state":{"type":"string"},"district":{"type":"string"}}},"denorm_config":{"redis_db_host":"obsrv-redis-master.redis.svc.cluster.local","redis_db_port":6379,"denorm_fields":[]},"router_config":{"topic":"user-master"},"dataset_config":{"data_key":"id","timestamp_key":"","exclude_fields":[],"entry_topic":"dev.masterdata.ingest","redis_db_host":"obsrv-redis-master.redis.svc.cluster.local","redis_db_port":6379,"index_data":false,"redis_db":3},"status":"ACTIVE","created_by":"SYSTEM","updated_by":"SYSTEM","published_date":"2023-05-19 05:46:01.854692","tags":[],"data_version":null} + ``` + +- **Create a dataset with denormalized configurations** + + **End Point**:`/obsrv/v1/datasets` + + **Method**:`POST` + + **Request Body:** + + ```json + 
{"id":"sb-telemetry","dataset_id":"sb-telemetry","type":"dataset","name":"sb-telemetry","validation_config":{"validate":true,"mode":"Strict","validation_mode":"Strict"},"extraction_config":{"is_batch_event":true,"extraction_key":"events","dedup_config":{"drop_duplicates":true,"dedup_key":"id","dedup_period":1036800},"batch_id":"id"},"dedup_config":{"drop_duplicates":true,"dedup_key":"mid","dedup_period":1036800},"data_schema":{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"eid":{"type":"string"},"ets":{"type":"integer","format":"date-time"},"ver":{"type":"string"},"mid":{"type":"string","oneof":[{"type":"integer"},{"type":"string"}]},"actor":{"type":"object","properties":{"id":{"type":"string"},"type":{"type":"string"}}},"context":{"type":"object","properties":{"channel":{"type":"string"},"pdata":{"type":"object","properties":{"id":{"type":"string"},"ver":{"type":"string"},"pid":{"type":"string"}}},"env":{"type":"string"},"sid":{"type":"string","format":"uuid"},"did":{"type":"string"},"rollup":{"type":"object","properties":{"l1":{"type":"string"}}},"uid":{"type":"string"},"cdata":{"type":"array","additionalProperties":true}}},"object":{"type":"object","properties":{"id":{"type":"string"},"type":{"type":"string"},"ver":{"type":"string"}}},"tags":{"type":"array","items":{"type":"string"}},"edata":{"type":"object","properties":{"type":{"type":"string"},"pageid":{"type":"string"},"subtype":{"type":"string"},"uri":{"type":"string","format":"uri"},"visits":{"type":"array","additionalProperties":true},"level":{"type":"string"},"message":{"type":"string"},"params":{"type":"array","additionalProperties":true},"size":{"type":"integer"},"query":{"type":"string"},"filters":{"type":"object","properties":{"isTenant":{"type":"boolean"},"framework":{"type":"object"},"mimeType":{"type":"object"},"resourceType":{"type":"object"},"subject":{"type":"array","additionalProperties":true},"se_boards":{"type":"array","additionalProperties":true},"se_mediums":{"type":"array","additionalProperties":true},"se_gradeLevels":{"type":"array","additionalProperties":true},"primaryCategory":{"type":"array","additionalProperties":true},"objectType":{"type":"array","additionalProperties":true},"channel":{"type":"array","additionalProperties":true},"contentType":{"type":"array","additionalProperties":true},"visibility":{"type":"array","additionalProperties":true},"batches.status":{"type":"array","items":{"type":"integer"}},"batches.enrollmentType":{"type":"string"},"status":{"type":"array","additionalProperties":true},"migratedVersion":{"type":"integer"},"identifiers":{"type":"array","additionalProperties":true}}},"sort":{"type":"object","properties":{"lastPublishedOn":{"type":"string"}}},"topn":{"type":"array","additionalProperties":true},"props":{"type":"array","additionalProperties":true},"duration":{"type":"integer"},"state":{"type":"string"},"prevstate":{"type":"string"}}},"syncts":{"type":"integer","format":"date-time"},"@timestamp":{"type":"string","format":"date-time"},"flags":{"type":"object","properties":{"ex_processed":{"type":"boolean"}}}},"required":["ets"]},"denorm_config":{"redis_db_host":"obsrv-redis-master.redis.svc.cluster.local","redis_db_port":6379,"denorm_fields":[{"denorm_key":"actor.id","redis_db":3,"denorm_out_field":"user_metadata"}]},"router_config":{"topic":"sb-telemetry"},"dataset_config":{"data_key":"id","timestamp_key":"","exclude_fields":[],"entry_topic":"dev.masterdata.ingest","redis_db_host":"obsrv-redis-master.redis.svc.cluster.local","redis_db_port":637
9,"index_data":false,"redis_db":3},"status":"ACTIVE","created_by":"SYSTEM","updated_by":"SYSTEM","created_date":"2023-05-31 12:15:42.845622","updated_date":"2023-05-31 12:15:42.845622","published_date":"2023-05-31 12:15:42.845622","tags":null,"data_version":null} + ``` + + +## How to ingest data? + +- First port forward Druid service within the cluster, use the command: **`kubectl port-forward 8888:8888`**. Access the service on your local machine at localhost:8888. +- Create ingestion spec, you can refer to the [**official documentation**](https://druid.apache.org/docs/latest/development/extensions-core/kafka-ingestion.html) which provides detailed instructions and examples. +- Create a new data source by deriving it from the previously created dataset. + + **End Point**: `/obsrv/v1/datasources` + + **Method**: `POST` + + **Request Body**: + + ```json + {"id":"sb-telemetry_sb-telemetry","datasource":"sb-telemetry","dataset_id":"sb-telemetry","ingestion_spec":{"type":"kafka","spec":{"dataSchema":{"dataSource":"sb-telemetry","dimensionsSpec":{"dimensions":[{"type":"string","name":"eid"},{"type":"long","name":"ets"},{"type":"string","name":"ver"},{"type":"string","name":"mid"},{"type":"string","name":"actor_id"},{"type":"string","name":"actor_type"},{"type":"string","name":"context_channel"},{"type":"string","name":"context_pdata_id"},{"type":"string","name":"context_pdata_ver"},{"type":"string","name":"context_pdata_pid"},{"type":"string","name":"context_env"},{"type":"string","name":"context_sid"},{"type":"string","name":"context_did"},{"type":"string","name":"context_rollup_l1"},{"type":"string","name":"context_uid"},{"type":"array","name":"context_cdata"},{"type":"string","name":"object_id"},{"type":"string","name":"object_type"},{"type":"string","name":"object_ver"},{"type":"array","name":"tags"},{"type":"string","name":"edata_type"},{"type":"string","name":"edata_pageid"},{"type":"string","name":"edata_subtype"},{"type":"string","name":"edata_uri"},{"type":"array","name":"edata_visits"},{"type":"string","name":"edata_level"},{"type":"string","name":"edata_message"},{"type":"array","name":"edata_params"},{"type":"string","name":"edata_query"},{"type":"boolean","name":"edata_filters_isTenant"},{"type":"array","name":"edata_filters_subject"},{"type":"array","name":"edata_filters_se_boards"},{"type":"array","name":"edata_filters_se_mediums"},{"type":"array","name":"edata_filters_se_gradeLevels"},{"type":"array","name":"edata_filters_primaryCategory"},{"type":"array","name":"edata_filters_objectType"},{"type":"array","name":"edata_filters_channel"},{"type":"array","name":"edata_filters_contentType"},{"type":"array","name":"edata_filters_visibility"},{"type":"array","name":"edata_filters_batches_status"},{"type":"string","name":"edata_filters_batches_enrollmentType"},{"type":"array","name":"edata_filters_status"},{"type":"array","name":"edata_filters_identifiers"},{"name":"edata_filters_batches"},{"type":"string","name":"edata_sort_lastPublishedOn"},{"type":"array","name":"edata_topn"},{"type":"array","name":"edata_props"},{"type":"string","name":"edata_state"},{"type":"string","name":"edata_prevstate"},{"type":"string","name":"@timestamp"},{"type":"boolean","name":"flags_ex_processed"},{"type":"json","name":"user_metadata"}]},"timestampSpec":{"column":"syncts","format":"auto"},"metricsSpec":[{"type":"doubleSum","name":"edata_size","fieldName":"edata_size"},{"type":"doubleSum","name":"edata_filters_migratedVersion","fieldName":"edata_filters_migratedVersion"},{"type":"doubleSum","name":"edata
_duration","fieldName":"edata_duration"}],"granularitySpec":{"type":"uniform","segmentGranularity":"DAY","rollup":false}},"tuningConfig":{"type":"kafka","maxBytesInMemory":134217728,"maxRowsPerSegment":500000,"logParseExceptions":true},"ioConfig":{"type":"kafka","topic":"sb-telemetry","consumerProperties":{"bootstrap.servers":"kafka-headless.kafka.svc:9092"},"taskCount":1,"replicas":1,"taskDuration":"PT1H","useEarliestOffset":true,"completionTimeout":"PT1H","inputFormat":{"type":"json","flattenSpec":{"useFieldDiscovery":true,"fields":[{"type":"path","expr":"$.eid","name":"eid"},{"type":"path","expr":"$.ets","name":"ets"},{"type":"path","expr":"$.ver","name":"ver"},{"type":"path","expr":"$.mid","name":"mid"},{"type":"path","expr":"$.actor.id","name":"actor_id"},{"type":"path","expr":"$.actor.type","name":"actor_type"},{"type":"path","expr":"$.context.channel","name":"context_channel"},{"type":"path","expr":"$.context.pdata.id","name":"context_pdata_id"},{"type":"path","expr":"$.context.pdata.ver","name":"context_pdata_ver"},{"type":"path","expr":"$.context.pdata.pid","name":"context_pdata_pid"},{"type":"path","expr":"$.context.env","name":"context_env"},{"type":"path","expr":"$.context.sid","name":"context_sid"},{"type":"path","expr":"$.context.did","name":"context_did"},{"type":"path","expr":"$.context.rollup.l1","name":"context_rollup_l1"},{"type":"path","expr":"$.context.uid","name":"context_uid"},{"type":"path","expr":"$.context.cdata[*]","name":"context_cdata"},{"type":"path","expr":"$.object.id","name":"object_id"},{"type":"path","expr":"$.object.type","name":"object_type"},{"type":"path","expr":"$.object.ver","name":"object_ver"},{"type":"path","expr":"$.tags[*]","name":"tags"},{"type":"path","expr":"$.edata.type","name":"edata_type"},{"type":"path","expr":"$.edata.pageid","name":"edata_pageid"},{"type":"path","expr":"$.edata.subtype","name":"edata_subtype"},{"type":"path","expr":"$.edata.uri","name":"edata_uri"},{"type":"path","expr":"$.edata.visits[*]","name":"edata_visits"},{"type":"path","expr":"$.edata.level","name":"edata_level"},{"type":"path","expr":"$.edata.message","name":"edata_message"},{"type":"path","expr":"$.edata.params[*]","name":"edata_params"},{"type":"path","expr":"$.edata.query","name":"edata_query"},{"type":"path","expr":"$.edata.filters.isTenant","name":"edata_filters_isTenant"},{"type":"path","expr":"$.edata.filters.subject[*]","name":"edata_filters_subject"},{"type":"path","expr":"$.edata.filters.se_boards[*]","name":"edata_filters_se_boards"},{"type":"path","expr":"$.edata.filters.se_mediums[*]","name":"edata_filters_se_mediums"},{"type":"path","expr":"$.edata.filters.se_gradeLevels[*]","name":"edata_filters_se_gradeLevels"},{"type":"path","expr":"$.edata.filters.primaryCategory[*]","name":"edata_filters_primaryCategory"},{"type":"path","expr":"$.edata.filters.objectType[*]","name":"edata_filters_objectType"},{"type":"path","expr":"$.edata.filters.channel[*]","name":"edata_filters_channel"},{"type":"path","expr":"$.edata.filters.contentType[*]","name":"edata_filters_contentType"},{"type":"path","expr":"$.edata.filters.visibility[*]","name":"edata_filters_visibility"},{"type":"path","expr":"$.edata.filters.batches.status[*]","name":"edata_filters_batches_status"},{"type":"path","expr":"$.edata.filters.batches.enrollmentType","name":"edata_filters_batches_enrollmentType"},{"type":"path","expr":"$.edata.filters.status[*]","name":"edata_filters_status"},{"type":"path","expr":"$.edata.filters.identifiers[*]","name":"edata_filters_identifiers"},{"type":"path","expr
":"$.edata.filters.batches","name":"edata_filters_batches"},{"type":"path","expr":"$.edata.sort.lastPublishedOn","name":"edata_sort_lastPublishedOn"},{"type":"path","expr":"$.edata.topn[*]","name":"edata_topn"},{"type":"path","expr":"$.edata.props[*]","name":"edata_props"},{"type":"path","expr":"$.edata.state","name":"edata_state"},{"type":"path","expr":"$.edata.prevstate","name":"edata_prevstate"},{"type":"path","expr":"$.obsrv_meta.syncts","name":"syncts"},{"type":"path","expr":"$.@timestamp","name":"@timestamp"},{"type":"path","expr":"$.flags.ex_processed","name":"flags_ex_processed"},{"type":"path","expr":"$.user_metadata","name":"user_metadata"},{"type":"path","expr":"$.edata.size","name":"edata_size"},{"type":"path","expr":"$.edata.filters.migratedVersion","name":"edata_filters_migratedVersion"},{"type":"path","expr":"$.edata.duration","name":"edata_duration"}]}},"appendToExisting":false}}},"datasource_ref":"sb-telemetry","retention_period":{"enabled":"false"},"archival_policy":{"enabled":"false"},"purge_policy":{"enabled":"false"},"backup_config":{"enabled":"false"},"status":"ACTIVE","created_by":"SYSTEM","updated_by":"SYSTEM","published_date":"2023-05-31 12:15:42.881752"} + ``` + +- Submit ingestion to Druid + + **URL:** `localhost:8888/druid/indexer/v1/supervisor` + + **Request Body:** `` + +- Push events for the master dataset: Pushing events involves loading data into the master dataset. You can push events through obsrv API using endpoint `/obsrv/v1/data/:datasetId` + + **End Point**:`/obsrv/v1/data/sb-telemetry-user` + + **Method**:`POST` + + **Request Body**: + + ```json + {"data":{"event":{"subject":["Mathematics"],"channel":"Future Assurance Consultant","language":["English"],"id":"user-00","firstName":"Karan","lastName":"Panicker","mobile":"+91-602-8988588","email":"Karan_Panicker@obsrv.ai","state":"Gujarat","district":"Bedfordshire"}}} + ``` + +- Push events for the normal dataset: Pushing events involves loading data into the normal dataset. + + **End Point**:`/obsrv/v1/data/sb-telemetry` + + **Method**:`POST` + + **Request Body**: + + ```json + {"data":{"id":"dedup-id-1","events":[{"eid":"IMPRESSION","ets":1672657002221,"ver":"3.0","mid":124435,"actor":{"id":"user-00","type":"User"},"context":{"channel":"01268904781886259221","pdata":{"id":"staging.diksha.portal","ver":"5.1.0","pid":"sunbird-portal"},"env":"public","sid":"23850c90-8a8c-11ed-95d0-276800e1048c","did":"0c45959486f579c24854d40a225d6161","cdata":[],"rollup":{"l1":"01268904781886259221"},"uid":"anonymous"},"object":{},"tags":["01268904781886259221"],"edata":{"type":"view","pageid":"login","subtype":"pageexit","uri":"https://staging.sunbirded.org/auth/realms/sunbird/protocol/openid-connect/auth?client_id=portal&state=254efd70-6b89-4f7d-868b-5c957f54174e&redirect_uri=https%253A%252F%252Fstaging.sunbirded.org%252Fresources%253Fboard%253DState%252520(Andhra%252520Pradesh)%2526medium%253DEnglish%2526gradeLevel%253DClass%2525201%2526%2526id%253Dap_k-12_1%2526selectedTab%253Dhome%2526auth_callback%253D1&scope=openid&response_type=code&version=4","visits":[]},"syncts":1672657005814,"@timestamp":"2023-01-02T10:56:45.814Z","flags":{"ex_processed":true}}]}} + ``` + + + + +## How to query on data source? + +- You can use Obsrv API for druid native and sql queries. 
+ + **For native query:** + + **End Point**:`/obsrv/v1/query` + + **Method**:`POST` + + **Request Body**: + + ```json + {"context":{"dataSource":"sb-telemetry"},"query":{"queryType":"scan","dataSource":"sb-telemetry","intervals":"2023-03-31/2023-04-01","granularity":"DAY"}} + ``` + + **For SQL query:** + + **End Point**:`/obsrv/v1/sql-query` + + **Method**:`POST` + + **Request Body**: + + ```json + {"context":{"dataSource":"sb-telemetry"},"querySql":"YOUR QUERY STRING"} + ``` + +For more info on Obsrv API Service refer [**here**](https://github.com/Sunbird-Obsrv/obsrv-api-service/tree/main/swagger-doc) + \ No newline at end of file From 17579d690f834f99b04e9585f926d696abab4fef Mon Sep 17 00:00:00 2001 From: GayathriSrividya Date: Fri, 9 Jun 2023 12:00:30 +0530 Subject: [PATCH 07/37] feat: revisiting open source features --- INSTALLATION.md | 172 ------------------------------------------------ README.md | 12 ---- 2 files changed, 184 deletions(-) delete mode 100644 INSTALLATION.md diff --git a/INSTALLATION.md b/INSTALLATION.md deleted file mode 100644 index ce52afd4..00000000 --- a/INSTALLATION.md +++ /dev/null @@ -1,172 +0,0 @@ -# Obsrv - -## Overview - -Obsrv comprises several pluggable tools and microservices that come together to enable observability features on any platform/solution. This includes the ability to capture granular events via telemetry, create measures, and observe various events/actions carried out by the system/users/devices (like IoT devices) on any platform/solution. Obsrv comes with a set of microservices, APIs, and some utility SDKs to make it easy for adopters to rapidly enable powerful data processing and aggregation infrastructure to process telemetry data, validate telemetry stream data, as well as aggregate and generate actionable insights via APIs. It also has built-in open data cataloging and publishing capability. It is built keeping extensibility in mind, so that adopters have the flexibility to adapt the telemetry and tools to their specific use-cases. - -## Keywords - -- Dataset: -In event-driven applications, a dataset is a structured collection of raw data representing specific events. Each event has attributes like timestamp, type, and metadata. Datasets are vital for collecting, transforming, and analyzing data in real-time for various purposes. -- Master Dataset: -A master dataset is a consolidated collection of relevant data from various sources, serving as a unified reference for analysis, decision-making, and reporting. It combines and integrates data from multiple datasets to provide a complete and consistent view. The master dataset is denormalized for improved performance and simplified data access. -- Datasource: -A datasource refers to a specific subset or portion of a dataset that is selected or derived for further processing, analysis, or presentation. It represents a specific source or view of data within the larger dataset. Datasources are created by extracting and manipulating data from the original dataset based on specific criteria, such as filtering, aggregating, or transforming the data. Datasources allow for focused analysis and interpretation of the data within a specific context or for a particular purpose. - -## How to setup the obsrv? - -The Obsrv Automation repository provides a set of tools and scripts for setting up and configuring Obsrv. Clone the obsrv automation repository from [here](https://github.com/Sunbird-Obsrv/obsrv-automation). 
- -### **Key Words:** - -- Terraform: Terraform is an open-source infrastructure provisioning tool that allows for declarative configuration and automation of cloud infrastructure resources. -- S3 Cloud Storage: Amazon S3 (Simple Storage Service) is a scalable and secure cloud storage service offered by AWS, allowing users to store and retrieve data in the form of objects within buckets. - -### Prerequisites: - -- Install terragrunt. Please see [**Install Terragrunt**](https://terragrunt.gruntwork.io/docs/getting-started/install/) for reference. - -**(for aws)** - -- You will need key-secret pair to access AWS. Learn how to create or manage these at [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html). Please export these variables in terminal session - - `export AWS_ACCESS_KEY_ID=mykey` - - `export AWS_SECRET_ACCESS_KEY=mysecret` - -- You will require an S3 bucket to store tf-state. Learn how to create or manage these at [Create an Amazon S3 bucket](https://docs.aws.amazon.com/transfer/latest/userguide/requirements-S3.html). Please export this variable at - - `export AWS_TERRAFORM_BACKEND_BUCKET_NAME=mybucket` - - `export AWS_TERRAFORM_BACKEND_BUCKET_REGION=myregion` - - -### Steps: - -* In order to complete the installation, please run the below steps in the same terminal. - - `cd terraform/aws` - - `terragrunt init` - - `terragrunt plan` - - `terragrunt apply` - -Please refer to the repository's README file for specific instructions on configuring OBSRV on AWS and other cloud providers like GCP and Azure. - -## How to create a dataset? - -- Assuming that the Obsrv API service is running on localhost:3000 within a cluster, to access the API, you would need to perform a port forwarding operation. This can be achieved using the command: `kubectl port-forward 3000:3000` . Once done, you can access the Obsrv API service on your local machine at **`localhost:3000`**. -- **Dataset Configurations** - - **`extraction_config`**: defines how the data is extracted from the source. `is_batch_event` determines whether the extraction is done in batches or not. The `extraction_key` specifies the key used for extraction. - - **`validation_config`**: defines the validation rules applied to the dataset. It includes parameters like whether validation is enabled (**`validate`**) and the validation mode (**`mode`**). - - **`dedup_config`**: handles duplicate records in the dataset. It includes parameters like whether to drop duplicates (**`drop_duplicates`**), the key used for deduplication (**`dedup_key`**), and the deduplication period (**`dedup_period`**) in seconds. - - **`data_schema`**: Json schema of the data in the dataset. - - **`denorm_config`**: By denormalizing the user information, the telemetry dataset can become more self-contained and easier to analyze. It eliminates the need for additional queries or joins to retrieve user information when analyzing telemetry data. It has redis config and denorm_fields - - **`router_config`**: It includes (**`topic`**) to which the dataset is published. 
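With the API port-forwarded as described above, the create requests below can be submitted with any HTTP client. A minimal sketch with `curl`, where `dataset.json` is a placeholder file holding one of the request bodies shown below, looks like this:

```bash
# Minimal sketch: POST a dataset definition to the port-forwarded Obsrv API.
# dataset.json is a placeholder for one of the request bodies shown below.
curl -X POST "http://localhost:3000/obsrv/v1/datasets" \
  -H "Content-Type: application/json" \
  -d @dataset.json
```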
- -- **Create a master dataset** - - **End Point**:`/obsrv/v1/datasets` - - **Method**:`POST` - - **Request Body:** - - ```json - {"id":"sb-telemetry-user","dataset_id":"sb-telemetry-user","type":"master-dataset","name":"sb-telemetry-user","validation_config":{"validate":true,"mode":"Strict"},"extraction_config":{"is_batch_event":false,"extraction_key":"","dedup_config":{"drop_duplicates":false,"dedup_key":"id","dedup_period":1036800}},"dedup_config":{"drop_duplicates":true,"dedup_key":"id","dedup_period":1036800},"data_schema":{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"subject":{"type":"array","items":{"type":"string"}},"channel":{"type":"string"},"language":{"type":"array","items":{"type":"string"}},"id":{"type":"string"},"firstName":{"type":"string"},"lastName":{"type":"string"},"mobile":{"type":"string"},"email":{"type":"string"},"state":{"type":"string"},"district":{"type":"string"}}},"denorm_config":{"redis_db_host":"obsrv-redis-master.redis.svc.cluster.local","redis_db_port":6379,"denorm_fields":[]},"router_config":{"topic":"user-master"},"dataset_config":{"data_key":"id","timestamp_key":"","exclude_fields":[],"entry_topic":"dev.masterdata.ingest","redis_db_host":"obsrv-redis-master.redis.svc.cluster.local","redis_db_port":6379,"index_data":false,"redis_db":3},"status":"ACTIVE","created_by":"SYSTEM","updated_by":"SYSTEM","published_date":"2023-05-19 05:46:01.854692","tags":[],"data_version":null} - ``` - -- **Create a dataset with denormalized configurations** - - **End Point**:`/obsrv/v1/datasets` - - **Method**:`POST` - - **Request Body:** - - ```json - {"id":"sb-telemetry","dataset_id":"sb-telemetry","type":"dataset","name":"sb-telemetry","validation_config":{"validate":true,"mode":"Strict","validation_mode":"Strict"},"extraction_config":{"is_batch_event":true,"extraction_key":"events","dedup_config":{"drop_duplicates":true,"dedup_key":"id","dedup_period":1036800},"batch_id":"id"},"dedup_config":{"drop_duplicates":true,"dedup_key":"mid","dedup_period":1036800},"data_schema":{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"eid":{"type":"string"},"ets":{"type":"integer","format":"date-time"},"ver":{"type":"string"},"mid":{"type":"string","oneof":[{"type":"integer"},{"type":"string"}]},"actor":{"type":"object","properties":{"id":{"type":"string"},"type":{"type":"string"}}},"context":{"type":"object","properties":{"channel":{"type":"string"},"pdata":{"type":"object","properties":{"id":{"type":"string"},"ver":{"type":"string"},"pid":{"type":"string"}}},"env":{"type":"string"},"sid":{"type":"string","format":"uuid"},"did":{"type":"string"},"rollup":{"type":"object","properties":{"l1":{"type":"string"}}},"uid":{"type":"string"},"cdata":{"type":"array","additionalProperties":true}}},"object":{"type":"object","properties":{"id":{"type":"string"},"type":{"type":"string"},"ver":{"type":"string"}}},"tags":{"type":"array","items":{"type":"string"}},"edata":{"type":"object","properties":{"type":{"type":"string"},"pageid":{"type":"string"},"subtype":{"type":"string"},"uri":{"type":"string","format":"uri"},"visits":{"type":"array","additionalProperties":true},"level":{"type":"string"},"message":{"type":"string"},"params":{"type":"array","additionalProperties":true},"size":{"type":"integer"},"query":{"type":"string"},"filters":{"type":"object","properties":{"isTenant":{"type":"boolean"},"framework":{"type":"object"},"mimeType":{"type":"object"},"resourceType":{"type":"object"},"subject":{"type":"array","additiona
lProperties":true},"se_boards":{"type":"array","additionalProperties":true},"se_mediums":{"type":"array","additionalProperties":true},"se_gradeLevels":{"type":"array","additionalProperties":true},"primaryCategory":{"type":"array","additionalProperties":true},"objectType":{"type":"array","additionalProperties":true},"channel":{"type":"array","additionalProperties":true},"contentType":{"type":"array","additionalProperties":true},"visibility":{"type":"array","additionalProperties":true},"batches.status":{"type":"array","items":{"type":"integer"}},"batches.enrollmentType":{"type":"string"},"status":{"type":"array","additionalProperties":true},"migratedVersion":{"type":"integer"},"identifiers":{"type":"array","additionalProperties":true}}},"sort":{"type":"object","properties":{"lastPublishedOn":{"type":"string"}}},"topn":{"type":"array","additionalProperties":true},"props":{"type":"array","additionalProperties":true},"duration":{"type":"integer"},"state":{"type":"string"},"prevstate":{"type":"string"}}},"syncts":{"type":"integer","format":"date-time"},"@timestamp":{"type":"string","format":"date-time"},"flags":{"type":"object","properties":{"ex_processed":{"type":"boolean"}}}},"required":["ets"]},"denorm_config":{"redis_db_host":"obsrv-redis-master.redis.svc.cluster.local","redis_db_port":6379,"denorm_fields":[{"denorm_key":"actor.id","redis_db":3,"denorm_out_field":"user_metadata"}]},"router_config":{"topic":"sb-telemetry"},"dataset_config":{"data_key":"id","timestamp_key":"","exclude_fields":[],"entry_topic":"dev.masterdata.ingest","redis_db_host":"obsrv-redis-master.redis.svc.cluster.local","redis_db_port":6379,"index_data":false,"redis_db":3},"status":"ACTIVE","created_by":"SYSTEM","updated_by":"SYSTEM","created_date":"2023-05-31 12:15:42.845622","updated_date":"2023-05-31 12:15:42.845622","published_date":"2023-05-31 12:15:42.845622","tags":null,"data_version":null} - ``` - - -## How to ingest data? - -- First port forward Druid service within the cluster, use the command: **`kubectl port-forward 8888:8888`**. Access the service on your local machine at localhost:8888. -- Create ingestion spec, you can refer to the [**official documentation**](https://druid.apache.org/docs/latest/development/extensions-core/kafka-ingestion.html) which provides detailed instructions and examples. -- Create a new data source by deriving it from the previously created dataset. 
- - **End Point**: `/obsrv/v1/datasources` - - **Method**: `POST` - - **Request Body**: - - ```json - {"id":"sb-telemetry_sb-telemetry","datasource":"sb-telemetry","dataset_id":"sb-telemetry","ingestion_spec":{"type":"kafka","spec":{"dataSchema":{"dataSource":"sb-telemetry","dimensionsSpec":{"dimensions":[{"type":"string","name":"eid"},{"type":"long","name":"ets"},{"type":"string","name":"ver"},{"type":"string","name":"mid"},{"type":"string","name":"actor_id"},{"type":"string","name":"actor_type"},{"type":"string","name":"context_channel"},{"type":"string","name":"context_pdata_id"},{"type":"string","name":"context_pdata_ver"},{"type":"string","name":"context_pdata_pid"},{"type":"string","name":"context_env"},{"type":"string","name":"context_sid"},{"type":"string","name":"context_did"},{"type":"string","name":"context_rollup_l1"},{"type":"string","name":"context_uid"},{"type":"array","name":"context_cdata"},{"type":"string","name":"object_id"},{"type":"string","name":"object_type"},{"type":"string","name":"object_ver"},{"type":"array","name":"tags"},{"type":"string","name":"edata_type"},{"type":"string","name":"edata_pageid"},{"type":"string","name":"edata_subtype"},{"type":"string","name":"edata_uri"},{"type":"array","name":"edata_visits"},{"type":"string","name":"edata_level"},{"type":"string","name":"edata_message"},{"type":"array","name":"edata_params"},{"type":"string","name":"edata_query"},{"type":"boolean","name":"edata_filters_isTenant"},{"type":"array","name":"edata_filters_subject"},{"type":"array","name":"edata_filters_se_boards"},{"type":"array","name":"edata_filters_se_mediums"},{"type":"array","name":"edata_filters_se_gradeLevels"},{"type":"array","name":"edata_filters_primaryCategory"},{"type":"array","name":"edata_filters_objectType"},{"type":"array","name":"edata_filters_channel"},{"type":"array","name":"edata_filters_contentType"},{"type":"array","name":"edata_filters_visibility"},{"type":"array","name":"edata_filters_batches_status"},{"type":"string","name":"edata_filters_batches_enrollmentType"},{"type":"array","name":"edata_filters_status"},{"type":"array","name":"edata_filters_identifiers"},{"name":"edata_filters_batches"},{"type":"string","name":"edata_sort_lastPublishedOn"},{"type":"array","name":"edata_topn"},{"type":"array","name":"edata_props"},{"type":"string","name":"edata_state"},{"type":"string","name":"edata_prevstate"},{"type":"string","name":"@timestamp"},{"type":"boolean","name":"flags_ex_processed"},{"type":"json","name":"user_metadata"}]},"timestampSpec":{"column":"syncts","format":"auto"},"metricsSpec":[{"type":"doubleSum","name":"edata_size","fieldName":"edata_size"},{"type":"doubleSum","name":"edata_filters_migratedVersion","fieldName":"edata_filters_migratedVersion"},{"type":"doubleSum","name":"edata_duration","fieldName":"edata_duration"}],"granularitySpec":{"type":"uniform","segmentGranularity":"DAY","rollup":false}},"tuningConfig":{"type":"kafka","maxBytesInMemory":134217728,"maxRowsPerSegment":500000,"logParseExceptions":true},"ioConfig":{"type":"kafka","topic":"sb-telemetry","consumerProperties":{"bootstrap.servers":"kafka-headless.kafka.svc:9092"},"taskCount":1,"replicas":1,"taskDuration":"PT1H","useEarliestOffset":true,"completionTimeout":"PT1H","inputFormat":{"type":"json","flattenSpec":{"useFieldDiscovery":true,"fields":[{"type":"path","expr":"$.eid","name":"eid"},{"type":"path","expr":"$.ets","name":"ets"},{"type":"path","expr":"$.ver","name":"ver"},{"type":"path","expr":"$.mid","name":"mid"},{"type":"path","expr":"$.actor.id","name":"actor
_id"},{"type":"path","expr":"$.actor.type","name":"actor_type"},{"type":"path","expr":"$.context.channel","name":"context_channel"},{"type":"path","expr":"$.context.pdata.id","name":"context_pdata_id"},{"type":"path","expr":"$.context.pdata.ver","name":"context_pdata_ver"},{"type":"path","expr":"$.context.pdata.pid","name":"context_pdata_pid"},{"type":"path","expr":"$.context.env","name":"context_env"},{"type":"path","expr":"$.context.sid","name":"context_sid"},{"type":"path","expr":"$.context.did","name":"context_did"},{"type":"path","expr":"$.context.rollup.l1","name":"context_rollup_l1"},{"type":"path","expr":"$.context.uid","name":"context_uid"},{"type":"path","expr":"$.context.cdata[*]","name":"context_cdata"},{"type":"path","expr":"$.object.id","name":"object_id"},{"type":"path","expr":"$.object.type","name":"object_type"},{"type":"path","expr":"$.object.ver","name":"object_ver"},{"type":"path","expr":"$.tags[*]","name":"tags"},{"type":"path","expr":"$.edata.type","name":"edata_type"},{"type":"path","expr":"$.edata.pageid","name":"edata_pageid"},{"type":"path","expr":"$.edata.subtype","name":"edata_subtype"},{"type":"path","expr":"$.edata.uri","name":"edata_uri"},{"type":"path","expr":"$.edata.visits[*]","name":"edata_visits"},{"type":"path","expr":"$.edata.level","name":"edata_level"},{"type":"path","expr":"$.edata.message","name":"edata_message"},{"type":"path","expr":"$.edata.params[*]","name":"edata_params"},{"type":"path","expr":"$.edata.query","name":"edata_query"},{"type":"path","expr":"$.edata.filters.isTenant","name":"edata_filters_isTenant"},{"type":"path","expr":"$.edata.filters.subject[*]","name":"edata_filters_subject"},{"type":"path","expr":"$.edata.filters.se_boards[*]","name":"edata_filters_se_boards"},{"type":"path","expr":"$.edata.filters.se_mediums[*]","name":"edata_filters_se_mediums"},{"type":"path","expr":"$.edata.filters.se_gradeLevels[*]","name":"edata_filters_se_gradeLevels"},{"type":"path","expr":"$.edata.filters.primaryCategory[*]","name":"edata_filters_primaryCategory"},{"type":"path","expr":"$.edata.filters.objectType[*]","name":"edata_filters_objectType"},{"type":"path","expr":"$.edata.filters.channel[*]","name":"edata_filters_channel"},{"type":"path","expr":"$.edata.filters.contentType[*]","name":"edata_filters_contentType"},{"type":"path","expr":"$.edata.filters.visibility[*]","name":"edata_filters_visibility"},{"type":"path","expr":"$.edata.filters.batches.status[*]","name":"edata_filters_batches_status"},{"type":"path","expr":"$.edata.filters.batches.enrollmentType","name":"edata_filters_batches_enrollmentType"},{"type":"path","expr":"$.edata.filters.status[*]","name":"edata_filters_status"},{"type":"path","expr":"$.edata.filters.identifiers[*]","name":"edata_filters_identifiers"},{"type":"path","expr":"$.edata.filters.batches","name":"edata_filters_batches"},{"type":"path","expr":"$.edata.sort.lastPublishedOn","name":"edata_sort_lastPublishedOn"},{"type":"path","expr":"$.edata.topn[*]","name":"edata_topn"},{"type":"path","expr":"$.edata.props[*]","name":"edata_props"},{"type":"path","expr":"$.edata.state","name":"edata_state"},{"type":"path","expr":"$.edata.prevstate","name":"edata_prevstate"},{"type":"path","expr":"$.obsrv_meta.syncts","name":"syncts"},{"type":"path","expr":"$.@timestamp","name":"@timestamp"},{"type":"path","expr":"$.flags.ex_processed","name":"flags_ex_processed"},{"type":"path","expr":"$.user_metadata","name":"user_metadata"},{"type":"path","expr":"$.edata.size","name":"edata_size"},{"type":"path","expr":"$.edata.filters.migratedV
ersion","name":"edata_filters_migratedVersion"},{"type":"path","expr":"$.edata.duration","name":"edata_duration"}]}},"appendToExisting":false}}},"datasource_ref":"sb-telemetry","retention_period":{"enabled":"false"},"archival_policy":{"enabled":"false"},"purge_policy":{"enabled":"false"},"backup_config":{"enabled":"false"},"status":"ACTIVE","created_by":"SYSTEM","updated_by":"SYSTEM","published_date":"2023-05-31 12:15:42.881752"} - ``` - -- Submit ingestion to Druid - - **URL:** `localhost:8888/druid/indexer/v1/supervisor` - - **Request Body:** `` - -- Push events for the master dataset: Pushing events involves loading data into the master dataset. You can push events through obsrv API using endpoint `/obsrv/v1/data/:datasetId` - - **End Point**:`/obsrv/v1/data/sb-telemetry-user` - - **Method**:`POST` - - **Request Body**: - - ```json - {"data":{"event":{"subject":["Mathematics"],"channel":"Future Assurance Consultant","language":["English"],"id":"user-00","firstName":"Karan","lastName":"Panicker","mobile":"+91-602-8988588","email":"Karan_Panicker@obsrv.ai","state":"Gujarat","district":"Bedfordshire"}}} - ``` - -- Push events for the normal dataset: Pushing events involves loading data into the normal dataset. - - **End Point**:`/obsrv/v1/data/sb-telemetry` - - **Method**:`POST` - - **Request Body**: - - ```json - {"data":{"id":"dedup-id-1","events":[{"eid":"IMPRESSION","ets":1672657002221,"ver":"3.0","mid":124435,"actor":{"id":"user-00","type":"User"},"context":{"channel":"01268904781886259221","pdata":{"id":"staging.diksha.portal","ver":"5.1.0","pid":"sunbird-portal"},"env":"public","sid":"23850c90-8a8c-11ed-95d0-276800e1048c","did":"0c45959486f579c24854d40a225d6161","cdata":[],"rollup":{"l1":"01268904781886259221"},"uid":"anonymous"},"object":{},"tags":["01268904781886259221"],"edata":{"type":"view","pageid":"login","subtype":"pageexit","uri":"https://staging.sunbirded.org/auth/realms/sunbird/protocol/openid-connect/auth?client_id=portal&state=254efd70-6b89-4f7d-868b-5c957f54174e&redirect_uri=https%253A%252F%252Fstaging.sunbirded.org%252Fresources%253Fboard%253DState%252520(Andhra%252520Pradesh)%2526medium%253DEnglish%2526gradeLevel%253DClass%2525201%2526%2526id%253Dap_k-12_1%2526selectedTab%253Dhome%2526auth_callback%253D1&scope=openid&response_type=code&version=4","visits":[]},"syncts":1672657005814,"@timestamp":"2023-01-02T10:56:45.814Z","flags":{"ex_processed":true}}]}} - ``` - - - - -## How to query on data source? - -- You can use Obsrv API for druid native and sql queries. 
- - **For native query:** - - **End Point**:`/obsrv/v1/query` - - **Method**:`POST` - - **Request Body**: - - ```json - {"context":{"dataSource":"sb-telemetry"},"query":{"queryType":"scan","dataSource":"sb-telemetry","intervals":"2023-03-31/2023-04-01","granularity":"DAY"}} - ``` - - **For SQL query:** - - **End Point**:`/obsrv/v1/sql-query` - - **Method**:`POST` - - **Request Body**: - - ```json - {"context":{"dataSource":"sb-telemetry"},"querySql":"YOUR QUERY STRING"} - ``` - -For more info on Obsrv API Service refer [**here**](https://github.com/Sunbird-Obsrv/obsrv-api-service/tree/main/swagger-doc) - \ No newline at end of file diff --git a/README.md b/README.md index 83ca9d19..a6a7a496 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ Please note that these configurations can be modified as needed to customize the * [Extraction Job config](#extractor-job) * [Preprocessor Job config](#preprocessor-job) * [Denorm Job config](#denormalizer-job) - * [Transformer Job config](#transformer-job) * [Router Job config](#router-job) * [Kafka Connector Job config](#kafka-connector-job) @@ -114,17 +113,6 @@ Please note that these configurations can be modified as needed to customize the | task.consumer.parallelism | Parallelism for task consumers |number| 1 | | task.downstream.operators.parallelism | Parallelism for downstream operators |number| 1 | -## Transformer Job - -| Configuration | Description |Data type| Default Value | -|------------------------------|--------------------------------------------|----|----------------------------| -| kafka.input.topic | Input Kafka topic |string| local.denorm | -| kafka.output.transform.topic | Output Kafka topic for transformed data |string| local.transform | -| kafka.groupId | Kafka consumer group ID |string| local-transformer-group | -| kafka.producer.max-request-size | Maximum request size for Kafka producer |number| 5242880 | -| task.consumer.parallelism | Parallelism for task consumers |number | 1 | -| task.downstream.operators.parallelism | Parallelism for downstream operators |number| 1 | - ## Router Job | Configuration | Description |Data type| Default Value | From 4d734d02564c58439993aac34bbf2e2617c73d3e Mon Sep 17 00:00:00 2001 From: GayathriSrividya Date: Fri, 9 Jun 2023 15:58:52 +0530 Subject: [PATCH 08/37] feat: masterdata processor job config --- README.md | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a6a7a496..f000cc6b 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ Please note that these configurations can be modified as needed to customize the * [Denorm Job config](#denormalizer-job) * [Router Job config](#router-job) * [Kafka Connector Job config](#kafka-connector-job) + * [Masterdata Processor Job config](#masterdata-processor-job) ## Common Configuration @@ -133,4 +134,31 @@ Please note that these configurations can be modified as needed to customize the | kafka.groupId | Kafka consumer group ID |string| local-kafkaconnector-group | | kafka.producer.max-request-size | Maximum request size for Kafka producer in bytes |number| 5242880 (5MB) | | task.consumer.parallelism | Parallelism for task consumers |number| 1 | -| task.downstream.operators.parallelism | Parallelism for downstream operators |number|1 | \ No newline at end of file +| task.downstream.operators.parallelism | Parallelism for downstream operators |number|1 | + +## MasterData Processor Job + +| Configuration | Description | Data Type | Default Value | 
+|-------------------------------------------|----------------------------------------------------|-----------|--------------------------------| +| master-data-processor.kafka.input.topic | Input Kafka topic | String | local.masterdata.ingest | +| master-data-processor.kafka.output.raw.topic | Output Kafka topic for raw data | String | local.masterdata.raw | +| master-data-processor.kafka.output.extractor.duplicate.topic | Output Kafka topic for duplicate data extraction | String | local.masterdata.extractor.duplicate | +| master-data-processor.kafka.output.failed.topic | Output Kafka topic for failed data | String | local.masterdata.failed | +| master-data-processor.kafka.output.batch.failed.topic | Output Kafka topic for batch extraction failures | String | local.masterdata.extractor.failed | +| master-data-processor.kafka.event.max.size | Maximum size of events in bytes | Number | 1048576 (1MB) | +| master-data-processor.kafka.output.invalid.topic | Output Kafka topic for invalid data | String | local.masterdata.invalid | +| master-data-processor.kafka.output.unique.topic | Output Kafka topic for unique data | String | local.masterdata.unique | +| master-data-processor.kafka.output.duplicate.topic | Output Kafka topic for duplicate data | String | local.masterdata.duplicate | +| master-data-processor.kafka.output.transform.topic | Output Kafka topic for transformed data | String | local.masterdata.transform | +| master-data-processor.kafka.stats.topic | Kafka topic for statistics data | String | local.masterdata.stats | +| master-data-processor.kafka.groupId | Kafka consumer group ID | String | local-masterdata-pipeline-group | +| master-data-processor.kafka.producer.max-request-size | Maximum request size for Kafka producer | Number | 5242880 (5MB) | +| master-data-processor.task.window.time.in.seconds | Time window in seconds for tasks | Number | 5 | +| master-data-processor.task.window.count | Count of events within the time window | Number | 30 | +| master-data-processor.task.window.shards | Number of shards for the time window | Number | 1400 | +| master-data-processor.task.consumer.parallelism | Parallelism for task consumers | Number | 1 | +| master-data-processor.task.downstream.operators.parallelism | Parallelism for downstream operators | Number | 1 | +| master-data-processor.redis.database.extractor.duplication.store.id | Redis store ID for extractor duplication | Number | 1 | +| master-data-processor.redis.database.preprocessor.duplication.store.id | Redis store ID for preprocessor duplication | Number | 2 | +| master-data-processor.redis.database.key.expiry.seconds | Expiry time for Redis keys in seconds | Number | 3600 | +| master-data-processor.dataset.type | Type of master dataset | String | master-dataset | From fcacd2a0009845fbcc3cbd53860431549422cd7e Mon Sep 17 00:00:00 2001 From: Manoj Krishna <92361832+ManojKrishnaChintauri@users.noreply.github.com> Date: Fri, 9 Jun 2023 16:16:43 +0530 Subject: [PATCH 09/37] Build deploy v2 (#19) * #0 - Refactor Dockerfile and Github actions workflow --------- Co-authored-by: Santhosh Vasabhaktula Co-authored-by: ManojCKrishna --- .dockerignore | 1 + .github/workflows/build_and_deploy.yaml | 83 +++++++++++++++++++++++-- Dockerfile | 53 ++++++++++++---- 3 files changed, 119 insertions(+), 18 deletions(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..6b8710a7 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.git diff --git a/.github/workflows/build_and_deploy.yaml 
b/.github/workflows/build_and_deploy.yaml index 8610ad28..03a090b0 100644 --- a/.github/workflows/build_and_deploy.yaml +++ b/.github/workflows/build_and_deploy.yaml @@ -21,32 +21,105 @@ jobs: needs: check-tag if: needs.check-tag.outputs.ALLOWED_TAG == 'True' runs-on: ubuntu-latest + strategy: + matrix: + include: + - image: "extractor" + target: "extractor-image" + - image: "preprocessor" + target: "preprocessor-image" + - image: "denormalizer" + target: "denormalizer-image" + - image: "transformer" + target: "transformer-image" + - image: "druid-router" + target: "router-image" + - image: "merged-pipeline" + target: "merged-image" + - image: "master-data-processor" + target: "master-data-processor-image" + - image: "kafka-connector" + target: "kafka-connector-image" + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Maven Build + run: | + mvn clean install + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: Login to docker hub uses: docker/login-action@v2 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build docker image and push + - name: Build merged-pipeline image and push uses: docker/build-push-action@v4 with: platforms: linux/amd64 + target: merged-image push: true - tags: ${{ secrets.DOCKERHUB_USERNAME }}/obsrv-core:${{ github.ref_name }} + tags: ${{ secrets.DOCKERHUB_USERNAME }}/merged-pipeline:${{ github.ref_name }} + + - name: Build merged-pipeline image and push + uses: docker/build-push-action@v4 + with: + platforms: linux/amd64 + target: master-data-processor-image + push: true + tags: ${{ secrets.DOCKERHUB_USERNAME }}/master-data-processor:${{ github.ref_name }} + + - name: Build merged-pipeline image and push + uses: docker/build-push-action@v4 + with: + platforms: linux/amd64 + target: kafka-connector-image + push: true + tags: ${{ secrets.DOCKERHUB_USERNAME }}/kafka-connector:${{ github.ref_name }} + + - name: Build ${{matrix.image}} image and push + uses: docker/build-push-action@v4 + with: + platforms: linux/amd64 + target: ${{matrix.target}} + push: true + tags: ${{ secrets.DOCKERHUB_USERNAME }}/${{matrix.image}}:${{ github.ref_name }} aws-deploy: needs: [check-tag, docker-build] - if: needs.check-tag.outputs.ALLOWED_TAG == 'True' && vars.CLOUD_PROVIDER == 'aws' + if: needs.check-tag.outputs.ALLOWED_TAG == 'True' runs-on: ubuntu-latest environment: aws-dev steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Setup Terragrunt + uses: autero1/action-terragrunt@v1.1.0 + with: + terragrunt_version: v0.45.8 + - name: Terragrunt installation + run: terragrunt --version + - name: Clone the terraform deployment repo uses: actions/checkout@v3 with: repository: ${{ vars.DEPLOY_REPO }} path: deploy ref: ${{ vars.DEPLOY_REPO_REF }} + + - name: Fetch and update kubeconfig file + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ vars.AWS_REGION }} + run: | + aws eks --region ${{ vars.AWS_REGION }} update-kubeconfig --name ${{ vars.KUBERNETES_CLUSTER_NAME }} - name: Run terraform init and apply env: @@ -55,11 +128,11 @@ jobs: AWS_REGION: ${{ vars.AWS_REGION }} AWS_TERRAFORM_BACKEND_BUCKET_NAME: ${{ vars.AWS_TERRAFORM_BACKEND_BUCKET_NAME }} AWS_TERRAFORM_BACKEND_BUCKET_REGION: ${{ vars.AWS_TERRAFORM_BACKEND_BUCKET_REGION }} + KUBE_CONFIG_PATH: ~/.kube/config run: | cd deploy/terraform/aws terragrunt init - terragrunt apply -auto-approve -replace=module.flink.helm_release.flink \ - 
-var flink_container_registry=${{ secrets.DOCKERHUB_USERNAME }} \ + terragrunt apply -auto-approve -var merged_pipeline_enabled={{ vars.MERGED_PIPELINE || 'true' }} --replace='module.flink.helm_release.flink' \ -var flink_image_tag=${{ github.ref_name }} azure-deploy: diff --git a/Dockerfile b/Dockerfile index 03b4dbb4..3ba6a51c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,41 @@ -FROM --platform=linux/x86_64 maven:3.6.0-jdk-11-slim AS build +FROM --platform=linux/x86_64 maven:3.6.0-jdk-11-slim AS build-core COPY . /app -RUN mvn -f /app/pom.xml clean package -DskipTests - -FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 -USER flink -COPY --from=build /app/dataset-registry/target/dataset-registry-1.0.0.jar $FLINK_HOME/lib/ -COPY --from=build /app/framework/target/framework-1.0.0.jar $FLINK_HOME/lib/ -COPY --from=build /app/pipeline/denormalizer/target/denormalizer-1.0.0.jar $FLINK_HOME/lib/ -COPY --from=build /app/pipeline/druid-router/target/druid-router-1.0.0.jar $FLINK_HOME/lib/ -COPY --from=build /app/pipeline/extractor/target/extractor-1.0.0.jar $FLINK_HOME/lib/ -COPY --from=build /app/pipeline/pipeline-merged/target/pipeline-merged-1.0.0.jar $FLINK_HOME/lib/ -COPY --from=build /app/pipeline/preprocessor/target/preprocessor-1.0.0.jar $FLINK_HOME/lib/ -COPY --from=build /app/pipeline/transformer/target/transformer-1.0.0.jar $FLINK_HOME/lib/ +RUN mvn clean install -DskipTests -f /app/framework/pom.xml +RUN mvn clean install -DskipTests -f /app/dataset-registry/pom.xml + +FROM --platform=linux/x86_64 maven:3.6.0-jdk-11-slim AS build-pipeline +COPY --from=build-core /root/.m2 /root/.m2 +COPY . /app +RUN mvn clean package -DskipTests -f /app/pipeline/pom.xml + +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as extractor-image +USER flink +COPY --from=build-pipeline /app/pipeline/extractor/target/extractor-1.0.0.jar $FLINK_HOME/lib/ + +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as preprocessor-image +USER flink +COPY --from=build-pipeline /app/pipeline/preprocessor/target/preprocessor-1.0.0.jar $FLINK_HOME/lib/ + +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as denormalizer-image +USER flink +COPY --from=build-pipeline /app/pipeline/denormalizer/target/denormalizer-1.0.0.jar $FLINK_HOME/lib/ + +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as transformer-image +USER flink +COPY --from=build-pipeline /app/pipeline/transformer/target/transformer-1.0.0.jar $FLINK_HOME/lib/ + +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as router-image +USER flink +COPY --from=build-pipeline /app/pipeline/druid-router/target/druid-router-1.0.0.jar $FLINK_HOME/lib/ + +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as merged-image +USER flink +COPY --from=build-pipeline /app/pipeline/pipeline-merged/target/pipeline-merged-1.0.0.jar $FLINK_HOME/lib/ + +FROM --platform=linux/x86_64 sunbird/flink:1.15.2-scala_2.12-java11 as master-data-processor-image +USER flink +COPY --from=build-pipeline /app/pipeline/master-data-processor/target/master-data-processor-1.0.0.jar $FLINK_HOME/lib + +FROM --platform=linux/x86_64 sunbird/flink:1.15.2-scala_2.12-java11 as kafka-connector-image +USER flink +COPY --from=build-pipeline /app/pipeline/kafka-connector/target/kafka-connector-1.0.0.jar $FLINK_HOME/lib \ No newline at end of file From e1abfcd4383398ee84f3ecd11488da5958d01ac8 Mon Sep 17 00:00:00 2001 From: shiva-rakshith Date: Fri, 10 
Nov 2023 16:15:51 +0530 Subject: [PATCH 10/37] Update DatasetModels.scala --- .../org/sunbird/obsrv/model/DatasetModels.scala | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index 48791415..68d535a0 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -48,10 +48,16 @@ object DatasetModels { @JsonProperty("field_key") fieldKey: String, @JsonProperty("transformation_function") transformationFunction: TransformationFunction, @JsonProperty("status") status: String) - case class ConnectorConfig(@JsonProperty("kafkaBrokers") kafkaBrokers: String, @JsonProperty("topic") topic: String, @JsonProperty("jdbc_user") jdbcUser: String, - @JsonProperty("jdbc_password") jdbcPassword: String, @JsonProperty("jdbc_host") jdbcHost: String, @JsonProperty("jdbc_port") jdbcPort: Int, - @JsonProperty("jdbc_database") jdbcDatabase: String, @JsonProperty("jdbc_database_table") jdbcDatabaseTable: String, @JsonProperty("jdbc_batch_size") jdbcBatchSize: Int, - @JsonProperty("jdbc_batches_per_minute") jdbcBatchesPerMinute: Int, @JsonProperty("jdbc_database_type") jdbcDatabaseType: String) + case class ConnectorConfig(@JsonProperty("kafkaBrokers") kafkaBrokers: String, @JsonProperty("topic") topic: String, @JsonProperty("type")databaseType: String, + @JsonProperty("connection") connection: Connection, @JsonProperty("tableName") tableName: String, @JsonProperty("databaseName") databaseName: String, + @JsonProperty("pollingInterval") pollingInterval: PollingInterval, @JsonProperty("authenticationMechanism") authenticationMechanism: AuthenticationMechanism, + @JsonProperty("batchSize") batchSize: Int) + + case class Connection(@JsonProperty("host") host: String, @JsonProperty("port") port: String) + + case class PollingInterval(@JsonProperty("type") pollingType: String, @JsonProperty("cronExpression") cronExpression: String) + + case class AuthenticationMechanism(@JsonProperty("encrypted") encrypted: Boolean, @JsonProperty("encryptedValues") encryptedValues: String) case class ConnectorStats(@JsonProperty("last_fetch_timestamp") lastFetchTimestamp: String, @JsonProperty("records") records: Long, @JsonProperty("avg_batch_read_time") avgBatchReadTime: Long, @JsonProperty("disconnections") disconnections: Int) From 9a6918ed652698d0a8c9df2937c278bd84bc5916 Mon Sep 17 00:00:00 2001 From: Manjunath Davanam Date: Wed, 15 Nov 2023 16:45:54 +0530 Subject: [PATCH 11/37] Release 1.3.0 into Main branch (#34) * testing new images * testing new images * testing new images * testing new images * testing new images * build new image with bug fixes * update dockerfile * update dockerfile * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * #0 fix: add individual extraction * Issue #0 fix: upgrade ubuntu packages for vulnerabilities * #0 fix: update github actions release condition --------- Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: Sowmya N Dixit --- .github/workflows/build_and_deploy.yaml | 2 +- Dockerfile | 22 +-- data-products/pom.xml | 2 +- dataset-registry/pom.xml | 4 +- framework/pom.xml | 14 +- pipeline/druid-router/pom.xml | 10 +- pipeline/extractor/pom.xml | 13 +- .../functions/ExtractionFunction.scala | 2 +- 
pipeline/kafka-connector/pom.xml | 4 +- pipeline/master-data-processor/pom.xml | 9 ++ pipeline/pipeline-merged/pom.xml | 10 +- pipeline/preprocessor/pom.xml | 9 ++ stubs/docker/apache-flink-plugins/Dockerfile | 14 ++ stubs/docker/apache-flink/Dockerfile | 98 +++++++++++ .../docker/apache-flink/docker-entrypoint.sh | 152 ++++++++++++++++++ 15 files changed, 334 insertions(+), 31 deletions(-) create mode 100644 stubs/docker/apache-flink-plugins/Dockerfile create mode 100644 stubs/docker/apache-flink/Dockerfile create mode 100644 stubs/docker/apache-flink/docker-entrypoint.sh diff --git a/.github/workflows/build_and_deploy.yaml b/.github/workflows/build_and_deploy.yaml index 03a090b0..90b01883 100644 --- a/.github/workflows/build_and_deploy.yaml +++ b/.github/workflows/build_and_deploy.yaml @@ -15,7 +15,7 @@ jobs: id: tag-checker run: | (echo -n TRIGGER_ALLOWED= && echo 'print("${{ github.ref_name }}".split("_")[0] - in ${{ vars.CURRENT_RELEASE }})' | python3) >> "$GITHUB_OUTPUT" + not in ${{ vars.CURRENT_RELEASE }})' | python3) >> "$GITHUB_OUTPUT" docker-build: needs: check-tag diff --git a/Dockerfile b/Dockerfile index 3ba6a51c..17efe642 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,41 +1,41 @@ -FROM --platform=linux/x86_64 maven:3.6.0-jdk-11-slim AS build-core +FROM --platform=linux/x86_64 maven:3.9.4-eclipse-temurin-11-focal AS build-core COPY . /app RUN mvn clean install -DskipTests -f /app/framework/pom.xml RUN mvn clean install -DskipTests -f /app/dataset-registry/pom.xml -FROM --platform=linux/x86_64 maven:3.6.0-jdk-11-slim AS build-pipeline +FROM --platform=linux/x86_64 maven:3.9.4-eclipse-temurin-11-focal AS build-pipeline COPY --from=build-core /root/.m2 /root/.m2 COPY . /app RUN mvn clean package -DskipTests -f /app/pipeline/pom.xml -FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as extractor-image +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as extractor-image USER flink COPY --from=build-pipeline /app/pipeline/extractor/target/extractor-1.0.0.jar $FLINK_HOME/lib/ -FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as preprocessor-image +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as preprocessor-image USER flink COPY --from=build-pipeline /app/pipeline/preprocessor/target/preprocessor-1.0.0.jar $FLINK_HOME/lib/ -FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as denormalizer-image +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as denormalizer-image USER flink COPY --from=build-pipeline /app/pipeline/denormalizer/target/denormalizer-1.0.0.jar $FLINK_HOME/lib/ -FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as transformer-image +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as transformer-image USER flink COPY --from=build-pipeline /app/pipeline/transformer/target/transformer-1.0.0.jar $FLINK_HOME/lib/ -FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as router-image +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as router-image USER flink COPY --from=build-pipeline /app/pipeline/druid-router/target/druid-router-1.0.0.jar $FLINK_HOME/lib/ -FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-java11 as merged-image +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as merged-image USER flink COPY --from=build-pipeline /app/pipeline/pipeline-merged/target/pipeline-merged-1.0.0.jar 
$FLINK_HOME/lib/ -FROM --platform=linux/x86_64 sunbird/flink:1.15.2-scala_2.12-java11 as master-data-processor-image +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as master-data-processor-image USER flink COPY --from=build-pipeline /app/pipeline/master-data-processor/target/master-data-processor-1.0.0.jar $FLINK_HOME/lib -FROM --platform=linux/x86_64 sunbird/flink:1.15.2-scala_2.12-java11 as kafka-connector-image +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as kafka-connector-image USER flink -COPY --from=build-pipeline /app/pipeline/kafka-connector/target/kafka-connector-1.0.0.jar $FLINK_HOME/lib \ No newline at end of file +COPY --from=build-pipeline /app/pipeline/kafka-connector/target/kafka-connector-1.0.0.jar $FLINK_HOME/lib diff --git a/data-products/pom.xml b/data-products/pom.xml index fa15ad3c..e79564e5 100644 --- a/data-products/pom.xml +++ b/data-products/pom.xml @@ -228,4 +228,4 @@ - \ No newline at end of file + diff --git a/dataset-registry/pom.xml b/dataset-registry/pom.xml index 56107c1e..e3950291 100644 --- a/dataset-registry/pom.xml +++ b/dataset-registry/pom.xml @@ -36,7 +36,7 @@ com.google.code.gson gson - 2.4 + 2.8.9 com.typesafe @@ -178,4 +178,4 @@ - \ No newline at end of file + diff --git a/framework/pom.xml b/framework/pom.xml index cd411347..31402224 100644 --- a/framework/pom.xml +++ b/framework/pom.xml @@ -25,7 +25,6 @@ org.apache.flink flink-streaming-scala_${scala.maj.version} ${flink.version} - provided org.apache.flink @@ -51,17 +50,17 @@ com.google.code.gson gson - 2.4 + 2.8.9 com.fasterxml.jackson.core jackson-databind - 2.12.7 + 2.15.2 com.fasterxml.jackson.module jackson-module-scala_${scala.maj.version} - 2.12.7 + 2.15.2 com.fasterxml.jackson.core @@ -74,11 +73,6 @@ jedis 2.9.0 - - com.datastax.cassandra - cassandra-driver-core - 3.7.0 - com.typesafe config @@ -249,4 +243,4 @@ - \ No newline at end of file + diff --git a/pipeline/druid-router/pom.xml b/pipeline/druid-router/pom.xml index 9ff38aff..4945f84d 100644 --- a/pipeline/druid-router/pom.xml +++ b/pipeline/druid-router/pom.xml @@ -65,9 +65,17 @@ com.fasterxml.jackson.core jackson-databind + + com.google.guava + guava + - + + com.google.guava + guava + 32.1.2-jre + org.apache.flink flink-test-utils diff --git a/pipeline/extractor/pom.xml b/pipeline/extractor/pom.xml index 206df3cf..b73d697d 100644 --- a/pipeline/extractor/pom.xml +++ b/pipeline/extractor/pom.xml @@ -72,6 +72,17 @@ embedded-redis 0.7.1 test + + + com.google.guava + guava + + + + + com.google.guava + guava + 32.1.2-jre org.apache.flink @@ -225,4 +236,4 @@ - \ No newline at end of file + diff --git a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala index 48dac855..f8f4520c 100644 --- a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala +++ b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala @@ -50,7 +50,7 @@ class ExtractionFunction(config: ExtractorConfig, @transient var dedupEngine: De return } val dataset = datasetOpt.get - if (dataset.extractionConfig.isDefined && dataset.extractionConfig.get.isBatchEvent.get) { + if (!containsEvent(batchEvent) && dataset.extractionConfig.isDefined && dataset.extractionConfig.get.isBatchEvent.get) { val eventAsText = JSONUtil.serialize(batchEvent) if (dataset.extractionConfig.get.dedupConfig.isDefined && 
dataset.extractionConfig.get.dedupConfig.get.dropDuplicates.get) { val isDup = isDuplicate(dataset.id, dataset.extractionConfig.get.dedupConfig.get.dedupKey, eventAsText, context, config)(dedupEngine) diff --git a/pipeline/kafka-connector/pom.xml b/pipeline/kafka-connector/pom.xml index f665fc22..bdf2fe8f 100644 --- a/pipeline/kafka-connector/pom.xml +++ b/pipeline/kafka-connector/pom.xml @@ -50,7 +50,7 @@ com.fasterxml.jackson.datatype jackson-datatype-joda - 2.12.7 + 2.15.2 org.sunbird.obsrv @@ -229,4 +229,4 @@ - \ No newline at end of file + diff --git a/pipeline/master-data-processor/pom.xml b/pipeline/master-data-processor/pom.xml index 38f1b504..52783714 100644 --- a/pipeline/master-data-processor/pom.xml +++ b/pipeline/master-data-processor/pom.xml @@ -73,8 +73,17 @@ com.fasterxml.jackson.core jackson-databind + + com.google.guava + guava + + + com.google.guava + guava + 32.1.2-jre + org.json4s json4s-native_${scala.maj.version} diff --git a/pipeline/pipeline-merged/pom.xml b/pipeline/pipeline-merged/pom.xml index 5aa273ea..e19bc800 100644 --- a/pipeline/pipeline-merged/pom.xml +++ b/pipeline/pipeline-merged/pom.xml @@ -83,9 +83,17 @@ com.fasterxml.jackson.core jackson-databind + + com.google.guava + guava + - + + com.google.guava + guava + 32.1.2-jre + org.apache.kafka kafka-clients diff --git a/pipeline/preprocessor/pom.xml b/pipeline/preprocessor/pom.xml index 63e3334b..96171103 100644 --- a/pipeline/preprocessor/pom.xml +++ b/pipeline/preprocessor/pom.xml @@ -53,8 +53,17 @@ com.fasterxml.jackson.core jackson-databind + + com.google.guava + guava + + + com.google.guava + guava + 32.1.2-jre + org.apache.commons commons-lang3 diff --git a/stubs/docker/apache-flink-plugins/Dockerfile b/stubs/docker/apache-flink-plugins/Dockerfile new file mode 100644 index 00000000..1351c9c2 --- /dev/null +++ b/stubs/docker/apache-flink-plugins/Dockerfile @@ -0,0 +1,14 @@ +FROM sanketikahub/flink:1.15.2-scala_2.12-jdk-11-source +USER flink +RUN mkdir $FLINK_HOME/plugins/s3-fs-presto +RUN mkdir $FLINK_HOME/plugins/gs-fs-hadoop +RUN wget -nv -O flink-streaming-scala_2.12-1.15.2.jar "https://repo1.maven.org/maven2/org/apache/flink/flink-streaming-scala_2.12/1.15.2/flink-streaming-scala_2.12-1.15.2.jar"; \ + mv flink-streaming-scala_2.12-1.15.2.jar $FLINK_HOME/lib/ +# COPY flink-shaded-hadoop2-uber-2.8.3-1.8.3.jar $FLINK_HOME/lib/ +# COPY flink-s3-fs-hadoop-1.15.2.jar $FLINK_HOME/lib/ +RUN wget -nv -O flink-azure-fs-hadoop-1.15.2.jar "https://repo1.maven.org/maven2/org/apache/flink/flink-azure-fs-hadoop/1.15.2/flink-azure-fs-hadoop-1.15.2.jar"; \ + mv flink-azure-fs-hadoop-1.15.2.jar $FLINK_HOME/lib/ +RUN wget -nv -O flink-s3-fs-presto-1.15.2.jar "https://repo1.maven.org/maven2/org/apache/flink/flink-s3-fs-presto/1.15.2/flink-s3-fs-presto-1.15.2.jar"; \ + mv flink-s3-fs-presto-1.15.2.jar $FLINK_HOME/plugins/s3-fs-presto +RUN wget -nv -O flink-gs-fs-hadoop-1.15.2.jar "https://repo1.maven.org/maven2/org/apache/flink/flink-gs-fs-hadoop/1.15.2/flink-gs-fs-hadoop-1.15.2.jar"; \ + mv flink-gs-fs-hadoop-1.15.2.jar $FLINK_HOME/plugins/gs-fs-hadoop diff --git a/stubs/docker/apache-flink/Dockerfile b/stubs/docker/apache-flink/Dockerfile new file mode 100644 index 00000000..e3d64562 --- /dev/null +++ b/stubs/docker/apache-flink/Dockerfile @@ -0,0 +1,98 @@ +# FLINK SOURCE LINK - https://github.com/apache/flink-docker/blob/4794f9425513fb4c0b55ec1efd629e8eb7e5d8c5/1.15/scala_2.12-java11-ubuntu/Dockerfile +############################################################################### +# Licensed to the Apache 
Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### + +FROM --platform=linux/x86_64 eclipse-temurin:11.0.20.1_1-jdk-focal + +RUN apt-get update +RUN apt-get install libcurl4 curl -y + +# Install dependencies +RUN set -ex; \ + apt-get update; \ + apt-get -y install gpg libsnappy1v5 gettext-base libjemalloc-dev; \ + rm -rf /var/lib/apt/lists/* + +# Grab gosu for easy step-down from root +ENV GOSU_VERSION 1.11 +RUN set -ex; \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture)"; \ + wget -nv -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture).asc"; \ + export GNUPGHOME="$(mktemp -d)"; \ + for server in ha.pool.sks-keyservers.net $(shuf -e \ + hkp://p80.pool.sks-keyservers.net:80 \ + keyserver.ubuntu.com \ + hkp://keyserver.ubuntu.com:80 \ + pgp.mit.edu) ; do \ + gpg --batch --keyserver "$server" --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 && break || : ; \ + done && \ + gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" /usr/local/bin/gosu.asc; \ + chmod +x /usr/local/bin/gosu; \ + gosu nobody true + +# Configure Flink version +ENV FLINK_TGZ_URL=https://www.apache.org/dyn/closer.cgi?action=download&filename=flink/flink-1.15.2/flink-1.15.2-bin-scala_2.12.tgz \ + FLINK_ASC_URL=https://www.apache.org/dist/flink/flink-1.15.2/flink-1.15.2-bin-scala_2.12.tgz.asc \ + GPG_KEY=0F79F2AFB2351BC29678544591F9C1EC125FD8DB \ + CHECK_GPG=true + +# Prepare environment +ENV FLINK_HOME=/opt/flink +ENV PATH=$FLINK_HOME/bin:$PATH +RUN groupadd --system --gid=9999 flink && \ + useradd --system --home-dir $FLINK_HOME --uid=9999 --gid=flink flink +WORKDIR $FLINK_HOME + +# Install Flink +RUN set -ex; \ + wget -nv -O flink.tgz "$FLINK_TGZ_URL"; \ + \ + if [ "$CHECK_GPG" = "true" ]; then \ + wget -nv -O flink.tgz.asc "$FLINK_ASC_URL"; \ + export GNUPGHOME="$(mktemp -d)"; \ + for server in ha.pool.sks-keyservers.net $(shuf -e \ + hkp://p80.pool.sks-keyservers.net:80 \ + keyserver.ubuntu.com \ + hkp://keyserver.ubuntu.com:80 \ + pgp.mit.edu) ; do \ + gpg --batch --keyserver "$server" --recv-keys "$GPG_KEY" && break || : ; \ + done && \ + gpg --batch --verify flink.tgz.asc flink.tgz; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" flink.tgz.asc; \ + fi; \ + \ + tar -xf flink.tgz --strip-components=1; \ + rm flink.tgz; \ + \ + chown -R flink:flink .; \ + \ + # Replace default REST/RPC endpoint bind address to use the container's network interface \ + sed -i 's/rest.address: localhost/rest.address: 0.0.0.0/g' $FLINK_HOME/conf/flink-conf.yaml; \ + sed -i 's/rest.bind-address: localhost/rest.bind-address: 0.0.0.0/g' 
$FLINK_HOME/conf/flink-conf.yaml; \ + sed -i 's/jobmanager.bind-host: localhost/jobmanager.bind-host: 0.0.0.0/g' $FLINK_HOME/conf/flink-conf.yaml; \ + sed -i 's/taskmanager.bind-host: localhost/taskmanager.bind-host: 0.0.0.0/g' $FLINK_HOME/conf/flink-conf.yaml; \ + sed -i '/taskmanager.host: localhost/d' $FLINK_HOME/conf/flink-conf.yaml; + +# Configure container +COPY docker-entrypoint.sh / +ENTRYPOINT ["/docker-entrypoint.sh"] +EXPOSE 6123 8081 +CMD ["help"] diff --git a/stubs/docker/apache-flink/docker-entrypoint.sh b/stubs/docker/apache-flink/docker-entrypoint.sh new file mode 100644 index 00000000..8b0350e2 --- /dev/null +++ b/stubs/docker/apache-flink/docker-entrypoint.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash + +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### + +COMMAND_STANDALONE="standalone-job" +COMMAND_HISTORY_SERVER="history-server" + +# If unspecified, the hostname of the container is taken as the JobManager address +JOB_MANAGER_RPC_ADDRESS=${JOB_MANAGER_RPC_ADDRESS:-$(hostname -f)} +CONF_FILE="${FLINK_HOME}/conf/flink-conf.yaml" + +drop_privs_cmd() { + if [ $(id -u) != 0 ]; then + # Don't need to drop privs if EUID != 0 + return + elif [ -x /sbin/su-exec ]; then + # Alpine + echo su-exec flink + else + # Others + echo gosu flink + fi +} + +copy_plugins_if_required() { + if [ -z "$ENABLE_BUILT_IN_PLUGINS" ]; then + return 0 + fi + + echo "Enabling required built-in plugins" + for target_plugin in $(echo "$ENABLE_BUILT_IN_PLUGINS" | tr ';' ' '); do + echo "Linking ${target_plugin} to plugin directory" + plugin_name=${target_plugin%.jar} + + mkdir -p "${FLINK_HOME}/plugins/${plugin_name}" + if [ ! -e "${FLINK_HOME}/opt/${target_plugin}" ]; then + echo "Plugin ${target_plugin} does not exist. Exiting." 
+ exit 1 + else + ln -fs "${FLINK_HOME}/opt/${target_plugin}" "${FLINK_HOME}/plugins/${plugin_name}" + echo "Successfully enabled ${target_plugin}" + fi + done +} + +set_config_option() { + local option=$1 + local value=$2 + + # escape periods for usage in regular expressions + local escaped_option=$(echo ${option} | sed -e "s/\./\\\./g") + + # either override an existing entry, or append a new one + if grep -E "^${escaped_option}:.*" "${CONF_FILE}" > /dev/null; then + sed -i -e "s/${escaped_option}:.*/$option: $value/g" "${CONF_FILE}" + else + echo "${option}: ${value}" >> "${CONF_FILE}" + fi +} + +prepare_configuration() { + set_config_option jobmanager.rpc.address ${JOB_MANAGER_RPC_ADDRESS} + set_config_option blob.server.port 6124 + set_config_option query.server.port 6125 + + if [ -n "${TASK_MANAGER_NUMBER_OF_TASK_SLOTS}" ]; then + set_config_option taskmanager.numberOfTaskSlots ${TASK_MANAGER_NUMBER_OF_TASK_SLOTS} + fi + + if [ -n "${FLINK_PROPERTIES}" ]; then + echo "${FLINK_PROPERTIES}" >> "${CONF_FILE}" + fi + envsubst < "${CONF_FILE}" > "${CONF_FILE}.tmp" && mv "${CONF_FILE}.tmp" "${CONF_FILE}" +} + +maybe_enable_jemalloc() { + if [ "${DISABLE_JEMALLOC:-false}" == "false" ]; then + JEMALLOC_PATH="/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so" + JEMALLOC_FALLBACK="/usr/lib/x86_64-linux-gnu/libjemalloc.so" + if [ -f "$JEMALLOC_PATH" ]; then + export LD_PRELOAD=$LD_PRELOAD:$JEMALLOC_PATH + elif [ -f "$JEMALLOC_FALLBACK" ]; then + export LD_PRELOAD=$LD_PRELOAD:$JEMALLOC_FALLBACK + else + if [ "$JEMALLOC_PATH" = "$JEMALLOC_FALLBACK" ]; then + MSG_PATH=$JEMALLOC_PATH + else + MSG_PATH="$JEMALLOC_PATH and $JEMALLOC_FALLBACK" + fi + echo "WARNING: attempted to load jemalloc from $MSG_PATH but the library couldn't be found. glibc will be used instead." + fi + fi +} + +maybe_enable_jemalloc + +copy_plugins_if_required + +prepare_configuration + +args=("$@") +if [ "$1" = "help" ]; then + printf "Usage: $(basename "$0") (jobmanager|${COMMAND_STANDALONE}|taskmanager|${COMMAND_HISTORY_SERVER})\n" + printf " Or $(basename "$0") help\n\n" + printf "By default, Flink image adopts jemalloc as default memory allocator. 
This behavior can be disabled by setting the 'DISABLE_JEMALLOC' environment variable to 'true'.\n" + exit 0 +elif [ "$1" = "jobmanager" ]; then + args=("${args[@]:1}") + + echo "Starting Job Manager" + + exec $(drop_privs_cmd) "$FLINK_HOME/bin/jobmanager.sh" start-foreground "${args[@]}" +elif [ "$1" = ${COMMAND_STANDALONE} ]; then + args=("${args[@]:1}") + + echo "Starting Job Manager" + + exec $(drop_privs_cmd) "$FLINK_HOME/bin/standalone-job.sh" start-foreground "${args[@]}" +elif [ "$1" = ${COMMAND_HISTORY_SERVER} ]; then + args=("${args[@]:1}") + + echo "Starting History Server" + + exec $(drop_privs_cmd) "$FLINK_HOME/bin/historyserver.sh" start-foreground "${args[@]}" +elif [ "$1" = "taskmanager" ]; then + args=("${args[@]:1}") + + echo "Starting Task Manager" + + exec $(drop_privs_cmd) "$FLINK_HOME/bin/taskmanager.sh" start-foreground "${args[@]}" +fi + +args=("${args[@]}") + +# Running command in pass-through mode +exec $(drop_privs_cmd) "${args[@]}" From ca5be1329622043da254e4e39314e77fb70461b7 Mon Sep 17 00:00:00 2001 From: shiva-rakshith Date: Wed, 15 Nov 2023 18:14:41 +0530 Subject: [PATCH 12/37] Update DatasetModels.scala --- .../src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index 68d535a0..21790f79 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -51,7 +51,7 @@ object DatasetModels { case class ConnectorConfig(@JsonProperty("kafkaBrokers") kafkaBrokers: String, @JsonProperty("topic") topic: String, @JsonProperty("type")databaseType: String, @JsonProperty("connection") connection: Connection, @JsonProperty("tableName") tableName: String, @JsonProperty("databaseName") databaseName: String, @JsonProperty("pollingInterval") pollingInterval: PollingInterval, @JsonProperty("authenticationMechanism") authenticationMechanism: AuthenticationMechanism, - @JsonProperty("batchSize") batchSize: Int) + @JsonProperty("batchSize") batchSize: Int, @JsonProperty("timestampColumn") timestampColumn: String) case class Connection(@JsonProperty("host") host: String, @JsonProperty("port") port: String) From cacf7585b9612d8c34acbc8dab986e95885a7a21 Mon Sep 17 00:00:00 2001 From: Anand Parthasarathy Date: Fri, 17 Nov 2023 15:02:04 +0530 Subject: [PATCH 13/37] Issue #2 feat: Remove kafka connector code --- pipeline/kafka-connector/pom.xml | 232 ------------------ .../src/main/resources/kafka-connector.conf | 17 -- .../task/KafkaConnectorConfig.scala | 24 -- .../task/KafkaConnectorStreamTask.scala | 72 ------ pipeline/pom.xml | 1 - 5 files changed, 346 deletions(-) delete mode 100644 pipeline/kafka-connector/pom.xml delete mode 100644 pipeline/kafka-connector/src/main/resources/kafka-connector.conf delete mode 100644 pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/kafkaconnector/task/KafkaConnectorConfig.scala delete mode 100644 pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/kafkaconnector/task/KafkaConnectorStreamTask.scala diff --git a/pipeline/kafka-connector/pom.xml b/pipeline/kafka-connector/pom.xml deleted file mode 100644 index bdf2fe8f..00000000 --- a/pipeline/kafka-connector/pom.xml +++ /dev/null @@ -1,232 +0,0 @@ - - - - 4.0.0 - - org.sunbird.obsrv - pipeline - 1.0 - - - org.sunbird.obsrv.pipeline - kafka-connector - 
1.0.0 - jar - Kafka Connector - - Reads data from source kafka topic(s) and writes them to a configurable topic - - - - UTF-8 - 1.4.0 - - - - - org.apache.flink - flink-streaming-scala_${scala.maj.version} - ${flink.version} - provided - - - org.sunbird.obsrv - dataset-registry - 1.0.0 - - - org.apache.kafka - kafka-clients - - - - - joda-time - joda-time - 2.12.5 - - - com.fasterxml.jackson.datatype - jackson-datatype-joda - 2.15.2 - - - org.sunbird.obsrv - framework - 1.0.0 - - - org.sunbird.obsrv - framework - 1.0.0 - test-jar - test - - - org.apache.flink - flink-test-utils - ${flink.version} - test - - - org.apache.flink - flink-runtime - ${flink.version} - test - tests - - - it.ozimov - embedded-redis - 0.7.1 - test - - - org.apache.flink - flink-streaming-java - ${flink.version} - test - tests - - - org.scalatest - scalatest_2.12 - 3.0.6 - test - - - org.mockito - mockito-core - 3.3.3 - test - - - - - src/main/scala - src/test/scala - - - org.apache.maven.plugins - maven-compiler-plugin - 3.8.1 - - 11 - - - - org.apache.maven.plugins - maven-shade-plugin - 3.2.1 - - - - package - - shade - - - - - com.google.code.findbugs:jsr305 - - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - org.sunbird.obsrv.kafkaconnector.task.KafkaConnectorStreamTask - - - - reference.conf - - - - - - - - - net.alchim31.maven - scala-maven-plugin - 4.4.0 - - ${java.target.runtime} - ${java.target.runtime} - ${scala.version} - false - - - - scala-compile-first - process-resources - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - - - maven-surefire-plugin - 2.22.2 - - true - - - - - org.scalatest - scalatest-maven-plugin - 1.0 - - ${project.build.directory}/surefire-reports - . - dp-duplication-testsuite.txt - - - - test - - test - - - - - - org.scoverage - scoverage-maven-plugin - ${scoverage.plugin.version} - - ${scala.version} - true - true - - - - - - diff --git a/pipeline/kafka-connector/src/main/resources/kafka-connector.conf b/pipeline/kafka-connector/src/main/resources/kafka-connector.conf deleted file mode 100644 index 093a94f1..00000000 --- a/pipeline/kafka-connector/src/main/resources/kafka-connector.conf +++ /dev/null @@ -1,17 +0,0 @@ -include "baseconfig.conf" - -kafka { - input.topic = ${job.env}".test" - // output.topic = ${job.env}".ingest" - output.failed.topic = ${job.env}".failed" - event.max.size = "1048576" # Max is only 1MB - groupId = ${job.env}"-kafkaconnector-group" - producer { - max-request-size = 5242880 - } -} - -task { - consumer.parallelism = 1 - downstream.operators.parallelism = 1 -} \ No newline at end of file diff --git a/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/kafkaconnector/task/KafkaConnectorConfig.scala b/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/kafkaconnector/task/KafkaConnectorConfig.scala deleted file mode 100644 index 902a8ee2..00000000 --- a/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/kafkaconnector/task/KafkaConnectorConfig.scala +++ /dev/null @@ -1,24 +0,0 @@ -package org.sunbird.obsrv.kafkaconnector.task - -import com.typesafe.config.Config -import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.api.java.typeutils.TypeExtractor -import org.apache.flink.streaming.api.scala.OutputTag -import org.sunbird.obsrv.core.streaming.BaseJobConfig - -import scala.collection.mutable - -class KafkaConnectorConfig (override val config: Config) extends BaseJobConfig[String](config, "KafkaConnectorJob") { - - private val 
serialVersionUID = 2905979435603791379L - - implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) - implicit val stringTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) - - override def inputTopic(): String = "" - override def inputConsumer(): String = "" - - private val DUMMY_OUTPUT_TAG = "dummy-events" - override def successTag(): OutputTag[String] = OutputTag[String](DUMMY_OUTPUT_TAG) - -} diff --git a/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/kafkaconnector/task/KafkaConnectorStreamTask.scala b/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/kafkaconnector/task/KafkaConnectorStreamTask.scala deleted file mode 100644 index c36f745c..00000000 --- a/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/kafkaconnector/task/KafkaConnectorStreamTask.scala +++ /dev/null @@ -1,72 +0,0 @@ -package org.sunbird.obsrv.kafkaconnector.task - -import com.typesafe.config.ConfigFactory -import org.apache.flink.api.java.utils.ParameterTool -import org.apache.flink.streaming.api.datastream.DataStream -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment -import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} -import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil} -import org.sunbird.obsrv.registry.DatasetRegistry -import org.joda.time.DateTime -import org.joda.time.DateTimeZone - -import java.io.File -import scala.collection.mutable - -class KafkaConnectorStreamTask(config: KafkaConnectorConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[String] { - - private val serialVersionUID = -7729362727131516112L - - // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster - def process(): Unit = { - implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) - - val datasetSourceConfig = DatasetRegistry.getDatasetSourceConfig() - datasetSourceConfig.map { configList => - configList.filter(_.connectorType.equalsIgnoreCase("kafka")).map { - dataSourceConfig => - val dataStream: DataStream[String] = - getStringDataStream(env, config, List(dataSourceConfig.connectorConfig.topic), - config.kafkaConsumerProperties(kafkaBrokerServers = Some(dataSourceConfig.connectorConfig.kafkaBrokers), - kafkaConsumerGroup = Some(s"kafka-${dataSourceConfig.connectorConfig.topic}-consumer")), - consumerSourceName = s"kafka-${dataSourceConfig.connectorConfig.topic}", kafkaConnector) - val datasetId = dataSourceConfig.datasetId - val kafkaOutputTopic = DatasetRegistry.getDataset(datasetId).get.datasetConfig.entryTopic - val resultMapStream: DataStream[String] = dataStream - .filter{msg: String => JSONUtil.isJSON(msg)}.returns(classOf[String]) // TODO: Add a metric to capture invalid JSON messages - .map { streamMap: String => { - val mutableMap = JSONUtil.deserialize[mutable.Map[String, AnyRef]](streamMap) - mutableMap.put("dataset", datasetId) - mutableMap.put("syncts", java.lang.Long.valueOf(new DateTime(DateTimeZone.UTC).getMillis)) - JSONUtil.serialize(mutableMap) - } - }.returns(classOf[String]) - resultMapStream.sinkTo(kafkaConnector.kafkaStringSink(kafkaOutputTopic)) - .name(s"$datasetId-kafka-connector-sink").uid(s"$datasetId-kafka-connector-sink") - .setParallelism(config.downstreamOperatorsParallelism) - } - env.execute(config.jobName) - } - } - - override def processStream(dataStream: DataStream[String]): DataStream[String] = { - null - } - // $COVERAGE-ON$ 
-} - -// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster -object KafkaConnectorStreamTask { - - def main(args: Array[String]): Unit = { - val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) - val config = configFilePath.map { - path => ConfigFactory.parseFile(new File(path)).resolve() - }.getOrElse(ConfigFactory.load("kafka-connector.conf").withFallback(ConfigFactory.systemEnvironment())) - val kafkaConnectorConfig = new KafkaConnectorConfig(config) - val kafkaUtil = new FlinkKafkaConnector(kafkaConnectorConfig) - val task = new KafkaConnectorStreamTask(kafkaConnectorConfig, kafkaUtil) - task.process() - } -} -// $COVERAGE-ON$ diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 2934fa49..07f8e191 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -26,7 +26,6 @@ transformer druid-router pipeline-merged - kafka-connector master-data-processor From e4d3dcf9a9f80614f352c0939720ce5fa6710015 Mon Sep 17 00:00:00 2001 From: shiva-rakshith Date: Tue, 21 Nov 2023 17:06:21 +0530 Subject: [PATCH 14/37] feat: add function to get all datasets --- .../main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala index e71a0915..b2b88980 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala @@ -12,6 +12,8 @@ object DatasetRegistry { private val datasetSourceConfig: Option[List[DatasetSourceConfig]] = DatasetRegistryService.readAllDatasetSourceConfig() private val datasources: Map[String, List[DataSource]] = DatasetRegistryService.readAllDatasources() + def getAllDatasets(): Map[String, Dataset] = datasets + def getAllDatasets(datasetType: String): List[Dataset] = { datasets.filter(f => f._2.datasetType.equals(datasetType)).values.toList } From 02ebca49e29aeed849aeee54e35dc9ac1e126b36 Mon Sep 17 00:00:00 2001 From: Praveen Veleneni <66662436+pveleneni@users.noreply.github.com> Date: Fri, 15 Dec 2023 16:47:08 +0530 Subject: [PATCH 15/37] Release 1.3.1 into Main (#43) * testing new images * testing new images * testing new images * testing new images * testing new images * build new image with bug fixes * update dockerfile * update dockerfile * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * feat: update all failed, invalid and duplicate topic names * feat: update kafka topic names in test cases * #0 fix: add individual extraction * feat: update failed event * Update ErrorConstants.scala * feat: update failed event * Issue #0 fix: upgrade ubuntu packages for vulnerabilities * feat: add exception handling for json deserialization * Update BaseProcessFunction.scala * Update BaseProcessFunction.scala * feat: update batch failed event generation * Update ExtractionFunction.scala * feat: update invalid json exception handling * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 fix: remove cloning object * Issue #46 feat: update batch failed event * #0 fix: update github actions release condition * Issue #46 feat: add error reasons * Issue #46 feat: add exception stack trace * Issue #46 feat: add exception stack trace * Release 1.3.1 Changes (#42) * 
Dataset enhancements (#38) * feat: add connector config and connector stats update functions * Issue #33 feat: add documentation for Dataset, Datasources, Data In and Query APIs * Update DatasetModels.scala * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * #0 fix: add individual extraction --------- Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: Sowmya N Dixit * #0000 [SV] - Fallback to local redis instance if embedded redis is not starting * Update DatasetModels.scala * #0000 - refactor the denormalization logic 1. Do not fail the denormalization if the denorm key is missing 2. Add clear message whether the denorm is successful or failed or partially successful 3. Handle denorm for both text and number fields * #0000 - refactor: 1. Created an enum for dataset status and ignore events if the dataset is not in Live status 2. Created an outputtag for denorm failed stats 3. Parse event validation failed messages into a case class * #0000 - refactor: 1. Updated the DruidRouter job to publish data to router topics dynamically 2. Updated framework to create dynamicKafkaSink object * #0000 - mega refactoring: 1. Made calls to getAllDatasets and getAllDatasetSources to always query postgres 2. Created BaseDatasetProcessFunction for all flink functions to extend that would dynamically resolve dataset config, initialize metrics and handle common failures 3. Refactored serde - merged map and string serialization into one function and parameterized the function 4. Moved failed events sinking into a common base class 5. Master dataset processor can now do denormalization with another master dataset as well * #0000 - mega refactoring: 1. Made calls to getAllDatasets and getAllDatasetSources to always query postgres 2. Created BaseDatasetProcessFunction for all flink functions to extend that would dynamically resolve dataset config, initialize metrics and handle common failures 3. Refactored serde - merged map and string serialization into one function and parameterized the function 4. Moved failed events sinking into a common base class 5. Master dataset processor can now do denormalization with another master dataset as well * #0000 - mega refactoring: 1. Added validation to check if the event has a timestamp key and it is not blank nor invalid 2. Added timezone handling to store the data in druid in the TZ specified by the dataset * #0000 - minor refactoring: Updated DatasetRegistry.getDatasetSourceConfig to getAllDatasetSourceConfig * #0000 - mega refactoring: Refactored logs, error messages and metrics * #0000 - mega refactoring: Fix unit tests * #0000 - refactoring: 1. Introduced transformation mode to enable lenient transformations 2. Proper exception handling for transformer job * #0000 - refactoring: Fix test cases and code * #0000 - refactoring: upgrade embedded redis to work with macos sonoma m2 * #0000 - refactoring: Denormalizer test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Router test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Validator test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Framework test cases and bug fixes * #0000 - refactoring: kafka connector test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: improve code coverage and fix bugs * #0000 - refactoring: improve code coverage and fix bugs --- Now the code coverage is 100% * #0000 - refactoring: organize imports * #0000 - refactoring: 1.
transformer test cases and bug fixes - code coverage is 100% * #0000 - refactoring: test cases and bug fixes --------- Co-authored-by: shiva-rakshith Co-authored-by: Aniket Sakinala Co-authored-by: Manjunath Davanam Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: Sowmya N Dixit Co-authored-by: Anand Parthasarathy * #000:feat: Removed the provided scope of the kafka-client in the framework (#40) * #0000 - feat: Add dataset-type to system events (#41) * #0000 - feat: Add dataset-type to system events * #0000 - feat: Modify tests for dataset-type in system events * #0000 - feat: Remove unused getDatasetType function * #0000 - feat: Remove unused pom test dependencies * #0000 - feat: Remove unused pom test dependencies --------- Co-authored-by: Santhosh Co-authored-by: shiva-rakshith Co-authored-by: Aniket Sakinala Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: Sowmya N Dixit Co-authored-by: Anand Parthasarathy * Main conflicts fixes (#44) * feat: add connector config and connector stats update functions * Issue #33 feat: add documentation for Dataset, Datasources, Data In and Query APIs * Update DatasetModels.scala * Release 1.3.0 into Main branch (#34) * testing new images * testing new images * testing new images * testing new images * testing new images * build new image with bug fixes * update dockerfile * update dockerfile * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * #0 fix: add individual extraction * Issue #0 fix: upgrade ubuntu packages for vulnerabilities * #0 fix: update github actions release condition --------- Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: Sowmya N Dixit * Update DatasetModels.scala * Issue #2 feat: Remove kafka connector code * feat: add function to get all datasets * #000:feat: Resolve conflicts --------- Co-authored-by: shiva-rakshith Co-authored-by: Aniket Sakinala Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: Sowmya N Dixit Co-authored-by: Santhosh Co-authored-by: Anand Parthasarathy Co-authored-by: Ravi Mula --------- Co-authored-by: ManojKrishnaChintaluri Co-authored-by: shiva-rakshith Co-authored-by: Manjunath Davanam Co-authored-by: Sowmya N Dixit Co-authored-by: Santhosh Co-authored-by: Aniket Sakinala Co-authored-by: Anand Parthasarathy Co-authored-by: Ravi Mula --- data-products/pom.xml | 13 +- .../MasterDataProcessorIndexer.scala | 6 +- dataset-registry/pom.xml | 4 +- .../src/main/resources/dataset-registry.sql | 9 +- .../sunbird/obsrv/model/DatasetModels.scala | 43 ++- .../obsrv/registry/DatasetRegistry.scala | 46 +-- .../service/DatasetRegistryService.scala | 113 ++++---- .../BaseDatasetProcessFunction.scala | 195 +++++++++++++ .../spec/BaseSpecWithDatasetRegistry.scala | 37 ++- .../obsrv/spec/TestDatasetRegistrySpec.scala | 92 +++++- framework/pom.xml | 11 +- framework/src/main/resources/baseconfig.conf | 6 +- .../obsrv/core/cache/DedupEngine.scala | 26 +- .../obsrv/core/cache/RedisConnect.scala | 1 + .../sunbird/obsrv/core/model/Constants.scala | 18 ++ .../obsrv/core/model/ErrorConstants.scala | 22 +- .../org/sunbird/obsrv/core/model/Models.scala | 76 ++++- .../serde/{MapSerde.scala => SerdeUtil.scala} | 44 ++- .../obsrv/core/serde/StringSerde.scala | 32 --- .../core/streaming/BaseDeduplication.scala | 38 +-- 
.../obsrv/core/streaming/BaseJobConfig.scala | 16 +- .../core/streaming/BaseProcessFunction.scala | 128 +++++---- .../obsrv/core/streaming/BaseStreamTask.scala | 15 +- .../core/streaming/FlinkKafkaConnector.scala | 28 +- .../sunbird/obsrv/core/util/JSONUtil.scala | 28 +- .../obsrv/core/util/PostgresConnect.scala | 16 +- .../org/sunbird/obsrv/core/util/Util.scala | 6 +- framework/src/test/resources/base-test.conf | 3 +- framework/src/test/resources/test.conf | 4 +- framework/src/test/resources/test2.conf | 69 +++++ .../spec/BaseDeduplicationTestSpec.scala | 45 +++ .../spec/BaseProcessFunctionTestSpec.scala | 27 +- .../sunbird/spec/BaseProcessTestConfig.scala | 4 + .../scala/org/sunbird/spec/BaseSpec.scala | 10 +- .../sunbird/spec/BaseSpecWithPostgres.scala | 8 +- .../org/sunbird/spec/ModelsTestSpec.scala | 118 ++++++++ .../sunbird/spec/PostgresConnectSpec.scala | 2 +- .../org/sunbird/spec/RedisTestSpec.scala | 24 +- .../org/sunbird/spec/SerdeUtilTestSpec.scala | 75 +++++ .../org/sunbird/spec/TestMapStreamFunc.scala | 33 ++- .../org/sunbird/spec/TestMapStreamTask.scala | 4 +- .../sunbird/spec/TestStringStreamTask.scala | 6 +- pipeline/denormalizer/pom.xml | 38 ++- .../src/main/resources/de-normalization.conf | 2 +- .../functions/DenormalizerFunction.scala | 88 ++++-- .../DenormalizerWindowFunction.scala | 86 ++++-- .../task/DenormalizerConfig.scala | 13 +- .../task/DenormalizerStreamTask.scala | 13 +- .../task/DenormalizerWindowStreamTask.scala | 23 +- .../obsrv/denormalizer/util/DenormCache.scala | 102 +++---- .../denormalizer/src/test/resources/test.conf | 4 +- .../DenormalizerStreamTaskTestSpec.scala | 176 ++++++++++++ ...DenormalizerWindowStreamTaskTestSpec.scala | 204 ++++++++++++++ .../obsrv/denormalizer/EventFixture.scala | 15 + pipeline/druid-router/pom.xml | 54 +++- .../functions/DruidRouterFunction.scala | 34 ++- .../functions/DynamicRouterFunction.scala | 115 ++++++++ .../obsrv/router/task/DruidRouterConfig.scala | 3 + .../router/task/DruidRouterStreamTask.scala | 14 +- .../router/task/DynamicRouterStreamTask.scala | 66 +++++ .../DynamicRouterStreamTaskTestSpec.scala | 162 +++++++++++ .../sunbird/obsrv/router/EventFixture.scala | 7 + .../obsrv/router/TestTimestampKeyParser.scala | 124 +++++++++ pipeline/extractor/pom.xml | 42 ++- .../src/main/resources/extractor.conf | 5 +- .../functions/ExtractionFunction.scala | 116 +++++--- .../extractor/task/ExtractorConfig.scala | 24 +- .../extractor/task/ExtractorStreamTask.scala | 21 +- .../extractor/src/test/resources/test.conf | 10 +- .../extractor/src/test/resources/test2.conf | 24 ++ .../obsrv/extractor/EventFixture.scala | 15 + .../extractor/ExtractorStreamTestSpec.scala | 167 +++++++++++ pipeline/kafka-connector/pom.xml | 263 ++++++++++++++++++ .../src/main/resources/kafka-connector.conf | 16 ++ .../connector/task/KafkaConnectorConfig.scala | 25 ++ .../task/KafkaConnectorStreamTask.scala | 71 +++++ .../src/test/resources/test.conf | 14 + .../KafkaConnectorStreamTestSpec.scala | 126 +++++++++ pipeline/master-data-processor/pom.xml | 12 +- .../main/resources/master-data-processor.conf | 7 +- .../MasterDataProcessorFunction.scala | 62 ++--- .../task/MasterDataProcessorConfig.scala | 9 +- .../task/MasterDataProcessorStreamTask.scala | 21 +- .../obsrv/pipeline/util/MasterDataCache.scala | 16 +- .../src/test/resources/test.conf | 10 +- .../sunbird/obsrv/fixture/EventFixture.scala | 4 +- ...asterDataProcessorStreamTaskTestSpec.scala | 36 ++- pipeline/pipeline-merged/pom.xml | 7 +- .../src/main/resources/merged-pipeline.conf | 11 +- 
.../pipeline/task/MergedPipelineConfig.scala | 16 +- .../task/MergedPipelineStreamTask.scala | 6 +- .../src/test/resources/test.conf | 11 +- .../MergedPipelineStreamTaskTestSpec.scala | 78 +++++- pipeline/pom.xml | 5 +- pipeline/preprocessor/pom.xml | 4 +- .../main/resources/pipeline-preprocessor.conf | 5 +- .../functions/DeduplicationFunction.scala | 52 ++-- .../functions/EventValidationFunction.scala | 166 +++++++---- .../task/PipelinePreprocessorConfig.scala | 23 +- .../task/PipelinePreprocessorStreamTask.scala | 14 +- .../preprocessor/util/SchemaValidator.scala | 57 ++-- .../preprocessor/src/test/resources/test.conf | 5 +- .../PipelinePreprocessorStreamTestSpec.scala | 139 +++++++-- .../preprocessor/TestSchemaValidator.scala | 84 ++++-- .../preprocessor/fixture/EventFixtures.scala | 16 +- pipeline/transformer/pom.xml | 4 +- .../functions/TransformerFunction.scala | 36 ++- .../transformer/task/TransformerConfig.scala | 7 +- .../task/TransformerStreamTask.scala | 9 +- pom.xml | 4 - 110 files changed, 3763 insertions(+), 956 deletions(-) create mode 100644 dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala create mode 100644 framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala rename framework/src/main/scala/org/sunbird/obsrv/core/serde/{MapSerde.scala => SerdeUtil.scala} (55%) delete mode 100644 framework/src/main/scala/org/sunbird/obsrv/core/serde/StringSerde.scala create mode 100644 framework/src/test/resources/test2.conf create mode 100644 framework/src/test/scala/org/sunbird/spec/BaseDeduplicationTestSpec.scala create mode 100644 framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala create mode 100644 framework/src/test/scala/org/sunbird/spec/SerdeUtilTestSpec.scala create mode 100644 pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala create mode 100644 pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala create mode 100644 pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/EventFixture.scala create mode 100644 pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala create mode 100644 pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala create mode 100644 pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala create mode 100644 pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala create mode 100644 pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala create mode 100644 pipeline/extractor/src/test/resources/test2.conf create mode 100644 pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/EventFixture.scala create mode 100644 pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala create mode 100644 pipeline/kafka-connector/pom.xml create mode 100644 pipeline/kafka-connector/src/main/resources/kafka-connector.conf create mode 100644 pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorConfig.scala create mode 100644 pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorStreamTask.scala create mode 100644 pipeline/kafka-connector/src/test/resources/test.conf create mode 100644 pipeline/kafka-connector/src/test/scala/org/sunbird/obsrv/connector/KafkaConnectorStreamTestSpec.scala diff 
--git a/data-products/pom.xml b/data-products/pom.xml index e79564e5..51090a71 100644 --- a/data-products/pom.xml +++ b/data-products/pom.xml @@ -10,7 +10,7 @@ 3.1.0 - 2.12.10 + 2.12.11 2.12 1.1.1 @@ -225,6 +225,17 @@ + + + org.scoverage + scoverage-maven-plugin + ${scoverage.plugin.version} + + ${scala.version} + true + true + + diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala index 7823c7cb..e1ecfdec 100644 --- a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala @@ -88,7 +88,7 @@ object MasterDataProcessorIndexer { val response = Unirest.post(config.getString("druid.indexer.url")) .header("Content-Type", "application/json") .body(ingestionSpec).asJson() - response.ifFailure(response => throw new Exception("Exception while submitting ingestion task")) + response.ifFailure(_ => throw new Exception("Exception while submitting ingestion task")) } private def updateDataSourceRef(datasource: DataSource, datasourceRef: String): Unit = { @@ -100,7 +100,7 @@ object MasterDataProcessorIndexer { val response = Unirest.delete(config.getString("druid.datasource.delete.url") + datasourceRef) .header("Content-Type", "application/json") .asJson() - response.ifFailure(response => throw new Exception("Exception while deleting datasource" + datasourceRef)) + response.ifFailure(_ => throw new Exception("Exception while deleting datasource" + datasourceRef)) } private def createDataFile(dataset: Dataset, timestamp: Long, outputFilePath: String, objectKey: String): String = { @@ -115,7 +115,7 @@ object MasterDataProcessorIndexer { val sc = new SparkContext(conf) val readWriteConf = ReadWriteConfig(scanCount = 1000, maxPipelineSize = 1000) - val rdd = sc.fromRedisKV("*")(readWriteConfig = readWriteConf) + sc.fromRedisKV("*")(readWriteConfig = readWriteConf) .map(f => JSONUtil.deserialize[mutable.Map[String, AnyRef]](f._2)) .map(f => f.put("syncts", timestamp.asInstanceOf[AnyRef])) .map(f => JSONUtil.serialize(f)) diff --git a/dataset-registry/pom.xml b/dataset-registry/pom.xml index e3950291..fd17db70 100644 --- a/dataset-registry/pom.xml +++ b/dataset-registry/pom.xml @@ -62,9 +62,9 @@ test - it.ozimov + com.github.codemonstur embedded-redis - 0.7.1 + 1.0.0 test diff --git a/dataset-registry/src/main/resources/dataset-registry.sql b/dataset-registry/src/main/resources/dataset-registry.sql index aa997ebe..ff28ae98 100644 --- a/dataset-registry/src/main/resources/dataset-registry.sql +++ b/dataset-registry/src/main/resources/dataset-registry.sql @@ -41,8 +41,9 @@ CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, - transformation_function text NOT NULL, + transformation_function json NOT NULL, status text NOT NULL, + mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, @@ -53,17 +54,17 @@ CREATE INDEX IF NOT EXISTS dataset_transformations_status ON dataset_transformat CREATE INDEX IF NOT EXISTS dataset_transformations_dataset ON dataset_transformations(dataset_id); CREATE TABLE IF NOT EXISTS dataset_source_config ( - id SERIAL PRIMARY KEY, + id text PRIMARY KEY, dataset_id text NOT NULL REFERENCES datasets (id), connector_type text NOT NULL, connector_config json NOT NULL, - connector_stats 
json NOT NULL, + connector_stats json, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL, - UNIQUE(dataset_id) + UNIQUE(connector_type, dataset_id) ); CREATE INDEX IF NOT EXISTS dataset_source_config_status ON dataset_source_config(status); CREATE INDEX IF NOT EXISTS dataset_source_config_dataset ON dataset_source_config(dataset_id); \ No newline at end of file diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index cdfcb0a7..ce0279b6 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -4,8 +4,11 @@ import com.fasterxml.jackson.annotation.JsonProperty import com.fasterxml.jackson.core.`type`.TypeReference import com.fasterxml.jackson.module.scala.JsonScalaEnumeration import org.sunbird.obsrv.core.model.SystemConfig +import org.sunbird.obsrv.model.DatasetStatus.DatasetStatus +import org.sunbird.obsrv.model.TransformMode.TransformMode import org.sunbird.obsrv.model.ValidationMode.ValidationMode +import java.sql.Timestamp import scala.beans.BeanProperty object DatasetModels { @@ -30,15 +33,16 @@ object DatasetModels { case class RouterConfig(@JsonProperty("topic") topic: String) - case class DatasetConfig(@JsonProperty("data_key") key: String, @JsonProperty("timestamp_key") tsKey: String, - @JsonProperty("entry_topic") entryTopic: String, @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None, - @JsonProperty("redis_db_host") redisDBHost: Option[String] = None, @JsonProperty("redis_db_port") redisDBPort: Option[Int] = None, - @JsonProperty("redis_db") redisDB: Option[Int] = None, @JsonProperty("index_data") indexData: Option[Boolean] = None) + case class DatasetConfig(@JsonProperty("data_key") key: String, @JsonProperty("timestamp_key") tsKey: String, @JsonProperty("entry_topic") entryTopic: String, + @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None, @JsonProperty("redis_db_host") redisDBHost: Option[String] = None, + @JsonProperty("redis_db_port") redisDBPort: Option[Int] = None, @JsonProperty("redis_db") redisDB: Option[Int] = None, + @JsonProperty("index_data") indexData: Option[Boolean] = None, @JsonProperty("timestamp_format") tsFormat: Option[String] = None, + @JsonProperty("dataset_tz") datasetTimezone: Option[String] = None) - case class Dataset(@JsonProperty("id") id: String, @JsonProperty("type") datasetType: String , @JsonProperty("extraction_config") extractionConfig: Option[ExtractionConfig], + case class Dataset(@JsonProperty("id") id: String, @JsonProperty("type") datasetType: String, @JsonProperty("extraction_config") extractionConfig: Option[ExtractionConfig], @JsonProperty("dedup_config") dedupConfig: Option[DedupConfig], @JsonProperty("validation_config") validationConfig: Option[ValidationConfig], @JsonProperty("data_schema") jsonSchema: Option[String], @JsonProperty("denorm_config") denormConfig: Option[DenormConfig], - @JsonProperty("router_config") routerConfig: RouterConfig, datasetConfig: DatasetConfig, @JsonProperty("status") status: String, + @JsonProperty("router_config") routerConfig: RouterConfig, datasetConfig: DatasetConfig, @JsonProperty("status") @JsonScalaEnumeration(classOf[DatasetStatusType]) status: DatasetStatus, @JsonProperty("tags") tags: Option[Array[String]] = None, 
@JsonProperty("data_version") dataVersion: Option[Int] = None) case class Condition(@JsonProperty("type") `type`: String, @JsonProperty("expr") expr: String) @@ -47,9 +51,9 @@ object DatasetModels { case class DatasetTransformation(@JsonProperty("id") id: String, @JsonProperty("dataset_id") datasetId: String, @JsonProperty("field_key") fieldKey: String, @JsonProperty("transformation_function") transformationFunction: TransformationFunction, - @JsonProperty("status") status: String) + @JsonProperty("status") status: String, @JsonProperty("mode") @JsonScalaEnumeration(classOf[TransformModeType]) mode: Option[TransformMode] = Some(TransformMode.Strict)) - case class ConnectorConfig(@JsonProperty("kafkaBrokers") kafkaBrokers: String, @JsonProperty("topic") topic: String, @JsonProperty("type")databaseType: String, + case class ConnectorConfig(@JsonProperty("kafkaBrokers") kafkaBrokers: String, @JsonProperty("topic") topic: String, @JsonProperty("type") databaseType: String, @JsonProperty("connection") connection: Connection, @JsonProperty("tableName") tableName: String, @JsonProperty("databaseName") databaseName: String, @JsonProperty("pollingInterval") pollingInterval: PollingInterval, @JsonProperty("authenticationMechanism") authenticationMechanism: AuthenticationMechanism, @JsonProperty("batchSize") batchSize: Int, @JsonProperty("timestampColumn") timestampColumn: String) @@ -60,19 +64,34 @@ object DatasetModels { case class AuthenticationMechanism(@JsonProperty("encrypted") encrypted: Boolean, @JsonProperty("encryptedValues") encryptedValues: String) - case class ConnectorStats(@JsonProperty("last_fetch_timestamp") lastFetchTimestamp: String, @JsonProperty("records") records: Long, @JsonProperty("avg_batch_read_time") avgBatchReadTime: Long, @JsonProperty("disconnections") disconnections: Int) + case class ConnectorStats(@JsonProperty("last_fetch_timestamp") lastFetchTimestamp: Timestamp, @JsonProperty("records") records: Long, @JsonProperty("avg_batch_read_time") avgBatchReadTime: Long, @JsonProperty("disconnections") disconnections: Int) case class DatasetSourceConfig(@JsonProperty("id") id: String, @JsonProperty("dataset_id") datasetId: String, @JsonProperty("connector_type") connectorType: String, @JsonProperty("connector_config") connectorConfig: ConnectorConfig, - @JsonProperty("connector_stats") connectorStats: ConnectorStats, @JsonProperty("status") status: String) - case class DataSource(@JsonProperty("datasource") datasource: String, @JsonProperty("dataset_id") datasetId: String, - @JsonProperty("ingestion_spec") ingestionSpec: String, @JsonProperty("datasource_ref") datasourceRef: String) + @JsonProperty("status") status: String, @JsonProperty("connector_stats") connectorStats: Option[ConnectorStats] = None) + case class DataSource(@JsonProperty("id") id: String, @JsonProperty("datasource") datasource: String, @JsonProperty("dataset_id") datasetId: String, + @JsonProperty("ingestion_spec") ingestionSpec: String, @JsonProperty("datasource_ref") datasourceRef: String) } class ValidationModeType extends TypeReference[ValidationMode.type] + object ValidationMode extends Enumeration { type ValidationMode = Value val Strict, IgnoreNewFields, DiscardNewFields = Value } + +class TransformModeType extends TypeReference[TransformMode.type] + +object TransformMode extends Enumeration { + type TransformMode = Value + val Strict, Lenient = Value +} + +class DatasetStatusType extends TypeReference[DatasetStatus.type] + +object DatasetStatus extends Enumeration { + type DatasetStatus = Value + 
val Draft, Publish, Live, Retired, Purged = Value +} \ No newline at end of file diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala index b2b88980..ad239312 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala @@ -4,58 +4,62 @@ import org.sunbird.obsrv.model.DatasetModels.{DataSource, Dataset, DatasetSource import org.sunbird.obsrv.service.DatasetRegistryService import java.sql.Timestamp +import scala.collection.mutable object DatasetRegistry { - private val datasets: Map[String, Dataset] = DatasetRegistryService.readAllDatasets() + private val datasets: mutable.Map[String, Dataset] = mutable.Map[String, Dataset]() + datasets ++= DatasetRegistryService.readAllDatasets() private val datasetTransformations: Map[String, List[DatasetTransformation]] = DatasetRegistryService.readAllDatasetTransformations() - private val datasetSourceConfig: Option[List[DatasetSourceConfig]] = DatasetRegistryService.readAllDatasetSourceConfig() - private val datasources: Map[String, List[DataSource]] = DatasetRegistryService.readAllDatasources() - - def getAllDatasets(): Map[String, Dataset] = datasets def getAllDatasets(datasetType: String): List[Dataset] = { - datasets.filter(f => f._2.datasetType.equals(datasetType)).values.toList + val datasetList = DatasetRegistryService.readAllDatasets() + datasetList.filter(f => f._2.datasetType.equals(datasetType)).values.toList } def getDataset(id: String): Option[Dataset] = { - datasets.get(id) + val datasetFromCache = datasets.get(id) + if (datasetFromCache.isDefined) datasetFromCache else { + val dataset = DatasetRegistryService.readDataset(id) + if (dataset.isDefined) datasets.put(dataset.get.id, dataset.get) + dataset + } } - def getDatasetSourceConfig(): Option[List[DatasetSourceConfig]] = { - datasetSourceConfig + def getAllDatasetSourceConfig(): Option[List[DatasetSourceConfig]] = { + DatasetRegistryService.readAllDatasetSourceConfig() } - def getDatasetSourceConfigById(datasetId: String): DatasetSourceConfig = { - datasetSourceConfig.map(configList => configList.filter(_.datasetId.equalsIgnoreCase(datasetId))).get.head + def getDatasetSourceConfigById(datasetId: String): Option[List[DatasetSourceConfig]] = { + DatasetRegistryService.readDatasetSourceConfig(datasetId) } - def getDatasetTransformations(id: String): Option[List[DatasetTransformation]] = { - datasetTransformations.get(id) + def getDatasetTransformations(datasetId: String): Option[List[DatasetTransformation]] = { + datasetTransformations.get(datasetId) } def getDatasources(datasetId: String): Option[List[DataSource]] = { - datasources.get(datasetId) + DatasetRegistryService.readDatasources(datasetId) } def getDataSetIds(datasetType: String): List[String] = { datasets.filter(f => f._2.datasetType.equals(datasetType)).keySet.toList } - def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Unit = { + def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = { DatasetRegistryService.updateDatasourceRef(datasource, datasourceRef) } - def updateConnectorStats(datasetId: String, lastFetchTimestamp: Timestamp, records: Long): Unit = { - DatasetRegistryService.updateConnectorStats(datasetId, lastFetchTimestamp, records) + def updateConnectorStats(id: String, lastFetchTimestamp: Timestamp, records: Long): Int = { + 
DatasetRegistryService.updateConnectorStats(id, lastFetchTimestamp, records) } - def updateConnectorDisconnections(datasetId: String, disconnections: Int): Unit = { - DatasetRegistryService.updateConnectorDisconnections(datasetId, disconnections) + def updateConnectorDisconnections(id: String, disconnections: Int): Int = { + DatasetRegistryService.updateConnectorDisconnections(id, disconnections) } - def updateConnectorAvgBatchReadTime(datasetId: String, avgReadTime: Long): Unit = { - DatasetRegistryService.updateConnectorAvgBatchReadTime(datasetId, avgReadTime) + def updateConnectorAvgBatchReadTime(id: String, avgReadTime: Long): Int = { + DatasetRegistryService.updateConnectorAvgBatchReadTime(id, avgReadTime) } } \ No newline at end of file diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala index a6c0f99b..89efec4c 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala @@ -1,19 +1,17 @@ package org.sunbird.obsrv.service import com.typesafe.config.{Config, ConfigFactory} -import org.slf4j.LoggerFactory -import org.sunbird.obsrv.core.streaming.BaseDeduplication import org.sunbird.obsrv.core.util.{JSONUtil, PostgresConnect, PostgresConnectionConfig} -import org.sunbird.obsrv.model.DatasetModels.{ConnectorConfig, ConnectorStats, DataSource, Dataset, DatasetConfig, DatasetSourceConfig, DatasetTransformation, DedupConfig, DenormConfig, ExtractionConfig, RouterConfig, TransformationFunction, ValidationConfig} +import org.sunbird.obsrv.model.DatasetModels._ +import org.sunbird.obsrv.model.{DatasetStatus, TransformMode} import java.io.File import java.sql.{ResultSet, Timestamp} object DatasetRegistryService { - private[this] val logger = LoggerFactory.getLogger(DatasetRegistryService.getClass) - private val configFile = new File("/data/flink/conf/baseconfig.conf") + // $COVERAGE-OFF$ This code only executes within a flink cluster val config: Config = if (configFile.exists()) { println("Loading configuration file cluster baseconfig.conf...") ConfigFactory.parseFile(configFile).resolve() @@ -21,6 +19,7 @@ object DatasetRegistryService { println("Loading configuration file baseconfig.conf inside the jar...") ConfigFactory.load("baseconfig.conf").withFallback(ConfigFactory.systemEnvironment()) } + // $COVERAGE-ON$ private val postgresConfig = PostgresConnectionConfig( config.getString("postgres.user"), config.getString("postgres.password"), @@ -38,10 +37,21 @@ object DatasetRegistryService { val dataset = parseDataset(result) (dataset.id, dataset) }).toMap - } catch { - case ex: Exception => - logger.error("Exception while reading datasets from Postgres", ex) - Map() + } finally { + postgresConnect.closeConnection() + } + } + + def readDataset(id: String): Option[Dataset] = { + + val postgresConnect = new PostgresConnect(postgresConfig) + try { + val rs = postgresConnect.executeQuery(s"SELECT * FROM datasets where id='$id'") + if(rs.next()) { + Some(parseDataset(rs)) + } else { + None + } } finally { postgresConnect.closeConnection() } @@ -56,10 +66,20 @@ object DatasetRegistryService { val datasetSourceConfig = parseDatasetSourceConfig(result) datasetSourceConfig }).toList) - } catch { - case ex: Exception => - ex.printStackTrace() - None + } finally { + postgresConnect.closeConnection() + } + } + + def 
readDatasetSourceConfig(datasetId: String): Option[List[DatasetSourceConfig]] = { + + val postgresConnect = new PostgresConnect(postgresConfig) + try { + val rs = postgresConnect.executeQuery(s"SELECT * FROM dataset_source_config where dataset_id='$datasetId'") + Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { + val datasetSourceConfig = parseDatasetSourceConfig(result) + datasetSourceConfig + }).toList) } finally { postgresConnect.closeConnection() } @@ -74,66 +94,50 @@ object DatasetRegistryService { val dt = parseDatasetTransformation(result) (dt.datasetId, dt) }).toList.groupBy(f => f._1).mapValues(f => f.map(x => x._2)) - } catch { - case ex: Exception => - logger.error("Exception while reading dataset transformations from Postgres", ex) - Map() } finally { postgresConnect.closeConnection() } } - def readAllDatasources(): Map[String, List[DataSource]] = { + def readDatasources(datasetId: String): Option[List[DataSource]] = { val postgresConnect = new PostgresConnect(postgresConfig) try { - val rs = postgresConnect.executeQuery("SELECT * FROM datasources") - Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { - val dt = parseDatasource(result) - (dt.datasetId, dt) - }).toList.groupBy(f => f._1).mapValues(f => f.map(x => x._2)) - } catch { - case ex: Exception => - logger.error("Exception while reading dataset transformations from Postgres", ex) - Map() + val rs = postgresConnect.executeQuery(s"SELECT * FROM datasources where dataset_id='$datasetId'") + Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { + parseDatasource(result) + }).toList) } finally { postgresConnect.closeConnection() } } - def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Unit = { + def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = { val query = s"UPDATE datasources set datasource_ref = '$datasourceRef' where datasource='${datasource.datasource}' and dataset_id='${datasource.datasetId}'" - updateRegistry(query, "Exception while updating data source reference in Postgres") + updateRegistry(query) } - def updateConnectorStats(datasetId: String, lastFetchTimestamp: Timestamp, records: Long): Unit = { - val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(jsonb_set(connector_stats::jsonb, '{records}'," + + def updateConnectorStats(id: String, lastFetchTimestamp: Timestamp, records: Long): Int = { + val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(jsonb_set(coalesce(connector_stats, '{}')::jsonb, '{records}'," + s" ((COALESCE(connector_stats->>'records', '0')::int + $records)::text)::jsonb, true), '{last_fetch_timestamp}', " + - s"to_jsonb('$lastFetchTimestamp'::timestamp), true) WHERE dataset_id = '$datasetId'" - updateRegistry(query, "Exception while updating connector stats in Postgres") + s"to_jsonb('$lastFetchTimestamp'::timestamp), true) WHERE id = '$id'" + updateRegistry(query) } - def updateConnectorDisconnections(datasetId: String, disconnections: Int): Unit = { - val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(connector_stats::jsonb, " + - s"'{disconnections}','$disconnections') WHERE dataset_id = '$datasetId'" - updateRegistry(query, "Exception while updating connector disconnections in Postgres") + def updateConnectorDisconnections(id: String, disconnections: Int): Int = { + val query = s"UPDATE dataset_source_config SET connector_stats = 
jsonb_set(coalesce(connector_stats, '{}')::jsonb, '{disconnections}','$disconnections') WHERE id = '$id'" + updateRegistry(query) } - def updateConnectorAvgBatchReadTime(datasetId: String, avgReadTime: Long): Unit = { - val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(connector_stats::jsonb, " + - s"'{avg_batch_read_time}','$avgReadTime') WHERE dataset_id = '$datasetId'" - updateRegistry(query, "Exception while updating connector average batch read time in Postgres") + def updateConnectorAvgBatchReadTime(id: String, avgReadTime: Long): Int = { + val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(coalesce(connector_stats, '{}')::jsonb, '{avg_batch_read_time}','$avgReadTime') WHERE id = '$id'" + updateRegistry(query) } - def updateRegistry(query: String, errorMsg: String): Unit = { + private def updateRegistry(query: String): Int = { val postgresConnect = new PostgresConnect(postgresConfig) try { - // TODO: Check if the udpate is successful. Else throw an Exception - postgresConnect.execute(query) - } catch { - case ex: Exception => - logger.error(errorMsg, ex) - Map() + postgresConnect.executeUpdate(query) } finally { postgresConnect.closeConnection() } @@ -162,7 +166,7 @@ object DatasetRegistryService { if (denormConfig == null) None else Some(JSONUtil.deserialize[DenormConfig](denormConfig)), JSONUtil.deserialize[RouterConfig](routerConfig), JSONUtil.deserialize[DatasetConfig](datasetConfig), - status, + DatasetStatus.withName(status), Option(tags), Option(dataVersion) ) @@ -177,19 +181,19 @@ object DatasetRegistryService { val status = rs.getString("status") DatasetSourceConfig(id = id, datasetId = datasetId, connectorType = connectorType, - JSONUtil.deserialize[ConnectorConfig](connectorConfig), - JSONUtil.deserialize[ConnectorStats](connectorStats), - status + JSONUtil.deserialize[ConnectorConfig](connectorConfig), status, + if(connectorStats != null) Some(JSONUtil.deserialize[ConnectorStats](connectorStats)) else None ) } private def parseDatasource(rs: ResultSet): DataSource = { + val id = rs.getString("id") val datasource = rs.getString("datasource") val datasetId = rs.getString("dataset_id") val ingestionSpec = rs.getString("ingestion_spec") val datasourceRef = rs.getString("datasource_ref") - DataSource(datasource, datasetId, ingestionSpec, datasourceRef) + DataSource(id, datasource, datasetId, ingestionSpec, datasourceRef) } private def parseDatasetTransformation(rs: ResultSet): DatasetTransformation = { @@ -198,8 +202,9 @@ object DatasetRegistryService { val fieldKey = rs.getString("field_key") val transformationFunction = rs.getString("transformation_function") val status = rs.getString("status") + val mode = rs.getString("mode") - DatasetTransformation(id, datasetId, fieldKey, JSONUtil.deserialize[TransformationFunction](transformationFunction), status) + DatasetTransformation(id, datasetId, fieldKey, JSONUtil.deserialize[TransformationFunction](transformationFunction), status, Some(if(mode != null) TransformMode.withName(mode) else TransformMode.Strict)) } } \ No newline at end of file diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala new file mode 100644 index 00000000..4e992eba --- /dev/null +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala @@ -0,0 +1,195 @@ +package org.sunbird.obsrv.streaming + +import 
org.apache.flink.api.scala.metrics.ScalaGauge +import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.functions.ProcessFunction +import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction +import org.apache.flink.streaming.api.windowing.windows.TimeWindow +import org.sunbird.obsrv.core.model.FunctionalError.FunctionalError +import org.sunbird.obsrv.core.model.Models._ +import org.sunbird.obsrv.core.model.Producer.Producer +import org.sunbird.obsrv.core.model.Stats.Stats +import org.sunbird.obsrv.core.model.StatusCode.StatusCode +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming._ +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.Dataset +import org.sunbird.obsrv.registry.DatasetRegistry + +import java.lang +import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.atomic.AtomicLong +import scala.collection.JavaConverters._ +import scala.collection.mutable + +trait SystemEventHandler { + private def getStatus(flags: Map[String, AnyRef], producer: Producer): Option[StatusCode] = { + flags.get(producer.toString).map(f => StatusCode.withName(f.asInstanceOf[String])) + } + + private def getTime(timespans: Map[String, AnyRef], producer: Producer): Option[Long] = { + timespans.get(producer.toString).map(f => f.asInstanceOf[Long]) + } + + private def getStat(obsrvMeta: Map[String, AnyRef], stat: Stats): Option[Long] = { + obsrvMeta.get(stat.toString).map(f => f.asInstanceOf[Long]) + } + + def getError(error: ErrorConstants.Error, producer: Producer, functionalError: FunctionalError): Option[ErrorLog] = { + Some(ErrorLog(pdata_id = producer, pdata_status = StatusCode.failed, error_type = functionalError, error_code = error.errorCode, error_message = error.errorMsg, error_level = ErrorLevel.critical, error_count = Some(1))) + } + + def generateSystemEvent(dataset: Option[String], event: mutable.Map[String, AnyRef], config: BaseJobConfig[_], producer: Producer, error: Option[ErrorLog] = None, dataset_type: Option[String] = None): String = { + val obsrvMeta = event("obsrv_meta").asInstanceOf[Map[String, AnyRef]] + val flags = obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]] + val timespans = obsrvMeta("timespans").asInstanceOf[Map[String, AnyRef]] + + JSONUtil.serialize(SystemEvent( + EventID.METRIC, ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(producer)), dataset = dataset, dataset_type = dataset_type), + data = EData(error = error, pipeline_stats = Some(PipelineStats(extractor_events = None, + extractor_status = getStatus(flags, Producer.extractor), extractor_time = getTime(timespans, Producer.extractor), + validator_status = getStatus(flags, Producer.validator), validator_time = getTime(timespans, Producer.validator), + dedup_status = getStatus(flags, Producer.dedup), dedup_time = getTime(timespans, Producer.dedup), + denorm_status = getStatus(flags, Producer.denorm), denorm_time = getTime(timespans, Producer.denorm), + transform_status = getStatus(flags, Producer.transformer), transform_time = getTime(timespans, Producer.transformer), + total_processing_time = getStat(obsrvMeta, Stats.total_processing_time), latency_time = getStat(obsrvMeta, Stats.latency_time), processing_time = getStat(obsrvMeta, Stats.processing_time) + ))) + )) + } + + def getDatasetId(dataset: Option[String], config: BaseJobConfig[_]): String = { + dataset.getOrElse(config.defaultDatasetID) + } + +} + +abstract class 
BaseDatasetProcessFunction(config: BaseJobConfig[mutable.Map[String, AnyRef]]) + extends BaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]](config) with SystemEventHandler { + + override def open(parameters: Configuration): Unit = { + super.open(parameters) + } + + def getMetrics(): List[String] + + override def getMetricsList(): MetricsList = { + val metrics = getMetrics() ++ List(config.eventFailedMetricsCount) + MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + } + + private def initMetrics(datasetId: String): Unit = { + if(!metrics.hasDataset(datasetId)) { + val metricMap = new ConcurrentHashMap[String, AtomicLong]() + metricsList.metrics.map(metric => { + metricMap.put(metric, new AtomicLong(0L)) + getRuntimeContext.getMetricGroup.addGroup(config.jobName).addGroup(datasetId) + .gauge[Long, ScalaGauge[Long]](metric, ScalaGauge[Long](() => metrics.getAndReset(datasetId, metric))) + }) + metrics.initDataset(datasetId, metricMap) + } + } + + def markFailure(datasetId: Option[String], event: mutable.Map[String, AnyRef], ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, + metrics: Metrics, error: ErrorConstants.Error, producer: Producer, functionalError: FunctionalError, datasetType: Option[String] = None): Unit = { + + metrics.incCounter(getDatasetId(datasetId, config), config.eventFailedMetricsCount) + ctx.output(config.failedEventsOutputTag(), super.markFailed(event, error, producer)) + val errorLog = getError(error, producer, functionalError) + val systemEvent = generateSystemEvent(Some(getDatasetId(datasetId, config)), event, config, producer, errorLog, datasetType) + ctx.output(config.systemEventsOutputTag, systemEvent) + } + + def markCompletion(dataset: Dataset, event: mutable.Map[String, AnyRef], ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, producer: Producer): Unit = { + ctx.output(config.systemEventsOutputTag, generateSystemEvent(Some(dataset.id), super.markComplete(event, dataset.dataVersion), config, producer, dataset_type = Some(dataset.datasetType))) + } + + def processElement(dataset: Dataset, event: mutable.Map[String, AnyRef],context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit + override def processElement(event: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { + + val datasetIdOpt = event.get(config.CONST_DATASET) + if (datasetIdOpt.isEmpty) { + markFailure(None, event, context, metrics, ErrorConstants.MISSING_DATASET_ID, Producer.validator, FunctionalError.MissingDatasetId) + return + } + val datasetId = datasetIdOpt.get.asInstanceOf[String] + initMetrics(datasetId) + val datasetOpt = DatasetRegistry.getDataset(datasetId) + if (datasetOpt.isEmpty) { + markFailure(Some(datasetId), event, context, metrics, ErrorConstants.MISSING_DATASET_CONFIGURATION, Producer.validator, FunctionalError.MissingDatasetId) + return + } + val dataset = datasetOpt.get + if (!super.containsEvent(event)) { + markFailure(Some(datasetId), event, context, metrics, ErrorConstants.EVENT_MISSING, Producer.validator, FunctionalError.MissingEventData) + return + } + processElement(dataset, event, context, metrics) + } +} + +abstract class BaseDatasetWindowProcessFunction(config: BaseJobConfig[mutable.Map[String, AnyRef]]) + extends WindowBaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, 
AnyRef], String](config) with SystemEventHandler { + + override def open(parameters: Configuration): Unit = { + super.open(parameters) + } + + def getMetrics(): List[String] + + override def getMetricsList(): MetricsList = { + val metrics = getMetrics() ++ List(config.eventFailedMetricsCount) + MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + } + + private def initMetrics(datasetId: String): Unit = { + if(!metrics.hasDataset(datasetId)) { + val metricMap = new ConcurrentHashMap[String, AtomicLong]() + metricsList.metrics.map(metric => { + metricMap.put(metric, new AtomicLong(0L)) + getRuntimeContext.getMetricGroup.addGroup(config.jobName).addGroup(datasetId) + .gauge[Long, ScalaGauge[Long]](metric, ScalaGauge[Long](() => metrics.getAndReset(datasetId, metric))) + }) + metrics.initDataset(datasetId, metricMap) + } + } + + def markFailure(datasetId: Option[String], event: mutable.Map[String, AnyRef], ctx: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context, + metrics: Metrics, error: ErrorConstants.Error, producer: Producer, functionalError: FunctionalError, datasetType: Option[String] = None): Unit = { + metrics.incCounter(getDatasetId(datasetId, config), config.eventFailedMetricsCount) + ctx.output(config.failedEventsOutputTag(), super.markFailed(event, error, producer)) + val errorLog = getError(error, producer, functionalError) + val systemEvent = generateSystemEvent(Some(getDatasetId(datasetId, config)), event, config, producer, errorLog, datasetType) + ctx.output(config.systemEventsOutputTag, systemEvent) + } + + def markCompletion(dataset: Dataset, event: mutable.Map[String, AnyRef], ctx: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context, producer: Producer): Unit = { + ctx.output(config.systemEventsOutputTag, generateSystemEvent(Some(dataset.id), super.markComplete(event, dataset.dataVersion), config, producer, dataset_type = Some(dataset.datasetType))) + } + + def processWindow(dataset: Dataset, context: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context, elements: List[mutable.Map[String, AnyRef]], metrics: Metrics): Unit + override def process(datasetId: String, context: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context, elements: lang.Iterable[mutable.Map[String, AnyRef]], metrics: Metrics): Unit = { + + initMetrics(datasetId) + val datasetOpt = DatasetRegistry.getDataset(datasetId) + val eventsList = elements.asScala.toList + if (datasetOpt.isEmpty) { + eventsList.foreach(event => { + markFailure(Some(datasetId), event, context, metrics, ErrorConstants.MISSING_DATASET_CONFIGURATION, Producer.validator, FunctionalError.MissingDatasetId) + }) + return + } + val dataset = datasetOpt.get + val buffer = mutable.Buffer[mutable.Map[String, AnyRef]]() + eventsList.foreach(event => { + if (!super.containsEvent(event)) { + markFailure(Some(datasetId), event, context, metrics, ErrorConstants.EVENT_MISSING, Producer.validator, FunctionalError.MissingEventData) + } else { + buffer.append(event) + } + }) + + if(buffer.nonEmpty) { + processWindow(dataset, context, buffer.toList, metrics) + } + } +} \ No newline at end of file diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala index 116bbc54..172dd181 
100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala @@ -4,11 +4,12 @@ import com.typesafe.config.{Config, ConfigFactory} import org.sunbird.obsrv.core.util.{PostgresConnect, PostgresConnectionConfig} import org.sunbird.spec.BaseSpecWithPostgres -class BaseSpecWithDatasetRegistry extends BaseSpecWithPostgres { +import scala.collection.mutable +class BaseSpecWithDatasetRegistry extends BaseSpecWithPostgres { val config: Config = ConfigFactory.load("test.conf") - val postgresConfig = PostgresConnectionConfig( + val postgresConfig: PostgresConnectionConfig = PostgresConnectionConfig( user = config.getString("postgres.user"), password = config.getString("postgres.password"), database = "postgres", @@ -17,31 +18,43 @@ class BaseSpecWithDatasetRegistry extends BaseSpecWithPostgres { maxConnections = config.getInt("postgres.maxConnections") ) - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() val postgresConnect = new PostgresConnect(postgresConfig) createSchema(postgresConnect) insertTestData(postgresConnect) + postgresConnect.closeConnection() } override def afterAll(): Unit = { super.afterAll() } - private def createSchema(postgresConnect: PostgresConnect) { + private def createSchema(postgresConnect: PostgresConnect): Unit = { postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasets ( id text PRIMARY KEY, type text NOT NULL, validation_config json, extraction_config json, dedup_config json, data_schema json, denorm_config json, router_config json NOT NULL, dataset_config json NOT NULL, status text NOT NULL, tags text[], data_version INT, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL );") postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") - postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function text NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );") - postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_source_config ( id SERIAL PRIMARY KEY, dataset_id text NOT NULL REFERENCES datasets (id), connector_type text NOT NULL, connector_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(dataset_id) );") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function json NOT NULL, status text NOT NULL, mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_source_config ( id text PRIMARY KEY, dataset_id text NOT NULL 
REFERENCES datasets (id), connector_type text NOT NULL, connector_config json NOT NULL, status text NOT NULL, connector_stats json, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(connector_type, dataset_id) );") } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d1', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":\"6340\",\"redis_db\":2}', 'ACTIVE', 2, 'System', 'System', now(), now());") - postgresConnect.execute("update datasets set denorm_config = '{\"redis_db_host\":\"localhost\",\"redis_db_port\":\"6340\",\"denorm_fields\":[{\"denorm_key\":\"vehicleCode\",\"redis_db\":2,\"denorm_out_field\":\"vehicleData\"}]}' where id='d1';") - postgresConnect.execute("insert into dataset_transformations values('tf1', 'd1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'active', 'System', 'System', now(), now());") - postgresConnect.execute("insert into dataset_transformations values('tf2', 'd1', 'dealer.maskedPhone', '{\"type\":\"mask\",\"expr\": \"dealer.phone\"}', 'active', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d2', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d1-events\"}', 
'{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'ACTIVE', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d1', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());") + postgresConnect.execute("update datasets set denorm_config = '{\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"denorm_fields\":[{\"denorm_key\":\"vehicleCode\",\"redis_db\":2,\"denorm_out_field\":\"vehicleData\"}]}' where id='d1';") + postgresConnect.execute("insert into dataset_transformations values('tf1', 'd1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Live', 'Strict', 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf2', 'd1', 'dealer.maskedPhone', '{\"type\":\"mask\",\"expr\": \"dealer.phone\"}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d2', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + } + + def 
getPrintableMetrics(metricsMap: mutable.Map[String, Long]): Map[String, Map[String, Map[String, Long]]] = { + metricsMap.map(f => { + val keys = f._1.split('.') + val metricValue = f._2 + val jobId = keys.apply(0) + val datasetId = keys.apply(1) + val metric = keys.apply(2) + (jobId, datasetId, metric, metricValue) + }).groupBy(f => f._1).mapValues(f => f.map(p => (p._2, p._3, p._4))).mapValues(f => f.groupBy(p => p._1).mapValues(q => q.map(r => (r._2, r._3)).toMap)) } -} +} \ No newline at end of file diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala index 27df2676..b37e801a 100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala @@ -2,32 +2,98 @@ package org.sunbird.obsrv.spec import org.scalatest.Matchers import org.scalatestplus.mockito.MockitoSugar +import org.sunbird.obsrv.core.util.PostgresConnect import org.sunbird.obsrv.registry.DatasetRegistry +import java.sql.Timestamp +import java.time.{LocalDateTime, ZoneOffset} + class TestDatasetRegistrySpec extends BaseSpecWithDatasetRegistry with Matchers with MockitoSugar { "TestDatasetRegistrySpec" should "validate all the registry service methods" in { val d1Opt = DatasetRegistry.getDataset("d1") - d1Opt should not be (None) - d1Opt.get.id should be ("d1") - d1Opt.get.dataVersion.get should be (2) + d1Opt should not be None + d1Opt.get.id should be("d1") + d1Opt.get.dataVersion.get should be(2) val d2Opt = DatasetRegistry.getDataset("d2") - d2Opt should not be (None) - d2Opt.get.id should be ("d2") - d2Opt.get.denormConfig should be (None) + d2Opt should not be None + d2Opt.get.id should be("d2") + d2Opt.get.denormConfig should be(None) + + val postgresConnect = new PostgresConnect(postgresConfig) + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.closeConnection() + + val d3Opt = DatasetRegistry.getDataset("d3") + d3Opt should not be None + d3Opt.get.id should be("d3") + d3Opt.get.denormConfig should be(None) + + val d4Opt = DatasetRegistry.getDataset("d4") + d4Opt should be (None) val allDatasets = DatasetRegistry.getAllDatasets("dataset") - allDatasets.size should be (2) + allDatasets.size should be(3) val d1Tfs = DatasetRegistry.getDatasetTransformations("d1") - d1Tfs 
should not be (None) - d1Tfs.get.size should be (2) + d1Tfs should not be None + d1Tfs.get.size should be(2) + + val ids = DatasetRegistry.getDataSetIds("dataset").sortBy(f => f) + ids.head should be("d1") + ids.apply(1) should be("d2") + ids.apply(2) should be("d3") + + DatasetRegistry.getAllDatasetSourceConfig().get.size should be(2) + val datasetSourceConfigList = DatasetRegistry.getDatasetSourceConfigById("d1").get + val datasetSourceConfig = datasetSourceConfigList.filter(f => f.id.equals("sc1")).head + datasetSourceConfig.id should be("sc1") + datasetSourceConfig.datasetId should be("d1") + datasetSourceConfig.connectorType should be("kafka") + datasetSourceConfig.status should be("Live") + + val instant1 = LocalDateTime.now(ZoneOffset.UTC) + DatasetRegistry.updateConnectorStats("sc1", Timestamp.valueOf(instant1), 20L) + DatasetRegistry.updateConnectorDisconnections("sc1", 2) + DatasetRegistry.updateConnectorDisconnections("sc1", 4) + DatasetRegistry.updateConnectorAvgBatchReadTime("sc1", 4) + DatasetRegistry.updateConnectorAvgBatchReadTime("sc1", 5) + val instant2 = LocalDateTime.now(ZoneOffset.UTC) - val ids = DatasetRegistry.getDataSetIds("dataset") - ids.head should be ("d1") - ids.last should be ("d2") + DatasetRegistry.updateConnectorStats("sc1", Timestamp.valueOf(instant2), 60L) + val datasetSourceConfigList2 = DatasetRegistry.getDatasetSourceConfigById("d1").get + val datasetSourceConfig2 = datasetSourceConfigList2.filter(f => f.id.equals("sc1")).head + datasetSourceConfig2.connectorStats.get.records should be(80) + datasetSourceConfig2.connectorStats.get.disconnections should be(4) + datasetSourceConfig2.connectorStats.get.avgBatchReadTime should be(5) + datasetSourceConfig2.connectorStats.get.lastFetchTimestamp.getTime should be(instant2.toInstant(ZoneOffset.UTC).toEpochMilli) + + val datasource = DatasetRegistry.getDatasources("d1").get.head + datasource.datasetId should be("d1") + datasource.datasource should be("d1-datasource") + datasource.datasourceRef should be("d1-datasource-1") + + DatasetRegistry.updateDatasourceRef(datasource, "d1-datasource-2") + val datasource2 = DatasetRegistry.getDatasources("d1").get.head + datasource2.datasourceRef should be("d1-datasource-2") + + DatasetRegistry.getDatasources("d2").get.nonEmpty should be(false) } -} + override def beforeAll(): Unit = { + super.beforeAll() + prepareTestData() + } + + private def prepareTestData(): Unit = { + val postgresConnect = new PostgresConnect(postgresConfig) + postgresConnect.execute("insert into dataset_source_config values('sc1', 'd1', 'kafka', '{\"kafkaBrokers\":\"localhost:9090\",\"topic\":\"test-topic\"}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_source_config values('sc2', 'd1', 'rdbms', '{\"type\":\"postgres\",\"tableName\":\"test-table\"}', 'Live', null, 'System', 'System', now(), now());") + + //postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") + postgresConnect.execute("insert into datasources values('ds1', 'd1', '{}', 'd1-datasource', 'd1-datasource-1', null, null, null, '{}', 'Live', 'System', 'System', now(), now());") + 
postgresConnect.closeConnection() + } +} \ No newline at end of file diff --git a/framework/pom.xml b/framework/pom.xml index 31402224..52ced63f 100644 --- a/framework/pom.xml +++ b/framework/pom.xml @@ -35,7 +35,6 @@ org.apache.kafka kafka-clients ${kafka.version} - provided joda-time @@ -90,6 +89,12 @@ 3.0.6 test + + org.scalamock + scalamock_2.12 + 5.2.0 + test + junit junit @@ -134,9 +139,9 @@ test - it.ozimov + com.github.codemonstur embedded-redis - 0.7.1 + 1.0.0 test diff --git a/framework/src/main/resources/baseconfig.conf b/framework/src/main/resources/baseconfig.conf index 5dc27105..e41f7e4b 100644 --- a/framework/src/main/resources/baseconfig.conf +++ b/framework/src/main/resources/baseconfig.conf @@ -8,6 +8,7 @@ kafka { compression = "snappy" } output.system.event.topic = ${job.env}".system.events" + output.failed.topic = ${job.env}".failed" } job { @@ -54,9 +55,4 @@ postgres { user = "postgres" password = "postgres" database = "postgres" -} - -lms-cassandra { - host = "localhost" - port = "9042" } \ No newline at end of file diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/cache/DedupEngine.scala b/framework/src/main/scala/org/sunbird/obsrv/core/cache/DedupEngine.scala index f04b0015..a03477b7 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/cache/DedupEngine.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/cache/DedupEngine.scala @@ -1,45 +1,23 @@ package org.sunbird.obsrv.core.cache -import org.slf4j.LoggerFactory import redis.clients.jedis.Jedis import redis.clients.jedis.exceptions.JedisException class DedupEngine(redisConnect: RedisConnect, store: Int, expirySeconds: Int) extends Serializable { - private[this] val logger = LoggerFactory.getLogger(classOf[DedupEngine]) - private val serialVersionUID = 6089562751616425354L private[this] var redisConnection: Jedis = redisConnect.getConnection redisConnection.select(store) @throws[JedisException] def isUniqueEvent(checksum: String): Boolean = { - var unique = false - try { - unique = !redisConnection.exists(checksum) - } catch { - case ex: JedisException => - logger.error("DedupEngine:isUniqueEvent() - Exception", ex) - this.redisConnection.close() - this.redisConnection = redisConnect.getConnection(this.store, backoffTimeInMillis = 10000) - unique = !this.redisConnection.exists(checksum) - } - unique + !redisConnection.exists(checksum) } @throws[JedisException] def storeChecksum(checksum: String): Unit = { - try - redisConnection.setex(checksum, expirySeconds, "") - catch { - case ex: JedisException => - logger.error("DedupEngine:storeChecksum() - Exception", ex) - this.redisConnection.close() - this.redisConnection = redisConnect.getConnection(this.store, backoffTimeInMillis = 10000) - this.redisConnection.select(this.store) - this.redisConnection.setex(checksum, expirySeconds, "") - } + redisConnection.setex(checksum, expirySeconds, "") } def getRedisConnection: Jedis = redisConnection diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/cache/RedisConnect.scala b/framework/src/main/scala/org/sunbird/obsrv/core/cache/RedisConnect.scala index 1bffdb7a..d96d6610 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/cache/RedisConnect.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/cache/RedisConnect.scala @@ -16,6 +16,7 @@ class RedisConnect(redisHost: String, redisPort: Int, defaultTimeOut: Int) exten catch { case e: InterruptedException => logger.error("RedisConnect:getConnection() - Exception", e) + e.printStackTrace() } // $COVERAGE-ON$ logger.info("Obtaining 
new Redis connection...") diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala new file mode 100644 index 00000000..2cfbd307 --- /dev/null +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala @@ -0,0 +1,18 @@ +package org.sunbird.obsrv.core.model + +object Constants { + +  val EVENT = "event" +  val INVALID_JSON = "invalid_json" +  val OBSRV_META = "obsrv_meta" +  val SRC = "src" +  val ERROR_CODE = "error_code" +  val ERROR_MSG = "error_msg" +  val ERROR_REASON = "error_reason" +  val FAILED = "failed" +  val ERROR = "error" +  val LEVEL = "level" +  val TOPIC = "topic" +  val MESSAGE = "message" + +} diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala index bbcd5828..d79ab327 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala @@ -9,23 +9,29 @@ object ErrorConstants extends Enumeration { } val NO_IMPLEMENTATION_FOUND = ErrorInternalValue("ERR_0001", "Unimplemented method") -   val NO_EXTRACTION_DATA_FOUND = ErrorInternalValue("ERR_EXT_1001", "Unable to extract the data from the extraction key") -  val EXTRACTED_DATA_NOT_A_LIST = ErrorInternalValue("ERR_EXT_1002", "The extracted data is not an list") +  val EXTRACTED_DATA_NOT_A_LIST = ErrorInternalValue("ERR_EXT_1002", "The extracted data is not a list") val EVENT_SIZE_EXCEEDED = ErrorInternalValue("ERR_EXT_1003", ("Event size has exceeded max configured size of " + SystemConfig.maxEventSize)) -   val EVENT_MISSING = ErrorInternalValue("ERR_EXT_1006", "Event missing in the batch event") val MISSING_DATASET_ID = ErrorInternalValue("ERR_EXT_1004", "Dataset Id is missing from the data") val MISSING_DATASET_CONFIGURATION = ErrorInternalValue("ERR_EXT_1005", "Dataset configuration is missing") -  + val EVENT_MISSING = ErrorInternalValue("ERR_EXT_1006", "Event missing in the batch event") val NO_DEDUP_KEY_FOUND = ErrorInternalValue("ERR_DEDUP_1007", "No dedup key found or missing data") -  val DEDUP_KEY_NOT_A_STRING = ErrorInternalValue("ERR_DEDUP_1008", "Dedup key value is not a String or Text") +  val DEDUP_KEY_NOT_A_STRING_OR_NUMBER = ErrorInternalValue("ERR_DEDUP_1008", "Dedup key value is not a String or Number") val DUPLICATE_BATCH_EVENT_FOUND = ErrorInternalValue("ERR_EXT_1009", "Duplicate batch event found") -  val DUPLICATE_EVENT_FOUND = ErrorInternalValue("ERR_PP_1010", "Duplicate event found") val JSON_SCHEMA_NOT_FOUND = ErrorInternalValue("ERR_PP_1011", "Json schema not found for the dataset") val INVALID_JSON_SCHEMA = ErrorInternalValue("ERR_PP_1012", "Invalid json schema") val SCHEMA_VALIDATION_FAILED = ErrorInternalValue("ERR_PP_1013", "Event failed the schema validation") -  val DENORM_KEY_MISSING = ErrorInternalValue("ERR_DENORM_1014", "No denorm key found or missing data for the specified key") -  val DENORM_KEY_NOT_A_STRING = ErrorInternalValue("ERR_DENORM_1015", "Denorm key value is not a String or Text") +  val DENORM_KEY_NOT_A_STRING_OR_NUMBER = ErrorInternalValue("ERR_DENORM_1015", "Denorm key value is not a String or Number") +  val DENORM_DATA_NOT_FOUND = ErrorInternalValue("ERR_DENORM_1016", "Denorm data not found for the given key") +  val MISSING_DATASET_CONFIG_KEY = ErrorInternalValue("ERR_MASTER_DATA_1017", "Master dataset configuration key is missing") +  val ERR_INVALID_EVENT = 
ErrorInternalValue("ERR_EXT_1018", "Invalid JSON event, error while deserializing the event") + val INDEX_KEY_MISSING_OR_BLANK = ErrorInternalValue("ERR_ROUTER_1019", "Unable to index data as the timestamp key is missing or blank or not a datetime value") + val INVALID_EXPR_FUNCTION = ErrorInternalValue("ERR_TRANSFORM_1020", "Transformation expression function is not valid") + val ERR_EVAL_EXPR_FUNCTION = ErrorInternalValue("ERR_TRANSFORM_1021", "Unable to evaluate the transformation expression function") + val ERR_UNKNOWN_TRANSFORM_EXCEPTION = ErrorInternalValue("ERR_TRANSFORM_1022", "Unable to evaluate the transformation expression function") + val ERR_TRANSFORMATION_FAILED = ErrorInternalValue("ERR_TRANSFORM_1023", "Atleast one mandatory transformation has failed") + val TRANSFORMATION_FIELD_MISSING = ErrorInternalValue("ERR_TRANSFORM_1024", "Transformation field is either missing or blank") + } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/Models.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/Models.scala index ebdbd315..2863d78f 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/Models.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/Models.scala @@ -1,9 +1,79 @@ package org.sunbird.obsrv.core.model +import com.fasterxml.jackson.core.`type`.TypeReference +import com.fasterxml.jackson.module.scala.JsonScalaEnumeration +import org.sunbird.obsrv.core.model.ErrorLevel.ErrorLevel +import org.sunbird.obsrv.core.model.EventID.EventID +import org.sunbird.obsrv.core.model.FunctionalError.FunctionalError +import org.sunbird.obsrv.core.model.ModuleID.ModuleID +import org.sunbird.obsrv.core.model.PDataType.PDataType +import org.sunbird.obsrv.core.model.Producer.Producer +import org.sunbird.obsrv.core.model.StatusCode.StatusCode + object Models { - case class PData(val id: String, val `type`: String, val pid: String) - - case class SystemEvent(val pdata: PData, data: Map[String, AnyRef] ) + case class PData(id: String,@JsonScalaEnumeration(classOf[PDataTypeType]) `type`: PDataType,@JsonScalaEnumeration(classOf[ProducerType]) pid: Option[Producer]) + + case class ContextData(@JsonScalaEnumeration(classOf[ModuleIDType]) module: ModuleID, pdata: PData, dataset: Option[String] = None, dataset_type: Option[String] = None, eid: Option[String] = None) + + case class ErrorLog(@JsonScalaEnumeration(classOf[ProducerType]) pdata_id: Producer, @JsonScalaEnumeration(classOf[StatusCodeType]) pdata_status: StatusCode, @JsonScalaEnumeration(classOf[FunctionalErrorType]) error_type: FunctionalError, error_code: String, error_message: String,@JsonScalaEnumeration(classOf[ErrorLevelType]) error_level: ErrorLevel, error_count:Option[Int] = None) + + case class PipelineStats(extractor_events: Option[Int] = None, @JsonScalaEnumeration(classOf[StatusCodeType]) extractor_status: Option[StatusCode] = None, + extractor_time: Option[Long] = None, @JsonScalaEnumeration(classOf[StatusCodeType]) validator_status: Option[StatusCode] = None, validator_time: Option[Long] = None, + @JsonScalaEnumeration(classOf[StatusCodeType]) dedup_status: Option[StatusCode] = None, dedup_time: Option[Long] = None, @JsonScalaEnumeration(classOf[StatusCodeType]) denorm_status: Option[StatusCode] = None, + denorm_time: Option[Long] = None, @JsonScalaEnumeration(classOf[StatusCodeType]) transform_status: Option[StatusCode] = None, transform_time: Option[Long] = None, + total_processing_time: Option[Long] = None, latency_time: Option[Long] = None, processing_time: Option[Long] = 
None) + + case class EData(error: Option[ErrorLog] = None, pipeline_stats: Option[PipelineStats] = None, extra: Option[Map[String, AnyRef]] = None) + + case class SystemEvent(@JsonScalaEnumeration(classOf[EventIDType]) etype: EventID, ctx: ContextData, data: EData, ets: Long = System.currentTimeMillis()) +} + +class EventIDType extends TypeReference[EventID.type] +object EventID extends Enumeration { + type EventID = Value + val LOG, METRIC = Value +} + +class ErrorLevelType extends TypeReference[ErrorLevel.type] +object ErrorLevel extends Enumeration { + type ErrorLevel = Value + val debug, info, warn, critical = Value +} + +class FunctionalErrorType extends TypeReference[FunctionalError.type] +object FunctionalError extends Enumeration { + type FunctionalError = Value + val DedupFailed, RequiredFieldsMissing, DataTypeMismatch, AdditionalFieldsFound, UnknownValidationError, MissingDatasetId, MissingEventData, MissingTimestampKey, + EventSizeExceeded, ExtractionDataFormatInvalid, DenormKeyMissing, DenormKeyInvalid, DenormDataNotFound, InvalidJsonData, + TransformParseError, TransformEvalError, TransformFailedError, MissingMasterDatasetKey, TransformFieldMissing = Value +} + +class ProducerType extends TypeReference[Producer.type] +object Producer extends Enumeration { + type Producer = Value + val extractor, dedup, validator, denorm, transformer, router, masterdataprocessor = Value +} +class ModuleIDType extends TypeReference[ModuleID.type] +object ModuleID extends Enumeration { + type ModuleID = Value + val ingestion, processing, storage, query = Value } + +class StatusCodeType extends TypeReference[StatusCode.type] +object StatusCode extends Enumeration { + type StatusCode = Value + val success, failed, skipped, partial = Value +} + +class PDataTypeType extends TypeReference[PDataType.type] +object PDataType extends Enumeration { + type PDataType = Value + val flink, spark, druid, kafka, api = Value +} + +object Stats extends Enumeration { + type Stats = Value + val total_processing_time, latency_time, processing_time = Value +} \ No newline at end of file diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/serde/MapSerde.scala b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala similarity index 55% rename from framework/src/main/scala/org/sunbird/obsrv/core/serde/MapSerde.scala rename to framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala index 299bab95..56525db4 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/serde/MapSerde.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala @@ -1,7 +1,5 @@ package org.sunbird.obsrv.core.serde -import java.nio.charset.StandardCharsets - import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.java.typeutils.TypeExtractor import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema @@ -9,17 +7,26 @@ import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDe import org.apache.flink.util.Collector import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.kafka.clients.producer.ProducerRecord +import org.sunbird.obsrv.core.model.Constants import org.sunbird.obsrv.core.util.JSONUtil + +import java.nio.charset.StandardCharsets import scala.collection.mutable class MapDeserializationSchema extends KafkaRecordDeserializationSchema[mutable.Map[String, AnyRef]] { private val serialVersionUID = -3224825136576915426L + override def getProducedType: TypeInformation[mutable.Map[String, AnyRef]] 
= TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]], out: Collector[mutable.Map[String, AnyRef]]): Unit = { - val msg = JSONUtil.deserialize[mutable.Map[String, AnyRef]](record.value()) + val msg = try { + JSONUtil.deserialize[mutable.Map[String, AnyRef]](record.value()) + } catch { + case _: Exception => + mutable.Map[String, AnyRef](Constants.INVALID_JSON -> new String(record.value, "UTF-8")) + } initObsrvMeta(msg, record) out.collect(msg) } @@ -35,16 +42,37 @@ class MapDeserializationSchema extends KafkaRecordDeserializationSchema[mutable. )) } } + } -class MapSerializationSchema(topic: String, key: Option[String] = None) extends KafkaRecordSerializationSchema[mutable.Map[String, AnyRef]] { +class StringDeserializationSchema extends KafkaRecordDeserializationSchema[String] { + + private val serialVersionUID = -3224825136576915426L + + override def getProducedType: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) + + override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]], out: Collector[String]): Unit = { + out.collect(new String(record.value(), StandardCharsets.UTF_8)) + } +} + +class SerializationSchema[T](topic: String) extends KafkaRecordSerializationSchema[T] { private val serialVersionUID = -4284080856874185929L - override def serialize(element: mutable.Map[String, AnyRef], context: KafkaRecordSerializationSchema.KafkaSinkContext, timestamp: java.lang.Long): ProducerRecord[Array[Byte], Array[Byte]] = { + override def serialize(element: T, context: KafkaRecordSerializationSchema.KafkaSinkContext, timestamp: java.lang.Long): ProducerRecord[Array[Byte], Array[Byte]] = { val out = JSONUtil.serialize(element) - key.map { kafkaKey => - new ProducerRecord[Array[Byte], Array[Byte]](topic, kafkaKey.getBytes(StandardCharsets.UTF_8), out.getBytes(StandardCharsets.UTF_8)) - }.getOrElse(new ProducerRecord[Array[Byte], Array[Byte]](topic, out.getBytes(StandardCharsets.UTF_8))) + new ProducerRecord[Array[Byte], Array[Byte]](topic, out.getBytes(StandardCharsets.UTF_8)) } } + +class DynamicMapSerializationSchema() extends KafkaRecordSerializationSchema[mutable.Map[String, AnyRef]] { + + private val serialVersionUID = -4284080856874185929L + + override def serialize(element: mutable.Map[String, AnyRef], context: KafkaRecordSerializationSchema.KafkaSinkContext, timestamp: java.lang.Long): ProducerRecord[Array[Byte], Array[Byte]] = { + val out = JSONUtil.serialize(element.get(Constants.MESSAGE)) + val topic = element.get(Constants.TOPIC).get.asInstanceOf[String] + new ProducerRecord[Array[Byte], Array[Byte]](topic, out.getBytes(StandardCharsets.UTF_8)) + } +} \ No newline at end of file diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/serde/StringSerde.scala b/framework/src/main/scala/org/sunbird/obsrv/core/serde/StringSerde.scala deleted file mode 100644 index 17768453..00000000 --- a/framework/src/main/scala/org/sunbird/obsrv/core/serde/StringSerde.scala +++ /dev/null @@ -1,32 +0,0 @@ -package org.sunbird.obsrv.core.serde - -import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.api.java.typeutils.TypeExtractor -import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema -import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema -import org.apache.flink.util.Collector -import org.apache.kafka.clients.consumer.ConsumerRecord -import 
org.apache.kafka.clients.producer.ProducerRecord - -import java.nio.charset.StandardCharsets - -class StringDeserializationSchema extends KafkaRecordDeserializationSchema[String] { - - private val serialVersionUID = -3224825136576915426L - override def getProducedType: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) - - override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]], out: Collector[String]): Unit = { - out.collect(new String(record.value(), StandardCharsets.UTF_8)) - } -} - -class StringSerializationSchema(topic: String, key: Option[String] = None) extends KafkaRecordSerializationSchema[String] { - - private val serialVersionUID = -4284080856874185929L - - override def serialize(element: String, context: KafkaRecordSerializationSchema.KafkaSinkContext, timestamp: java.lang.Long): ProducerRecord[Array[Byte], Array[Byte]] = { - key.map { kafkaKey => - new ProducerRecord[Array[Byte], Array[Byte]](topic, kafkaKey.getBytes(StandardCharsets.UTF_8), element.getBytes(StandardCharsets.UTF_8)) - }.getOrElse(new ProducerRecord[Array[Byte], Array[Byte]](topic, element.getBytes(StandardCharsets.UTF_8))) - } -} diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseDeduplication.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseDeduplication.scala index 0e61610b..40a69191 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseDeduplication.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseDeduplication.scala @@ -1,39 +1,24 @@ package org.sunbird.obsrv.core.streaming -import org.apache.flink.streaming.api.functions.ProcessFunction import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.cache.DedupEngine import org.sunbird.obsrv.core.exception.ObsrvException -import org.sunbird.obsrv.core.model.ErrorConstants -import org.sunbird.obsrv.core.model.Models.{PData, SystemEvent} +import org.sunbird.obsrv.core.model._ import org.sunbird.obsrv.core.util.JSONUtil -import scala.collection.mutable - trait BaseDeduplication { private[this] val logger = LoggerFactory.getLogger(classOf[BaseDeduplication]) - def isDuplicate(datasetId: String, dedupKey: Option[String], event: String, - context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, - config: BaseJobConfig[_]) + def isDuplicate(datasetId: String, dedupKey: Option[String], event: String) (implicit deDupEngine: DedupEngine): Boolean = { - try { - val key = datasetId+":"+getDedupKey(dedupKey, event) - if (!deDupEngine.isUniqueEvent(key)) { - logger.debug(s"Event with mid: $key is duplicate") - true - } else { - deDupEngine.storeChecksum(key) - false - } - } catch { - case ex: ObsrvException => - logger.warn("BaseDeduplication:isDuplicate()-Exception", ex.getMessage) - val sysEvent = SystemEvent(PData(config.jobName, "flink", "deduplication"), Map("error_code" -> ex.error.errorCode, "error_msg" -> ex.error.errorMsg)) - context.output(config.systemEventsOutputTag, JSONUtil.serialize(sysEvent)) - false + val key = datasetId + ":" + getDedupKey(dedupKey, event) + if (!deDupEngine.isUniqueEvent(key)) { + true + } else { + deDupEngine.storeChecksum(key) + false } } @@ -45,10 +30,11 @@ trait BaseDeduplication { if (node.isMissingNode) { throw new ObsrvException(ErrorConstants.NO_DEDUP_KEY_FOUND) } - if (!node.isTextual) { - throw new ObsrvException(ErrorConstants.DEDUP_KEY_NOT_A_STRING) + if (!node.isTextual && !node.isNumber) { + logger.warn(s"Dedup | Dedup key is not a string or number | 
dedupKey=$dedupKey | keyType=${node.getNodeType}") + throw new ObsrvException(ErrorConstants.DEDUP_KEY_NOT_A_STRING_OR_NUMBER) } node.asText() } -} +} \ No newline at end of file diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala index ff15af4f..f82b430c 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala @@ -25,7 +25,7 @@ abstract class BaseJobConfig[T](val config: Config, val jobName: String) extends private val kafkaProducerBatchSize: Int = config.getInt("kafka.producer.batch.size") private val kafkaProducerLingerMs: Int = config.getInt("kafka.producer.linger.ms") private val kafkaProducerCompression: String = if (config.hasPath("kafka.producer.compression")) config.getString("kafka.producer.compression") else "snappy" - val groupId: String = config.getString("kafka.groupId") + private val groupId: String = config.getString("kafka.groupId") val restartAttempts: Int = config.getInt("task.restart-strategy.attempts") val delayBetweenAttempts: Long = config.getLong("task.restart-strategy.delay") val kafkaConsumerParallelism: Int = config.getInt("task.consumer.parallelism") @@ -45,14 +45,14 @@ abstract class BaseJobConfig[T](val config: Config, val jobName: String) extends val systemEventsProducer = "system-events-sink" // Checkpointing config - val enableCompressedCheckpointing: Boolean = config.getBoolean("job.enable.distributed.checkpointing") + val enableCompressedCheckpointing: Boolean = if (config.hasPath("job.enable.distributed.checkpointing")) config.getBoolean("job.enable.distributed.checkpointing") else false val checkpointingInterval: Int = config.getInt("task.checkpointing.interval") val checkpointingPauseSeconds: Int = config.getInt("task.checkpointing.pause.between.seconds") - val enableDistributedCheckpointing: Option[Boolean] = if (config.hasPath("job")) Option(config.getBoolean("job.enable.distributed.checkpointing")) else None - val checkpointingBaseUrl: Option[String] = if (config.hasPath("job")) Option(config.getString("job.statebackend.base.url")) else None + val enableDistributedCheckpointing: Option[Boolean] = if (config.hasPath("job.enable.distributed.checkpointing")) Option(config.getBoolean("job.enable.distributed.checkpointing")) else None + val checkpointingBaseUrl: Option[String] = if (config.hasPath("job.statebackend.base.url")) Option(config.getString("job.statebackend.base.url")) else None // Base Methods - def datasetType(): String = if(config.hasPath("dataset.type")) config.getString("dataset.type") else "dataset" + def datasetType(): String = if (config.hasPath("dataset.type")) config.getString("dataset.type") else "dataset" def inputTopic(): String @@ -60,6 +60,12 @@ abstract class BaseJobConfig[T](val config: Config, val jobName: String) extends def successTag(): OutputTag[T] + // Event Failures Common Variables + val failedEventProducer = "failed-events-sink" + val eventFailedMetricsCount: String = "failed-event-count" + val kafkaFailedTopic: String = config.getString("kafka.output.failed.topic") + def failedEventsOutputTag(): OutputTag[T] + def kafkaConsumerProperties(kafkaBrokerServers: Option[String] = None, kafkaConsumerGroup: Option[String] = None): Properties = { val properties = new Properties() properties.setProperty("bootstrap.servers", kafkaBrokerServers.getOrElse(kafkaConsumerBrokerServers)) diff --git 
a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseProcessFunction.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseProcessFunction.scala index 8ab0e664..71641352 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseProcessFunction.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseProcessFunction.scala @@ -6,10 +6,11 @@ import org.apache.flink.streaming.api.functions.ProcessFunction import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction import org.apache.flink.streaming.api.windowing.windows.TimeWindow import org.apache.flink.util.Collector -import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.model.ErrorConstants.Error -import org.sunbird.obsrv.core.model.SystemConfig -import org.sunbird.obsrv.core.util.Util +import org.sunbird.obsrv.core.model.Producer.Producer +import org.sunbird.obsrv.core.model.StatusCode.StatusCode +import org.sunbird.obsrv.core.model.{Constants, Stats, StatusCode, SystemConfig} +import org.sunbird.obsrv.core.util.{JSONUtil, Util} import java.lang import java.util.concurrent.ConcurrentHashMap @@ -18,13 +19,21 @@ import scala.collection.mutable case class MetricsList(datasets: List[String], metrics: List[String]) -case class Metrics(metrics: Map[String, ConcurrentHashMap[String, AtomicLong]]) { +case class Metrics(metrics: mutable.Map[String, ConcurrentHashMap[String, AtomicLong]]) { private def getMetric(dataset: String, metric: String): AtomicLong = { val datasetMetrics: ConcurrentHashMap[String, AtomicLong] = metrics.getOrElse(dataset, new ConcurrentHashMap[String, AtomicLong]()) datasetMetrics.getOrDefault(metric, new AtomicLong()) } + def hasDataset(dataset: String): Boolean = { + metrics.contains(dataset) + } + + def initDataset(dataset: String, counters: ConcurrentHashMap[String, AtomicLong]): Unit = { + metrics.put(dataset, counters) + } + def incCounter(dataset: String, metric: String): Unit = { getMetric(dataset, metric).getAndIncrement() } @@ -53,21 +62,23 @@ trait JobMetrics { metrics.foreach { metric => metricMap.put(metric, new AtomicLong(0L)) } (dataset, metricMap) }).toMap + val mutableMap = mutable.Map[String, ConcurrentHashMap[String, AtomicLong]]() + mutableMap ++= datasetMetricMap - Metrics(datasetMetricMap) + Metrics(mutableMap) } } trait BaseFunction { - private def addFlags(obsrvMeta: mutable.Map[String, AnyRef], flags: Map[String, AnyRef]) = { + def addFlags(obsrvMeta: mutable.Map[String, AnyRef], flags: Map[String, AnyRef]): Option[AnyRef] = { obsrvMeta.put("flags", obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]] ++ flags) } - private def addError(obsrvMeta: mutable.Map[String, AnyRef], error: Map[String, AnyRef]) = { + private def addError(obsrvMeta: mutable.Map[String, AnyRef], error: Map[String, AnyRef]): Option[AnyRef] = { obsrvMeta.put("error", error) } - private def addTimespan(obsrvMeta: mutable.Map[String, AnyRef], jobName: String): Unit = { + def addTimespan(obsrvMeta: mutable.Map[String, AnyRef], producer: Producer): Unit = { val prevTS = if (obsrvMeta.contains("prevProcessingTime")) { obsrvMeta("prevProcessingTime").asInstanceOf[Long] } else { @@ -75,43 +86,49 @@ trait BaseFunction { } val currentTS = System.currentTimeMillis() val span = currentTS - prevTS - obsrvMeta.put("timespans", obsrvMeta("timespans").asInstanceOf[Map[String, AnyRef]] ++ Map(jobName -> span)) + obsrvMeta.put("timespans", obsrvMeta("timespans").asInstanceOf[Map[String, AnyRef]] ++ Map(producer.toString -> span)) 
obsrvMeta.put("prevProcessingTime", currentTS.asInstanceOf[AnyRef]) } - def markFailed(event: mutable.Map[String, AnyRef], error: Error, jobName: String): mutable.Map[String, AnyRef] = { - val obsrvMeta = Util.getMutableMap(event("obsrv_meta").asInstanceOf[Map[String, AnyRef]]) - addError(obsrvMeta, Map("src" -> jobName, "error_code" -> error.errorCode, "error_msg" -> error.errorMsg)) - addFlags(obsrvMeta, Map(jobName -> "failed")) - addTimespan(obsrvMeta, jobName) - event.put("obsrv_meta", obsrvMeta.toMap) + def markFailed(event: mutable.Map[String, AnyRef], error: Error, producer: Producer): mutable.Map[String, AnyRef] = { + val obsrvMeta = Util.getMutableMap(event(Constants.OBSRV_META).asInstanceOf[Map[String, AnyRef]]) + addError(obsrvMeta, Map(Constants.SRC -> producer.toString, Constants.ERROR_CODE -> error.errorCode, Constants.ERROR_MSG -> error.errorMsg)) + addFlags(obsrvMeta, Map(producer.toString -> StatusCode.failed.toString)) + addTimespan(obsrvMeta, producer) + event.remove(Constants.OBSRV_META) + event.put(Constants.EVENT, JSONUtil.serialize(event)) + event.put(Constants.OBSRV_META, obsrvMeta.toMap) event } - def markSkipped(event: mutable.Map[String, AnyRef], jobName: String): mutable.Map[String, AnyRef] = { - val obsrvMeta = Util.getMutableMap(event("obsrv_meta").asInstanceOf[Map[String, AnyRef]]) - addFlags(obsrvMeta, Map(jobName -> "skipped")) - addTimespan(obsrvMeta, jobName) - event.put("obsrv_meta", obsrvMeta.toMap) - event + def markSkipped(event: mutable.Map[String, AnyRef], producer: Producer): mutable.Map[String, AnyRef] = { + markStatus(event, producer, StatusCode.skipped) + } + + def markSuccess(event: mutable.Map[String, AnyRef], producer: Producer): mutable.Map[String, AnyRef] = { + markStatus(event, producer, StatusCode.success) } - def markSuccess(event: mutable.Map[String, AnyRef], jobName: String): mutable.Map[String, AnyRef] = { + def markPartial(event: mutable.Map[String, AnyRef], producer: Producer): mutable.Map[String, AnyRef] = { + markStatus(event, producer, StatusCode.partial) + } + + private def markStatus(event: mutable.Map[String, AnyRef], producer: Producer, statusCode: StatusCode): mutable.Map[String, AnyRef] = { val obsrvMeta = Util.getMutableMap(event("obsrv_meta").asInstanceOf[Map[String, AnyRef]]) - addFlags(obsrvMeta, Map(jobName -> "success")) - addTimespan(obsrvMeta, jobName) + addFlags(obsrvMeta, Map(producer.toString -> statusCode.toString)) + addTimespan(obsrvMeta, producer) event.put("obsrv_meta", obsrvMeta.toMap) event } - def markComplete(event: mutable.Map[String, AnyRef], dataVersion: Option[Int]) : mutable.Map[String, AnyRef] = { + def markComplete(event: mutable.Map[String, AnyRef], dataVersion: Option[Int]): mutable.Map[String, AnyRef] = { val obsrvMeta = Util.getMutableMap(event("obsrv_meta").asInstanceOf[Map[String, AnyRef]]) val syncts = obsrvMeta("syncts").asInstanceOf[Long] val processingStartTime = obsrvMeta("processingStartTime").asInstanceOf[Long] val processingEndTime = System.currentTimeMillis() - obsrvMeta.put("total_processing_time", (processingEndTime - syncts).asInstanceOf[AnyRef]) - obsrvMeta.put("latency_time", (processingStartTime - syncts).asInstanceOf[AnyRef]) - obsrvMeta.put("processing_time", (processingEndTime - processingStartTime).asInstanceOf[AnyRef]) + obsrvMeta.put(Stats.total_processing_time.toString, (processingEndTime - syncts).asInstanceOf[AnyRef]) + obsrvMeta.put(Stats.latency_time.toString, (processingStartTime - syncts).asInstanceOf[AnyRef]) + obsrvMeta.put(Stats.processing_time.toString, 
(processingEndTime - processingStartTime).asInstanceOf[AnyRef]) obsrvMeta.put("data_version", dataVersion.getOrElse(1).asInstanceOf[AnyRef]) event.put("obsrv_meta", obsrvMeta.toMap) event @@ -123,19 +140,28 @@ trait BaseFunction { } } -abstract class BaseProcessFunction[T, R](config: BaseJobConfig[R]) extends ProcessFunction[T, R] with BaseDeduplication with JobMetrics with BaseFunction { +abstract class BaseProcessFunction[T, R](config: BaseJobConfig[R]) extends ProcessFunction[T, R] with JobMetrics with BaseFunction { - private[this] val logger = LoggerFactory.getLogger(this.getClass) - private val metricsList = getMetricsList() - private val metrics: Metrics = registerMetrics(metricsList.datasets, metricsList.metrics) + protected val metricsList: MetricsList = getMetricsList() + protected val metrics: Metrics = registerMetrics(metricsList.datasets, metricsList.metrics) override def open(parameters: Configuration): Unit = { - (metricsList.datasets ++ List(SystemConfig.defaultDatasetId)).map { dataset => + metricsList.datasets.map { dataset => metricsList.metrics.map(metric => { getRuntimeContext.getMetricGroup.addGroup(config.jobName).addGroup(dataset) - .gauge[Long, ScalaGauge[Long]](metric, ScalaGauge[Long](() => metrics.getAndReset(dataset, metric))) + .gauge[Long, ScalaGauge[Long]](metric, ScalaGauge[Long](() => + // $COVERAGE-OFF$ + metrics.getAndReset(dataset, metric) + // $COVERAGE-ON$ + )) }) } + getRuntimeContext.getMetricGroup.addGroup(config.jobName).addGroup(SystemConfig.defaultDatasetId) + .gauge[Long, ScalaGauge[Long]](config.eventFailedMetricsCount, ScalaGauge[Long](() => + // $COVERAGE-OFF$ + metrics.getAndReset(SystemConfig.defaultDatasetId, config.eventFailedMetricsCount) + // $COVERAGE-ON$ + )) } def processElement(event: T, context: ProcessFunction[T, R]#Context, metrics: Metrics): Unit @@ -143,29 +169,33 @@ abstract class BaseProcessFunction[T, R](config: BaseJobConfig[R]) extends Proce def getMetricsList(): MetricsList override def processElement(event: T, context: ProcessFunction[T, R]#Context, out: Collector[R]): Unit = { - try { - processElement(event, context, metrics) - } catch { - case exception: Exception => - logger.error(s"${config.jobName}:processElement - Exception", exception) - } + processElement(event, context, metrics) } } -abstract class WindowBaseProcessFunction[I, O, K](config: BaseJobConfig[O]) extends ProcessWindowFunction[I, O, K, TimeWindow] with BaseDeduplication with JobMetrics with BaseFunction { +abstract class WindowBaseProcessFunction[I, O, K](config: BaseJobConfig[O]) extends ProcessWindowFunction[I, O, K, TimeWindow] with JobMetrics with BaseFunction { - private[this] val logger = LoggerFactory.getLogger(this.getClass) - private val metricsList = getMetricsList() - private val metrics: Metrics = registerMetrics(metricsList.datasets, metricsList.metrics) + protected val metricsList: MetricsList = getMetricsList() + protected val metrics: Metrics = registerMetrics(metricsList.datasets, metricsList.metrics) override def open(parameters: Configuration): Unit = { - (metricsList.datasets ++ List(SystemConfig.defaultDatasetId)).map { dataset => + metricsList.datasets.map { dataset => metricsList.metrics.map(metric => { getRuntimeContext.getMetricGroup.addGroup(config.jobName).addGroup(dataset) - .gauge[Long, ScalaGauge[Long]](metric, ScalaGauge[Long](() => metrics.getAndReset(dataset, metric))) + .gauge[Long, ScalaGauge[Long]](metric, ScalaGauge[Long](() => + // $COVERAGE-OFF$ + metrics.getAndReset(dataset, metric) + // $COVERAGE-ON$ + )) }) } 
+ getRuntimeContext.getMetricGroup.addGroup(config.jobName).addGroup(SystemConfig.defaultDatasetId) + .gauge[Long, ScalaGauge[Long]](config.eventFailedMetricsCount, ScalaGauge[Long](() => + // $COVERAGE-OFF$ + metrics.getAndReset(SystemConfig.defaultDatasetId, config.eventFailedMetricsCount) + // $COVERAGE-ON$ + )) } def getMetricsList(): MetricsList @@ -176,11 +206,7 @@ abstract class WindowBaseProcessFunction[I, O, K](config: BaseJobConfig[O]) exte metrics: Metrics): Unit override def process(key: K, context: ProcessWindowFunction[I, O, K, TimeWindow]#Context, elements: lang.Iterable[I], out: Collector[O]): Unit = { - try { - process(key, context, elements, metrics) - } catch { - case exception: Exception => logger.error(s"${config.jobName}:processElement - Exception", exception) - } + process(key, context, elements, metrics) } } \ No newline at end of file diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala index 4862da7b..8ebdb8a7 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala @@ -2,13 +2,24 @@ package org.sunbird.obsrv.core.streaming import org.apache.flink.api.common.eventtime.WatermarkStrategy -import org.apache.flink.streaming.api.datastream.DataStream +import org.apache.flink.streaming.api.datastream.{DataStream, DataStreamSink, SingleOutputStreamOperator} import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment import java.util.Properties import scala.collection.mutable -abstract class BaseStreamTask[T] { +class BaseStreamTaskSink[T] { + def addDefaultSinks(dataStream: SingleOutputStreamOperator[T], config: BaseJobConfig[T], kafkaConnector: FlinkKafkaConnector): DataStreamSink[T] = { + + dataStream.getSideOutput(config.systemEventsOutputTag).sinkTo(kafkaConnector.kafkaSink[String](config.kafkaSystemTopic)) + .name(config.jobName + "-" + config.systemEventsProducer).uid(config.jobName + "-" + config.systemEventsProducer).setParallelism(config.downstreamOperatorsParallelism) + + dataStream.getSideOutput(config.failedEventsOutputTag()).sinkTo(kafkaConnector.kafkaSink[T](config.kafkaFailedTopic)) + .name(config.jobName + "-" + config.failedEventProducer).uid(config.jobName + "-" + config.failedEventProducer).setParallelism(config.downstreamOperatorsParallelism) + } +} + +abstract class BaseStreamTask[T] extends BaseStreamTaskSink[T] { def process() diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala index 0120bd58..508e1e7c 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala @@ -8,18 +8,13 @@ import org.apache.kafka.clients.consumer.OffsetResetStrategy import org.sunbird.obsrv.core.serde._ import java.util.Properties -import scala.collection.mutable import scala.collection.JavaConverters._ +import scala.collection.mutable class FlinkKafkaConnector(config: BaseJobConfig[_]) extends Serializable { def kafkaStringSource(kafkaTopic: String): KafkaSource[String] = { - KafkaSource.builder[String]() - .setTopics(kafkaTopic) - .setDeserializer(new StringDeserializationSchema) - .setProperties(config.kafkaConsumerProperties()) - 
.setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.EARLIEST)) -      .build() +    kafkaStringSource(List(kafkaTopic), config.kafkaConsumerProperties())   }    def kafkaStringSource(kafkaTopic: List[String], consumerProperties: Properties): KafkaSource[String] = { @@ -31,36 +26,31 @@ class FlinkKafkaConnector(config: BaseJobConfig[_]) extends Serializable { .build()   }   -  def kafkaStringSink(kafkaTopic: String): KafkaSink[String] = { -    KafkaSink.builder[String]() +  def kafkaSink[T](kafkaTopic: String): KafkaSink[T] = { +    KafkaSink.builder[T]() .setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE) -      .setRecordSerializer(new StringSerializationSchema(kafkaTopic)) +      .setRecordSerializer(new SerializationSchema(kafkaTopic)) .setKafkaProducerConfig(config.kafkaProducerProperties) .build()   }    def kafkaMapSource(kafkaTopic: String): KafkaSource[mutable.Map[String, AnyRef]] = { -    KafkaSource.builder[mutable.Map[String, AnyRef]]() -      .setTopics(kafkaTopic) -      .setDeserializer(new MapDeserializationSchema) -      .setProperties(config.kafkaConsumerProperties()) -      .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.EARLIEST)) -      .build() +    kafkaMapSource(List(kafkaTopic), config.kafkaConsumerProperties())   }    def kafkaMapSource(kafkaTopics: List[String], consumerProperties: Properties): KafkaSource[mutable.Map[String, AnyRef]] = { KafkaSource.builder[mutable.Map[String, AnyRef]]() .setTopics(kafkaTopics.asJava) .setDeserializer(new MapDeserializationSchema) .setProperties(consumerProperties) .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.EARLIEST)) .build()   }   -  def kafkaMapSink(kafkaTopic: String): KafkaSink[mutable.Map[String, AnyRef]] = { +  def kafkaMapDynamicSink(): KafkaSink[mutable.Map[String, AnyRef]] = { KafkaSink.builder[mutable.Map[String, AnyRef]]() .setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE) -      .setRecordSerializer(new MapSerializationSchema(kafkaTopic)) +      .setRecordSerializer(new DynamicMapSerializationSchema()) .setKafkaProducerConfig(config.kafkaProducerProperties) .build()   } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala b/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala index 338a47e8..19c56af1 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala @@ -1,14 +1,14 @@ package org.sunbird.obsrv.core.util -import java.lang.reflect.{ParameterizedType, Type} import com.fasterxml.jackson.annotation.JsonInclude.Include import com.fasterxml.jackson.core.JsonGenerator.Feature -import com.fasterxml.jackson.databind.{DeserializationFeature, JsonNode, MapperFeature, ObjectMapper, SerializationFeature} import com.fasterxml.jackson.core.`type`.TypeReference import com.fasterxml.jackson.databind.json.JsonMapper -import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule, ScalaObjectMapper} +import com.fasterxml.jackson.databind.node.JsonNodeType +import com.fasterxml.jackson.databind.{DeserializationFeature, JsonNode, SerializationFeature} +import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule} -import scala.collection.mutable +import java.lang.reflect.{ParameterizedType, Type} object JSONUtil { @@ -19,11 +19,11 @@ object JSONUtil { .enable(Feature.WRITE_BIGDECIMAL_AS_PLAIN) .build() :: ClassTagExtensions -  mapper.setSerializationInclusion(Include.NON_NULL) + 
mapper.setSerializationInclusion(Include.NON_ABSENT) @throws(classOf[Exception]) - def serialize(obj: AnyRef): String = { - mapper.writeValueAsString(obj) + def serialize(obj: Any): String = { + if(obj.isInstanceOf[String]) obj.asInstanceOf[String] else mapper.writeValueAsString(obj) } def deserialize[T: Manifest](json: String): T = { @@ -34,12 +34,16 @@ object JSONUtil { mapper.readValue(json, typeReference[T]) } - def isJSON(jsonString: String): Boolean = { + def getJsonType(jsonString: String): String = { try { - mapper.readTree(jsonString) - true + val node = mapper.readTree(jsonString) + node.getNodeType match { + case JsonNodeType.ARRAY => "ARRAY" + case JsonNodeType.OBJECT => "OBJECT" + case _ => "NOT_A_JSON" + } } catch { - case _: Exception => false + case _: Exception => "NOT_A_JSON" } } @@ -69,4 +73,4 @@ object JSONUtil { } } -} +} \ No newline at end of file diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala b/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala index 86cd61a6..8322351c 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/util/PostgresConnect.scala @@ -51,12 +51,26 @@ class PostgresConnect(config: PostgresConnectionConfig) { catch { case ex: SQLException => logger.error("PostgresConnect:execute() - Exception", ex) - reset + reset() statement.execute(query) } // $COVERAGE-ON$ } + def executeUpdate(query: String): Int = { + try { + statement.executeUpdate(query) + } + // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked if postgres connection is stale + catch { + case ex: SQLException => + logger.error("PostgresConnect:execute() - Exception", ex) + reset() + statement.executeUpdate(query) + } + // $COVERAGE-ON$ + } + def executeQuery(query:String):ResultSet = statement.executeQuery(query) } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/util/Util.scala b/framework/src/main/scala/org/sunbird/obsrv/core/util/Util.scala index d9ab6bc0..55453dec 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/util/Util.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/util/Util.scala @@ -5,9 +5,9 @@ import scala.collection.mutable object Util { def getMutableMap(immutableMap: Map[String, AnyRef]): mutable.Map[String, AnyRef] = { - val mutableMap = mutable.Map[String, AnyRef](); - mutableMap ++= immutableMap; + val mutableMap = mutable.Map[String, AnyRef]() + mutableMap ++= immutableMap mutableMap } -} +} \ No newline at end of file diff --git a/framework/src/test/resources/base-test.conf b/framework/src/test/resources/base-test.conf index ce48f132..aa395059 100644 --- a/framework/src/test/resources/base-test.conf +++ b/framework/src/test/resources/base-test.conf @@ -17,6 +17,7 @@ kafka { compression = "snappy" } output.system.event.topic = "flink.system.events" + output.failed.topic = "flink.failed" } job { @@ -57,7 +58,7 @@ redis { redis-meta { host = localhost - port = 6379 + port = 6340 } postgres { diff --git a/framework/src/test/resources/test.conf b/framework/src/test/resources/test.conf index 056d3989..ed3cd4fd 100644 --- a/framework/src/test/resources/test.conf +++ b/framework/src/test/resources/test.conf @@ -47,7 +47,7 @@ redis.connection.timeout = 30000 redis { host = 127.0.0.1 - port = 6341 + port = 6340 database { duplicationstore.id = 12 key.expiry.seconds = 3600 @@ -56,7 +56,7 @@ redis { redis-meta { host = localhost - port = 6341 + port = 6340 } postgres { diff 
--git a/framework/src/test/resources/test2.conf b/framework/src/test/resources/test2.conf new file mode 100644 index 00000000..b85fd6ce --- /dev/null +++ b/framework/src/test/resources/test2.conf @@ -0,0 +1,69 @@ +kafka { + map.input.topic = "local.map.input" + map.output.topic = "local.map.output" + event.input.topic = "local.event.input" + event.output.topic = "local.event.output" + string.input.topic = "local.string.input" + string.output.topic = "local.string.output" + producer.broker-servers = "localhost:9093" + consumer.broker-servers = "localhost:9093" + groupId = "pipeline-preprocessor-group" + producer { + max-request-size = 102400 + batch.size = 8192 + linger.ms = 1 + } + output.system.event.topic = "flink.system.events" + output.failed.topic = "flink.failed" + event.duplicate.topic = "local.duplicate.output" +} + +job { + env = "local" + statebackend { + blob { + storage { + account = "blob.storage.account" + container = "obsrv-container" + checkpointing.dir = "flink-jobs" + } + } + } +} + +kafka.output.metrics.topic = "pipeline_metrics" +task { + checkpointing.interval = 60000 + checkpointing.pause.between.seconds = 30000 + restart-strategy.attempts = 1 + restart-strategy.delay = 10000 + parallelism = 1 + consumer.parallelism = 1 + downstream.operators.parallelism = 1 +} + +redis.connection.timeout = 30000 + +redis { + host = 127.0.0.1 + port = 6340 + database { + duplicationstore.id = 12 + key.expiry.seconds = 3600 + } +} + +redis-meta { + host = localhost + port = 6340 +} + +postgres { + host = localhost + port = 5432 + maxConnection = 2 + user = "postgres" + password = "postgres" +} + +dataset.type = "master-dataset" \ No newline at end of file diff --git a/framework/src/test/scala/org/sunbird/spec/BaseDeduplicationTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/BaseDeduplicationTestSpec.scala new file mode 100644 index 00000000..961b1474 --- /dev/null +++ b/framework/src/test/scala/org/sunbird/spec/BaseDeduplicationTestSpec.scala @@ -0,0 +1,45 @@ +package org.sunbird.spec + +import com.typesafe.config.{Config, ConfigFactory} +import org.scalatest.Matchers +import org.scalatestplus.mockito.MockitoSugar +import org.sunbird.obsrv.core.cache.{DedupEngine, RedisConnect} +import org.sunbird.obsrv.core.exception.ObsrvException +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.streaming.BaseDeduplication +class BaseDeduplicationTestSpec extends BaseSpec with Matchers with MockitoSugar { + + val config: Config = ConfigFactory.load("base-test.conf") + val baseConfig = new BaseProcessTestConfig(config) + val SAMPLE_EVENT: String = """{"dataset":"d1","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealer":"KUNUnited","locationId":"KUN1"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""".stripMargin + + "BaseDeduplicationTestSpec" should "be able to cover all scenarios of deduplication check" in { + val redisConnection = new RedisConnect(baseConfig.redisHost, baseConfig.redisPort, baseConfig.redisConnectionTimeout) + val dedupEngine = new DedupEngine(redisConnection, 0, 4309535) + val dedupFn = new DeduplicationFn(dedupEngine) + + dedupFn.validateDedup("d1", Some("event.id"), SAMPLE_EVENT) should be (false) + dedupFn.validateDedup("d1", Some("event.id"), SAMPLE_EVENT) should be (true) + + the[ObsrvException] thrownBy { + dedupFn.validateDedup("d1", Some("event"), SAMPLE_EVENT) + } should have message ErrorConstants.DEDUP_KEY_NOT_A_STRING_OR_NUMBER.errorMsg + + the[ObsrvException] 
thrownBy { + dedupFn.validateDedup("d1", Some("event.mid"), SAMPLE_EVENT) + } should have message ErrorConstants.NO_DEDUP_KEY_FOUND.errorMsg + + the[ObsrvException] thrownBy { + dedupFn.validateDedup("d1", None, SAMPLE_EVENT) + } should have message ErrorConstants.NO_DEDUP_KEY_FOUND.errorMsg + + } +} + +class DeduplicationFn(dedupEngine: DedupEngine) extends BaseDeduplication { + + def validateDedup(datasetId: String, dedupKey: Option[String], event: String):Boolean = { + isDuplicate(datasetId, dedupKey, event)(dedupEngine) + } + +} \ No newline at end of file diff --git a/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala index 61eb53f1..5d7673e0 100644 --- a/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala +++ b/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala @@ -13,7 +13,6 @@ import org.apache.flink.streaming.api.windowing.time.Time import org.apache.flink.test.util.MiniClusterWithClientResource import org.apache.kafka.common.serialization.StringDeserializer import org.scalatest.Matchers -import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.streaming._ import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, Util} @@ -85,7 +84,7 @@ class BaseProcessFunctionTestSpec extends BaseSpec with Matchers { .process(new TestMapStreamFunc(bsMapConfig)).name("TestMapEventStream") mapStream.getSideOutput(bsConfig.mapOutputTag) - .sinkTo(kafkaConnector.kafkaMapSink(bsConfig.kafkaMapOutputTopic)) + .sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](bsConfig.kafkaMapOutputTopic)) .name("Map-Event-Producer") val stringStream = @@ -95,7 +94,7 @@ class BaseProcessFunctionTestSpec extends BaseSpec with Matchers { }).window(TumblingProcessingTimeWindows.of(Time.seconds(2))).process(new TestStringWindowStreamFunc(bsConfig)).name("TestStringEventStream") stringStream.getSideOutput(bsConfig.stringOutputTag) - .sinkTo(kafkaConnector.kafkaStringSink(bsConfig.kafkaStringOutputTopic)) + .sinkTo(kafkaConnector.kafkaSink[String](bsConfig.kafkaStringOutputTopic)) .name("String-Producer") Future { @@ -131,26 +130,10 @@ class BaseProcessFunctionTestSpec extends BaseSpec with Matchers { val mutableMap = Util.getMutableMap(map) mutableMap.getClass.getCanonicalName should be ("scala.collection.mutable.HashMap") noException shouldBe thrownBy(JSONUtil.convertValue(map)) - - ErrorConstants.NO_IMPLEMENTATION_FOUND.errorCode should be ("ERR_0001") - ErrorConstants.NO_EXTRACTION_DATA_FOUND.errorCode should be ("ERR_EXT_1001") - ErrorConstants.EXTRACTED_DATA_NOT_A_LIST.errorCode should be ("ERR_EXT_1002") - ErrorConstants.EVENT_SIZE_EXCEEDED.errorCode should be ("ERR_EXT_1003") - ErrorConstants.EVENT_MISSING.errorCode should be ("ERR_EXT_1006") - ErrorConstants.MISSING_DATASET_ID.errorCode should be ("ERR_EXT_1004") - ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode should be ("ERR_EXT_1005") - ErrorConstants.NO_DEDUP_KEY_FOUND.errorCode should be ("ERR_DEDUP_1007") - ErrorConstants.DEDUP_KEY_NOT_A_STRING.errorCode should be ("ERR_DEDUP_1008") - ErrorConstants.DUPLICATE_BATCH_EVENT_FOUND.errorCode should be ("ERR_EXT_1009") - ErrorConstants.DUPLICATE_EVENT_FOUND.errorCode should be ("ERR_PP_1010") - ErrorConstants.JSON_SCHEMA_NOT_FOUND.errorCode should be ("ERR_PP_1011") - ErrorConstants.INVALID_JSON_SCHEMA.errorCode should be ("ERR_PP_1012") - ErrorConstants.SCHEMA_VALIDATION_FAILED.errorCode should be ("ERR_PP_1013") - 
ErrorConstants.DENORM_KEY_MISSING.errorCode should be ("ERR_DENORM_1014") - ErrorConstants.DENORM_KEY_NOT_A_STRING.errorCode should be ("ERR_DENORM_1015") - - val metrics = Metrics(Map("test" -> new ConcurrentHashMap[String, AtomicLong]())) + val metrics = Metrics(mutable.Map("test" -> new ConcurrentHashMap[String, AtomicLong]())) metrics.reset("test1", "m1") + + bsConfig.datasetType() should be ("dataset") } "TestBaseStreamTask" should "validate the getMapDataStream method" in { diff --git a/framework/src/test/scala/org/sunbird/spec/BaseProcessTestConfig.scala b/framework/src/test/scala/org/sunbird/spec/BaseProcessTestConfig.scala index 6384c2c1..96358d81 100644 --- a/framework/src/test/scala/org/sunbird/spec/BaseProcessTestConfig.scala +++ b/framework/src/test/scala/org/sunbird/spec/BaseProcessTestConfig.scala @@ -35,6 +35,8 @@ class BaseProcessTestConfig(override val config: Config) extends BaseJobConfig[S override def inputConsumer(): String = "testConsumer" override def successTag(): OutputTag[String] = stringOutputTag + + override def failedEventsOutputTag(): OutputTag[String] = stringOutputTag } class BaseProcessTestMapConfig(override val config: Config) extends BaseJobConfig[Map[String, AnyRef]](config, "Test-job") { @@ -68,4 +70,6 @@ class BaseProcessTestMapConfig(override val config: Config) extends BaseJobConfi override def inputConsumer(): String = "testConsumer" override def successTag(): OutputTag[Map[String, AnyRef]] = mapOutputTag + + override def failedEventsOutputTag(): OutputTag[Map[String, AnyRef]] = mapOutputTag } \ No newline at end of file diff --git a/framework/src/test/scala/org/sunbird/spec/BaseSpec.scala b/framework/src/test/scala/org/sunbird/spec/BaseSpec.scala index 134121d3..904a320b 100644 --- a/framework/src/test/scala/org/sunbird/spec/BaseSpec.scala +++ b/framework/src/test/scala/org/sunbird/spec/BaseSpec.scala @@ -1,6 +1,5 @@ package org.sunbird.spec -import io.zonky.test.db.postgres.embedded.EmbeddedPostgres import org.scalatest.{BeforeAndAfterAll, FlatSpec} import redis.embedded.RedisServer @@ -8,10 +7,15 @@ class BaseSpec extends FlatSpec with BeforeAndAfterAll { var redisServer: RedisServer = _ - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() redisServer = new RedisServer(6340) - redisServer.start() + try { + redisServer.start() + } catch { + case _: Exception => Console.err.println("### Unable to start redis server. Falling back to use locally run redis if any ###") + } + } override protected def afterAll(): Unit = { diff --git a/framework/src/test/scala/org/sunbird/spec/BaseSpecWithPostgres.scala b/framework/src/test/scala/org/sunbird/spec/BaseSpecWithPostgres.scala index d44191be..fd4985db 100644 --- a/framework/src/test/scala/org/sunbird/spec/BaseSpecWithPostgres.scala +++ b/framework/src/test/scala/org/sunbird/spec/BaseSpecWithPostgres.scala @@ -9,10 +9,14 @@ class BaseSpecWithPostgres extends FlatSpec with BeforeAndAfterAll { var embeddedPostgres: EmbeddedPostgres = _ var redisServer: RedisServer = _ - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() redisServer = new RedisServer(6340) - redisServer.start() + try { + redisServer.start() + } catch { + case _: Exception => Console.err.println("### Unable to start redis server. 
Falling back to use locally run redis if any ###") + } embeddedPostgres = EmbeddedPostgres.builder.setPort(5432).start() // Defaults to 5432 port } diff --git a/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala new file mode 100644 index 00000000..4ca0ad5e --- /dev/null +++ b/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala @@ -0,0 +1,118 @@ +package org.sunbird.spec + +import com.fasterxml.jackson.module.scala.JsonScalaEnumeration +import com.typesafe.config.{Config, ConfigFactory} +import org.apache.kafka.clients.producer.ProducerConfig +import org.scalatest.{FlatSpec, Matchers} +import org.sunbird.obsrv.core.model.FunctionalError.FunctionalError +import org.sunbird.obsrv.core.model.Models._ +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.util.{DatasetKeySelector, JSONUtil} + +import scala.collection.mutable + +case class FuncErrorList(@JsonScalaEnumeration(classOf[FunctionalErrorType]) list: List[FunctionalError]) +class ModelsTestSpec extends FlatSpec with Matchers { + + "ModelsTestSpec" should "cover all error constants" in { + + ErrorConstants.NO_IMPLEMENTATION_FOUND.errorCode should be("ERR_0001") + ErrorConstants.NO_EXTRACTION_DATA_FOUND.errorCode should be("ERR_EXT_1001") + ErrorConstants.EXTRACTED_DATA_NOT_A_LIST.errorCode should be("ERR_EXT_1002") + ErrorConstants.EVENT_SIZE_EXCEEDED.errorCode should be("ERR_EXT_1003") + ErrorConstants.EVENT_MISSING.errorCode should be("ERR_EXT_1006") + ErrorConstants.MISSING_DATASET_ID.errorCode should be("ERR_EXT_1004") + ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode should be("ERR_EXT_1005") + ErrorConstants.NO_DEDUP_KEY_FOUND.errorCode should be("ERR_DEDUP_1007") + ErrorConstants.DEDUP_KEY_NOT_A_STRING_OR_NUMBER.errorCode should be("ERR_DEDUP_1008") + ErrorConstants.DUPLICATE_BATCH_EVENT_FOUND.errorCode should be("ERR_EXT_1009") + ErrorConstants.DUPLICATE_EVENT_FOUND.errorCode should be("ERR_PP_1010") + ErrorConstants.JSON_SCHEMA_NOT_FOUND.errorCode should be("ERR_PP_1011") + ErrorConstants.INVALID_JSON_SCHEMA.errorCode should be("ERR_PP_1012") + ErrorConstants.SCHEMA_VALIDATION_FAILED.errorCode should be("ERR_PP_1013") + ErrorConstants.DENORM_KEY_MISSING.errorCode should be("ERR_DENORM_1014") + ErrorConstants.DENORM_KEY_NOT_A_STRING_OR_NUMBER.errorCode should be("ERR_DENORM_1015") + ErrorConstants.DENORM_DATA_NOT_FOUND.errorCode should be("ERR_DENORM_1016") + ErrorConstants.MISSING_DATASET_CONFIG_KEY.errorCode should be("ERR_MASTER_DATA_1017") + ErrorConstants.ERR_INVALID_EVENT.errorCode should be("ERR_EXT_1018") + ErrorConstants.INDEX_KEY_MISSING_OR_BLANK.errorCode should be("ERR_ROUTER_1019") + ErrorConstants.INVALID_EXPR_FUNCTION.errorCode should be("ERR_TRANSFORM_1020") + ErrorConstants.ERR_EVAL_EXPR_FUNCTION.errorCode should be("ERR_TRANSFORM_1021") + ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION.errorCode should be("ERR_TRANSFORM_1022") + ErrorConstants.ERR_TRANSFORMATION_FAILED.errorCode should be("ERR_TRANSFORM_1023") + } + + it should "cover system event model" in { + + Stats.withName("latency_time") should be (Stats.latency_time) + Stats.withName("processing_time") should be (Stats.processing_time) + Stats.withName("total_processing_time") should be (Stats.total_processing_time) + + PDataType.withName("flink") should be(PDataType.flink) + PDataType.withName("api") should be(PDataType.api) + PDataType.withName("kafka") should be(PDataType.kafka) + PDataType.withName("druid") should be(PDataType.druid) + 
PDataType.withName("spark") should be(PDataType.spark) + + StatusCode.withName("failed") should be (StatusCode.failed) + StatusCode.withName("partial") should be (StatusCode.partial) + StatusCode.withName("skipped") should be (StatusCode.skipped) + StatusCode.withName("success") should be (StatusCode.success) + + ModuleID.withName("ingestion") should be(ModuleID.ingestion) + ModuleID.withName("processing") should be(ModuleID.processing) + ModuleID.withName("storage") should be(ModuleID.storage) + ModuleID.withName("query") should be(ModuleID.query) + + ModuleID.withName("ingestion") should be(ModuleID.ingestion) + ModuleID.withName("processing") should be(ModuleID.processing) + ModuleID.withName("storage") should be(ModuleID.storage) + ModuleID.withName("query") should be(ModuleID.query) + + Producer.withName("extractor") should be(Producer.extractor) + Producer.withName("validator") should be(Producer.validator) + Producer.withName("dedup") should be(Producer.dedup) + Producer.withName("denorm") should be(Producer.denorm) + Producer.withName("transformer") should be(Producer.transformer) + Producer.withName("router") should be(Producer.router) + Producer.withName("masterdataprocessor") should be(Producer.masterdataprocessor) + + EventID.withName("METRIC") should be (EventID.METRIC) + EventID.withName("LOG") should be (EventID.LOG) + + ErrorLevel.withName("info") should be (ErrorLevel.info) + ErrorLevel.withName("warn") should be (ErrorLevel.warn) + ErrorLevel.withName("debug") should be (ErrorLevel.debug) + ErrorLevel.withName("critical") should be (ErrorLevel.critical) + + val funcErrorsStringList = FunctionalError.values.map(f => f.toString).toList + val funcErrors = JSONUtil.deserialize[FuncErrorList](JSONUtil.serialize(Map("list" -> funcErrorsStringList))) + funcErrors.list.contains(FunctionalError.MissingTimestampKey) should be (true) + + val sysEvent = SystemEvent(etype = EventID.METRIC, + ctx = ContextData(module = ModuleID.processing, pdata = PData(id = "testjob", `type` = PDataType.flink, pid = Some(Producer.router)), dataset = Some("d1"), eid = Some("event1")), + data = EData( + error = Some(ErrorLog(pdata_id = Producer.router, pdata_status = StatusCode.failed, error_type = FunctionalError.MissingTimestampKey, error_code = ErrorConstants.DENORM_KEY_MISSING.errorCode, error_message = ErrorConstants.DENORM_KEY_MISSING.errorMsg, error_level = ErrorLevel.warn, error_count = Some(1))), + pipeline_stats = Some(PipelineStats(extractor_events = Some(2), extractor_status = Some(StatusCode.success), extractor_time = Some(123l), validator_status = Some(StatusCode.success), validator_time = Some(786l), dedup_status = Some(StatusCode.skipped), dedup_time = Some(0l), denorm_status = Some(StatusCode.partial), denorm_time = Some(345l), transform_status = Some(StatusCode.success), transform_time = Some(98l), total_processing_time = Some(1543l), latency_time = Some(23l), processing_time = Some(1520l))), Some(Map("duration" -> 2000.asInstanceOf[AnyRef])) + ) + ) + sysEvent.etype should be (EventID.METRIC) + + val config: Config = ConfigFactory.load("test2.conf") + val bsMapConfig = new BaseProcessTestMapConfig(config) + bsMapConfig.kafkaProducerProperties.get(ProducerConfig.COMPRESSION_TYPE_CONFIG).asInstanceOf[String] should be ("snappy") + bsMapConfig.kafkaConsumerProperties() + bsMapConfig.enableDistributedCheckpointing should be (None) + bsMapConfig.checkpointingBaseUrl should be (None) + bsMapConfig.datasetType() should be ("master-dataset") + + val dsk = new DatasetKeySelector() + 
dsk.getKey(mutable.Map("dataset" -> "d1".asInstanceOf[AnyRef])) should be ("d1") + + JSONUtil.getJsonType("""{"test":123}""") should be ("OBJECT") + JSONUtil.getJsonType("""{"test":123""") should be ("NOT_A_JSON") + JSONUtil.getJsonType("""123""") should be ("NOT_A_JSON") + + } + +} diff --git a/framework/src/test/scala/org/sunbird/spec/PostgresConnectSpec.scala b/framework/src/test/scala/org/sunbird/spec/PostgresConnectSpec.scala index 619769fd..088c2768 100644 --- a/framework/src/test/scala/org/sunbird/spec/PostgresConnectSpec.scala +++ b/framework/src/test/scala/org/sunbird/spec/PostgresConnectSpec.scala @@ -34,7 +34,7 @@ class PostgresConnectSpec extends BaseSpecWithPostgres with Matchers with Mockit assertEquals("custchannel", rs.getString("channel")) } - val resetConnection = postgresConnect.reset + val resetConnection: Unit = postgresConnect.reset() assertNotNull(resetConnection) postgresConnect.closeConnection() } diff --git a/framework/src/test/scala/org/sunbird/spec/RedisTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/RedisTestSpec.scala index a12543aa..9e897b74 100644 --- a/framework/src/test/scala/org/sunbird/spec/RedisTestSpec.scala +++ b/framework/src/test/scala/org/sunbird/spec/RedisTestSpec.scala @@ -17,34 +17,38 @@ class RedisTestSpec extends BaseSpec with Matchers with MockitoSugar { val redisConnection = new RedisConnect(baseConfig.redisHost, baseConfig.redisPort, baseConfig.redisConnectionTimeout) val status = redisConnection.getConnection(2) status.isConnected should be(true) + + val status2 = redisConnection.getConnection(2, 1000l) + status2.isConnected should be(true) } - "DedupEngine functionality" should "be able to identify if the key is unique or duplicate & it should able throw jedis excption for invalid action" in intercept[JedisException] { + "DedupEngine functionality" should "be able to identify if the key is unique or duplicate & it should able throw jedis exception for invalid action" in { val redisConnection = new RedisConnect(baseConfig.redisHost, baseConfig.redisPort, baseConfig.redisConnectionTimeout) val dedupEngine = new DedupEngine(redisConnection, 2, 200) - dedupEngine.getRedisConnection should not be (null) + dedupEngine.getRedisConnection should not be null dedupEngine.isUniqueEvent("key-1") should be(true) dedupEngine.storeChecksum("key-1") dedupEngine.isUniqueEvent("key-1") should be(false) - dedupEngine.isUniqueEvent(null) + a[JedisException] should be thrownBy {dedupEngine.isUniqueEvent(null)} dedupEngine.closeConnectionPool() } - it should "be able to reconnect when a jedis exception for invalid action is thrown" in intercept[JedisException] { + it should "be able to reconnect when a jedis exception for invalid action is thrown" in { val redisConnection = new RedisConnect(baseConfig.redisHost, baseConfig.redisPort, baseConfig.redisConnectionTimeout) val dedupEngine = new DedupEngine(redisConnection, 0, 4309535) dedupEngine.isUniqueEvent("event-id-3") should be(true) - dedupEngine.storeChecksum(null) - dedupEngine.getRedisConnection should not be(null) + a[JedisException] should be thrownBy {dedupEngine.storeChecksum(null)} + dedupEngine.getRedisConnection should not be null } - - "RestUtil functionality" should "be able to return response" in { val restUtil = new RestUtil() - val url = "https://httpbin.org/json"; - val response = restUtil.get(url); + val url = "https://httpbin.org/json" + val response = restUtil.get(url, Some(Map("x-auth" -> "123"))) response should not be null + + val response2 = 
restUtil.get("https://httpbin.org/json") + response2 should not be null } } \ No newline at end of file diff --git a/framework/src/test/scala/org/sunbird/spec/SerdeUtilTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/SerdeUtilTestSpec.scala new file mode 100644 index 00000000..74ac75f6 --- /dev/null +++ b/framework/src/test/scala/org/sunbird/spec/SerdeUtilTestSpec.scala @@ -0,0 +1,75 @@ +package org.sunbird.spec + +import org.apache.flink.util.Collector +import org.apache.kafka.clients.consumer.ConsumerRecord +import org.apache.kafka.common.record.TimestampType +import org.scalamock.matchers.ArgCapture.CaptureAll +import org.scalamock.scalatest.MockFactory +import org.scalatest.{FlatSpec, Matchers} +import org.sunbird.obsrv.core.serde.{MapDeserializationSchema, StringDeserializationSchema} + +import java.nio.charset.StandardCharsets +import scala.collection.mutable + +class SerdeUtilTestSpec extends FlatSpec with Matchers with MockFactory { + + + + "SerdeUtil" should "test all the map and string deserialization classes" in { + + val strCollector: Collector[String] = mock[Collector[String]] + val mapCollector: Collector[mutable.Map[String, AnyRef]] = mock[Collector[mutable.Map[String, AnyRef]]] + val key = "key1".getBytes(StandardCharsets.UTF_8) + val validEvent = """{"event":{"id":1234}}""".getBytes(StandardCharsets.UTF_8) + val eventWithObsrvMeta = """{"event":{"id":1234},"obsrv_meta":{}}""".getBytes(StandardCharsets.UTF_8) + val invalidEvent = """{"event":{"id":1234}""".getBytes(StandardCharsets.UTF_8) + + val validRecord = new ConsumerRecord[Array[Byte], Array[Byte]]("test-topic", 0, 1234l, 1701447470737l, TimestampType.CREATE_TIME, -1l, -1, -1, key, validEvent) + val validRecordWithObsrvMeta = new ConsumerRecord[Array[Byte], Array[Byte]]("test-topic", 0, 1234l, 1701447470737l, TimestampType.CREATE_TIME, -1l, -1, -1, key, eventWithObsrvMeta) + val invalidRecord = new ConsumerRecord[Array[Byte], Array[Byte]]("test-topic", 0, 1234l, 1701447470737l, TimestampType.CREATE_TIME, -1l, -1, -1, key, invalidEvent) + + + val sds = new StringDeserializationSchema() + (strCollector.collect _).expects("""{"event":{"id":1234}}""") + sds.deserialize(validRecord, strCollector) + + val c = CaptureAll[mutable.Map[String, AnyRef]]() + val mds = new MapDeserializationSchema() + mapCollector.collect _ expects capture(c) repeat 3 + mds.deserialize(validRecord, mapCollector) + mds.deserialize(validRecordWithObsrvMeta, mapCollector) + mds.deserialize(invalidRecord, mapCollector) + //(mapCollector.collect _).verify(*).once() + val validMsg: mutable.Map[String, AnyRef] = c.values.apply(0) + val validMsgWithObsrvMeta: mutable.Map[String, AnyRef] = c.values.apply(1) + val invalidMsg: mutable.Map[String, AnyRef] = c.values.apply(2) + Console.println("validMsg", validMsg) + validMsg.get("obsrv_meta").isDefined should be (true) + val validObsrvMeta = validMsg.get("obsrv_meta").get.asInstanceOf[Map[String, AnyRef]] + val validEventMsg = validMsg.get("event").get.asInstanceOf[Map[String, AnyRef]] + validObsrvMeta.get("syncts").get.asInstanceOf[Long] should be (1701447470737l) + validObsrvMeta.get("processingStartTime").get.asInstanceOf[Long] should be >= 1701447470737l + validEventMsg.get("id").get.asInstanceOf[Int] should be (1234) + + Console.println("validMsgWithObsrvMeta", validMsgWithObsrvMeta) + validMsgWithObsrvMeta.get("obsrv_meta").isDefined should be(true) + validMsgWithObsrvMeta.get("event").isDefined should be(true) + val validObsrvMeta2 = 
validMsgWithObsrvMeta.get("obsrv_meta").get.asInstanceOf[Map[String, AnyRef]] + val validEventMsg2 = validMsgWithObsrvMeta.get("event").get.asInstanceOf[Map[String, AnyRef]] + validObsrvMeta2.keys.size should be(0) + validEventMsg2.get("id").get.asInstanceOf[Int] should be (1234) + + Console.println("invalidMsg", invalidMsg) + invalidMsg.get("obsrv_meta").isDefined should be(true) + invalidMsg.get("event").isDefined should be(false) + val invalidObsrvMeta = invalidMsg.get("obsrv_meta").get.asInstanceOf[Map[String, AnyRef]] + val invalidEventMsg = invalidMsg.get("invalid_json").get.asInstanceOf[String] + invalidObsrvMeta.get("syncts").get.asInstanceOf[Long] should be(1701447470737l) + invalidObsrvMeta.get("processingStartTime").get.asInstanceOf[Long] should be >= 1701447470737l + invalidEventMsg should be("""{"event":{"id":1234}""") + } + + it should "test generic serialization schema" in { + + } +} diff --git a/framework/src/test/scala/org/sunbird/spec/TestMapStreamFunc.scala b/framework/src/test/scala/org/sunbird/spec/TestMapStreamFunc.scala index 3e06de54..457f26ba 100644 --- a/framework/src/test/scala/org/sunbird/spec/TestMapStreamFunc.scala +++ b/framework/src/test/scala/org/sunbird/spec/TestMapStreamFunc.scala @@ -1,16 +1,20 @@ package org.sunbird.spec -import scala.collection.mutable.Map import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.streaming.api.functions.ProcessFunction import org.sunbird.obsrv.core.cache.{DedupEngine, RedisConnect} -import org.sunbird.obsrv.core.model.ErrorConstants -import org.sunbird.obsrv.core.streaming.{BaseProcessFunction, Metrics, MetricsList} +import org.sunbird.obsrv.core.model.{Constants, ErrorConstants, Producer} +import org.sunbird.obsrv.core.streaming.{BaseDeduplication, BaseProcessFunction, Metrics, MetricsList} import org.sunbird.obsrv.core.util.JSONUtil +import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.atomic.AtomicLong +import scala.collection.mutable +import scala.collection.mutable.Map + class TestMapStreamFunc(config: BaseProcessTestMapConfig)(implicit val stringTypeInfo: TypeInformation[String]) - extends BaseProcessFunction[Map[String, AnyRef], Map[String, AnyRef]](config) { + extends BaseProcessFunction[Map[String, AnyRef], Map[String, AnyRef]](config) with BaseDeduplication { override def getMetricsList(): MetricsList = { MetricsList(List("ALL"), List(config.mapEventCount)) @@ -23,27 +27,32 @@ class TestMapStreamFunc(config: BaseProcessTestMapConfig)(implicit val stringTyp metrics.reset("ALL", config.mapEventCount) metrics.incCounter("ALL", config.mapEventCount) metrics.getAndReset("ALL", config.mapEventCount) - context.output(config.mapOutputTag, event) + assert(metrics.hasDataset("ALL")) + metrics.initDataset("d2", new ConcurrentHashMap[String, AtomicLong]()) + + context.output(config.mapOutputTag, mutable.Map(Constants.TOPIC -> config.kafkaMapOutputTopic, Constants.MESSAGE -> event)) - super.markSuccess(event, "test-job") - super.markFailed(event, ErrorConstants.NO_IMPLEMENTATION_FOUND, config.jobName) - super.markSkipped(event, config.jobName) + super.markSuccess(event, Producer.extractor) + super.markFailed(event, ErrorConstants.NO_IMPLEMENTATION_FOUND, Producer.extractor) + super.markSkipped(event, Producer.extractor) super.markComplete(event, None) + super.markPartial(event, Producer.extractor) assert(super.containsEvent(event)) + assert(!super.containsEvent(mutable.Map("test" -> "123".asInstanceOf[AnyRef]))) assert(!super.containsEvent(Map("dataset" -> "d1"))) val 
eventStr = JSONUtil.serialize(event) val code = JSONUtil.getKey("event.vehicleCode", eventStr).textValue() val redisConnection = new RedisConnect(config.redisHost, config.redisPort, config.redisConnectionTimeout) implicit val dedupEngine = new DedupEngine(redisConnection, 2, 200) - val isDup = super.isDuplicate("D1", Option("event.id"), eventStr, context, config) + val isDup = super.isDuplicate("D1", Option("event.id"), eventStr) code match { case "HYUN-CRE-D6" => assert(!isDup) case "HYUN-CRE-D7" => assert(isDup) } - assert(!super.isDuplicate("D1", None, eventStr, context, config)) - assert(!super.isDuplicate("D1", Option("mid"), eventStr, context, config)) - assert(!super.isDuplicate("D1", Option("event"), eventStr, context, config)) + assert(!super.isDuplicate("D1", None, eventStr)) + assert(!super.isDuplicate("D1", Option("mid"), eventStr)) + assert(!super.isDuplicate("D1", Option("event"), eventStr)) } } diff --git a/framework/src/test/scala/org/sunbird/spec/TestMapStreamTask.scala b/framework/src/test/scala/org/sunbird/spec/TestMapStreamTask.scala index b64f2468..09ce0c90 100644 --- a/framework/src/test/scala/org/sunbird/spec/TestMapStreamTask.scala +++ b/framework/src/test/scala/org/sunbird/spec/TestMapStreamTask.scala @@ -13,15 +13,17 @@ class TestMapStreamTask(config: BaseProcessTestMapConfig, kafkaConnector: FlinkK implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) val dataStream = getMapDataStream(env, config, kafkaConnector) processStream(dataStream) + val dataStream2 = getMapDataStream(env, config, List(config.inputTopic()), config.kafkaConsumerProperties(), config.inputConsumer(), kafkaConnector) env.execute(config.jobName) } override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { val stream = dataStream.process(new TestMapStreamFunc(config)) stream.getSideOutput(config.mapOutputTag) - .sinkTo(kafkaConnector.kafkaMapSink(config.kafkaMapOutputTopic)) + .sinkTo(kafkaConnector.kafkaMapDynamicSink()) .name("Map-Event-Producer") + addDefaultSinks(stream, config, kafkaConnector) stream.getSideOutput(config.mapOutputTag) } } diff --git a/framework/src/test/scala/org/sunbird/spec/TestStringStreamTask.scala b/framework/src/test/scala/org/sunbird/spec/TestStringStreamTask.scala index 116efbe3..5bc1cc1f 100644 --- a/framework/src/test/scala/org/sunbird/spec/TestStringStreamTask.scala +++ b/framework/src/test/scala/org/sunbird/spec/TestStringStreamTask.scala @@ -1,26 +1,24 @@ package org.sunbird.spec -import org.apache.flink.api.java.functions.KeySelector import org.apache.flink.api.scala.createTypeInformation import org.apache.flink.streaming.api.datastream.DataStream import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} import org.sunbird.obsrv.core.util.FlinkUtil -import scala.collection.mutable - class TestStringStreamTask(config: BaseProcessTestConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[String] { override def process(): Unit = { implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) val dataStream = getStringDataStream(env, config, kafkaConnector) processStream(dataStream) + val dataStream2 = getStringDataStream(env, config, List(config.inputTopic()), config.kafkaConsumerProperties(), config.inputConsumer(), kafkaConnector) env.execute(config.jobName) } override def processStream(dataStream: DataStream[String]): 
DataStream[String] = { val stream = dataStream.process(new TestStringStreamFunc(config)).name("TestStringEventStream") stream.getSideOutput(config.stringOutputTag) - .sinkTo(kafkaConnector.kafkaStringSink(config.kafkaStringOutputTopic)) + .sinkTo(kafkaConnector.kafkaSink[String](config.kafkaStringOutputTopic)) .name("String-Event-Producer") stream.getSideOutput(config.stringOutputTag) diff --git a/pipeline/denormalizer/pom.xml b/pipeline/denormalizer/pom.xml index f3462d71..2df98cd3 100644 --- a/pipeline/denormalizer/pom.xml +++ b/pipeline/denormalizer/pom.xml @@ -4,9 +4,6 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - - 3.0.1 - org.sunbird.obsrv @@ -45,6 +42,18 @@ dataset-registry 1.0.0 + + org.apache.kafka + kafka-clients + ${kafka.version} + test + + + org.apache.kafka + kafka_${scala.maj.version} + ${kafka.version} + test + org.sunbird.obsrv framework @@ -52,6 +61,13 @@ test-jar test + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + org.apache.flink flink-test-utils @@ -66,9 +82,21 @@ tests - it.ozimov + com.github.codemonstur embedded-redis - 0.7.1 + 1.0.0 + test + + + io.github.embeddedkafka + embedded-kafka_2.12 + 3.4.0 + test + + + io.zonky.test + embedded-postgres + 2.0.3 test diff --git a/pipeline/denormalizer/src/main/resources/de-normalization.conf b/pipeline/denormalizer/src/main/resources/de-normalization.conf index 1272058b..63b5bc9e 100644 --- a/pipeline/denormalizer/src/main/resources/de-normalization.conf +++ b/pipeline/denormalizer/src/main/resources/de-normalization.conf @@ -3,7 +3,7 @@ include "baseconfig.conf" kafka { input.topic = ${job.env}".unique" output.denorm.topic = ${job.env}".denorm" - output.denorm.failed.topic = ${job.env}".denorm.failed" + output.denorm.failed.topic = ${job.env}".failed" groupId = ${job.env}"-denormalizer-group" } diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala index d294009c..45a41c67 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala @@ -3,25 +3,28 @@ package org.sunbird.obsrv.denormalizer.functions import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.functions.ProcessFunction import org.slf4j.LoggerFactory -import org.sunbird.obsrv.core.exception.ObsrvException -import org.sunbird.obsrv.core.streaming.{BaseProcessFunction, Metrics, MetricsList} -import org.sunbird.obsrv.core.util.Util +import org.sunbird.obsrv.core.model.Models._ +import org.sunbird.obsrv.core.model.Producer.Producer +import org.sunbird.obsrv.core.model.StatusCode.StatusCode +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming.Metrics +import org.sunbird.obsrv.core.util.{JSONUtil, Util} import org.sunbird.obsrv.denormalizer.task.DenormalizerConfig -import org.sunbird.obsrv.denormalizer.util.DenormCache +import org.sunbird.obsrv.denormalizer.util.{DenormCache, DenormEvent} +import org.sunbird.obsrv.model.DatasetModels.Dataset import org.sunbird.obsrv.registry.DatasetRegistry +import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction import scala.collection.mutable -class DenormalizerFunction(config: DenormalizerConfig) - 
extends BaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]](config) { +class DenormalizerFunction(config: DenormalizerConfig) extends BaseDatasetProcessFunction(config) { private[this] val logger = LoggerFactory.getLogger(classOf[DenormalizerFunction]) private[this] var denormCache: DenormCache = _ - override def getMetricsList(): MetricsList = { - val metrics = List(config.denormSuccess, config.denormTotal, config.denormFailed, config.eventsSkipped) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + override def getMetrics(): List[String] = { + List(config.denormSuccess, config.denormTotal, config.denormFailed, config.eventsSkipped, config.denormPartialSuccess) } override def open(parameters: Configuration): Unit = { @@ -35,29 +38,64 @@ class DenormalizerFunction(config: DenormalizerConfig) denormCache.close() } - override def processElement(msg: mutable.Map[String, AnyRef], + override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { - val datasetId = msg(config.CONST_DATASET).asInstanceOf[String] // DatasetId cannot be empty at this stage - metrics.incCounter(datasetId, config.denormTotal) - val dataset = DatasetRegistry.getDataset(datasetId).get - val event = Util.getMutableMap(msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) - + metrics.incCounter(dataset.id, config.denormTotal) + denormCache.open(dataset) if (dataset.denormConfig.isDefined) { - try { - msg.put(config.CONST_EVENT, denormCache.denormEvent(datasetId, event, dataset.denormConfig.get.denormFields).toMap) - metrics.incCounter(datasetId, config.denormSuccess) - context.output(config.denormEventsTag, markSuccess(msg, config.jobName)) - } catch { - case ex: ObsrvException => - metrics.incCounter(datasetId, config.denormFailed) - context.output(config.denormFailedTag, markFailed(msg, ex.error, config.jobName)) + val event = DenormEvent(msg) + val denormEvent = denormCache.denormEvent(dataset.id, event, dataset.denormConfig.get.denormFields) + val status = getDenormStatus(denormEvent) + context.output(config.denormEventsTag, markStatus(denormEvent.msg, Producer.denorm, status)) + status match { + case StatusCode.success => metrics.incCounter(dataset.id, config.denormSuccess) + case _ => + metrics.incCounter(dataset.id, if (status == StatusCode.partial) config.denormPartialSuccess else config.denormFailed) + generateSystemEvent(dataset, denormEvent, context) + logData(dataset.id, denormEvent) } } else { - metrics.incCounter(datasetId, config.eventsSkipped) - context.output(config.denormEventsTag, markSkipped(msg, config.jobName)) + metrics.incCounter(dataset.id, config.eventsSkipped) + context.output(config.denormEventsTag, markSkipped(msg, Producer.denorm)) } } + private def logData(datasetId: String, denormEvent: DenormEvent): Unit = { + logger.warn(s"Denormalizer | Denorm operation is not successful | dataset=$datasetId | denormStatus=${JSONUtil.serialize(denormEvent.fieldStatus)}") + } + + private def generateSystemEvent(dataset: Dataset, denormEvent: DenormEvent, context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context): Unit = { + + denormEvent.fieldStatus.filter(f => !f._2.success).groupBy(f => f._2.error.get).map(f => (f._1, f._2.size)) + .foreach(f => { + val functionalError = f._1 match { + case ErrorConstants.DENORM_KEY_MISSING => FunctionalError.DenormKeyMissing + case 
ErrorConstants.DENORM_KEY_NOT_A_STRING_OR_NUMBER => FunctionalError.DenormKeyInvalid + case ErrorConstants.DENORM_DATA_NOT_FOUND => FunctionalError.DenormDataNotFound + } + context.output(config.systemEventsOutputTag, JSONUtil.serialize(SystemEvent( + EventID.METRIC, + ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(Producer.denorm)), dataset = Some(dataset.id), dataset_type = Some(dataset.datasetType)), + data = EData(error = Some(ErrorLog(pdata_id = Producer.denorm, pdata_status = StatusCode.failed, error_type = functionalError, error_code = f._1.errorCode, error_message = f._1.errorMsg, error_level = ErrorLevel.critical, error_count = Some(f._2)))) + ))) + }) + } + + private def getDenormStatus(denormEvent: DenormEvent): StatusCode = { + val totalFieldsCount = denormEvent.fieldStatus.size + val successCount = denormEvent.fieldStatus.values.count(f => f.success) + if (totalFieldsCount == successCount) StatusCode.success else if (successCount > 0) StatusCode.partial else StatusCode.failed + + } + + private def markStatus(event: mutable.Map[String, AnyRef], producer: Producer, status: StatusCode): mutable.Map[String, AnyRef] = { + val obsrvMeta = Util.getMutableMap(event("obsrv_meta").asInstanceOf[Map[String, AnyRef]]) + addFlags(obsrvMeta, Map(producer.toString -> status.toString)) + addTimespan(obsrvMeta, producer) + event.put("obsrv_meta", obsrvMeta.toMap) + event + } + } diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala index bf8ebf3d..ce603520 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala @@ -5,26 +5,29 @@ import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction import org.apache.flink.streaming.api.windowing.windows.TimeWindow import org.slf4j.LoggerFactory -import org.sunbird.obsrv.core.streaming.{Metrics, MetricsList, WindowBaseProcessFunction} +import org.sunbird.obsrv.core.model.Models._ +import org.sunbird.obsrv.core.model.Producer.Producer +import org.sunbird.obsrv.core.model.StatusCode.StatusCode +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming.Metrics +import org.sunbird.obsrv.core.util.{JSONUtil, Util} import org.sunbird.obsrv.denormalizer.task.DenormalizerConfig import org.sunbird.obsrv.denormalizer.util._ import org.sunbird.obsrv.model.DatasetModels.Dataset import org.sunbird.obsrv.registry.DatasetRegistry +import org.sunbird.obsrv.streaming.BaseDatasetWindowProcessFunction -import java.lang -import scala.collection.JavaConverters._ import scala.collection.mutable class DenormalizerWindowFunction(config: DenormalizerConfig)(implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]]) - extends WindowBaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String](config) { + extends BaseDatasetWindowProcessFunction(config) { private[this] val logger = LoggerFactory.getLogger(classOf[DenormalizerWindowFunction]) private[this] var denormCache: DenormCache = _ - override def getMetricsList(): MetricsList = { - val metrics = List(config.denormSuccess, config.denormTotal, config.denormFailed, config.eventsSkipped) 
- MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + override def getMetrics(): List[String] = { + List(config.denormSuccess, config.denormTotal, config.denormFailed, config.eventsSkipped, config.denormPartialSuccess) } override def open(parameters: Configuration): Unit = { @@ -38,21 +41,20 @@ class DenormalizerWindowFunction(config: DenormalizerConfig)(implicit val eventT denormCache.close() } - override def process(datasetId: String, context: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context, elements: lang.Iterable[mutable.Map[String, AnyRef]], metrics: Metrics): Unit = { + override def processWindow(dataset: Dataset, context: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context, elements: List[mutable.Map[String, AnyRef]], metrics: Metrics): Unit = { - val eventsList = elements.asScala.toList - metrics.incCounter(datasetId, config.denormTotal, eventsList.size.toLong) - val dataset = DatasetRegistry.getDataset(datasetId).get - val denormEvents = eventsList.map(msg => { - DenormEvent(msg, None, None) + metrics.incCounter(dataset.id, config.denormTotal, elements.size.toLong) + denormCache.open(dataset) + val denormEvents = elements.map(msg => { + DenormEvent(msg) }) if (dataset.denormConfig.isDefined) { denormalize(denormEvents, dataset, metrics, context) } else { - metrics.incCounter(datasetId, config.eventsSkipped, eventsList.size.toLong) - eventsList.foreach(msg => { - context.output(config.denormEventsTag, markSkipped(msg, config.jobName)) + metrics.incCounter(dataset.id, config.eventsSkipped, elements.size.toLong) + elements.foreach(msg => { + context.output(config.denormEventsTag, markSkipped(msg, Producer.denorm)) }) } } @@ -61,16 +63,54 @@ class DenormalizerWindowFunction(config: DenormalizerConfig)(implicit val eventT context: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context): Unit = { val datasetId = dataset.id - val denormEvents = denormCache.denormMultipleEvents(datasetId, events, dataset.denormConfig.get.denormFields) denormEvents.foreach(denormEvent => { - if (denormEvent.error.isEmpty) { - metrics.incCounter(datasetId, config.denormSuccess) - context.output(config.denormEventsTag, markSuccess(denormEvent.msg, config.jobName)) - } else { - metrics.incCounter(datasetId, config.denormFailed) - context.output(config.denormFailedTag, markFailed(denormEvent.msg, denormEvent.error.get, config.jobName)) + val status = getDenormStatus(denormEvent) + context.output(config.denormEventsTag, markStatus(denormEvent.msg, Producer.denorm, status.toString)) + status match { + case StatusCode.success => metrics.incCounter(dataset.id, config.denormSuccess) + case _ => + metrics.incCounter(dataset.id, if (status == StatusCode.partial) config.denormPartialSuccess else config.denormFailed) + generateSystemEvent(dataset, denormEvent, context) + logData(dataset.id, denormEvent) } }) } + + private def logData(datasetId: String, denormEvent: DenormEvent): Unit = { + logger.warn(s"Denormalizer | Denorm operation is not successful | dataset=$datasetId | denormStatus=${JSONUtil.serialize(denormEvent.fieldStatus)}") + } + + private def generateSystemEvent(dataset: Dataset, denormEvent: DenormEvent, context: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context): Unit = { + + denormEvent.fieldStatus.filter(f => !f._2.success).groupBy(f => 
f._2.error.get).map(f => (f._1, f._2.size)) + .foreach(f => { + val functionalError = f._1 match { + case ErrorConstants.DENORM_KEY_MISSING => FunctionalError.DenormKeyMissing + case ErrorConstants.DENORM_KEY_NOT_A_STRING_OR_NUMBER => FunctionalError.DenormKeyInvalid + case ErrorConstants.DENORM_DATA_NOT_FOUND => FunctionalError.DenormDataNotFound + } + context.output(config.systemEventsOutputTag, JSONUtil.serialize(SystemEvent( + EventID.METRIC, + ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(Producer.denorm)), dataset = Some(dataset.id), dataset_type = Some(dataset.datasetType)), + data = EData(error = Some(ErrorLog(pdata_id = Producer.denorm, pdata_status = StatusCode.failed, error_type = functionalError, error_code = f._1.errorCode, error_message = f._1.errorMsg, error_level = ErrorLevel.critical, error_count = Some(f._2)))) + ))) + }) + + } + + private def getDenormStatus(denormEvent: DenormEvent): StatusCode = { + val totalFieldsCount = denormEvent.fieldStatus.size + val successCount = denormEvent.fieldStatus.values.count(f => f.success) + if (totalFieldsCount == successCount) StatusCode.success else if (successCount > 0) StatusCode.partial else StatusCode.failed + + } + + private def markStatus(event: mutable.Map[String, AnyRef], producer: Producer, status: String): mutable.Map[String, AnyRef] = { + val obsrvMeta = Util.getMutableMap(event("obsrv_meta").asInstanceOf[Map[String, AnyRef]]) + addFlags(obsrvMeta, Map(producer.toString -> status)) + addTimespan(obsrvMeta, producer) + event.put("obsrv_meta", obsrvMeta.toMap) + event + } } \ No newline at end of file diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala index 1d24793c..118c0307 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala @@ -5,9 +5,10 @@ import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.java.typeutils.TypeExtractor import org.apache.flink.streaming.api.scala.OutputTag import org.sunbird.obsrv.core.streaming.BaseJobConfig + import scala.collection.mutable -class DenormalizerConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "DenormalizerJob" ) { +class DenormalizerConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "DenormalizerJob") { private val serialVersionUID = 2905979434303791379L @@ -17,23 +18,20 @@ class DenormalizerConfig(override val config: Config) extends BaseJobConfig[muta // Kafka Topics Configuration val kafkaInputTopic: String = config.getString("kafka.input.topic") val denormOutputTopic: String = config.getString("kafka.output.denorm.topic") - val denormFailedTopic: String = config.getString("kafka.output.denorm.failed.topic") // Windows val windowTime: Int = config.getInt("task.window.time.in.seconds") val windowCount: Int = config.getInt("task.window.count") val DENORM_EVENTS_PRODUCER = "denorm-events-producer" - val DENORM_FAILED_EVENTS_PRODUCER = "denorm-failed-events-producer" private val DENORM_EVENTS = "denorm_events" - private val FAILED_EVENTS = "denorm_failed_events" val denormEventsTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](DENORM_EVENTS) - val 
denormFailedTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](FAILED_EVENTS) - val eventsSkipped = "events-skipped" + val eventsSkipped = "denorm-skipped" val denormFailed = "denorm-failed" + val denormPartialSuccess = "denorm-partial-success" val denormSuccess = "denorm-success" val denormTotal = "denorm-total" @@ -46,5 +44,6 @@ class DenormalizerConfig(override val config: Config) extends BaseJobConfig[muta override def inputTopic(): String = kafkaInputTopic override def inputConsumer(): String = denormalizationConsumer override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = denormEventsTag + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") -} +} \ No newline at end of file diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerStreamTask.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerStreamTask.scala index 9d620a8f..b23cd612 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerStreamTask.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerStreamTask.scala @@ -22,22 +22,25 @@ class DenormalizerStreamTask(config: DenormalizerConfig, kafkaConnector: FlinkKa def process(): Unit = { implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) - val dataStream = getMapDataStream(env, config, kafkaConnector) - processStream(dataStream) + process(env) env.execute(config.jobName) } // $COVERAGE-ON$ + def process(env: StreamExecutionEnvironment): Unit = { + val dataStream = getMapDataStream(env, config, kafkaConnector) + processStream(dataStream) + } + override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { val denormStream = dataStream .process(new DenormalizerFunction(config)).name(config.denormalizationFunction).uid(config.denormalizationFunction) .setParallelism(config.downstreamOperatorsParallelism) - denormStream.getSideOutput(config.denormEventsTag).sinkTo(kafkaConnector.kafkaMapSink(config.denormOutputTopic)) + denormStream.getSideOutput(config.denormEventsTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.denormOutputTopic)) .name(config.DENORM_EVENTS_PRODUCER).uid(config.DENORM_EVENTS_PRODUCER).setParallelism(config.downstreamOperatorsParallelism) - denormStream.getSideOutput(config.denormFailedTag).sinkTo(kafkaConnector.kafkaMapSink(config.denormFailedTopic)) - .name(config.DENORM_FAILED_EVENTS_PRODUCER).uid(config.DENORM_FAILED_EVENTS_PRODUCER).setParallelism(config.downstreamOperatorsParallelism) + addDefaultSinks(denormStream, config, kafkaConnector) denormStream.getSideOutput(config.successTag()) } } diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerWindowStreamTask.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerWindowStreamTask.scala index 81fb2368..73b6c6ec 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerWindowStreamTask.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerWindowStreamTask.scala @@ -9,7 +9,7 @@ import org.apache.flink.streaming.api.datastream.WindowedStream import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment import 
org.apache.flink.streaming.api.windowing.time.Time import org.apache.flink.streaming.api.windowing.windows.TimeWindow -import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.streaming.{BaseStreamTaskSink, FlinkKafkaConnector} import org.sunbird.obsrv.core.util.{DatasetKeySelector, FlinkUtil, TumblingProcessingTimeCountWindows} import org.sunbird.obsrv.denormalizer.functions.DenormalizerWindowFunction @@ -19,15 +19,21 @@ import scala.collection.mutable /** * Denormalization stream task does the following pipeline processing in a sequence: */ -class DenormalizerWindowStreamTask(config: DenormalizerConfig, kafkaConnector: FlinkKafkaConnector) { +class DenormalizerWindowStreamTask(config: DenormalizerConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTaskSink[mutable.Map[String, AnyRef]] { private val serialVersionUID = -7729362727131516112L + // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster def process(): Unit = { implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) - implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + process(env) + env.execute(config.jobName) + } + // $COVERAGE-ON$ + def process(env: StreamExecutionEnvironment): Unit = { + implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) val source = kafkaConnector.kafkaMapSource(config.inputTopic()) val windowedStream: WindowedStream[mutable.Map[String, AnyRef], String, TimeWindow] = env.fromSource(source, WatermarkStrategy.noWatermarks[mutable.Map[String, AnyRef]](), config.denormalizationConsumer).uid(config.denormalizationConsumer) .setParallelism(config.kafkaConsumerParallelism).rebalance() @@ -35,16 +41,15 @@ class DenormalizerWindowStreamTask(config: DenormalizerConfig, kafkaConnector: F .window(TumblingProcessingTimeCountWindows.of(Time.seconds(config.windowTime), config.windowCount)) val denormStream = windowedStream - .process(new DenormalizerWindowFunction(config)).name(config.denormalizationFunction).uid(config.denormalizationFunction) - .setParallelism(config.downstreamOperatorsParallelism) + .process(new DenormalizerWindowFunction(config)).name(config.denormalizationFunction).uid(config.denormalizationFunction) + .setParallelism(config.downstreamOperatorsParallelism) - denormStream.getSideOutput(config.denormEventsTag).sinkTo(kafkaConnector.kafkaMapSink(config.denormOutputTopic)) + denormStream.getSideOutput(config.denormEventsTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.denormOutputTopic)) .name(config.DENORM_EVENTS_PRODUCER).uid(config.DENORM_EVENTS_PRODUCER).setParallelism(config.downstreamOperatorsParallelism) - denormStream.getSideOutput(config.denormFailedTag).sinkTo(kafkaConnector.kafkaMapSink(config.denormFailedTopic)) - .name(config.DENORM_FAILED_EVENTS_PRODUCER).uid(config.DENORM_FAILED_EVENTS_PRODUCER).setParallelism(config.downstreamOperatorsParallelism) - env.execute(config.jobName) + addDefaultSinks(denormStream, config, kafkaConnector) } + } // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala index 2e3aa3a1..dd94a251 100644 --- 
a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala @@ -1,8 +1,6 @@ package org.sunbird.obsrv.denormalizer.util -import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.cache.RedisConnect -import org.sunbird.obsrv.core.exception.ObsrvException import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.model.ErrorConstants.Error import org.sunbird.obsrv.core.util.{JSONUtil, Util} @@ -12,11 +10,12 @@ import redis.clients.jedis.{Pipeline, Response} import scala.collection.mutable -case class DenormEvent(msg: mutable.Map[String, AnyRef], var responses: Option[mutable.Map[String, Response[String]]], var error: Option[Error]) +case class DenormFieldStatus(fieldValue: String, var success: Boolean, var error: Option[Error]) + +case class DenormEvent(msg: mutable.Map[String, AnyRef], var responses: Option[mutable.Map[String, Response[String]]] = None, var fieldStatus: mutable.Map[String, DenormFieldStatus] = mutable.Map[String, DenormFieldStatus]()) class DenormCache(val config: DenormalizerConfig) { - private[this] val logger = LoggerFactory.getLogger(classOf[DenormCache]) private val datasetPipelineMap: mutable.Map[String, Pipeline] = mutable.Map[String, Pipeline]() def close(): Unit = { @@ -25,25 +24,42 @@ class DenormCache(val config: DenormalizerConfig) { def open(datasets: List[Dataset]): Unit = { datasets.map(dataset => { - if (dataset.denormConfig.isDefined) { - val denormConfig = dataset.denormConfig.get - val redisConnect = new RedisConnect(denormConfig.redisDBHost, denormConfig.redisDBPort, config.redisConnectionTimeout) - val pipeline: Pipeline = redisConnect.getConnection(0).pipelined() - datasetPipelineMap.put(dataset.id, pipeline) - } + open(dataset) }) } - def denormEvent(datasetId: String, event: mutable.Map[String, AnyRef], denormFieldConfigs: List[DenormFieldConfig]): mutable.Map[String, AnyRef] = { - val pipeline = this.datasetPipelineMap(datasetId) - pipeline.clear() + def open(dataset: Dataset): Unit = { + if (!datasetPipelineMap.contains(dataset.id) && dataset.denormConfig.isDefined) { + val denormConfig = dataset.denormConfig.get + val redisConnect = new RedisConnect(denormConfig.redisDBHost, denormConfig.redisDBPort, config.redisConnectionTimeout) + val pipeline: Pipeline = redisConnect.getConnection(0).pipelined() + datasetPipelineMap.put(dataset.id, pipeline) + } + } + + private def processDenorm(denormEvent: DenormEvent, pipeline: Pipeline, denormFieldConfigs: List[DenormFieldConfig]): Unit = { + val responses: mutable.Map[String, Response[String]] = mutable.Map[String, Response[String]]() + val fieldStatus: mutable.Map[String, DenormFieldStatus] = mutable.Map[String, DenormFieldStatus]() + val event = Util.getMutableMap(denormEvent.msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) val eventStr = JSONUtil.serialize(event) denormFieldConfigs.foreach(fieldConfig => { - responses.put(fieldConfig.denormOutField, getFromCache(pipeline, fieldConfig, eventStr)) + val denormFieldStatus = extractField(fieldConfig, eventStr) + fieldStatus.put(fieldConfig.denormOutField, denormFieldStatus) + if (!denormFieldStatus.fieldValue.isBlank) { + responses.put(fieldConfig.denormOutField, getFromCache(pipeline, denormFieldStatus.fieldValue, fieldConfig)) + } }) + denormEvent.fieldStatus = fieldStatus + denormEvent.responses = Some(responses) + } + + def denormEvent(datasetId: String, denormEvent: DenormEvent, 
denormFieldConfigs: List[DenormFieldConfig]): DenormEvent = { + val pipeline = this.datasetPipelineMap(datasetId) + pipeline.clear() + processDenorm(denormEvent, pipeline, denormFieldConfigs) pipeline.sync() - updateEvent(event, responses) + updateEvent(denormEvent) } def denormMultipleEvents(datasetId: String, events: List[DenormEvent], denormFieldConfigs: List[DenormFieldConfig]): List[DenormEvent] = { @@ -51,62 +67,50 @@ class DenormCache(val config: DenormalizerConfig) { pipeline.clear() events.foreach(denormEvent => { - val responses: mutable.Map[String, Response[String]] = mutable.Map[String, Response[String]]() - val event = Util.getMutableMap(denormEvent.msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) - val eventStr = JSONUtil.serialize(event) - try { - denormFieldConfigs.foreach(fieldConfig => { - responses.put(fieldConfig.denormOutField, getFromCache(pipeline, fieldConfig, eventStr)) - }) - denormEvent.responses = Some(responses) - } catch { - case ex: ObsrvException => - logger.error("DenormCache:denormMultipleEvents() - Exception", ex) - denormEvent.error = Some(ex.error) - } + processDenorm(denormEvent, pipeline, denormFieldConfigs) }) pipeline.sync() updateMultipleEvents(events) } - private def getFromCache(pipeline: Pipeline, fieldConfig: DenormFieldConfig, eventStr: String): Response[String] = { - pipeline.select(fieldConfig.redisDB) + private def extractField(fieldConfig: DenormFieldConfig, eventStr: String): DenormFieldStatus = { val denormFieldNode = JSONUtil.getKey(fieldConfig.denormKey, eventStr) if (denormFieldNode.isMissingNode) { - throw new ObsrvException(ErrorConstants.DENORM_KEY_MISSING) - } - if (!denormFieldNode.isTextual) { - throw new ObsrvException(ErrorConstants.DENORM_KEY_NOT_A_STRING) + DenormFieldStatus("", success = false, Some(ErrorConstants.DENORM_KEY_MISSING)) + } else { + if (denormFieldNode.isTextual || denormFieldNode.isNumber) { + DenormFieldStatus(denormFieldNode.asText(), success = false, None) + } else { + DenormFieldStatus("", success = false, Some(ErrorConstants.DENORM_KEY_NOT_A_STRING_OR_NUMBER)) + } } - val denormField = denormFieldNode.asText() + } + + private def getFromCache(pipeline: Pipeline, denormField: String, fieldConfig: DenormFieldConfig): Response[String] = { + pipeline.select(fieldConfig.redisDB) pipeline.get(denormField) } - private def updateEvent(event: mutable.Map[String, AnyRef], responses: mutable.Map[String, Response[String]]): mutable.Map[String, AnyRef] = { + private def updateEvent(denormEvent: DenormEvent): DenormEvent = { - responses.map(f => { + val event = Util.getMutableMap(denormEvent.msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) + denormEvent.responses.get.foreach(f => { if (f._2.get() != null) { + denormEvent.fieldStatus(f._1).success = true event.put(f._1, JSONUtil.deserialize[Map[String, AnyRef]](f._2.get())) + } else { + denormEvent.fieldStatus(f._1).error = Some(ErrorConstants.DENORM_DATA_NOT_FOUND) } }) - event + denormEvent.msg.put(config.CONST_EVENT, event.toMap) + denormEvent } private def updateMultipleEvents(events: List[DenormEvent]): List[DenormEvent] = { events.map(denormEvent => { - if (denormEvent.responses.isDefined) { - val event = Util.getMutableMap(denormEvent.msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) - denormEvent.responses.get.map(f => { - if (f._2.get() != null) { - event.put(f._1, JSONUtil.deserialize[Map[String, AnyRef]](f._2.get())) - } - }) - denormEvent.msg.put(config.CONST_EVENT, event.toMap) - } - denormEvent + updateEvent(denormEvent) }) } 
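+  // Aside (not part of the original patch): a minimal sketch of the pipelined lookup pattern used
+  // above, assuming the Jedis Pipeline API already imported in this file and a hypothetical
+  // `lookups` map of redis DB -> key. All reads are queued first, a single sync() flushes them in
+  // one round trip, and only then are the Response values materialised.
+  private def pipelinedLookup(pipeline: Pipeline, lookups: Map[Int, String]): Map[String, Option[String]] = {
+    val queued = lookups.map { case (db, key) =>
+      pipeline.select(db)      // switch the logical DB before queuing the read
+      key -> pipeline.get(key) // Response[String], resolved only after sync()
+    }
+    pipeline.sync()            // flush every queued command at once
+    queued.map { case (key, response) => key -> Option(response.get()) }
+  }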
- } \ No newline at end of file diff --git a/pipeline/denormalizer/src/test/resources/test.conf b/pipeline/denormalizer/src/test/resources/test.conf index 441d01bf..f7f61beb 100644 --- a/pipeline/denormalizer/src/test/resources/test.conf +++ b/pipeline/denormalizer/src/test/resources/test.conf @@ -3,12 +3,12 @@ include "base-test.conf" kafka { input.topic = "flink.unique" output.denorm.topic = "flink.denorm" - output.denorm.failed.topic = "flink.denorm.failed" + output.denorm.failed.topic = "flink.failed" groupId = "flink-denormalizer-group" } task { - window.time.in.seconds = 5 + window.time.in.seconds = 2 window.count = 30 window.shards = 1400 consumer.parallelism = 1 diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala new file mode 100644 index 00000000..bd9658eb --- /dev/null +++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala @@ -0,0 +1,176 @@ +package org.sunbird.obsrv.denormalizer + +import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} +import org.apache.flink.configuration.Configuration +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.test.util.MiniClusterWithClientResource +import org.apache.kafka.common.serialization.StringDeserializer +import org.scalatest.Matchers._ +import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.Models.SystemEvent +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} +import org.sunbird.obsrv.denormalizer.task.{DenormalizerConfig, DenormalizerStreamTask} +import org.sunbird.obsrv.denormalizer.util.DenormCache +import org.sunbird.obsrv.model.DatasetModels._ +import org.sunbird.obsrv.model.DatasetStatus +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry + +import scala.collection.mutable +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ + +class DenormalizerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { + + val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() + .setConfiguration(testConfiguration()) + .setNumberSlotsPerTaskManager(1) + .setNumberTaskManagers(1) + .build) + + val denormConfig = new DenormalizerConfig(config) + val redisPort: Int = denormConfig.redisPort + val kafkaConnector = new FlinkKafkaConnector(denormConfig) + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9093, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + + def testConfiguration(): Configuration = { + val config = new Configuration() + config.setString("metrics.reporter", "job_metrics_reporter") + config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) + config + } + + override def beforeAll(): Unit = { + 
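+    // Aside (not part of the original patch): BaseMetricsReporter used in this spec is assumed to be
+    // an in-memory Flink MetricReporter registered through "metrics.reporter.job_metrics_reporter.class"
+    // above. A minimal sketch of such a reporter, built only on the stock Flink metrics SPI
+    // (class name, key format and gauge type are illustrative, not the project's actual implementation):
+    import org.apache.flink.metrics.{Gauge, Metric, MetricConfig, MetricGroup}
+    import org.apache.flink.metrics.reporter.MetricReporter
+    class InMemoryGaugeReporter extends MetricReporter {
+      val gauges = scala.collection.concurrent.TrieMap[String, Gauge[Long]]()
+      override def open(config: MetricConfig): Unit = {}
+      override def close(): Unit = {}
+      override def notifyOfAddedMetric(metric: Metric, name: String, group: MetricGroup): Unit = metric match {
+        case g: Gauge[_] => gauges.put(group.getMetricIdentifier(name), g.asInstanceOf[Gauge[Long]]) // capture gauges only
+        case _ => // counters, meters and histograms are ignored in this sketch
+      }
+      override def notifyOfRemovedMetric(metric: Metric, name: String, group: MetricGroup): Unit = {
+        gauges.remove(group.getMetricIdentifier(name))
+      }
+    }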
super.beforeAll() + BaseMetricsReporter.gaugeMetrics.clear() + EmbeddedKafka.start()(embeddedKafkaConfig) + val postgresConnect = new PostgresConnect(postgresConfig) + insertTestData(postgresConnect) + postgresConnect.closeConnection() + createTestTopics() + publishMessagesToKafka() + flinkCluster.before() + } + + private def publishMessagesToKafka(): Unit = { + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.SUCCESS_DENORM) + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.SKIP_DENORM) + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.DENORM_MISSING_KEYS) + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.DENORM_MISSING_DATA_AND_INVALIDKEY) + } + + private def insertTestData(postgresConnect: PostgresConnect): Unit = { + postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"denorm_key":"dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';") + val redisConnection = new RedisConnect(denormConfig.redisHost, denormConfig.redisPort, denormConfig.redisConnectionTimeout) + redisConnection.getConnection(3).set("HYUN-CRE-D6", EventFixture.DENORM_DATA_1) + redisConnection.getConnection(4).set("D123", EventFixture.DENORM_DATA_2) + } + + override def afterAll(): Unit = { + val redisConnection = new RedisConnect(denormConfig.redisHost, denormConfig.redisPort, denormConfig.redisConnectionTimeout) + redisConnection.getConnection(3).flushAll() + redisConnection.getConnection(4).flushAll() + + super.afterAll() + flinkCluster.after() + EmbeddedKafka.stop() + } + + def createTestTopics(): Unit = { + List( + config.getString("kafka.output.system.event.topic"), config.getString("kafka.output.denorm.topic"), config.getString("kafka.input.topic") + ).foreach(EmbeddedKafka.createCustomTopic(_)) + } + + "DenormalizerStreamTaskTestSpec" should "validate the denorm stream task" in { + + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(denormConfig) + val task = new DenormalizerStreamTask(denormConfig, kafkaConnector) + task.process(env) + Future { + env.execute(denormConfig.jobName) + } + + val outputs = EmbeddedKafka.consumeNumberMessagesFrom[String](denormConfig.denormOutputTopic, 4, timeout = 30.seconds) + validateOutputs(outputs) + + val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](denormConfig.kafkaSystemTopic, 3, timeout = 30.seconds) + validateSystemEvents(systemEvents) + + val mutableMetricsMap = mutable.Map[String, Long]() + BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + Console.println("### DenormalizerStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + validateMetrics(mutableMetricsMap) + } + + it should "validate dynamic cache creation within DenormCache" in { + val denormCache = new DenormCache(denormConfig) + noException should be thrownBy { + denormCache.open(Dataset(id = "d123", datasetType = "dataset", extractionConfig = None, dedupConfig = None, validationConfig = None, jsonSchema = None, + denormConfig = Some(DenormConfig(redisDBHost = "localhost", redisDBPort = redisPort, denormFields = List(DenormFieldConfig(denormKey = "vehicleCode", redisDB = 3, denormOutField 
= "vehicle_data")))), routerConfig = RouterConfig(""), + datasetConfig = DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest"), status = DatasetStatus.Live)) + } + } + + private def validateOutputs(outputs: List[String]): Unit = { + outputs.size should be(4) + outputs.zipWithIndex.foreach { + case (elem, idx) => + val msg = JSONUtil.deserialize[Map[String, AnyRef]](elem) + val event = JSONUtil.serialize(msg(Constants.EVENT)) + idx match { + case 0 => event should be("""{"vehicle_data":{"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"},"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","dealer_data":{"code":"D123","name":"KUN United","licenseNumber":"1234124","authorized":"yes"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + case 1 => event should be("""{"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + case 2 => event should be("""{"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"id":"2345","date":"2023-03-01","dealer_data":{"code":"D123","name":"KUN United","licenseNumber":"1234124","authorized":"yes"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + case 3 => event should be("""{"dealer":{"dealerCode":"D124","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":["HYUN-CRE-D7"],"id":"4567","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + } + } + } + + private def validateSystemEvents(systemEvents: List[String]): Unit = { + systemEvents.size should be(3) + systemEvents.foreach(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + event.etype should be(EventID.METRIC) + event.ctx.module should be(ModuleID.processing) + event.ctx.pdata.id should be(denormConfig.jobName) + event.ctx.pdata.`type` should be(PDataType.flink) + event.ctx.pdata.pid.get should be(Producer.denorm) + event.data.error.isDefined should be(true) + val errorLog = event.data.error.get + errorLog.error_level should be(ErrorLevel.critical) + errorLog.pdata_id should be(Producer.denorm) + errorLog.pdata_status should be(StatusCode.failed) + errorLog.error_count.get should be(1) + errorLog.error_code match { + case ErrorConstants.DENORM_KEY_MISSING.errorCode => + errorLog.error_type should be(FunctionalError.DenormKeyMissing) + case ErrorConstants.DENORM_KEY_NOT_A_STRING_OR_NUMBER.errorCode => + errorLog.error_type should be(FunctionalError.DenormKeyInvalid) + case ErrorConstants.DENORM_DATA_NOT_FOUND.errorCode => + errorLog.error_type should be(FunctionalError.DenormDataNotFound) + } + }) + } + + private def validateMetrics(mutableMetricsMap: mutable.Map[String, Long]): Unit = { + mutableMetricsMap(s"${denormConfig.jobName}.d1.${denormConfig.denormTotal}") should be(3) + mutableMetricsMap(s"${denormConfig.jobName}.d1.${denormConfig.denormFailed}") should be(1) + mutableMetricsMap(s"${denormConfig.jobName}.d1.${denormConfig.denormSuccess}") should be(1) + mutableMetricsMap(s"${denormConfig.jobName}.d1.${denormConfig.denormPartialSuccess}") should be(1) + 
mutableMetricsMap(s"${denormConfig.jobName}.d2.${denormConfig.denormTotal}") should be(1) + mutableMetricsMap(s"${denormConfig.jobName}.d2.${denormConfig.eventsSkipped}") should be(1) + } + +} diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala new file mode 100644 index 00000000..5c3a5b86 --- /dev/null +++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala @@ -0,0 +1,204 @@ +package org.sunbird.obsrv.denormalizer + +import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} +import org.apache.flink.configuration.Configuration +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.test.util.MiniClusterWithClientResource +import org.apache.kafka.common.serialization.StringDeserializer +import org.scalatest.Matchers._ +import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.Models.SystemEvent +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} +import org.sunbird.obsrv.denormalizer.task.{DenormalizerConfig, DenormalizerWindowStreamTask} +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry + +import scala.collection.mutable +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ + +class DenormalizerWindowStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { + + val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() + .setConfiguration(testConfiguration()) + .setNumberSlotsPerTaskManager(1) + .setNumberTaskManagers(1) + .build) + + val denormConfig = new DenormalizerConfig(config) + val redisPort: Int = denormConfig.redisPort + val kafkaConnector = new FlinkKafkaConnector(denormConfig) + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9093, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + + def testConfiguration(): Configuration = { + val config = new Configuration() + config.setString("metrics.reporter", "job_metrics_reporter") + config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) + config + } + + override def beforeAll(): Unit = { + super.beforeAll() + BaseMetricsReporter.gaugeMetrics.clear() + EmbeddedKafka.start()(embeddedKafkaConfig) + val postgresConnect = new PostgresConnect(postgresConfig) + insertTestData(postgresConnect) + postgresConnect.closeConnection() + createTestTopics() + publishMessagesToKafka() + flinkCluster.before() + } + + private def publishMessagesToKafka(): Unit = { + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.SUCCESS_DENORM) + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.SKIP_DENORM) + 
EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.DENORM_MISSING_KEYS) + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.DENORM_MISSING_DATA_AND_INVALIDKEY) + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.INVALID_DATASET_ID) + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.MISSING_EVENT_KEY) + } + + private def insertTestData(postgresConnect: PostgresConnect): Unit = { + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());") + postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"denorm_key":"dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';") + val redisConnection = new RedisConnect(denormConfig.redisHost, denormConfig.redisPort, denormConfig.redisConnectionTimeout) + redisConnection.getConnection(3).set("HYUN-CRE-D6", EventFixture.DENORM_DATA_1) + redisConnection.getConnection(4).set("D123", EventFixture.DENORM_DATA_2) + } + + override def afterAll(): Unit = { + val redisConnection = new RedisConnect(denormConfig.redisHost, denormConfig.redisPort, denormConfig.redisConnectionTimeout) + redisConnection.getConnection(3).flushAll() + redisConnection.getConnection(4).flushAll() + + super.afterAll() + flinkCluster.after() + EmbeddedKafka.stop() + } + + def createTestTopics(): Unit = { + List( + config.getString("kafka.output.system.event.topic"), config.getString("kafka.output.denorm.topic"), config.getString("kafka.input.topic") + ).foreach(EmbeddedKafka.createCustomTopic(_)) + } + + "DenormalizerWindowStreamTaskTestSpec" should "validate the denorm window stream task" in { + + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(denormConfig) + val task = new DenormalizerWindowStreamTask(denormConfig, 
kafkaConnector) + task.process(env) + Future { + env.execute(denormConfig.jobName) + } + + val outputs = EmbeddedKafka.consumeNumberMessagesFrom[String](denormConfig.denormOutputTopic, 4, timeout = 30.seconds) + validateOutputs(outputs) + + val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](denormConfig.kafkaSystemTopic, 5, timeout = 30.seconds) + validateSystemEvents(systemEvents) + + val mutableMetricsMap = mutable.Map[String, Long]() + BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + Console.println("### DenormalizerStreamWindowTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + validateMetrics(mutableMetricsMap) + } + + private def validateOutputs(outputs: List[String]): Unit = { + outputs.size should be(4) + outputs.zipWithIndex.foreach { + case (elem, idx) => + //TODO: Add validations for obsrv_meta + val msg = JSONUtil.deserialize[Map[String, AnyRef]](elem) + val event = JSONUtil.serialize(msg(Constants.EVENT)) + idx match { + case 0 => event should be("""{"vehicle_data":{"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"},"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","dealer_data":{"code":"D123","name":"KUN United","licenseNumber":"1234124","authorized":"yes"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + case 1 => event should be("""{"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"id":"2345","date":"2023-03-01","dealer_data":{"code":"D123","name":"KUN United","licenseNumber":"1234124","authorized":"yes"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + case 2 => event should be("""{"dealer":{"dealerCode":"D124","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":["HYUN-CRE-D7"],"id":"4567","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + case 3 => event should be("""{"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + } + + } + } + + private def validateSystemEvents(systemEvents: List[String]): Unit = { + systemEvents.size should be(5) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + Producer.validator.equals(event.ctx.pdata.pid.get) + }) should be (2) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + FunctionalError.MissingEventData.equals(event.data.error.get.error_type) + }) should be(1) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + Producer.denorm.equals(event.ctx.pdata.pid.get) + }) should be(3) + + systemEvents.foreach(se => { + val event = JSONUtil.deserialize[SystemEvent](se) + if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) + event.ctx.dataset_type should be(None) + else + event.ctx.dataset_type.getOrElse("dataset") should be("dataset") + }) + + systemEvents.foreach(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + event.etype should be(EventID.METRIC) + event.ctx.module should 
be(ModuleID.processing) + event.ctx.pdata.id should be(denormConfig.jobName) + event.ctx.pdata.`type` should be(PDataType.flink) + event.data.error.isDefined should be(true) + val errorLog = event.data.error.get + errorLog.error_level should be(ErrorLevel.critical) + errorLog.pdata_status should be(StatusCode.failed) + errorLog.error_count.get should be(1) + errorLog.error_code match { + case ErrorConstants.DENORM_KEY_MISSING.errorCode => + event.ctx.pdata.pid.get should be(Producer.denorm) + errorLog.pdata_id should be(Producer.denorm) + errorLog.error_type should be(FunctionalError.DenormKeyMissing) + case ErrorConstants.DENORM_KEY_NOT_A_STRING_OR_NUMBER.errorCode => + event.ctx.pdata.pid.get should be(Producer.denorm) + errorLog.pdata_id should be(Producer.denorm) + errorLog.error_type should be(FunctionalError.DenormKeyInvalid) + case ErrorConstants.DENORM_DATA_NOT_FOUND.errorCode => + event.ctx.pdata.pid.get should be(Producer.denorm) + errorLog.pdata_id should be(Producer.denorm) + errorLog.error_type should be(FunctionalError.DenormDataNotFound) + case ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode => + event.ctx.pdata.pid.get should be(Producer.validator) + errorLog.pdata_id should be(Producer.validator) + errorLog.error_type should be(FunctionalError.MissingDatasetId) + case ErrorConstants.EVENT_MISSING.errorCode => + event.ctx.pdata.pid.get should be(Producer.validator) + errorLog.pdata_id should be(Producer.validator) + errorLog.error_type should be(FunctionalError.MissingEventData) + } + }) + } + + private def validateMetrics(mutableMetricsMap: mutable.Map[String, Long]): Unit = { + mutableMetricsMap(s"${denormConfig.jobName}.d1.${denormConfig.denormTotal}") should be(3) + mutableMetricsMap(s"${denormConfig.jobName}.d1.${denormConfig.denormFailed}") should be(1) + mutableMetricsMap(s"${denormConfig.jobName}.d1.${denormConfig.denormSuccess}") should be(1) + mutableMetricsMap(s"${denormConfig.jobName}.d1.${denormConfig.denormPartialSuccess}") should be(1) + mutableMetricsMap(s"${denormConfig.jobName}.d2.${denormConfig.denormTotal}") should be(1) + mutableMetricsMap(s"${denormConfig.jobName}.d2.${denormConfig.eventsSkipped}") should be(1) + mutableMetricsMap(s"${denormConfig.jobName}.d3.${denormConfig.eventFailedMetricsCount}") should be(1) + mutableMetricsMap(s"${denormConfig.jobName}.dxyz.${denormConfig.eventFailedMetricsCount}") should be(1) + } + +} diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/EventFixture.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/EventFixture.scala new file mode 100644 index 00000000..0b1a0b01 --- /dev/null +++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/EventFixture.scala @@ -0,0 +1,15 @@ +package org.sunbird.obsrv.denormalizer + +object EventFixture { + + val SUCCESS_DENORM = """{"dataset":"d1","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val SKIP_DENORM = """{"dataset":"d2","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + + val DENORM_MISSING_KEYS = 
"""{"dataset":"d1","event":{"id":"2345","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val DENORM_MISSING_DATA_AND_INVALIDKEY = """{"dataset":"d1","event":{"id":"4567","vehicleCode":["HYUN-CRE-D7"],"date":"2023-03-01","dealer":{"dealerCode":"D124","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val INVALID_DATASET_ID = """{"dataset":"dxyz","event":{"id":"4567","vehicleCode":["HYUN-CRE-D7"],"date":"2023-03-01","dealer":{"dealerCode":"D124","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val MISSING_EVENT_KEY = """{"dataset":"d3","event1":{"id":"4567","vehicleCode":["HYUN-CRE-D7"],"date":"2023-03-01","dealer":{"dealerCode":"D124","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + + val DENORM_DATA_1 = """{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel"}""" + val DENORM_DATA_2 = """{"code":"D123","name":"KUN United","licenseNumber":"1234124","authorized":"yes"}""" +} \ No newline at end of file diff --git a/pipeline/druid-router/pom.xml b/pipeline/druid-router/pom.xml index 4945f84d..41e2e390 100644 --- a/pipeline/druid-router/pom.xml +++ b/pipeline/druid-router/pom.xml @@ -4,9 +4,6 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - - 3.0.1 - org.sunbird.obsrv @@ -45,13 +42,6 @@ dataset-registry 1.0.0 - - org.sunbird.obsrv - framework - 1.0.0 - test-jar - test - com.github.java-json-tools json-schema-validator @@ -76,6 +66,32 @@ guava 32.1.2-jre + + org.sunbird.obsrv + framework + 1.0.0 + test-jar + test + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + + + org.apache.kafka + kafka-clients + ${kafka.version} + test + + + org.apache.kafka + kafka_${scala.maj.version} + ${kafka.version} + test + org.apache.flink flink-test-utils @@ -90,9 +106,21 @@ tests - it.ozimov + com.github.codemonstur embedded-redis - 0.7.1 + 1.0.0 + test + + + io.github.embeddedkafka + embedded-kafka_2.12 + 3.4.0 + test + + + io.zonky.test + embedded-postgres + 2.0.3 test @@ -166,7 +194,7 @@ - org.sunbird.obsrv.router.task.DruidRouterStreamTask + org.sunbird.obsrv.router.task.DynamicRouterStreamTask diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DruidRouterFunction.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DruidRouterFunction.scala index 0cd9e4e1..d1f2c5e6 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DruidRouterFunction.scala +++ b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DruidRouterFunction.scala @@ -13,6 +13,8 @@ import org.sunbird.obsrv.router.task.DruidRouterConfig import scala.collection.mutable +// $COVERAGE-OFF$ Disabling scoverage as the below function is deprecated +@Deprecated class DruidRouterFunction(config: DruidRouterConfig) extends BaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]](config) { 
private[this] val logger = LoggerFactory.getLogger(classOf[DruidRouterFunction]) @@ -33,18 +35,24 @@ class DruidRouterFunction(config: DruidRouterConfig) extends BaseProcessFunction override def processElement(msg: mutable.Map[String, AnyRef], ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { - - implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) - val datasetId = msg(config.CONST_DATASET).asInstanceOf[String] // DatasetId cannot be empty at this stage - metrics.incCounter(datasetId, config.routerTotalCount) - val dataset = DatasetRegistry.getDataset(datasetId).get - val event = Util.getMutableMap(msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) - event.put(config.CONST_OBSRV_META, msg(config.CONST_OBSRV_META)) - val routerConfig = dataset.routerConfig - ctx.output(OutputTag[mutable.Map[String, AnyRef]](routerConfig.topic), event) - metrics.incCounter(datasetId, config.routerSuccessCount) - - msg.remove(config.CONST_EVENT) - ctx.output(config.statsOutputTag, markComplete(msg, dataset.dataVersion)) + try { + implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + val datasetId = msg(config.CONST_DATASET).asInstanceOf[String] // DatasetId cannot be empty at this stage + metrics.incCounter(datasetId, config.routerTotalCount) + val dataset = DatasetRegistry.getDataset(datasetId).get + val event = Util.getMutableMap(msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) + event.put(config.CONST_OBSRV_META, msg(config.CONST_OBSRV_META)) + val routerConfig = dataset.routerConfig + ctx.output(OutputTag[mutable.Map[String, AnyRef]](routerConfig.topic), event) + metrics.incCounter(datasetId, config.routerSuccessCount) + + msg.remove(config.CONST_EVENT) + ctx.output(config.statsOutputTag, markComplete(msg, dataset.dataVersion)) + } catch { + case ex: Exception => + logger.error("DruidRouterFunction:processElement() - Exception: ", ex.getMessage) + ex.printStackTrace() + } } } +// $COVERAGE-ON$ \ No newline at end of file diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala new file mode 100644 index 00000000..7b91b19f --- /dev/null +++ b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala @@ -0,0 +1,115 @@ +package org.sunbird.obsrv.router.functions + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.JsonNodeType +import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.functions.ProcessFunction +import org.joda.time.format.DateTimeFormat +import org.joda.time.{DateTime, DateTimeZone} +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.{Constants, ErrorConstants, FunctionalError, Producer} +import org.sunbird.obsrv.core.streaming.Metrics +import org.sunbird.obsrv.core.util.{JSONUtil, Util} +import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig} +import org.sunbird.obsrv.router.task.DruidRouterConfig +import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction + +import java.util.TimeZone +import scala.collection.mutable + +case class TimestampKey(isValid: Boolean, value: AnyRef) + +class DynamicRouterFunction(config: DruidRouterConfig) 
extends BaseDatasetProcessFunction(config) { + + private[this] val logger = LoggerFactory.getLogger(classOf[DynamicRouterFunction]) + + override def open(parameters: Configuration): Unit = { + super.open(parameters) + } + + override def close(): Unit = { + super.close() + } + + override def getMetrics(): List[String] = { + List(config.routerTotalCount, config.routerSuccessCount) + } + + override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], + ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, + metrics: Metrics): Unit = { + + metrics.incCounter(dataset.id, config.routerTotalCount) + val event = Util.getMutableMap(msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) + val tsKeyData = TimestampKeyParser.parseTimestampKey(dataset.datasetConfig, event) + if (tsKeyData.isValid) { + event.put(config.CONST_OBSRV_META, msg(config.CONST_OBSRV_META).asInstanceOf[Map[String, AnyRef]] ++ Map("indexTS" -> tsKeyData.value)) + val routerConfig = dataset.routerConfig + val topicEventMap = mutable.Map(Constants.TOPIC -> routerConfig.topic, Constants.MESSAGE -> event) + ctx.output(config.routerOutputTag, topicEventMap) + metrics.incCounter(dataset.id, config.routerSuccessCount) + markCompletion(dataset, super.markComplete(event, dataset.dataVersion), ctx, Producer.router) + } else { + markFailure(Some(dataset.id), msg, ctx, metrics, ErrorConstants.INDEX_KEY_MISSING_OR_BLANK, Producer.router, FunctionalError.MissingTimestampKey) + } + } + +} + +object TimestampKeyParser { + + def parseTimestampKey(datasetConfig: DatasetConfig, event: mutable.Map[String, AnyRef]): TimestampKey = { + val indexKey = datasetConfig.tsKey + val node = JSONUtil.getKey(indexKey, JSONUtil.serialize(event)) + node.getNodeType match { + case JsonNodeType.NUMBER => onNumber(datasetConfig, node) + case JsonNodeType.STRING => onText(datasetConfig, node) + case _ => TimestampKey(isValid = false, null) + } + } + + private def onNumber(datasetConfig: DatasetConfig, node: JsonNode): TimestampKey = { + val length = node.asText().length + val value = node.numberValue().longValue() + // TODO: [P3] Crude implementation. Checking if the epoch timestamp format is one of seconds, milli-seconds, micro-second and nano-seconds. 
Find a elegant approach + if (length == 10 || length == 13 || length == 16 || length == 19) { + val tfValue:Long = if (length == 10) (value * 1000).longValue() else if (length == 16) (value / 1000).longValue() else if (length == 19) (value / 1000000).longValue() else value + TimestampKey(isValid = true, addTimeZone(datasetConfig, new DateTime(tfValue)).asInstanceOf[AnyRef]) + } else { + TimestampKey(isValid = false, 0.asInstanceOf[AnyRef]) + } + } + + private def onText(datasetConfig: DatasetConfig, node: JsonNode): TimestampKey = { + val value = node.textValue() + if (datasetConfig.tsFormat.isDefined) { + parseDateTime(datasetConfig, value) + } else { + TimestampKey(isValid = true, value) + } + } + + private def parseDateTime(datasetConfig: DatasetConfig, value: String): TimestampKey = { + try { + datasetConfig.tsFormat.get match { + case "epoch" => TimestampKey(isValid = true, addTimeZone(datasetConfig, new DateTime(value.toLong)).asInstanceOf[AnyRef]) + case _ => + val dtf = DateTimeFormat.forPattern(datasetConfig.tsFormat.get) + TimestampKey(isValid = true, addTimeZone(datasetConfig, dtf.parseDateTime(value)).asInstanceOf[AnyRef]) + } + } catch { + case _: Exception => TimestampKey(isValid = false, null) + } + } + + private def addTimeZone(datasetConfig: DatasetConfig, dateTime: DateTime): Long = { + if (datasetConfig.datasetTimezone.isDefined) { + val tz = DateTimeZone.forTimeZone(TimeZone.getTimeZone(datasetConfig.datasetTimezone.get)) + val offsetInMilliseconds = tz.getOffset(dateTime) + dateTime.plusMillis(offsetInMilliseconds).getMillis + } else { + dateTime.getMillis + } + } + +} \ No newline at end of file diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala index b67267a4..31106b00 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala +++ b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala @@ -22,6 +22,7 @@ class DruidRouterConfig(override val config: Config) extends BaseJobConfig[mutab val routerSuccessCount = "router-success-count" val statsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("processing_stats") + val routerOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("druid-routing-output") // Functions val druidRouterFunction = "DruidRouterFunction" @@ -41,4 +42,6 @@ class DruidRouterConfig(override val config: Config) extends BaseJobConfig[mutab override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = { statsOutputTag } + + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") } diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala index bff7b644..b77e110a 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala +++ b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala @@ -1,7 +1,6 @@ package org.sunbird.obsrv.router.task import com.typesafe.config.ConfigFactory -import org.apache.flink.api.common.eventtime.WatermarkStrategy import org.apache.flink.api.common.typeinfo.TypeInformation import 
org.apache.flink.api.java.typeutils.TypeExtractor import org.apache.flink.api.java.utils.ParameterTool @@ -19,19 +18,18 @@ import scala.collection.mutable /** * Druid Router stream task routes every event into its respective topic configured at dataset level */ - +// $COVERAGE-OFF$ Disabling scoverage as this stream task is deprecated +@Deprecated class DruidRouterStreamTask(config: DruidRouterConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { private val serialVersionUID = 146697324640926024L - // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster def process(): Unit = { implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) val dataStream = getMapDataStream(env, config, kafkaConnector) processStream(dataStream) env.execute(config.jobName) } - // $COVERAGE-ON$ override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { @@ -42,20 +40,22 @@ class DruidRouterStreamTask(config: DruidRouterConfig, kafkaConnector: FlinkKafk .setParallelism(config.downstreamOperatorsParallelism) datasets.map(dataset => { routerStream.getSideOutput(OutputTag[mutable.Map[String, AnyRef]](dataset.routerConfig.topic)) - .sinkTo(kafkaConnector.kafkaMapSink(dataset.routerConfig.topic)) + .sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](dataset.routerConfig.topic)) .name(dataset.id + "-" + config.druidRouterProducer).uid(dataset.id + "-" + config.druidRouterProducer) .setParallelism(config.downstreamOperatorsParallelism) }) - routerStream.getSideOutput(config.statsOutputTag).sinkTo(kafkaConnector.kafkaMapSink(config.kafkaStatsTopic)) + routerStream.getSideOutput(config.statsOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaStatsTopic)) .name(config.processingStatsProducer).uid(config.processingStatsProducer).setParallelism(config.downstreamOperatorsParallelism) + addDefaultSinks(routerStream, config, kafkaConnector) routerStream.getSideOutput(config.successTag()) } } - +// $COVERAGE-ON$ // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster +@Deprecated object DruidRouterStreamTask { def main(args: Array[String]): Unit = { diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala new file mode 100644 index 00000000..9e17a974 --- /dev/null +++ b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala @@ -0,0 +1,66 @@ +package org.sunbird.obsrv.router.task + +import com.typesafe.config.ConfigFactory +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.api.java.utils.ParameterTool +import org.apache.flink.streaming.api.datastream.DataStream +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} +import org.sunbird.obsrv.core.util.FlinkUtil +import org.sunbird.obsrv.router.functions.DynamicRouterFunction + +import java.io.File +import scala.collection.mutable + +/** + * Druid Router stream task routes every event into its respective topic configured at dataset level + */ + +class DynamicRouterStreamTask(config: DruidRouterConfig, kafkaConnector: FlinkKafkaConnector) extends 
BaseStreamTask[mutable.Map[String, AnyRef]] { + + private val serialVersionUID = 146697324640926024L + + // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster + def process(): Unit = { + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) + process(env) + env.execute(config.jobName) + } + // $COVERAGE-ON$ + + def process(env: StreamExecutionEnvironment): Unit = { + val dataStream = getMapDataStream(env, config, kafkaConnector) + processStream(dataStream) + } + + override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { + + implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + + val routerStream = dataStream.process(new DynamicRouterFunction(config)).name(config.druidRouterFunction).uid(config.druidRouterFunction) + .setParallelism(config.downstreamOperatorsParallelism) + + routerStream.getSideOutput(config.routerOutputTag).sinkTo(kafkaConnector.kafkaMapDynamicSink()) + .name(config.druidRouterProducer).uid(config.druidRouterProducer).setParallelism(config.downstreamOperatorsParallelism) + + addDefaultSinks(routerStream, config, kafkaConnector) + routerStream.getSideOutput(config.successTag()) + } +} + +// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster +object DynamicRouterStreamTask { + + def main(args: Array[String]): Unit = { + val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) + val config = configFilePath.map { + path => ConfigFactory.parseFile(new File(path)).resolve() + }.getOrElse(ConfigFactory.load("druid-router.conf").withFallback(ConfigFactory.systemEnvironment())) + val druidRouterConfig = new DruidRouterConfig(config) + val kafkaUtil = new FlinkKafkaConnector(druidRouterConfig) + val task = new DynamicRouterStreamTask(druidRouterConfig, kafkaUtil) + task.process() + } +} +// $COVERAGE-ON$ \ No newline at end of file diff --git a/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala b/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala new file mode 100644 index 00000000..34cd47a4 --- /dev/null +++ b/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala @@ -0,0 +1,162 @@ +package org.sunbird.obsrv.router + +import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} +import org.apache.flink.configuration.Configuration +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.test.util.MiniClusterWithClientResource +import org.apache.kafka.common.serialization.StringDeserializer +import org.scalatest.Matchers._ +import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.model.Models.SystemEvent +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} +import org.sunbird.obsrv.router.task.{DruidRouterConfig, DynamicRouterStreamTask} +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry + +import scala.collection.mutable +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ + +class 
DynamicRouterStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { + + val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() + .setConfiguration(testConfiguration()) + .setNumberSlotsPerTaskManager(1) + .setNumberTaskManagers(1) + .build) + + val routerConfig = new DruidRouterConfig(config) + val kafkaConnector = new FlinkKafkaConnector(routerConfig) + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9093, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + + def testConfiguration(): Configuration = { + val config = new Configuration() + config.setString("metrics.reporter", "job_metrics_reporter") + config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) + config + } + + override def beforeAll(): Unit = { + super.beforeAll() + BaseMetricsReporter.gaugeMetrics.clear() + EmbeddedKafka.start()(embeddedKafkaConfig) + val postgresConnect = new PostgresConnect(postgresConfig) + insertTestData(postgresConnect) + postgresConnect.closeConnection() + createTestTopics() + publishMessagesToKafka() + flinkCluster.before() + } + + private def publishMessagesToKafka(): Unit = { + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.SUCCESS_EVENT) + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.FAILED_EVENT) + } + + private def insertTestData(postgresConnect: PostgresConnect): Unit = { + postgresConnect.execute("update datasets set dataset_config = '" + """{"data_key":"id","timestamp_key":"date1","entry_topic":"ingest"}""" + "' where id='d2';") + + } + + override def afterAll(): Unit = { + + super.afterAll() + flinkCluster.after() + EmbeddedKafka.stop() + } + + def createTestTopics(): Unit = { + List( + routerConfig.kafkaSystemTopic, routerConfig.kafkaInputTopic, "d1-events", routerConfig.kafkaFailedTopic + ).foreach(EmbeddedKafka.createCustomTopic(_)) + } + + "DynamicRouterStreamTaskTestSpec" should "validate the router stream task" in { + + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(routerConfig) + val task = new DynamicRouterStreamTask(routerConfig, kafkaConnector) + task.process(env) + Future { + env.execute(routerConfig.jobName) + } + + val outputs = EmbeddedKafka.consumeNumberMessagesFrom[String]("d1-events", 1, timeout = 30.seconds) + validateOutputs(outputs) + + val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](routerConfig.kafkaFailedTopic, 1, timeout = 30.seconds) + validateFailedEvents(failedEvents) + + val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](routerConfig.kafkaSystemTopic, 2, timeout = 30.seconds) + validateSystemEvents(systemEvents) + + val mutableMetricsMap = mutable.Map[String, Long]() + BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + Console.println("### DynamicRouterStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + validateMetrics(mutableMetricsMap) + } + + private def validateOutputs(outputs: List[String]): Unit = { + outputs.size should be(1) + Console.println("Output", outputs.head) + } + + private def 
validateFailedEvents(failedEvents: List[String]): Unit = { + failedEvents.size should be(1) + Console.println("Output", failedEvents.head) + } + + private def validateSystemEvents(systemEvents: List[String]): Unit = { + systemEvents.size should be(2) + + systemEvents.foreach(se => { + val event = JSONUtil.deserialize[SystemEvent](se) + if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) + event.ctx.dataset_type should be(None) + else + event.ctx.dataset_type.getOrElse("dataset") should be("dataset") + }) + + systemEvents.foreach(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + event.etype should be(EventID.METRIC) + event.ctx.module should be(ModuleID.processing) + event.ctx.pdata.id should be(routerConfig.jobName) + event.ctx.pdata.`type` should be(PDataType.flink) + event.ctx.pdata.pid.get should be(Producer.router) + if(event.data.error.isDefined) { + val errorLog = event.data.error.get + errorLog.error_level should be(ErrorLevel.critical) + errorLog.pdata_id should be(Producer.router) + errorLog.pdata_status should be(StatusCode.failed) + errorLog.error_count.get should be(1) + errorLog.error_code should be(ErrorConstants.INDEX_KEY_MISSING_OR_BLANK.errorCode) + errorLog.error_message should be(ErrorConstants.INDEX_KEY_MISSING_OR_BLANK.errorMsg) + errorLog.error_type should be(FunctionalError.MissingTimestampKey) + } else { + event.data.pipeline_stats.isDefined should be (true) + event.data.pipeline_stats.get.latency_time.isDefined should be (true) + event.data.pipeline_stats.get.processing_time.isDefined should be (true) + event.data.pipeline_stats.get.total_processing_time.isDefined should be (true) + } + + }) + } + + private def validateMetrics(mutableMetricsMap: mutable.Map[String, Long]): Unit = { + mutableMetricsMap(s"${routerConfig.jobName}.d1.${routerConfig.routerTotalCount}") should be(1) + mutableMetricsMap(s"${routerConfig.jobName}.d1.${routerConfig.routerSuccessCount}") should be(1) + mutableMetricsMap(s"${routerConfig.jobName}.d2.${routerConfig.routerTotalCount}") should be(1) + mutableMetricsMap(s"${routerConfig.jobName}.d2.${routerConfig.eventFailedMetricsCount}") should be(1) + } + +} diff --git a/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala b/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala new file mode 100644 index 00000000..7856b0cc --- /dev/null +++ b/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala @@ -0,0 +1,7 @@ +package org.sunbird.obsrv.router + +object EventFixture { + + val SUCCESS_EVENT = """{"dataset":"d1","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val FAILED_EVENT = """{"dataset":"d2","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" +} diff --git a/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala b/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala new file mode 100644 index 00000000..f35567f0 --- /dev/null +++ b/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala @@ -0,0 +1,124 @@ +package org.sunbird.obsrv.router + +import 
org.scalatest.{FlatSpec, Matchers} +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.DatasetConfig +import org.sunbird.obsrv.router.functions.TimestampKeyParser + +import scala.collection.mutable + +class TestTimestampKeyParser extends FlatSpec with Matchers { + + "TimestampKeyParser" should "validate all scenarios of timestamp key in number format" in { + + + // Validate text date field without providing dateformat and timezone + val result1 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) + result1.isValid should be (true) + result1.value.asInstanceOf[String] should be ("2023-03-01") + + // Validate missing timestamp key scenario + val result2 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date1", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) + result2.isValid should be(false) + result2.value should be(null) + + // Validate number date field which is not epoch + val result3 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":20232201}""")) + result3.isValid should be(false) + result3.value.asInstanceOf[Int] should be(0) + + // Validate number date field which is epoch in seconds + val result4 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165}""")) + result4.isValid should be(true) + result4.value.asInstanceOf[Long] should be(1701373165000l) + + // Validate number date field which is epoch in milli-seconds + val result5 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123}""")) + result5.isValid should be(true) + result5.value.asInstanceOf[Long] should be(1701373165123l) + + // Validate number date field which is epoch in micro-seconds + val result6 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123111}""")) + result6.isValid should be(true) + result6.value.asInstanceOf[Long] should be(1701373165123l) + + // Validate number date field which is epoch in nano-seconds + val result7 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = 
"id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123111000}""")) + result7.isValid should be(true) + result7.value.asInstanceOf[Long] should be(1701373165123l) + + // Validate number date field which is not an epoch in milli, micro or nano seconds + val result8 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":170137316512}""")) + result8.isValid should be(false) + result8.value.asInstanceOf[Int] should be(0) + + // Validate number date field which is an epoch with timezone present + val result9 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123}""")) + result9.isValid should be(true) + result9.value.asInstanceOf[Long] should be(1701392965123l) + } + + it should "validate all scenarios of timestamp key in text format" in { + + // Validate epoch data in text format + val result1 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("epoch"), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"1701373165123"}""")) + result1.isValid should be(true) + result1.value.asInstanceOf[Long] should be(1701392965123l) + + // Validate invalid epoch data in text format (would reset to millis from 1970-01-01 if not epoch in millis) + val result2 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("epoch"), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"170137316512"}""")) + result2.isValid should be(true) + result2.value.asInstanceOf[Long] should be(170157116512l) + + // Validate date parser without timezone + val result3 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("yyyy-MM-dd"), datasetTimezone = None), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) + result3.isValid should be(true) + result3.value.asInstanceOf[Long] should be(1677609000000l) + + // Validate date parser with timezone + val result4 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("yyyy-MM-dd"), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) + 
result4.isValid should be(true) + result4.value.asInstanceOf[Long] should be(1677628800000l) + + // Validate date parser with date time in nano seconds + val result5 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS"), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01T12:45:32.123456789"}""")) + result5.isValid should be(true) + result5.value.asInstanceOf[Long] should be(1677674732123l) + + // Validate date parser with data in invalid format + val result6 = TimestampKeyParser.parseTimestampKey( + DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("yyyy-MM-dd'T'HH:mm:ss.SSS"), datasetTimezone = Some("GMT+05:30")), + JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01T12:45:32.123456"}""")) + result6.isValid should be(false) + result6.value should be(null) + } + +} \ No newline at end of file diff --git a/pipeline/extractor/pom.xml b/pipeline/extractor/pom.xml index b73d697d..a85354d6 100644 --- a/pipeline/extractor/pom.xml +++ b/pipeline/extractor/pom.xml @@ -47,6 +47,18 @@ framework 1.0.0 + + org.apache.kafka + kafka-clients + ${kafka.version} + test + + + org.apache.kafka + kafka_${scala.maj.version} + ${kafka.version} + test + org.sunbird.obsrv framework @@ -54,6 +66,13 @@ test-jar test + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + org.apache.flink flink-test-utils @@ -68,21 +87,22 @@ tests - it.ozimov + com.github.codemonstur embedded-redis - 0.7.1 + 1.0.0 test - - - com.google.guava - guava - - - com.google.guava - guava - 32.1.2-jre + io.github.embeddedkafka + embedded-kafka_2.12 + 3.4.0 + test + + + io.zonky.test + embedded-postgres + 2.0.3 + test org.apache.flink diff --git a/pipeline/extractor/src/main/resources/extractor.conf b/pipeline/extractor/src/main/resources/extractor.conf index a406d3c6..103649d0 100644 --- a/pipeline/extractor/src/main/resources/extractor.conf +++ b/pipeline/extractor/src/main/resources/extractor.conf @@ -3,9 +3,8 @@ include "baseconfig.conf" kafka { input.topic = ${job.env}".ingest" output.raw.topic = ${job.env}".raw" - output.extractor.duplicate.topic = ${job.env}".extractor.duplicate" - output.failed.topic = ${job.env}".failed" - output.batch.failed.topic = ${job.env}".extractor.failed" + output.extractor.duplicate.topic = ${job.env}".failed" + output.batch.failed.topic = ${job.env}".failed" event.max.size = "1048576" # Max is only 1MB groupId = ${job.env}"-extractor-group" producer { diff --git a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala index f8f4520c..f1fea9fb 100644 --- a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala +++ b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala @@ -2,12 +2,14 @@ package org.sunbird.obsrv.extractor.functions import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.functions.ProcessFunction +import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.cache.{DedupEngine, RedisConnect} import 
org.sunbird.obsrv.core.exception.ObsrvException -import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.model.ErrorConstants.Error -import org.sunbird.obsrv.core.model.Models.{PData, SystemEvent} -import org.sunbird.obsrv.core.streaming.{BaseProcessFunction, Metrics, MetricsList} +import org.sunbird.obsrv.core.model.FunctionalError.FunctionalError +import org.sunbird.obsrv.core.model.Models._ +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming.{BaseDeduplication, BaseProcessFunction, Metrics, MetricsList} import org.sunbird.obsrv.core.util.Util.getMutableMap import org.sunbird.obsrv.core.util.{JSONUtil, Util} import org.sunbird.obsrv.extractor.task.ExtractorConfig @@ -16,44 +18,55 @@ import org.sunbird.obsrv.registry.DatasetRegistry import scala.collection.mutable -class ExtractionFunction(config: ExtractorConfig, @transient var dedupEngine: DedupEngine = null) - extends BaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]](config) { +class ExtractionFunction(config: ExtractorConfig) + extends BaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]](config) with BaseDeduplication { + + @transient private var dedupEngine: DedupEngine = null + private[this] val logger = LoggerFactory.getLogger(classOf[ExtractionFunction]) override def getMetricsList(): MetricsList = { - val metrics = List(config.successEventCount, config.systemEventCount, config.failedEventCount, config.failedExtractionCount, + val metrics = List(config.successEventCount, config.systemEventCount, config.eventFailedMetricsCount, config.failedExtractionCount, config.skippedExtractionCount, config.duplicateExtractionCount, config.totalEventCount, config.successExtractionCount) MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) } override def open(parameters: Configuration): Unit = { super.open(parameters) - if (dedupEngine == null) { - val redisConnect = new RedisConnect(config.redisHost, config.redisPort, config.redisConnectionTimeout) - dedupEngine = new DedupEngine(redisConnect, config.dedupStore, config.cacheExpiryInSeconds) - } + val redisConnect = new RedisConnect(config.redisHost, config.redisPort, config.redisConnectionTimeout) + dedupEngine = new DedupEngine(redisConnect, config.dedupStore, config.cacheExpiryInSeconds) } override def processElement(batchEvent: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { metrics.incCounter(config.defaultDatasetID, config.totalEventCount) - val datasetId = batchEvent.get(config.CONST_DATASET) - if (datasetId.isEmpty) { + if (batchEvent.contains(Constants.INVALID_JSON)) { + context.output(config.failedBatchEventOutputTag, markBatchFailed(batchEvent, ErrorConstants.ERR_INVALID_EVENT)) + metrics.incCounter(config.defaultDatasetID, config.eventFailedMetricsCount) + context.output(config.systemEventsOutputTag, failedSystemEvent(Some(config.defaultDatasetID), ErrorConstants.ERR_INVALID_EVENT, FunctionalError.InvalidJsonData)) + return + } + val eventAsText = JSONUtil.serialize(batchEvent) + val datasetIdOpt = batchEvent.get(config.CONST_DATASET) + if (datasetIdOpt.isEmpty) { context.output(config.failedBatchEventOutputTag, markBatchFailed(batchEvent, ErrorConstants.MISSING_DATASET_ID)) - metrics.incCounter(config.defaultDatasetID, config.failedExtractionCount) + metrics.incCounter(config.defaultDatasetID, config.eventFailedMetricsCount) + 
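// Missing dataset id: emit a failed system event (MissingDatasetId) for observability before short-circuiting +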
context.output(config.systemEventsOutputTag, failedSystemEvent(Some(config.defaultDatasetID), ErrorConstants.MISSING_DATASET_ID, FunctionalError.MissingDatasetId)) return } - val datasetOpt = DatasetRegistry.getDataset(datasetId.get.asInstanceOf[String]) + val datasetId = datasetIdOpt.get.asInstanceOf[String] + metrics.incCounter(datasetId, config.totalEventCount) + val datasetOpt = DatasetRegistry.getDataset(datasetId) if (datasetOpt.isEmpty) { context.output(config.failedBatchEventOutputTag, markBatchFailed(batchEvent, ErrorConstants.MISSING_DATASET_CONFIGURATION)) - metrics.incCounter(config.defaultDatasetID, config.failedExtractionCount) + metrics.incCounter(datasetId, config.failedExtractionCount) + context.output(config.systemEventsOutputTag, failedSystemEvent(Some(datasetId), ErrorConstants.MISSING_DATASET_CONFIGURATION, FunctionalError.MissingDatasetId)) return } val dataset = datasetOpt.get if (!containsEvent(batchEvent) && dataset.extractionConfig.isDefined && dataset.extractionConfig.get.isBatchEvent.get) { - val eventAsText = JSONUtil.serialize(batchEvent) if (dataset.extractionConfig.get.dedupConfig.isDefined && dataset.extractionConfig.get.dedupConfig.get.dropDuplicates.get) { - val isDup = isDuplicate(dataset.id, dataset.extractionConfig.get.dedupConfig.get.dedupKey, eventAsText, context, config)(dedupEngine) + val isDup = isDuplicate(dataset, dataset.extractionConfig.get.dedupConfig.get.dedupKey, eventAsText, context) if (isDup) { metrics.incCounter(dataset.id, config.duplicateExtractionCount) context.output(config.duplicateEventOutputTag, markBatchFailed(batchEvent, ErrorConstants.DUPLICATE_BATCH_EVENT_FOUND)) @@ -66,20 +79,40 @@ class ExtractionFunction(config: ExtractorConfig, @transient var dedupEngine: De } } + private def isDuplicate(dataset: Dataset, dedupKey: Option[String], event: String, + context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context): Boolean = { + try { + super.isDuplicate(dataset.id, dedupKey, event)(dedupEngine) + } catch { + case ex: ObsrvException => + val sysEvent = JSONUtil.serialize(SystemEvent( + EventID.METRIC, + ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(Producer.extractor)), dataset = Some(dataset.id), dataset_type = Some(dataset.datasetType)), + data = EData(error = Some(ErrorLog(pdata_id = Producer.dedup, pdata_status = StatusCode.skipped, error_type = FunctionalError.DedupFailed, error_code = ex.error.errorCode, error_message = ex.error.errorMsg, error_level = ErrorLevel.warn))) + )) + logger.warn("BaseDeduplication:isDuplicate() | Exception", ex) + context.output(config.systemEventsOutputTag, sysEvent) + false + } + } + private def skipExtraction(dataset: Dataset, batchEvent: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { val obsrvMeta = batchEvent(config.CONST_OBSRV_META).asInstanceOf[Map[String, AnyRef]] if (!super.containsEvent(batchEvent)) { - metrics.incCounter(dataset.id, config.failedEventCount) - context.output(config.failedEventsOutputTag, markBatchFailed(batchEvent, ErrorConstants.EVENT_MISSING)) + metrics.incCounter(dataset.id, config.eventFailedMetricsCount) + context.output(config.failedEventsOutputTag(), markBatchFailed(batchEvent, ErrorConstants.EVENT_MISSING)) + context.output(config.systemEventsOutputTag, failedSystemEvent(Some(dataset.id), ErrorConstants.EVENT_MISSING, FunctionalError.MissingEventData, dataset_type = 
Some(dataset.datasetType))) return } val eventData = Util.getMutableMap(batchEvent(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) val eventJson = JSONUtil.serialize(eventData) val eventSize = eventJson.getBytes("UTF-8").length if (eventSize > config.eventMaxSize) { - metrics.incCounter(dataset.id, config.failedEventCount) - context.output(config.failedEventsOutputTag, markEventFailed(dataset.id, eventData, ErrorConstants.EVENT_SIZE_EXCEEDED, obsrvMeta)) + metrics.incCounter(dataset.id, config.eventFailedMetricsCount) + context.output(config.failedEventsOutputTag(), markEventFailed(dataset.id, eventData, ErrorConstants.EVENT_SIZE_EXCEEDED, obsrvMeta)) + context.output(config.systemEventsOutputTag, failedSystemEvent(Some(dataset.id), ErrorConstants.EVENT_SIZE_EXCEEDED, FunctionalError.EventSizeExceeded, dataset_type = Some(dataset.datasetType))) + logger.error(s"Extractor | Event size exceeded max configured value | dataset=${dataset.id} | Event size is $eventSize, Max configured size is ${config.eventMaxSize}") } else { metrics.incCounter(dataset.id, config.skippedExtractionCount) context.output(config.rawEventsOutputTag, markEventSkipped(dataset.id, eventData, obsrvMeta)) @@ -96,21 +129,24 @@ class ExtractionFunction(config: ExtractorConfig, @transient var dedupEngine: De val eventJson = JSONUtil.serialize(eventData) val eventSize = eventJson.getBytes("UTF-8").length if (eventSize > config.eventMaxSize) { - metrics.incCounter(dataset.id, config.failedEventCount) - context.output(config.failedEventsOutputTag, markEventFailed(dataset.id, eventData, ErrorConstants.EVENT_SIZE_EXCEEDED, obsrvMeta)) + metrics.incCounter(dataset.id, config.eventFailedMetricsCount) + context.output(config.failedEventsOutputTag(), markEventFailed(dataset.id, eventData, ErrorConstants.EVENT_SIZE_EXCEEDED, obsrvMeta)) + context.output(config.systemEventsOutputTag, failedSystemEvent(Some(dataset.id), ErrorConstants.EVENT_SIZE_EXCEEDED, FunctionalError.EventSizeExceeded, dataset_type = Some(dataset.datasetType))) + logger.error(s"Extractor | Event size exceeded max configured value | dataset=${dataset.id} | Event size is $eventSize, Max configured size is ${config.eventMaxSize}") } else { metrics.incCounter(dataset.id, config.successEventCount) context.output(config.rawEventsOutputTag, markEventSuccess(dataset.id, eventData, obsrvMeta)) } }) - context.output(config.systemEventsOutputTag, JSONUtil.serialize(generateSystemEvent(dataset.id, eventsList.size))) + context.output(config.systemEventsOutputTag, JSONUtil.serialize(successSystemEvent(dataset, eventsList.size))) metrics.incCounter(dataset.id, config.systemEventCount) metrics.incCounter(dataset.id, config.successExtractionCount) } catch { case ex: ObsrvException => - context.output(config.failedBatchEventOutputTag, markBatchFailed(batchEvent, ex.error)) metrics.incCounter(dataset.id, config.failedExtractionCount) - case re: Exception => re.printStackTrace() + context.output(config.failedBatchEventOutputTag, markBatchFailed(batchEvent, ex.error)) + context.output(config.systemEventsOutputTag, failedSystemEvent(Some(dataset.id), ex.error, FunctionalError.ExtractionDataFormatInvalid, dataset_type = Some(dataset.datasetType))) + logger.error(s"Extractor | Exception extracting data | dataset=${dataset.id}", ex) } } @@ -133,8 +169,20 @@ class ExtractionFunction(config: ExtractorConfig, @transient var dedupEngine: De /** * Method to Generate a System Event to capture the extraction information and metrics */ - private def generateSystemEvent(dataset: String, 
totalEvents: Int): SystemEvent = { - SystemEvent(PData(config.jobName, "flink", ""), Map("totalEvents" -> totalEvents.asInstanceOf[AnyRef], "dataset" -> dataset.asInstanceOf[AnyRef])); // TODO: Generate a system event + private def successSystemEvent(dataset: Dataset, totalEvents: Int): SystemEvent = { + SystemEvent( + EventID.METRIC, + ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(Producer.extractor)), dataset = Some(dataset.id), dataset_type = Some(dataset.datasetType)), + data = EData(error = None, pipeline_stats = Some(PipelineStats(Some(totalEvents), Some(StatusCode.success)))) + ) + } + + private def failedSystemEvent(dataset: Option[String], error: Error, functionalError: FunctionalError, dataset_type: Option[String] = None): String = { + + JSONUtil.serialize(SystemEvent( + EventID.METRIC, ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(Producer.extractor)), dataset = dataset, dataset_type = dataset_type), + data = EData(error = Some(ErrorLog(Producer.extractor, StatusCode.failed, functionalError, error.errorCode, error.errorMsg, ErrorLevel.critical)), pipeline_stats = None) + )) } /** @@ -143,32 +191,30 @@ class ExtractionFunction(config: ExtractorConfig, @transient var dedupEngine: De private def markEventFailed(dataset: String, event: mutable.Map[String, AnyRef], error: Error, obsrvMeta: Map[String, AnyRef]): mutable.Map[String, AnyRef] = { val wrapperEvent = createWrapperEvent(dataset, event) updateEvent(wrapperEvent, obsrvMeta) - super.markFailed(wrapperEvent, error, config.jobName) + super.markFailed(wrapperEvent, error, Producer.extractor) wrapperEvent } private def markBatchFailed(batchEvent: mutable.Map[String, AnyRef], error: Error): mutable.Map[String, AnyRef] = { - super.markFailed(batchEvent, error, config.jobName) + super.markFailed(batchEvent, error, Producer.extractor) batchEvent } private def markEventSuccess(dataset: String, event: mutable.Map[String, AnyRef], obsrvMeta: Map[String, AnyRef]): mutable.Map[String, AnyRef] = { val wrapperEvent = createWrapperEvent(dataset, event) updateEvent(wrapperEvent, obsrvMeta) - super.markSuccess(wrapperEvent, config.jobName) + super.markSuccess(wrapperEvent, Producer.extractor) wrapperEvent } private def markEventSkipped(dataset: String, event: mutable.Map[String, AnyRef], obsrvMeta: Map[String, AnyRef]): mutable.Map[String, AnyRef] = { val wrapperEvent = createWrapperEvent(dataset, event) updateEvent(wrapperEvent, obsrvMeta) - super.markSkipped(wrapperEvent, config.jobName) + super.markSkipped(wrapperEvent, Producer.extractor) wrapperEvent } - private def createWrapperEvent(dataset: String, event: mutable.Map[String, AnyRef]): mutable.Map[String, AnyRef] = { mutable.Map(config.CONST_DATASET -> dataset, config.CONST_EVENT -> event.toMap) } -} - +} \ No newline at end of file diff --git a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorConfig.scala b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorConfig.scala index 421f43e5..131e70ce 100644 --- a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorConfig.scala +++ b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorConfig.scala @@ -1,6 +1,5 @@ package org.sunbird.obsrv.extractor.task -import scala.collection.mutable import com.typesafe.config.Config import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.java.typeutils.TypeExtractor @@ 
-8,6 +7,8 @@ import org.apache.flink.streaming.api.scala.OutputTag import org.sunbird.obsrv.core.model.SystemConfig import org.sunbird.obsrv.core.streaming.BaseJobConfig +import scala.collection.mutable + class ExtractorConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "ExtractorJob") { private val serialVersionUID = 2905979434303791379L @@ -22,26 +23,22 @@ class ExtractorConfig(override val config: Config) extends BaseJobConfig[mutable val kafkaInputTopic: String = config.getString("kafka.input.topic") val kafkaSuccessTopic: String = config.getString("kafka.output.raw.topic") val kafkaDuplicateTopic: String = config.getString("kafka.output.extractor.duplicate.topic") - val kafkaFailedTopic: String = config.getString("kafka.output.failed.topic") val kafkaBatchFailedTopic: String = config.getString("kafka.output.batch.failed.topic") - val eventMaxSize: Long = SystemConfig.maxEventSize + val eventMaxSize: Long = if(config.hasPath("kafka.event.max.size")) config.getInt("kafka.event.max.size") else SystemConfig.maxEventSize private val RAW_EVENTS_OUTPUT_TAG = "raw-events" - private val FAILED_EVENTS_OUTPUT_TAG = "failed-events" private val FAILED_BATCH_EVENTS_OUTPUT_TAG = "failed-batch-events" private val DUPLICATE_EVENTS_OUTPUT_TAG = "duplicate-batch-events" // Metric List - val totalEventCount = "total-event-count" - val successEventCount = "success-event-count" - val failedEventCount = "failed-event-count" - val failedExtractionCount = "failed-extraction-count" - val successExtractionCount = "success-extraction-count" - val duplicateExtractionCount = "duplicate-extraction-count" - val skippedExtractionCount = "skipped-extraction-count" + val totalEventCount = "extractor-total-count" + val successEventCount = "extractor-event-count" + val failedExtractionCount = "extractor-failed-count" + val successExtractionCount = "extractor-success-count" + val duplicateExtractionCount = "extractor-duplicate-count" + val skippedExtractionCount = "extractor-skipped-count" val rawEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](RAW_EVENTS_OUTPUT_TAG) - val failedEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](FAILED_EVENTS_OUTPUT_TAG) val failedBatchEventOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](FAILED_BATCH_EVENTS_OUTPUT_TAG) val duplicateEventOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](id = DUPLICATE_EVENTS_OUTPUT_TAG) @@ -52,10 +49,9 @@ class ExtractorConfig(override val config: Config) extends BaseJobConfig[mutable val extractorDuplicateProducer = "extractor-duplicate-events-sink" val extractorBatchFailedEventsProducer = "extractor-batch-failed-events-sink" val extractorRawEventsProducer = "extractor-raw-events-sink" - val extractorFailedEventsProducer = "extractor-failed-events-sink" override def inputTopic(): String = kafkaInputTopic override def inputConsumer(): String = "extractor-consumer" override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = rawEventsOutputTag - + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") } diff --git a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorStreamTask.scala b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorStreamTask.scala index b64b55ad..521ffc61 100644 --- 
a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorStreamTask.scala +++ b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorStreamTask.scala @@ -22,33 +22,32 @@ class ExtractorStreamTask(config: ExtractorConfig, kafkaConnector: FlinkKafkaCon def process(): Unit = { implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) - val dataStream = getMapDataStream(env, config, kafkaConnector) - processStream(dataStream) + process(env) env.execute(config.jobName) } // $COVERAGE-ON$ + def process(env: StreamExecutionEnvironment): Unit = { + val dataStream = getMapDataStream(env, config, kafkaConnector) + processStream(dataStream) + } + override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { val extractorStream = dataStream.process(new ExtractionFunction(config)) .name(config.extractionFunction).uid(config.extractionFunction) .setParallelism(config.downstreamOperatorsParallelism) - extractorStream.getSideOutput(config.failedBatchEventOutputTag).sinkTo(kafkaConnector.kafkaMapSink(config.kafkaBatchFailedTopic)) + extractorStream.getSideOutput(config.failedBatchEventOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaBatchFailedTopic)) .name(config.extractorBatchFailedEventsProducer).uid(config.extractorBatchFailedEventsProducer).setParallelism(config.downstreamOperatorsParallelism) - extractorStream.getSideOutput(config.successTag()).sinkTo(kafkaConnector.kafkaMapSink(config.kafkaSuccessTopic)) + extractorStream.getSideOutput(config.successTag()).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaSuccessTopic)) .name(config.extractorRawEventsProducer).uid(config.extractorRawEventsProducer).setParallelism(config.downstreamOperatorsParallelism) - extractorStream.getSideOutput(config.duplicateEventOutputTag).sinkTo(kafkaConnector.kafkaMapSink(config.kafkaDuplicateTopic)) + extractorStream.getSideOutput(config.duplicateEventOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaDuplicateTopic)) .name(config.extractorDuplicateProducer).uid(config.extractorDuplicateProducer).setParallelism(config.downstreamOperatorsParallelism) - extractorStream.getSideOutput(config.systemEventsOutputTag).sinkTo(kafkaConnector.kafkaStringSink(config.kafkaSystemTopic)) - .name(config.systemEventsProducer).uid(config.systemEventsProducer).setParallelism(config.downstreamOperatorsParallelism) - - extractorStream.getSideOutput(config.failedEventsOutputTag).sinkTo(kafkaConnector.kafkaMapSink(config.kafkaFailedTopic)) - .name(config.extractorFailedEventsProducer).uid(config.extractorFailedEventsProducer).setParallelism(config.downstreamOperatorsParallelism) - + addDefaultSinks(extractorStream, config, kafkaConnector) extractorStream.getSideOutput(config.successTag()) } } diff --git a/pipeline/extractor/src/test/resources/test.conf b/pipeline/extractor/src/test/resources/test.conf index 6cfcefa4..33066c5c 100644 --- a/pipeline/extractor/src/test/resources/test.conf +++ b/pipeline/extractor/src/test/resources/test.conf @@ -3,10 +3,10 @@ include "base-test.conf" kafka { input.topic = "flink.ingest" output.raw.topic = "flink.raw" - output.extractor.duplicate.topic = "flink.extractor.duplicate" - output.failed.topic = "flink.failed" - output.batch.failed.topic = "flink.extractor.failed" - event.max.size = "1048576" # Max is only 1MB + output.extractor.duplicate.topic = "flink.failed" + + 
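# Note: duplicate and batch-failure outputs are both routed to the common failed topic in tests; event.max.size below is lowered to 300 bytes to exercise the size-exceeded path +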
output.batch.failed.topic = "flink.failed" + event.max.size = "300" # Max is only 1MB groupId = "flink-extractor-group" } @@ -17,7 +17,7 @@ task { redis { host = 127.0.0.1 - port = 6379 + port = 6340 database { extractor.duplication.store.id = 1 key.expiry.seconds = 3600 diff --git a/pipeline/extractor/src/test/resources/test2.conf b/pipeline/extractor/src/test/resources/test2.conf new file mode 100644 index 00000000..a381de66 --- /dev/null +++ b/pipeline/extractor/src/test/resources/test2.conf @@ -0,0 +1,24 @@ +include "base-test.conf" + +kafka { + input.topic = "flink.ingest" + output.raw.topic = "flink.raw" + output.extractor.duplicate.topic = "flink.failed" + + output.batch.failed.topic = "flink.failed" + groupId = "flink-extractor-group" +} + +task { + consumer.parallelism = 1 + downstream.operators.parallelism = 1 +} + +redis { + host = 127.0.0.1 + port = 6340 + database { + extractor.duplication.store.id = 1 + key.expiry.seconds = 3600 + } +} \ No newline at end of file diff --git a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/EventFixture.scala b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/EventFixture.scala new file mode 100644 index 00000000..800587a7 --- /dev/null +++ b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/EventFixture.scala @@ -0,0 +1,15 @@ +package org.sunbird.obsrv.extractor + +object EventFixture { + + val MISSING_DEDUP_KEY = """{"dataset":"d1","id1":"event1","events":[{"id":"1","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}]}""" + val INVALID_JSON = """{"dataset":"d1","event":{"id":"2","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""" + + val VALID_EVENT = """{"dataset":"d1","id":"event4","event":{"id":"3","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val VALID_BATCH = """{"dataset":"d1","id":"event5","events":[{"id":"4","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}]}""" + + val LARGE_JSON_BATCH = """{"dataset":"d1","id":"event2","events":[{"id":"5","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19},"randomKey":"eRJcFJvUoQnlC9ZNa2b2NT84aAv4Trr9m6GFwxaL6Qn1srmWBl7ldsKnBvs6ah2l0KN6M3Vp4eoGLBiIMYsi3gHWklc8sbt6"}]}""" + val LARGE_JSON_EVENT = """{"dataset":"d1","id":"event3","event":{"id":"6","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19},"randomKey":"eRJcFJvUoQnlC9ZNa2b2NT84aAv4Trr9m6GFwxaL6Qn1srmWBl7ldsKnBvs6ah2l0KN6M3Vp4eoGLBiIMYsi3gHWklc8sbt6"}}""" + + +} diff --git 
a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala new file mode 100644 index 00000000..d72a8dcb --- /dev/null +++ b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala @@ -0,0 +1,167 @@ +package org.sunbird.obsrv.extractor + +import com.typesafe.config.{Config, ConfigFactory} +import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} +import org.apache.flink.configuration.Configuration +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.test.util.MiniClusterWithClientResource +import org.apache.kafka.common.serialization.StringDeserializer +import org.scalatest.Matchers._ +import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.Models.SystemEvent +import org.sunbird.obsrv.core.model.SystemConfig +import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil} +import org.sunbird.obsrv.extractor.task.{ExtractorConfig, ExtractorStreamTask} +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry + +import scala.collection.mutable +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ + +class ExtractorStreamTestSpec extends BaseSpecWithDatasetRegistry { + + val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() + .setConfiguration(testConfiguration()) + .setNumberSlotsPerTaskManager(1) + .setNumberTaskManagers(1) + .build) + + val pConfig = new ExtractorConfig(config) + val kafkaConnector = new FlinkKafkaConnector(pConfig) + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9093, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + + def testConfiguration(): Configuration = { + val config = new Configuration() + config.setString("metrics.reporter", "job_metrics_reporter") + config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) + config + } + + override def beforeAll(): Unit = { + super.beforeAll() + BaseMetricsReporter.gaugeMetrics.clear() + EmbeddedKafka.start()(embeddedKafkaConfig) + createTestTopics() + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixture.INVALID_JSON) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixture.MISSING_DEDUP_KEY) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixture.LARGE_JSON_EVENT) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixture.LARGE_JSON_BATCH) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixture.VALID_EVENT) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixture.VALID_BATCH) + + flinkCluster.before() + } + + override def afterAll(): Unit = { + val redisConnection = new RedisConnect(pConfig.redisHost, pConfig.redisPort, pConfig.redisConnectionTimeout) + 
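// Flush the extractor dedup store so repeated test runs start from a clean Redis state +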
redisConnection.getConnection(config.getInt("redis.database.extractor.duplication.store.id")).flushAll() + super.afterAll() + flinkCluster.after() + EmbeddedKafka.stop() + } + + def createTestTopics(): Unit = { + List( + pConfig.kafkaInputTopic, pConfig.kafkaFailedTopic, pConfig.kafkaSystemTopic, pConfig.kafkaDuplicateTopic, pConfig.kafkaBatchFailedTopic + ).foreach(EmbeddedKafka.createCustomTopic(_)) + } + + "ExtractorStreamTestSpec" should "validate the negative scenarios in extractor job" in { + + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(pConfig) + val task = new ExtractorStreamTask(pConfig, kafkaConnector) + task.process(env) + Future { + env.execute(pConfig.jobName) + } + val batchFailedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](pConfig.kafkaBatchFailedTopic, 1, timeout = 30.seconds) + val invalidEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](pConfig.kafkaFailedTopic, 2, timeout = 30.seconds) + val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](pConfig.kafkaSystemTopic, 6, timeout = 30.seconds) + val outputEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](pConfig.kafkaSuccessTopic, 3, timeout = 30.seconds) + + validateOutputEvents(outputEvents) + validateBatchFailedEvents(batchFailedEvents) + validateInvalidEvents(invalidEvents) + validateSystemEvents(systemEvents) + + val mutableMetricsMap = mutable.Map[String, Long]() + BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + Console.println("### ExtractorStreamTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + validateMetrics(mutableMetricsMap) + + val config2: Config = ConfigFactory.load("test2.conf") + val extractorConfig = new ExtractorConfig(config2) + extractorConfig.eventMaxSize should be (SystemConfig.maxEventSize) + } + + private def validateOutputEvents(outputEvents: List[String]) = { + outputEvents.size should be (3) + //TODO: Add assertions for all 3 events + /* + (OutEvent,{"event":{"dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"obsrv_meta":{"flags":{"extractor":"success"},"syncts":1701760331686,"prevProcessingTime":1701760337492,"error":{},"processingStartTime":1701760337087,"timespans":{"extractor":405}},"dataset":"d1"}) + (OutEvent,{"event":{"dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"3","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"obsrv_meta":{"flags":{"extractor":"skipped"},"syncts":1701760331771,"prevProcessingTime":1701760337761,"error":{},"processingStartTime":1701760337089,"timespans":{"extractor":672}},"dataset":"d1"}) + (OutEvent,{"event":{"dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"4","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"obsrv_meta":{"flags":{"extractor":"success"},"syncts":1701760331794,"prevProcessingTime":1701760337777,"error":{},"processingStartTime":1701760337092,"timespans":{"extractor":685}},"dataset":"d1"}) + */ + } + + private def validateBatchFailedEvents(batchFailedEvents: List[String]): Unit = { + batchFailedEvents.size should be(1) + 
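// Possible additional checks (sketch, based on the sample payload below): the failed wrapper should retain the original payload under "invalid_json" and carry error_code ERR_EXT_1018 in obsrv_meta.error, e.g. + // JSONUtil.deserialize[Map[String, AnyRef]](batchFailedEvents.head).contains("invalid_json") should be(true) +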
//TODO: Add assertions for all 1 events + /* + (BatchFailedEvent,{"event":"{\"invalid_json\":\"{\\\"dataset\\\":\\\"d1\\\",\\\"event\\\":{\\\"id\\\":\\\"2\\\",\\\"vehicleCode\\\":\\\"HYUN-CRE-D6\\\",\\\"date\\\":\\\"2023-03-01\\\",\\\"dealer\\\":{\\\"dealerCode\\\":\\\"KUNUnited\\\",\\\"locationId\\\":\\\"KUN1\\\",\\\"email\\\":\\\"dealer1@gmail.com\\\",\\\"phone\\\":\\\"9849012345\\\"},\\\"metrics\\\":{\\\"bookingsTaken\\\":50,\\\"deliveriesPromised\\\":20,\\\"deliveriesDone\\\":19}}\"}","obsrv_meta":{"flags":{"extractor":"failed"},"syncts":1701758716432,"prevProcessingTime":1701758721945,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"extractor"},"error_code":"ERR_EXT_1018","error_msg":"Invalid JSON event, error while deserializing the event"},"processingStartTime":1701758721739,"timespans":{"extractor":206}},"invalid_json":"{\"dataset\":\"d1\",\"event\":{\"id\":\"2\",\"vehicleCode\":\"HYUN-CRE-D6\",\"date\":\"2023-03-01\",\"dealer\":{\"dealerCode\":\"KUNUnited\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}}"}) + */ + } + + private def validateInvalidEvents(invalidEvents: List[String]): Unit = { + invalidEvents.size should be(2) + //TODO: Add assertions for all 2 events + /* + (FailedEvent,{"event":"{\"event\":{\"dealer\":{\"dealerCode\":\"KUNUnited\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"6\",\"randomKey\":\"eRJcFJvUoQnlC9ZNa2b2NT84aAv4Trr9m6GFwxaL6Qn1srmWBl7ldsKnBvs6ah2l0KN6M3Vp4eoGLBiIMYsi3gHWklc8sbt6\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d1\"}","obsrv_meta":{"flags":{"extractor":"failed"},"syncts":1701758716560,"prevProcessingTime":1701758722479,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"extractor"},"error_code":"ERR_EXT_1003","error_msg":"Event size has exceeded max configured size of 1048576"},"processingStartTime":1701758721888,"timespans":{"extractor":591}},"dataset":"d1"}) + (FailedEvent,{"event":"{\"event\":{\"dealer\":{\"dealerCode\":\"KUNUnited\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"5\",\"randomKey\":\"eRJcFJvUoQnlC9ZNa2b2NT84aAv4Trr9m6GFwxaL6Qn1srmWBl7ldsKnBvs6ah2l0KN6M3Vp4eoGLBiIMYsi3gHWklc8sbt6\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d1\"}","obsrv_meta":{"flags":{"extractor":"failed"},"syncts":1701758716590,"prevProcessingTime":1701758722521,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"extractor"},"error_code":"ERR_EXT_1003","error_msg":"Event size has exceeded max configured size of 1048576"},"processingStartTime":1701758721888,"timespans":{"extractor":633}},"dataset":"d1"}) + */ + } + + private def validateSystemEvents(systemEvents: List[String]): Unit = { + systemEvents.size should be(6) + + systemEvents.foreach(se => { + val event = JSONUtil.deserialize[SystemEvent](se) + if(event.ctx.dataset.getOrElse("ALL").equals("ALL")) + event.ctx.dataset_type should be(None) + else + event.ctx.dataset_type.getOrElse("dataset") should be("dataset") + }) + + //TODO: Add assertions for all 6 events + /* + 
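Sample system events for reference when the pending assertions are added: +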
(SysEvent,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"ExtractorJob","type":"flink","pid":"extractor"},"dataset":"ALL"},"data":{"error":{"pdata_id":"extractor","pdata_status":"failed","error_type":"InvalidJsonData","error_code":"ERR_EXT_1018","error_message":"Invalid JSON event, error while deserializing the event","error_level":"critical"}},"ets":1701760337333}) + (SysEvent,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"ExtractorJob","type":"flink","pid":"extractor"},"dataset":"d1", "dataset_type": "dataset"},"data":{"error":{"pdata_id":"dedup","pdata_status":"skipped","error_type":"DedupFailed","error_code":"ERR_DEDUP_1007","error_message":"No dedup key found or missing data","error_level":"warn"}},"ets":1701760337474}) + (SysEvent,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"ExtractorJob","type":"flink","pid":"extractor"},"dataset":"d1", "dataset_type": "dataset"},"data":{"pipeline_stats":{"extractor_events":1,"extractor_status":"success"}},"ets":1701760337655}) + (SysEvent,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"ExtractorJob","type":"flink","pid":"extractor"},"dataset":"d1", "dataset_type": "dataset"},"data":{"error":{"pdata_id":"extractor","pdata_status":"failed","error_type":"EventSizeExceeded","error_code":"ERR_EXT_1003","error_message":"Event size has exceeded max configured size of 1048576","error_level":"critical"}},"ets":1701760337724}) + (SysEvent,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"ExtractorJob","type":"flink","pid":"extractor"},"dataset":"d1", "dataset_type": "dataset"},"data":{"error":{"pdata_id":"extractor","pdata_status":"failed","error_type":"EventSizeExceeded","error_code":"ERR_EXT_1003","error_message":"Event size has exceeded max configured size of 1048576","error_level":"critical"}},"ets":1701760337754}) + (SysEvent,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"ExtractorJob","type":"flink","pid":"extractor"},"dataset":"d1", "dataset_type": "dataset"},"data":{"pipeline_stats":{"extractor_events":1,"extractor_status":"success"}},"ets":1701760337754}) + */ + } + + private def validateMetrics(mutableMetricsMap: mutable.Map[String, Long]): Unit = { + + mutableMetricsMap(s"${pConfig.jobName}.ALL.${pConfig.eventFailedMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.totalEventCount}") should be(5) + mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.eventFailedMetricsCount}") should be(2) + mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.skippedExtractionCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.successEventCount}") should be(2) + mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.successExtractionCount}") should be(3) + } + +} \ No newline at end of file diff --git a/pipeline/kafka-connector/pom.xml b/pipeline/kafka-connector/pom.xml new file mode 100644 index 00000000..65aa4d68 --- /dev/null +++ b/pipeline/kafka-connector/pom.xml @@ -0,0 +1,263 @@ + + + + 4.0.0 + + org.sunbird.obsrv + pipeline + 1.0 + + + org.sunbird.obsrv.pipeline + kafka-connector + 1.0.0 + jar + Kafka Connector + + Reads data from source kafka topic(s) and writes them to a configurable topic + + + + UTF-8 + 1.4.0 + + + + + org.apache.flink + flink-streaming-scala_${scala.maj.version} + ${flink.version} + provided + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + + + org.apache.kafka + kafka-clients + + + + + joda-time + joda-time + 2.12.5 + + + com.fasterxml.jackson.datatype + jackson-datatype-joda + 2.15.2 + + + 
org.sunbird.obsrv + framework + 1.0.0 + + + org.sunbird.obsrv + framework + 1.0.0 + test-jar + test + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + + + org.apache.flink + flink-test-utils + ${flink.version} + test + + + org.apache.flink + flink-runtime + ${flink.version} + test + tests + + + org.apache.kafka + kafka-clients + ${kafka.version} + test + + + org.apache.kafka + kafka_${scala.maj.version} + ${kafka.version} + test + + + com.github.codemonstur + embedded-redis + 1.0.0 + test + + + io.github.embeddedkafka + embedded-kafka_2.12 + 3.4.0 + test + + + io.zonky.test + embedded-postgres + 2.0.3 + test + + + org.apache.flink + flink-streaming-java + ${flink.version} + test + tests + + + org.scalatest + scalatest_2.12 + 3.0.6 + test + + + org.mockito + mockito-core + 3.3.3 + test + + + + + src/main/scala + src/test/scala + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 11 + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.1 + + + + package + + shade + + + + + com.google.code.findbugs:jsr305 + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + org.sunbird.obsrv.connector.task.KafkaConnectorStreamTask + + + + reference.conf + + + + + + + + + net.alchim31.maven + scala-maven-plugin + 4.4.0 + + ${java.target.runtime} + ${java.target.runtime} + ${scala.version} + false + + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + + maven-surefire-plugin + 2.22.2 + + true + + + + + org.scalatest + scalatest-maven-plugin + 1.0 + + ${project.build.directory}/surefire-reports + . + dp-duplication-testsuite.txt + + + + test + + test + + + + + + org.scoverage + scoverage-maven-plugin + ${scoverage.plugin.version} + + ${scala.version} + true + true + + + + + + diff --git a/pipeline/kafka-connector/src/main/resources/kafka-connector.conf b/pipeline/kafka-connector/src/main/resources/kafka-connector.conf new file mode 100644 index 00000000..9b5c575b --- /dev/null +++ b/pipeline/kafka-connector/src/main/resources/kafka-connector.conf @@ -0,0 +1,16 @@ +include "baseconfig.conf" + +kafka { + input.topic = ${job.env}".test" + // output.topic = ${job.env}".ingest" + event.max.size = "1048576" # Max is only 1MB + groupId = ${job.env}"-kafkaconnector-group" + producer { + max-request-size = 5242880 + } +} + +task { + consumer.parallelism = 1 + downstream.operators.parallelism = 1 +} \ No newline at end of file diff --git a/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorConfig.scala b/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorConfig.scala new file mode 100644 index 00000000..05ccaa8e --- /dev/null +++ b/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorConfig.scala @@ -0,0 +1,25 @@ +package org.sunbird.obsrv.connector.task + +import com.typesafe.config.Config +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.streaming.api.scala.OutputTag +import org.sunbird.obsrv.core.streaming.BaseJobConfig + +import scala.collection.mutable + +class KafkaConnectorConfig(override val config: Config) extends BaseJobConfig[String](config, "KafkaConnectorJob") { + + private val serialVersionUID = 2905979435603791379L + + implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, 
AnyRef]]) + implicit val stringTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) + + override def inputTopic(): String = "" + + override def inputConsumer(): String = "" + + override def successTag(): OutputTag[String] = OutputTag[String]("dummy-events") + + override def failedEventsOutputTag(): OutputTag[String] = OutputTag[String]("failed-events") +} diff --git a/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorStreamTask.scala b/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorStreamTask.scala new file mode 100644 index 00000000..175f64fa --- /dev/null +++ b/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorStreamTask.scala @@ -0,0 +1,71 @@ +package org.sunbird.obsrv.connector.task + +import com.typesafe.config.ConfigFactory +import org.apache.flink.api.java.utils.ParameterTool +import org.apache.flink.streaming.api.datastream.DataStream +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.joda.time.{DateTime, DateTimeZone} +import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil} +import org.sunbird.obsrv.registry.DatasetRegistry + +import java.io.File + +class KafkaConnectorStreamTask(config: KafkaConnectorConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[String] { + + private val serialVersionUID = -7729362727131516112L + + // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster + def process(): Unit = { + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) + env.execute(config.jobName) + } + + override def processStream(dataStream: DataStream[String]): DataStream[String] = { + null + } + // $COVERAGE-ON$ + + def process(env: StreamExecutionEnvironment): Unit = { + val datasetSourceConfig = DatasetRegistry.getAllDatasetSourceConfig() + datasetSourceConfig.map { configList => + configList.filter(_.connectorType.equalsIgnoreCase("kafka")).map { + dataSourceConfig => + val dataStream: DataStream[String] = getStringDataStream(env, config, List(dataSourceConfig.connectorConfig.topic), + config.kafkaConsumerProperties(kafkaBrokerServers = Some(dataSourceConfig.connectorConfig.kafkaBrokers), + kafkaConsumerGroup = Some(s"kafka-${dataSourceConfig.connectorConfig.topic}-consumer")), + consumerSourceName = s"kafka-${dataSourceConfig.connectorConfig.topic}", kafkaConnector) + val datasetId = dataSourceConfig.datasetId + val kafkaOutputTopic = DatasetRegistry.getDataset(datasetId).get.datasetConfig.entryTopic + val resultStream: DataStream[String] = dataStream.map { streamData: String => { + val syncts = java.lang.Long.valueOf(new DateTime(DateTimeZone.UTC).getMillis) + JSONUtil.getJsonType(streamData) match { + case "ARRAY" => s"""{"dataset":"$datasetId","syncts":$syncts,"events":$streamData}""" + case _ => s"""{"dataset":"$datasetId","syncts":$syncts,"event":$streamData}""" + } + } + }.returns(classOf[String]) + resultStream.sinkTo(kafkaConnector.kafkaSink[String](kafkaOutputTopic)) + .name(s"$datasetId-kafka-connector-sink").uid(s"$datasetId-kafka-connector-sink") + .setParallelism(config.downstreamOperatorsParallelism) + } + } + } + +} + +// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster +object KafkaConnectorStreamTask { + + def main(args: Array[String]): Unit = { + val configFilePath = 
Option(ParameterTool.fromArgs(args).get("config.file.path")) + val config = configFilePath.map { + path => ConfigFactory.parseFile(new File(path)).resolve() + }.getOrElse(ConfigFactory.load("kafka-connector.conf").withFallback(ConfigFactory.systemEnvironment())) + val kafkaConnectorConfig = new KafkaConnectorConfig(config) + val kafkaUtil = new FlinkKafkaConnector(kafkaConnectorConfig) + val task = new KafkaConnectorStreamTask(kafkaConnectorConfig, kafkaUtil) + task.process() + } +} +// $COVERAGE-ON$ \ No newline at end of file diff --git a/pipeline/kafka-connector/src/test/resources/test.conf b/pipeline/kafka-connector/src/test/resources/test.conf new file mode 100644 index 00000000..87306136 --- /dev/null +++ b/pipeline/kafka-connector/src/test/resources/test.conf @@ -0,0 +1,14 @@ +include "base-test.conf" + +kafka { + input.topic = "flink.test" + groupId = "flink-kafkaconnector-group" + producer { + max-request-size = 5242880 + } +} + +task { + consumer.parallelism = 1 + downstream.operators.parallelism = 1 +} \ No newline at end of file diff --git a/pipeline/kafka-connector/src/test/scala/org/sunbird/obsrv/connector/KafkaConnectorStreamTestSpec.scala b/pipeline/kafka-connector/src/test/scala/org/sunbird/obsrv/connector/KafkaConnectorStreamTestSpec.scala new file mode 100644 index 00000000..bf86eafa --- /dev/null +++ b/pipeline/kafka-connector/src/test/scala/org/sunbird/obsrv/connector/KafkaConnectorStreamTestSpec.scala @@ -0,0 +1,126 @@ +package org.sunbird.obsrv.connector + +import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} +import org.apache.flink.configuration.Configuration +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.test.util.MiniClusterWithClientResource +import org.apache.kafka.common.serialization.StringDeserializer +import org.scalatest.Matchers._ +import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.connector.task.{KafkaConnectorConfig, KafkaConnectorStreamTask} +import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry + +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ + +class KafkaConnectorStreamTestSpec extends BaseSpecWithDatasetRegistry { + + val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() + .setConfiguration(testConfiguration()) + .setNumberSlotsPerTaskManager(1) + .setNumberTaskManagers(1) + .build) + + val pConfig = new KafkaConnectorConfig(config) + val kafkaConnector = new FlinkKafkaConnector(pConfig) + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9093, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + private val VALID_JSON_EVENT = """{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""" + private val VALID_JSON_EVENT_ARRAY = 
"""[{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}]""" + private val INVALID_JSON_EVENT = """{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}""" + + def testConfiguration(): Configuration = { + val config = new Configuration() + config.setString("metrics.reporter", "job_metrics_reporter") + config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) + config + } + + override def beforeAll(): Unit = { + super.beforeAll() + BaseMetricsReporter.gaugeMetrics.clear() + EmbeddedKafka.start()(embeddedKafkaConfig) + prepareTestData() + createTestTopics() + EmbeddedKafka.publishStringMessageToKafka("d1-topic", VALID_JSON_EVENT) + EmbeddedKafka.publishStringMessageToKafka("d2-topic", VALID_JSON_EVENT_ARRAY) + EmbeddedKafka.publishStringMessageToKafka("d3-topic", INVALID_JSON_EVENT) + + flinkCluster.before() + } + + private def prepareTestData(): Unit = { + val postgresConnect = new PostgresConnect(postgresConfig) + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into dataset_source_config values('sc1', 'd1', 'kafka', '{\"kafkaBrokers\":\"localhost:9093\",\"topic\":\"d1-topic\"}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_source_config values('sc2', 'd1', 'rdbms', '{\"type\":\"postgres\",\"tableName\":\"test-table\"}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_source_config values('sc3', 'd2', 'kafka', '{\"kafkaBrokers\":\"localhost:9093\",\"topic\":\"d2-topic\"}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_source_config values('sc4', 'd3', 'kafka', '{\"kafkaBrokers\":\"localhost:9093\",\"topic\":\"d3-topic\"}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.closeConnection() + } + + override def afterAll(): Unit = { + super.afterAll() + flinkCluster.after() + EmbeddedKafka.stop() + } + + def createTestTopics(): Unit = { + List( + "d1-topic", "d2-topic", "d3-topic", 
pConfig.kafkaSystemTopic, "ingest" + ).foreach(EmbeddedKafka.createCustomTopic(_)) + } + + "KafkaConnectorStreamTestSpec" should "validate the kafka connector job" in { + + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(pConfig) + val task = new KafkaConnectorStreamTask(pConfig, kafkaConnector) + task.process(env) + Future { + env.execute(pConfig.jobName) + } + + val ingestEvents = EmbeddedKafka.consumeNumberMessagesFrom[String]("ingest", 3, timeout = 30.seconds) + validateIngestEvents(ingestEvents) + + pConfig.inputTopic() should be ("") + pConfig.inputConsumer() should be ("") + pConfig.successTag().getId should be ("dummy-events") + pConfig.failedEventsOutputTag().getId should be ("failed-events") + } + + private def validateIngestEvents(ingestEvents: List[String]): Unit = { + ingestEvents.size should be(3) + ingestEvents.foreach{event: String => { + if(event.contains(""""dataset":"d1"""")) { + JSONUtil.getJsonType(event) should be ("OBJECT") + val eventMap = JSONUtil.deserialize[Map[String, AnyRef]](event) + eventMap.get("dataset").get.asInstanceOf[String] should be ("d1") + eventMap.get("syncts").isDefined should be (true) + eventMap.contains("event") should be (true) + } else if(event.contains(""""dataset":"d2"""")) { + JSONUtil.getJsonType(event) should be("OBJECT") + val eventMap = JSONUtil.deserialize[Map[String, AnyRef]](event) + eventMap.get("dataset").get.asInstanceOf[String] should be("d2") + eventMap.get("syncts").isDefined should be(true) + eventMap.contains("events") should be(true) + JSONUtil.getJsonType(JSONUtil.serialize(eventMap.get("events"))) should be("ARRAY") + } else { + JSONUtil.getJsonType(event) should be ("NOT_A_JSON") + event.contains(""""event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}""") should be(true) + } + }} + + } + +} \ No newline at end of file diff --git a/pipeline/master-data-processor/pom.xml b/pipeline/master-data-processor/pom.xml index 52783714..370ec621 100644 --- a/pipeline/master-data-processor/pom.xml +++ b/pipeline/master-data-processor/pom.xml @@ -4,9 +4,6 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - - 3.0.1 - org.sunbird.obsrv @@ -55,6 +52,11 @@ preprocessor 1.0.0 + + org.sunbird.obsrv.pipeline + denormalizer + 1.0.0 + org.sunbird.obsrv.pipeline transformer @@ -129,9 +131,9 @@ tests - it.ozimov + com.github.codemonstur embedded-redis - 0.7.1 + 1.0.0 test diff --git a/pipeline/master-data-processor/src/main/resources/master-data-processor.conf b/pipeline/master-data-processor/src/main/resources/master-data-processor.conf index 686d2f35..149e795b 100644 --- a/pipeline/master-data-processor/src/main/resources/master-data-processor.conf +++ b/pipeline/master-data-processor/src/main/resources/master-data-processor.conf @@ -3,13 +3,14 @@ include "baseconfig.conf" kafka { input.topic = ${job.env}".masterdata.ingest" output.raw.topic = ${job.env}".masterdata.raw" - output.extractor.duplicate.topic = ${job.env}".masterdata.extractor.duplicate" + output.extractor.duplicate.topic = ${job.env}".masterdata.failed" output.failed.topic = ${job.env}".masterdata.failed" output.batch.failed.topic = ${job.env}".masterdata.extractor.failed" event.max.size = "1048576" # Max is only 1MB - output.invalid.topic = 
${job.env}".masterdata.invalid" + output.invalid.topic = ${job.env}".masterdata.failed" output.unique.topic = ${job.env}".masterdata.unique" - output.duplicate.topic = ${job.env}".masterdata.duplicate" + output.duplicate.topic = ${job.env}".masterdata.failed" + output.denorm.topic = ${job.env}".masterdata.denorm" output.transform.topic = ${job.env}".masterdata.transform" stats.topic = ${job.env}".masterdata.stats" groupId = ${job.env}"-masterdata-pipeline-group" diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala index a9f6c12b..a7ca7471 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala @@ -3,20 +3,20 @@ package org.sunbird.obsrv.pipeline.function import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction import org.apache.flink.streaming.api.windowing.windows.TimeWindow +import org.json4s.native.JsonMethods._ import org.slf4j.LoggerFactory -import org.sunbird.obsrv.core.streaming.{Metrics, MetricsList, WindowBaseProcessFunction} +import org.sunbird.obsrv.core.model.{ErrorConstants, FunctionalError, Producer} +import org.sunbird.obsrv.core.streaming.Metrics +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.Dataset import org.sunbird.obsrv.pipeline.task.MasterDataProcessorConfig import org.sunbird.obsrv.pipeline.util.MasterDataCache import org.sunbird.obsrv.registry.DatasetRegistry -import org.json4s._ -import org.json4s.native.JsonMethods._ -import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.streaming.BaseDatasetWindowProcessFunction -import java.lang import scala.collection.mutable -import scala.collection.JavaConverters._ -class MasterDataProcessorFunction(config: MasterDataProcessorConfig) extends WindowBaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String](config) { +class MasterDataProcessorFunction(config: MasterDataProcessorConfig) extends BaseDatasetWindowProcessFunction(config) { private[this] val logger = LoggerFactory.getLogger(classOf[MasterDataProcessorFunction]) private[this] var masterDataCache: MasterDataCache = _ @@ -32,41 +32,33 @@ class MasterDataProcessorFunction(config: MasterDataProcessorConfig) extends Win masterDataCache.close() } - override def getMetricsList(): MetricsList = { - val metrics = List(config.successEventCount, config.systemEventCount, config.totalEventCount, config.successInsertCount, config.successUpdateCount, config.failedCount) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + override def getMetrics(): List[String] = { + List(config.successEventCount, config.systemEventCount, config.totalEventCount, config.successInsertCount, config.successUpdateCount) } - override def process(datasetId: String, context: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context, elements: lang.Iterable[mutable.Map[String, AnyRef]], metrics: Metrics): Unit = { - implicit val jsonFormats: Formats = DefaultFormats.withLong + override def processWindow(dataset: Dataset, context: ProcessWindowFunction[mutable.Map[String, AnyRef], 
mutable.Map[String, AnyRef], String, TimeWindow]#Context, elements: List[mutable.Map[String, AnyRef]], metrics: Metrics): Unit = { - implicit class JsonHelper(json: JValue) { - def customExtract[T](path: String)(implicit mf: Manifest[T]): T = { - path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T] + metrics.incCounter(dataset.id, config.totalEventCount, elements.size.toLong) + masterDataCache.open(dataset) + val eventsMap = elements.map(msg => { + val event = JSONUtil.serialize(msg(config.CONST_EVENT)) + val json = parse(event, useBigIntForLong = false) + val node = JSONUtil.getKey(dataset.datasetConfig.key, event) + if (node.isMissingNode) { + markFailure(Some(dataset.id), msg, context, metrics, ErrorConstants.MISSING_DATASET_CONFIG_KEY, Producer.masterdataprocessor, FunctionalError.MissingMasterDatasetKey, datasetType = Some(dataset.datasetType)) } - } - - val eventsList = elements.asScala.toList - metrics.incCounter(datasetId, config.totalEventCount, eventsList.size.toLong) - val dataset = DatasetRegistry.getDataset(datasetId).get - val eventsMap = eventsList.map(msg => { - val json = parse(JSONUtil.serialize(msg(config.CONST_EVENT)), useBigIntForLong = false) - val key = json.customExtract[String](dataset.datasetConfig.key) - if (key == null) { - metrics.incCounter(datasetId, config.failedCount) - context.output(config.failedEventsTag, msg) - } - (key, json) + (node.asText(), json) }).toMap - val validEventsMap = eventsMap.filter(f => f._1 != null) + val validEventsMap = eventsMap.filter(f => f._1.nonEmpty) val result = masterDataCache.process(dataset, validEventsMap) - metrics.incCounter(datasetId, config.successInsertCount, result._1) - metrics.incCounter(datasetId, config.successUpdateCount, result._2) - metrics.incCounter(datasetId, config.successEventCount, eventsList.size.toLong) + metrics.incCounter(dataset.id, config.successInsertCount, result._1) + metrics.incCounter(dataset.id, config.successUpdateCount, result._2) + metrics.incCounter(dataset.id, config.successEventCount, validEventsMap.size.toLong) - eventsList.foreach(event => { + elements.foreach(event => { event.remove(config.CONST_EVENT) - context.output(config.successTag(), markComplete(event, dataset.dataVersion)) + markCompletion(dataset, super.markComplete(event, dataset.dataVersion), context, Producer.masterdataprocessor) }) } -} + +} \ No newline at end of file diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorConfig.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorConfig.scala index afa42a6d..824edd29 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorConfig.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorConfig.scala @@ -13,28 +13,23 @@ class MasterDataProcessorConfig(override val config: Config) extends BaseJobConf private val serialVersionUID = 2905979434303791379L implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) - // Kafka Topics Configuration - val kafkaStatsTopic: String = config.getString("kafka.stats.topic") - val kafkaFailedTopic: String = config.getString("kafka.output.failed.topic") - // Metric List val totalEventCount = "total-event-count" val successEventCount = "success-event-count" val successInsertCount = "success-insert-count" val 
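The window function above keys every event in the batch by the dataset's configured data key and routes events without that key to the failed sink before handing the remainder to the cache. A standalone sketch of that partitioning step, assuming a plain top-level field lookup (the project's JSONUtil.getKey may resolve keys differently):

import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper}

object MasterDataKeyingSketch {
  private val mapper = new ObjectMapper()

  case class Keyed(key: String, event: JsonNode)

  // Splits a window of raw events into keyed events and events missing the data key.
  def partitionByKey(dataKey: String, events: List[String]): (List[Keyed], List[String]) = {
    val parsed = events.map(e => (e, mapper.readTree(e)))
    val (withKey, missing) = parsed.partition { case (_, node) => node.hasNonNull(dataKey) }
    (withKey.map { case (_, node) => Keyed(node.get(dataKey).asText(), node) }, missing.map(_._1))
  }

  def main(args: Array[String]): Unit = {
    val (valid, failed) = partitionByKey("code", List(
      """{"code":"HYUN-CRE-D6","model":"Creta"}""",
      """{"model":"Compass"}"""
    ))
    println(s"valid=${valid.map(_.key)}, failed=${failed.size}")
  }
}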
successUpdateCount = "success-update-count" - val failedCount = "event-failed-count" val windowTime: Int = config.getInt("task.window.time.in.seconds") val windowCount: Int = config.getInt("task.window.count") - val failedEventsTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed_events") private val statsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("processing_stats") // Functions val masterDataProcessFunction = "MasterDataProcessorFunction" - val failedEventsProducer = "MasterDataFailedEventsProducer" override def inputTopic(): String = config.getString("kafka.input.topic") override def inputConsumer(): String = "master-data-consumer" override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = statsOutputTag + + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") } diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala index d1714a4b..7527a6c9 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala @@ -1,18 +1,15 @@ package org.sunbird.obsrv.pipeline.task import com.typesafe.config.{Config, ConfigFactory} -import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.api.java.typeutils.TypeExtractor import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.streaming.api.datastream.DataStream import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment import org.apache.flink.streaming.api.windowing.time.Time import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} import org.sunbird.obsrv.core.util.{DatasetKeySelector, FlinkUtil, TumblingProcessingTimeCountWindows} -import org.sunbird.obsrv.extractor.functions.ExtractionFunction +import org.sunbird.obsrv.denormalizer.task.{DenormalizerConfig, DenormalizerStreamTask} import org.sunbird.obsrv.extractor.task.{ExtractorConfig, ExtractorStreamTask} import org.sunbird.obsrv.pipeline.function.MasterDataProcessorFunction -import org.sunbird.obsrv.preprocessor.functions.EventValidationFunction import org.sunbird.obsrv.preprocessor.task.{PipelinePreprocessorConfig, PipelinePreprocessorStreamTask} import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask} @@ -39,7 +36,7 @@ class MasterDataProcessorStreamTask(config: Config, masterDataConfig: MasterData /** * Created an overloaded process function to enable unit testing * - * @param env + * @param env StreamExecutionEnvironment */ def process(env: StreamExecutionEnvironment): Unit = { @@ -51,11 +48,14 @@ class MasterDataProcessorStreamTask(config: Config, masterDataConfig: MasterData val extractorTask = new ExtractorStreamTask(new ExtractorConfig(config), kafkaConnector) val preprocessorTask = new PipelinePreprocessorStreamTask(new PipelinePreprocessorConfig(config), kafkaConnector) + val denormalizerTask = new DenormalizerStreamTask(new DenormalizerConfig(config), kafkaConnector) val transformerTask = new TransformerStreamTask(new TransformerConfig(config), kafkaConnector) val transformedStream = transformerTask.processStream( - 
preprocessorTask.processStream( - extractorTask.processStream(dataStream) + denormalizerTask.processStream( + preprocessorTask.processStream( + extractorTask.processStream(dataStream) + ) ) ) @@ -65,12 +65,7 @@ class MasterDataProcessorStreamTask(config: Config, masterDataConfig: MasterData val processedStream = windowedStream.process(new MasterDataProcessorFunction(masterDataConfig)).name(masterDataConfig.masterDataProcessFunction) .uid(masterDataConfig.masterDataProcessFunction).setParallelism(masterDataConfig.downstreamOperatorsParallelism) - processedStream.getSideOutput(masterDataConfig.failedEventsTag).sinkTo(kafkaConnector.kafkaMapSink(masterDataConfig.kafkaFailedTopic)) - .name(masterDataConfig.failedEventsProducer).uid(masterDataConfig.failedEventsProducer).setParallelism(masterDataConfig.downstreamOperatorsParallelism) - - processedStream.getSideOutput(masterDataConfig.successTag()).sinkTo(kafkaConnector.kafkaMapSink(masterDataConfig.kafkaStatsTopic)) - .name("stats-producer").uid("stats-producer").setParallelism(masterDataConfig.downstreamOperatorsParallelism) - + addDefaultSinks(processedStream, masterDataConfig, kafkaConnector) processedStream.getSideOutput(masterDataConfig.successTag()) } } diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala index 3449d819..e07f4399 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala @@ -1,12 +1,12 @@ package org.sunbird.obsrv.pipeline.util -import org.json4s.{DefaultFormats, Formats, JNothing, JValue} +import org.json4s.native.JsonMethods._ +import org.json4s.{JNothing, JValue} import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.cache.RedisConnect -import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig} +import org.sunbird.obsrv.model.DatasetModels.Dataset import org.sunbird.obsrv.pipeline.task.MasterDataProcessorConfig import redis.clients.jedis.{Pipeline, Response} -import org.json4s.native.JsonMethods._ import scala.collection.mutable @@ -21,11 +21,17 @@ class MasterDataCache(val config: MasterDataProcessorConfig) { def open(datasets: List[Dataset]): Unit = { datasets.map(dataset => { + open(dataset) + }) + } + + def open(dataset: Dataset): Unit = { + if (!datasetPipelineMap.contains(dataset.id)) { val datasetConfig = dataset.datasetConfig val redisConnect = new RedisConnect(datasetConfig.redisDBHost.get, datasetConfig.redisDBPort.get, config.redisConnectionTimeout) val pipeline: Pipeline = redisConnect.getConnection(0).pipelined() datasetPipelineMap.put(dataset.id, pipeline) - }) + } } def process(dataset: Dataset, eventMap: Map[String, JValue]): (Int, Int) = { @@ -48,7 +54,7 @@ class MasterDataCache(val config: MasterDataProcessorConfig) { responses.map(f => (f._1, f._2.get())) } - private def updateCache(dataset: Dataset, dataFromCache: mutable.Map[String, String], eventMap: Map[String, JValue], pipeline: Pipeline ): Unit = { + private def updateCache(dataset: Dataset, dataFromCache: mutable.Map[String, String], eventMap: Map[String, JValue], pipeline: Pipeline): Unit = { pipeline.clear() pipeline.select(dataset.datasetConfig.redisDB.get) eventMap.foreach(f => { diff --git a/pipeline/master-data-processor/src/test/resources/test.conf 
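The cache above lazily opens one pipelined Redis connection per dataset and batches all writes for a window through it. A simplified, self-contained sketch of that pattern with plain Jedis; host, port and database values are placeholders:

import redis.clients.jedis.Jedis

import scala.collection.mutable

object MasterDataCacheSketch {

  private val connections = mutable.Map[String, Jedis]()

  // Reuses one connection per dataset id, opening it on first use.
  private def connection(datasetId: String, host: String, port: Int): Jedis =
    connections.getOrElseUpdate(datasetId, new Jedis(host, port))

  // Batches all upserts for one dataset into a single pipeline round trip.
  def upsert(datasetId: String, db: Int, records: Map[String, String],
             host: String = "localhost", port: Int = 6379): Unit = {
    val pipeline = connection(datasetId, host, port).pipelined()
    pipeline.select(db)
    records.foreach { case (key, value) => pipeline.set(key, value) }
    pipeline.sync()
  }

  def main(args: Array[String]): Unit = {
    // Assumes a local Redis instance; adjust host/port for your environment.
    upsert("d3", 3, Map("HYUN-CRE-D6" -> """{"model":"Creta"}"""))
  }
}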
b/pipeline/master-data-processor/src/test/resources/test.conf index 2fe3f3fb..2c8f0236 100644 --- a/pipeline/master-data-processor/src/test/resources/test.conf +++ b/pipeline/master-data-processor/src/test/resources/test.conf @@ -7,17 +7,17 @@ job { kafka { input.topic = ${job.env}".masterdata.ingest" output.raw.topic = ${job.env}".masterdata.raw" - output.extractor.duplicate.topic = ${job.env}".masterdata.extractor.duplicate" + output.extractor.duplicate.topic = ${job.env}".masterdata.failed" output.failed.topic = ${job.env}".masterdata.failed" - output.batch.failed.topic = ${job.env}".masterdata.extractor.failed" + output.batch.failed.topic = ${job.env}".masterdata.failed" event.max.size = "1048576" # Max is only 1MB - output.invalid.topic = ${job.env}".masterdata.invalid" + output.invalid.topic = ${job.env}".masterdata.failed" output.unique.topic = ${job.env}".masterdata.unique" - output.duplicate.topic = ${job.env}".masterdata.duplicate" + output.duplicate.topic = ${job.env}".masterdata.failed" + output.denorm.topic = ${job.env}".masterdata.denorm" output.transform.topic = ${job.env}".masterdata.transform" stats.topic = ${job.env}".masterdata.stats" groupId = ${job.env}"-masterdata-pipeline-group" - groupId = ${job.env}"-single-pipeline-group" producer { max-request-size = 5242880 } diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala index 77304c8c..e48f8120 100644 --- a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala +++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala @@ -5,8 +5,6 @@ object EventFixture { val VALID_BATCH_EVENT_D3_INSERT = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel"}]}""" val VALID_BATCH_EVENT_D3_INSERT_2 = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel"}]}""" val VALID_BATCH_EVENT_D3_UPDATE = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","safety":"3 Star (Global NCAP)","seatingCapacity":5}]}""" - val VALID_BATCH_EVENT_D4 = """{"dataset":"d4","event":{"code":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}""" - - + val MISSING_DATA_KEY_EVENT_D4 = """{"dataset":"d5","event":{"code1":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}""" } diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala index ee23eecc..9fe070d3 100644 --- 
a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala +++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala @@ -9,8 +9,9 @@ import org.apache.kafka.common.serialization.StringDeserializer import org.scalatest.Matchers._ import org.sunbird.obsrv.BaseMetricsReporter import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.Models.SystemEvent import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector -import org.sunbird.obsrv.core.util.{FlinkUtil, PostgresConnect} +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} import org.sunbird.obsrv.fixture.EventFixture import org.sunbird.obsrv.pipeline.task.{MasterDataProcessorConfig, MasterDataProcessorStreamTask} import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry @@ -57,15 +58,23 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.VALID_BATCH_EVENT_D3_INSERT_2) EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.VALID_BATCH_EVENT_D4) EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.VALID_BATCH_EVENT_D3_UPDATE) + EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.MISSING_DATA_KEY_EVENT_D4) flinkCluster.before() } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("insert into datasets(id, type, extraction_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d3', 'master-dataset', '{\"is_batch_event\": true, \"extraction_key\": \"events\"}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata.ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":6340,\"redis_db\":3}', 'ACTIVE', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d4', 'master-dataset', '{\"topic\":\"d4-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata-ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":6340,\"redis_db\":4}', 'ACTIVE', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, extraction_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d3', 'master-dataset', '{\"is_batch_event\": true, \"extraction_key\": \"events\"}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata.ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":3}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d4', 'master-dataset', '{\"topic\":\"d4-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata-ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":4}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, router_config, 
dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d5', 'master-dataset', '{\"topic\":\"d4-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata-ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":4}', 'Live', 'System', 'System', now(), now());") } override def afterAll(): Unit = { + val redisConnection = new RedisConnect(masterDataConfig.redisHost, masterDataConfig.redisPort, masterDataConfig.redisConnectionTimeout) + redisConnection.getConnection(config.getInt("redis.database.extractor.duplication.store.id")).flushAll() + redisConnection.getConnection(config.getInt("redis.database.preprocessor.duplication.store.id")).flushAll() + redisConnection.getConnection(3).flushAll() + redisConnection.getConnection(4).flushAll() + super.afterAll() flinkCluster.after() EmbeddedKafka.stop() @@ -73,7 +82,7 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry def createTestTopics(): Unit = { List( - config.getString("kafka.stats.topic"), config.getString("kafka.output.transform.topic"), config.getString("kafka.output.duplicate.topic"), + config.getString("kafka.output.system.event.topic"), config.getString("kafka.output.transform.topic"), config.getString("kafka.output.denorm.topic"), config.getString("kafka.output.duplicate.topic"), config.getString("kafka.output.unique.topic"), config.getString("kafka.output.invalid.topic"), config.getString("kafka.output.batch.failed.topic"), config.getString("kafka.output.failed.topic"), config.getString("kafka.output.extractor.duplicate.topic"), config.getString("kafka.output.raw.topic"), config.getString("kafka.input.topic") ).foreach(EmbeddedKafka.createCustomTopic(_)) @@ -88,11 +97,23 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry env.execute(masterDataConfig.jobName) } - val input = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.stats.topic"), 4, timeout = 30.seconds) - input.size should be (4) + val sysEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 8, timeout = 30.seconds) + sysEvents.size should be(8) + + sysEvents.foreach(se => { + val event = JSONUtil.deserialize[SystemEvent](se) + if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) + event.ctx.dataset_type should be(None) + else + event.ctx.dataset_type.getOrElse("dataset") should be("master-dataset") + }) + + val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](masterDataConfig.kafkaFailedTopic, 1, timeout = 30.seconds) + failedEvents.size should be(1) val mutableMetricsMap = mutable.Map[String, Long](); BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + Console.println("### MasterDataProcessorStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) masterDataConfig.successTag().getId should be ("processing_stats") @@ -106,6 +127,9 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry mutableMetricsMap(s"${masterDataConfig.jobName}.d4.${masterDataConfig.successInsertCount}") should be(1) mutableMetricsMap(s"${masterDataConfig.jobName}.d4.${masterDataConfig.successUpdateCount}") should be(0) + mutableMetricsMap(s"${masterDataConfig.jobName}.d5.${masterDataConfig.totalEventCount}") should be(1) + mutableMetricsMap(s"${masterDataConfig.jobName}.d5.${masterDataConfig.eventFailedMetricsCount}") 
should be(1) + val redisConnection = new RedisConnect(masterDataConfig.redisHost, masterDataConfig.redisPort, masterDataConfig.redisConnectionTimeout) val jedis1 = redisConnection.getConnection(3) val event1 = jedis1.get("HYUN-CRE-D6") diff --git a/pipeline/pipeline-merged/pom.xml b/pipeline/pipeline-merged/pom.xml index e19bc800..f3db71fe 100644 --- a/pipeline/pipeline-merged/pom.xml +++ b/pipeline/pipeline-merged/pom.xml @@ -4,9 +4,6 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - - 3.0.1 - org.sunbird.obsrv @@ -134,9 +131,9 @@ tests - it.ozimov + com.github.codemonstur embedded-redis - 0.7.1 + 1.0.0 test diff --git a/pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf b/pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf index 7746a8e9..75f43376 100644 --- a/pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf +++ b/pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf @@ -3,15 +3,14 @@ include "baseconfig.conf" kafka { input.topic = ${job.env}".ingest" output.raw.topic = ${job.env}".raw" - output.extractor.duplicate.topic = ${job.env}".extractor.duplicate" - output.failed.topic = ${job.env}".failed" - output.batch.failed.topic = ${job.env}".extractor.failed" + output.extractor.duplicate.topic = ${job.env}".failed" + output.batch.failed.topic = ${job.env}".failed" event.max.size = "1048576" # Max is only 1MB - output.invalid.topic = ${job.env}".invalid" + output.invalid.topic = ${job.env}".failed" output.unique.topic = ${job.env}".unique" - output.duplicate.topic = ${job.env}".duplicate" + output.duplicate.topic = ${job.env}".failed" output.denorm.topic = ${job.env}".denorm" - output.denorm.failed.topic = ${job.env}".denorm.failed" + output.denorm.failed.topic = ${job.env}".failed" output.transform.topic = ${job.env}".transform" stats.topic = ${job.env}".stats" groupId = ${job.env}"-single-pipeline-group" diff --git a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineConfig.scala b/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineConfig.scala index b37662af..c6df88d3 100644 --- a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineConfig.scala +++ b/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineConfig.scala @@ -14,19 +14,11 @@ class MergedPipelineConfig(override val config: Config) extends BaseJobConfig[mu implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) // Kafka Topics Configuration - val kafkaInputTopic: String = config.getString("kafka.input.topic") - val kafkaStatsTopic: String = config.getString("kafka.stats.topic") + override def inputTopic(): String = config.getString("kafka.input.topic") - val statsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("processing_stats") - - // Functions - val druidRouterFunction = "DruidRouterFunction" + override def inputConsumer(): String = "pipeline-consumer" - // Producers - val druidRouterProducer = "druid-router-sink" - val processingStatsProducer = "processing-stats-sink" + override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("processing_stats") - override def inputTopic(): String = kafkaInputTopic - override def inputConsumer(): String = 
"pipeline-consumer" - override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = statsOutputTag + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") } diff --git a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala b/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala index 93c8ccca..f7d8dce9 100644 --- a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala +++ b/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala @@ -9,7 +9,7 @@ import org.sunbird.obsrv.core.util.FlinkUtil import org.sunbird.obsrv.denormalizer.task.{DenormalizerConfig, DenormalizerStreamTask} import org.sunbird.obsrv.extractor.task.{ExtractorConfig, ExtractorStreamTask} import org.sunbird.obsrv.preprocessor.task.{PipelinePreprocessorConfig, PipelinePreprocessorStreamTask} -import org.sunbird.obsrv.router.task.{DruidRouterConfig, DruidRouterStreamTask} +import org.sunbird.obsrv.router.task.{DruidRouterConfig, DynamicRouterStreamTask} import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask} import java.io.File @@ -34,7 +34,7 @@ class MergedPipelineStreamTask(config: Config, mergedPipelineConfig: MergedPipel /** * Created an overloaded process function to enable unit testing - * @param env + * @param env StreamExecutionEnvironment */ def process(env: StreamExecutionEnvironment): Unit = { @@ -48,7 +48,7 @@ class MergedPipelineStreamTask(config: Config, mergedPipelineConfig: MergedPipel val preprocessorTask = new PipelinePreprocessorStreamTask(new PipelinePreprocessorConfig(config), kafkaConnector) val denormalizerTask = new DenormalizerStreamTask(new DenormalizerConfig(config), kafkaConnector) val transformerTask = new TransformerStreamTask(new TransformerConfig(config), kafkaConnector) - val routerTask = new DruidRouterStreamTask(new DruidRouterConfig(config), kafkaConnector) + val routerTask = new DynamicRouterStreamTask(new DruidRouterConfig(config), kafkaConnector) routerTask.processStream( transformerTask.processStream( diff --git a/pipeline/pipeline-merged/src/test/resources/test.conf b/pipeline/pipeline-merged/src/test/resources/test.conf index 6c8175d1..d2b959c3 100644 --- a/pipeline/pipeline-merged/src/test/resources/test.conf +++ b/pipeline/pipeline-merged/src/test/resources/test.conf @@ -7,15 +7,14 @@ job { kafka { input.topic = ${job.env}".ingest" output.raw.topic = ${job.env}".raw" - output.extractor.duplicate.topic = ${job.env}".extractor.duplicate" - output.failed.topic = ${job.env}".failed" - output.batch.failed.topic = ${job.env}".extractor.failed" + output.extractor.duplicate.topic = ${job.env}".failed" + output.batch.failed.topic = ${job.env}".failed" event.max.size = "1048576" # Max is only 1MB - output.invalid.topic = ${job.env}".invalid" + output.invalid.topic = ${job.env}".failed" output.unique.topic = ${job.env}".unique" - output.duplicate.topic = ${job.env}".duplicate" + output.duplicate.topic = ${job.env}".failed" output.denorm.topic = ${job.env}".denorm" - output.denorm.failed.topic = ${job.env}".denorm.failed" + output.denorm.failed.topic = ${job.env}".failed" output.transform.topic = ${job.env}".transform" stats.topic = ${job.env}".stats" groupId = ${job.env}"-single-pipeline-group" diff --git 
a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala b/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala index 40b31493..f3cf86b2 100644 --- a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala +++ b/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala @@ -8,11 +8,14 @@ import org.apache.flink.test.util.MiniClusterWithClientResource import org.apache.kafka.common.serialization.StringDeserializer import org.scalatest.Matchers._ import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.cache.RedisConnect import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector -import org.sunbird.obsrv.core.util.FlinkUtil +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil} +import org.sunbird.obsrv.extractor.task.ExtractorConfig import org.sunbird.obsrv.fixture.EventFixture import org.sunbird.obsrv.pipeline.task.{MergedPipelineConfig, MergedPipelineStreamTask} import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry +import org.sunbird.obsrv.transformer.task.TransformerConfig import scala.collection.mutable import scala.concurrent.ExecutionContext.Implicits.global @@ -28,7 +31,6 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { .build) val mergedPipelineConfig = new MergedPipelineConfig(config) - //val mockKafkaUtil: FlinkKafkaConnector = mock[FlinkKafkaConnector](Mockito.withSettings().serializable()) val kafkaConnector = new FlinkKafkaConnector(mergedPipelineConfig) val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = @@ -63,6 +65,9 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { } override def afterAll(): Unit = { + val redisConnection = new RedisConnect(mergedPipelineConfig.redisHost, mergedPipelineConfig.redisPort, mergedPipelineConfig.redisConnectionTimeout) + redisConnection.getConnection(config.getInt("redis.database.extractor.duplication.store.id")).flushAll() + redisConnection.getConnection(config.getInt("redis.database.preprocessor.duplication.store.id")).flushAll() super.afterAll() flinkCluster.after() EmbeddedKafka.stop() @@ -70,10 +75,11 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { def createTestTopics(): Unit = { List( - config.getString("kafka.stats.topic"), config.getString("kafka.output.transform.topic"), config.getString("kafka.output.denorm.failed.topic"), + config.getString("kafka.output.system.event.topic"), config.getString("kafka.output.transform.topic"), config.getString("kafka.output.denorm.failed.topic"), config.getString("kafka.output.denorm.topic"), config.getString("kafka.output.duplicate.topic"), config.getString("kafka.output.unique.topic"), config.getString("kafka.output.invalid.topic"), config.getString("kafka.output.batch.failed.topic"), config.getString("kafka.output.failed.topic"), - config.getString("kafka.output.extractor.duplicate.topic"), config.getString("kafka.output.raw.topic"), config.getString("kafka.input.topic") + config.getString("kafka.output.extractor.duplicate.topic"), config.getString("kafka.output.raw.topic"), config.getString("kafka.input.topic"), + "d1-events", "d2-events" ).foreach(EmbeddedKafka.createCustomTopic(_)) } @@ -84,20 +90,70 @@ class 
MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { task.process(env) Future { env.execute(mergedPipelineConfig.jobName) - Thread.sleep(10000) } - val stats = EmbeddedKafka.consumeNumberMessagesFrom[String](mergedPipelineConfig.kafkaStatsTopic, 1, timeout = 20.seconds) - stats.foreach(Console.println("Event:", _)) + try { + val d1Events = EmbeddedKafka.consumeNumberMessagesFrom[String]("d1-events", 1, timeout = 30.seconds) + d1Events.size should be (1) + val d2Events = EmbeddedKafka.consumeNumberMessagesFrom[String]("d2-events", 1, timeout = 30.seconds) + d2Events.size should be (1) + } catch { + case ex: Exception => ex.printStackTrace() + } + try { + val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 7, timeout = 30.seconds) + systemEvents.size should be(7) + } catch { + case ex: Exception => ex.printStackTrace() + } val mutableMetricsMap = mutable.Map[String, Long](); BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + Console.println("### MergedPipelineStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + + mutableMetricsMap("ExtractorJob.d1.extractor-total-count") should be(4) + mutableMetricsMap("ExtractorJob.d1.extractor-duplicate-count") should be(1) + mutableMetricsMap("ExtractorJob.d1.extractor-event-count") should be(1) + mutableMetricsMap("ExtractorJob.d1.extractor-success-count") should be(1) + mutableMetricsMap("ExtractorJob.d1.extractor-failed-count") should be(2) + mutableMetricsMap("ExtractorJob.d2.extractor-total-count") should be(2) + mutableMetricsMap("ExtractorJob.d2.failed-event-count") should be(1) + mutableMetricsMap("ExtractorJob.d2.extractor-skipped-count") should be(1) + + mutableMetricsMap("PipelinePreprocessorJob.d1.validator-total-count") should be(1) + mutableMetricsMap("PipelinePreprocessorJob.d1.validator-success-count") should be(1) + mutableMetricsMap("PipelinePreprocessorJob.d1.dedup-total-count") should be(1) + mutableMetricsMap("PipelinePreprocessorJob.d1.dedup-success-count") should be(1) + mutableMetricsMap("PipelinePreprocessorJob.d2.validator-total-count") should be(1) + mutableMetricsMap("PipelinePreprocessorJob.d2.validator-skipped-count") should be(1) + mutableMetricsMap("PipelinePreprocessorJob.d2.dedup-total-count") should be(1) + mutableMetricsMap("PipelinePreprocessorJob.d2.dedup-skipped-count") should be(1) + + mutableMetricsMap("DenormalizerJob.d1.denorm-total") should be(1) + mutableMetricsMap("DenormalizerJob.d1.denorm-failed") should be(1) + mutableMetricsMap("DenormalizerJob.d2.denorm-total") should be(1) + mutableMetricsMap("DenormalizerJob.d2.denorm-skipped") should be(1) + + mutableMetricsMap("TransformerJob.d1.transform-total-count") should be(1) + mutableMetricsMap("TransformerJob.d1.transform-success-count") should be(1) + mutableMetricsMap("TransformerJob.d2.transform-total-count") should be(1) + mutableMetricsMap("TransformerJob.d2.transform-skipped-count") should be(1) + + mutableMetricsMap("DruidRouterJob.d1.router-total-count") should be(1) + mutableMetricsMap("DruidRouterJob.d1.router-success-count") should be(1) + mutableMetricsMap("DruidRouterJob.d2.router-total-count") should be(1) + mutableMetricsMap("DruidRouterJob.d2.router-success-count") should be(1) + + val extractorConfig = new ExtractorConfig(config) + extractorConfig.inputTopic() should be (config.getString("kafka.input.topic")) + extractorConfig.inputConsumer() should be 
("extractor-consumer") + + val transformerConfig = new TransformerConfig(config) + transformerConfig.inputTopic() should be(config.getString("kafka.input.topic")) + transformerConfig.inputConsumer() should be("transformer-consumer") - mutableMetricsMap.foreach(println(_)) - //TODO: Add assertions mergedPipelineConfig.successTag().getId should be ("processing_stats") - + mergedPipelineConfig.failedEventsOutputTag().getId should be ("failed-events") } - } diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 07f8e191..25d19b66 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -5,10 +5,6 @@ http://maven.apache.org/maven-v4_0_0.xsd"> 4.0.0 - - 3.0.0 - - org.sunbird.obsrv pipeline 1.0 @@ -26,6 +22,7 @@ transformer druid-router pipeline-merged + kafka-connector master-data-processor diff --git a/pipeline/preprocessor/pom.xml b/pipeline/preprocessor/pom.xml index 96171103..1fb410ea 100644 --- a/pipeline/preprocessor/pom.xml +++ b/pipeline/preprocessor/pom.xml @@ -120,9 +120,9 @@ test - it.ozimov + com.github.codemonstur embedded-redis - 0.7.1 + 1.0.0 test diff --git a/pipeline/preprocessor/src/main/resources/pipeline-preprocessor.conf b/pipeline/preprocessor/src/main/resources/pipeline-preprocessor.conf index a539195b..7e845e1d 100644 --- a/pipeline/preprocessor/src/main/resources/pipeline-preprocessor.conf +++ b/pipeline/preprocessor/src/main/resources/pipeline-preprocessor.conf @@ -2,10 +2,9 @@ include "baseconfig.conf" kafka { input.topic = ${job.env}".raw" - output.failed.topic = ${job.env}".failed" - output.invalid.topic = ${job.env}".invalid" + output.invalid.topic = ${job.env}".failed" output.unique.topic = ${job.env}".unique" - output.duplicate.topic = ${job.env}".duplicate" + output.duplicate.topic = ${job.env}".failed" groupId = ${job.env}"-pipeline-preprocessor-group" } diff --git a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/DeduplicationFunction.scala b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/DeduplicationFunction.scala index 93522e7e..21e32b2e 100644 --- a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/DeduplicationFunction.scala +++ b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/DeduplicationFunction.scala @@ -5,27 +5,25 @@ import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.functions.ProcessFunction import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.cache._ -import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.exception.ObsrvException +import org.sunbird.obsrv.core.model.Models._ +import org.sunbird.obsrv.core.model._ import org.sunbird.obsrv.core.streaming._ import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.Dataset import org.sunbird.obsrv.preprocessor.task.PipelinePreprocessorConfig -import org.sunbird.obsrv.registry.DatasetRegistry +import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction import scala.collection.mutable -class DeduplicationFunction(config: PipelinePreprocessorConfig) - (implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]]) - extends BaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]](config) { +class DeduplicationFunction(config: PipelinePreprocessorConfig)(implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]]) + extends BaseDatasetProcessFunction(config) with BaseDeduplication { - @transient private var dedupEngine: DedupEngine = null 
private[this] val logger = LoggerFactory.getLogger(classOf[DeduplicationFunction]) + @transient private var dedupEngine: DedupEngine = null - override def getMetricsList(): MetricsList = { - val metrics = List( - config.duplicationTotalMetricsCount, config.duplicationSkippedEventMetricsCount, config.duplicationEventMetricsCount, - config.duplicationProcessedEventMetricsCount, config.eventFailedMetricsCount - ) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + override def getMetrics(): List[String] = { + List(config.duplicationTotalMetricsCount, config.duplicationSkippedEventMetricsCount, config.duplicationEventMetricsCount, config.duplicationProcessedEventMetricsCount) } override def open(parameters: Configuration): Unit = { @@ -39,25 +37,22 @@ class DeduplicationFunction(config: PipelinePreprocessorConfig) dedupEngine.closeConnectionPool() } - override def processElement(msg: mutable.Map[String, AnyRef], + override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { - metrics.incCounter(config.defaultDatasetID, config.duplicationTotalMetricsCount) - val datasetId = msg.get(config.CONST_DATASET) - val datasetOpt = DatasetRegistry.getDataset(datasetId.get.asInstanceOf[String]) - val dataset = datasetOpt.get + metrics.incCounter(dataset.id, config.duplicationTotalMetricsCount) val dedupConfig = dataset.dedupConfig if (dedupConfig.isDefined && dedupConfig.get.dropDuplicates.get) { val event = msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]] val eventAsText = JSONUtil.serialize(event) - val isDup = isDuplicate(dataset.id, dedupConfig.get.dedupKey, eventAsText, context, config)(dedupEngine) + val isDup = isDuplicate(dataset, dedupConfig.get.dedupKey, eventAsText, context) if (isDup) { metrics.incCounter(dataset.id, config.duplicationEventMetricsCount) - context.output(config.duplicateEventsOutputTag, markFailed(msg, ErrorConstants.DUPLICATE_EVENT_FOUND, "Deduplication")) + context.output(config.duplicateEventsOutputTag, markFailed(msg, ErrorConstants.DUPLICATE_EVENT_FOUND, Producer.dedup)) } else { metrics.incCounter(dataset.id, config.duplicationProcessedEventMetricsCount) - context.output(config.uniqueEventsOutputTag, markSuccess(msg, "Deduplication")) + context.output(config.uniqueEventsOutputTag, markSuccess(msg, Producer.dedup)) } } else { metrics.incCounter(dataset.id, config.duplicationSkippedEventMetricsCount) @@ -65,4 +60,21 @@ class DeduplicationFunction(config: PipelinePreprocessorConfig) } } + private def isDuplicate(dataset: Dataset, dedupKey: Option[String], event: String, + context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context): Boolean = { + try { + super.isDuplicate(dataset.id, dedupKey, event)(dedupEngine) + } catch { + case ex: ObsrvException => + val sysEvent = JSONUtil.serialize(SystemEvent( + EventID.METRIC, + ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(Producer.dedup)), dataset = Some(dataset.id), dataset_type = Some(dataset.datasetType)), + data = EData(error = Some(ErrorLog(pdata_id = Producer.dedup, pdata_status = StatusCode.skipped, error_type = FunctionalError.DedupFailed, error_code = ex.error.errorCode, error_message = ex.error.errorMsg, error_level = ErrorLevel.warn))) + )) + logger.warn("BaseDeduplication:isDuplicate() | Exception", ex) + context.output(config.systemEventsOutputTag, sysEvent) + 
false + } + } + } diff --git a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala index 31e659ab..93cfefef 100644 --- a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala +++ b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala @@ -5,109 +5,153 @@ import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.functions.ProcessFunction import org.slf4j.LoggerFactory -import org.sunbird.obsrv.core.exception.ObsrvException -import org.sunbird.obsrv.core.model.ErrorConstants -import org.sunbird.obsrv.core.model.Models.{PData, SystemEvent} -import org.sunbird.obsrv.core.streaming.{BaseProcessFunction, Metrics, MetricsList} +import org.sunbird.obsrv.core.model.FunctionalError.FunctionalError +import org.sunbird.obsrv.core.model.Models._ +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming.Metrics import org.sunbird.obsrv.core.util.JSONUtil import org.sunbird.obsrv.model.DatasetModels.Dataset +import org.sunbird.obsrv.model.{DatasetStatus, ValidationMode} import org.sunbird.obsrv.preprocessor.task.PipelinePreprocessorConfig -import org.sunbird.obsrv.preprocessor.util.SchemaValidator +import org.sunbird.obsrv.preprocessor.util.{SchemaValidator, ValidationMsg} import org.sunbird.obsrv.registry.DatasetRegistry +import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction import scala.collection.mutable -class EventValidationFunction(config: PipelinePreprocessorConfig, - @transient var schemaValidator: SchemaValidator = null) - (implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]]) - extends BaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]](config) { +class EventValidationFunction(config: PipelinePreprocessorConfig)(implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]]) + extends BaseDatasetProcessFunction(config) { private[this] val logger = LoggerFactory.getLogger(classOf[EventValidationFunction]) - override def getMetricsList(): MetricsList = { - val metrics = List(config.validationTotalMetricsCount, config.validationFailureMetricsCount, - config.validationSuccessMetricsCount, config.validationSkipMetricsCount, config.eventFailedMetricsCount) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + @transient private var schemaValidator: SchemaValidator = null + override def getMetrics(): List[String] = { + List(config.validationTotalMetricsCount, config.validationFailureMetricsCount, config.validationSuccessMetricsCount, + config.validationSkipMetricsCount, config.eventIgnoredMetricsCount) } override def open(parameters: Configuration): Unit = { super.open(parameters) - if (schemaValidator == null) { - schemaValidator = new SchemaValidator(config) - schemaValidator.loadDataSchemas(DatasetRegistry.getAllDatasets(config.datasetType())) - } + schemaValidator = new SchemaValidator() + schemaValidator.loadDataSchemas(DatasetRegistry.getAllDatasets(config.datasetType())) } override def close(): Unit = { super.close() } - override def processElement(msg: mutable.Map[String, AnyRef], - context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, + override def processElement(dataset: Dataset, msg: 
mutable.Map[String, AnyRef], + ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { - metrics.incCounter(config.defaultDatasetID, config.validationTotalMetricsCount) - val datasetId = msg.get(config.CONST_DATASET) - if (datasetId.isEmpty) { - context.output(config.failedEventsOutputTag, markFailed(msg, ErrorConstants.MISSING_DATASET_ID, config.jobName)) - metrics.incCounter(config.defaultDatasetID, config.eventFailedMetricsCount) - return - } - val datasetOpt = DatasetRegistry.getDataset(datasetId.get.asInstanceOf[String]) - if (datasetOpt.isEmpty) { - context.output(config.failedEventsOutputTag, markFailed(msg, ErrorConstants.MISSING_DATASET_CONFIGURATION, config.jobName)) - metrics.incCounter(config.defaultDatasetID, config.eventFailedMetricsCount) - return - } - val dataset = datasetOpt.get - if (!super.containsEvent(msg)) { - metrics.incCounter(dataset.id, config.eventFailedMetricsCount) - context.output(config.failedEventsOutputTag, markFailed(msg, ErrorConstants.EVENT_MISSING, config.jobName)) + metrics.incCounter(dataset.id, config.validationTotalMetricsCount) + if (dataset.status != DatasetStatus.Live) { + metrics.incCounter(dataset.id, config.eventIgnoredMetricsCount) return } val validationConfig = dataset.validationConfig if (validationConfig.isDefined && validationConfig.get.validate.get) { - validateEvent(dataset, msg, context, metrics) + schemaValidator.loadDataSchema(dataset) + validateEvent(dataset, msg, ctx, metrics) } else { metrics.incCounter(dataset.id, config.validationSkipMetricsCount) - context.output(config.validEventsOutputTag, markSkipped(msg, "EventValidation")) + ctx.output(config.validEventsOutputTag, markSkipped(msg, Producer.validator)) } } private def validateEvent(dataset: Dataset, msg: mutable.Map[String, AnyRef], - context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, + ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { - val event = msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]] - try { - if (schemaValidator.schemaFileExists(dataset)) { - val validationReport = schemaValidator.validate(dataset.id, event) - if (validationReport.isSuccess) { - onValidationSuccess(dataset, msg, metrics, context) - } else { - onValidationFailure(dataset, msg, metrics, context, validationReport) + if (schemaValidator.schemaFileExists(dataset)) { + val validationReport = schemaValidator.validate(dataset.id, event) + onValidationResult(dataset, msg, metrics, ctx, validationReport) + } else { + metrics.incCounter(dataset.id, config.validationSkipMetricsCount) + ctx.output(config.validEventsOutputTag, markSkipped(msg, Producer.validator)) + } + } + + private def onValidationResult(dataset: Dataset, event: mutable.Map[String, AnyRef], metrics: Metrics, + ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, + validationReport: ProcessingReport): Unit = { + if (validationReport.isSuccess) { + validationSuccess(dataset, event, metrics, ctx) + } else { + val validationFailureMsgs = schemaValidator.getValidationMessages(report = validationReport) + val validationFailureCount = validationFailureMsgs.size + val additionalFieldsCount = validationFailureMsgs.count(f => "additionalProperties".equals(f.keyword)) + if (validationFailureCount == additionalFieldsCount) { + dataset.validationConfig.get.mode.get match { + case ValidationMode.Strict => + validationFailure(dataset, event, 
metrics, ctx, validationFailureMsgs)
+          case ValidationMode.IgnoreNewFields =>
+            validationSuccess(dataset, event, metrics, ctx)
+          case ValidationMode.DiscardNewFields =>
+            // TODO: [P2] Write logic to discard the fields from the pipeline. Fields are anyway discarded from Druid but not from data lake
+            validationSuccess(dataset, event, metrics, ctx)
         }
+      } else {
+        validationFailure(dataset, event, metrics, ctx, validationFailureMsgs)
       }
-    } catch {
-      case ex: ObsrvException =>
-        metrics.incCounter(dataset.id, config.validationFailureMetricsCount)
-        context.output(config.failedEventsOutputTag, markFailed(msg, ex.error, "EventValidation"))
     }
   }
 
-  private def onValidationSuccess(dataset: Dataset, event: mutable.Map[String, AnyRef], metrics: Metrics,
-                                  context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context): Unit = {
+  private def getSystemEvent(dataset: Dataset, functionalError: FunctionalError, failedCount: Int): String = {
+    JSONUtil.serialize(SystemEvent(EventID.METRIC,
+      ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(Producer.validator)), dataset = Some(dataset.id), dataset_type = Some(dataset.datasetType)),
+      data = EData(
+        error = Some(ErrorLog(pdata_id = Producer.validator, pdata_status = StatusCode.failed, error_type = functionalError, error_code = ErrorConstants.SCHEMA_VALIDATION_FAILED.errorCode, error_message = ErrorConstants.SCHEMA_VALIDATION_FAILED.errorMsg, error_level = ErrorLevel.warn, error_count = Some(failedCount))),
+        pipeline_stats = None
+      )
+    ))
+  }
+
+  private def generateSystemEvents(dataset: Dataset, validationFailureMsgs: List[ValidationMsg], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context): Unit = {
+
+    val reqFailedCount = validationFailureMsgs.count(f => "required".equals(f.keyword))
+    val typeFailedCount = validationFailureMsgs.count(f => "type".equals(f.keyword))
+    val addTypeFailedCount = validationFailureMsgs.count(f => "additionalProperties".equals(f.keyword))
+    val unknownFailureCount = validationFailureMsgs.count(f => !List("type","required","additionalProperties").contains(f.keyword))
+    if (reqFailedCount > 0) {
+      context.output(config.systemEventsOutputTag, getSystemEvent(dataset, FunctionalError.RequiredFieldsMissing, reqFailedCount))
+    }
+    if (typeFailedCount > 0) {
+      context.output(config.systemEventsOutputTag, getSystemEvent(dataset, FunctionalError.DataTypeMismatch, typeFailedCount))
+    }
+    if (addTypeFailedCount > 0) {
+      context.output(config.systemEventsOutputTag, getSystemEvent(dataset, FunctionalError.AdditionalFieldsFound, addTypeFailedCount))
+    }
+    if (unknownFailureCount > 0) {
+      context.output(config.systemEventsOutputTag, getSystemEvent(dataset, FunctionalError.UnknownValidationError, unknownFailureCount))
+    }
+
+    // Log the validation failure messages
+    validationFailureMsgs.foreach(f => {
+      f.keyword match {
+        case "additionalProperties" =>
+          logger.warn(s"SchemaValidator | Additional properties found | dataset=${dataset.id} | ValidationMessage=${JSONUtil.serialize(f)}")
+        case "required" =>
+          logger.error(s"SchemaValidator | Required Fields Missing | dataset=${dataset.id} | ValidationMessage=${JSONUtil.serialize(f)}")
+        case "type" =>
+          logger.error(s"SchemaValidator | Data type mismatch found | dataset=${dataset.id} | ValidationMessage=${JSONUtil.serialize(f)}")
+        case _ =>
+          logger.warn(s"SchemaValidator | Unknown Validation errors found | dataset=${dataset.id} | ValidationMessage=${JSONUtil.serialize(f)}")
+      }
+ }) + } + + private def validationSuccess(dataset: Dataset, event: mutable.Map[String, AnyRef], metrics: Metrics, + context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context): Unit = { metrics.incCounter(dataset.id, config.validationSuccessMetricsCount) - context.output(config.validEventsOutputTag, markSuccess(event, "EventValidation")) + context.output(config.validEventsOutputTag, markSuccess(event, Producer.validator)) } - private def onValidationFailure(dataset: Dataset, event: mutable.Map[String, AnyRef], metrics: Metrics, - context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, - validationReport: ProcessingReport): Unit = { - val failedErrorMsg = schemaValidator.getInvalidFieldName(validationReport.toString) + private def validationFailure(dataset: Dataset, event: mutable.Map[String, AnyRef], metrics: Metrics, + context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, + validationFailureMsgs: List[ValidationMsg]): Unit = { metrics.incCounter(dataset.id, config.validationFailureMetricsCount) - context.output(config.invalidEventsOutputTag, markFailed(event, ErrorConstants.SCHEMA_VALIDATION_FAILED, "EventValidation")) - val systemEvent = SystemEvent(PData(config.jobName, "flink", "validation"), Map("error_code" -> ErrorConstants.SCHEMA_VALIDATION_FAILED.errorCode, "error_msg" -> failedErrorMsg)) - context.output(config.systemEventsOutputTag, JSONUtil.serialize(systemEvent)) + context.output(config.invalidEventsOutputTag, markFailed(event, ErrorConstants.SCHEMA_VALIDATION_FAILED, Producer.validator)) + generateSystemEvents(dataset, validationFailureMsgs, context) } } \ No newline at end of file diff --git a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/task/PipelinePreprocessorConfig.scala b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/task/PipelinePreprocessorConfig.scala index 23cc578c..784b92b7 100644 --- a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/task/PipelinePreprocessorConfig.scala +++ b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/task/PipelinePreprocessorConfig.scala @@ -20,36 +20,33 @@ class PipelinePreprocessorConfig(override val config: Config) extends BaseJobCon // Kafka Topic Configuration val kafkaInputTopic: String = config.getString("kafka.input.topic") - val kafkaFailedTopic: String = config.getString("kafka.output.failed.topic") val kafkaInvalidTopic: String = config.getString("kafka.output.invalid.topic") val kafkaUniqueTopic: String = config.getString("kafka.output.unique.topic") val kafkaDuplicateTopic: String = config.getString("kafka.output.duplicate.topic") // Validation & dedup Stream out put tag - val failedEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") val invalidEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("invalid-events") val validEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("valid-events") val uniqueEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("unique-events") val duplicateEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("duplicate-events") // Validation job metrics - val validationTotalMetricsCount = "validation-total-event-count" - val validationSuccessMetricsCount = 
"validation-success-event-count" - val validationFailureMetricsCount = "validation-failed-event-count" - val eventFailedMetricsCount = "failed-event-count" - val validationSkipMetricsCount = "validation-skipped-event-count" + val validationTotalMetricsCount = "validator-total-count" + val validationSuccessMetricsCount = "validator-success-count" + val validationFailureMetricsCount = "validator-failed-count" + val validationSkipMetricsCount = "validator-skipped-count" + val eventIgnoredMetricsCount = "validator-ignored-count" - val duplicationTotalMetricsCount = "duplicate-total-count" - val duplicationEventMetricsCount = "duplicate-event-count" - val duplicationSkippedEventMetricsCount = "duplicate-skipped-event-count" - val duplicationProcessedEventMetricsCount = "duplicate-processed-event-count" + val duplicationTotalMetricsCount = "dedup-total-count" + val duplicationEventMetricsCount = "dedup-failed-count" + val duplicationSkippedEventMetricsCount = "dedup-skipped-count" + val duplicationProcessedEventMetricsCount = "dedup-success-count" // Consumers val validationConsumer = "validation-consumer" val dedupConsumer = "deduplication-consumer" // Producers - val failedEventProducer = "failed-events-sink" val invalidEventProducer = "invalid-events-sink" val duplicateEventProducer = "duplicate-events-sink" val uniqueEventProducer = "unique-events-sink" @@ -58,5 +55,7 @@ class PipelinePreprocessorConfig(override val config: Config) extends BaseJobCon override def inputConsumer(): String = validationConsumer + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") + override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = uniqueEventsOutputTag } diff --git a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/task/PipelinePreprocessorStreamTask.scala b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/task/PipelinePreprocessorStreamTask.scala index 04b66c8c..fa941d64 100644 --- a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/task/PipelinePreprocessorStreamTask.scala +++ b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/task/PipelinePreprocessorStreamTask.scala @@ -1,7 +1,6 @@ package org.sunbird.obsrv.preprocessor.task import com.typesafe.config.ConfigFactory -import org.apache.flink.api.common.eventtime.WatermarkStrategy import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.java.typeutils.TypeExtractor import org.apache.flink.api.java.utils.ParameterTool @@ -44,20 +43,21 @@ class PipelinePreprocessorStreamTask(config: PipelinePreprocessorConfig, kafkaCo /** * Sink for invalid events, duplicate events and system events */ - validStream.getSideOutput(config.failedEventsOutputTag).sinkTo(kafkaConnector.kafkaMapSink(config.kafkaFailedTopic)) + validStream.getSideOutput(config.failedEventsOutputTag()).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaFailedTopic)) .name(config.failedEventProducer).uid(config.failedEventProducer).setParallelism(config.downstreamOperatorsParallelism) - validStream.getSideOutput(config.systemEventsOutputTag).sinkTo(kafkaConnector.kafkaStringSink(config.kafkaSystemTopic)) + validStream.getSideOutput(config.systemEventsOutputTag).sinkTo(kafkaConnector.kafkaSink[String](config.kafkaSystemTopic)) .name(config.validationConsumer + "-" + config.systemEventsProducer).uid(config.validationConsumer + "-" + 
config.systemEventsProducer).setParallelism(config.downstreamOperatorsParallelism) - validStream.getSideOutput(config.invalidEventsOutputTag).sinkTo(kafkaConnector.kafkaMapSink(config.kafkaInvalidTopic)) + validStream.getSideOutput(config.invalidEventsOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaInvalidTopic)) .name(config.invalidEventProducer).uid(config.invalidEventProducer).setParallelism(config.downstreamOperatorsParallelism) - uniqueStream.getSideOutput(config.duplicateEventsOutputTag).sinkTo(kafkaConnector.kafkaMapSink(config.kafkaDuplicateTopic)) + uniqueStream.getSideOutput(config.duplicateEventsOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaDuplicateTopic)) .name(config.duplicateEventProducer).uid(config.duplicateEventProducer).setParallelism(config.downstreamOperatorsParallelism) - uniqueStream.getSideOutput(config.systemEventsOutputTag).sinkTo(kafkaConnector.kafkaStringSink(config.kafkaSystemTopic)) + uniqueStream.getSideOutput(config.systemEventsOutputTag).sinkTo(kafkaConnector.kafkaSink[String](config.kafkaSystemTopic)) .name(config.dedupConsumer + "-" + config.systemEventsProducer).uid(config.dedupConsumer + "-" + config.systemEventsProducer).setParallelism(config.downstreamOperatorsParallelism) - uniqueStream.getSideOutput(config.successTag()).sinkTo(kafkaConnector.kafkaMapSink(config.kafkaUniqueTopic)) + uniqueStream.getSideOutput(config.successTag()).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaUniqueTopic)) .name(config.uniqueEventProducer).uid(config.uniqueEventProducer).setParallelism(config.downstreamOperatorsParallelism) + addDefaultSinks(uniqueStream, config, kafkaConnector) uniqueStream.getSideOutput(config.successTag()) } } diff --git a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/util/SchemaValidator.scala b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/util/SchemaValidator.scala index 9682ae71..6d725a1e 100644 --- a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/util/SchemaValidator.scala +++ b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/util/SchemaValidator.scala @@ -9,48 +9,60 @@ import org.sunbird.obsrv.core.exception.ObsrvException import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.util.JSONUtil import org.sunbird.obsrv.model.DatasetModels.Dataset -import org.sunbird.obsrv.preprocessor.task.PipelinePreprocessorConfig import java.io.IOException import scala.collection.mutable -class SchemaValidator(config: PipelinePreprocessorConfig) extends java.io.Serializable { +case class Schema(loadingURI: String, pointer: String) + +case class Instance(pointer: String) + +case class ValidationMsg(level: String, schema: Schema, instance: Instance, domain: String, keyword: String, message: String, allowed: Option[String], + found: Option[String], expected: Option[List[String]], unwanted: Option[List[String]], required: Option[List[String]], missing: Option[List[String]]) + +class SchemaValidator() extends java.io.Serializable { private val serialVersionUID = 8780940932759659175L private[this] val logger = LoggerFactory.getLogger(classOf[SchemaValidator]) private[this] val schemaMap = mutable.Map[String, (JsonSchema, Boolean)]() - def loadDataSchemas(datasets: List[Dataset]) = { + def loadDataSchemas(datasets: List[Dataset]): Unit = { datasets.foreach(dataset => { - if(dataset.jsonSchema.isDefined) { + if (dataset.jsonSchema.isDefined) { try { 
loadJsonSchema(dataset.id, dataset.jsonSchema.get) } catch { - case ex: ObsrvException => ex.printStackTrace() - schemaMap.put(dataset.id, (null, false)) + case _: ObsrvException => schemaMap.put(dataset.id, (null, false)) } } }) } + def loadDataSchema(dataset: Dataset): Any = { + if (!schemaMap.contains(dataset.id) && dataset.jsonSchema.isDefined) { + try { + loadJsonSchema(dataset.id, dataset.jsonSchema.get) + } catch { + case _: ObsrvException => schemaMap.put(dataset.id, (null, false)) + } + } + } + private def loadJsonSchema(datasetId: String, jsonSchemaStr: String) = { val schemaFactory = JsonSchemaFactory.byDefault try { val jsonSchema = schemaFactory.getJsonSchema(JsonLoader.fromString(jsonSchemaStr)) + jsonSchema.validate(JSONUtil.convertValue(Map("pqr" -> "value"))) // Test validate to check if Schema is valid schemaMap.put(datasetId, (jsonSchema, true)) } catch { case ex: Exception => - logger.error("SchemaValidator:loadJsonSchema() - Exception", ex) + logger.error(s"SchemaValidator:loadJsonSchema() - Unable to parse the schema json for dataset: $datasetId", ex) throw new ObsrvException(ErrorConstants.INVALID_JSON_SCHEMA) } } def schemaFileExists(dataset: Dataset): Boolean = { - - if (dataset.jsonSchema.isEmpty) { - throw new ObsrvException(ErrorConstants.JSON_SCHEMA_NOT_FOUND) - } - schemaMap.get(dataset.id).map(f => f._2).orElse(Some(false)).get + schemaMap.get(dataset.id).map(f => f._2).orElse(Some(false)).get } @throws[IOException] @@ -59,20 +71,13 @@ class SchemaValidator(config: PipelinePreprocessorConfig) extends java.io.Serial schemaMap(datasetId)._1.validate(JSONUtil.convertValue(event)) } - def getInvalidFieldName(errorInfo: String): String = { - val message = errorInfo.split("reports:") - val defaultValidationErrMsg = "Unable to obtain field name for failed validation" - if (message.length > 1) { - val fields = message(1).split(",") - if (fields.length > 2) { - val pointer = fields(3).split("\"pointer\":") - pointer(1).substring(0, pointer(1).length - 1) - } else { - defaultValidationErrMsg - } - } else { - defaultValidationErrMsg - } + def getValidationMessages(report: ProcessingReport): List[ValidationMsg] = { + val buffer = mutable.Buffer[ValidationMsg]() + report.forEach(processingMsg => { + buffer.append(JSONUtil.deserialize[ValidationMsg](JSONUtil.serialize(processingMsg.asJson()))) + }) + buffer.toList } + } // $COVERAGE-ON$ diff --git a/pipeline/preprocessor/src/test/resources/test.conf b/pipeline/preprocessor/src/test/resources/test.conf index dc5734f9..cc68631a 100644 --- a/pipeline/preprocessor/src/test/resources/test.conf +++ b/pipeline/preprocessor/src/test/resources/test.conf @@ -2,10 +2,9 @@ include "base-test.conf" kafka { input.topic = "flink.raw" - output.failed.topic = "flink.failed" - output.invalid.topic = "flink.invalid" + output.invalid.topic = "flink.failed" output.unique.topic = "flink.unique" - output.duplicate.topic = "flink.duplicate" + output.duplicate.topic = "flink.failed" groupId = "flink-pipeline-preprocessor-group" } diff --git a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala index 99ec39ec..d48e720d 100644 --- a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala +++ b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala @@ -8,8 +8,10 @@ import 
org.apache.flink.test.util.MiniClusterWithClientResource import org.apache.kafka.common.serialization.StringDeserializer import org.scalatest.Matchers._ import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.Models.SystemEvent import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector -import org.sunbird.obsrv.core.util.FlinkUtil +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} import org.sunbird.obsrv.preprocessor.fixture.EventFixtures import org.sunbird.obsrv.preprocessor.task.{PipelinePreprocessorConfig, PipelinePreprocessorStreamTask} import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry @@ -49,6 +51,7 @@ class PipelinePreprocessorStreamTestSpec extends BaseSpecWithDatasetRegistry { super.beforeAll() BaseMetricsReporter.gaugeMetrics.clear() EmbeddedKafka.start()(embeddedKafkaConfig) + prepareTestData() createTestTopics() EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.VALID_EVENT) EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.INVALID_EVENT) @@ -57,11 +60,32 @@ class PipelinePreprocessorStreamTestSpec extends BaseSpecWithDatasetRegistry { EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.INVALID_DATASET_EVENT) EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.INVALID_EVENT_KEY) EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.VALID_EVENT_DEDUP_CONFIG_NONE) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.INVALID_EVENT_2) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.EVENT_WITH_ADDL_PROPS_STRICT_MODE) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.EVENT_WITH_ADDL_PROPS_ALLOW_MODE) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.EVENT_WITH_ADDL_PROPS_IGNORE_MODE) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.IGNORED_EVENT) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.EVENT_WITH_UNKNOWN_VALIDATION_ERR) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.EVENT_WITH_EMPTY_SCHEMA) + EmbeddedKafka.publishStringMessageToKafka(pConfig.kafkaInputTopic, EventFixtures.DEDUP_KEY_MISSING) flinkCluster.before() } + private def prepareTestData(): Unit = { + val postgresConnect = new PostgresConnect(postgresConfig) + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', 
'{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Draft', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d4', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d5', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"IgnoreNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d6', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string","maxLength":5},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"DiscardNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d7', 
'dataset', '"+EventFixtures.INVALID_SCHEMA+"', '{\"validate\": true, \"mode\": \"Strict\"}','{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d8', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": false, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());") + postgresConnect.closeConnection() + } + override def afterAll(): Unit = { + val redisConnection = new RedisConnect(pConfig.redisHost, pConfig.redisPort, pConfig.redisConnectionTimeout) + redisConnection.getConnection(config.getInt("redis.database.preprocessor.duplication.store.id")).flushAll() super.afterAll() flinkCluster.after() EmbeddedKafka.stop() @@ -69,8 +93,7 @@ class PipelinePreprocessorStreamTestSpec extends BaseSpecWithDatasetRegistry { def createTestTopics(): Unit = { List( - pConfig.kafkaInputTopic, pConfig.kafkaInvalidTopic, pConfig.kafkaSystemTopic, - pConfig.kafkaDuplicateTopic, pConfig.kafkaUniqueTopic + pConfig.kafkaInputTopic, pConfig.kafkaInvalidTopic, pConfig.kafkaSystemTopic, pConfig.kafkaDuplicateTopic, pConfig.kafkaUniqueTopic ).foreach(EmbeddedKafka.createCustomTopic(_)) } @@ -83,27 +106,105 @@ class PipelinePreprocessorStreamTestSpec extends BaseSpecWithDatasetRegistry { env.execute(pConfig.jobName) Thread.sleep(5000) } - //val extractorFailed = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.input.topic"), 2, timeout = 60.seconds) - val uniqueEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](pConfig.kafkaUniqueTopic, 1, timeout = 60.seconds) - uniqueEvents.foreach(Console.println("Event:", _)) + val outputEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](pConfig.kafkaUniqueTopic, 5, timeout = 30.seconds) + val invalidEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](pConfig.kafkaInvalidTopic, 7, timeout = 30.seconds) + val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](pConfig.kafkaSystemTopic, 8, timeout = 30.seconds) - val mutableMetricsMap = mutable.Map[String, Long](); - val metricsMap = 
BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + validateOutputEvents(outputEvents) + validateInvalidEvents(invalidEvents) + validateSystemEvents(systemEvents) - mutableMetricsMap(s"${pConfig.jobName}.ALL.${pConfig.validationTotalMetricsCount}") should be (7) - mutableMetricsMap(s"${pConfig.jobName}.ALL.${pConfig.eventFailedMetricsCount}") should be (2) - mutableMetricsMap(s"${pConfig.jobName}.ALL.${pConfig.duplicationTotalMetricsCount}") should be (3) + val mutableMetricsMap = mutable.Map[String, Long]() + BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + Console.println("### PipelinePreprocessorStreamTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + validateMetrics(mutableMetricsMap) - mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.validationFailureMetricsCount}") should be (1) - mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.duplicationProcessedEventMetricsCount}") should be (1) - mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.duplicationEventMetricsCount}") should be (1) - mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.validationSuccessMetricsCount}") should be (2) + } - mutableMetricsMap(s"${pConfig.jobName}.d2.${pConfig.duplicationSkippedEventMetricsCount}") should be (1) - mutableMetricsMap(s"${pConfig.jobName}.d2.${pConfig.validationSkipMetricsCount}") should be (1) - mutableMetricsMap(s"${pConfig.jobName}.d2.${pConfig.eventFailedMetricsCount}") should be (1) + private def validateOutputEvents(outputEvents: List[String]): Unit = { + outputEvents.size should be(5) + outputEvents.foreach(f => println("OutputEvent", f)) + /* + (OutputEvent,{"event":{"dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"obsrv_meta":{"flags":{"validator":"success","dedup":"success"},"syncts":1701772208183,"prevProcessingTime":1701772214928,"error":{},"processingStartTime":1701772214321,"timespans":{"validator":590,"dedup":17}},"dataset":"d1"}) + (OutputEvent,{"event":{"dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1235","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"obsrv_meta":{"flags":{"validator":"skipped"},"syncts":1701772208476,"prevProcessingTime":1701772215544,"error":{},"processingStartTime":1701772214544,"timespans":{"validator":1000}},"dataset":"d2"}) + (OutputEvent,{"event":{"dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19,"deliveriesRejected":1}},"obsrv_meta":{"flags":{"validator":"success"},"syncts":1701772208577,"prevProcessingTime":1701772215613,"error":{},"processingStartTime":1701772214561,"timespans":{"validator":1052}},"dataset":"d5"}) + 
(OutputEvent,{"event":{"dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19,"deliveriesRejected":1}},"obsrv_meta":{"flags":{"validator":"success"},"syncts":1701772208597,"prevProcessingTime":1701772215623,"error":{},"processingStartTime":1701772214562,"timespans":{"validator":1061}},"dataset":"d6"}) + (OutputEvent,{"event":{"dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19,"deliveriesRejected":1}},"obsrv_meta":{"flags":{"validator":"skipped"},"syncts":1701772208676,"prevProcessingTime":1701772215637,"error":{},"processingStartTime":1701772214563,"timespans":{"validator":1074}},"dataset":"d7"}) + */ + } + private def validateInvalidEvents(invalidEvents: List[String]): Unit = { + invalidEvents.size should be(7) + /* + (invalid,{"event":"{\"event\":{\"id\":\"1234\",\"date\":\"2023-03-01\",\"dealer\":{\"dealerCode\":\"KUNUnited\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d1\"}","obsrv_meta":{"flags":{"validator":"failed"},"syncts":1701429101820,"prevProcessingTime":1701429108259,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"validator"},"error_code":"ERR_PP_1013","error_msg":"Event failed the schema validation"},"processingStartTime":1701429107624,"timespans":{"validator":635}},"dataset":"d1"}) + (invalid,{"event":"{\"event\":{\"dealer\":{\"dealerCode\":\"KUNUnited\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d1\"}","obsrv_meta":{"flags":{"validator":"success","dedup":"failed"},"syncts":1701429101860,"prevProcessingTime":1701429108501,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"dedup"},"error_code":"ERR_PP_1010","error_msg":"Duplicate event found"},"processingStartTime":1701429107625,"timespans":{"validator":873,"dedup":3}},"dataset":"d1"}) + (invalid,{"event":"{\"event\":{\"dealer\":{\"dealerCode\":\"KUNUnited\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}}}","obsrv_meta":{"flags":{"validator":"failed"},"syncts":1701429101886,"prevProcessingTime":1701429108528,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"validator"},"error_code":"ERR_EXT_1004","error_msg":"Dataset Id is missing from the data"},"processingStartTime":1701429107625,"timespans":{"validator":903}}}) + 
(invalid,{"event":"{\"event\":{\"dealer\":{\"dealerCode\":\"KUNUnited\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"dX\"}","obsrv_meta":{"flags":{"validator":"failed"},"syncts":1701429101927,"prevProcessingTime":1701429108583,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"validator"},"error_code":"ERR_EXT_1005","error_msg":"Dataset configuration is missing"},"processingStartTime":1701429107626,"timespans":{"validator":957}},"dataset":"dX"}) + (invalid,{"event1":{"dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"event":"{\"event1\":{\"dealer\":{\"dealerCode\":\"KUNUnited\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d2\"}","obsrv_meta":{"flags":{"validator":"failed"},"syncts":1701429101961,"prevProcessingTime":1701429108586,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"validator"},"error_code":"ERR_EXT_1006","error_msg":"Event missing in the batch event"},"processingStartTime":1701429107627,"timespans":{"validator":959}},"dataset":"d2"}) + (invalid,{"event":"{\"event\":{\"dealer\":{\"dealerCode\":\"KUNUnited\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":[\"HYUN-CRE-D6\"],\"id\":1234,\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d4\"}","obsrv_meta":{"flags":{"validator":"failed"},"syncts":1701429102063,"prevProcessingTime":1701429108633,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"validator"},"error_code":"ERR_PP_1013","error_msg":"Event failed the schema validation"},"processingStartTime":1701429107631,"timespans":{"validator":1002}},"dataset":"d4"}) + (invalid,{"event":"{\"event\":{\"dealer\":{\"dealerCode\":\"KUNUnited\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19,\"deliveriesRejected\":1}},\"dataset\":\"d4\"}","obsrv_meta":{"flags":{"validator":"failed"},"syncts":1701429102092,"prevProcessingTime":1701429108661,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"validator"},"error_code":"ERR_PP_1013","error_msg":"Event failed the schema validation"},"processingStartTime":1701429107638,"timespans":{"validator":1023}},"dataset":"d4"}) + */ } + private def validateSystemEvents(systemEvents: List[String]): Unit = { + systemEvents.size should be(8) + + systemEvents.foreach(se => { + val event = JSONUtil.deserialize[SystemEvent](se) + if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) + event.ctx.dataset_type should be(None) + else + event.ctx.dataset_type.getOrElse("dataset") should be("dataset") + }) + /* + (SysEvent:,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"PipelinePreprocessorJob","type":"flink","pid":"validator"},"dataset":"d1", 
"dataset_type": "dataset"},"data":{"error":{"pdata_id":"validator","pdata_status":"failed","error_type":"RequiredFieldsMissing","error_code":"ERR_PP_1013","error_message":"Event failed the schema validation","error_level":"warn","error_count":1}},"ets":1701428460664}) + (SysEvent:,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"PipelinePreprocessorJob","type":"flink","pid":"validator"},"dataset":"ALL"},"data":{"error":{"pdata_id":"validator","pdata_status":"failed","error_type":"MissingDatasetId","error_code":"ERR_EXT_1004","error_message":"Dataset Id is missing from the data","error_level":"critical","error_count":1},"pipeline_stats":{"validator_status":"failed","validator_time":874}},"ets":1701428460889}) + (SysEvent:,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"PipelinePreprocessorJob","type":"flink","pid":"validator"},"dataset":"dX", "dataset_type": "dataset"},"data":{"error":{"pdata_id":"validator","pdata_status":"failed","error_type":"MissingDatasetId","error_code":"ERR_EXT_1005","error_message":"Dataset configuration is missing","error_level":"critical","error_count":1},"pipeline_stats":{"validator_status":"failed","validator_time":924}},"ets":1701428460927}) + (SysEvent:,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"PipelinePreprocessorJob","type":"flink","pid":"validator"},"dataset":"d2", "dataset_type": "dataset"},"data":{"error":{"pdata_id":"validator","pdata_status":"failed","error_type":"MissingEventData","error_code":"ERR_EXT_1006","error_message":"Event missing in the batch event","error_level":"critical","error_count":1},"pipeline_stats":{"validator_status":"failed","validator_time":925}},"ets":1701428460935}) + (SysEvent:,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"PipelinePreprocessorJob","type":"flink","pid":"validator"},"dataset":"d4", "dataset_type": "dataset"},"data":{"error":{"pdata_id":"validator","pdata_status":"failed","error_type":"DataTypeMismatch","error_code":"ERR_PP_1013","error_message":"Event failed the schema validation","error_level":"warn","error_count":2}},"ets":1701428460987}) + (SysEvent:,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"PipelinePreprocessorJob","type":"flink","pid":"validator"},"dataset":"d4", "dataset_type": "dataset"},"data":{"error":{"pdata_id":"validator","pdata_status":"failed","error_type":"AdditionalFieldsFound","error_code":"ERR_PP_1013","error_message":"Event failed the schema validation","error_level":"warn","error_count":0}},"ets":1701428461010}) + (SysEvent:,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"PipelinePreprocessorJob","type":"flink","pid":"validator"},"dataset":"d6", "dataset_type": "dataset"},"data":{"error":{"pdata_id":"validator","pdata_status":"failed","error_type":"AdditionalFieldsFound","error_code":"ERR_PP_1013","error_message":"Event failed the schema validation","error_level":"warn","error_count":0}},"ets":1701428461064}) + */ + } + + private def validateMetrics(mutableMetricsMap: mutable.Map[String, Long]): Unit = { + mutableMetricsMap(s"${pConfig.jobName}.ALL.${pConfig.eventFailedMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.dX.${pConfig.eventFailedMetricsCount}") should be(1) + + mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.validationFailureMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.duplicationProcessedEventMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.duplicationEventMetricsCount}") should be(1) + 
mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.validationSuccessMetricsCount}") should be(2) + mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.validationTotalMetricsCount}") should be(3) + mutableMetricsMap(s"${pConfig.jobName}.d1.${pConfig.duplicationTotalMetricsCount}") should be(2) + + mutableMetricsMap(s"${pConfig.jobName}.d2.${pConfig.duplicationSkippedEventMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d2.${pConfig.validationSkipMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d2.${pConfig.eventFailedMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d2.${pConfig.validationTotalMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d2.${pConfig.duplicationTotalMetricsCount}") should be(1) + + mutableMetricsMap(s"${pConfig.jobName}.d3.${pConfig.validationTotalMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d3.${pConfig.eventIgnoredMetricsCount}") should be(1) + + mutableMetricsMap(s"${pConfig.jobName}.d4.${pConfig.validationTotalMetricsCount}") should be(2) + mutableMetricsMap(s"${pConfig.jobName}.d4.${pConfig.validationFailureMetricsCount}") should be(2) + + mutableMetricsMap(s"${pConfig.jobName}.d5.${pConfig.validationTotalMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d5.${pConfig.validationSuccessMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d5.${pConfig.duplicationTotalMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d5.${pConfig.duplicationSkippedEventMetricsCount}") should be(1) + + mutableMetricsMap(s"${pConfig.jobName}.d6.${pConfig.validationTotalMetricsCount}") should be(2) + mutableMetricsMap(s"${pConfig.jobName}.d6.${pConfig.validationSuccessMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d6.${pConfig.validationFailureMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d6.${pConfig.duplicationTotalMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d6.${pConfig.duplicationSkippedEventMetricsCount}") should be(1) + + mutableMetricsMap(s"${pConfig.jobName}.d8.${pConfig.validationTotalMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d8.${pConfig.validationSkipMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d8.${pConfig.duplicationTotalMetricsCount}") should be(1) + mutableMetricsMap(s"${pConfig.jobName}.d8.${pConfig.duplicationProcessedEventMetricsCount}") should be(1) + } -} +} \ No newline at end of file diff --git a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala index d5e5e336..0ba13d65 100644 --- a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala +++ b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala @@ -2,9 +2,9 @@ package org.sunbird.obsrv.preprocessor import com.typesafe.config.{Config, ConfigFactory} import org.scalatest.{FlatSpec, Matchers} -import org.sunbird.obsrv.core.exception.ObsrvException import org.sunbird.obsrv.core.util.JSONUtil import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig, RouterConfig} +import org.sunbird.obsrv.model.DatasetStatus import org.sunbird.obsrv.preprocessor.fixture.EventFixtures import org.sunbird.obsrv.preprocessor.task.PipelinePreprocessorConfig import org.sunbird.obsrv.preprocessor.util.SchemaValidator @@ -13,12 
+13,12 @@ class TestSchemaValidator extends FlatSpec with Matchers { val config: Config = ConfigFactory.load("test.conf") val pipelineProcessorConfig = new PipelinePreprocessorConfig(config) - val schemaValidator = new SchemaValidator(pipelineProcessorConfig) + val schemaValidator = new SchemaValidator() "SchemaValidator" should "return a success report for a valid event" in { - val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), "Active") - schemaValidator.loadDataSchemas(List(dataset)) + val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + schemaValidator.loadDataSchema(dataset) val event = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.VALID_SCHEMA_EVENT) val report = schemaValidator.validate("d1", event) @@ -27,25 +27,73 @@ class TestSchemaValidator extends FlatSpec with Matchers { it should "return a failed validation report for a invalid event" in { - val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), "Active") - schemaValidator.loadDataSchemas(List(dataset)) + val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + schemaValidator.loadDataSchema(dataset) - val event = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.INVALID_SCHEMA_EVENT) - val report = schemaValidator.validate("d1", event) - assert(!report.isSuccess) - assert(report.toString.contains("error: object has missing required properties ([\"vehicleCode\"])")) + val event1 = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.INVALID_SCHEMA_EVENT) + val report1 = schemaValidator.validate("d1", event1) + val messages1 = schemaValidator.getValidationMessages(report1) + assert(!report1.isSuccess) + assert(messages1.size == 1) + messages1.head.message should be("object has missing required properties ([\"vehicleCode\"])") + messages1.head.keyword should be("required") + messages1.head.missing.get.head should be ("vehicleCode") - val invalidFieldName = schemaValidator.getInvalidFieldName(report.toString) - invalidFieldName should be ("Unable to obtain field name for failed validation") - } + val event2 = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.INVALID_SCHEMA_EVENT2) + val report2 = schemaValidator.validate("d1", event2) + val messages2 = schemaValidator.getValidationMessages(report2) + assert(!report2.isSuccess) + assert(messages2.size == 2) + messages2.foreach(f => { + f.found.get match { + case "integer" => + f.message should be("instance type (integer) does not match any allowed primitive type (allowed: [\"string\"])") + f.instance.pointer should be("/id") + case "array" => + f.message should be("instance type (array) does not match any allowed primitive type (allowed: [\"string\"])") + f.instance.pointer should be ("/vehicleCode") + } + }) - it should "validate the negative scenarios" in { - val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), "Active") - schemaValidator.loadDataSchemas(List(dataset)) + val event3 = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.INVALID_SCHEMA_EVENT3) + val report3 = schemaValidator.validate("d1", event3) + 
val messages3 = schemaValidator.getValidationMessages(report3) + assert(!report3.isSuccess) + assert(messages3.size == 2) + messages3.foreach(f => { + f.keyword match { + case "type" => + f.message should be("instance type (integer) does not match any allowed primitive type (allowed: [\"string\"])") + f.instance.pointer should be("/id") + f.found.get should be ("integer") + f.expected.get.head should be("string") + case "additionalProperties" => + f.message should be("object instance has properties which are not allowed by the schema: [\"deliveriesRejected\"]") + f.instance.pointer should be("/metrics") + f.unwanted.get.head should be("deliveriesRejected") + } + }) + } - val dataset2 = Dataset("d1", "dataset", None, None, None, None, None, RouterConfig(""), DatasetConfig("id","date","ingest"), "Active") - an[ObsrvException] should be thrownBy schemaValidator.schemaFileExists(dataset2) + it should "validate the negative and missing scenarios" in { + val dataset = Dataset("d4", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA_JSON), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + schemaValidator.loadDataSchema(dataset) schemaValidator.schemaFileExists(dataset) should be (false) + + schemaValidator.loadDataSchema(dataset) + schemaValidator.schemaFileExists(dataset) should be(false) + + val dataset2 = Dataset("d5", "dataset", None, None, None, None, None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + schemaValidator.loadDataSchemas(List[Dataset](dataset2)) + schemaValidator.schemaFileExists(dataset2) should be (false) + + val dataset3 = Dataset("d6", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id", "date", "ingest"), DatasetStatus.Live) + + schemaValidator.loadDataSchemas(List[Dataset](dataset3)) + schemaValidator.schemaFileExists(dataset3) should be(false) + + val dataset4 = Dataset("d7", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id", "date", "ingest"), DatasetStatus.Live) + schemaValidator.schemaFileExists(dataset4) should be (false) } } diff --git a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/fixture/EventFixtures.scala b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/fixture/EventFixtures.scala index ef26b06b..432757bd 100644 --- a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/fixture/EventFixtures.scala +++ b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/fixture/EventFixtures.scala @@ -2,20 +2,32 @@ package org.sunbird.obsrv.preprocessor.fixture object EventFixtures { - val VALID_SCHEMA = """{"$schema":"https://json-schema.org/draft/2020-12/schema","id":"https://sunbird.obsrv.com/test.json","title":"Test Schema","description":"Test Schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"number"},"deliveriesPromised":{"type":"number"},"deliveriesDone":{"type":"number"}}}},"required":["id","vehicleCode","date","dealer","metrics"]}""" - val INVALID_SCHEMA = """{"$schema":"https://json-schema.org/draft/2020-12/schema","id":"https://sunbird.obsrv.com/test.json","title":"Test 
Schema","description":"Test Schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"number"},"deliveriesPromised":{"type":"number"},"deliveriesDone":{"type":"number"}}}},"required":["id","vehicleCode","date","dealer","metrics"}""" + val VALID_SCHEMA = """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + val INVALID_SCHEMA = """{"$schema":"https://json-schema.org/draft/2020-12/schema","id":"https://sunbird.obsrv.com/test.json","title":"Test Schema","description":"Test Schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"number"},"deliveriesPromised":{"type":"number"},"deliveriesDone":{"type":"number"}}}},"required":["id","vehicleCode","date","dealer","metrics"],"additionalProperties":"false"}""" + val INVALID_SCHEMA_JSON = """{"$schema":"https://json-schema.org/draft/2020-12/schema","id":"https://sunbird.obsrv.com/test.json","title":"Test Schema","description":"Test Schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"number"},"deliveriesPromised":{"type":"number"},"deliveriesDone":{"type":"number"}}}},"required":["id","vehicleCode","date","dealer","metrics"}""" val VALID_SCHEMA_EVENT = """{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""" val INVALID_SCHEMA_EVENT = """{"id":"1234","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""" + val INVALID_SCHEMA_EVENT2 = """{"id":1234,"vehicleCode":["HYUN-CRE-D6"],"date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""" + val INVALID_SCHEMA_EVENT3 = 
"""{"id":1234,"vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19,"deliveriesRejected":1}}""" val VALID_EVENT = """{"dataset":"d1","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val DEDUP_KEY_MISSING = """{"dataset":"d8","event":{"id1":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" val INVALID_EVENT = """{"dataset":"d1","event":{"id":"1234","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val INVALID_EVENT_2 = """{"dataset":"d4","event":{"id":1234,"vehicleCode":["HYUN-CRE-D6"],"date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val EVENT_WITH_ADDL_PROPS_STRICT_MODE = """{"dataset":"d4","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19,"deliveriesRejected":1}}}""" + val EVENT_WITH_ADDL_PROPS_ALLOW_MODE = """{"dataset":"d5","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19,"deliveriesRejected":1}}}""" + val EVENT_WITH_ADDL_PROPS_IGNORE_MODE = """{"dataset":"d6","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19,"deliveriesRejected":1}}}""" + val EVENT_WITH_UNKNOWN_VALIDATION_ERR = """{"dataset":"d6","event":{"id":"123456","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19,"deliveriesRejected":1}}}""" + val EVENT_WITH_EMPTY_SCHEMA = """{"dataset":"d7","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19,"deliveriesRejected":1}}}""" + val IGNORED_EVENT = """{"dataset":"d3","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" val DUPLICATE_EVENT = 
"""{"dataset":"d1","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" val MISSING_DATASET_EVENT = """{"event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" val INVALID_DATASET_EVENT = """{"dataset":"dX","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" val INVALID_EVENT_KEY = """{"dataset":"d2","event1":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" val VALID_EVENT_DEDUP_CONFIG_NONE = """{"dataset":"d2","event":{"id":"1235","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val VALID_EVENT_DRAFT_DATASET = """{"dataset":"d3","event":{"id":"1236","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" diff --git a/pipeline/transformer/pom.xml b/pipeline/transformer/pom.xml index 80d26b82..b695a812 100644 --- a/pipeline/transformer/pom.xml +++ b/pipeline/transformer/pom.xml @@ -62,9 +62,9 @@ tests - it.ozimov + com.github.codemonstur embedded-redis - 0.7.1 + 1.0.0 test diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala index 51750256..fb0da96c 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala @@ -2,44 +2,40 @@ package org.sunbird.obsrv.transformer.functions import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.streaming.api.functions.ProcessFunction -import org.sunbird.obsrv.core.streaming.{BaseProcessFunction, Metrics, MetricsList} +import org.sunbird.obsrv.core.model.Producer +import org.sunbird.obsrv.core.streaming.Metrics +import org.sunbird.obsrv.model.DatasetModels.Dataset import org.sunbird.obsrv.registry.DatasetRegistry +import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction import org.sunbird.obsrv.transformer.task.TransformerConfig import scala.collection.mutable class TransformerFunction(config: TransformerConfig)(implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]]) - extends BaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]](config) { + extends BaseDatasetProcessFunction(config) { - - override def getMetricsList(): MetricsList = { - val metrics = List(config.totalEventCount, config.transformSuccessCount, - 
config.transformFailedCount, config.transformSkippedCount) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + override def getMetrics(): List[String] = { + List(config.totalEventCount, config.transformSuccessCount, config.transformFailedCount, config.transformSkippedCount) } /** * Method to process the event transformations */ - override def processElement(msg: mutable.Map[String, AnyRef], + override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { - val datasetId = msg(config.CONST_DATASET).asInstanceOf[String] // DatasetId cannot be empty at this stage - metrics.incCounter(datasetId, config.totalEventCount) - - val datasetTransformations = DatasetRegistry.getDatasetTransformations(datasetId) - if(datasetTransformations.isDefined) { + metrics.incCounter(dataset.id, config.totalEventCount) + val datasetTransformations = DatasetRegistry.getDatasetTransformations(dataset.id) + if (datasetTransformations.isDefined) { // TODO: Perform transformations - metrics.incCounter(datasetId, config.transformSuccessCount) - context.output(config.transformerOutputTag, markSuccess(msg, config.jobName)) + metrics.incCounter(dataset.id, config.transformSuccessCount) + context.output(config.transformerOutputTag, markSuccess(msg, Producer.transformer)) } else { - metrics.incCounter(datasetId, config.transformSkippedCount) - context.output(config.transformerOutputTag, markSkipped(msg, config.jobName)) + metrics.incCounter(dataset.id, config.transformSkippedCount) + context.output(config.transformerOutputTag, markSkipped(msg, Producer.transformer)) } - } -} - +} \ No newline at end of file diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala index 24dc4292..797b3e56 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala @@ -15,12 +15,11 @@ class TransformerConfig(override val config: Config) extends BaseJobConfig[mutab implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) // Metric List - val totalEventCount = "total-event-count" + val totalEventCount = "transform-total-count" val transformSuccessCount = "transform-success-count" val transformFailedCount = "transform-failed-count" val transformSkippedCount = "transform-skipped-count" - val kafkaInputTopic: String = config.getString("kafka.input.topic") val kafkaTransformTopic: String = config.getString("kafka.output.transform.topic") val transformerFunction = "transformer-function" @@ -29,9 +28,11 @@ class TransformerConfig(override val config: Config) extends BaseJobConfig[mutab private val TRANSFORMER_OUTPUT_TAG = "transformed-events" val transformerOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](TRANSFORMER_OUTPUT_TAG) - override def inputTopic(): String = kafkaInputTopic + override def inputTopic(): String = config.getString("kafka.input.topic") override def inputConsumer(): String = "transformer-consumer" override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = transformerOutputTag + + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, 
AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") } diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala index f14771bf..71e86581 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala @@ -1,7 +1,6 @@ package org.sunbird.obsrv.transformer.task import com.typesafe.config.ConfigFactory -import org.apache.flink.api.common.eventtime.WatermarkStrategy import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.java.typeutils.TypeExtractor import org.apache.flink.api.java.utils.ParameterTool @@ -36,10 +35,10 @@ class TransformerStreamTask(config: TransformerConfig, kafkaConnector: FlinkKafk val transformedStream = dataStream.process(new TransformerFunction(config)).name(config.transformerFunction).uid(config.transformerFunction) .setParallelism(config.downstreamOperatorsParallelism) - transformedStream.getSideOutput(config.transformerOutputTag) - .sinkTo(kafkaConnector.kafkaMapSink(config.kafkaTransformTopic)) - .name(config.transformerProducer).uid(config.transformerProducer) - .setParallelism(config.downstreamOperatorsParallelism) + transformedStream.getSideOutput(config.transformerOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaTransformTopic)) + .name(config.transformerProducer).uid(config.transformerProducer).setParallelism(config.downstreamOperatorsParallelism) + + addDefaultSinks(transformedStream, config, kafkaConnector) transformedStream.getSideOutput(config.successTag()) } } diff --git a/pom.xml b/pom.xml index 86a0cc83..c8f53bd8 100644 --- a/pom.xml +++ b/pom.xml @@ -5,10 +5,6 @@ http://maven.apache.org/maven-v4_0_0.xsd"> 4.0.0 - - 3.0.0 - - org.sunbird.obsrv core 1.0 From 0a612c27a08755dda3d77123ccae8d55a52f7d67 Mon Sep 17 00:00:00 2001 From: Manoj Krishna <92361832+ManojKrishnaChintaluri@users.noreply.github.com> Date: Fri, 15 Dec 2023 19:18:09 +0530 Subject: [PATCH 16/37] update workflow file to skip tests (#45) --- .github/workflows/build_and_deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_deploy.yaml b/.github/workflows/build_and_deploy.yaml index 90b01883..48c610c9 100644 --- a/.github/workflows/build_and_deploy.yaml +++ b/.github/workflows/build_and_deploy.yaml @@ -48,7 +48,7 @@ jobs: fetch-depth: 0 - name: Maven Build run: | - mvn clean install + mvn clean install -DskipTests - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 From 8106fa218e63190ffc41f14e057e2f51ad46589a Mon Sep 17 00:00:00 2001 From: Manjunath Davanam Date: Tue, 19 Dec 2023 12:33:49 +0530 Subject: [PATCH 17/37] Release 1.3.1 into Main (#49) * testing new images * testing new images * testing new images * testing new images * testing new images * build new image with bug fixes * update dockerfile * update dockerfile * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * feat: update all failed, invalid and duplicate topic names * feat: update kafka topic names in test cases * #0 fix: add individual extraction * feat: update failed event * Update ErrorConstants.scala * feat: update failed event * Issue #0 fix: upgrade ubuntu packages for vulnerabilities * feat: add exception handling for json deserialization * Update 
BaseProcessFunction.scala
* Update BaseProcessFunction.scala
* feat: update batch failed event generation
* Update ExtractionFunction.scala
* feat: update invalid json exception handling
* Issue #46 feat: update batch failed event
* Issue #46 feat: update batch failed event
* Issue #46 feat: update batch failed event
* Issue #46 feat: update batch failed event
* Issue #46 fix: remove cloning object
* Issue #46 feat: update batch failed event
* #0 fix: update github actions release condition
* Issue #46 feat: add error reasons
* Issue #46 feat: add exception stack trace
* Issue #46 feat: add exception stack trace
* Release 1.3.1 Changes (#42)
* Dataset enhancements (#38)
* feat: add connector config and connector stats update functions
* Issue #33 feat: add documentation for Dataset, Datasources, Data In and Query APIs
* Update DatasetModels.scala
* #0 fix: upgrade packages
* #0 feat: add flink dockerfiles
* #0 fix: add individual extraction
---------
Co-authored-by: ManojKrishnaChintaluri
Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com>
Co-authored-by: Sowmya N Dixit
* #0000 [SV] - Fallback to local redis instance if embedded redis is not starting
* Update DatasetModels.scala
* #0000 - refactor the denormalization logic 1. Do not fail the denormalization if the denorm key is missing 2. Add clear message whether the denorm is successful or failed or partially successful 3. Handle denorm for both text and number fields
* #0000 - refactor: 1. Created an enum for dataset status and ignore events if the dataset is not in Live status 2. Created an outputtag for denorm failed stats 3. Parse event validation failed messages into a case class
* #0000 - refactor: 1. Updated the DruidRouter job to publish data to router topics dynamically 2. Updated framework to create dynamicKafkaSink object
* #0000 - mega refactoring: 1. Made calls to getAllDatasets and getAllDatasetSources to always query postgres 2. 
Proper exception handling for transformer job * #0000 - refactoring: Fix test cases and code * #0000 - refactoring: upgrade embedded redis to work with macos sonoma m2 * #0000 - refactoring: Denormalizer test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Router test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Validator test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Framework test cases and bug fixes * #0000 - refactoring: kafka connector test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: improve code coverage and fix bugs * #0000 - refactoring: improve code coverage and fix bugs --- Now the code coverage is 100% * #0000 - refactoring: organize imports * #0000 - refactoring: 1. transformer test cases and bug fixes - code coverage is 100% * #0000 - refactoring: test cases and bug fixes --------- Co-authored-by: shiva-rakshith Co-authored-by: Aniket Sakinala Co-authored-by: Manjunath Davanam Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: Sowmya N Dixit Co-authored-by: Anand Parthasarathy * #000:feat: Removed the provided scope of the kafka-client in the framework (#40) * #0000 - feat: Add dataset-type to system events (#41) * #0000 - feat: Add dataset-type to system events * #0000 - feat: Modify tests for dataset-type in system events * #0000 - feat: Remove unused getDatasetType function * #0000 - feat: Remove unused pom test dependencies * #0000 - feat: Remove unused pom test dependencies --------- Co-authored-by: Santhosh Co-authored-by: shiva-rakshith Co-authored-by: Aniket Sakinala Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: Sowmya N Dixit Co-authored-by: Anand Parthasarathy * Main conflicts fixes (#44) * feat: add connector config and connector stats update functions * Issue #33 feat: add documentation for Dataset, Datasources, Data In and Query APIs * Update DatasetModels.scala * Release 1.3.0 into Main branch (#34) * testing new images * testing new images * testing new images * testing new images * testing new images * build new image with bug fixes * update dockerfile * update dockerfile * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * #0 fix: add individual extraction * Issue #0 fix: upgrade ubuntu packages for vulnerabilities * #0 fix: update github actions release condition --------- Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: Sowmya N Dixit * Update DatasetModels.scala * Issue #2 feat: Remove kafka connector code * feat: add function to get all datasets * #000:feat: Resolve conflicts --------- Co-authored-by: shiva-rakshith Co-authored-by: Aniket Sakinala Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: Sowmya N Dixit Co-authored-by: Santhosh Co-authored-by: Anand Parthasarathy Co-authored-by: Ravi Mula * #0000 - fix: Fix null dataset_type in DruidRouterFunction (#48) --------- Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: shiva-rakshith Co-authored-by: Sowmya N Dixit Co-authored-by: Santhosh Co-authored-by: Aniket Sakinala Co-authored-by: Anand Parthasarathy Co-authored-by: Ravi Mula --- .../DenormalizerWindowStreamTaskTestSpec.scala | 11 ++++++++++- 
.../router/functions/DynamicRouterFunction.scala | 2 +- .../router/DynamicRouterStreamTaskTestSpec.scala | 11 ++++++++++- .../obsrv/extractor/ExtractorStreamTestSpec.scala | 2 +- .../MasterDataProcessorStreamTaskTestSpec.scala | 12 +++++++++++- .../PipelinePreprocessorStreamTestSpec.scala | 12 +++++++++++- 6 files changed, 44 insertions(+), 6 deletions(-) diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala index 5c3a5b86..52d06e8b 100644 --- a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala +++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala @@ -148,10 +148,19 @@ class DenormalizerWindowStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { systemEvents.foreach(se => { val event = JSONUtil.deserialize[SystemEvent](se) + val error = event.data.error if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) event.ctx.dataset_type should be(None) + else if (error.isDefined) { + val errorCode = error.get.error_code + if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) || + errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) || + errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) { + event.ctx.dataset_type should be(None) + } + } else - event.ctx.dataset_type.getOrElse("dataset") should be("dataset") + event.ctx.dataset_type should be(Some("dataset")) }) systemEvents.foreach(f => { diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala index 7b91b19f..0de7b0d7 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala +++ b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala @@ -50,7 +50,7 @@ class DynamicRouterFunction(config: DruidRouterConfig) extends BaseDatasetProces metrics.incCounter(dataset.id, config.routerSuccessCount) markCompletion(dataset, super.markComplete(event, dataset.dataVersion), ctx, Producer.router) } else { - markFailure(Some(dataset.id), msg, ctx, metrics, ErrorConstants.INDEX_KEY_MISSING_OR_BLANK, Producer.router, FunctionalError.MissingTimestampKey) + markFailure(Some(dataset.id), msg, ctx, metrics, ErrorConstants.INDEX_KEY_MISSING_OR_BLANK, Producer.router, FunctionalError.MissingTimestampKey, datasetType = Some(dataset.datasetType)) } } diff --git a/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala b/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala index 34cd47a4..0c45a555 100644 --- a/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala +++ b/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala @@ -120,10 +120,19 @@ class DynamicRouterStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { systemEvents.foreach(se => { val event = JSONUtil.deserialize[SystemEvent](se) + val error = event.data.error if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) event.ctx.dataset_type should be(None) + else if (error.isDefined) { + val errorCode = error.get.error_code + if 
(errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) || + errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) || + errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) { + event.ctx.dataset_type should be(None) + } + } else - event.ctx.dataset_type.getOrElse("dataset") should be("dataset") + event.ctx.dataset_type should be(Some("dataset")) }) systemEvents.foreach(f => { diff --git a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala index d72a8dcb..5bf3c431 100644 --- a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala +++ b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala @@ -140,7 +140,7 @@ class ExtractorStreamTestSpec extends BaseSpecWithDatasetRegistry { if(event.ctx.dataset.getOrElse("ALL").equals("ALL")) event.ctx.dataset_type should be(None) else - event.ctx.dataset_type.getOrElse("dataset") should be("dataset") + event.ctx.dataset_type should be(Some("dataset")) }) //TODO: Add assertions for all 6 events diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala index 9fe070d3..575e2228 100644 --- a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala +++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala @@ -9,6 +9,7 @@ import org.apache.kafka.common.serialization.StringDeserializer import org.scalatest.Matchers._ import org.sunbird.obsrv.BaseMetricsReporter import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.model.Models.SystemEvent import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} @@ -102,10 +103,19 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry sysEvents.foreach(se => { val event = JSONUtil.deserialize[SystemEvent](se) + val error = event.data.error if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) event.ctx.dataset_type should be(None) + else if (error.isDefined) { + val errorCode = error.get.error_code + if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) || + errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) || + errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) { + event.ctx.dataset_type should be(None) + } + } else - event.ctx.dataset_type.getOrElse("dataset") should be("master-dataset") + event.ctx.dataset_type should be(Some("master-dataset")) }) val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](masterDataConfig.kafkaFailedTopic, 1, timeout = 30.seconds) diff --git a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala index d48e720d..d111543b 100644 --- a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala +++ b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala @@ 
-9,6 +9,7 @@ import org.apache.kafka.common.serialization.StringDeserializer import org.scalatest.Matchers._ import org.sunbird.obsrv.BaseMetricsReporter import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.model.Models.SystemEvent import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} @@ -151,10 +152,19 @@ class PipelinePreprocessorStreamTestSpec extends BaseSpecWithDatasetRegistry { systemEvents.foreach(se => { val event = JSONUtil.deserialize[SystemEvent](se) + val error = event.data.error if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) event.ctx.dataset_type should be(None) + else if (error.isDefined) { + val errorCode = error.get.error_code + if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) || + errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) || + errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) { + event.ctx.dataset_type should be(None) + } + } else - event.ctx.dataset_type.getOrElse("dataset") should be("dataset") + event.ctx.dataset_type should be(Some("dataset")) }) /* (SysEvent:,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"PipelinePreprocessorJob","type":"flink","pid":"validator"},"dataset":"d1", "dataset_type": "dataset"},"data":{"error":{"pdata_id":"validator","pdata_status":"failed","error_type":"RequiredFieldsMissing","error_code":"ERR_PP_1013","error_message":"Event failed the schema validation","error_level":"warn","error_count":1}},"ets":1701428460664}) From e7949340faa158e38eb3ed451cc5713359788a52 Mon Sep 17 00:00:00 2001 From: Praveen Veleneni <66662436+pveleneni@users.noreply.github.com> Date: Tue, 26 Dec 2023 14:36:10 +0530 Subject: [PATCH 18/37] Develop to Release-1.0.0-GA (#52) (#53) * testing new images * testing new images * testing new images * testing new images * testing new images * build new image with bug fixes * update dockerfile * update dockerfile * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * feat: update all failed, invalid and duplicate topic names * feat: update kafka topic names in test cases * #0 fix: add individual extraction * feat: update failed event * Update ErrorConstants.scala * feat: update failed event * Issue #0 fix: upgrade ubuntu packages for vulnerabilities * feat: add exception handling for json deserialization * Update BaseProcessFunction.scala * Update BaseProcessFunction.scala * feat: update batch failed event generation * Update ExtractionFunction.scala * feat: update invalid json exception handling * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 fix: remove cloning object * Issue #46 feat: update batch failed event * #0 fix: update github actions release condition * Issue #46 feat: add error reasons * Issue #46 feat: add exception stack trace * Issue #46 feat: add exception stack trace * Dataset enhancements (#38) * feat: add connector config and connector stats update functions * Issue #33 feat: add documentation for Dataset, Datasources, Data In and Query APIs * Update DatasetModels.scala * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * #0 fix: add individual extraction --------- * #0000 [SV] - Fallback to local redis instance if embedded redis is not starting * Update DatasetModels.scala * #0000 - refactor the denormalization logic 1. 
Do not fail the denormalization if the denorm key is missing 2. Add clear message whether the denorm is successful or failed or partially successful 3. Handle denorm for both text and number fields
* #0000 - refactor: 1. Created an enum for dataset status and ignore events if the dataset is not in Live status 2. Created an outputtag for denorm failed stats 3. Parse event validation failed messages into a case class
* #0000 - refactor: 1. Updated the DruidRouter job to publish data to router topics dynamically 2. Updated framework to create dynamicKafkaSink object
* #0000 - mega refactoring: 1. Made calls to getAllDatasets and getAllDatasetSources to always query postgres 2. Created BaseDatasetProcessFunction for all flink functions to extend that would dynamically resolve dataset config, initialize metrics and handle common failures 3. Refactored serde - merged map and string serialization into one function and parameterized the function 4. Moved failed events sinking into a common base class 5. Master dataset processor can now do denormalization with another master dataset as well
* #0000 - mega refactoring: 1. Made calls to getAllDatasets and getAllDatasetSources to always query postgres 2. Created BaseDatasetProcessFunction for all flink functions to extend that would dynamically resolve dataset config, initialize metrics and handle common failures 3. Refactored serde - merged map and string serialization into one function and parameterized the function 4. Moved failed events sinking into a common base class 5. Master dataset processor can now do denormalization with another master dataset as well
* #0000 - mega refactoring: 1. Added validation to check if the event has a timestamp key and it is not blank nor invalid 2. Added timezone handling to store the data in druid in the TZ specified by the dataset
* #0000 - minor refactoring: Updated DatasetRegistry.getDatasetSourceConfig to getAllDatasetSourceConfig
* #0000 - mega refactoring: Refactored logs, error messages and metrics
* #0000 - mega refactoring: Fix unit tests
* #0000 - refactoring: 1. Introduced transformation mode to enable lenient transformations 2. Proper exception handling for transformer job
* #0000 - refactoring: Fix test cases and code
* #0000 - refactoring: upgrade embedded redis to work with macos sonoma m2
* #0000 - refactoring: Denormalizer test cases and bug fixes. Code coverage is 100% now
* #0000 - refactoring: Router test cases and bug fixes. Code coverage is 100% now
* #0000 - refactoring: Validator test cases and bug fixes. Code coverage is 100% now
* #0000 - refactoring: Framework test cases and bug fixes
* #0000 - refactoring: kafka connector test cases and bug fixes. Code coverage is 100% now
* #0000 - refactoring: improve code coverage and fix bugs
* #0000 - refactoring: improve code coverage and fix bugs --- Now the code coverage is 100% now
* #0000 - refactoring: organize imports
* #0000 - refactoring: 1. 
transformer test cases and bug fixes - code coverage is 100% * #0000 - refactoring: test cases and bug fixes --------- * #000:feat: Removed the provided scope of the kafka-client in the framework (#40) * #0000 - feat: Add dataset-type to system events (#41) * #0000 - feat: Add dataset-type to system events * #0000 - feat: Modify tests for dataset-type in system events * #0000 - feat: Remove unused getDatasetType function * #0000 - feat: Remove unused pom test dependencies * #0000 - feat: Remove unused pom test dependencies * #67 feat: query system configurations from meta store * #67 fix: Refactor system configuration retrieval and update dynamic router function * #67 fix: update system config according to review * #67 fix: update test cases for system config * #67 fix: update default values in test cases * #67 fix: add get all system settings method and update test cases * #67 fix: add test case for covering exception case * #67 fix: fix data types in test cases * #67 fix: Refactor event indexing in DynamicRouterFunction * Issue #67 refactor: SystemConfig read from DB implementation * #226 fix: update test cases according to the refactor --------- Co-authored-by: Manjunath Davanam Co-authored-by: ManojKrishnaChintaluri Co-authored-by: shiva-rakshith Co-authored-by: Sowmya N Dixit Co-authored-by: Santhosh Co-authored-by: Aniket Sakinala Co-authored-by: Anand Parthasarathy --- .../sunbird/obsrv/model/DatasetModels.scala | 2 +- .../{base-config.conf => baseconfig.conf} | 0 .../spec/BaseSpecWithDatasetRegistry.scala | 3 + .../obsrv/core/model/ErrorConstants.scala | 5 +- .../org/sunbird/obsrv/core/model/Models.scala | 6 +- .../obsrv/core/model/SystemConfig.scala | 116 +++++++++++++++++- .../obsrv/core/streaming/BaseJobConfig.scala | 2 +- .../core/streaming/BaseProcessFunction.scala | 12 +- .../sunbird/obsrv/core/util/JSONUtil.scala | 4 +- .../spec/BaseProcessFunctionTestSpec.scala | 17 ++- .../sunbird/spec/BaseProcessTestConfig.scala | 2 +- .../sunbird/spec/BaseSpecWithPostgres.scala | 13 ++ .../org/sunbird/spec/SystemConfigSpec.scala | 114 +++++++++++++++++ .../functions/DynamicRouterFunction.scala | 5 +- .../extractor/task/ExtractorConfig.scala | 4 +- .../extractor/ExtractorStreamTestSpec.scala | 2 +- 16 files changed, 280 insertions(+), 27 deletions(-) rename dataset-registry/src/test/resources/{base-config.conf => baseconfig.conf} (100%) create mode 100644 framework/src/test/scala/org/sunbird/spec/SystemConfigSpec.scala diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index ce0279b6..49cc51bc 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -20,7 +20,7 @@ object DatasetModels { case class DedupConfig(@JsonProperty("drop_duplicates") dropDuplicates: Option[Boolean] = Some(false), @JsonProperty("dedup_key") dedupKey: Option[String], - @JsonProperty("dedup_period") dedupPeriod: Option[Integer] = Some(SystemConfig.defaultDedupPeriodInSeconds)) + @JsonProperty("dedup_period") dedupPeriod: Option[Integer] = Some(SystemConfig.getInt("defaultDedupPeriodInSeconds", 604800))) case class ValidationConfig(@JsonProperty("validate") validate: Option[Boolean] = Some(true), @JsonProperty("mode") @JsonScalaEnumeration(classOf[ValidationModeType]) mode: Option[ValidationMode]) diff --git a/dataset-registry/src/test/resources/base-config.conf 
b/dataset-registry/src/test/resources/baseconfig.conf similarity index 100% rename from dataset-registry/src/test/resources/base-config.conf rename to dataset-registry/src/test/resources/baseconfig.conf diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala index 172dd181..09321143 100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala @@ -21,12 +21,15 @@ class BaseSpecWithDatasetRegistry extends BaseSpecWithPostgres { override def beforeAll(): Unit = { super.beforeAll() val postgresConnect = new PostgresConnect(postgresConfig) + createSystemSettings(postgresConnect) createSchema(postgresConnect) insertTestData(postgresConnect) postgresConnect.closeConnection() } override def afterAll(): Unit = { + val postgresConnect = new PostgresConnect(postgresConfig) + clearSystemSettings(postgresConnect) super.afterAll() } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala index d79ab327..efac0fbe 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala @@ -11,7 +11,7 @@ object ErrorConstants extends Enumeration { val NO_IMPLEMENTATION_FOUND = ErrorInternalValue("ERR_0001", "Unimplemented method") val NO_EXTRACTION_DATA_FOUND = ErrorInternalValue("ERR_EXT_1001", "Unable to extract the data from the extraction key") val EXTRACTED_DATA_NOT_A_LIST = ErrorInternalValue("ERR_EXT_1002", "The extracted data is not a list") - val EVENT_SIZE_EXCEEDED = ErrorInternalValue("ERR_EXT_1003", ("Event size has exceeded max configured size of " + SystemConfig.maxEventSize)) + val EVENT_SIZE_EXCEEDED = ErrorInternalValue("ERR_EXT_1003", "Event size has exceeded max configured size") val MISSING_DATASET_ID = ErrorInternalValue("ERR_EXT_1004", "Dataset Id is missing from the data") val MISSING_DATASET_CONFIGURATION = ErrorInternalValue("ERR_EXT_1005", "Dataset configuration is missing") val EVENT_MISSING = ErrorInternalValue("ERR_EXT_1006", "Event missing in the batch event") @@ -33,5 +33,8 @@ object ErrorConstants extends Enumeration { val ERR_UNKNOWN_TRANSFORM_EXCEPTION = ErrorInternalValue("ERR_TRANSFORM_1022", "Unable to evaluate the transformation expression function") val ERR_TRANSFORMATION_FAILED = ErrorInternalValue("ERR_TRANSFORM_1023", "Atleast one mandatory transformation has failed") val TRANSFORMATION_FIELD_MISSING = ErrorInternalValue("ERR_TRANSFORM_1024", "Transformation field is either missing or blank") + val SYSTEM_SETTING_INVALID_TYPE = ErrorInternalValue("ERR_SYSTEM_SETTING_1025", "Invalid value type for system setting") + val SYSTEM_SETTING_NOT_FOUND = ErrorInternalValue("ERR_SYSTEM_SETTING_1026", "System setting not found for requested key") + val SYSTEM_SETTING_DEFAULT_VALUE_NOT_FOUND = ErrorInternalValue("ERR_SYSTEM_SETTING_1027", "Default value not found for requested key") } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/Models.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/Models.scala index 2863d78f..87c91486 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/Models.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/Models.scala @@ 
-9,6 +9,8 @@ import org.sunbird.obsrv.core.model.ModuleID.ModuleID import org.sunbird.obsrv.core.model.PDataType.PDataType import org.sunbird.obsrv.core.model.Producer.Producer import org.sunbird.obsrv.core.model.StatusCode.StatusCode +import com.fasterxml.jackson.annotation.JsonProperty +import org.sunbird.obsrv.core.exception.ObsrvException object Models { @@ -27,6 +29,8 @@ object Models { case class EData(error: Option[ErrorLog] = None, pipeline_stats: Option[PipelineStats] = None, extra: Option[Map[String, AnyRef]] = None) case class SystemEvent(@JsonScalaEnumeration(classOf[EventIDType]) etype: EventID, ctx: ContextData, data: EData, ets: Long = System.currentTimeMillis()) + case class SystemSetting(key: String, value: String, category: String, valueType: String, label: Option[String]) + } class EventIDType extends TypeReference[EventID.type] @@ -76,4 +80,4 @@ object PDataType extends Enumeration { object Stats extends Enumeration { type Stats = Value val total_processing_time, latency_time, processing_time = Value -} \ No newline at end of file +} diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala index ee21152f..118e8c53 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala @@ -1,13 +1,117 @@ package org.sunbird.obsrv.core.model +import com.typesafe.config.{Config, ConfigFactory} +import org.sunbird.obsrv.core.exception.ObsrvException +import org.sunbird.obsrv.core.model.Models.SystemSetting +import org.sunbird.obsrv.core.util.{PostgresConnect, PostgresConnectionConfig} + +import java.io.File +import java.sql.ResultSet + object SystemConfig { - // TODO: Fetch the system config from postgres db - val defaultDedupPeriodInSeconds: Int = 604800 // 7 days - val maxEventSize: Long = 1048576 - val defaultDatasetId = "ALL" + private def getSystemConfig(key: String): Option[SystemSetting] = { + SystemConfigService.getSystemSetting(key) + } + + @throws[ObsrvException] + private def getConfigValueOpt(key: String, requiredType: String): Option[String] = { + + getSystemConfig(key).map(config => { + if (!config.valueType.equalsIgnoreCase(requiredType)) throw new ObsrvException(ErrorConstants.SYSTEM_SETTING_INVALID_TYPE) + config.value + }).orElse(None) + } + + private def getConfigValue(key: String, requiredType: String): String = { + + getSystemConfig(key).map(config => { + if (!config.valueType.equalsIgnoreCase(requiredType)) throw new ObsrvException(ErrorConstants.SYSTEM_SETTING_INVALID_TYPE) + config.value + }).orElse(throw new ObsrvException(ErrorConstants.SYSTEM_SETTING_NOT_FOUND)).get + } + + def getString(key: String): String = { + getConfigValue(key, requiredType = "string") + } + + def getString(key: String, defaultValue: String): String = { + getConfigValueOpt(key, requiredType = "string").getOrElse(defaultValue) + } + + def getInt(key: String): Int = { + getConfigValue(key, requiredType = "int").toInt + } + + def getInt(key: String, defaultValue: Int): Int = { + getConfigValueOpt(key, requiredType = "int").getOrElse(defaultValue.toString).toInt + } + + def getLong(key: String): Long = { + getConfigValue(key, requiredType = "long").toLong + } + + def getLong(key: String, defaultValue: Long): Long = { + getConfigValueOpt(key, requiredType = "long").getOrElse(defaultValue.toString).toLong + } + + def getBoolean(key: String): Boolean = { + getConfigValue(key, requiredType 
= "boolean").toBoolean + } + + def getBoolean(key: String, defaultValue: Boolean): Boolean = { + getConfigValueOpt(key, requiredType = "boolean").getOrElse(defaultValue.toString).toBoolean + } + +} + +object SystemConfigService { + + private val configFile = new File("/data/flink/conf/baseconfig.conf") + // $COVERAGE-OFF$ + val config: Config = if (configFile.exists()) { + println("Loading configuration file cluster baseconfig.conf...") + ConfigFactory.parseFile(configFile).resolve() + } else { + // $COVERAGE-ON$ + println("Loading configuration file baseconfig.conf inside the jar...") + ConfigFactory.load("baseconfig.conf").withFallback(ConfigFactory.systemEnvironment()) + } + private val postgresConfig = PostgresConnectionConfig( + config.getString("postgres.user"), + config.getString("postgres.password"), + config.getString("postgres.database"), + config.getString("postgres.host"), + config.getInt("postgres.port"), + config.getInt("postgres.maxConnections")) + + @throws[Exception] + def getAllSystemSettings: List[SystemSetting] = { + val postgresConnect = new PostgresConnect(postgresConfig) + val rs = postgresConnect.executeQuery("SELECT * FROM system_settings") + val result = Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { + parseSystemSetting(result) + }).toList + postgresConnect.closeConnection() + result + } + + @throws[Exception] + def getSystemSetting(key: String): Option[SystemSetting] = { + val postgresConnect = new PostgresConnect(postgresConfig) + val rs = postgresConnect.executeQuery(s"SELECT * FROM system_settings WHERE key = '$key'") + if (rs.next) { + Option(parseSystemSetting(rs)) + } else None + } - // secret key length should be 16, 24 or 32 characters - val encryptionSecretKey = "ckW5GFkTtMDNGEr5k67YpQMEBJNX3x2f" + private def parseSystemSetting(rs: ResultSet): SystemSetting = { + val key = rs.getString("key") + val value = rs.getString("value") + val category = rs.getString("category") + val valueType = rs.getString("valuetype") + val label = rs.getString("label") + SystemSetting(key, value, category, valueType, Option(label)) + } } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala index f82b430c..dc6eaa66 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala @@ -17,7 +17,7 @@ abstract class BaseJobConfig[T](val config: Config, val jobName: String) extends implicit val metricTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) - val defaultDatasetID: String = SystemConfig.defaultDatasetId + def defaultDatasetID: String = SystemConfig.getString("defaultDatasetId", "ALL") private val kafkaProducerBrokerServers: String = config.getString("kafka.producer.broker-servers") private val kafkaConsumerBrokerServers: String = config.getString("kafka.consumer.broker-servers") // Producer Properties diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseProcessFunction.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseProcessFunction.scala index 71641352..387842a5 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseProcessFunction.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseProcessFunction.scala @@ -56,7 +56,7 @@ case class Metrics(metrics: mutable.Map[String, ConcurrentHashMap[String, 
Atomic trait JobMetrics { def registerMetrics(datasets: List[String], metrics: List[String]): Metrics = { - val allDatasets = datasets ++ List(SystemConfig.defaultDatasetId) + val allDatasets = datasets ++ List(SystemConfig.getString("defaultDatasetId", "ALL")) val datasetMetricMap: Map[String, ConcurrentHashMap[String, AtomicLong]] = allDatasets.map(dataset => { val metricMap = new ConcurrentHashMap[String, AtomicLong]() metrics.foreach { metric => metricMap.put(metric, new AtomicLong(0L)) } @@ -156,10 +156,11 @@ abstract class BaseProcessFunction[T, R](config: BaseJobConfig[R]) extends Proce )) }) } - getRuntimeContext.getMetricGroup.addGroup(config.jobName).addGroup(SystemConfig.defaultDatasetId) + val defaultDatasetId = SystemConfig.getString("defaultDatasetId", "ALL") + getRuntimeContext.getMetricGroup.addGroup(config.jobName).addGroup(defaultDatasetId) .gauge[Long, ScalaGauge[Long]](config.eventFailedMetricsCount, ScalaGauge[Long](() => // $COVERAGE-OFF$ - metrics.getAndReset(SystemConfig.defaultDatasetId, config.eventFailedMetricsCount) + metrics.getAndReset(defaultDatasetId, config.eventFailedMetricsCount) // $COVERAGE-ON$ )) } @@ -190,10 +191,11 @@ abstract class WindowBaseProcessFunction[I, O, K](config: BaseJobConfig[O]) exte )) }) } - getRuntimeContext.getMetricGroup.addGroup(config.jobName).addGroup(SystemConfig.defaultDatasetId) + val defaultDatasetId = SystemConfig.getString("defaultDatasetId", "ALL") + getRuntimeContext.getMetricGroup.addGroup(config.jobName).addGroup(defaultDatasetId) .gauge[Long, ScalaGauge[Long]](config.eventFailedMetricsCount, ScalaGauge[Long](() => // $COVERAGE-OFF$ - metrics.getAndReset(SystemConfig.defaultDatasetId, config.eventFailedMetricsCount) + metrics.getAndReset(defaultDatasetId, config.eventFailedMetricsCount) // $COVERAGE-ON$ )) } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala b/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala index 19c56af1..67156256 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala @@ -17,7 +17,7 @@ object JSONUtil { .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES) .disable(SerializationFeature.FAIL_ON_EMPTY_BEANS) .enable(Feature.WRITE_BIGDECIMAL_AS_PLAIN) - .build() :: ClassTagExtensions + .build() mapper.setSerializationInclusion(Include.NON_ABSENT) @@ -73,4 +73,4 @@ object JSONUtil { } } -} \ No newline at end of file +} diff --git a/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala index 5d7673e0..bac2b0ae 100644 --- a/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala +++ b/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala @@ -14,7 +14,7 @@ import org.apache.flink.test.util.MiniClusterWithClientResource import org.apache.kafka.common.serialization.StringDeserializer import org.scalatest.Matchers import org.sunbird.obsrv.core.streaming._ -import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, Util} +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnectionConfig, Util, PostgresConnect} import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.atomic.AtomicLong @@ -23,14 +23,20 @@ import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.Future import scala.concurrent.duration._ -class BaseProcessFunctionTestSpec extends BaseSpec with 
Matchers { - +class BaseProcessFunctionTestSpec extends BaseSpecWithPostgres with Matchers { val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() .setNumberSlotsPerTaskManager(1) .setNumberTaskManagers(1) .build) val config: Config = ConfigFactory.load("base-test.conf") + val postgresConfig: PostgresConnectionConfig = PostgresConnectionConfig( + config.getString("postgres.user"), + config.getString("postgres.password"), + config.getString("postgres.database"), + config.getString("postgres.host"), + config.getInt("postgres.port"), + config.getInt("postgres.maxConnections")) val bsMapConfig = new BaseProcessTestMapConfig(config) val bsConfig = new BaseProcessTestConfig(config) val kafkaConnector = new FlinkKafkaConnector(bsConfig) @@ -51,7 +57,8 @@ class BaseProcessFunctionTestSpec extends BaseSpec with Matchers { override def beforeAll(): Unit = { super.beforeAll() - + val postgresConnect = new PostgresConnect(postgresConfig) + createSystemSettings(postgresConnect) EmbeddedKafka.start()(embeddedKafkaConfig) createTestTopics(bsConfig.testTopics) @@ -65,6 +72,8 @@ class BaseProcessFunctionTestSpec extends BaseSpec with Matchers { } override protected def afterAll(): Unit = { + val postgresConnect = new PostgresConnect(postgresConfig) + clearSystemSettings(postgresConnect) super.afterAll() flinkCluster.after() EmbeddedKafka.stop() diff --git a/framework/src/test/scala/org/sunbird/spec/BaseProcessTestConfig.scala b/framework/src/test/scala/org/sunbird/spec/BaseProcessTestConfig.scala index 96358d81..8cf3521f 100644 --- a/framework/src/test/scala/org/sunbird/spec/BaseProcessTestConfig.scala +++ b/framework/src/test/scala/org/sunbird/spec/BaseProcessTestConfig.scala @@ -72,4 +72,4 @@ class BaseProcessTestMapConfig(override val config: Config) extends BaseJobConfi override def successTag(): OutputTag[Map[String, AnyRef]] = mapOutputTag override def failedEventsOutputTag(): OutputTag[Map[String, AnyRef]] = mapOutputTag -} \ No newline at end of file +} diff --git a/framework/src/test/scala/org/sunbird/spec/BaseSpecWithPostgres.scala b/framework/src/test/scala/org/sunbird/spec/BaseSpecWithPostgres.scala index fd4985db..86f0afa6 100644 --- a/framework/src/test/scala/org/sunbird/spec/BaseSpecWithPostgres.scala +++ b/framework/src/test/scala/org/sunbird/spec/BaseSpecWithPostgres.scala @@ -2,6 +2,7 @@ package org.sunbird.spec import io.zonky.test.db.postgres.embedded.EmbeddedPostgres import org.scalatest.{BeforeAndAfterAll, FlatSpec} +import org.sunbird.obsrv.core.util.PostgresConnect import redis.embedded.RedisServer class BaseSpecWithPostgres extends FlatSpec with BeforeAndAfterAll { @@ -26,4 +27,16 @@ class BaseSpecWithPostgres extends FlatSpec with BeforeAndAfterAll { embeddedPostgres.close() } + def createSystemSettings(postgresConnect: PostgresConnect): Unit = { + postgresConnect.execute("CREATE TABLE IF NOT EXISTS system_settings ( key text NOT NULL, value text NOT NULL, category text NOT NULL DEFAULT 'SYSTEM'::text, valuetype text NOT NULL, created_date timestamp NOT NULL DEFAULT now(), updated_date timestamp, label text, PRIMARY KEY (\"key\"));") + postgresConnect.execute("insert into system_settings values('defaultDedupPeriodInSeconds', '604801', 'system', 'int', now(), now(), 'Dedup Period in Seconds');") + postgresConnect.execute("insert into system_settings values('maxEventSize', '1048676', 'system', 'long', now(), now(), 'Max Event Size');") + postgresConnect.execute("insert into system_settings values('defaultDatasetId', 'ALL', 'system', 
'string', now(), now(), 'Default Dataset Id');") + postgresConnect.execute("insert into system_settings values('encryptionSecretKey', 'ckW5GFkTtMDNGEr5k67YpQMEBJNX3x2f', 'system', 'string', now(), now(), 'Encryption Secret Key');") + postgresConnect.execute("insert into system_settings values('enable', 'true', 'system', 'boolean', now(), now(), 'Enable flag');") + } + + def clearSystemSettings(postgresConnect: PostgresConnect): Unit = { + postgresConnect.execute("DROP TABLE system_settings;") + } } diff --git a/framework/src/test/scala/org/sunbird/spec/SystemConfigSpec.scala b/framework/src/test/scala/org/sunbird/spec/SystemConfigSpec.scala new file mode 100644 index 00000000..1907e82d --- /dev/null +++ b/framework/src/test/scala/org/sunbird/spec/SystemConfigSpec.scala @@ -0,0 +1,114 @@ +package org.sunbird.spec + +import com.typesafe.config.{Config, ConfigFactory} +import org.scalamock.scalatest.MockFactory +import org.scalatest.Matchers +import org.sunbird.obsrv.core.model.{SystemConfig, SystemConfigService} +import org.sunbird.obsrv.core.util.{PostgresConnect, PostgresConnectionConfig} + +class SystemConfigSpec extends BaseSpecWithPostgres with Matchers with MockFactory { + val configFile: Config = ConfigFactory.load("base-test.conf") + val postgresConfig: PostgresConnectionConfig = PostgresConnectionConfig( + configFile.getString("postgres.user"), + configFile.getString("postgres.password"), + configFile.getString("postgres.database"), + configFile.getString("postgres.host"), + configFile.getInt("postgres.port"), + configFile.getInt("postgres.maxConnections")) + + override def beforeAll(): Unit = { + super.beforeAll() + val postgresConnect = new PostgresConnect(postgresConfig) + createSystemSettings(postgresConnect) + } + + override def afterAll(): Unit = { + val postgresConnect = new PostgresConnect(postgresConfig) + clearSystemSettings(postgresConnect) + super.afterAll() + } + + def createInvalidSystemSettings(postgresConnect: PostgresConnect): Unit = { + postgresConnect.execute("CREATE TABLE IF NOT EXISTS system_settings ( key text NOT NULL, value text NOT NULL, category text NOT NULL DEFAULT 'SYSTEM'::text, valuetype text NOT NULL, created_date timestamp NOT NULL DEFAULT now(), updated_date timestamp, label text, PRIMARY KEY (\"key\"));") + postgresConnect.execute("insert into system_settings values('defaultDedupPeriodInSeconds', '604801', 'system', 'double', now(), now(), 'Dedup Period in Seconds');") + postgresConnect.execute("insert into system_settings values('maxEventSize', '1048676', 'system', 'inv', now(), now(), 'Max Event Size');") + postgresConnect.execute("insert into system_settings values('defaultDatasetId', 'ALL', 'system', 'random', now(), now(), 'Default Dataset Id');") + postgresConnect.execute("insert into system_settings values('encryptionSecretKey', 'ckW5GFkTtMDNGEr5k67YpQMEBJNX3x2f', 'system', 'text', now(), now(), 'Encryption Secret Key');") + } + + "SystemConfig" should "populate configurations with values from database" in { + SystemConfig.getInt("defaultDedupPeriodInSeconds") should be(604801) + SystemConfig.getInt("defaultDedupPeriodInSeconds", 604800) should be(604801) + SystemConfig.getLong("maxEventSize", 100L) should be(1048676L) + SystemConfig.getString("defaultDatasetId", "NEW") should be("ALL") + SystemConfig.getString("encryptionSecretKey", "test") should be("ckW5GFkTtMDNGEr5k67YpQMEBJNX3x2f") + SystemConfig.getBoolean("enable", false) should be(true) + } + + "SystemConfig" should "return default values when keys are not present in db" in { + val 
postgresConnect = new PostgresConnect(postgresConfig) + postgresConnect.execute("TRUNCATE TABLE system_settings;") + SystemConfig.getInt("defaultDedupPeriodInSeconds", 604800) should be(604800) + SystemConfig.getLong("maxEventSize", 100L) should be(100L) + SystemConfig.getString("defaultDatasetId", "NEW") should be("NEW") + SystemConfig.getString("encryptionSecretKey", "test") should be("test") + SystemConfig.getBoolean("enable", false) should be(false) + } + + "SystemConfig" should "throw exception when valueType doesn't match" in { + val postgresConnect = new PostgresConnect(postgresConfig) + clearSystemSettings(postgresConnect) + createInvalidSystemSettings(postgresConnect) + val thrown = intercept[Exception] { + SystemConfig.getInt("defaultDedupPeriodInSeconds", 604800) + } + thrown.getMessage should be("Invalid value type for system setting") + } + + "SystemConfig" should "throw exception when valueType doesn't match without default value" in { + val postgresConnect = new PostgresConnect(postgresConfig) + clearSystemSettings(postgresConnect) + createInvalidSystemSettings(postgresConnect) + val thrown = intercept[Exception] { + SystemConfig.getInt("defaultDedupPeriodInSeconds") + } + thrown.getMessage should be("Invalid value type for system setting") + } + + "SystemConfigService" should "return all system settings" in { + val systemSettings = SystemConfigService.getAllSystemSettings + systemSettings.size should be(4) + systemSettings.map(f => { + f.key match { + case "defaultDedupPeriodInSeconds" => f.value should be("604801") + case "maxEventSize" => f.value should be("1048676") + case "defaultDatasetId" => f.value should be("ALL") + case "encryptionSecretKey" => f.value should be("ckW5GFkTtMDNGEr5k67YpQMEBJNX3x2f") + case "enable" => f.value should be("true") + } + }) + } + + "SystemConfig" should "throw exception when the key is not present in db" in { + var thrown = intercept[Exception] { + SystemConfig.getInt("invalidKey") + } + thrown.getMessage should be("System setting not found for requested key") + + thrown = intercept[Exception] { + SystemConfig.getString("invalidKey") + } + thrown.getMessage should be("System setting not found for requested key") + + thrown = intercept[Exception] { + SystemConfig.getBoolean("invalidKey") + } + thrown.getMessage should be("System setting not found for requested key") + + thrown = intercept[Exception] { + SystemConfig.getLong("invalidKey") + } + thrown.getMessage should be("System setting not found for requested key") + } + +} diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala index 0de7b0d7..ed50c8eb 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala +++ b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala @@ -41,9 +41,10 @@ class DynamicRouterFunction(config: DruidRouterConfig) extends BaseDatasetProces metrics.incCounter(dataset.id, config.routerTotalCount) val event = Util.getMutableMap(msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) + event.put(config.CONST_OBSRV_META, msg(config.CONST_OBSRV_META).asInstanceOf[Map[String, AnyRef]]) val tsKeyData = TimestampKeyParser.parseTimestampKey(dataset.datasetConfig, event) + event.put("indexTS", tsKeyData.value) if (tsKeyData.isValid) { - event.put(config.CONST_OBSRV_META, 
msg(config.CONST_OBSRV_META).asInstanceOf[Map[String, AnyRef]] ++ Map("indexTS" -> tsKeyData.value)) val routerConfig = dataset.routerConfig val topicEventMap = mutable.Map(Constants.TOPIC -> routerConfig.topic, Constants.MESSAGE -> event) ctx.output(config.routerOutputTag, topicEventMap) @@ -112,4 +113,4 @@ object TimestampKeyParser { } } -} \ No newline at end of file +} diff --git a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorConfig.scala b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorConfig.scala index 131e70ce..17c1bac9 100644 --- a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorConfig.scala +++ b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/task/ExtractorConfig.scala @@ -17,14 +17,14 @@ class ExtractorConfig(override val config: Config) extends BaseJobConfig[mutable implicit val stringTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) val dedupStore: Int = config.getInt("redis.database.extractor.duplication.store.id") - val cacheExpiryInSeconds: Int = SystemConfig.defaultDedupPeriodInSeconds + def cacheExpiryInSeconds: Int = SystemConfig.getInt("defaultDedupPeriodInSeconds", 604800) // Kafka Topics Configuration val kafkaInputTopic: String = config.getString("kafka.input.topic") val kafkaSuccessTopic: String = config.getString("kafka.output.raw.topic") val kafkaDuplicateTopic: String = config.getString("kafka.output.extractor.duplicate.topic") val kafkaBatchFailedTopic: String = config.getString("kafka.output.batch.failed.topic") - val eventMaxSize: Long = if(config.hasPath("kafka.event.max.size")) config.getInt("kafka.event.max.size") else SystemConfig.maxEventSize + def eventMaxSize: Long = if(config.hasPath("kafka.event.max.size")) config.getInt("kafka.event.max.size") else SystemConfig.getLong("maxEventSize", 1048576L) private val RAW_EVENTS_OUTPUT_TAG = "raw-events" private val FAILED_BATCH_EVENTS_OUTPUT_TAG = "failed-batch-events" diff --git a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala index 5bf3c431..6ada824b 100644 --- a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala +++ b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala @@ -102,7 +102,7 @@ class ExtractorStreamTestSpec extends BaseSpecWithDatasetRegistry { val config2: Config = ConfigFactory.load("test2.conf") val extractorConfig = new ExtractorConfig(config2) - extractorConfig.eventMaxSize should be (SystemConfig.maxEventSize) + extractorConfig.eventMaxSize should be (SystemConfig.getLong("maxEventSize", 1048576L)) } private def validateOutputEvents(outputEvents: List[String]) = { From e8c3f57439c2fbd824f96d1db3b3b71a15e389c9 Mon Sep 17 00:00:00 2001 From: Ravi Mula Date: Fri, 12 Jan 2024 11:15:20 +0530 Subject: [PATCH 19/37] Develop to 1.0.1-GA (#59) (#60) * testing new images * testing new images * testing new images * testing new images * testing new images * build new image with bug fixes * update dockerfile * update dockerfile * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * feat: update all failed, invalid and duplicate topic names * feat: update kafka topic names in test cases * #0 fix: add individual extraction * feat: update failed event * Update ErrorConstants.scala * feat: update failed event * Issue #0 fix: upgrade ubuntu packages for 
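[Editor's note, not part of the patch] As a companion to the SystemConfig tests and the ExtractorConfig changes above: the actual implementation is not shown in this hunk, but a DB-backed typed lookup consistent with those test expectations could look roughly like the sketch below. The system_settings columns (key, value, valuetype) and the two error messages are taken from the test fixtures; the JDBC wiring, the method signatures and the 'int' valuetype literal are assumptions for illustration only.

import java.sql.Connection

// Sketch only: mirrors the behaviour exercised by SystemConfigSpec, not the project's real SystemConfig.
object SystemSettingsLookupSketch {

  private case class Setting(value: String, valueType: String)

  private def read(conn: Connection, key: String): Option[Setting] = {
    val ps = conn.prepareStatement("SELECT value, valuetype FROM system_settings WHERE key = ?")
    ps.setString(1, key)
    val rs = ps.executeQuery()
    if (rs.next()) Some(Setting(rs.getString("value"), rs.getString("valuetype"))) else None
  }

  // Throws when the key is missing and no default is given, and also when the stored valuetype
  // does not match the requested type even if a default is supplied -- as the tests expect.
  def getInt(conn: Connection, key: String, default: Option[Int] = None): Int =
    read(conn, key) match {
      case Some(s) if s.valueType.equalsIgnoreCase("int") => s.value.toInt
      case Some(_) => throw new Exception("Invalid value type for system setting")
      case None => default.getOrElse(throw new Exception("System setting not found for requested key"))
    }
}
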
vulnerabilities * feat: add exception handling for json deserialization * Update BaseProcessFunction.scala * Update BaseProcessFunction.scala * feat: update batch failed event generation * Update ExtractionFunction.scala * feat: update invalid json exception handling * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 fix: remove cloning object * Issue #46 feat: update batch failed event * #0 fix: update github actions release condition * Issue #46 feat: add error reasons * Issue #46 feat: add exception stack trace * Issue #46 feat: add exception stack trace * Dataset enhancements (#38) * feat: add connector config and connector stats update functions * Issue #33 feat: add documentation for Dataset, Datasources, Data In and Query APIs * Update DatasetModels.scala * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * #0 fix: add individual extraction --------- * #0000 [SV] - Fallback to local redis instance if embedded redis is not starting * Update DatasetModels.scala * #0000 - refactor the denormalization logic 1. Do not fail the denormalization if the denorm key is missing 2. Add clear message whether the denorm is sucessful or failed or partially successful 3. Handle denorm for both text and number fields * #0000 - refactor: 1. Created a enum for dataset status and ignore events if the dataset is not in Live status 2. Created a outputtag for denorm failed stats 3. Parse event validation failed messages into a case class * #0000 - refactor: 1. Updated the DruidRouter job to publish data to router topics dynamically 2. Updated framework to created dynamicKafkaSink object * #0000 - mega refactoring: 1. Made calls to getAllDatasets and getAllDatasetSources to always query postgres 2. Created BaseDatasetProcessFunction for all flink functions to extend that would dynamically resolve dataset config, initialize metrics and handle common failures 3. Refactored serde - merged map and string serialization into one function and parameterized the function 4. Moved failed events sinking into a common base class 5. Master dataset processor can now do denormalization with another master dataset as well * #0000 - mega refactoring: 1. Made calls to getAllDatasets and getAllDatasetSources to always query postgres 2. Created BaseDatasetProcessFunction for all flink functions to extend that would dynamically resolve dataset config, initialize metrics and handle common failures 3. Refactored serde - merged map and string serialization into one function and parameterized the function 4. Moved failed events sinking into a common base class 5. Master dataset processor can now do denormalization with another master dataset as well * #0000 - mega refactoring: 1. Added validation to check if the event has a timestamp key and it is not blank nor invalid 2. Added timezone handling to store the data in druid in the TZ specified by the dataset * #0000 - minor refactoring: Updated DatasetRegistry.getDatasetSourceConfig to getAllDatasetSourceConfig * #0000 - mega refactoring: Refactored logs, error messages and metrics * #0000 - mega refactoring: Fix unit tests * #0000 - refactoring: 1. Introduced transformation mode to enable lenient transformations 2. 
Proper exception handling for transformer job * #0000 - refactoring: Fix test cases and code * #0000 - refactoring: upgrade embedded redis to work with macos sonoma m2 * #0000 - refactoring: Denormalizer test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Router test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Validator test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Framework test cases and bug fixes * #0000 - refactoring: kafka connector test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: improve code coverage and fix bugs * #0000 - refactoring: improve code coverage and fix bugs --- Now the code coverage is 100% * #0000 - refactoring: organize imports * #0000 - refactoring: 1. transformer test cases and bug fixes - code coverage is 100% * #0000 - refactoring: test cases and bug fixes --------- * #000:feat: Removed the provided scope of the kafka-client in the framework (#40) * #0000 - feat: Add dataset-type to system events (#41) * #0000 - feat: Add dataset-type to system events * #0000 - feat: Modify tests for dataset-type in system events * #0000 - feat: Remove unused getDatasetType function * #0000 - feat: Remove unused pom test dependencies * #0000 - feat: Remove unused pom test dependencies * #67 feat: query system configurations from meta store * #67 fix: Refactor system configuration retrieval and update dynamic router function * #67 fix: update system config according to review * #67 fix: update test cases for system config * #67 fix: update default values in test cases * #67 fix: add get all system settings method and update test cases * #67 fix: add test case for covering exception case * #67 fix: fix data types in test cases * #67 fix: Refactor event indexing in DynamicRouterFunction * Issue #67 refactor: SystemConfig read from DB implementation * #226 fix: update test cases according to the refactor * Dataset Registry Update (#57) * Issue #0000: feat: updateConnectorStats method includes last run timestamp * Issue #0000: fix: updateConnectorStats sql query updated * Issue #0000: fix: updateConnectorStats sql query updated --------- Co-authored-by: Manjunath Davanam Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: shiva-rakshith Co-authored-by: Sowmya N Dixit Co-authored-by: Santhosh Co-authored-by: Aniket Sakinala Co-authored-by: Anand Parthasarathy Co-authored-by: Shreyas Bhaktharam <121869503+shreyasb22@users.noreply.github.com> --- .../org/sunbird/obsrv/service/DatasetRegistryService.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala index 89efec4c..e783b871 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala @@ -118,9 +118,10 @@ object DatasetRegistryService { } def updateConnectorStats(id: String, lastFetchTimestamp: Timestamp, records: Long): Int = { - val query = s"UPDATE dataset_source_config SET connector_stats = jsonb_set(jsonb_set(coalesce(connector_stats, '{}')::jsonb, '{records}'," + - s" ((COALESCE(connector_stats->>'records', '0')::int + $records)::text)::jsonb, true), '{last_fetch_timestamp}', " + - s"to_jsonb('$lastFetchTimestamp'::timestamp), 
true) WHERE id = '$id'" + val query = s"UPDATE dataset_source_config SET connector_stats = coalesce(connector_stats, '{}')::jsonb || " + + s"jsonb_build_object('records', COALESCE(connector_stats->>'records', '0')::int + '$records'::int) || " + + s"jsonb_build_object('last_fetch_timestamp', '${lastFetchTimestamp}'::timestamp) || " + + s"jsonb_build_object('last_run_timestamp', '${new Timestamp(System.currentTimeMillis())}'::timestamp) WHERE id = '$id';" updateRegistry(query) } From d2a2deac86af44e88169354cd79780541c0446d9 Mon Sep 17 00:00:00 2001 From: Ravi Mula Date: Mon, 29 Jan 2024 14:53:48 +0530 Subject: [PATCH 20/37] Develop to 1.0.2-GA (#65) (#66) * testing new images * testing new images * testing new images * testing new images * testing new images * build new image with bug fixes * update dockerfile * update dockerfile * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * feat: update all failed, invalid and duplicate topic names * feat: update kafka topic names in test cases * #0 fix: add individual extraction * feat: update failed event * Update ErrorConstants.scala * feat: update failed event * Issue #0 fix: upgrade ubuntu packages for vulnerabilities * feat: add exception handling for json deserialization * Update BaseProcessFunction.scala * Update BaseProcessFunction.scala * feat: update batch failed event generation * Update ExtractionFunction.scala * feat: update invalid json exception handling * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 feat: update batch failed event * Issue #46 fix: remove cloning object * Issue #46 feat: update batch failed event * #0 fix: update github actions release condition * Issue #46 feat: add error reasons * Issue #46 feat: add exception stack trace * Issue #46 feat: add exception stack trace * Dataset enhancements (#38) * feat: add connector config and connector stats update functions * Issue #33 feat: add documentation for Dataset, Datasources, Data In and Query APIs * Update DatasetModels.scala * #0 fix: upgrade packages * #0 feat: add flink dockerfiles * #0 fix: add individual extraction --------- * #0000 [SV] - Fallback to local redis instance if embedded redis is not starting * Update DatasetModels.scala * #0000 - refactor the denormalization logic 1. Do not fail the denormalization if the denorm key is missing 2. Add clear message whether the denorm is sucessful or failed or partially successful 3. Handle denorm for both text and number fields * #0000 - refactor: 1. Created a enum for dataset status and ignore events if the dataset is not in Live status 2. Created a outputtag for denorm failed stats 3. Parse event validation failed messages into a case class * #0000 - refactor: 1. Updated the DruidRouter job to publish data to router topics dynamically 2. Updated framework to created dynamicKafkaSink object * #0000 - mega refactoring: 1. Made calls to getAllDatasets and getAllDatasetSources to always query postgres 2. Created BaseDatasetProcessFunction for all flink functions to extend that would dynamically resolve dataset config, initialize metrics and handle common failures 3. Refactored serde - merged map and string serialization into one function and parameterized the function 4. Moved failed events sinking into a common base class 5. Master dataset processor can now do denormalization with another master dataset as well * #0000 - mega refactoring: 1. 
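[Editor's note, not part of the patch] A side note on the updateConnectorStats query above: it is assembled with string interpolation. Purely as an illustration (the project actually routes SQL through its PostgresConnect helper, and the interpolated form is what this patch ships), the same jsonb merge can be expressed with bound JDBC parameters, which sidesteps manual quoting of the timestamp and id values:

import java.sql.{Connection, Timestamp}

// Sketch only: same jsonb merge as updateConnectorStats, with bound parameters instead of interpolation.
// Table and column names come from the patch; the Connection handling here is illustrative.
object ConnectorStatsUpdateSketch {
  def updateConnectorStats(conn: Connection, id: String, lastFetchTimestamp: Timestamp, records: Long): Int = {
    val sql =
      """UPDATE dataset_source_config
        |SET connector_stats = coalesce(connector_stats, '{}')::jsonb
        |  || jsonb_build_object('records', COALESCE(connector_stats->>'records', '0')::bigint + ?)
        |  || jsonb_build_object('last_fetch_timestamp', ?::timestamp)
        |  || jsonb_build_object('last_run_timestamp', now())
        |WHERE id = ?""".stripMargin
    val ps = conn.prepareStatement(sql)
    ps.setLong(1, records)
    ps.setTimestamp(2, lastFetchTimestamp)
    ps.setString(3, id)
    ps.executeUpdate()
  }
}
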
Made calls to getAllDatasets and getAllDatasetSources to always query postgres 2. Created BaseDatasetProcessFunction for all flink functions to extend that would dynamically resolve dataset config, initialize metrics and handle common failures 3. Refactored serde - merged map and string serialization into one function and parameterized the function 4. Moved failed events sinking into a common base class 5. Master dataset processor can now do denormalization with another master dataset as well * #0000 - mega refactoring: 1. Added validation to check if the event has a timestamp key and it is not blank nor invalid 2. Added timezone handling to store the data in druid in the TZ specified by the dataset * #0000 - minor refactoring: Updated DatasetRegistry.getDatasetSourceConfig to getAllDatasetSourceConfig * #0000 - mega refactoring: Refactored logs, error messages and metrics * #0000 - mega refactoring: Fix unit tests * #0000 - refactoring: 1. Introduced transformation mode to enable lenient transformations 2. Proper exception handling for transformer job * #0000 - refactoring: Fix test cases and code * #0000 - refactoring: upgrade embedded redis to work with macos sonoma m2 * #0000 - refactoring: Denormalizer test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Router test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Validator test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: Framework test cases and bug fixes * #0000 - refactoring: kafka connector test cases and bug fixes. Code coverage is 100% now * #0000 - refactoring: improve code coverage and fix bugs * #0000 - refactoring: improve code coverage and fix bugs --- Now the code coverage is 100% * #0000 - refactoring: organize imports * #0000 - refactoring: 1. 
transformer test cases and bug fixes - code coverage is 100% * #0000 - refactoring: test cases and bug fixes --------- * #000:feat: Removed the provided scope of the kafka-client in the framework (#40) * #0000 - feat: Add dataset-type to system events (#41) * #0000 - feat: Add dataset-type to system events * #0000 - feat: Modify tests for dataset-type in system events * #0000 - feat: Remove unused getDatasetType function * #0000 - feat: Remove unused pom test dependencies * #0000 - feat: Remove unused pom test dependencies * #67 feat: query system configurations from meta store * #67 fix: Refactor system configuration retrieval and update dynamic router function * #67 fix: update system config according to review * #67 fix: update test cases for system config * #67 fix: update default values in test cases * #67 fix: add get all system settings method and update test cases * #67 fix: add test case for covering exception case * #67 fix: fix data types in test cases * #67 fix: Refactor event indexing in DynamicRouterFunction * Issue #67 refactor: SystemConfig read from DB implementation * #226 fix: update test cases according to the refactor * Dataset Registry Update (#57) * Issue #0000: feat: updateConnectorStats method includes last run timestamp * Issue #0000: fix: updateConnectorStats sql query updated * Issue #0000: fix: updateConnectorStats sql query updated * #0000 - fix: Fix Postgres connection issue with defaultDatasetID (#64) * Metrics implementation for MasterDataIndexerJob (#55) * Issue #50 fix: Kafka Metrics implementation for MasterDataIndexerJob * Issue #50 fix: Changed 'ets' to UTC * Issue #50 feat: added log statements * Issue #50 fix: FIxed issue related to update query * Issue #50 fix: Code refactoring * Issue #50 fix: updated implementation of 'createDataFile' method * Issue #50 fix: code refactorig * Issue #50 test: Test cases for MasterDataIndexer * Issue #50 test: test cases implementation * Issue #50 test: Test case implementation for data-products * Issue #50 test: Test cases * Issue #50 test: test cases * Issue #50 test: test cases for data-products * Issue #50-fix: fixed jackson-databind issue * Isuue-#50-fix: code structure modifications * Issue #50-fix: code refactoring * Issue #50-fix: code refactoing * Issue-#50-Fix: test case fixes * Issue #50-fix: code formatting and code fixes * feat #50 - refactor the implementation * Issue-#50-fix: test cases fix * modified README file * revert readme file changes * revert dataset-registry * Issue-#50-fix: test cases fix * Issue-#50-fix: adding missing tests * Issue-#50-fix: refatoring code * Issue-#50-fix: code fixes and code formatting * fix #50: modified class declaration * fix #50: code refactor * fix #50: code refactor * fix #50: test cases fixes --------- * Remove kafka connector as it is moved to a independent repository --------- Signed-off-by: SurabhiAngadi Co-authored-by: Manjunath Davanam Co-authored-by: ManojKrishnaChintaluri Co-authored-by: Praveen <66662436+pveleneni@users.noreply.github.com> Co-authored-by: shiva-rakshith Co-authored-by: Sowmya N Dixit Co-authored-by: Santhosh Co-authored-by: Aniket Sakinala Co-authored-by: Anand Parthasarathy Co-authored-by: Shreyas Bhaktharam <121869503+shreyasb22@users.noreply.github.com> Co-authored-by: SurabhiAngadi <138881390+SurabhiAngadi@users.noreply.github.com> --- data-products/pom.xml | 322 +++++++++++++++--- .../src/main/resources/application.conf | 7 - data-products/src/main/resources/log4j2.xml | 13 + .../main/resources/masterdata-indexer.conf | 26 ++ 
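[Editor's note, not part of the patch] Jumping ahead of the file listing for a moment: the metric event that the new MasterDataIndexer job publishes to the metrics Kafka topic (metrics.topicName, e.g. "local.spark.stats") is assembled from the case classes added in JobMetric.scala and BaseMetricHelper.scala below. A rough, hand-filled example of that envelope; the dataset id and the counts are illustrative values, not output from the job:

import org.sunbird.obsrv.dataproducts.model._

// Illustrative values only; BaseMetricHelper.generate() builds this same envelope from real metrics
// and serializes it to the configured Kafka topic via KafkaMessageProducer.
object MetricEnvelopeSketch {
  def sample(datasetId: String): JobMetric = {
    val edata = Edata(
      metric = Map("success_dataset_count" -> 1, "total_dataset_count" -> 1,
        "total_events_processed" -> 5L, "total_time_taken" -> 1200L),
      labels = List(MetricLabel("job", "MasterDataIndexer"),
        MetricLabel("datasetId", datasetId), MetricLabel("cloud", "local")))
    JobMetric(
      ets = System.currentTimeMillis(),
      actor = Actor(id = "MasterDataProcessorIndexerJob", `type` = "SYSTEM"),
      context = Context(env = "local", pdata = Pdata(id = "DataProducts", pid = "MasterDataProcessorIndexerJob", ver = "1.0.0")),
      `object` = MetricObject(id = datasetId, `type` = "Dataset", ver = "1.0.0"),
      edata = edata)
  }
}
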
.../MasterDataProcessorIndexer.scala | 196 ++++++----- .../helper/BaseMetricHelper.scala | 41 +++ .../helper/KafkaMessageProducer.scala | 34 ++ .../obsrv/dataproducts/model/JobMetric.scala | 28 ++ .../obsrv/dataproducts/util/CommonUtil.scala | 45 +++ .../obsrv/dataproducts/util/HttpUtil.scala | 17 + .../obsrv/dataproducts/util/StorageUtil.scala | 51 +++ .../src/test/resources/application.conf | 7 - .../resources/masterdata-indexer-test.conf | 35 ++ .../org/sunbird/fixture/EventFixture.scala | 11 + .../sunbird/spec/MasterDataIndexerSpec.scala | 242 +++++++++++++ .../service/DatasetRegistryService.scala | 11 +- .../obsrv/core/model/ErrorConstants.scala | 10 +- .../obsrv/core/model/SystemConfig.scala | 25 +- .../obsrv/core/streaming/BaseJobConfig.scala | 3 +- pipeline/kafka-connector/pom.xml | 263 -------------- .../src/main/resources/kafka-connector.conf | 16 - .../connector/task/KafkaConnectorConfig.scala | 25 -- .../task/KafkaConnectorStreamTask.scala | 71 ---- .../src/test/resources/test.conf | 14 - .../KafkaConnectorStreamTestSpec.scala | 126 ------- 25 files changed, 939 insertions(+), 700 deletions(-) delete mode 100644 data-products/src/main/resources/application.conf create mode 100644 data-products/src/main/resources/log4j2.xml create mode 100644 data-products/src/main/resources/masterdata-indexer.conf create mode 100644 data-products/src/main/scala/org/sunbird/obsrv/dataproducts/helper/BaseMetricHelper.scala create mode 100644 data-products/src/main/scala/org/sunbird/obsrv/dataproducts/helper/KafkaMessageProducer.scala create mode 100644 data-products/src/main/scala/org/sunbird/obsrv/dataproducts/model/JobMetric.scala create mode 100644 data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/CommonUtil.scala create mode 100644 data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/HttpUtil.scala create mode 100644 data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/StorageUtil.scala delete mode 100644 data-products/src/test/resources/application.conf create mode 100644 data-products/src/test/resources/masterdata-indexer-test.conf create mode 100644 data-products/src/test/scala/org/sunbird/fixture/EventFixture.scala create mode 100644 data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala delete mode 100644 pipeline/kafka-connector/pom.xml delete mode 100644 pipeline/kafka-connector/src/main/resources/kafka-connector.conf delete mode 100644 pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorConfig.scala delete mode 100644 pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorStreamTask.scala delete mode 100644 pipeline/kafka-connector/src/test/resources/test.conf delete mode 100644 pipeline/kafka-connector/src/test/scala/org/sunbird/obsrv/connector/KafkaConnectorStreamTestSpec.scala diff --git a/data-products/pom.xml b/data-products/pom.xml index 51090a71..977030d0 100644 --- a/data-products/pom.xml +++ b/data-products/pom.xml @@ -12,7 +12,9 @@ 3.1.0 2.12.11 2.12 - 1.1.1 + 1.4.0 + 11 + 2.14.1 @@ -30,6 +32,10 @@ org.apache.xbean xbean-asm6-shaded + + org.apache.zookeeper + zookeeper + @@ -58,16 +64,31 @@ commons-text 1.6 - com.fasterxml.jackson.core jackson-annotations - 2.10.0 + 2.15.2 com.fasterxml.jackson.core jackson-core - 2.10.0 + 2.15.2 + + + com.fasterxml.jackson.module + jackson-module-scala_${scala.maj.version} + 2.15.2 + + + com.fasterxml.jackson.core + jackson-databind + + + + + com.fasterxml.jackson.core + jackson-databind + 2.15.2 org.sunbird.obsrv @@ 
-83,6 +104,32 @@ org.sunbird.obsrv framework 1.0.0 + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.module + jackson-module-scala_${scala.maj.version} + + + com.fasterxml.jackson.core + jackson-databind + + + io.github.embeddedkafka + embedded-kafka_${scala.maj.version} + + + org.apache.flink + flink-runtime + + org.scalatest @@ -111,29 +158,6 @@ - - org.sunbird - cloud-store-sdk_${scala.maj.version} - 1.4.6 - - - com.microsoft.azure - azure-storage - - - com.fasterxml.jackson.core - jackson-core - - - org.apache.httpcomponents - httpclient - - - com.google.guava - guava - - - com.microsoft.azure azure-storage @@ -155,6 +179,10 @@ org.apache.avro avro + + org.apache.zookeeper + zookeeper + @@ -163,43 +191,198 @@ 2.7.3 provided + + software.amazon.awssdk + s3 + 2.17.0 + + + software.amazon.awssdk + auth + 2.17.0 + + + software.amazon.awssdk + sts + 2.17.0 + + + org.apache.logging.log4j + log4j-slf4j-impl + ${log4j.version} + + + org.apache.logging.log4j + log4j-api + ${log4j.version} + + + org.apache.logging.log4j + log4j-core + ${log4j.version} + + + + org.mockito + mockito-scala-scalatest_${scala.maj.version} + 1.17.29 + test + + + org.apache.logging.log4j + log4j-slf4j-impl + ${log4j.version} + + + org.apache.logging.log4j + log4j-api + ${log4j.version} + + + org.apache.logging.log4j + log4j-core + ${log4j.version} + + + it.ozimov + embedded-redis + 0.7.1 + test + + + org.sunbird.obsrv + framework + 1.0.0 + test-jar + test + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + + + io.zonky.test + embedded-postgres + 2.0.3 + test + + + junit + junit + 4.12 + test + + + com.konghq + unirest-mocks + 3.14.1 + test + + + io.github.embeddedkafka + embedded-kafka_2.12 + 2.8.1 + test + + + org.apache.zookeeper + zookeeper + + + + + org.apache.zookeeper + zookeeper + 3.5.9 + + + com.squareup.okhttp3 + mockwebserver + 4.4.0 + test + + + com.google.code.gson + gson + 2.8.9 + - - - etl-jobs-1.0 src/main/scala src/test/scala - - net.alchim31.maven - scala-maven-plugin - 3.2.2 + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 11 + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.4 + package - compile - testCompile + shade - - -dependencyfile - ${project.build.directory}/.scala_dependencies - -nobootcp - + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + org.sunbird.obsrv.dataproducts.job.MasterDataProcessorIndexer + + + META-INF/services/org.jclouds.apis.ApiMetadata + + + META-INF/services/org.jclouds.providers.ProviderMetadata + + + + + + maven-surefire-plugin + 2.20 + + true + + + org.scalatest scalatest-maven-plugin - 2.0.0 + 1.0 + + ${project.build.directory}/surefire-reports + . 
+ data-products-testsuite.txt + test - test test @@ -208,19 +391,53 @@ - maven-assembly-plugin - 2.3 + org.apache.maven.plugins + maven-jar-plugin + 3.2.0 + + + + test-jar + + + + + + + org.scoverage + scoverage-maven-plugin + ${scoverage.plugin.version} - - src/main/assembly/src.xml - + ${scala.version} + true + true + + + + + net.alchim31.maven + scala-maven-plugin + 4.4.0 + + ${java.target.runtime} + ${java.target.runtime} + ${scala.version} + false - make-assembly - package + scala-compile-first + process-resources - single + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile @@ -238,5 +455,4 @@ - - + \ No newline at end of file diff --git a/data-products/src/main/resources/application.conf b/data-products/src/main/resources/application.conf deleted file mode 100644 index 6b0a1c94..00000000 --- a/data-products/src/main/resources/application.conf +++ /dev/null @@ -1,7 +0,0 @@ -# do not delete this file -redis.host="localhost" -redis.port="6379" -cloudStorage.container="obsrv-data" -cloudStorage.provider="aws" -druid.indexer.url=http://localhost:8888/druid/indexer/v1/task -druid.datasource.delete.url=http://localhost:8888/druid/coordinator/v1/datasources/ \ No newline at end of file diff --git a/data-products/src/main/resources/log4j2.xml b/data-products/src/main/resources/log4j2.xml new file mode 100644 index 00000000..a395a5a9 --- /dev/null +++ b/data-products/src/main/resources/log4j2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/data-products/src/main/resources/masterdata-indexer.conf b/data-products/src/main/resources/masterdata-indexer.conf new file mode 100644 index 00000000..5e2ee921 --- /dev/null +++ b/data-products/src/main/resources/masterdata-indexer.conf @@ -0,0 +1,26 @@ +# do not delete this file +env=local + +redis.host="localhost" +redis.port="6379" +redis.scan.count=1000 +redis.max.pipeline.size=1000 +cloud.storage.container="://"container_name"/" +cloud.storage.provider="" +cloud.storage.accountName="obsrv" # Is required when azure is provider. 
Will only be used when azure is the provider +druid.indexer.url="http://localhost:8888/druid/indexer/v1/task" +druid.datasource.delete.url="http://localhost:8888/druid/coordinator/v1/datasources/" + +metrics { + topicName = ""${env}".spark.stats" +} + +kafka { + bootstrap.servers = "localhost:9092" +} + +#inputSourceSpec +source.spec="{\"spec\":{\"ioConfig\":{\"type\":\"index_parallel\",\"inputSource\":{\"type\":\"local\",\"baseDir\":\"FILE_PATH\",\"filter\":\"**json.gz\"}}}}" + +#deltaIngestionSpec +delta.ingestion.spec= "{\"type\":\"index_parallel\",\"spec\":{\"dataSchema\":{\"dataSource\":\"DATASOURCE_REF\"},\"ioConfig\":{\"type\":\"index_parallel\"},\"tuningConfig\":{\"type\":\"index_parallel\",\"maxRowsInMemory\":500000,\"forceExtendableShardSpecs\":false,\"logParseExceptions\":true}}}" \ No newline at end of file diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala index e1ecfdec..2c41181c 100644 --- a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala @@ -2,129 +2,127 @@ package org.sunbird.obsrv.dataproducts import com.redislabs.provider.redis._ import com.typesafe.config.{Config, ConfigFactory} -import kong.unirest.Unirest -import org.apache.spark.{SparkConf, SparkContext} -import org.joda.time.format.{DateTimeFormat, DateTimeFormatter} +import org.apache.logging.log4j.{LogManager, Logger} +import org.apache.spark.sql.SparkSession import org.joda.time.{DateTime, DateTimeZone} import org.json4s.native.JsonMethods._ -import org.sunbird.cloud.storage.factory.{StorageConfig, StorageServiceFactory} -import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.core.exception.ObsrvException +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.dataproducts.helper.BaseMetricHelper +import org.sunbird.obsrv.dataproducts.model.{Edata, MetricLabel} +import org.sunbird.obsrv.dataproducts.util.{CommonUtil, HttpUtil, StorageUtil} import org.sunbird.obsrv.model.DatasetModels.{DataSource, Dataset} +import org.sunbird.obsrv.model.DatasetStatus import org.sunbird.obsrv.registry.DatasetRegistry -import scala.collection.mutable - object MasterDataProcessorIndexer { - - private val config: Config = ConfigFactory.load("application.conf").withFallback(ConfigFactory.systemEnvironment()) - private val dayPeriodFormat: DateTimeFormatter = DateTimeFormat.forPattern("yyyyMMdd").withZoneUTC() - - private case class Paths(datasourceRef: String, objectKey: String, outputFilePath: String, timestamp: Long) - - def main(args: Array[String]): Unit = { - - val datasets = DatasetRegistry.getAllDatasets("master-dataset") - val indexedDatasets = datasets.filter(dataset => dataset.datasetConfig.indexData.nonEmpty && dataset.datasetConfig.indexData.get) - indexedDatasets.foreach(dataset => { - indexDataset(dataset) - }) - } - - private def indexDataset(dataset: Dataset): Unit = { - val datasources = DatasetRegistry.getDatasources(dataset.id) - if(datasources.isEmpty || datasources.get.size > 1) { - return - } - val datasource = datasources.get.head - val paths = getPaths(datasource) - createDataFile(dataset, paths.timestamp, paths.outputFilePath, paths.objectKey) - val ingestionSpec = updateIngestionSpec(datasource, paths.datasourceRef, paths.objectKey) - submitIngestionTask(ingestionSpec) - 
updateDataSourceRef(datasource, paths.datasourceRef) - if(!datasource.datasource.equals(datasource.datasourceRef)) { - deleteDataSource(datasource.datasourceRef) + private final val logger: Logger = LogManager.getLogger(MasterDataProcessorIndexer.getClass) + + @throws[ObsrvException] + def processDataset(config: Config, dataset: Dataset, spark: SparkSession): Map[String, Long] = { + val result = CommonUtil.time { + val datasource = fetchDatasource(dataset) + val paths = StorageUtil.getPaths(datasource, config) + val eventsCount: Long = createDataFile(dataset, paths.outputFilePath, spark, config) + val ingestionSpec: String = updateIngestionSpec(datasource, paths.datasourceRef, paths.ingestionPath, config) + if (eventsCount > 0L) { + submitIngestionTask(dataset.id, ingestionSpec, config) + } + DatasetRegistry.updateDatasourceRef(datasource, paths.datasourceRef) + if (!datasource.datasourceRef.equals(paths.datasourceRef)) { + deleteDataSource(dataset.id, datasource.datasourceRef, config) + } + Map("success_dataset_count" -> 1, "total_dataset_count" -> 1, "total_events_processed" -> eventsCount) } + val metricMap = result._2 ++ Map("total_time_taken" -> result._1) + metricMap.asInstanceOf[Map[String, Long]] } - private def getPaths(datasource: DataSource): Paths = { - - val dt = new DateTime(DateTimeZone.UTC).withTimeAtStartOfDay() - val timestamp = dt.getMillis - val date = dayPeriodFormat.print(dt) - val objectKey = "masterdata-indexer/" + datasource.datasetId + "/" + date + ".json" - val datasourceRef = datasource.datasource + '-' + date - val outputFilePath = "masterdata-indexer/" + datasource.datasetId + "/" + date - Paths(datasourceRef, objectKey, outputFilePath, timestamp) - } - private def updateIngestionSpec(datasource: DataSource, datasourceRef: String, objectKey: String): String = { - - val deltaIngestionSpec = s"""{"type":"index_parallel","spec":{"dataSchema":{"dataSource":"$datasourceRef"},"ioConfig":{"type":"index_parallel"},"tuningConfig":{"type":"index_parallel","targetPartitionSize":5000000,"maxRowsInMemory":25000,"forceExtendableShardSpecs":false,"logParseExceptions":true}}}""" - val provider = getProvider() - val container = config.getString("cloudStorage.container") - val inputSourceSpec = s"""{"spec":{"ioConfig":{"inputSource":{"type":"$provider","objectGlob":"**.json","objects":[{"bucket":"$container","path":"$objectKey"}]}}}}""" - + // This method is used to update the ingestion spec based on datasource and storage path + private def updateIngestionSpec(datasource: DataSource, datasourceRef: String, filePath: String, config: Config): String = { + val deltaIngestionSpec: String = config.getString("delta.ingestion.spec").replace("DATASOURCE_REF", datasourceRef) + val inputSourceSpec: String = StorageUtil.getInputSourceSpec(filePath, config) val deltaJson = parse(deltaIngestionSpec) val inputSourceJson = parse(inputSourceSpec) val ingestionSpec = parse(datasource.ingestionSpec) - val modIngestionSpec = ingestionSpec merge deltaJson merge inputSourceJson compact(render(modIngestionSpec)) } - @throws[Exception] - private def getProvider(): String = { - config.getString("cloudStorage.provider") match { - case "aws" => "s3" - case "azure" => "azure" - case "gcloud" => "google" - case "cephs3" => "s3" // TODO: Have to check Druid compatibility - case "oci" => "s3" // TODO: Have to check Druid compatibility - case _ => throw new Exception("Unsupported provider") - } + // This method is used to submit the ingestion task to Druid for indexing data + def 
submitIngestionTask(datasetId: String, ingestionSpec: String, config: Config): Unit = { + logger.debug(s"submitIngestionTask() | datasetId=$datasetId") + val response = HttpUtil.post(config.getString("druid.indexer.url"), ingestionSpec) + response.ifFailure(throw new ObsrvException(ErrorConstants.ERR_SUBMIT_INGESTION_FAILED)) } - private def submitIngestionTask(ingestionSpec: String) = { - // TODO: Handle success and failure responses properly - val response = Unirest.post(config.getString("druid.indexer.url")) - .header("Content-Type", "application/json") - .body(ingestionSpec).asJson() - response.ifFailure(_ => throw new Exception("Exception while submitting ingestion task")) + // This method is used for deleting a datasource from druid + private def deleteDataSource(datasetID: String, datasourceRef: String, config: Config): Unit = { + logger.debug(s"deleteDataSource() | datasetId=$datasetID") + val response = HttpUtil.delete(config.getString("druid.datasource.delete.url") + datasourceRef) + response.ifFailure(throw new ObsrvException(ErrorConstants.ERR_DELETE_DATASOURCE_FAILED)) } - private def updateDataSourceRef(datasource: DataSource, datasourceRef: String): Unit = { - DatasetRegistry.updateDatasourceRef(datasource, datasourceRef) + // This method will fetch the data from redis based on dataset config + // then write the data as a compressed JSON to the respective cloud provider + private def createDataFile(dataset: Dataset, outputFilePath: String, spark: SparkSession, config: Config): Long = { + logger.info(s"createDataFile() | START | dataset=${dataset.id} ") + import spark.implicits._ + val readWriteConf = ReadWriteConfig(scanCount = config.getInt("redis.scan.count"), maxPipelineSize = config.getInt("redis.max.pipeline.size")) + val redisConfig = new RedisConfig(initialHost = RedisEndpoint(host = dataset.datasetConfig.redisDBHost.get, port = dataset.datasetConfig.redisDBPort.get, dbNum = dataset.datasetConfig.redisDB.get)) + val ts: Long = new DateTime(DateTimeZone.UTC).withTimeAtStartOfDay().getMillis + val rdd = spark.sparkContext.fromRedisKV("*")(redisConfig = redisConfig, readWriteConfig = readWriteConf).map( + f => CommonUtil.processEvent(f._2, ts) + ) + val noOfRecords: Long = rdd.count() + if (noOfRecords > 0) { + rdd.toDF().write.mode("overwrite").option("compression", "gzip").json(outputFilePath) + } + logger.info(s"createDataFile() | END | dataset=${dataset.id} | noOfRecords=$noOfRecords") + noOfRecords } - private def deleteDataSource(datasourceRef: String): Unit = { - // TODO: Handle success and failure responses properly - val response = Unirest.delete(config.getString("druid.datasource.delete.url") + datasourceRef) - .header("Content-Type", "application/json") - .asJson() - response.ifFailure(_ => throw new Exception("Exception while deleting datasource" + datasourceRef)) + private def getDatasets(): List[Dataset] = { + val datasets: List[Dataset] = DatasetRegistry.getAllDatasets("master-dataset") + datasets.filter(dataset => { + dataset.datasetConfig.indexData.nonEmpty && dataset.datasetConfig.indexData.get && dataset.status == DatasetStatus.Live + }) } - private def createDataFile(dataset: Dataset, timestamp: Long, outputFilePath: String, objectKey: String): String = { - - val conf = new SparkConf() - .setAppName("MasterDataProcessorIndexer") - .setMaster("local[4]") - .set("spark.redis.host", dataset.datasetConfig.redisDBHost.get) - .set("spark.redis.port", String.valueOf(dataset.datasetConfig.redisDBHost.get)) - .set("spark.redis.db", 
String.valueOf(dataset.datasetConfig.redisDB.get)) - - val sc = new SparkContext(conf) - - val readWriteConf = ReadWriteConfig(scanCount = 1000, maxPipelineSize = 1000) - sc.fromRedisKV("*")(readWriteConfig = readWriteConf) - .map(f => JSONUtil.deserialize[mutable.Map[String, AnyRef]](f._2)) - .map(f => f.put("syncts", timestamp.asInstanceOf[AnyRef])) - .map(f => JSONUtil.serialize(f)) - .coalesce(1) - .saveAsTextFile(outputFilePath) - sc.stop() + def fetchDatasource(dataset: Dataset): DataSource = { + val datasources: List[DataSource] = DatasetRegistry.getDatasources(dataset.id).get + if (datasources.isEmpty) { + throw new ObsrvException(ErrorConstants.ERR_DATASOURCE_NOT_FOUND) + } + datasources.head + } - val storageService = StorageServiceFactory.getStorageService(StorageConfig(config.getString("cloudStorage.provider"), config.getString("cloudStorage.accountName"), config.getString("cloudStorage.accountKey"))) - storageService.upload(config.getString("cloudStorage.container"), outputFilePath + "/part-00000", objectKey, isDirectory = Option(false)) + // This method will fetch the dataset from database and processes the dataset + // then generates required metrics + def processDatasets(config: Config, spark: SparkSession): Unit = { + val datasets: List[Dataset] = getDatasets() + val metricHelper = new BaseMetricHelper(config) + datasets.foreach(dataset => { + logger.info(s"processDataset() | START | datasetId=${dataset.id}") + val metricData = try { + val metrics = processDataset(config, dataset, spark) + logger.info(s"processDataset() | SUCCESS | datasetId=${dataset.id} | Metrics=$metrics") + Edata(metric = metrics, labels = List(MetricLabel("job", "MasterDataIndexer"), MetricLabel("datasetId", dataset.id), MetricLabel("cloud", s"${config.getString("cloud.storage.provider")}"))) + } catch { + case ex: ObsrvException => + logger.error(s"processDataset() | FAILED | datasetId=${dataset.id} | Error=${ex.error}", ex) + Edata(metric = Map(metricHelper.getMetricName("failure_dataset_count") -> 1, "total_dataset_count" -> 1), labels = List(MetricLabel("job", "MasterDataIndexer"), MetricLabel("datasetId", dataset.id), MetricLabel("cloud", s"${config.getString("cloud.storage.provider")}")), err = ex.error.errorCode, errMsg = ex.error.errorMsg) + } + metricHelper.generate(datasetId = dataset.id, edata = metricData) + }) } -} + // $COVERAGE-OFF$ + def main(args: Array[String]): Unit = { + val config = ConfigFactory.load("masterdata-indexer.conf").withFallback(ConfigFactory.systemEnvironment()) + val spark = CommonUtil.getSparkSession("MasterDataIndexer", config) + processDatasets(config, spark) + spark.stop() + } + // $COVERAGE-ON$ +} \ No newline at end of file diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/helper/BaseMetricHelper.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/helper/BaseMetricHelper.scala new file mode 100644 index 00000000..5c09f6e1 --- /dev/null +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/helper/BaseMetricHelper.scala @@ -0,0 +1,41 @@ +package org.sunbird.obsrv.dataproducts.helper + +import com.typesafe.config.Config +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.dataproducts.model._ + +class BaseMetricHelper(config: Config) { + + val metrics: Map[String, String] = Map( + "total_dataset_count" -> "total_dataset_count", + "success_dataset_count" -> "success_dataset_count", + "failure_dataset_count" -> "failure_dataset_count", + "total_events_processed" -> "total_events_processed", + 
"total_time_taken" -> "total_time_taken" + ) + + private val metricsProducer = new KafkaMessageProducer(config) + + private def sync(metric: IJobMetric): Unit = { + val metricStr = JSONUtil.serialize(metric) + metricsProducer.sendMessage(message = metricStr) + } + + def getMetricName(name: String): String = { + metrics.getOrElse(name, "") + } + + private def getObject(datasetId: String) = { + MetricObject(id = datasetId, `type` = "Dataset", ver = "1.0.0") + } + + def generate(datasetId: String, edata: Edata): Unit = { + val `object` = getObject(datasetId) + val actor = Actor(id = "MasterDataProcessorIndexerJob", `type` = "SYSTEM") + val pdata = Pdata(id = "DataProducts", pid = "MasterDataProcessorIndexerJob", ver = "1.0.0") + val context = Context(env = config.getString("env"), pdata = pdata) + val metric = JobMetric(ets = System.currentTimeMillis(), actor = actor, context = context, `object` = `object`, edata = edata) + this.sync(metric) + } +} + diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/helper/KafkaMessageProducer.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/helper/KafkaMessageProducer.scala new file mode 100644 index 00000000..4fe9ab97 --- /dev/null +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/helper/KafkaMessageProducer.scala @@ -0,0 +1,34 @@ +package org.sunbird.obsrv.dataproducts.helper + +import com.typesafe.config.Config +import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} +import org.slf4j.LoggerFactory + +import java.util.Properties + +class KafkaMessageProducer(config: Config) { + + private[this] val logger = LoggerFactory.getLogger(classOf[KafkaMessageProducer]) + private val kafkaProperties = new Properties(); + private val defaultTopicName = config.getString("metrics.topicName") + private val defaultKey = null + + kafkaProperties.put("bootstrap.servers", config.getString("kafka.bootstrap.servers")) + kafkaProperties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") + kafkaProperties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") + val producer = new KafkaProducer[String, String](kafkaProperties) + + def sendMessage(topic: String = defaultTopicName, key: String = defaultKey, message: String): Unit = { + try { + val record = new ProducerRecord[String, String](topic, key, message) + producer.send(record) + } + // $COVERAGE-OFF$ + catch { + case e: Exception => + logger.error("Exception occured while sending message to kafka", e.getMessage) + e.printStackTrace() + } + // $COVERAGE-ON$ + } +} diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/model/JobMetric.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/model/JobMetric.scala new file mode 100644 index 00000000..0e8ce6a1 --- /dev/null +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/model/JobMetric.scala @@ -0,0 +1,28 @@ +package org.sunbird.obsrv.dataproducts.model + +import java.util.UUID + +case class Actor(id: String, `type`: String) + +case class Context(env: String, pdata: Pdata) + +case class Edata(metric: Map[String, Any], labels: Seq[MetricLabel], err: String = null, errMsg: String = null) + +case class MetricLabel(key: String, value: String) + +case class MetricObject(id: String, `type`: String, ver: String) + +case class Pdata(id: String, pid: String, ver: String) + +trait IJobMetric { + val eid: String + val ets: Long + val mid: String + val actor: Actor + val context: Context + val `object`: 
MetricObject + val edata: Edata +} + +case class JobMetric(eid: String = "METRIC", ets: Long, mid: String = UUID.randomUUID().toString, actor: Actor, context: Context, `object`: MetricObject, edata: Edata) extends IJobMetric + diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/CommonUtil.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/CommonUtil.scala new file mode 100644 index 00000000..4deb017b --- /dev/null +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/CommonUtil.scala @@ -0,0 +1,45 @@ +package org.sunbird.obsrv.dataproducts.util + +import com.typesafe.config.Config +import org.apache.logging.log4j.{LogManager, Logger} +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.{DataSource, Dataset} + +import scala.collection.mutable + +object CommonUtil { + + private final val logger: Logger = LogManager.getLogger(CommonUtil.getClass) + + def time[R](block: => R): (Long, R) = { + val t0 = System.currentTimeMillis() + val result = block // call-by-name + val t1 = System.currentTimeMillis() + ((t1 - t0), result) + } + + def processEvent(value: String, ts: Long) = { + val json = JSONUtil.deserialize[mutable.Map[String, AnyRef]](value) + json("obsrv_meta") = mutable.Map[String, AnyRef]("syncts" -> ts.asInstanceOf[AnyRef]).asInstanceOf[AnyRef] + JSONUtil.serialize(json) + } + + private def getSafeConfigString(config: Config, key: String): String = { + if (config.hasPath(key)) config.getString(key) else "" + } + + def getSparkSession(appName: String, config: Config): SparkSession = { + + val conf = new SparkConf().setAppName(appName) + val master = getSafeConfigString(config, "spark.master") + + if (master.isEmpty) { + logger.info("Master not found. 
Setting it to local[*]") + conf.setMaster("local[*]") + } + SparkSession.builder().appName(appName).config(conf).getOrCreate() + } + +} \ No newline at end of file diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/HttpUtil.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/HttpUtil.scala new file mode 100644 index 00000000..90a33055 --- /dev/null +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/HttpUtil.scala @@ -0,0 +1,17 @@ +package org.sunbird.obsrv.dataproducts.util + +import kong.unirest.{HttpResponse, JsonNode, Unirest} + +import scala.collection.JavaConverters._ +import scala.language.postfixOps + +object HttpUtil extends Serializable { + + def post(url: String, requestBody: String, headers: Map[String, String] = Map[String, String]("Content-Type" -> "application/json")): HttpResponse[JsonNode] = { + Unirest.post(url).headers(headers.asJava).body(requestBody).asJson() + } + + def delete(url: String): HttpResponse[JsonNode] = { + Unirest.delete(url).header("Content-Type", "application/json").asJson() + } +} diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/StorageUtil.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/StorageUtil.scala new file mode 100644 index 00000000..f98a047c --- /dev/null +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/util/StorageUtil.scala @@ -0,0 +1,51 @@ +package org.sunbird.obsrv.dataproducts.util + +import com.typesafe.config.Config +import org.apache.logging.log4j.{LogManager, Logger} +import org.joda.time.format.{DateTimeFormat, DateTimeFormatter} +import org.joda.time.{DateTime, DateTimeZone} +import org.sunbird.obsrv.core.exception.ObsrvException +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.dataproducts.MasterDataProcessorIndexer +import org.sunbird.obsrv.model.DatasetModels.DataSource + +object StorageUtil { + val logger: Logger = LogManager.getLogger(MasterDataProcessorIndexer.getClass) + val dayPeriodFormat: DateTimeFormatter = DateTimeFormat.forPattern("yyyyMMdd").withZoneUTC() + + case class Paths(datasourceRef: String, ingestionPath: String, outputFilePath: String, timestamp: Long) + + case class BlobProvider(sparkURIFormat: String, ingestionSourceType: String, druidURIFormat: String) + + // This method returns a BlobProvider object based on cloud storage provider + def providerFormat(cloudProvider: String): BlobProvider = { + cloudProvider match { + case "local" => BlobProvider("file", "local", "file") + case "aws" => BlobProvider("s3a", "s3", "s3") + case "azure" => BlobProvider("wasbs", "azure", "azure") + case "gcloud" => BlobProvider("gs", "google", "gs") + case "cephs3" => BlobProvider("s3a", "s3", "s3") // TODO: Have to check Druid compatibility + case "oci" => BlobProvider("s3a", "s3", "s3") // TODO: Have to check Druid compatibility + case _ => throw new ObsrvException(ErrorConstants.UNSUPPORTED_PROVIDER) + } + } + + def getPaths(datasource: DataSource, config: Config): Paths = { + val dt = new DateTime(DateTimeZone.UTC).withTimeAtStartOfDay() + val timestamp = dt.getMillis + val date = dayPeriodFormat.print(dt) + val provider = providerFormat(config.getString("cloud.storage.provider")) + val cloudPrefix = provider.sparkURIFormat + config.getString("cloud.storage.container") + val pathSuffix = s"""masterdata-indexer/${datasource.datasetId}/$date/""" + val ingestionPath = cloudPrefix.replace(provider.sparkURIFormat, provider.druidURIFormat) + pathSuffix + val 
datasourceRef = datasource.datasource + '-' + date + val outputFilePath = cloudPrefix + pathSuffix + Paths(datasourceRef, ingestionPath, outputFilePath, timestamp) + } + + // This method provides appropriate input source spec depending on the cloud storage provider + def getInputSourceSpec(filePath: String, config: Config): String = { + config.getString("source.spec").replace("FILE_PATH", filePath) + } + +} \ No newline at end of file diff --git a/data-products/src/test/resources/application.conf b/data-products/src/test/resources/application.conf deleted file mode 100644 index 6b0a1c94..00000000 --- a/data-products/src/test/resources/application.conf +++ /dev/null @@ -1,7 +0,0 @@ -# do not delete this file -redis.host="localhost" -redis.port="6379" -cloudStorage.container="obsrv-data" -cloudStorage.provider="aws" -druid.indexer.url=http://localhost:8888/druid/indexer/v1/task -druid.datasource.delete.url=http://localhost:8888/druid/coordinator/v1/datasources/ \ No newline at end of file diff --git a/data-products/src/test/resources/masterdata-indexer-test.conf b/data-products/src/test/resources/masterdata-indexer-test.conf new file mode 100644 index 00000000..ecf5f976 --- /dev/null +++ b/data-products/src/test/resources/masterdata-indexer-test.conf @@ -0,0 +1,35 @@ +# do not delete this file +env=local + +redis.host="localhost" +redis.port="6379" +redis.scan.count=1000 +redis.max.pipeline.size=1000 +cloud.storage.container=":///"containerName"/" +cloud.storage.provider="local" +cloud.storage.accountName="obsrv" # Is required when azure is provider. Will only be used when azure is the provider +druid.indexer.url="http://localhost:8888/druid/indexer/v1/task" +druid.datasource.delete.url="http://localhost:8888/druid/coordinator/v1/datasources/" + +metrics { + topicName = ""${env}".spark.stats" +} + +kafka { + bootstrap.servers = "localhost:9092" +} + +#inputSourceSpec +source.spec="{\"spec\":{\"ioConfig\":{\"type\":\"index_parallel\",\"inputSource\":{\"type\":\"local\",\"baseDir\":\"FILE_PATH\",\"filter\":\"**json.gz\"}}}}" + +#deltaIngestionSpec +delta.ingestion.spec= "{\"type\":\"index_parallel\",\"spec\":{\"dataSchema\":{\"dataSource\":\"DATASOURCE_REF\"},\"ioConfig\":{\"type\":\"index_parallel\"},\"tuningConfig\":{\"type\":\"index_parallel\",\"maxRowsInMemory\":500000,\"forceExtendableShardSpecs\":false,\"logParseExceptions\":true}}}" + +postgres { + host = localhost + port = 5432 + maxConnections = 2 + user = "postgres" + password = "postgres" + database="postgres" +} \ No newline at end of file diff --git a/data-products/src/test/scala/org/sunbird/fixture/EventFixture.scala b/data-products/src/test/scala/org/sunbird/fixture/EventFixture.scala new file mode 100644 index 00000000..e40425a9 --- /dev/null +++ b/data-products/src/test/scala/org/sunbird/fixture/EventFixture.scala @@ -0,0 +1,11 @@ +package org.sunbird.fixture + +object EventFixture { + + val d1 = """{"fcm_token":"","city":"Sohna","device_id":"device-00","device_spec":"\"{'os':'Windows OS','cpu':'Apple M1','make':'Motorola XT1706'}\"","state":"Karnataka","uaspec":{"agent":"Safari","ver":"76.0.3809.132","system":"iOS 10","raw":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 
Safari/537.36"},"country":"India","country_code":"IN","producer_id":"dev.obsrv.portal","state_code_custom":29,"state_code":"KA","state_custom":"Karnataka","district_custom":"Karnataka,s","first_access":1568379184000,"api_last_updated_on":1568377184000,"user_declared_district":"Bedfordshire","user_declared_state":"Karnataka"}""" + val d2 = """{"fcm_token":"","city":"Nawapur","device_id":"device-01","device_spec":"\"{'os':'CentOS','cpu':'Samsung Exynos','make':'Samsung S23'}\"","state":"Sikkim","uaspec":{"agent":"Safari","ver":"76.0.3809.132","system":"Windows OS","raw":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"},"country":"India","country_code":"IN","producer_id":"dev.obsrv.portal","state_code_custom":29,"state_code":"SI","state_custom":"Karnataka","district_custom":"Karnataka,s","first_access":1568379184000,"api_last_updated_on":1568377184000,"user_declared_district":"Berkshire","user_declared_state": "Sikkim"}""" + val d3 = """{"fcm_token":"","city":"Sumerpur","device_id":"device-02","device_spec":"\"{'os':'iOS 15','cpu':'abi: armeabi-v7a ARMv7 Processor rev 4 (v7l)','make':'iPhone'}\"","state":"Chhattisgarh","uaspec":{"agent":"Chrome","ver":"76.0.3809.132","system":"Android 6.0","raw":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"},"country":"India","country_code":"IN","producer_id":"dev.obsrv.portal","state_code_custom":29,"state_code":"CH","state_custom":"Karnataka","district_custom":"Karnataka,s","first_access":1568379184000,"api_last_updated_on":1568377184000,"user_declared_district":"Buckinghamshire","user_declared_state":"Chhattisgarh"}""" + val d4 = """{"fcm_token":"","city":"Kailasahar","device_id":"device-03","device_spec":"\"{'os':'Android 6.0','cpu':'AMD Ryzen 4900X','make':'Motorola XT1706'}\"","state":"Kerala","uaspec":{"agent":"Chrome","ver":"76.0.3809.132","system":"Android 12.0","raw":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"},"country":"India","country_code":"IN","producer_id":"dev.obsrv.portal","state_code_custom":29,"state_code":"KE","state_custom":"Karnataka","district_custom":"Karnataka,s","first_access":1568379184000,"api_last_updated_on":1568377184000,"user_declared_district":"Bedfordshire","user_declared_state":"Kerala"}""" + val d5 = """{"fcm_token":"","city":"Shahade","device_id":"device-04","device_spec":"\"{'os':'Ubuntu','cpu':'Qualcomm Snapdragon','make':'Blackberry'}\"","state":"Meghalaya","uaspec":{"agent":"Opera","ver":"76.0.3809.132","system":"Blackberry","raw":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"},"country":"India","country_code":"IN","producer_id":"dev.obsrv.portal","state_code_custom":29,"state_code":"ME","state_custom":"Karnataka","district_custom":"Karnataka,s","first_access":1568379184000,"api_last_updated_on":1568377184000,"user_declared_district":"Berkshire","user_declared_state":"Meghalaya"}""" + +} diff --git a/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala b/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala new file mode 100644 index 00000000..0d54050e --- /dev/null +++ b/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala @@ -0,0 +1,242 @@ +package org.sunbird.obsrv.spec + +import com.typesafe.config.{Config, ConfigFactory, ConfigValueFactory} +import 
io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig, duration2JavaDuration} +import io.zonky.test.db.postgres.embedded.EmbeddedPostgres +import kong.unirest.{HttpResponse, JsonNode} +import org.joda.time.format.{DateTimeFormat, DateTimeFormatter} +import org.joda.time.{DateTime, DateTimeZone} +import org.mockito.MockitoSugar.{mock, when} +import org.sunbird.obsrv.registry.DatasetRegistry +import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} +import org.sunbird.fixture.EventFixture +import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.util.{PostgresConnect, PostgresConnectionConfig} +import org.sunbird.obsrv.dataproducts.helper.BaseMetricHelper +import org.sunbird.obsrv.dataproducts.model.{Edata, MetricLabel} +import redis.embedded.RedisServer + +import scala.collection.JavaConverters._ +import scala.concurrent.duration.FiniteDuration +import org.apache.kafka.common.serialization.StringDeserializer +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.mockito.Mockito +import org.sunbird.obsrv.core.exception.ObsrvException +import org.sunbird.obsrv.core.model +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.dataproducts +import org.sunbird.obsrv.dataproducts.MasterDataProcessorIndexer +import org.sunbird.obsrv.dataproducts.util.StorageUtil.BlobProvider +import org.sunbird.obsrv.dataproducts.util.{CommonUtil, HttpUtil, StorageUtil} + +import scala.collection.mutable.ListBuffer + +class MasterDataIndexerSpec extends FlatSpec with BeforeAndAfterAll with Matchers { + + private val jobConfig: Config = ConfigFactory.load("masterdata-indexer-test.conf").withFallback(ConfigFactory.systemEnvironment()) + val mockMetrics = mock[BaseMetricHelper] + val pwd = System.getProperty("user.dir") + + val postgresConfig = PostgresConnectionConfig( + user = jobConfig.getString("postgres.user"), + password = jobConfig.getString("postgres.password"), + database = "postgres", + host = jobConfig.getString("postgres.host"), + port = jobConfig.getInt("postgres.port"), + maxConnections = jobConfig.getInt("postgres.maxConnections") + ) + + var embeddedPostgres: EmbeddedPostgres = _ + var redisServer: RedisServer = _ + var redisConnection: RedisConnect = _ + private val dayPeriodFormat: DateTimeFormatter = DateTimeFormat.forPattern("yyyyMMdd").withZoneUTC() + val dt = new DateTime(DateTimeZone.UTC).withTimeAtStartOfDay() + val date = dayPeriodFormat.print(dt) + + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9092, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + var spark: SparkSession = _ + + + override def beforeAll(): Unit = { + super.beforeAll() + val conf = new SparkConf() + .setAppName("MasterDataProcessorIndexer") + .setMaster("local[*]") + spark = SparkSession.builder().config(conf).getOrCreate() + redisServer = new RedisServer(6340) + redisServer.start() + embeddedPostgres = EmbeddedPostgres.builder.setPort(5432).start() + val postgresConnect = new PostgresConnect(postgresConfig) + createSchema(postgresConnect) + insertTestData(postgresConnect) + redisConnection = new RedisConnect("localhost", 6340, 30000) + val jedis = redisConnection.getConnection(3) + jedis.set("device-00", EventFixture.d1) + 
jedis.set("device-01", EventFixture.d2) + jedis.set("device-02", EventFixture.d3) + jedis.set("device-03", EventFixture.d4) + jedis.set("device-04", EventFixture.d5) + EmbeddedKafka.start()(embeddedKafkaConfig) + createTestTopics() + } + + override def afterAll(): Unit = { + super.afterAll() + redisServer.stop() + embeddedPostgres.close() + EmbeddedKafka.stop() + spark.stop() + } + + def createTestTopics(): Unit = { + EmbeddedKafka.createCustomTopic("spark.stats") + } + + private def createSchema(postgresConnect: PostgresConnect) { + postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasets ( id text PRIMARY KEY, type text NOT NULL, validation_config json, extraction_config json, dedup_config json, data_schema json, denorm_config json, router_config json NOT NULL, dataset_config json NOT NULL, status text NOT NULL, tags text[], data_version INT, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL );") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL);") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function json NOT NULL, status text NOT NULL, mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_source_config ( id text PRIMARY KEY, dataset_id text NOT NULL REFERENCES datasets (id), connector_type text NOT NULL, connector_config json NOT NULL, status text NOT NULL, connector_stats json, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(connector_type, dataset_id) );") + } + + private def insertTestData(postgresConnect: PostgresConnect) = { + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md1','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": 
[{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") + postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md1_md1.1_DAY', 'md1', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md1.1_DAY', 'md1.1_DAY');") + postgresConnect.execute("insert into dataset_transformations values('tf1', 'md1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Live', 'Strict', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md2','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": 
{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\":6340, \"index_data\": true, \"redis_db\": 5}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', 'now()', 'now()');") + postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md2_md1.1_DAY', 'md2', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md2.1_DAY', 'md2.1_DAY');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md3','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": 
{\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 6}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") + postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md3_md3.1_DAY', 'md3', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md3.1_DAY', 'md3.1_DAY');") + postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md3_md3.2_DAY', 'md3', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": 
\"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md3.2_DAY', 'md3.2_DAY');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md5','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": 
{\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 9}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md4','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") + postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md4_md4.1_DAY', 'md4', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": 
\"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md4.1_DAY', 'md4.1_DAY');") + + } + + def checkTestTopicsOffset(): Unit = { + val topics: java.util.Collection[String] = new java.util.ArrayList[String]() + topics.add("spark.stats") + val consumerPollingTimeout: FiniteDuration = FiniteDuration(1, "minute") + EmbeddedKafka.withConsumer[String, String, Unit] { + val messagesBuffers = topics.asScala.map(_ -> ListBuffer.empty[(String, String)]).toMap + consumer => + consumer.subscribe(topics) + val recordIterator = consumer.poll(duration2JavaDuration(consumerPollingTimeout)).iterator() + while (recordIterator.hasNext) { + val record = recordIterator.next + messagesBuffers(record.topic) += (record.key() -> record.value()) + consumer.commitSync() + } + consumer.close() + val messages = messagesBuffers.mapValues(_.toList) + messages("spark.stats").length shouldBe 4 + + } + } + + it should "index datasets for single datasource and generate metrics for local storage" in { + val config = jobConfig.withValue("cloud.storage.container", ConfigValueFactory.fromAnyRef(s"${pwd}/obsrv-data")) + println("Path -> " + config.getString("cloud.storage.container")) + assertThrows[Exception]( + MasterDataProcessorIndexer.processDatasets(config, spark) + ) + } + + it should "index datasets for aws" in { + val config = jobConfig.withValue("cloud.storage.provider", ConfigValueFactory.fromAnyRef("aws")) + assertThrows[Exception]( + MasterDataProcessorIndexer.processDatasets(config, spark) + ) + } + + it should "index datasets for azure" in { + val config = jobConfig.withValue("cloud.storage.provider", ConfigValueFactory.fromAnyRef("azure")) + assertThrows[Exception]( + MasterDataProcessorIndexer.processDatasets(config, spark) + ) + } + + it should "index datasets for gcloud" in { + val config = jobConfig.withValue("cloud.storage.provider", ConfigValueFactory.fromAnyRef("gcloud")) + assertThrows[Exception]( + MasterDataProcessorIndexer.processDatasets(config, spark) + ) + } + + it should "index datasets for cephs3" in { + val config = jobConfig.withValue("cloud.storage.provider", ConfigValueFactory.fromAnyRef("cephs3")) + assertThrows[Exception]( + 
MasterDataProcessorIndexer.processDatasets(config, spark) + ) + } + + it should "index datasets for oci" in { + val config = jobConfig.withValue("cloud.storage.provider", ConfigValueFactory.fromAnyRef("oci")) + assertThrows[Exception]( + MasterDataProcessorIndexer.processDatasets(config, spark) + ) + } + + it should "not index datasets for unknown provider" in { + val provider = jobConfig.withValue("cloud.storage.provider", ConfigValueFactory.fromAnyRef("ibm")) + val dataset = DatasetRegistry.getDataset("md1") + MasterDataProcessorIndexer.processDatasets(provider, spark) + val edata = Edata(metric = Map(mockMetrics.getMetricName("failure_dataset_count") -> 1, mockMetrics.getMetricName("total_dataset_count") -> 1), labels = List(MetricLabel("job", "MasterDataIndexer"), MetricLabel("datasetId", dataset.get.id), MetricLabel("cloud", s"${provider.getString("cloud.storage.provider")}")), err = "FAILED", errMsg = "Unsupported provider") + } + + it should "throw exception when datasource is null" in { + val dataset = DatasetRegistry.getDataset("md5") + the[ObsrvException] thrownBy { + MasterDataProcessorIndexer.fetchDatasource(dataset.get) + } should have message ErrorConstants.ERR_DATASOURCE_NOT_FOUND.errorMsg + } + + it should "create a SparkSession with default master (local[*]) if not provided in the config" in { + val appName = "MasterDataIndexer" + val configString = "" // No spark.master in the config + val config = ConfigFactory.parseString(configString) + val sparkSession = CommonUtil.getSparkSession(appName, config) + sparkSession should not be null + sparkSession.conf.get("spark.master") shouldEqual "local[*]" + sparkSession.conf.get("spark.app.name") shouldEqual appName + } + + it should "create a SparkSession with provided app name and master" in { + val appName = "TestApp" + val master = "local[2]" + val configString = + s""" + |spark.master = "$master" + """.stripMargin + val config = ConfigFactory.parseString(configString) + val sparkSession = CommonUtil.getSparkSession(appName, config) + sparkSession should not be null + sparkSession.conf.get("spark.app.name") shouldEqual appName + } + + it should "throw exception while submitting ingestion" in { + val config = jobConfig.withValue("cloud.storage.container", ConfigValueFactory.fromAnyRef(s"${pwd}/obsrv-data")) + val dataset = DatasetRegistry.getDataset("md1") + assertThrows[Exception]( + MasterDataProcessorIndexer.processDataset(config, dataset.get, spark) + ) + } + + it should "return proper provider format for each cloud provider" in { + StorageUtil.providerFormat("aws") shouldEqual BlobProvider("s3a", "s3", "s3") + StorageUtil.providerFormat("azure") shouldEqual BlobProvider("wasbs", "azure", "azure") + StorageUtil.providerFormat("gcloud") shouldEqual BlobProvider("gs", "google", "gs") + StorageUtil.providerFormat("cephs3") shouldEqual BlobProvider("s3a", "s3", "s3") + StorageUtil.providerFormat("oci") shouldEqual BlobProvider("s3a", "s3", "s3") + } + +} diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala index e783b871..88efb7a6 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala @@ -9,7 +9,6 @@ import java.io.File import java.sql.{ResultSet, Timestamp} object DatasetRegistryService { - private val configFile = new 
File("/data/flink/conf/baseconfig.conf") // $COVERAGE-OFF$ This code only executes within a flink cluster val config: Config = if (configFile.exists()) { @@ -37,7 +36,7 @@ object DatasetRegistryService { val dataset = parseDataset(result) (dataset.id, dataset) }).toMap - } finally { + } finally { postgresConnect.closeConnection() } } @@ -47,7 +46,7 @@ object DatasetRegistryService { val postgresConnect = new PostgresConnect(postgresConfig) try { val rs = postgresConnect.executeQuery(s"SELECT * FROM datasets where id='$id'") - if(rs.next()) { + if (rs.next()) { Some(parseDataset(rs)) } else { None @@ -156,7 +155,7 @@ object DatasetRegistryService { val datasetConfig = rs.getString("dataset_config") val status = rs.getString("status") val tagArray = rs.getArray("tags") - val tags = if(tagArray != null) tagArray.getArray.asInstanceOf[Array[String]] else null + val tags = if (tagArray != null) tagArray.getArray.asInstanceOf[Array[String]] else null val dataVersion = rs.getInt("data_version") Dataset(datasetId, datasetType, @@ -183,7 +182,7 @@ object DatasetRegistryService { DatasetSourceConfig(id = id, datasetId = datasetId, connectorType = connectorType, JSONUtil.deserialize[ConnectorConfig](connectorConfig), status, - if(connectorStats != null) Some(JSONUtil.deserialize[ConnectorStats](connectorStats)) else None + if (connectorStats != null) Some(JSONUtil.deserialize[ConnectorStats](connectorStats)) else None ) } @@ -205,7 +204,7 @@ object DatasetRegistryService { val status = rs.getString("status") val mode = rs.getString("mode") - DatasetTransformation(id, datasetId, fieldKey, JSONUtil.deserialize[TransformationFunction](transformationFunction), status, Some(if(mode != null) TransformMode.withName(mode) else TransformMode.Strict)) + DatasetTransformation(id, datasetId, fieldKey, JSONUtil.deserialize[TransformationFunction](transformationFunction), status, Some(if (mode != null) TransformMode.withName(mode) else TransformMode.Strict)) } } \ No newline at end of file diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala index efac0fbe..b5e57d87 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala @@ -3,7 +3,9 @@ package org.sunbird.obsrv.core.model object ErrorConstants extends Enumeration { type Error = ErrorValue + case class ErrorValue(errorCode: String, errorMsg: String) + protected final def ErrorInternalValue(errorCode: String, errorMsg: String): ErrorValue = { ErrorValue(errorCode, errorMsg) } @@ -26,7 +28,7 @@ object ErrorConstants extends Enumeration { val DENORM_KEY_NOT_A_STRING_OR_NUMBER = ErrorInternalValue("ERR_DENORM_1015", "Denorm key value is not a String or Number") val DENORM_DATA_NOT_FOUND = ErrorInternalValue("ERR_DENORM_1016", "Denorm data not found for the given key") val MISSING_DATASET_CONFIG_KEY = ErrorInternalValue("ERR_MASTER_DATA_1017", "Master dataset configuration key is missing") - val ERR_INVALID_EVENT = ErrorInternalValue("ERR_EXT_1018", "Invalid JSON event, error while deserializing the event") + val ERR_INVALID_EVENT = ErrorInternalValue("ERR_EXT_1018", "Invalid JSON event, error while deserializing the event") val INDEX_KEY_MISSING_OR_BLANK = ErrorInternalValue("ERR_ROUTER_1019", "Unable to index data as the timestamp key is missing or blank or not a datetime value") val INVALID_EXPR_FUNCTION = 
ErrorInternalValue("ERR_TRANSFORM_1020", "Transformation expression function is not valid") val ERR_EVAL_EXPR_FUNCTION = ErrorInternalValue("ERR_TRANSFORM_1021", "Unable to evaluate the transformation expression function") @@ -36,5 +38,9 @@ object ErrorConstants extends Enumeration { val SYSTEM_SETTING_INVALID_TYPE = ErrorInternalValue("ERR_SYSTEM_SETTING_1025", "Invalid value type for system setting") val SYSTEM_SETTING_NOT_FOUND = ErrorInternalValue("ERR_SYSTEM_SETTING_1026", "System setting not found for requested key") val SYSTEM_SETTING_DEFAULT_VALUE_NOT_FOUND = ErrorInternalValue("ERR_SYSTEM_SETTING_1027", "Default value not found for requested key") - + val HTTP_SERVER_ERR = ErrorInternalValue("ERR_SERVER_CONNECTION_1028", "Connection refused.") + val ERR_DATASOURCE_NOT_FOUND = ErrorInternalValue("ERR_MDP_1029", "Datasource not found.") + val UNSUPPORTED_PROVIDER = ErrorInternalValue("ERR_UNSUPPORTED_PROVIDER_1030", "Unsupported provider.") + val ERR_SUBMIT_INGESTION_FAILED = ErrorInternalValue("ERR_MDP_1031", "Unable to submit ingestion task to druid.") + val ERR_DELETE_DATASOURCE_FAILED = ErrorInternalValue("ERR_MDP_1032", "Failed to delete datasource.") } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala index 118e8c53..0adb1098 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/SystemConfig.scala @@ -88,21 +88,26 @@ object SystemConfigService { @throws[Exception] def getAllSystemSettings: List[SystemSetting] = { val postgresConnect = new PostgresConnect(postgresConfig) - val rs = postgresConnect.executeQuery("SELECT * FROM system_settings") - val result = Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { - parseSystemSetting(result) - }).toList - postgresConnect.closeConnection() - result + try { + val rs = postgresConnect.executeQuery("SELECT * FROM system_settings") + val result = Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { + parseSystemSetting(result) + }).toList + result + } finally { + postgresConnect.closeConnection() + } } @throws[Exception] def getSystemSetting(key: String): Option[SystemSetting] = { val postgresConnect = new PostgresConnect(postgresConfig) - val rs = postgresConnect.executeQuery(s"SELECT * FROM system_settings WHERE key = '$key'") - if (rs.next) { - Option(parseSystemSetting(rs)) - } else None + try { + val rs = postgresConnect.executeQuery(s"SELECT * FROM system_settings WHERE key = '$key'") + if (rs.next) Option(parseSystemSetting(rs)) else None + } finally { + postgresConnect.closeConnection() + } } private def parseSystemSetting(rs: ResultSet): SystemSetting = { diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala index dc6eaa66..9cbfe1e9 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala @@ -17,7 +17,8 @@ abstract class BaseJobConfig[T](val config: Config, val jobName: String) extends implicit val metricTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) - def defaultDatasetID: String = SystemConfig.getString("defaultDatasetId", "ALL") + lazy val defaultDatasetID: String = 
SystemConfig.getString("defaultDatasetId", "ALL") + private val kafkaProducerBrokerServers: String = config.getString("kafka.producer.broker-servers") private val kafkaConsumerBrokerServers: String = config.getString("kafka.consumer.broker-servers") // Producer Properties diff --git a/pipeline/kafka-connector/pom.xml b/pipeline/kafka-connector/pom.xml deleted file mode 100644 index 65aa4d68..00000000 --- a/pipeline/kafka-connector/pom.xml +++ /dev/null @@ -1,263 +0,0 @@ - - - - 4.0.0 - - org.sunbird.obsrv - pipeline - 1.0 - - - org.sunbird.obsrv.pipeline - kafka-connector - 1.0.0 - jar - Kafka Connector - - Reads data from source kafka topic(s) and writes them to a configurable topic - - - - UTF-8 - 1.4.0 - - - - - org.apache.flink - flink-streaming-scala_${scala.maj.version} - ${flink.version} - provided - - - org.sunbird.obsrv - dataset-registry - 1.0.0 - - - org.apache.kafka - kafka-clients - - - - - joda-time - joda-time - 2.12.5 - - - com.fasterxml.jackson.datatype - jackson-datatype-joda - 2.15.2 - - - org.sunbird.obsrv - framework - 1.0.0 - - - org.sunbird.obsrv - framework - 1.0.0 - test-jar - test - - - org.sunbird.obsrv - dataset-registry - 1.0.0 - test-jar - test - - - org.apache.flink - flink-test-utils - ${flink.version} - test - - - org.apache.flink - flink-runtime - ${flink.version} - test - tests - - - org.apache.kafka - kafka-clients - ${kafka.version} - test - - - org.apache.kafka - kafka_${scala.maj.version} - ${kafka.version} - test - - - com.github.codemonstur - embedded-redis - 1.0.0 - test - - - io.github.embeddedkafka - embedded-kafka_2.12 - 3.4.0 - test - - - io.zonky.test - embedded-postgres - 2.0.3 - test - - - org.apache.flink - flink-streaming-java - ${flink.version} - test - tests - - - org.scalatest - scalatest_2.12 - 3.0.6 - test - - - org.mockito - mockito-core - 3.3.3 - test - - - - - src/main/scala - src/test/scala - - - org.apache.maven.plugins - maven-compiler-plugin - 3.8.1 - - 11 - - - - org.apache.maven.plugins - maven-shade-plugin - 3.2.1 - - - - package - - shade - - - - - com.google.code.findbugs:jsr305 - - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - org.sunbird.obsrv.connector.task.KafkaConnectorStreamTask - - - - reference.conf - - - - - - - - - net.alchim31.maven - scala-maven-plugin - 4.4.0 - - ${java.target.runtime} - ${java.target.runtime} - ${scala.version} - false - - - - scala-compile-first - process-resources - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - - - maven-surefire-plugin - 2.22.2 - - true - - - - - org.scalatest - scalatest-maven-plugin - 1.0 - - ${project.build.directory}/surefire-reports - . 
- dp-duplication-testsuite.txt - - - - test - - test - - - - - - org.scoverage - scoverage-maven-plugin - ${scoverage.plugin.version} - - ${scala.version} - true - true - - - - - - diff --git a/pipeline/kafka-connector/src/main/resources/kafka-connector.conf b/pipeline/kafka-connector/src/main/resources/kafka-connector.conf deleted file mode 100644 index 9b5c575b..00000000 --- a/pipeline/kafka-connector/src/main/resources/kafka-connector.conf +++ /dev/null @@ -1,16 +0,0 @@ -include "baseconfig.conf" - -kafka { - input.topic = ${job.env}".test" - // output.topic = ${job.env}".ingest" - event.max.size = "1048576" # Max is only 1MB - groupId = ${job.env}"-kafkaconnector-group" - producer { - max-request-size = 5242880 - } -} - -task { - consumer.parallelism = 1 - downstream.operators.parallelism = 1 -} \ No newline at end of file diff --git a/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorConfig.scala b/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorConfig.scala deleted file mode 100644 index 05ccaa8e..00000000 --- a/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorConfig.scala +++ /dev/null @@ -1,25 +0,0 @@ -package org.sunbird.obsrv.connector.task - -import com.typesafe.config.Config -import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.api.java.typeutils.TypeExtractor -import org.apache.flink.streaming.api.scala.OutputTag -import org.sunbird.obsrv.core.streaming.BaseJobConfig - -import scala.collection.mutable - -class KafkaConnectorConfig(override val config: Config) extends BaseJobConfig[String](config, "KafkaConnectorJob") { - - private val serialVersionUID = 2905979435603791379L - - implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) - implicit val stringTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) - - override def inputTopic(): String = "" - - override def inputConsumer(): String = "" - - override def successTag(): OutputTag[String] = OutputTag[String]("dummy-events") - - override def failedEventsOutputTag(): OutputTag[String] = OutputTag[String]("failed-events") -} diff --git a/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorStreamTask.scala b/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorStreamTask.scala deleted file mode 100644 index 175f64fa..00000000 --- a/pipeline/kafka-connector/src/main/scala/org/sunbird/obsrv/connector/task/KafkaConnectorStreamTask.scala +++ /dev/null @@ -1,71 +0,0 @@ -package org.sunbird.obsrv.connector.task - -import com.typesafe.config.ConfigFactory -import org.apache.flink.api.java.utils.ParameterTool -import org.apache.flink.streaming.api.datastream.DataStream -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment -import org.joda.time.{DateTime, DateTimeZone} -import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} -import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil} -import org.sunbird.obsrv.registry.DatasetRegistry - -import java.io.File - -class KafkaConnectorStreamTask(config: KafkaConnectorConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[String] { - - private val serialVersionUID = -7729362727131516112L - - // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster - def process(): Unit = { - implicit 
val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) - env.execute(config.jobName) - } - - override def processStream(dataStream: DataStream[String]): DataStream[String] = { - null - } - // $COVERAGE-ON$ - - def process(env: StreamExecutionEnvironment): Unit = { - val datasetSourceConfig = DatasetRegistry.getAllDatasetSourceConfig() - datasetSourceConfig.map { configList => - configList.filter(_.connectorType.equalsIgnoreCase("kafka")).map { - dataSourceConfig => - val dataStream: DataStream[String] = getStringDataStream(env, config, List(dataSourceConfig.connectorConfig.topic), - config.kafkaConsumerProperties(kafkaBrokerServers = Some(dataSourceConfig.connectorConfig.kafkaBrokers), - kafkaConsumerGroup = Some(s"kafka-${dataSourceConfig.connectorConfig.topic}-consumer")), - consumerSourceName = s"kafka-${dataSourceConfig.connectorConfig.topic}", kafkaConnector) - val datasetId = dataSourceConfig.datasetId - val kafkaOutputTopic = DatasetRegistry.getDataset(datasetId).get.datasetConfig.entryTopic - val resultStream: DataStream[String] = dataStream.map { streamData: String => { - val syncts = java.lang.Long.valueOf(new DateTime(DateTimeZone.UTC).getMillis) - JSONUtil.getJsonType(streamData) match { - case "ARRAY" => s"""{"dataset":"$datasetId","syncts":$syncts,"events":$streamData}""" - case _ => s"""{"dataset":"$datasetId","syncts":$syncts,"event":$streamData}""" - } - } - }.returns(classOf[String]) - resultStream.sinkTo(kafkaConnector.kafkaSink[String](kafkaOutputTopic)) - .name(s"$datasetId-kafka-connector-sink").uid(s"$datasetId-kafka-connector-sink") - .setParallelism(config.downstreamOperatorsParallelism) - } - } - } - -} - -// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster -object KafkaConnectorStreamTask { - - def main(args: Array[String]): Unit = { - val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) - val config = configFilePath.map { - path => ConfigFactory.parseFile(new File(path)).resolve() - }.getOrElse(ConfigFactory.load("kafka-connector.conf").withFallback(ConfigFactory.systemEnvironment())) - val kafkaConnectorConfig = new KafkaConnectorConfig(config) - val kafkaUtil = new FlinkKafkaConnector(kafkaConnectorConfig) - val task = new KafkaConnectorStreamTask(kafkaConnectorConfig, kafkaUtil) - task.process() - } -} -// $COVERAGE-ON$ \ No newline at end of file diff --git a/pipeline/kafka-connector/src/test/resources/test.conf b/pipeline/kafka-connector/src/test/resources/test.conf deleted file mode 100644 index 87306136..00000000 --- a/pipeline/kafka-connector/src/test/resources/test.conf +++ /dev/null @@ -1,14 +0,0 @@ -include "base-test.conf" - -kafka { - input.topic = "flink.test" - groupId = "flink-kafkaconnector-group" - producer { - max-request-size = 5242880 - } -} - -task { - consumer.parallelism = 1 - downstream.operators.parallelism = 1 -} \ No newline at end of file diff --git a/pipeline/kafka-connector/src/test/scala/org/sunbird/obsrv/connector/KafkaConnectorStreamTestSpec.scala b/pipeline/kafka-connector/src/test/scala/org/sunbird/obsrv/connector/KafkaConnectorStreamTestSpec.scala deleted file mode 100644 index bf86eafa..00000000 --- a/pipeline/kafka-connector/src/test/scala/org/sunbird/obsrv/connector/KafkaConnectorStreamTestSpec.scala +++ /dev/null @@ -1,126 +0,0 @@ -package org.sunbird.obsrv.connector - -import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} -import org.apache.flink.configuration.Configuration -import 
org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment -import org.apache.flink.test.util.MiniClusterWithClientResource -import org.apache.kafka.common.serialization.StringDeserializer -import org.scalatest.Matchers._ -import org.sunbird.obsrv.BaseMetricsReporter -import org.sunbird.obsrv.connector.task.{KafkaConnectorConfig, KafkaConnectorStreamTask} -import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector -import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} -import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry - -import scala.concurrent.ExecutionContext.Implicits.global -import scala.concurrent.Future -import scala.concurrent.duration._ - -class KafkaConnectorStreamTestSpec extends BaseSpecWithDatasetRegistry { - - val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() - .setConfiguration(testConfiguration()) - .setNumberSlotsPerTaskManager(1) - .setNumberTaskManagers(1) - .build) - - val pConfig = new KafkaConnectorConfig(config) - val kafkaConnector = new FlinkKafkaConnector(pConfig) - val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") - implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = - EmbeddedKafkaConfig( - kafkaPort = 9093, - zooKeeperPort = 2183, - customConsumerProperties = customKafkaConsumerProperties - ) - implicit val deserializer: StringDeserializer = new StringDeserializer() - private val VALID_JSON_EVENT = """{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""" - private val VALID_JSON_EVENT_ARRAY = """[{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}]""" - private val INVALID_JSON_EVENT = """{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}""" - - def testConfiguration(): Configuration = { - val config = new Configuration() - config.setString("metrics.reporter", "job_metrics_reporter") - config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) - config - } - - override def beforeAll(): Unit = { - super.beforeAll() - BaseMetricsReporter.gaugeMetrics.clear() - EmbeddedKafka.start()(embeddedKafkaConfig) - prepareTestData() - createTestTopics() - EmbeddedKafka.publishStringMessageToKafka("d1-topic", VALID_JSON_EVENT) - EmbeddedKafka.publishStringMessageToKafka("d2-topic", VALID_JSON_EVENT_ARRAY) - EmbeddedKafka.publishStringMessageToKafka("d3-topic", INVALID_JSON_EVENT) - - flinkCluster.before() - } - - private def prepareTestData(): Unit = { - val postgresConnect = new PostgresConnect(postgresConfig) - postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', 
'{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") - postgresConnect.execute("insert into dataset_source_config values('sc1', 'd1', 'kafka', '{\"kafkaBrokers\":\"localhost:9093\",\"topic\":\"d1-topic\"}', 'Live', null, 'System', 'System', now(), now());") - postgresConnect.execute("insert into dataset_source_config values('sc2', 'd1', 'rdbms', '{\"type\":\"postgres\",\"tableName\":\"test-table\"}', 'Live', null, 'System', 'System', now(), now());") - postgresConnect.execute("insert into dataset_source_config values('sc3', 'd2', 'kafka', '{\"kafkaBrokers\":\"localhost:9093\",\"topic\":\"d2-topic\"}', 'Live', null, 'System', 'System', now(), now());") - postgresConnect.execute("insert into dataset_source_config values('sc4', 'd3', 'kafka', '{\"kafkaBrokers\":\"localhost:9093\",\"topic\":\"d3-topic\"}', 'Live', null, 'System', 'System', now(), now());") - postgresConnect.closeConnection() - } - - override def afterAll(): Unit = { - super.afterAll() - flinkCluster.after() - EmbeddedKafka.stop() - } - - def createTestTopics(): Unit = { - List( - "d1-topic", "d2-topic", "d3-topic", pConfig.kafkaSystemTopic, "ingest" - ).foreach(EmbeddedKafka.createCustomTopic(_)) - } - - "KafkaConnectorStreamTestSpec" should "validate the kafka connector job" in { - - implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(pConfig) - val task = new KafkaConnectorStreamTask(pConfig, kafkaConnector) - task.process(env) - Future { - env.execute(pConfig.jobName) - } - - val ingestEvents = EmbeddedKafka.consumeNumberMessagesFrom[String]("ingest", 3, timeout = 30.seconds) - validateIngestEvents(ingestEvents) - - pConfig.inputTopic() should be ("") - pConfig.inputConsumer() should be ("") - pConfig.successTag().getId should be ("dummy-events") - pConfig.failedEventsOutputTag().getId should be ("failed-events") - } - - private def validateIngestEvents(ingestEvents: List[String]): Unit = { - ingestEvents.size should be(3) - ingestEvents.foreach{event: String => { - if(event.contains(""""dataset":"d1"""")) { - JSONUtil.getJsonType(event) should be ("OBJECT") - val eventMap = JSONUtil.deserialize[Map[String, AnyRef]](event) - eventMap.get("dataset").get.asInstanceOf[String] should be ("d1") - eventMap.get("syncts").isDefined should be (true) - eventMap.contains("event") should be (true) - } else if(event.contains(""""dataset":"d2"""")) { - JSONUtil.getJsonType(event) should be("OBJECT") - val eventMap = JSONUtil.deserialize[Map[String, AnyRef]](event) - eventMap.get("dataset").get.asInstanceOf[String] should be("d2") - eventMap.get("syncts").isDefined should be(true) - eventMap.contains("events") should 
be(true) - JSONUtil.getJsonType(JSONUtil.serialize(eventMap.get("events"))) should be("ARRAY") - } else { - JSONUtil.getJsonType(event) should be ("NOT_A_JSON") - event.contains(""""event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}""") should be(true) - } - }} - - } - -} \ No newline at end of file From b2a183a0575012aebd81a842c466b0f968c28905 Mon Sep 17 00:00:00 2001 From: Ravi Mula Date: Fri, 16 Feb 2024 12:39:56 +0530 Subject: [PATCH 21/37] Release 1.0.3-GA (#72) --- .github/workflows/build_and_deploy.yaml | 113 ++++++++---------------- .github/workflows/pull_request.yaml | 25 ++++++ .github/workflows/upload_artifact.yaml | 85 ++++++++++++++++++ Dockerfile | 4 - pipeline/pom.xml | 1 - 5 files changed, 148 insertions(+), 80 deletions(-) create mode 100644 .github/workflows/pull_request.yaml create mode 100644 .github/workflows/upload_artifact.yaml diff --git a/.github/workflows/build_and_deploy.yaml b/.github/workflows/build_and_deploy.yaml index 48c610c9..35ba8cf8 100644 --- a/.github/workflows/build_and_deploy.yaml +++ b/.github/workflows/build_and_deploy.yaml @@ -1,104 +1,67 @@ -name: Obsrv Core service build and deploy workflow +name: Build and Deploy run-name: Workflow run for ${{ github.ref }} on: push: tags: - - '*' + - '*' + workflow_dispatch: + inputs: + aws-deploy: + type: boolean + required: true + default: false jobs: - check-tag: - runs-on: ubuntu-latest - outputs: - ALLOWED_TAG: ${{ steps.tag-checker.outputs.TRIGGER_ALLOWED }} - steps: - - name: Check if tag is one in list of current releases - id: tag-checker - run: | - (echo -n TRIGGER_ALLOWED= && echo 'print("${{ github.ref_name }}".split("_")[0] - not in ${{ vars.CURRENT_RELEASE }})' | python3) >> "$GITHUB_OUTPUT" - - docker-build: - needs: check-tag - if: needs.check-tag.outputs.ALLOWED_TAG == 'True' + build-image: runs-on: ubuntu-latest strategy: matrix: - include: - - image: "extractor" - target: "extractor-image" - - image: "preprocessor" - target: "preprocessor-image" - - image: "denormalizer" - target: "denormalizer-image" - - image: "transformer" - target: "transformer-image" - - image: "druid-router" - target: "router-image" - - image: "merged-pipeline" - target: "merged-image" - - image: "master-data-processor" - target: "master-data-processor-image" - - image: "kafka-connector" - target: "kafka-connector-image" - - + include: + - image: "extractor" + target: "extractor-image" + - image: "preprocessor" + target: "preprocessor-image" + - image: "denormalizer" + target: "denormalizer-image" + - image: "transformer" + target: "transformer-image" + - image: "druid-router" + target: "router-image" + - image: "merged-pipeline" + target: "merged-image" + - image: "master-data-processor" + target: "master-data-processor-image" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 - - name: Maven Build - run: | - mvn clean install -DskipTests - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Login to docker hub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build merged-pipeline image and push - uses: docker/build-push-action@v4 - with: - platforms: linux/amd64 - target: merged-image - push: true - tags: ${{ 
secrets.DOCKERHUB_USERNAME }}/merged-pipeline:${{ github.ref_name }} - - - name: Build merged-pipeline image and push - uses: docker/build-push-action@v4 - with: - platforms: linux/amd64 - target: master-data-processor-image - push: true - tags: ${{ secrets.DOCKERHUB_USERNAME }}/master-data-processor:${{ github.ref_name }} - - - name: Build merged-pipeline image and push - uses: docker/build-push-action@v4 - with: - platforms: linux/amd64 - target: kafka-connector-image - push: true - tags: ${{ secrets.DOCKERHUB_USERNAME }}/kafka-connector:${{ github.ref_name }} - - name: Build ${{matrix.image}} image and push - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: platforms: linux/amd64 target: ${{matrix.target}} push: true tags: ${{ secrets.DOCKERHUB_USERNAME }}/${{matrix.image}}:${{ github.ref_name }} + aws-deploy: - needs: [check-tag, docker-build] - if: needs.check-tag.outputs.ALLOWED_TAG == 'True' + needs: build-image + if: github.event.inputs.aws-deploy == 'True' runs-on: ubuntu-latest environment: aws-dev steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Setup Terragrunt uses: autero1/action-terragrunt@v1.1.0 with: @@ -107,12 +70,12 @@ jobs: run: terragrunt --version - name: Clone the terraform deployment repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: ${{ vars.DEPLOY_REPO }} path: deploy ref: ${{ vars.DEPLOY_REPO_REF }} - + - name: Fetch and update kubeconfig file env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} @@ -136,12 +99,12 @@ jobs: -var flink_image_tag=${{ github.ref_name }} azure-deploy: - needs: [check-tag, docker-build] - if: needs.check-tag.outputs.ALLOWED_TAG == 'True' && vars.CLOUD_PROVIDER == 'azure' + needs: build-image + if: vars.CLOUD_PROVIDER == 'azure' runs-on: ubuntu-latest steps: - name: Clone the terraform deployment repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: ${{ vars.DEPLOY_REPO }} path: deploy @@ -158,4 +121,4 @@ jobs: terragrunt init terragrunt apply -auto-approve -replace=module.flink.helm_release.flink \ -var flink_container_registry=${{ secrets.DOCKERHUB_USERNAME }} \ - -var flink_image_tag=${{ github.ref_name }} + -var flink_image_tag=${{ github.ref_name }} \ No newline at end of file diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml new file mode 100644 index 00000000..44c79317 --- /dev/null +++ b/.github/workflows/pull_request.yaml @@ -0,0 +1,25 @@ +name: Pull Request +run-name: Workflow run for pull request - ${{ github.event.pull_request.title }} +on: + pull_request: + types: + - opened + - synchronize + +jobs: + test-cases: + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Set up JDK 11 + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '11' + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Run test cases + run: | + mvn clean install \ No newline at end of file diff --git a/.github/workflows/upload_artifact.yaml b/.github/workflows/upload_artifact.yaml new file mode 100644 index 00000000..38cb7ec8 --- /dev/null +++ b/.github/workflows/upload_artifact.yaml @@ -0,0 +1,85 @@ +name: Upload Artifacts +run-name: Workflow run for ${{ github.ref }} +on: + push: + tags: + - '*' + +jobs: + artifacts-upload-core: + runs-on: ubuntu-latest + steps: + - name: Get Tag Name + id: get-tag + run: echo "tag_name=${GITHUB_REF#refs/tags/}" >>$GITHUB_OUTPUT + + - name: Checkout code + uses: actions/checkout@v4 + + - 
name: Setup JAVA + uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: 'temurin' + cache: 'maven' + + - name: Build Framework Artifacts + run: | + cd ./framework + mvn clean install -DskipTests + - name: Upload Framework Artifacts + uses: actions/upload-artifact@v4 + with: + name: framework-${{ steps.get-tag.outputs.tag_name }}.jar + path: ./framework/target/framework-1.0.0.jar + if-no-files-found: error + + - name: Build Dataset Registry Artifacts + run: | + cd ./dataset-registry + mvn clean install -DskipTests + - name: Upload Dataset Registry Artifacts + uses: actions/upload-artifact@v4 + with: + name: dataset-registry-${{ steps.get-tag.outputs.tag_name }}.jar + path: ./dataset-registry/target/dataset-registry-1.0.0.jar + if-no-files-found: error + + artifacts-upload-pipeline: + needs: artifacts-upload-core + runs-on: ubuntu-latest + strategy: + matrix: + include: + - image: "extractor" + - image: "preprocessor" + - image: "denormalizer" + - image: "transformer" + - image: "druid-router" + - image: "pipeline-merged" + - image: "master-data-processor" + steps: + - name: Get Tag Name + id: get-tag + run: echo "tag_name=${GITHUB_REF#refs/tags/}" >>$GITHUB_OUTPUT + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup JAVA + uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: 'temurin' + cache: 'maven' + + - name: Build Data Pipeline Artifacts + run: | + cd ./pipeline + mvn clean install -DskipTests + - name: Upload Data Pipeline Artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{matrix.image}}-${{ steps.get-tag.outputs.tag_name }}.jar + path: ./pipeline/${{matrix.image}}/target/${{matrix.image}}-1.0.0.jar + if-no-files-found: error \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 17efe642..4e519a75 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,7 +35,3 @@ COPY --from=build-pipeline /app/pipeline/pipeline-merged/target/pipeline-merged- FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as master-data-processor-image USER flink COPY --from=build-pipeline /app/pipeline/master-data-processor/target/master-data-processor-1.0.0.jar $FLINK_HOME/lib - -FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as kafka-connector-image -USER flink -COPY --from=build-pipeline /app/pipeline/kafka-connector/target/kafka-connector-1.0.0.jar $FLINK_HOME/lib diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 25d19b66..9c37e956 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -22,7 +22,6 @@ transformer druid-router pipeline-merged - kafka-connector master-data-processor From 40688021a5a824e8e8e621c8c0f7a332d0baee40 Mon Sep 17 00:00:00 2001 From: Manjunath Davanam Date: Tue, 23 Apr 2024 13:38:29 +0530 Subject: [PATCH 22/37] Pipeline Bug fixes (#74) * Sanketika-Obsrv/issue-tracker#106:fix: Fix postgres connection issue with dataset read and handling an errors while parsing the message * Sanketika-Obsrv/issue-tracker#107:fix: Denorm job fix to handle error when denorm field node is contains empty value * Sanketika-Obsrv/issue-tracker#106:fix: Review comments fix - Changed the generic exception to actual exception (NullPointer) --- .../org/sunbird/obsrv/registry/DatasetRegistry.scala | 4 ++-- .../scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala | 9 ++++++++- .../sunbird/obsrv/denormalizer/util/DenormCache.scala | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala 
b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala index ad239312..c1394fd4 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala @@ -8,9 +8,9 @@ import scala.collection.mutable object DatasetRegistry { - private val datasets: mutable.Map[String, Dataset] = mutable.Map[String, Dataset]() + lazy private val datasets: mutable.Map[String, Dataset] = mutable.Map[String, Dataset]() datasets ++= DatasetRegistryService.readAllDatasets() - private val datasetTransformations: Map[String, List[DatasetTransformation]] = DatasetRegistryService.readAllDatasetTransformations() + lazy private val datasetTransformations: Map[String, List[DatasetTransformation]] = DatasetRegistryService.readAllDatasetTransformations() def getAllDatasets(datasetType: String): List[Dataset] = { val datasetList = DatasetRegistryService.readAllDatasets() diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala index 56525db4..370353c7 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala @@ -7,6 +7,7 @@ import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDe import org.apache.flink.util.Collector import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.kafka.clients.producer.ProducerRecord +import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.model.Constants import org.sunbird.obsrv.core.util.JSONUtil @@ -48,11 +49,17 @@ class MapDeserializationSchema extends KafkaRecordDeserializationSchema[mutable. 
class StringDeserializationSchema extends KafkaRecordDeserializationSchema[String] { private val serialVersionUID = -3224825136576915426L + private[this] val logger = LoggerFactory.getLogger(classOf[StringDeserializationSchema]) override def getProducedType: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]], out: Collector[String]): Unit = { - out.collect(new String(record.value(), StandardCharsets.UTF_8)) + try { + out.collect(new String(record.value(), StandardCharsets.UTF_8)) + } catch { + case ex: NullPointerException => + logger.error(s"Exception while parsing the message: ${ex.getMessage}") + } } } diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala index dd94a251..db0da7d5 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala @@ -79,7 +79,7 @@ class DenormCache(val config: DenormalizerConfig) { if (denormFieldNode.isMissingNode) { DenormFieldStatus("", success = false, Some(ErrorConstants.DENORM_KEY_MISSING)) } else { - if (denormFieldNode.isTextual || denormFieldNode.isNumber) { + if ((denormFieldNode.isTextual && denormFieldNode.asText().nonEmpty) || denormFieldNode.isNumber) { DenormFieldStatus(denormFieldNode.asText(), success = false, None) } else { DenormFieldStatus("", success = false, Some(ErrorConstants.DENORM_KEY_NOT_A_STRING_OR_NUMBER)) From b05912f34705a6866a43d41f91533f5d9b71d8c5 Mon Sep 17 00:00:00 2001 From: Ravi Mula Date: Tue, 23 Apr 2024 17:44:03 +0530 Subject: [PATCH 23/37] Pipeline Bug fixes (#74) (#77) --- .../org/sunbird/obsrv/registry/DatasetRegistry.scala | 4 ++-- .../scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala | 9 ++++++++- .../sunbird/obsrv/denormalizer/util/DenormCache.scala | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala index ad239312..c1394fd4 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala @@ -8,9 +8,9 @@ import scala.collection.mutable object DatasetRegistry { - private val datasets: mutable.Map[String, Dataset] = mutable.Map[String, Dataset]() + lazy private val datasets: mutable.Map[String, Dataset] = mutable.Map[String, Dataset]() datasets ++= DatasetRegistryService.readAllDatasets() - private val datasetTransformations: Map[String, List[DatasetTransformation]] = DatasetRegistryService.readAllDatasetTransformations() + lazy private val datasetTransformations: Map[String, List[DatasetTransformation]] = DatasetRegistryService.readAllDatasetTransformations() def getAllDatasets(datasetType: String): List[Dataset] = { val datasetList = DatasetRegistryService.readAllDatasets() diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala index 56525db4..370353c7 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala @@ -7,6 +7,7 @@ import 
org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDe import org.apache.flink.util.Collector import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.kafka.clients.producer.ProducerRecord +import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.model.Constants import org.sunbird.obsrv.core.util.JSONUtil @@ -48,11 +49,17 @@ class MapDeserializationSchema extends KafkaRecordDeserializationSchema[mutable. class StringDeserializationSchema extends KafkaRecordDeserializationSchema[String] { private val serialVersionUID = -3224825136576915426L + private[this] val logger = LoggerFactory.getLogger(classOf[StringDeserializationSchema]) override def getProducedType: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]], out: Collector[String]): Unit = { - out.collect(new String(record.value(), StandardCharsets.UTF_8)) + try { + out.collect(new String(record.value(), StandardCharsets.UTF_8)) + } catch { + case ex: NullPointerException => + logger.error(s"Exception while parsing the message: ${ex.getMessage}") + } } } diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala index dd94a251..db0da7d5 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala @@ -79,7 +79,7 @@ class DenormCache(val config: DenormalizerConfig) { if (denormFieldNode.isMissingNode) { DenormFieldStatus("", success = false, Some(ErrorConstants.DENORM_KEY_MISSING)) } else { - if (denormFieldNode.isTextual || denormFieldNode.isNumber) { + if ((denormFieldNode.isTextual && denormFieldNode.asText().nonEmpty) || denormFieldNode.isNumber) { DenormFieldStatus(denormFieldNode.asText(), success = false, None) } else { DenormFieldStatus("", success = false, Some(ErrorConstants.DENORM_KEY_NOT_A_STRING_OR_NUMBER)) From b3d383f64304f0da32ca73acd6f8d75e36965370 Mon Sep 17 00:00:00 2001 From: SurabhiAngadi Date: Fri, 10 May 2024 14:50:55 +0530 Subject: [PATCH 24/37] fix: #0000: update datasourceRef only if dataset has records --- .../sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala index 2c41181c..22729aa0 100644 --- a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala @@ -27,8 +27,8 @@ object MasterDataProcessorIndexer { val ingestionSpec: String = updateIngestionSpec(datasource, paths.datasourceRef, paths.ingestionPath, config) if (eventsCount > 0L) { submitIngestionTask(dataset.id, ingestionSpec, config) + DatasetRegistry.updateDatasourceRef(datasource, paths.datasourceRef) } - DatasetRegistry.updateDatasourceRef(datasource, paths.datasourceRef) if (!datasource.datasourceRef.equals(paths.datasourceRef)) { deleteDataSource(dataset.id, datasource.datasourceRef, config) } From 94a03785c8c33c77e8b01fa63d24b1a06bf1aaea Mon Sep 17 00:00:00 2001 From: Sowmya N Dixit Date: Wed, 22 May 2024 19:22:09 +0530 Subject: [PATCH 25/37] 
Sanketika-Obsrv/issue-tracker#180 fix: Datasource DB schema changes to include type. (#79) Co-authored-by: sowmya-dixit --- .../src/main/resources/dataset-registry.sql | 1 + .../org/sunbird/obsrv/model/DatasetModels.scala | 2 +- .../sunbird/obsrv/registry/DatasetRegistry.scala | 5 +++++ .../obsrv/service/DatasetRegistryService.scala | 14 +++++++++++++- .../obsrv/spec/BaseSpecWithDatasetRegistry.scala | 2 +- .../obsrv/spec/TestDatasetRegistrySpec.scala | 4 ++-- 6 files changed, 23 insertions(+), 5 deletions(-) diff --git a/dataset-registry/src/main/resources/dataset-registry.sql b/dataset-registry/src/main/resources/dataset-registry.sql index ff28ae98..54373eec 100644 --- a/dataset-registry/src/main/resources/dataset-registry.sql +++ b/dataset-registry/src/main/resources/dataset-registry.sql @@ -22,6 +22,7 @@ CREATE INDEX IF NOT EXISTS datasets_status ON datasets(status); CREATE TABLE IF NOT EXISTS datasources ( datasource text PRIMARY KEY, dataset_id text REFERENCES datasets (id), + type text NOT NULL, ingestion_spec json NOT NULL, datasource_ref text NOT NULL, retention_period json, diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index 49cc51bc..ee73fbe0 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -71,7 +71,7 @@ object DatasetModels { @JsonProperty("status") status: String, @JsonProperty("connector_stats") connectorStats: Option[ConnectorStats] = None) case class DataSource(@JsonProperty("id") id: String, @JsonProperty("datasource") datasource: String, @JsonProperty("dataset_id") datasetId: String, - @JsonProperty("ingestion_spec") ingestionSpec: String, @JsonProperty("datasource_ref") datasourceRef: String) + @JsonProperty("type") `type`: String, @JsonProperty("ingestion_spec") ingestionSpec: String, @JsonProperty("datasource_ref") datasourceRef: String) } diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala index c1394fd4..08921adc 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala @@ -42,6 +42,11 @@ object DatasetRegistry { DatasetRegistryService.readDatasources(datasetId) } + def getAllDatasources(): List[DataSource] = { + val datasourceList = DatasetRegistryService.readAllDatasources() + datasourceList.getOrElse(List()) + } + def getDataSetIds(datasetType: String): List[String] = { datasets.filter(f => f._2.datasetType.equals(datasetType)).keySet.toList } diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala index 88efb7a6..8075d508 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala @@ -111,6 +111,17 @@ object DatasetRegistryService { } } + def readAllDatasources(): Option[List[DataSource]] = { + + val postgresConnect = new PostgresConnect(postgresConfig) + try { + val rs = postgresConnect.executeQuery(s"SELECT * FROM datasources") + Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => 
f._1).map(result => { + parseDatasource(result) + }).toList) + } + } + def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = { val query = s"UPDATE datasources set datasource_ref = '$datasourceRef' where datasource='${datasource.datasource}' and dataset_id='${datasource.datasetId}'" updateRegistry(query) @@ -190,10 +201,11 @@ object DatasetRegistryService { val id = rs.getString("id") val datasource = rs.getString("datasource") val datasetId = rs.getString("dataset_id") + val datasourceType = rs.getString("type") val ingestionSpec = rs.getString("ingestion_spec") val datasourceRef = rs.getString("datasource_ref") - DataSource(id, datasource, datasetId, ingestionSpec, datasourceRef) + DataSource(id, datasource, datasetId, datasourceType, ingestionSpec, datasourceRef) } private def parseDatasetTransformation(rs: ResultSet): DatasetTransformation = { diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala index 09321143..1b3edea0 100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala @@ -36,7 +36,7 @@ class BaseSpecWithDatasetRegistry extends BaseSpecWithPostgres { private def createSchema(postgresConnect: PostgresConnect): Unit = { postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasets ( id text PRIMARY KEY, type text NOT NULL, validation_config json, extraction_config json, dedup_config json, data_schema json, denorm_config json, router_config json NOT NULL, dataset_config json NOT NULL, status text NOT NULL, tags text[], data_version INT, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL );") - postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), type text NOT NULL, ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function json NOT NULL, status text NOT NULL, mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );") postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_source_config ( id text PRIMARY KEY, dataset_id text NOT NULL REFERENCES datasets (id), connector_type text NOT NULL, connector_config json NOT NULL, status text NOT NULL, connector_stats json, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, 
UNIQUE(connector_type, dataset_id) );") } diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala index b37e801a..3d83552d 100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala @@ -92,8 +92,8 @@ class TestDatasetRegistrySpec extends BaseSpecWithDatasetRegistry with Matchers postgresConnect.execute("insert into dataset_source_config values('sc1', 'd1', 'kafka', '{\"kafkaBrokers\":\"localhost:9090\",\"topic\":\"test-topic\"}', 'Live', null, 'System', 'System', now(), now());") postgresConnect.execute("insert into dataset_source_config values('sc2', 'd1', 'rdbms', '{\"type\":\"postgres\",\"tableName\":\"test-table\"}', 'Live', null, 'System', 'System', now(), now());") - //postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") - postgresConnect.execute("insert into datasources values('ds1', 'd1', '{}', 'd1-datasource', 'd1-datasource-1', null, null, null, '{}', 'Live', 'System', 'System', now(), now());") + //postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), type text NOT NULL, ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") + postgresConnect.execute("insert into datasources values('ds1', 'd1', 'druid', '{}', 'd1-datasource', 'd1-datasource-1', null, null, null, '{}', 'Live', 'System', 'System', now(), now());") postgresConnect.closeConnection() } } \ No newline at end of file From 5b5b0fa719ddba4f44a7e8a0030e37210459b87b Mon Sep 17 00:00:00 2001 From: Sowmya N Dixit Date: Fri, 24 May 2024 12:11:51 +0530 Subject: [PATCH 26/37] Hudi connector flink job implementation (#80) * feat: Hudi Flink Implementation. * feat: local working with metastore and localstack. * #0000 - feat: Hudi Sink implementation * #0000 - feat: Hudi Sink implementation * #0000 - feat: Initialize dataset RowType during job startup * refactor: Integrate hudi connector with dataset registry. * refactor: Integrate hudi connector with dataset registry. * Sanketika-Obsrv/issue-tracker#141 refactor: Enable timestamp based partition * Sanketika-Obsrv/issue-tracker#141 refactor: Fix Hudi connector job to handle empty datasets list for lakehouse. * Sanketika-Obsrv/issue-tracker#141 fix: Set Timestamp based partition configurations only if partition key is of timestamp type. * Sanketika-Obsrv/issue-tracker#170 fix: Resolve timestamp based partition without using TimestampBasedAvroKeyGenerator. * Sanketika-Obsrv/issue-tracker#177 fix: Lakehouse connector flink job fixes. * Sanketika-Obsrv/issue-tracker#177 fix: Dockerfile changes for hudi-connector * Sanketika-Obsrv/issue-tracker#177 fix: Lakehouse connector flink job fixes. 
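(Illustrative note.) The timestamp-partition fixes listed above are easier to follow with a small, self-contained sketch. This is not the connector code itself; it only mirrors the idea of deriving a day-level ("yyyy-MM-dd") partition value from a timestamp or epoch partition column and writing it into a separate string field, so no key-generator class is needed. The object and method names below are hypothetical.

import java.text.SimpleDateFormat
import java.util.Date

// Hypothetical sketch: derive a day-level partition value from a timestamp or
// epoch column; non-temporal columns are partitioned on their raw value.
object PartitionSketch {

  private val dayFormat = new SimpleDateFormat("yyyy-MM-dd")

  // Value intended for an extra "<column>_partition" string field.
  def derivePartition(columnType: String, value: Any): Option[String] = {
    columnType.toLowerCase match {
      case "epoch"     => Some(dayFormat.format(new Date(value.toString.toLong)))
      case "timestamp" => Some(dayFormat.format(java.sql.Timestamp.valueOf(value.toString)))
      case _           => None
    }
  }

  def main(args: Array[String]): Unit = {
    println(derivePartition("epoch", 1697322387522L))            // Some(2023-10-1x), timezone dependent
    println(derivePartition("timestamp", "2023-10-15 03:56:27")) // Some(2023-10-15)
  }
}

Keeping the derived value in a separate "_partition" column leaves the original field untouched while giving Hudi a plain partition-path field.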
* Sanketika-Obsrv/issue-tracker#177 fix: remove unused code * Sanketika-Obsrv/issue-tracker#177 fix: remove unused code * Sanketika-Obsrv/issue-tracker#177 fix: remove unused code * Sanketika-Obsrv/issue-tracker#177 fix: remove commented code --- Dockerfile | 8 + .../sunbird/obsrv/core/model/Constants.scala | 1 + pipeline/hudi-connector/pom.xml | 253 ++++++++++++++++++ .../src/main/resources/core-site.xml | 33 +++ .../src/main/resources/hudi-writer.conf | 41 +++ .../src/main/resources/schemas/schema.json | 108 ++++++++ .../functions/RowDataConverterFunction.scala | 43 +++ .../obsrv/streaming/HudiConnectorConfig.scala | 53 ++++ .../streaming/HudiConnectorStreamTask.scala | 153 +++++++++++ .../obsrv/streaming/TestTimestamp.scala | 19 ++ .../sunbird/obsrv/util/HudiSchemaParser.scala | 140 ++++++++++ pipeline/pom.xml | 1 + 12 files changed, 853 insertions(+) create mode 100644 pipeline/hudi-connector/pom.xml create mode 100644 pipeline/hudi-connector/src/main/resources/core-site.xml create mode 100644 pipeline/hudi-connector/src/main/resources/hudi-writer.conf create mode 100644 pipeline/hudi-connector/src/main/resources/schemas/schema.json create mode 100644 pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala create mode 100644 pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorConfig.scala create mode 100644 pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala create mode 100644 pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/TestTimestamp.scala create mode 100644 pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala diff --git a/Dockerfile b/Dockerfile index 4e519a75..dced8701 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,3 +35,11 @@ COPY --from=build-pipeline /app/pipeline/pipeline-merged/target/pipeline-merged- FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as master-data-processor-image USER flink COPY --from=build-pipeline /app/pipeline/master-data-processor/target/master-data-processor-1.0.0.jar $FLINK_HOME/lib + +FROM --platform=linux/x86_64 flink:1.15.0-scala_2.12-java11 as hudi-connector-image +USER flink +COPY ./pipeline/hudi-connector/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar $FLINK_HOME/lib +COPY ./pipeline/hudi-connector/flink-s3-fs-hadoop-1.15.2.jar $FLINK_HOME/lib +COPY ./pipeline/hudi-connector/hbase-server-2.4.13.jar $FLINK_HOME/lib +RUN mkdir $FLINK_HOME/custom-lib +COPY ./pipeline/hudi-connector/target/hudi-connector-1.0.0.jar $FLINK_HOME/custom-lib diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala index 2cfbd307..7c19d8e2 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala @@ -14,5 +14,6 @@ object Constants { val LEVEL = "level" val TOPIC = "topic" val MESSAGE = "message" + val DATALAKE_TYPE = "datalake" } diff --git a/pipeline/hudi-connector/pom.xml b/pipeline/hudi-connector/pom.xml new file mode 100644 index 00000000..5230d8eb --- /dev/null +++ b/pipeline/hudi-connector/pom.xml @@ -0,0 +1,253 @@ + + + 4.0.0 + + pipeline + org.sunbird.obsrv + 1.0 + + hudi-connector + 1.0.0 + Hudi Connector + + UTF-8 + 1.4.0 + + + + + org.apache.flink + flink-streaming-scala_${scala.maj.version} + ${flink.version} + provided + + + com.fasterxml.jackson.core + jackson-databind + + + + + 
org.sunbird.obsrv + framework + 1.0.0 + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + + + org.apache.kafka + kafka-clients + + + + + org.apache.hudi + hudi-flink1.15-bundle + 0.14.1 + + + org.apache.hadoop + hadoop-common + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + org.slf4j + slf4j-log4j12 + + + + + org.apache.flink + flink-table-api-scala-bridge_${scala.maj.version} + ${flink.version} + provided + + + io.github.classgraph + classgraph + 4.8.168 + + + org.apache.flink + flink-connector-hive_${scala.maj.version} + ${flink.version} + + + org.apache.hive + hive-metastore + 3.1.3 + + + org.apache.hadoop + hadoop-common + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + + + org.apache.hive + hive-exec + 3.1.3 + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + org.apache.hadoop + hadoop-common + + + org.apache.logging.log4j + log4j-slf4j-impl + + + + + + + src/main/scala + src/test/scala + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 11 + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.1 + + + + package + + shade + + + false + + + com.google.code.findbugs:jsr305 + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + core-site.xml + + + + + + org.sunbird.obsrv.streaming.HudiConnectorStreamTask + + + + reference.conf + + + + + + + + + net.alchim31.maven + scala-maven-plugin + 4.4.0 + + ${java.target.runtime} + ${java.target.runtime} + ${scala.version} + false + + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + + maven-surefire-plugin + 2.22.2 + + true + + + + + org.scalatest + scalatest-maven-plugin + 1.0 + + ${project.build.directory}/surefire-reports + . 
+ hudi-connector-testsuite.txt + + + + test + + test + + + + + + org.scoverage + scoverage-maven-plugin + ${scoverage.plugin.version} + + ${scala.version} + true + true + + + + + diff --git a/pipeline/hudi-connector/src/main/resources/core-site.xml b/pipeline/hudi-connector/src/main/resources/core-site.xml new file mode 100644 index 00000000..c15df562 --- /dev/null +++ b/pipeline/hudi-connector/src/main/resources/core-site.xml @@ -0,0 +1,33 @@ + + + + + + fs.s3a.impl + org.apache.hadoop.fs.s3a.S3AFileSystem + + + fs.s3a.endpoint + http://localhost:4566 + + + fs.s3a.access.key + test + + + fs.s3a.secret.key + testSecret + + + fs.s3a.path.style.access + true + + + fs.s3a.connection.ssl.enabled + false + + + + + + \ No newline at end of file diff --git a/pipeline/hudi-connector/src/main/resources/hudi-writer.conf b/pipeline/hudi-connector/src/main/resources/hudi-writer.conf new file mode 100644 index 00000000..d9c031b5 --- /dev/null +++ b/pipeline/hudi-connector/src/main/resources/hudi-writer.conf @@ -0,0 +1,41 @@ +include "baseconfig.conf" + +kafka { + input.topic = ${job.env}".hudi.connector.in" + output.topic = ${job.env}".hudi.connector.out" + output.invalid.topic = ${job.env}".failed" + event.max.size = "1048576" # Max is only 1MB + groupId = ${job.env}"-hudi-writer-group" + producer { + max-request-size = 5242880 + } +} + +task { + checkpointing.compressed = true + checkpointing.interval = 30000 + checkpointing.pause.between.seconds = 30000 + restart-strategy.attempts = 3 + restart-strategy.delay = 30000 # in milli-seconds + parallelism = 1 + consumer.parallelism = 1 + downstream.operators.parallelism = 1 +} + +hudi { + hms { + enabled = true + uri = "thrift://localhost:9083" + database { + name = "obsrv" + username = "postgres" + password = "postgres" + } + } + table { + type = "MERGE_ON_READ" + base.path = "s3a://obsrv" + } + compaction.enabled = true + write.tasks = 1 +} \ No newline at end of file diff --git a/pipeline/hudi-connector/src/main/resources/schemas/schema.json b/pipeline/hudi-connector/src/main/resources/schemas/schema.json new file mode 100644 index 00000000..177c957a --- /dev/null +++ b/pipeline/hudi-connector/src/main/resources/schemas/schema.json @@ -0,0 +1,108 @@ +{ + "dataset": "financial_transactions", + "schema": { + "table": "financial_transactions", + "partitionColumn": "receiver_ifsc_code", + "timestampColumn": "txn_date", + "primaryKey": "txn_id", + "columnSpec": [ + { + "name": "receiver_account_number", + "type": "string" + }, + { + "name": "receiver_ifsc_code", + "type": "string" + }, + { + "name": "sender_account_number", + "type": "string" + }, + { + "name": "sender_contact_email", + "type": "string" + }, + { + "name": "sender_ifsc_code", + "type": "string" + }, + { + "name": "currency", + "type": "string" + }, + { + "name": "txn_amount", + "type": "int" + }, + { + "name": "txn_date", + "type": "string" + }, + { + "name": "txn_id", + "type": "string" + }, + { + "name": "txn_status", + "type": "string" + }, + { + "name": "txn_type", + "type": "string" + } + ] + }, + "inputFormat": { + "type": "json", + "flattenSpec": { + "fields": [ + { + "type": "root", + "name": "receiver_account_number" + }, + { + "type": "path", + "name": "sender_account_number", + "expr": "$.sender.account_number" + }, + { + "type": "path", + "name": "sender_ifsc_code", + "expr": "$.sender.ifsc_code" + }, + { + "type": "root", + "name": "receiver_ifsc_code" + }, + { + "type": "root", + "name": "sender_contact_email" + }, + { + "type": "root", + "name": "currency" + }, + { + "type": 
"root", + "name": "txn_amount" + }, + { + "type": "root", + "name": "txn_date" + }, + { + "type": "root", + "name": "txn_id" + }, + { + "type": "root", + "name": "txn_status" + }, + { + "type": "root", + "name": "txn_type" + } + ] + } + } +} \ No newline at end of file diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala new file mode 100644 index 00000000..aec00117 --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala @@ -0,0 +1,43 @@ +package org.sunbird.obsrv.functions + +import org.apache.flink.api.common.functions.RichMapFunction +import org.apache.flink.configuration.Configuration +import org.apache.flink.formats.common.TimestampFormat +import org.apache.flink.formats.json.JsonToRowDataConverters +import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper +import org.sunbird.obsrv.util.{HudiSchemaParser, HudiSchemaSpec} +import org.apache.flink.table.data.RowData +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.util.{JSONUtil, Util} +import org.sunbird.obsrv.streaming.HudiConnectorConfig +import scala.collection.mutable.{Map => MMap} + +class RowDataConverterFunction(config: HudiConnectorConfig, datasetId: String) extends RichMapFunction[MMap[String, AnyRef], RowData] { + + var jsonToRowDataConverters: JsonToRowDataConverters = _ + var objectMapper: ObjectMapper = _ + var hudiSchemaParser: HudiSchemaParser = _ + + private val logger = LoggerFactory.getLogger(classOf[RowDataConverterFunction]) + + override def open(parameters: Configuration): Unit = { + super.open(parameters) + jsonToRowDataConverters = new JsonToRowDataConverters(false, true, TimestampFormat.SQL) + objectMapper = new ObjectMapper() + hudiSchemaParser = new HudiSchemaParser() + } + + override def map(event: MMap[String, AnyRef]): RowData = { + convertToRowData(event) + } + + def convertToRowData(data: MMap[String, AnyRef]): RowData = { + val eventJson = JSONUtil.serialize(data) + val flattenedData = hudiSchemaParser.parseJson(datasetId, eventJson) + val rowType = hudiSchemaParser.rowTypeMap(datasetId) + val converter: JsonToRowDataConverters.JsonToRowDataConverter = jsonToRowDataConverters.createRowConverter(rowType) + val rowData = converter.convert(objectMapper.readTree(JSONUtil.serialize(flattenedData))).asInstanceOf[RowData] + rowData + } + +} diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorConfig.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorConfig.scala new file mode 100644 index 00000000..4f4f46cf --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorConfig.scala @@ -0,0 +1,53 @@ +package org.sunbird.obsrv.streaming + +import com.typesafe.config.Config +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.streaming.api.scala.OutputTag +import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.configuration.FlinkOptions +import org.sunbird.obsrv.core.streaming.BaseJobConfig + +import scala.collection.mutable + +class HudiConnectorConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "Flink-Hudi-Connector") { + + implicit val mapTypeInfo: TypeInformation[mutable.Map[String, 
AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + implicit val stringTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) + + override def inputTopic(): String = config.getString("kafka.input.topic") + + val kafkaDefaultOutputTopic: String = config.getString("kafka.output.topic") + + override def inputConsumer(): String = config.getString("kafka.groupId") + + override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("dummy-events") + + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") + + val kafkaInvalidTopic: String = config.getString("kafka.output.invalid.topic") + + val invalidEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("invalid-events") + val validEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("valid-events") + + val invalidEventProducer = "invalid-events-sink" + + + val hudiTableType: String = + if (config.getString("hudi.table.type").equalsIgnoreCase("MERGE_ON_READ")) + HoodieTableType.MERGE_ON_READ.name() + else if (config.getString("hudi.table.type").equalsIgnoreCase("COPY_ON_WRITE")) + HoodieTableType.COPY_ON_WRITE.name() + else HoodieTableType.MERGE_ON_READ.name() + + val hudiBasePath: String = config.getString("hudi.table.base.path") + val hudiCompactionEnabled: Boolean = config.getBoolean("hudi.compaction.enabled") + val hudiWriteTasks: Int = config.getInt("hudi.write.tasks") + + val hmsEnabled: Boolean = if (config.hasPath("hudi.hms.enabled")) config.getBoolean("hudi.hms.enabled") else false + val hmsUsername: String = config.getString("hudi.hms.database.username") + val hmsPassword: String = config.getString("hudi.hms.database.password") + val hmsDatabaseName: String = config.getString("hudi.hms.database.name") + val hmsURI: String = config.getString("hudi.hms.uri") + +} diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala new file mode 100644 index 00000000..b57244a2 --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala @@ -0,0 +1,153 @@ +package org.sunbird.obsrv.streaming + +import com.typesafe.config.ConfigFactory +import org.apache.commons.lang3.StringUtils +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.api.java.utils.ParameterTool +import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.datastream.{DataStream, DataStreamSink} +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.hudi.common.config.TimestampKeyGeneratorConfig +import org.apache.hudi.configuration.{FlinkOptions, OptionsResolver} +import org.apache.hudi.sink.utils.Pipelines +import org.apache.hudi.util.AvroSchemaConverter +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.Constants +import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} +import org.sunbird.obsrv.core.util.FlinkUtil +import org.sunbird.obsrv.functions.RowDataConverterFunction +import org.sunbird.obsrv.registry.DatasetRegistry +import org.sunbird.obsrv.util.HudiSchemaParser +import 
org.apache.hudi.config.HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP +import org.apache.hudi.common.config.HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE +import org.apache.hudi.common.table.HoodieTableConfig.DROP_PARTITION_COLUMNS +import java.io.File +import java.sql.Timestamp +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter +import scala.collection.mutable +import scala.collection.mutable.{Map => MMap} + +class HudiConnectorStreamTask(config: HudiConnectorConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { + + implicit val mutableMapTypeInfo: TypeInformation[MMap[String, AnyRef]] = TypeExtractor.getForClass(classOf[MMap[String, AnyRef]]) + private val logger = LoggerFactory.getLogger(classOf[HudiConnectorStreamTask]) + def process(): Unit = { + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) + process(env) + } + + override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { + null + } + + def process(env: StreamExecutionEnvironment): Unit = { + val schemaParser = new HudiSchemaParser() + val dataSourceConfig = DatasetRegistry.getAllDatasources().filter(f => f.`type`.nonEmpty && f.`type`.equalsIgnoreCase(Constants.DATALAKE_TYPE)) + dataSourceConfig.map{ dataSource => + val datasetId = dataSource.datasetId + val dataStream = getMapDataStream(env, config, List(datasetId), config.kafkaConsumerProperties(), consumerSourceName = s"kafka-${datasetId}", kafkaConnector) + .map(new RowDataConverterFunction(config, datasetId)) + + val conf: Configuration = new Configuration() + setHudiBaseConfigurations(conf) + setDatasetConf(conf, datasetId, schemaParser) + logger.info("conf: " + conf.toMap.toString) + val rowType = schemaParser.rowTypeMap(datasetId) + + val hoodieRecordDataStream = Pipelines.bootstrap(conf, rowType, dataStream) + val pipeline = Pipelines.hoodieStreamWrite(conf, hoodieRecordDataStream) + if (OptionsResolver.needsAsyncCompaction(conf)) { + Pipelines.compact(conf, pipeline) + } else { + Pipelines.clean(conf, pipeline) + } + + }.orElse(List(addDefaultOperator(env, config, kafkaConnector))) + env.execute("Flink-Hudi-Connector") + } + + def addDefaultOperator(env: StreamExecutionEnvironment, config: HudiConnectorConfig, kafkaConnector: FlinkKafkaConnector): DataStreamSink[mutable.Map[String, AnyRef]] = { + val dataStreamSink: DataStreamSink[mutable.Map[String, AnyRef]] = getMapDataStream(env, config, kafkaConnector) + .sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaDefaultOutputTopic)) + .name(s"hudi-connector-default-sink").uid(s"hudi-connector-default-sink") + .setParallelism(config.downstreamOperatorsParallelism) + dataStreamSink + } + + def setDatasetConf(conf: Configuration, dataset: String, schemaParser: HudiSchemaParser): Unit = { + val datasetSchema = schemaParser.hudiSchemaMap(dataset) + val rowType = schemaParser.rowTypeMap(dataset) + val avroSchema = AvroSchemaConverter.convertToSchema(rowType, dataset.replace("-", "_")) + conf.setString(FlinkOptions.PATH.key, s"${config.hudiBasePath}/${datasetSchema.schema.table}") + conf.setString(FlinkOptions.TABLE_NAME, datasetSchema.schema.table) + conf.setString(FlinkOptions.RECORD_KEY_FIELD.key, datasetSchema.schema.primaryKey) + conf.setString(FlinkOptions.PRECOMBINE_FIELD.key, datasetSchema.schema.timestampColumn) + conf.setString(FlinkOptions.PARTITION_PATH_FIELD.key, datasetSchema.schema.partitionColumn) + 
conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA.key, avroSchema.toString) + + val partitionField = datasetSchema.schema.columnSpec.filter(f => f.name.equalsIgnoreCase(datasetSchema.schema.partitionColumn)).head + if(partitionField.`type`.equalsIgnoreCase("timestamp") || partitionField.`type`.equalsIgnoreCase("epoch")) { + conf.setString(FlinkOptions.PARTITION_PATH_FIELD.key, datasetSchema.schema.partitionColumn + "_partition") + } + + if (config.hmsEnabled) { + conf.setString("hive_sync.table", datasetSchema.schema.table) + } + } + + private def setHudiBaseConfigurations(conf: Configuration): Unit = { + conf.setString(FlinkOptions.TABLE_TYPE.key, config.hudiTableType) + conf.setBoolean(FlinkOptions.METADATA_ENABLED.key, true) + conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE.key, 0.1) + conf.setBoolean(FlinkOptions.COMPACTION_SCHEDULE_ENABLED.key, config.hudiCompactionEnabled) + conf.setInteger("write.tasks", config.hudiWriteTasks) + conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 2) + conf.setString(FlinkOptions.COMPACTION_TRIGGER_STRATEGY, "num_or_time") + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, true) + conf.setInteger(FlinkOptions.BUCKET_ASSIGN_TASKS, 1) + conf.setInteger(FlinkOptions.COMPACTION_TASKS, 1) + conf.setString("hoodie.fs.atomic_creation.support", "s3a") + conf.setString(FlinkOptions.HIVE_SYNC_TABLE_PROPERTIES, "hoodie.datasource.write.drop.partition.columns=true") + conf.setBoolean(DROP_PARTITION_COLUMNS.key, true) + conf.setBoolean(SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.key(), true); // Enable dropping columns + conf.setBoolean(SCHEMA_EVOLUTION_ENABLE.key(), true); // Enable schema evolution + conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, "org.apache.hudi.common.model.PartialUpdateAvroPayload") + + if (config.hmsEnabled) { + conf.setBoolean("hive_sync.enabled", config.hmsEnabled) + conf.setString(FlinkOptions.HIVE_SYNC_DB.key(), config.hmsDatabaseName) + conf.setString("hive_sync.username", config.hmsUsername) + conf.setString("hive_sync.password", config.hmsPassword) + conf.setString("hive_sync.mode", "hms") + conf.setBoolean("hive_sync.use_jdbc", false) + conf.setString(FlinkOptions.HIVE_SYNC_METASTORE_URIS.key(), config.hmsURI) + conf.setString("hoodie.fs.atomic_creation.support", "s3a") + conf.setBoolean(FlinkOptions.HIVE_SYNC_SUPPORT_TIMESTAMP, true) + } + + } + +} + +object HudiConnectorStreamTask { + def main(args: Array[String]): Unit = { + val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) + val config = configFilePath.map { + path => ConfigFactory.parseFile(new File(path)).resolve() + }.getOrElse(ConfigFactory.load("hudi-writer.conf").withFallback(ConfigFactory.systemEnvironment())) + val hudiWriterConfig = new HudiConnectorConfig(config) + val kafkaUtil = new FlinkKafkaConnector(hudiWriterConfig) + val task = new HudiConnectorStreamTask(hudiWriterConfig, kafkaUtil) + task.process() + } + + def getTimestamp(ts: String): Timestamp = { + val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSXXX") + val localDateTime = if (StringUtils.isNotBlank(ts)) + LocalDateTime.from(formatter.parse(ts)) + else LocalDateTime.now + Timestamp.valueOf(localDateTime) + } +} diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/TestTimestamp.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/TestTimestamp.scala new file mode 100644 index 00000000..1c9876c0 --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/TestTimestamp.scala @@ -0,0 
+1,19 @@ +package org.sunbird.obsrv.streaming + +import java.sql.Timestamp +import java.time.{LocalDateTime, ZoneOffset} +import java.time.format.DateTimeFormatter + +object TestTimestamp { + + def main(args: Array[String]): Unit = { + val timestampAsString = "2023-10-15T03:56:27.522+05:30" + val pattern = "yyyy-MM-dd'T'hh:mm:ss.SSSZ" + val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSXXX") + val localDateTime = LocalDateTime.from(formatter.parse(timestampAsString)) + val timestamp = Timestamp.valueOf(localDateTime) + println("Timestamp: " + timestamp.toString) + + } + +} diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala new file mode 100644 index 00000000..404635a8 --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala @@ -0,0 +1,140 @@ +package org.sunbird.obsrv.util + +import com.fasterxml.jackson.annotation.JsonInclude.Include +import com.fasterxml.jackson.core.JsonGenerator.Feature +import com.fasterxml.jackson.databind.json.JsonMapper +import com.fasterxml.jackson.databind.{DeserializationFeature, JsonNode, ObjectMapper, SerializationFeature} +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import org.apache.flink.table.types.logical.{BigIntType, BooleanType, DoubleType, IntType, LogicalType, MapType, RowType, VarCharType, TimestampType, DateType} +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.Constants +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.registry.DatasetRegistry +import java.sql.Timestamp +import java.text.SimpleDateFormat +import java.util.Date +import scala.collection.mutable + + +case class HudiSchemaSpec(dataset: String, schema: Schema, inputFormat: InputFormat) +case class Schema(table: String, partitionColumn: String, timestampColumn: String, primaryKey: String, columnSpec: List[ColumnSpec]) +case class ColumnSpec(name: String, `type`: String) +case class InputFormat(`type`: String, flattenSpec: Option[JsonFlattenSpec] = None, columns: Option[List[String]] = None) +case class JsonFlattenSpec(fields: List[JsonFieldParserSpec]) +case class JsonFieldParserSpec(`type`: String, name: String, expr: Option[String] = None) + +class HudiSchemaParser { + + private val logger = LoggerFactory.getLogger(classOf[HudiSchemaParser]) + + @transient private val objectMapper = JsonMapper.builder() + .addModule(DefaultScalaModule) + .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES) + .disable(SerializationFeature.FAIL_ON_EMPTY_BEANS) + .enable(Feature.WRITE_BIGDECIMAL_AS_PLAIN) + .build() + + val df = new SimpleDateFormat("yyyy-MM-dd") + objectMapper.setSerializationInclusion(Include.NON_ABSENT) + + val hudiSchemaMap = new mutable.HashMap[String, HudiSchemaSpec]() + val rowTypeMap = new mutable.HashMap[String, RowType]() + + readSchema() + + def readSchema(): Unit = { + val datasourceConfig = DatasetRegistry.getAllDatasources().filter(f => f.`type`.nonEmpty && f.`type`.equalsIgnoreCase(Constants.DATALAKE_TYPE)) + datasourceConfig.map{f => + val hudiSchemaSpec = JSONUtil.deserialize[HudiSchemaSpec](f.ingestionSpec) + val dataset = hudiSchemaSpec.dataset + hudiSchemaMap.put(dataset, hudiSchemaSpec) + rowTypeMap.put(dataset, createRowType(hudiSchemaSpec)) + } + } + + private def createRowType(schema: HudiSchemaSpec): RowType = { + val columnSpec = schema.schema.columnSpec + val primaryKey = schema.schema.primaryKey + 
val partitionColumn = schema.schema.partitionColumn + val timeStampColumn = schema.schema.timestampColumn + val partitionField = schema.schema.columnSpec.filter(f => f.name.equalsIgnoreCase(schema.schema.partitionColumn)).head + val rowTypeMap = mutable.SortedMap[String, LogicalType]() + columnSpec.sortBy(_.name).map { + spec => + val isNullable = if (spec.name.matches(s"$primaryKey|$partitionColumn|$timeStampColumn")) false else true + val columnType = spec.`type` match { + case "string" => new VarCharType(isNullable, 20) + case "double" => new DoubleType(isNullable) + case "long" => new BigIntType(isNullable) + case "int" => new IntType(isNullable) + case "boolean" => new BooleanType(true) + case "map[string, string]" => new MapType(new VarCharType(), new VarCharType()) + case "epoch" => new BigIntType(isNullable) + case _ => new VarCharType(isNullable, 20) + } + rowTypeMap.put(spec.name, columnType) + } + if(partitionField.`type`.equalsIgnoreCase("timestamp") || partitionField.`type`.equalsIgnoreCase("epoch")) { + rowTypeMap.put(partitionField.name + "_partition", new VarCharType(false, 20)) + } + val rowType: RowType = RowType.of(false, rowTypeMap.values.toArray, rowTypeMap.keySet.toArray) + logger.info("rowType: " + rowType) + rowType + } + + def parseJson(dataset: String, event: String): mutable.Map[String, Any] = { + val parserSpec = hudiSchemaMap.get(dataset) + val jsonNode = objectMapper.readTree(event) + val flattenedEventData = mutable.Map[String, Any]() + parserSpec.map { spec => + val columnSpec = spec.schema.columnSpec + val partitionField = spec.schema.columnSpec.filter(f => f.name.equalsIgnoreCase(spec.schema.partitionColumn)).head + spec.inputFormat.flattenSpec.map { + flattenSpec => + flattenSpec.fields.map { + field => + val node = retrieveFieldFromJson(jsonNode, field) + node.map { + nodeValue => + try { + val fieldDataType = columnSpec.filter(_.name.equalsIgnoreCase(field.name)).head.`type` + val fieldValue = fieldDataType match { + case "string" => objectMapper.treeToValue(nodeValue, classOf[String]) + case "int" => objectMapper.treeToValue(nodeValue, classOf[Int]) + case "long" => objectMapper.treeToValue(nodeValue, classOf[Long]) + case "double" => objectMapper.treeToValue(nodeValue, classOf[Double]) + case "epoch" => objectMapper.treeToValue(nodeValue, classOf[Long]) + case _ => objectMapper.treeToValue(nodeValue, classOf[String]) + } + if(field.name.equalsIgnoreCase(partitionField.name)){ + if(fieldDataType.equalsIgnoreCase("timestamp")) { + flattenedEventData.put(field.name + "_partition", df.format(objectMapper.treeToValue(nodeValue, classOf[Timestamp]))) + } + else if(fieldDataType.equalsIgnoreCase("epoch")) { + flattenedEventData.put(field.name + "_partition", df.format(objectMapper.treeToValue(nodeValue, classOf[Long]))) + } + } + flattenedEventData.put(field.name, fieldValue) + } + catch { + case ex: Exception => + logger.info("Hudi Schema Parser - Exception: ", ex.getMessage) + flattenedEventData.put(field.name, null) + } + + }.orElse(flattenedEventData.put(field.name, null)) + } + } + } + logger.info("flattenedEventData: " + flattenedEventData) + flattenedEventData + } + + def retrieveFieldFromJson(jsonNode: JsonNode, field: JsonFieldParserSpec): Option[JsonNode] = { + if (field.`type`.equalsIgnoreCase("path")) { + field.expr.map{ f => jsonNode.at(s"/${f.split("\\.").tail.mkString("/")}") } + } else { + Option(jsonNode.get(field.name)) + } + } +} diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 9c37e956..220ebff4 100644 --- a/pipeline/pom.xml +++ 
b/pipeline/pom.xml @@ -23,6 +23,7 @@ druid-router pipeline-merged master-data-processor + hudi-connector From 90090d26de49ba64c241fd6e7395e3b62a12c60a Mon Sep 17 00:00:00 2001 From: Ravi Mula Date: Mon, 27 May 2024 11:41:37 +0530 Subject: [PATCH 27/37] Release 1.0.6-GA (#81) * Pipeline Bug fixes (#74) * Sanketika-Obsrv/issue-tracker#106:fix: Fix postgres connection issue with dataset read and handling an errors while parsing the message * Sanketika-Obsrv/issue-tracker#107:fix: Denorm job fix to handle error when denorm field node is contains empty value * Sanketika-Obsrv/issue-tracker#106:fix: Review comments fix - Changed the generic exception to actual exception (NullPointer) * fix: #0000: update datasourceRef only if dataset has records * Sanketika-Obsrv/issue-tracker#180 fix: Datasource DB schema changes to include type. (#79) Co-authored-by: sowmya-dixit * Hudi connector flink job implementation (#80) * feat: Hudi Flink Implementation. * feat: local working with metastore and localstack. * #0000 - feat: Hudi Sink implementation * #0000 - feat: Hudi Sink implementation * #0000 - feat: Initialize dataset RowType during job startup * refactor: Integrate hudi connector with dataset registry. * refactor: Integrate hudi connector with dataset registry. * Sanketika-Obsrv/issue-tracker#141 refactor: Enable timestamp based partition * Sanketika-Obsrv/issue-tracker#141 refactor: Fix Hudi connector job to handle empty datasets list for lakehouse. * Sanketika-Obsrv/issue-tracker#141 fix: Set Timestamp based partition configurations only if partition key is of timestamp type. * Sanketika-Obsrv/issue-tracker#170 fix: Resolve timestamp based partition without using TimestampBasedAvroKeyGenerator. * Sanketika-Obsrv/issue-tracker#177 fix: Lakehouse connector flink job fixes. * Sanketika-Obsrv/issue-tracker#177 fix: Dockerfile changes for hudi-connector * Sanketika-Obsrv/issue-tracker#177 fix: Lakehouse connector flink job fixes. 
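(Illustrative note.) The denorm empty-value fix consolidated above reduces to a stricter key check: a denorm key is usable only when it is a non-empty string or a number. The stand-alone sketch below assumes jackson-databind on the classpath and uses hypothetical names; the actual change is the one-line condition in DenormCache.scala shown in the earlier patches.

import com.fasterxml.jackson.databind.ObjectMapper

// Hypothetical sketch of the denorm-key guard: missing nodes, empty strings and
// non-scalar values are rejected; non-empty strings and numbers pass.
object DenormKeyCheckSketch {

  private val mapper = new ObjectMapper()

  def isUsableDenormKey(json: String, field: String): Boolean = {
    val node = mapper.readTree(json).path(field) // absent fields yield MissingNode
    (node.isTextual && node.asText().nonEmpty) || node.isNumber
  }

  def main(args: Array[String]): Unit = {
    println(isUsableDenormKey("""{"dealerCode":"KUN1"}""", "dealerCode")) // true
    println(isUsableDenormKey("""{"dealerCode":""}""", "dealerCode"))     // false (empty value)
    println(isUsableDenormKey("""{"other":1}""", "dealerCode"))           // false (key missing)
    println(isUsableDenormKey("""{"dealerCode":42}""", "dealerCode"))     // true (numeric key)
  }
}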
* Sanketika-Obsrv/issue-tracker#177 fix: remove unused code * Sanketika-Obsrv/issue-tracker#177 fix: remove unused code * Sanketika-Obsrv/issue-tracker#177 fix: remove unused code * Sanketika-Obsrv/issue-tracker#177 fix: remove commented code --------- Co-authored-by: Manjunath Davanam Co-authored-by: SurabhiAngadi Co-authored-by: Sowmya N Dixit Co-authored-by: sowmya-dixit --- Dockerfile | 8 + .../MasterDataProcessorIndexer.scala | 2 +- .../src/main/resources/dataset-registry.sql | 1 + .../sunbird/obsrv/model/DatasetModels.scala | 2 +- .../obsrv/registry/DatasetRegistry.scala | 5 + .../service/DatasetRegistryService.scala | 14 +- .../spec/BaseSpecWithDatasetRegistry.scala | 2 +- .../obsrv/spec/TestDatasetRegistrySpec.scala | 4 +- .../sunbird/obsrv/core/model/Constants.scala | 1 + pipeline/hudi-connector/pom.xml | 253 ++++++++++++++++++ .../src/main/resources/core-site.xml | 33 +++ .../src/main/resources/hudi-writer.conf | 41 +++ .../src/main/resources/schemas/schema.json | 108 ++++++++ .../functions/RowDataConverterFunction.scala | 43 +++ .../obsrv/streaming/HudiConnectorConfig.scala | 53 ++++ .../streaming/HudiConnectorStreamTask.scala | 153 +++++++++++ .../obsrv/streaming/TestTimestamp.scala | 19 ++ .../sunbird/obsrv/util/HudiSchemaParser.scala | 140 ++++++++++ pipeline/pom.xml | 1 + 19 files changed, 877 insertions(+), 6 deletions(-) create mode 100644 pipeline/hudi-connector/pom.xml create mode 100644 pipeline/hudi-connector/src/main/resources/core-site.xml create mode 100644 pipeline/hudi-connector/src/main/resources/hudi-writer.conf create mode 100644 pipeline/hudi-connector/src/main/resources/schemas/schema.json create mode 100644 pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala create mode 100644 pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorConfig.scala create mode 100644 pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala create mode 100644 pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/TestTimestamp.scala create mode 100644 pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala diff --git a/Dockerfile b/Dockerfile index 4e519a75..dced8701 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,3 +35,11 @@ COPY --from=build-pipeline /app/pipeline/pipeline-merged/target/pipeline-merged- FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as master-data-processor-image USER flink COPY --from=build-pipeline /app/pipeline/master-data-processor/target/master-data-processor-1.0.0.jar $FLINK_HOME/lib + +FROM --platform=linux/x86_64 flink:1.15.0-scala_2.12-java11 as hudi-connector-image +USER flink +COPY ./pipeline/hudi-connector/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar $FLINK_HOME/lib +COPY ./pipeline/hudi-connector/flink-s3-fs-hadoop-1.15.2.jar $FLINK_HOME/lib +COPY ./pipeline/hudi-connector/hbase-server-2.4.13.jar $FLINK_HOME/lib +RUN mkdir $FLINK_HOME/custom-lib +COPY ./pipeline/hudi-connector/target/hudi-connector-1.0.0.jar $FLINK_HOME/custom-lib diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala index 2c41181c..22729aa0 100644 --- a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala @@ -27,8 +27,8 @@ object 
MasterDataProcessorIndexer { val ingestionSpec: String = updateIngestionSpec(datasource, paths.datasourceRef, paths.ingestionPath, config) if (eventsCount > 0L) { submitIngestionTask(dataset.id, ingestionSpec, config) + DatasetRegistry.updateDatasourceRef(datasource, paths.datasourceRef) } - DatasetRegistry.updateDatasourceRef(datasource, paths.datasourceRef) if (!datasource.datasourceRef.equals(paths.datasourceRef)) { deleteDataSource(dataset.id, datasource.datasourceRef, config) } diff --git a/dataset-registry/src/main/resources/dataset-registry.sql b/dataset-registry/src/main/resources/dataset-registry.sql index ff28ae98..54373eec 100644 --- a/dataset-registry/src/main/resources/dataset-registry.sql +++ b/dataset-registry/src/main/resources/dataset-registry.sql @@ -22,6 +22,7 @@ CREATE INDEX IF NOT EXISTS datasets_status ON datasets(status); CREATE TABLE IF NOT EXISTS datasources ( datasource text PRIMARY KEY, dataset_id text REFERENCES datasets (id), + type text NOT NULL, ingestion_spec json NOT NULL, datasource_ref text NOT NULL, retention_period json, diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index 49cc51bc..ee73fbe0 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -71,7 +71,7 @@ object DatasetModels { @JsonProperty("status") status: String, @JsonProperty("connector_stats") connectorStats: Option[ConnectorStats] = None) case class DataSource(@JsonProperty("id") id: String, @JsonProperty("datasource") datasource: String, @JsonProperty("dataset_id") datasetId: String, - @JsonProperty("ingestion_spec") ingestionSpec: String, @JsonProperty("datasource_ref") datasourceRef: String) + @JsonProperty("type") `type`: String, @JsonProperty("ingestion_spec") ingestionSpec: String, @JsonProperty("datasource_ref") datasourceRef: String) } diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala index c1394fd4..08921adc 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala @@ -42,6 +42,11 @@ object DatasetRegistry { DatasetRegistryService.readDatasources(datasetId) } + def getAllDatasources(): List[DataSource] = { + val datasourceList = DatasetRegistryService.readAllDatasources() + datasourceList.getOrElse(List()) + } + def getDataSetIds(datasetType: String): List[String] = { datasets.filter(f => f._2.datasetType.equals(datasetType)).keySet.toList } diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala index 88efb7a6..8075d508 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala @@ -111,6 +111,17 @@ object DatasetRegistryService { } } + def readAllDatasources(): Option[List[DataSource]] = { + + val postgresConnect = new PostgresConnect(postgresConfig) + try { + val rs = postgresConnect.executeQuery(s"SELECT * FROM datasources") + Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { + 
parseDatasource(result) + }).toList) + } + } + def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = { val query = s"UPDATE datasources set datasource_ref = '$datasourceRef' where datasource='${datasource.datasource}' and dataset_id='${datasource.datasetId}'" updateRegistry(query) @@ -190,10 +201,11 @@ object DatasetRegistryService { val id = rs.getString("id") val datasource = rs.getString("datasource") val datasetId = rs.getString("dataset_id") + val datasourceType = rs.getString("type") val ingestionSpec = rs.getString("ingestion_spec") val datasourceRef = rs.getString("datasource_ref") - DataSource(id, datasource, datasetId, ingestionSpec, datasourceRef) + DataSource(id, datasource, datasetId, datasourceType, ingestionSpec, datasourceRef) } private def parseDatasetTransformation(rs: ResultSet): DatasetTransformation = { diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala index 09321143..1b3edea0 100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala @@ -36,7 +36,7 @@ class BaseSpecWithDatasetRegistry extends BaseSpecWithPostgres { private def createSchema(postgresConnect: PostgresConnect): Unit = { postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasets ( id text PRIMARY KEY, type text NOT NULL, validation_config json, extraction_config json, dedup_config json, data_schema json, denorm_config json, router_config json NOT NULL, dataset_config json NOT NULL, status text NOT NULL, tags text[], data_version INT, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL );") - postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), type text NOT NULL, ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function json NOT NULL, status text NOT NULL, mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );") postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_source_config ( id text PRIMARY KEY, dataset_id text NOT NULL REFERENCES datasets (id), connector_type text NOT NULL, connector_config json NOT NULL, status text NOT NULL, connector_stats json, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(connector_type, 
dataset_id) );") } diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala index b37e801a..3d83552d 100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala @@ -92,8 +92,8 @@ class TestDatasetRegistrySpec extends BaseSpecWithDatasetRegistry with Matchers postgresConnect.execute("insert into dataset_source_config values('sc1', 'd1', 'kafka', '{\"kafkaBrokers\":\"localhost:9090\",\"topic\":\"test-topic\"}', 'Live', null, 'System', 'System', now(), now());") postgresConnect.execute("insert into dataset_source_config values('sc2', 'd1', 'rdbms', '{\"type\":\"postgres\",\"tableName\":\"test-table\"}', 'Live', null, 'System', 'System', now(), now());") - //postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") - postgresConnect.execute("insert into datasources values('ds1', 'd1', '{}', 'd1-datasource', 'd1-datasource-1', null, null, null, '{}', 'Live', 'System', 'System', now(), now());") + //postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), type text NOT NULL, ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") + postgresConnect.execute("insert into datasources values('ds1', 'd1', 'druid', '{}', 'd1-datasource', 'd1-datasource-1', null, null, null, '{}', 'Live', 'System', 'System', now(), now());") postgresConnect.closeConnection() } } \ No newline at end of file diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala index 2cfbd307..7c19d8e2 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala @@ -14,5 +14,6 @@ object Constants { val LEVEL = "level" val TOPIC = "topic" val MESSAGE = "message" + val DATALAKE_TYPE = "datalake" } diff --git a/pipeline/hudi-connector/pom.xml b/pipeline/hudi-connector/pom.xml new file mode 100644 index 00000000..5230d8eb --- /dev/null +++ b/pipeline/hudi-connector/pom.xml @@ -0,0 +1,253 @@ + + + 4.0.0 + + pipeline + org.sunbird.obsrv + 1.0 + + hudi-connector + 1.0.0 + Hudi Connector + + UTF-8 + 1.4.0 + + + + + org.apache.flink + flink-streaming-scala_${scala.maj.version} + ${flink.version} + provided + + + com.fasterxml.jackson.core + jackson-databind + + + + + org.sunbird.obsrv + framework + 1.0.0 + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + + + org.apache.kafka + kafka-clients + + + + + org.apache.hudi + hudi-flink1.15-bundle + 0.14.1 + + + org.apache.hadoop + hadoop-common + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + 
org.slf4j + slf4j-log4j12 + + + + + org.apache.flink + flink-table-api-scala-bridge_${scala.maj.version} + ${flink.version} + provided + + + io.github.classgraph + classgraph + 4.8.168 + + + org.apache.flink + flink-connector-hive_${scala.maj.version} + ${flink.version} + + + org.apache.hive + hive-metastore + 3.1.3 + + + org.apache.hadoop + hadoop-common + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + + + org.apache.hive + hive-exec + 3.1.3 + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + org.apache.hadoop + hadoop-common + + + org.apache.logging.log4j + log4j-slf4j-impl + + + + + + + src/main/scala + src/test/scala + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 11 + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.1 + + + + package + + shade + + + false + + + com.google.code.findbugs:jsr305 + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + core-site.xml + + + + + + org.sunbird.obsrv.streaming.HudiConnectorStreamTask + + + + reference.conf + + + + + + + + + net.alchim31.maven + scala-maven-plugin + 4.4.0 + + ${java.target.runtime} + ${java.target.runtime} + ${scala.version} + false + + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + + maven-surefire-plugin + 2.22.2 + + true + + + + + org.scalatest + scalatest-maven-plugin + 1.0 + + ${project.build.directory}/surefire-reports + . + hudi-connector-testsuite.txt + + + + test + + test + + + + + + org.scoverage + scoverage-maven-plugin + ${scoverage.plugin.version} + + ${scala.version} + true + true + + + + + diff --git a/pipeline/hudi-connector/src/main/resources/core-site.xml b/pipeline/hudi-connector/src/main/resources/core-site.xml new file mode 100644 index 00000000..c15df562 --- /dev/null +++ b/pipeline/hudi-connector/src/main/resources/core-site.xml @@ -0,0 +1,33 @@ + + + + + + fs.s3a.impl + org.apache.hadoop.fs.s3a.S3AFileSystem + + + fs.s3a.endpoint + http://localhost:4566 + + + fs.s3a.access.key + test + + + fs.s3a.secret.key + testSecret + + + fs.s3a.path.style.access + true + + + fs.s3a.connection.ssl.enabled + false + + + + + + \ No newline at end of file diff --git a/pipeline/hudi-connector/src/main/resources/hudi-writer.conf b/pipeline/hudi-connector/src/main/resources/hudi-writer.conf new file mode 100644 index 00000000..d9c031b5 --- /dev/null +++ b/pipeline/hudi-connector/src/main/resources/hudi-writer.conf @@ -0,0 +1,41 @@ +include "baseconfig.conf" + +kafka { + input.topic = ${job.env}".hudi.connector.in" + output.topic = ${job.env}".hudi.connector.out" + output.invalid.topic = ${job.env}".failed" + event.max.size = "1048576" # Max is only 1MB + groupId = ${job.env}"-hudi-writer-group" + producer { + max-request-size = 5242880 + } +} + +task { + checkpointing.compressed = true + checkpointing.interval = 30000 + checkpointing.pause.between.seconds = 30000 + restart-strategy.attempts = 3 + restart-strategy.delay = 30000 # in milli-seconds + parallelism = 1 + consumer.parallelism = 1 + downstream.operators.parallelism = 1 +} + +hudi { + hms { + enabled = true + uri = "thrift://localhost:9083" + database { + name = "obsrv" + username = "postgres" + password = "postgres" + } + } + table { + type = "MERGE_ON_READ" + base.path = "s3a://obsrv" + } + compaction.enabled = true + write.tasks = 1 +} \ No newline at end of file diff --git 
a/pipeline/hudi-connector/src/main/resources/schemas/schema.json b/pipeline/hudi-connector/src/main/resources/schemas/schema.json new file mode 100644 index 00000000..177c957a --- /dev/null +++ b/pipeline/hudi-connector/src/main/resources/schemas/schema.json @@ -0,0 +1,108 @@ +{ + "dataset": "financial_transactions", + "schema": { + "table": "financial_transactions", + "partitionColumn": "receiver_ifsc_code", + "timestampColumn": "txn_date", + "primaryKey": "txn_id", + "columnSpec": [ + { + "name": "receiver_account_number", + "type": "string" + }, + { + "name": "receiver_ifsc_code", + "type": "string" + }, + { + "name": "sender_account_number", + "type": "string" + }, + { + "name": "sender_contact_email", + "type": "string" + }, + { + "name": "sender_ifsc_code", + "type": "string" + }, + { + "name": "currency", + "type": "string" + }, + { + "name": "txn_amount", + "type": "int" + }, + { + "name": "txn_date", + "type": "string" + }, + { + "name": "txn_id", + "type": "string" + }, + { + "name": "txn_status", + "type": "string" + }, + { + "name": "txn_type", + "type": "string" + } + ] + }, + "inputFormat": { + "type": "json", + "flattenSpec": { + "fields": [ + { + "type": "root", + "name": "receiver_account_number" + }, + { + "type": "path", + "name": "sender_account_number", + "expr": "$.sender.account_number" + }, + { + "type": "path", + "name": "sender_ifsc_code", + "expr": "$.sender.ifsc_code" + }, + { + "type": "root", + "name": "receiver_ifsc_code" + }, + { + "type": "root", + "name": "sender_contact_email" + }, + { + "type": "root", + "name": "currency" + }, + { + "type": "root", + "name": "txn_amount" + }, + { + "type": "root", + "name": "txn_date" + }, + { + "type": "root", + "name": "txn_id" + }, + { + "type": "root", + "name": "txn_status" + }, + { + "type": "root", + "name": "txn_type" + } + ] + } + } +} \ No newline at end of file diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala new file mode 100644 index 00000000..aec00117 --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala @@ -0,0 +1,43 @@ +package org.sunbird.obsrv.functions + +import org.apache.flink.api.common.functions.RichMapFunction +import org.apache.flink.configuration.Configuration +import org.apache.flink.formats.common.TimestampFormat +import org.apache.flink.formats.json.JsonToRowDataConverters +import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper +import org.sunbird.obsrv.util.{HudiSchemaParser, HudiSchemaSpec} +import org.apache.flink.table.data.RowData +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.util.{JSONUtil, Util} +import org.sunbird.obsrv.streaming.HudiConnectorConfig +import scala.collection.mutable.{Map => MMap} + +class RowDataConverterFunction(config: HudiConnectorConfig, datasetId: String) extends RichMapFunction[MMap[String, AnyRef], RowData] { + + var jsonToRowDataConverters: JsonToRowDataConverters = _ + var objectMapper: ObjectMapper = _ + var hudiSchemaParser: HudiSchemaParser = _ + + private val logger = LoggerFactory.getLogger(classOf[RowDataConverterFunction]) + + override def open(parameters: Configuration): Unit = { + super.open(parameters) + jsonToRowDataConverters = new JsonToRowDataConverters(false, true, TimestampFormat.SQL) + objectMapper = new ObjectMapper() + hudiSchemaParser = new HudiSchemaParser() + } + + 
override def map(event: MMap[String, AnyRef]): RowData = { + convertToRowData(event) + } + + def convertToRowData(data: MMap[String, AnyRef]): RowData = { + val eventJson = JSONUtil.serialize(data) + val flattenedData = hudiSchemaParser.parseJson(datasetId, eventJson) + val rowType = hudiSchemaParser.rowTypeMap(datasetId) + val converter: JsonToRowDataConverters.JsonToRowDataConverter = jsonToRowDataConverters.createRowConverter(rowType) + val rowData = converter.convert(objectMapper.readTree(JSONUtil.serialize(flattenedData))).asInstanceOf[RowData] + rowData + } + +} diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorConfig.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorConfig.scala new file mode 100644 index 00000000..4f4f46cf --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorConfig.scala @@ -0,0 +1,53 @@ +package org.sunbird.obsrv.streaming + +import com.typesafe.config.Config +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.streaming.api.scala.OutputTag +import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.configuration.FlinkOptions +import org.sunbird.obsrv.core.streaming.BaseJobConfig + +import scala.collection.mutable + +class HudiConnectorConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "Flink-Hudi-Connector") { + + implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + implicit val stringTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) + + override def inputTopic(): String = config.getString("kafka.input.topic") + + val kafkaDefaultOutputTopic: String = config.getString("kafka.output.topic") + + override def inputConsumer(): String = config.getString("kafka.groupId") + + override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("dummy-events") + + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") + + val kafkaInvalidTopic: String = config.getString("kafka.output.invalid.topic") + + val invalidEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("invalid-events") + val validEventsOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("valid-events") + + val invalidEventProducer = "invalid-events-sink" + + + val hudiTableType: String = + if (config.getString("hudi.table.type").equalsIgnoreCase("MERGE_ON_READ")) + HoodieTableType.MERGE_ON_READ.name() + else if (config.getString("hudi.table.type").equalsIgnoreCase("COPY_ON_WRITE")) + HoodieTableType.COPY_ON_WRITE.name() + else HoodieTableType.MERGE_ON_READ.name() + + val hudiBasePath: String = config.getString("hudi.table.base.path") + val hudiCompactionEnabled: Boolean = config.getBoolean("hudi.compaction.enabled") + val hudiWriteTasks: Int = config.getInt("hudi.write.tasks") + + val hmsEnabled: Boolean = if (config.hasPath("hudi.hms.enabled")) config.getBoolean("hudi.hms.enabled") else false + val hmsUsername: String = config.getString("hudi.hms.database.username") + val hmsPassword: String = config.getString("hudi.hms.database.password") + val hmsDatabaseName: String = config.getString("hudi.hms.database.name") + val 
hmsURI: String = config.getString("hudi.hms.uri") + +} diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala new file mode 100644 index 00000000..b57244a2 --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala @@ -0,0 +1,153 @@ +package org.sunbird.obsrv.streaming + +import com.typesafe.config.ConfigFactory +import org.apache.commons.lang3.StringUtils +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.api.java.utils.ParameterTool +import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.datastream.{DataStream, DataStreamSink} +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.hudi.common.config.TimestampKeyGeneratorConfig +import org.apache.hudi.configuration.{FlinkOptions, OptionsResolver} +import org.apache.hudi.sink.utils.Pipelines +import org.apache.hudi.util.AvroSchemaConverter +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.Constants +import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} +import org.sunbird.obsrv.core.util.FlinkUtil +import org.sunbird.obsrv.functions.RowDataConverterFunction +import org.sunbird.obsrv.registry.DatasetRegistry +import org.sunbird.obsrv.util.HudiSchemaParser +import org.apache.hudi.config.HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP +import org.apache.hudi.common.config.HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE +import org.apache.hudi.common.table.HoodieTableConfig.DROP_PARTITION_COLUMNS +import java.io.File +import java.sql.Timestamp +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter +import scala.collection.mutable +import scala.collection.mutable.{Map => MMap} + +class HudiConnectorStreamTask(config: HudiConnectorConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { + + implicit val mutableMapTypeInfo: TypeInformation[MMap[String, AnyRef]] = TypeExtractor.getForClass(classOf[MMap[String, AnyRef]]) + private val logger = LoggerFactory.getLogger(classOf[HudiConnectorStreamTask]) + def process(): Unit = { + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) + process(env) + } + + override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { + null + } + + def process(env: StreamExecutionEnvironment): Unit = { + val schemaParser = new HudiSchemaParser() + val dataSourceConfig = DatasetRegistry.getAllDatasources().filter(f => f.`type`.nonEmpty && f.`type`.equalsIgnoreCase(Constants.DATALAKE_TYPE)) + dataSourceConfig.map{ dataSource => + val datasetId = dataSource.datasetId + val dataStream = getMapDataStream(env, config, List(datasetId), config.kafkaConsumerProperties(), consumerSourceName = s"kafka-${datasetId}", kafkaConnector) + .map(new RowDataConverterFunction(config, datasetId)) + + val conf: Configuration = new Configuration() + setHudiBaseConfigurations(conf) + setDatasetConf(conf, datasetId, schemaParser) + logger.info("conf: " + conf.toMap.toString) + val rowType = schemaParser.rowTypeMap(datasetId) + + val hoodieRecordDataStream = Pipelines.bootstrap(conf, rowType, dataStream) + val pipeline = Pipelines.hoodieStreamWrite(conf, 
hoodieRecordDataStream) + if (OptionsResolver.needsAsyncCompaction(conf)) { + Pipelines.compact(conf, pipeline) + } else { + Pipelines.clean(conf, pipeline) + } + + }.orElse(List(addDefaultOperator(env, config, kafkaConnector))) + env.execute("Flink-Hudi-Connector") + } + + def addDefaultOperator(env: StreamExecutionEnvironment, config: HudiConnectorConfig, kafkaConnector: FlinkKafkaConnector): DataStreamSink[mutable.Map[String, AnyRef]] = { + val dataStreamSink: DataStreamSink[mutable.Map[String, AnyRef]] = getMapDataStream(env, config, kafkaConnector) + .sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaDefaultOutputTopic)) + .name(s"hudi-connector-default-sink").uid(s"hudi-connector-default-sink") + .setParallelism(config.downstreamOperatorsParallelism) + dataStreamSink + } + + def setDatasetConf(conf: Configuration, dataset: String, schemaParser: HudiSchemaParser): Unit = { + val datasetSchema = schemaParser.hudiSchemaMap(dataset) + val rowType = schemaParser.rowTypeMap(dataset) + val avroSchema = AvroSchemaConverter.convertToSchema(rowType, dataset.replace("-", "_")) + conf.setString(FlinkOptions.PATH.key, s"${config.hudiBasePath}/${datasetSchema.schema.table}") + conf.setString(FlinkOptions.TABLE_NAME, datasetSchema.schema.table) + conf.setString(FlinkOptions.RECORD_KEY_FIELD.key, datasetSchema.schema.primaryKey) + conf.setString(FlinkOptions.PRECOMBINE_FIELD.key, datasetSchema.schema.timestampColumn) + conf.setString(FlinkOptions.PARTITION_PATH_FIELD.key, datasetSchema.schema.partitionColumn) + conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA.key, avroSchema.toString) + + val partitionField = datasetSchema.schema.columnSpec.filter(f => f.name.equalsIgnoreCase(datasetSchema.schema.partitionColumn)).head + if(partitionField.`type`.equalsIgnoreCase("timestamp") || partitionField.`type`.equalsIgnoreCase("epoch")) { + conf.setString(FlinkOptions.PARTITION_PATH_FIELD.key, datasetSchema.schema.partitionColumn + "_partition") + } + + if (config.hmsEnabled) { + conf.setString("hive_sync.table", datasetSchema.schema.table) + } + } + + private def setHudiBaseConfigurations(conf: Configuration): Unit = { + conf.setString(FlinkOptions.TABLE_TYPE.key, config.hudiTableType) + conf.setBoolean(FlinkOptions.METADATA_ENABLED.key, true) + conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE.key, 0.1) + conf.setBoolean(FlinkOptions.COMPACTION_SCHEDULE_ENABLED.key, config.hudiCompactionEnabled) + conf.setInteger("write.tasks", config.hudiWriteTasks) + conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 2) + conf.setString(FlinkOptions.COMPACTION_TRIGGER_STRATEGY, "num_or_time") + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, true) + conf.setInteger(FlinkOptions.BUCKET_ASSIGN_TASKS, 1) + conf.setInteger(FlinkOptions.COMPACTION_TASKS, 1) + conf.setString("hoodie.fs.atomic_creation.support", "s3a") + conf.setString(FlinkOptions.HIVE_SYNC_TABLE_PROPERTIES, "hoodie.datasource.write.drop.partition.columns=true") + conf.setBoolean(DROP_PARTITION_COLUMNS.key, true) + conf.setBoolean(SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.key(), true); // Enable dropping columns + conf.setBoolean(SCHEMA_EVOLUTION_ENABLE.key(), true); // Enable schema evolution + conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, "org.apache.hudi.common.model.PartialUpdateAvroPayload") + + if (config.hmsEnabled) { + conf.setBoolean("hive_sync.enabled", config.hmsEnabled) + conf.setString(FlinkOptions.HIVE_SYNC_DB.key(), config.hmsDatabaseName) + conf.setString("hive_sync.username", config.hmsUsername) + 
conf.setString("hive_sync.password", config.hmsPassword) + conf.setString("hive_sync.mode", "hms") + conf.setBoolean("hive_sync.use_jdbc", false) + conf.setString(FlinkOptions.HIVE_SYNC_METASTORE_URIS.key(), config.hmsURI) + conf.setString("hoodie.fs.atomic_creation.support", "s3a") + conf.setBoolean(FlinkOptions.HIVE_SYNC_SUPPORT_TIMESTAMP, true) + } + + } + +} + +object HudiConnectorStreamTask { + def main(args: Array[String]): Unit = { + val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) + val config = configFilePath.map { + path => ConfigFactory.parseFile(new File(path)).resolve() + }.getOrElse(ConfigFactory.load("hudi-writer.conf").withFallback(ConfigFactory.systemEnvironment())) + val hudiWriterConfig = new HudiConnectorConfig(config) + val kafkaUtil = new FlinkKafkaConnector(hudiWriterConfig) + val task = new HudiConnectorStreamTask(hudiWriterConfig, kafkaUtil) + task.process() + } + + def getTimestamp(ts: String): Timestamp = { + val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSXXX") + val localDateTime = if (StringUtils.isNotBlank(ts)) + LocalDateTime.from(formatter.parse(ts)) + else LocalDateTime.now + Timestamp.valueOf(localDateTime) + } +} diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/TestTimestamp.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/TestTimestamp.scala new file mode 100644 index 00000000..1c9876c0 --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/TestTimestamp.scala @@ -0,0 +1,19 @@ +package org.sunbird.obsrv.streaming + +import java.sql.Timestamp +import java.time.{LocalDateTime, ZoneOffset} +import java.time.format.DateTimeFormatter + +object TestTimestamp { + + def main(args: Array[String]): Unit = { + val timestampAsString = "2023-10-15T03:56:27.522+05:30" + val pattern = "yyyy-MM-dd'T'hh:mm:ss.SSSZ" + val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSXXX") + val localDateTime = LocalDateTime.from(formatter.parse(timestampAsString)) + val timestamp = Timestamp.valueOf(localDateTime) + println("Timestamp: " + timestamp.toString) + + } + +} diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala new file mode 100644 index 00000000..404635a8 --- /dev/null +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala @@ -0,0 +1,140 @@ +package org.sunbird.obsrv.util + +import com.fasterxml.jackson.annotation.JsonInclude.Include +import com.fasterxml.jackson.core.JsonGenerator.Feature +import com.fasterxml.jackson.databind.json.JsonMapper +import com.fasterxml.jackson.databind.{DeserializationFeature, JsonNode, ObjectMapper, SerializationFeature} +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import org.apache.flink.table.types.logical.{BigIntType, BooleanType, DoubleType, IntType, LogicalType, MapType, RowType, VarCharType, TimestampType, DateType} +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.Constants +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.registry.DatasetRegistry +import java.sql.Timestamp +import java.text.SimpleDateFormat +import java.util.Date +import scala.collection.mutable + + +case class HudiSchemaSpec(dataset: String, schema: Schema, inputFormat: InputFormat) +case class Schema(table: String, partitionColumn: String, timestampColumn: String, primaryKey: String, 
columnSpec: List[ColumnSpec]) +case class ColumnSpec(name: String, `type`: String) +case class InputFormat(`type`: String, flattenSpec: Option[JsonFlattenSpec] = None, columns: Option[List[String]] = None) +case class JsonFlattenSpec(fields: List[JsonFieldParserSpec]) +case class JsonFieldParserSpec(`type`: String, name: String, expr: Option[String] = None) + +class HudiSchemaParser { + + private val logger = LoggerFactory.getLogger(classOf[HudiSchemaParser]) + + @transient private val objectMapper = JsonMapper.builder() + .addModule(DefaultScalaModule) + .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES) + .disable(SerializationFeature.FAIL_ON_EMPTY_BEANS) + .enable(Feature.WRITE_BIGDECIMAL_AS_PLAIN) + .build() + + val df = new SimpleDateFormat("yyyy-MM-dd") + objectMapper.setSerializationInclusion(Include.NON_ABSENT) + + val hudiSchemaMap = new mutable.HashMap[String, HudiSchemaSpec]() + val rowTypeMap = new mutable.HashMap[String, RowType]() + + readSchema() + + def readSchema(): Unit = { + val datasourceConfig = DatasetRegistry.getAllDatasources().filter(f => f.`type`.nonEmpty && f.`type`.equalsIgnoreCase(Constants.DATALAKE_TYPE)) + datasourceConfig.map{f => + val hudiSchemaSpec = JSONUtil.deserialize[HudiSchemaSpec](f.ingestionSpec) + val dataset = hudiSchemaSpec.dataset + hudiSchemaMap.put(dataset, hudiSchemaSpec) + rowTypeMap.put(dataset, createRowType(hudiSchemaSpec)) + } + } + + private def createRowType(schema: HudiSchemaSpec): RowType = { + val columnSpec = schema.schema.columnSpec + val primaryKey = schema.schema.primaryKey + val partitionColumn = schema.schema.partitionColumn + val timeStampColumn = schema.schema.timestampColumn + val partitionField = schema.schema.columnSpec.filter(f => f.name.equalsIgnoreCase(schema.schema.partitionColumn)).head + val rowTypeMap = mutable.SortedMap[String, LogicalType]() + columnSpec.sortBy(_.name).map { + spec => + val isNullable = if (spec.name.matches(s"$primaryKey|$partitionColumn|$timeStampColumn")) false else true + val columnType = spec.`type` match { + case "string" => new VarCharType(isNullable, 20) + case "double" => new DoubleType(isNullable) + case "long" => new BigIntType(isNullable) + case "int" => new IntType(isNullable) + case "boolean" => new BooleanType(true) + case "map[string, string]" => new MapType(new VarCharType(), new VarCharType()) + case "epoch" => new BigIntType(isNullable) + case _ => new VarCharType(isNullable, 20) + } + rowTypeMap.put(spec.name, columnType) + } + if(partitionField.`type`.equalsIgnoreCase("timestamp") || partitionField.`type`.equalsIgnoreCase("epoch")) { + rowTypeMap.put(partitionField.name + "_partition", new VarCharType(false, 20)) + } + val rowType: RowType = RowType.of(false, rowTypeMap.values.toArray, rowTypeMap.keySet.toArray) + logger.info("rowType: " + rowType) + rowType + } + + def parseJson(dataset: String, event: String): mutable.Map[String, Any] = { + val parserSpec = hudiSchemaMap.get(dataset) + val jsonNode = objectMapper.readTree(event) + val flattenedEventData = mutable.Map[String, Any]() + parserSpec.map { spec => + val columnSpec = spec.schema.columnSpec + val partitionField = spec.schema.columnSpec.filter(f => f.name.equalsIgnoreCase(spec.schema.partitionColumn)).head + spec.inputFormat.flattenSpec.map { + flattenSpec => + flattenSpec.fields.map { + field => + val node = retrieveFieldFromJson(jsonNode, field) + node.map { + nodeValue => + try { + val fieldDataType = columnSpec.filter(_.name.equalsIgnoreCase(field.name)).head.`type` + val fieldValue = fieldDataType 
match { + case "string" => objectMapper.treeToValue(nodeValue, classOf[String]) + case "int" => objectMapper.treeToValue(nodeValue, classOf[Int]) + case "long" => objectMapper.treeToValue(nodeValue, classOf[Long]) + case "double" => objectMapper.treeToValue(nodeValue, classOf[Double]) + case "epoch" => objectMapper.treeToValue(nodeValue, classOf[Long]) + case _ => objectMapper.treeToValue(nodeValue, classOf[String]) + } + if(field.name.equalsIgnoreCase(partitionField.name)){ + if(fieldDataType.equalsIgnoreCase("timestamp")) { + flattenedEventData.put(field.name + "_partition", df.format(objectMapper.treeToValue(nodeValue, classOf[Timestamp]))) + } + else if(fieldDataType.equalsIgnoreCase("epoch")) { + flattenedEventData.put(field.name + "_partition", df.format(objectMapper.treeToValue(nodeValue, classOf[Long]))) + } + } + flattenedEventData.put(field.name, fieldValue) + } + catch { + case ex: Exception => + logger.info("Hudi Schema Parser - Exception: ", ex.getMessage) + flattenedEventData.put(field.name, null) + } + + }.orElse(flattenedEventData.put(field.name, null)) + } + } + } + logger.info("flattenedEventData: " + flattenedEventData) + flattenedEventData + } + + def retrieveFieldFromJson(jsonNode: JsonNode, field: JsonFieldParserSpec): Option[JsonNode] = { + if (field.`type`.equalsIgnoreCase("path")) { + field.expr.map{ f => jsonNode.at(s"/${f.split("\\.").tail.mkString("/")}") } + } else { + Option(jsonNode.get(field.name)) + } + } +} diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 9c37e956..220ebff4 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -23,6 +23,7 @@ druid-router pipeline-merged master-data-processor + hudi-connector From 7832ed1010e57f0f78bff8faedf4f986a2dd1e93 Mon Sep 17 00:00:00 2001 From: GayathriSrividya Date: Mon, 10 Jun 2024 10:20:35 +0530 Subject: [PATCH 28/37] Sanketika-obsrv/issue-tracker#228 feat: updated github actions for lakehouse job --- .github/workflows/build_and_deploy.yaml | 2 ++ Dockerfile | 5 +---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_deploy.yaml b/.github/workflows/build_and_deploy.yaml index 35ba8cf8..601c8f5e 100644 --- a/.github/workflows/build_and_deploy.yaml +++ b/.github/workflows/build_and_deploy.yaml @@ -31,6 +31,8 @@ jobs: target: "merged-image" - image: "master-data-processor" target: "master-data-processor-image" + - image: "lakehouse-connector" + target: "lakehouse-connector-image" steps: - uses: actions/checkout@v4 with: diff --git a/Dockerfile b/Dockerfile index dced8701..fd4002be 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,10 +36,7 @@ FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as mast USER flink COPY --from=build-pipeline /app/pipeline/master-data-processor/target/master-data-processor-1.0.0.jar $FLINK_HOME/lib -FROM --platform=linux/x86_64 flink:1.15.0-scala_2.12-java11 as hudi-connector-image +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.0-scala_2.12-lakehouse as lakehouse-connector-image USER flink -COPY ./pipeline/hudi-connector/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar $FLINK_HOME/lib -COPY ./pipeline/hudi-connector/flink-s3-fs-hadoop-1.15.2.jar $FLINK_HOME/lib -COPY ./pipeline/hudi-connector/hbase-server-2.4.13.jar $FLINK_HOME/lib RUN mkdir $FLINK_HOME/custom-lib COPY ./pipeline/hudi-connector/target/hudi-connector-1.0.0.jar $FLINK_HOME/custom-lib From 2a066b87939e1fdfad3ab0c52dbdf66ab53dfe0e Mon Sep 17 00:00:00 2001 From: Ravi Mula Date: Thu, 20 Jun 2024 11:33:21 +0530 Subject: [PATCH 29/37] # Issue:52655194 Feat - Add 
processingStartTime if missing during extractor job (#76) Co-authored-by: Santhosh Vasabhaktula --- .../scala/org/sunbird/obsrv/core/model/Constants.scala | 1 + .../obsrv/extractor/functions/ExtractionFunction.scala | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala index 7c19d8e2..a6fc140f 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala @@ -5,6 +5,7 @@ object Constants { val EVENT = "event" val INVALID_JSON = "invalid_json" val OBSRV_META = "obsrv_meta" + val PROCESSING_START_TIME = "processingStartTime" val SRC = "src" val ERROR_CODE = "error_code" val ERROR_MSG = "error_msg" diff --git a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala index f1fea9fb..0e79b08c 100644 --- a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala +++ b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala @@ -46,6 +46,7 @@ class ExtractionFunction(config: ExtractorConfig) context.output(config.systemEventsOutputTag, failedSystemEvent(Some(config.defaultDatasetID), ErrorConstants.ERR_INVALID_EVENT, FunctionalError.InvalidJsonData)) return } + addStartProcessingTimeIfMissing(batchEvent) val eventAsText = JSONUtil.serialize(batchEvent) val datasetIdOpt = batchEvent.get(config.CONST_DATASET) if (datasetIdOpt.isEmpty) { @@ -79,6 +80,13 @@ class ExtractionFunction(config: ExtractorConfig) } } + private def addStartProcessingTimeIfMissing(batchEvent: mutable.Map[String, AnyRef]): Unit = { + val obsrvMeta = batchEvent(Constants.OBSRV_META).asInstanceOf[Map[String, AnyRef]] + if(!obsrvMeta.contains(Constants.PROCESSING_START_TIME)) { + batchEvent.put(Constants.OBSRV_META, obsrvMeta ++ Map("processingStartTime" -> System.currentTimeMillis())) + } + } + private def isDuplicate(dataset: Dataset, dedupKey: Option[String], event: String, context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context): Boolean = { try { From d8f840686eeca6bd71aefe5b26f734094d6272f6 Mon Sep 17 00:00:00 2001 From: GayathriSrividya Date: Wed, 19 Jun 2024 16:09:30 +0530 Subject: [PATCH 30/37] Sanketika-obsrv/issue-tracker#240 feat: lakehouse job changes to support retire workflow --- .../src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala | 2 +- .../org/sunbird/obsrv/service/DatasetRegistryService.scala | 3 ++- .../org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala | 2 +- .../main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index ee73fbe0..3aebe8bd 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -71,7 +71,7 @@ object DatasetModels { @JsonProperty("status") status: String, @JsonProperty("connector_stats") connectorStats: Option[ConnectorStats] = None) case class DataSource(@JsonProperty("id") id: String, @JsonProperty("datasource") datasource: String, 
@JsonProperty("dataset_id") datasetId: String, - @JsonProperty("type") `type`: String, @JsonProperty("ingestion_spec") ingestionSpec: String, @JsonProperty("datasource_ref") datasourceRef: String) + @JsonProperty("type") `type`: String, @JsonProperty("status") status: String, @JsonProperty("ingestion_spec") ingestionSpec: String, @JsonProperty("datasource_ref") datasourceRef: String) } diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala index 8075d508..e5206118 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala @@ -202,10 +202,11 @@ object DatasetRegistryService { val datasource = rs.getString("datasource") val datasetId = rs.getString("dataset_id") val datasourceType = rs.getString("type") + val datasourceStatus = rs.getString("status") val ingestionSpec = rs.getString("ingestion_spec") val datasourceRef = rs.getString("datasource_ref") - DataSource(id, datasource, datasetId, datasourceType, ingestionSpec, datasourceRef) + DataSource(id, datasource, datasetId, datasourceType, datasourceStatus, ingestionSpec, datasourceRef) } private def parseDatasetTransformation(rs: ResultSet): DatasetTransformation = { diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala index b57244a2..fd160820 100644 --- a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala @@ -44,7 +44,7 @@ class HudiConnectorStreamTask(config: HudiConnectorConfig, kafkaConnector: Flink def process(env: StreamExecutionEnvironment): Unit = { val schemaParser = new HudiSchemaParser() - val dataSourceConfig = DatasetRegistry.getAllDatasources().filter(f => f.`type`.nonEmpty && f.`type`.equalsIgnoreCase(Constants.DATALAKE_TYPE)) + val dataSourceConfig = DatasetRegistry.getAllDatasources().filter(f => f.`type`.nonEmpty && f.`type`.equalsIgnoreCase(Constants.DATALAKE_TYPE) && f.status.equalsIgnoreCase("Live")) dataSourceConfig.map{ dataSource => val datasetId = dataSource.datasetId val dataStream = getMapDataStream(env, config, List(datasetId), config.kafkaConsumerProperties(), consumerSourceName = s"kafka-${datasetId}", kafkaConnector) diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala index 404635a8..aa203474 100644 --- a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/util/HudiSchemaParser.scala @@ -43,7 +43,7 @@ class HudiSchemaParser { readSchema() def readSchema(): Unit = { - val datasourceConfig = DatasetRegistry.getAllDatasources().filter(f => f.`type`.nonEmpty && f.`type`.equalsIgnoreCase(Constants.DATALAKE_TYPE)) + val datasourceConfig = DatasetRegistry.getAllDatasources().filter(f => f.`type`.nonEmpty && f.`type`.equalsIgnoreCase(Constants.DATALAKE_TYPE) && f.status.equalsIgnoreCase("Live")) datasourceConfig.map{f => val hudiSchemaSpec = JSONUtil.deserialize[HudiSchemaSpec](f.ingestionSpec) val dataset = hudiSchemaSpec.dataset 
From 647bdf037356f6f22a0f2229a6389daa6e407403 Mon Sep 17 00:00:00 2001 From: GayathriSrividya Date: Wed, 19 Jun 2024 16:12:30 +0530 Subject: [PATCH 31/37] Sanketika-obsrv/issue-tracker#240 feat: master data enhancements for lakehouse --- .../main/scala/org/sunbird/obsrv/core/model/Constants.scala | 2 +- .../obsrv/router/functions/DynamicRouterFunction.scala | 2 +- pipeline/master-data-processor/pom.xml | 5 +++++ .../obsrv/pipeline/task/MasterDataProcessorStreamTask.scala | 3 +++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala index a6fc140f..466552dd 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala @@ -16,5 +16,5 @@ object Constants { val TOPIC = "topic" val MESSAGE = "message" val DATALAKE_TYPE = "datalake" - + val MASTER_DATASET_TYPE = "master-dataset" } diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala index ed50c8eb..9d40db5c 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala +++ b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala @@ -44,7 +44,7 @@ class DynamicRouterFunction(config: DruidRouterConfig) extends BaseDatasetProces event.put(config.CONST_OBSRV_META, msg(config.CONST_OBSRV_META).asInstanceOf[Map[String, AnyRef]]) val tsKeyData = TimestampKeyParser.parseTimestampKey(dataset.datasetConfig, event) event.put("indexTS", tsKeyData.value) - if (tsKeyData.isValid) { + if (tsKeyData.isValid || dataset.datasetType.equalsIgnoreCase(Constants.MASTER_DATASET_TYPE)) { val routerConfig = dataset.routerConfig val topicEventMap = mutable.Map(Constants.TOPIC -> routerConfig.topic, Constants.MESSAGE -> event) ctx.output(config.routerOutputTag, topicEventMap) diff --git a/pipeline/master-data-processor/pom.xml b/pipeline/master-data-processor/pom.xml index 370ec621..0dc1cc60 100644 --- a/pipeline/master-data-processor/pom.xml +++ b/pipeline/master-data-processor/pom.xml @@ -62,6 +62,11 @@ transformer 1.0.0 + + org.sunbird.obsrv.pipeline + druid-router + 1.0.0 + com.github.java-json-tools json-schema-validator diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala index 7527a6c9..65847bbd 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala @@ -12,6 +12,7 @@ import org.sunbird.obsrv.extractor.task.{ExtractorConfig, ExtractorStreamTask} import org.sunbird.obsrv.pipeline.function.MasterDataProcessorFunction import org.sunbird.obsrv.preprocessor.task.{PipelinePreprocessorConfig, PipelinePreprocessorStreamTask} import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask} +import org.sunbird.obsrv.router.task.{DruidRouterConfig, DynamicRouterStreamTask} import java.io.File import scala.collection.mutable @@ -50,6 +51,7 @@ class MasterDataProcessorStreamTask(config: 
Config, masterDataConfig: MasterData val preprocessorTask = new PipelinePreprocessorStreamTask(new PipelinePreprocessorConfig(config), kafkaConnector) val denormalizerTask = new DenormalizerStreamTask(new DenormalizerConfig(config), kafkaConnector) val transformerTask = new TransformerStreamTask(new TransformerConfig(config), kafkaConnector) + val routerTask = new DynamicRouterStreamTask(new DruidRouterConfig(config), kafkaConnector) val transformedStream = transformerTask.processStream( denormalizerTask.processStream( @@ -67,6 +69,7 @@ class MasterDataProcessorStreamTask(config: Config, masterDataConfig: MasterData addDefaultSinks(processedStream, masterDataConfig, kafkaConnector) processedStream.getSideOutput(masterDataConfig.successTag()) + routerTask.processStream(transformedStream) } } From 6ddb9f8a8619003095c78a323f049e8cfd8cd19a Mon Sep 17 00:00:00 2001 From: Anand Parthasarathy Date: Thu, 20 Jun 2024 13:13:12 +0530 Subject: [PATCH 32/37] Sanketika-obsrv/issue-tracker#240 fix: Add scala test to pom.xml --- pipeline/hudi-connector/pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pipeline/hudi-connector/pom.xml b/pipeline/hudi-connector/pom.xml index 5230d8eb..b47b58d3 100644 --- a/pipeline/hudi-connector/pom.xml +++ b/pipeline/hudi-connector/pom.xml @@ -124,6 +124,12 @@ + + org.scalatest + scalatest_2.12 + 3.0.6 + test + From 350607bd35bb55e80c02c11f69cf46215b7661c5 Mon Sep 17 00:00:00 2001 From: Santhosh Vasabhaktula Date: Thu, 20 Jun 2024 14:38:53 +0530 Subject: [PATCH 33/37] feat #0000 - open-sourcing transformer --- .../main/resources/master-data-processor.conf | 6 +- .../src/test/resources/test.conf | 4 +- .../sunbird/obsrv/fixture/EventFixture.scala | 7 +- ...asterDataProcessorStreamTaskTestSpec.scala | 29 +-- pipeline/pom.xml | 2 +- pipeline/transformer/pom.xml | 60 ++++- .../src/main/resources/transformer.conf | 1 + .../functions/TransformerFunction.scala | 133 ++++++++-- .../transformer/task/TransformerConfig.scala | 14 +- .../task/TransformerStreamTask.scala | 28 +-- .../types/EncryptTransformer.scala | 40 +++ .../transformer/types/ITransformer.scala | 63 +++++ .../types/JSONAtaTransformer.scala | 44 ++++ .../transformer/types/MaskTransformer.scala | 63 +++++ .../obsrv/transformer/util/CipherUtil.scala | 36 +++ .../transformer/util/ConditionEvaluator.scala | 46 ++++ .../transformer/src/test/resources/test.conf | 3 + .../obsrv/transformer/EventFixture.scala | 11 + .../TestTransformerFunctionHelper.scala | 211 ++++++++++++++++ .../TransformerStreamTestSpec.scala | 229 ++++++++++++++++++ .../pom.xml | 42 +--- .../src/main/resources/unified-pipeline.conf} | 3 +- .../task/UnifiedPipelineConfig.scala} | 6 +- .../task/UnifiedPipelineStreamTask.scala} | 24 +- .../src/test/resources/base-config.conf | 0 .../src/test/resources/test.conf | 3 +- .../obsrv/pipeline}/EventFixture.scala | 6 +- .../UnifiedPipelineStreamTaskTestSpec.scala} | 37 +-- 28 files changed, 1004 insertions(+), 147 deletions(-) create mode 100644 pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala create mode 100644 pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala create mode 100644 pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala create mode 100644 pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala create mode 100644 pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala create mode 100644 
pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala create mode 100644 pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/EventFixture.scala create mode 100644 pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala create mode 100644 pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala rename pipeline/{pipeline-merged => unified-pipeline}/pom.xml (88%) rename pipeline/{pipeline-merged/src/main/resources/merged-pipeline.conf => unified-pipeline/src/main/resources/unified-pipeline.conf} (89%) rename pipeline/{pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineConfig.scala => unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineConfig.scala} (84%) rename pipeline/{pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala => unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala} (76%) rename pipeline/{pipeline-merged => unified-pipeline}/src/test/resources/base-config.conf (100%) rename pipeline/{pipeline-merged => unified-pipeline}/src/test/resources/test.conf (93%) rename pipeline/{pipeline-merged/src/test/scala/org/sunbird/obsrv/fixture => unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline}/EventFixture.scala (98%) rename pipeline/{pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala => unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala} (82%) diff --git a/pipeline/master-data-processor/src/main/resources/master-data-processor.conf b/pipeline/master-data-processor/src/main/resources/master-data-processor.conf index 149e795b..0d3f2d89 100644 --- a/pipeline/master-data-processor/src/main/resources/master-data-processor.conf +++ b/pipeline/master-data-processor/src/main/resources/master-data-processor.conf @@ -5,16 +5,16 @@ kafka { output.raw.topic = ${job.env}".masterdata.raw" output.extractor.duplicate.topic = ${job.env}".masterdata.failed" output.failed.topic = ${job.env}".masterdata.failed" - output.batch.failed.topic = ${job.env}".masterdata.extractor.failed" + output.batch.failed.topic = ${job.env}".masterdata.failed" event.max.size = "1048576" # Max is only 1MB output.invalid.topic = ${job.env}".masterdata.failed" output.unique.topic = ${job.env}".masterdata.unique" output.duplicate.topic = ${job.env}".masterdata.failed" output.denorm.topic = ${job.env}".masterdata.denorm" output.transform.topic = ${job.env}".masterdata.transform" + output.transform.failed.topic = ${job.env}".masterdata.transform.failed" stats.topic = ${job.env}".masterdata.stats" groupId = ${job.env}"-masterdata-pipeline-group" - producer { max-request-size = 5242880 } @@ -36,4 +36,4 @@ redis { } } -dataset.type = "master-dataset" \ No newline at end of file +dataset.type = "master-dataset" diff --git a/pipeline/master-data-processor/src/test/resources/test.conf b/pipeline/master-data-processor/src/test/resources/test.conf index 2c8f0236..dfb54e4b 100644 --- a/pipeline/master-data-processor/src/test/resources/test.conf +++ b/pipeline/master-data-processor/src/test/resources/test.conf @@ -5,6 +5,7 @@ job { } kafka { + input.topic = ${job.env}".masterdata.ingest" output.raw.topic = ${job.env}".masterdata.raw" output.extractor.duplicate.topic = ${job.env}".masterdata.failed" @@ -16,6 +17,7 @@ kafka { output.duplicate.topic = 
${job.env}".masterdata.failed" output.denorm.topic = ${job.env}".masterdata.denorm" output.transform.topic = ${job.env}".masterdata.transform" + output.transform.failed.topic = ${job.env}".masterdata.transform.failed" stats.topic = ${job.env}".masterdata.stats" groupId = ${job.env}"-masterdata-pipeline-group" producer { @@ -40,4 +42,4 @@ redis { } } -dataset.type = "master-dataset" \ No newline at end of file +dataset.type = "master-dataset" diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala index e48f8120..cb5ece83 100644 --- a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala +++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala @@ -2,9 +2,8 @@ package org.sunbird.obsrv.fixture object EventFixture { - val VALID_BATCH_EVENT_D3_INSERT = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel"}]}""" - val VALID_BATCH_EVENT_D3_INSERT_2 = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel"}]}""" - val VALID_BATCH_EVENT_D3_UPDATE = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","safety":"3 Star (Global NCAP)","seatingCapacity":5}]}""" + val VALID_BATCH_EVENT_D3_INSERT = """{"dataset":"d3","id":"event1","events":[{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}]}""" + val VALID_BATCH_EVENT_D3_INSERT_2 = """{"dataset":"d3","id":"event2","events":[{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"}}]}""" + val VALID_BATCH_EVENT_D3_UPDATE = """{"dataset":"d3","id":"event3","events":[{"code":"HYUN-CRE-D6","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"safety":"3 Star (Global NCAP)","seatingCapacity":5}]}""" val VALID_BATCH_EVENT_D4 = """{"dataset":"d4","event":{"code":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}""" - val MISSING_DATA_KEY_EVENT_D4 = """{"dataset":"d5","event":{"code1":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}""" } diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala 
b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala index 575e2228..1bafb519 100644 --- a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala +++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala @@ -59,14 +59,14 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.VALID_BATCH_EVENT_D3_INSERT_2) EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.VALID_BATCH_EVENT_D4) EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.VALID_BATCH_EVENT_D3_UPDATE) - EmbeddedKafka.publishStringMessageToKafka(config.getString("kafka.input.topic"), EventFixture.MISSING_DATA_KEY_EVENT_D4) flinkCluster.before() } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("insert into datasets(id, type, extraction_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d3', 'master-dataset', '{\"is_batch_event\": true, \"extraction_key\": \"events\"}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata.ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":3}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d4', 'master-dataset', '{\"topic\":\"d4-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata-ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":4}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d5', 'master-dataset', '{\"topic\":\"d4-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"masterdata-ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+",\"redis_db\":4}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, extraction_config, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) VALUES ('d3', 'master-dataset', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"is_batch_event\":true,\"extraction_key\":\"events\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"}, \"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d3-events\"}', 
'{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":3,\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+"}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) VALUES ('d4', 'master-dataset', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"},\"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"seatingCapacity\": {\"type\": \"integer\"}, \"safety\": {\"type\": \"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d34-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":4,\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+"}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO dataset_transformations (id, dataset_id, field_key, transformation_function, status, created_by, updated_by, created_date, updated_date) VALUES ('tf3', 'd3', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO dataset_transformations (id, dataset_id, field_key, transformation_function, status, created_by, updated_by, created_date, updated_date) VALUES ('tf4', 'd3', 'dealer.locationId', '{\"type\":\"encrypt\",\"expr\":\"dealer.locationId\"}', 'Live', 'System', 'System', now(), now());") } override def afterAll(): Unit = { @@ -90,7 +90,7 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry } "MasterDataProcessorStreamTaskTestSpec" should "validate the entire master data pipeline" in { - + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(masterDataConfig) val task = new MasterDataProcessorStreamTask(config, masterDataConfig, kafkaConnector) task.process(env) @@ -98,10 +98,10 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry env.execute(masterDataConfig.jobName) } - val sysEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 8, timeout = 30.seconds) - sysEvents.size should be(8) + val input = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 7, timeout = 30.seconds) + input.size should be (7) - sysEvents.foreach(se => { + input.foreach(se => { val event = JSONUtil.deserialize[SystemEvent](se) val error = event.data.error if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) @@ -118,12 +118,9 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry event.ctx.dataset_type should be(Some("master-dataset")) }) - val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](masterDataConfig.kafkaFailedTopic, 1, timeout = 30.seconds) - failedEvents.size should be(1) - val mutableMetricsMap = mutable.Map[String, Long](); 
BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) - Console.println("### MasterDataProcessorStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + Console.println("### MasterDataProcessorStreamTaskTestSpec:metrics ###", getPrintableMetrics(mutableMetricsMap)) masterDataConfig.successTag().getId should be ("processing_stats") @@ -137,22 +134,18 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry mutableMetricsMap(s"${masterDataConfig.jobName}.d4.${masterDataConfig.successInsertCount}") should be(1) mutableMetricsMap(s"${masterDataConfig.jobName}.d4.${masterDataConfig.successUpdateCount}") should be(0) - mutableMetricsMap(s"${masterDataConfig.jobName}.d5.${masterDataConfig.totalEventCount}") should be(1) - mutableMetricsMap(s"${masterDataConfig.jobName}.d5.${masterDataConfig.eventFailedMetricsCount}") should be(1) - val redisConnection = new RedisConnect(masterDataConfig.redisHost, masterDataConfig.redisPort, masterDataConfig.redisConnectionTimeout) val jedis1 = redisConnection.getConnection(3) val event1 = jedis1.get("HYUN-CRE-D6") - event1 should be ("""{"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic","seatingCapacity":5,"safety":"3 Star (Global NCAP)"}""") + event1 should be ("""{"dealer":{"email":"jo*****e@example.com","locationId":"ym4iT6lWXt+Y2gEdBldeiw=="},"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic","seatingCapacity":5,"safety":"3 Star (Global NCAP)"}""") val event3 = jedis1.get("HYUN-TUC-D6") - event3 should be ("""{"model":"Tucson","price":"4000000","variant":"Signature","fuel":"Diesel","code":"HYUN-TUC-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"}""") + event3 should be ("""{"dealer":{"email":"ad*******n@gmail.com","locationId":"kJ7mH49gjWHeoM1w+ex9kQ=="},"model":"Tucson","price":"4000000","variant":"Signature","fuel":"Diesel","code":"HYUN-TUC-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"}""") jedis1.close() val jedis2 = redisConnection.getConnection(4) val event2 = jedis2.get("JEEP-CP-D3") event2 should be ("""{"model":"Compass","price":"3800000","variant":"Model S (O) Diesel 4x4 AT","fuel":"Diesel","seatingCapacity":5,"code":"JEEP-CP-D3","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Jeep","safety":"5 Star (Euro NCAP)","modelYear":"2023","transmission":"automatic"}""") jedis2.close() - } diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 220ebff4..1bfcb9ce 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -21,7 +21,7 @@ denormalizer transformer druid-router - pipeline-merged + unified-pipeline master-data-processor hudi-connector diff --git a/pipeline/transformer/pom.xml b/pipeline/transformer/pom.xml index b695a812..ba2d8f87 100644 --- a/pipeline/transformer/pom.xml +++ b/pipeline/transformer/pom.xml @@ -41,6 +41,33 @@ dataset-registry 1.0.0 + + org.json4s + json4s-native_${scala.maj.version} + 4.0.6 + + + com.ibm.jsonata4java + JSONata4Java + 2.2.6 + + + com.fasterxml.jackson.core + jackson-databind + + + + + com.github.bancolombia + data-mask-core + 1.0.1 
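// --- Illustrative sketch, not part of the patch ---------------------------------------------
// The mask/encrypt rows inserted into dataset_transformations in the test setup above are what
// produce the masked email and encrypted locationId asserted against Redis. A minimal sketch of
// how such rows are modelled and applied through TransformerFunctionHelper (introduced later in
// this patch). Ids and values are made up; the encrypt step assumes the Postgres-backed
// SystemConfig "encryptionSecretKey" is available, as it is in the test specs.
import org.json4s.native.JsonMethods.parse
import org.sunbird.obsrv.model.DatasetModels.{DatasetTransformation, TransformationFunction}
import org.sunbird.obsrv.model.TransformMode
import org.sunbird.obsrv.transformer.functions.TransformerFunctionHelper

object TransformationSketch {
  def main(args: Array[String]): Unit = {
    val event = parse("""{"dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}""")
    val transformations = List(
      // Strict (default) mode: a failed field marks the whole event as failed
      DatasetTransformation("tf3", "d3", "dealer.email", TransformationFunction("mask", None, "dealer.email"), "Live"),
      // Lenient mode: a failed field only degrades the event to "partial"
      DatasetTransformation("tf4", "d3", "dealer.locationId", TransformationFunction("encrypt", None, "dealer.locationId"), "Live", Some(TransformMode.Lenient))
    )
    val result = TransformerFunctionHelper.processTransformations(event, Some(transformations))
    println(result.status)      // success | partial | failed | skipped
    println(result.fieldStatus) // per-field outcome, including mode and error (if any)
  }
}
// ---------------------------------------------------------------------------------------------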
+ + + com.fasterxml.jackson.core + jackson-databind + + + org.sunbird.obsrv framework @@ -48,6 +75,25 @@ test-jar test + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + + + org.apache.kafka + kafka-clients + ${kafka.version} + test + + + org.apache.kafka + kafka_${scala.maj.version} + ${kafka.version} + test + org.apache.flink flink-test-utils @@ -61,6 +107,18 @@ test tests + + io.github.embeddedkafka + embedded-kafka_2.12 + 3.4.0 + test + + + io.zonky.test + embedded-postgres + 2.0.3 + test + com.github.codemonstur embedded-redis @@ -143,7 +201,7 @@ - reference.conf + transformer.conf diff --git a/pipeline/transformer/src/main/resources/transformer.conf b/pipeline/transformer/src/main/resources/transformer.conf index b7adb850..42fbb22f 100644 --- a/pipeline/transformer/src/main/resources/transformer.conf +++ b/pipeline/transformer/src/main/resources/transformer.conf @@ -3,6 +3,7 @@ include "baseconfig.conf" kafka { input.topic = ${job.env}".denorm" output.transform.topic = ${job.env}".transform" + output.transform.failed.topic = ${job.env}".transform.failed" groupId = ${job.env}"-transformer-group" producer { max-request-size = 5242880 diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala index fb0da96c..94a8c80f 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/functions/TransformerFunction.scala @@ -1,41 +1,142 @@ package org.sunbird.obsrv.transformer.functions -import org.apache.flink.api.common.typeinfo.TypeInformation +import com.fasterxml.jackson.databind.ObjectMapper +import org.sunbird.obsrv.transformer.task.TransformerConfig +import org.sunbird.obsrv.transformer.types._ import org.apache.flink.streaming.api.functions.ProcessFunction -import org.sunbird.obsrv.core.model.Producer +import org.json4s._ +import org.json4s.native.JsonMethods._ +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.exception.ObsrvException +import org.sunbird.obsrv.core.model.Models._ +import org.sunbird.obsrv.core.model.StatusCode.StatusCode +import org.sunbird.obsrv.core.model._ import org.sunbird.obsrv.core.streaming.Metrics -import org.sunbird.obsrv.model.DatasetModels.Dataset +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetTransformation} +import org.sunbird.obsrv.model.TransformMode import org.sunbird.obsrv.registry.DatasetRegistry import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction -import org.sunbird.obsrv.transformer.task.TransformerConfig import scala.collection.mutable -class TransformerFunction(config: TransformerConfig)(implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]]) - extends BaseDatasetProcessFunction(config) { +case class TransformationStatus(resultJson: JValue, status: StatusCode, fieldStatus: List[TransformFieldStatus]) + +class TransformerFunction(config: TransformerConfig) extends BaseDatasetProcessFunction(config) { + + private[this] val logger = LoggerFactory.getLogger(classOf[TransformerFunction]) override def getMetrics(): List[String] = { - List(config.totalEventCount, config.transformSuccessCount, config.transformFailedCount, config.transformSkippedCount) + List(config.totalEventCount, config.transformSuccessCount, config.transformPartialCount, 
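// --- Illustrative sketch, not part of the patch ---------------------------------------------
// Why the data-mask-core dependency above is being added: MaskTransformer (further below) keeps
// roughly 35% of the characters visible, split between the start and the end, and masks e-mail
// addresses with MaskUtils.maskAsEmail. A self-contained sketch of that rule; the sample values
// and expected outputs are the ones asserted in the test specs of this patch.
import co.com.bancolombia.datamask.MaskUtils
import java.util.regex.Pattern

object MaskSketch {
  private val maskRatio = 0.35
  private val emailPattern = Pattern.compile("^(.+)@(\\S+)$")

  def mask(value: String): String = {
    if (value.isEmpty) value
    else if (emailPattern.matcher(value).matches()) MaskUtils.maskAsEmail(value)
    else {
      val openDigits = (value.length * maskRatio).ceil  // characters left visible
      val firstDigitCount = (openDigits / 2).floor      // visible at the start
      val lastDigitCount = openDigits - firstDigitCount // visible at the end
      MaskUtils.mask(value, firstDigitCount.intValue(), lastDigitCount.intValue())
    }
  }

  def main(args: Array[String]): Unit = {
    println(mask("john.doe@example.com")) // jo*****e@example.com
    println(mask("123456"))               // 1***56
  }
}
// ---------------------------------------------------------------------------------------------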
config.transformFailedCount, config.transformSkippedCount) } - /** * Method to process the event transformations */ - override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], - context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, - metrics: Metrics): Unit = { + override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { + implicit val jsonFormats: Formats = DefaultFormats.withLong + val result = TransformerFunctionHelper.processTransformation(dataset, msg, config) metrics.incCounter(dataset.id, config.totalEventCount) + msg.put(config.CONST_EVENT, result.resultJson.extract[Map[String, AnyRef]]) + result.status match { + case StatusCode.skipped => + metrics.incCounter(dataset.id, config.transformSkippedCount) + context.output(config.transformerOutputTag, markSkipped(msg, Producer.transformer)) + case StatusCode.failed => + metrics.incCounter(dataset.id, config.transformFailedCount) + context.output(config.transformerFailedOutputTag, markFailed(msg, ErrorConstants.ERR_TRANSFORMATION_FAILED, Producer.transformer)) + logSystemEvents(dataset, result, context) + case StatusCode.partial => + metrics.incCounter(dataset.id, config.transformPartialCount) + context.output(config.transformerOutputTag, markPartial(msg, Producer.transformer)) + logSystemEvents(dataset, result, context) + case StatusCode.success => + metrics.incCounter(dataset.id, config.transformSuccessCount) + context.output(config.transformerOutputTag, markSuccess(msg, Producer.transformer)) + } + } + + private def logSystemEvents(dataset: Dataset, result: TransformationStatus, ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context): Unit = { + result.fieldStatus.filter(p => !p.success).groupBy(f => f.error.get).map(f => (f._1, f._2.size)) + .foreach(errCount => { + val err = errCount._1 + val functionalError = err match { + case ErrorConstants.INVALID_EXPR_FUNCTION => FunctionalError.TransformParseError + case ErrorConstants.ERR_EVAL_EXPR_FUNCTION => FunctionalError.TransformEvalError + case ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION => FunctionalError.TransformFailedError + case ErrorConstants.TRANSFORMATION_FIELD_MISSING => FunctionalError.TransformFieldMissing + } + + ctx.output(config.systemEventsOutputTag, JSONUtil.serialize(SystemEvent( + EventID.METRIC, + ctx = ContextData(module = ModuleID.processing, pdata = PData(config.jobName, PDataType.flink, Some(Producer.denorm)), dataset = Some(dataset.id), dataset_type = Some(dataset.datasetType)), + data = EData(error = Some(ErrorLog(pdata_id = Producer.denorm, pdata_status = StatusCode.failed, error_type = functionalError, error_code = err.errorCode, error_message = err.errorMsg, error_level = ErrorLevel.critical, error_count = Some(errCount._2)))) + ))) + }) + + logger.warn(s"Transformer | Transform operation is not successful | dataset=${dataset.id} | TransformStatusData=${JSONUtil.serialize(result.fieldStatus)}") + } + +} + +object TransformerFunctionHelper { + + implicit val jsonFormats: Formats = DefaultFormats.withLong + private val mapper = new ObjectMapper() + + implicit class JsonHelper(json: JValue) { + def customExtract[T](path: String)(implicit mf: Manifest[T]): T = { + path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T] + } + } + + @throws[ObsrvException] + def processTransformation(dataset: 
Dataset, msg: mutable.Map[String, AnyRef], config: TransformerConfig): TransformationStatus = { + + val event = JSONUtil.serialize(msg(config.CONST_EVENT)) + val json = parse(event, useBigIntForLong = false) val datasetTransformations = DatasetRegistry.getDatasetTransformations(dataset.id) + processTransformations(json, datasetTransformations) + } + + def processTransformations(json: JValue, datasetTransformations: Option[List[DatasetTransformation]]): TransformationStatus = { if (datasetTransformations.isDefined) { - // TODO: Perform transformations - metrics.incCounter(dataset.id, config.transformSuccessCount) - context.output(config.transformerOutputTag, markSuccess(msg, Producer.transformer)) + val result = applyTransformations(json, datasetTransformations.get) + TransformationStatus(json merge result.json, getStatus(result.fieldStatus), result.fieldStatus) } else { - metrics.incCounter(dataset.id, config.transformSkippedCount) - context.output(config.transformerOutputTag, markSkipped(msg, Producer.transformer)) + TransformationStatus(json, StatusCode.skipped, List[TransformFieldStatus]()) } } + private def getStatus(fieldStatus: List[TransformFieldStatus]): StatusCode = { + val failedCount = fieldStatus.count(p => p.mode == TransformMode.Strict && !p.success) + val partialCount = fieldStatus.count(p => p.mode == TransformMode.Lenient && !p.success) + if (failedCount > 0) StatusCode.failed else if (partialCount > 0) StatusCode.partial else StatusCode.success + + } + + private def applyTransformations(json: JValue, datasetTransformations: List[DatasetTransformation]): TransformationResult = { + datasetTransformations.groupBy(f => f.transformationFunction.`type`).mapValues(f => { + applyTransformation(f.head.transformationFunction.`type`, json, f) + }).values.reduceLeft((a, b) => TransformationResult(mergeJson(a, b), mergeStatus(a, b))) + } + + private def mergeJson(a: TransformationResult, b: TransformationResult): JValue = { + a.json merge b.json + } + + private def mergeStatus(a: TransformationResult, b: TransformationResult): List[TransformFieldStatus] = { + a.fieldStatus ++ b.fieldStatus + } + + private def applyTransformation(tfType: String, json: JValue, dt: List[DatasetTransformation]): TransformationResult = { + val jsonNode = mapper.readTree(compact(render(json))) + tfType match { + case "mask" => MaskTransformer.transform(json, jsonNode, dt) + case "jsonata" => JSONAtaTransformer.transform(json, jsonNode, dt) + case "encrypt" => EncryptTransformer.transform(json, jsonNode, dt) + case _ => TransformationResult(json, List[TransformFieldStatus]()) + } + } } \ No newline at end of file diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala index 797b3e56..b7766448 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerConfig.scala @@ -17,22 +17,28 @@ class TransformerConfig(override val config: Config) extends BaseJobConfig[mutab // Metric List val totalEventCount = "transform-total-count" val transformSuccessCount = "transform-success-count" + val transformPartialCount = "transform-partial-count" val transformFailedCount = "transform-failed-count" val transformSkippedCount = "transform-skipped-count" + private val kafkaInputTopic: String = config.getString("kafka.input.topic") val 
kafkaTransformTopic: String = config.getString("kafka.output.transform.topic") + val kafkaTransformFailedTopic: String = config.getString("kafka.output.transform.failed.topic") val transformerFunction = "transformer-function" val transformerProducer = "transformer-producer" + val transformerFailedProducer = "transformer-failed-producer" - private val TRANSFORMER_OUTPUT_TAG = "transformed-events" - val transformerOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](TRANSFORMER_OUTPUT_TAG) + private val TRANSFORMER_EVENTS = "transformed-events" + private val TRANSFORMER_FAILED_EVENTS = "transformed_failed-events" + val transformerOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](TRANSFORMER_EVENTS) + val transformerFailedOutputTag: OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]](TRANSFORMER_FAILED_EVENTS) - override def inputTopic(): String = config.getString("kafka.input.topic") + override def inputTopic(): String = kafkaInputTopic override def inputConsumer(): String = "transformer-consumer" override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = transformerOutputTag override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") -} +} \ No newline at end of file diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala index 71e86581..1bd31fb1 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala @@ -1,59 +1,59 @@ package org.sunbird.obsrv.transformer.task import com.typesafe.config.ConfigFactory -import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.api.java.typeutils.TypeExtractor import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.streaming.api.datastream.DataStream import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} import org.sunbird.obsrv.core.util.FlinkUtil +import org.sunbird.obsrv.model.TransformType import org.sunbird.obsrv.transformer.functions.TransformerFunction import java.io.File import scala.collection.mutable -/** - * - */ class TransformerStreamTask(config: TransformerConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { private val serialVersionUID = -7729362727131516112L - implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster def process(): Unit = { - implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) - val dataStream = getMapDataStream(env, config, kafkaConnector) - processStream(dataStream) + process(env) env.execute(config.jobName) } // $COVERAGE-ON$ + def process(env: StreamExecutionEnvironment): Unit = { + val dataStream = getMapDataStream(env, config, kafkaConnector) + processStream(dataStream) + } + override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { + val transformedStream = 
dataStream.process(new TransformerFunction(config)).name(config.transformerFunction).uid(config.transformerFunction) .setParallelism(config.downstreamOperatorsParallelism) transformedStream.getSideOutput(config.transformerOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaTransformTopic)) .name(config.transformerProducer).uid(config.transformerProducer).setParallelism(config.downstreamOperatorsParallelism) + transformedStream.getSideOutput(config.transformerFailedOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaTransformFailedTopic)) + .name(config.transformerFailedProducer).uid(config.transformerFailedProducer).setParallelism(config.downstreamOperatorsParallelism) addDefaultSinks(transformedStream, config, kafkaConnector) - transformedStream.getSideOutput(config.successTag()) + transformedStream.getSideOutput(config.transformerOutputTag) } } // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster object TransformerStreamTask { - def main(args: Array[String]): Unit = { val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) val config = configFilePath.map { path => ConfigFactory.parseFile(new File(path)).resolve() }.getOrElse(ConfigFactory.load("transformer.conf").withFallback(ConfigFactory.systemEnvironment())) - val extractorConfig = new TransformerConfig(config) - val kafkaUtil = new FlinkKafkaConnector(extractorConfig) - val task = new TransformerStreamTask(extractorConfig, kafkaUtil) + val transformerConfig = new TransformerConfig(config) + val kafkaUtil = new FlinkKafkaConnector(transformerConfig) + val task = new TransformerStreamTask(transformerConfig, kafkaUtil) task.process() } } diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala new file mode 100644 index 00000000..4cd37623 --- /dev/null +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala @@ -0,0 +1,40 @@ +package org.sunbird.obsrv.transformer.types + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.MissingNode +import org.sunbird.obsrv.transformer.functions.TransformerFunctionHelper.JsonHelper +import org.sunbird.obsrv.transformer.util.CipherUtil +import org.json4s.{JValue, MappingException} +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation + +class EncryptTransformer extends ITransformer { + + private val logger = LoggerFactory.getLogger(classOf[EncryptTransformer]) + + override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) = { + val emptyNode = getJSON(dt.fieldKey, MissingNode.getInstance()) + try { + val currentValue = json.customExtract[String](dt.transformationFunction.expr) + val encryptedValue = CipherUtil.encrypt(currentValue) + (getJSON(dt.fieldKey, encryptedValue), TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = true, dt.mode.get)) + } catch { + case ex: MappingException => + logger.error(s"Transformer(Encrypt) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex.getMessage}", ex) + (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, 
success = false, dt.mode.get, Some(ErrorConstants.TRANSFORMATION_FIELD_MISSING))) + } + } + +} + +object EncryptTransformer { + + private val encryptTransformer = new EncryptTransformer() + + def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetTransformation]): TransformationResult = { + encryptTransformer.transform(json, jsonNode, dtList) + } + +} \ No newline at end of file diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala new file mode 100644 index 00000000..2da86478 --- /dev/null +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala @@ -0,0 +1,63 @@ +package org.sunbird.obsrv.transformer.types + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.JsonNodeType +import org.sunbird.obsrv.transformer.util.ConditionEvaluator +import org.json4s.{JNothing, JObject, JValue} +import org.json4s.native.JsonMethods.parse +import org.sunbird.obsrv.core.model.ErrorConstants.Error +import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation +import org.sunbird.obsrv.model.TransformMode.TransformMode + +import scala.collection.mutable.ListBuffer + +case class TransformFieldStatus(fieldKey: String, expr: String, success: Boolean, mode: TransformMode, error: Option[Error] = None) +case class ConditionStatus(expr: String, success: Boolean, mode: Option[TransformMode] = None, error: Option[Error] = None) +case class TransformationResult(json: JValue, fieldStatus: List[TransformFieldStatus]) +abstract class ITransformer[T] { + + def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) + + def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetTransformation]): TransformationResult = { + val resultBuffer = ListBuffer[TransformFieldStatus]() + val evalList = dtList.map(dt => { + val conditionStatus = ConditionEvaluator.evalCondition(dt.datasetId, jsonNode, dt.transformationFunction.condition, dt.mode) + if (!conditionStatus.success) { + resultBuffer.append(TransformFieldStatus(dt.fieldKey, conditionStatus.expr, success = false, dt.mode.get, conditionStatus.error)) + JObject(dt.fieldKey -> JNothing) + } else { + val result = transformField(json, jsonNode, dt) + resultBuffer.append(result._2) + result._1 + } + }) + val transformedJson = evalList.reduceLeftOption((a, b) => a merge b).getOrElse(JNothing) + TransformationResult(transformedJson, resultBuffer.toList) + } + + def getJSON(key: String, value: String): JValue = { + val path = key.split('.').toList ++ List(s""""$value"""") + val outPath = path.reduceRight((a, b) => s"""{"$a":$b}""") + parse(outPath, useBigIntForLong = false) + } + + def getJSON(key: String, value: AnyRef): JValue = { + val path = key.split('.').toList ++ List(s"""$value""") + val outPath = path.reduceRight((a, b) => s"""{"$a":$b}""") + parse(outPath, useBigIntForLong = false) + } + + def getJSON(key: String, value: JsonNode): JValue = { + Option(value).map { jsonNodeValue => + jsonNodeValue.getNodeType match { + case JsonNodeType.STRING => getJSON(key, jsonNodeValue.textValue()) + case JsonNodeType.NUMBER => getJSON(key, jsonNodeValue.numberValue().asInstanceOf[AnyRef]) + case JsonNodeType.BOOLEAN => getJSON(key, jsonNodeValue.booleanValue().asInstanceOf[AnyRef]) + case JsonNodeType.ARRAY => getJSON(key, jsonNodeValue.toString.asInstanceOf[AnyRef]) + case 
JsonNodeType.OBJECT => getJSON(key, jsonNodeValue.toString.asInstanceOf[AnyRef]) + case _ => getJSON(key, null.asInstanceOf[AnyRef]) + } + }.getOrElse(getJSON(key, null.asInstanceOf[AnyRef])) + } + +} \ No newline at end of file diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala new file mode 100644 index 00000000..ee7a4858 --- /dev/null +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala @@ -0,0 +1,44 @@ +package org.sunbird.obsrv.transformer.types + +import com.api.jsonata4java.expressions.{EvaluateException, Expressions, ParseException} +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.MissingNode +import org.json4s.JValue +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels + +class JSONAtaTransformer extends ITransformer { + + private val logger = LoggerFactory.getLogger(classOf[JSONAtaTransformer]) + + override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetModels.DatasetTransformation): (JValue, TransformFieldStatus) = { + val emptyNode = getJSON(dt.fieldKey, MissingNode.getInstance()) + try { + val expr = Expressions.parse(dt.transformationFunction.expr) + val resNode = expr.evaluate(jsonNode) + (Option(resNode).map { node => getJSON(dt.fieldKey, node) }.getOrElse(emptyNode), TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = true, dt.mode.get)) + } catch { + case ex1: ParseException => + logger.error(s"Transformer(JSONATA) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex1.getMessage}", ex1) + (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.INVALID_EXPR_FUNCTION))) + case ex2: EvaluateException => + logger.error(s"Transformer(JSONATA) | Exception evaluating transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex2.getMessage}", ex2) + (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.ERR_EVAL_EXPR_FUNCTION))) + case ex3: Exception => + logger.error(s"Transformer(JSONATA) | Unknown error | Data=${JSONUtil.serialize(dt)} | error=${ex3.getMessage}", ex3) + (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION))) + } + } +} + +object JSONAtaTransformer { + + private val jsonAtaTransformer = new JSONAtaTransformer() + + def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetModels.DatasetTransformation]): TransformationResult = { + jsonAtaTransformer.transform(json, jsonNode, dtList) + } + +} \ No newline at end of file diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala new file mode 100644 index 00000000..8df5f889 --- /dev/null +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala @@ -0,0 +1,63 @@ +package org.sunbird.obsrv.transformer.types + +import co.com.bancolombia.datamask.{MaskUtils => CustomMaskUtils} +import com.fasterxml.jackson.databind.JsonNode 
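// --- Illustrative sketch, not part of the patch ---------------------------------------------
// What the getJSON helpers above produce: a dotted fieldKey such as "dealer.email" is expanded
// into a nested object holding the transformed value, which processTransformations then merges
// back into the source event, so only that path is overwritten. Sample values come from the
// test fixtures of this patch.
import org.json4s._
import org.json4s.native.JsonMethods.{compact, parse, render}

object FieldKeyMergeSketch {
  def main(args: Array[String]): Unit = {
    val event = parse("""{"dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"code":"HYUN-CRE-D6"}""")

    // Same expansion as getJSON("dealer.email", "jo*****e@example.com")
    val path = "dealer.email".split('.').toList ++ List(""""jo*****e@example.com"""")
    val fragment = parse(path.reduceRight((a, b) => s"""{"$a":$b}"""))

    // dealer.email is overwritten, everything else is preserved:
    // {"dealer":{"email":"jo*****e@example.com","locationId":"KUN12345"},"code":"HYUN-CRE-D6"}
    println(compact(render(event merge fragment)))
  }
}
// ---------------------------------------------------------------------------------------------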
+import com.fasterxml.jackson.databind.node.MissingNode +import org.json4s._ +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation + +import java.util.regex.Pattern + +class MaskTransformer extends ITransformer[String] { + + implicit val jsonFormats: Formats = DefaultFormats.withLong + private val logger = LoggerFactory.getLogger(classOf[EncryptTransformer]) + + implicit class JsonHelper(json: JValue) { + def customExtract[T](path: String)(implicit mf: Manifest[T]): T = { + path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T] + } + } + + private val maskRatio = 0.35 // TODO: Move it to a config + private val emailPattern = Pattern.compile("^(.+)@(\\S+)$") // TODO: Read the pattern from config + + private def mask(value: String): String = { + if (value.isEmpty) return value + if (emailPattern.matcher(value).matches()) { + CustomMaskUtils.maskAsEmail(value) + } else { + val openDigits = (value.length * maskRatio).ceil + val firstDigitCount = (openDigits / 2).floor + val lastDigitCount = openDigits - firstDigitCount + CustomMaskUtils.mask(value, firstDigitCount.intValue(), lastDigitCount.intValue()) + } + } + + override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) = { + val emptyNode = getJSON(dt.fieldKey, MissingNode.getInstance()) + try { + val currentValue = json.customExtract[String](dt.transformationFunction.expr) + val maskedValue = mask(currentValue).replaceAll("\"", "") + (getJSON(dt.fieldKey, maskedValue), TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = true, dt.mode.get)) + } catch { + case ex: MappingException => + logger.error(s"Transformer(Mask) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex.getMessage}", ex) + (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.TRANSFORMATION_FIELD_MISSING))) + } + } + +} + +object MaskTransformer { + + private val maskingTransformer = new MaskTransformer() + + def transform(json: JValue, jsonNode: JsonNode, dtList: List[DatasetTransformation]): TransformationResult = { + maskingTransformer.transform(json, jsonNode, dtList) + } + +} \ No newline at end of file diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala new file mode 100644 index 00000000..3f7a93e0 --- /dev/null +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala @@ -0,0 +1,36 @@ +package org.sunbird.obsrv.transformer.util + +import org.sunbird.obsrv.core.model.SystemConfig + +import java.util.Base64 +import javax.crypto.Cipher +import javax.crypto.spec.SecretKeySpec + +object CipherUtil { + + private val algorithm = "AES" + + private val encryptInstance = getInstance(Cipher.ENCRYPT_MODE) + + private val decryptInstance = getInstance(Cipher.DECRYPT_MODE) + + def encrypt(value: String): String = { + if (value.isEmpty) return value + val encryptedByteValue = encryptInstance.doFinal(value.getBytes("utf-8")) + Base64.getEncoder.encodeToString(encryptedByteValue) + } + + def decrypt(value: String): String = { + val decryptedValue64 = Base64.getDecoder.decode(value) + val decryptedByteValue = 
decryptInstance.doFinal(decryptedValue64) + new String(decryptedByteValue, "utf-8") + } + + private def getInstance(mode: Int): Cipher = { + val cipher = Cipher.getInstance(algorithm) + val key = new SecretKeySpec(SystemConfig.getString("encryptionSecretKey").getBytes("utf-8"), algorithm) + cipher.init(mode, key) + cipher + } + +} diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala new file mode 100644 index 00000000..ee6a18a4 --- /dev/null +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala @@ -0,0 +1,46 @@ +package org.sunbird.obsrv.transformer.util + +import com.api.jsonata4java.expressions.{EvaluateException, Expressions, ParseException} +import com.fasterxml.jackson.databind.JsonNode +import org.sunbird.obsrv.transformer.types.ConditionStatus +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.Condition +import org.sunbird.obsrv.model.TransformMode.TransformMode + +object ConditionEvaluator { + + private val logger = LoggerFactory.getLogger(ConditionEvaluator.getClass) + + def evalCondition(datasetId: String, json: JsonNode, condition: Option[Condition], mode: Option[TransformMode]): ConditionStatus = { + if(condition.isDefined) { + condition.get.`type` match { + case "jsonata" => evalJSONAtaCondition(datasetId, json, condition.get, mode) + case _ => ConditionStatus("", success = false, mode, Some(ErrorConstants.NO_IMPLEMENTATION_FOUND)) + } + } else { + ConditionStatus("", success = true, mode) + } + } + + private def evalJSONAtaCondition(datasetId: String, json: JsonNode, condition: Condition, mode: Option[TransformMode]): ConditionStatus = { + try { + val expr = Expressions.parse(condition.expr) + val resultNode = expr.evaluate(json) + val result = resultNode.isBoolean && resultNode.asBoolean() + ConditionStatus(condition.expr, result, mode) + } catch { + case ex1: ParseException => + logger.error(s"Transformer(ConditionEvaluator) | Exception parsing condition expression | dataset=$datasetId | ConditionData=${JSONUtil.serialize(condition)} | error=${ex1.getMessage}", ex1) + ConditionStatus(condition.expr, success = false, mode, Some(ErrorConstants.INVALID_EXPR_FUNCTION)) + case ex2: EvaluateException => + logger.error(s"Transformer(ConditionEvaluator) | Exception evaluating condition expression | dataset=$datasetId | ConditionData=${JSONUtil.serialize(condition)} | error=${ex2.getMessage}", ex2) + ConditionStatus(condition.expr, success = false, mode, Some(ErrorConstants.ERR_EVAL_EXPR_FUNCTION)) + case ex3: Exception => + logger.error(s"Transformer(ConditionEvaluator) | Unknown error during condition evaluation | dataset=$datasetId | ConditionData=${JSONUtil.serialize(condition)} | error=${ex3.getMessage}", ex3) + ConditionStatus(condition.expr, success = false, mode, Some(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION)) + } + } + +} diff --git a/pipeline/transformer/src/test/resources/test.conf b/pipeline/transformer/src/test/resources/test.conf index f1091415..1098ba64 100644 --- a/pipeline/transformer/src/test/resources/test.conf +++ b/pipeline/transformer/src/test/resources/test.conf @@ -1,8 +1,11 @@ include "base-test.conf" kafka { + producer.broker-servers = "localhost:9093" + consumer.broker-servers = "localhost:9093" input.topic = "flink.denorm" 
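// --- Illustrative sketch, not part of the patch ---------------------------------------------
// The AES round trip performed by CipherUtil above, written against the standard JDK APIs only.
// The real key comes from SystemConfig.getString("encryptionSecretKey"); the 16-character key
// below is a made-up stand-in so the sketch is self-contained (AES needs a 16/24/32 byte key).
import java.util.Base64
import javax.crypto.Cipher
import javax.crypto.spec.SecretKeySpec

object CipherSketch {
  private val key = new SecretKeySpec("0123456789abcdef".getBytes("utf-8"), "AES")

  private def cipher(mode: Int): Cipher = {
    // With the default JCE provider, "AES" resolves to AES/ECB/PKCS5Padding, as in CipherUtil
    val c = Cipher.getInstance("AES")
    c.init(mode, key)
    c
  }

  def main(args: Array[String]): Unit = {
    val encrypted = Base64.getEncoder.encodeToString(cipher(Cipher.ENCRYPT_MODE).doFinal("KUN12345".getBytes("utf-8")))
    val decrypted = new String(cipher(Cipher.DECRYPT_MODE).doFinal(Base64.getDecoder.decode(encrypted)), "utf-8")
    println(s"$encrypted -> $decrypted") // decrypts back to KUN12345
  }
}
// ---------------------------------------------------------------------------------------------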
output.transform.topic = "flink.transform" + output.transform.failed.topic = "flink.transform.failed" groupId = "flink-transformer-group" producer { max-request-size = 5242880 diff --git a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/EventFixture.scala b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/EventFixture.scala new file mode 100644 index 00000000..a4f48246 --- /dev/null +++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/EventFixture.scala @@ -0,0 +1,11 @@ +package org.sunbird.obsrv.transformer + +object EventFixture { + + val SUCCESS_TRANSFORM = """{"dataset":"d1","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val FAILED_TRANSFORM = """{"dataset":"d1","event":{"id":"1235","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val PARTIAL_TRANSFORM = """{"dataset":"d2","event":{"id":"1235","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val SKIPPED_TRANSFORM = """{"dataset":"d3","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + val FAILED_TRANSFORM_2 = """{"dataset":"d4","event":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" + +} \ No newline at end of file diff --git a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala new file mode 100644 index 00000000..197cf614 --- /dev/null +++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala @@ -0,0 +1,211 @@ +package org.sunbird.obsrv.transformer + +import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper} +import org.json4s._ +import org.json4s.native.JsonMethods._ +import org.scalatest.Matchers +import org.sunbird.obsrv.core.model.{ErrorConstants, StatusCode} +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.{Condition, DatasetTransformation, TransformationFunction} +import org.sunbird.obsrv.model.TransformMode +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry +import org.sunbird.obsrv.transformer.functions.TransformerFunctionHelper +import org.sunbird.obsrv.transformer.util.{CipherUtil, ConditionEvaluator} +import org.sunbird.obsrv.transformer.types._ + +class TestTransformerFunctionHelper extends BaseSpecWithDatasetRegistry with Matchers { + + implicit val jsonFormats: DefaultFormats.type = DefaultFormats + + implicit class JsonHelper(json: JValue) { + def customExtract[T](path: String)(implicit mf: Manifest[T]): T = { + path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T] + } + } + + val jsonStr 
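// --- Illustrative sketch, not part of the patch ---------------------------------------------
// How the JSONata expressions and conditions handled by JSONAtaTransformer and ConditionEvaluator
// above are evaluated with the JSONata4Java dependency. The sample event and expressions mirror
// the ones used in TestTransformerFunctionHelper below.
import com.api.jsonata4java.expressions.Expressions
import com.fasterxml.jackson.databind.ObjectMapper

object JsonataSketch {
  def main(args: Array[String]): Unit = {
    val mapper = new ObjectMapper()
    val event = mapper.readTree("""{"obsCode":"M_BATTERY_CHARGE","value":"100","codeComponents":[{"componentCode":"CC_METADATA_DEVICE_FIRMWARE_VER"}]}""")

    // A transformation expression: derive a numeric field from a string field
    println(Expressions.parse("$number(value)").evaluate(event)) // 100

    // A condition expression: must evaluate to a boolean for the transformation to be applied
    val condition = Expressions.parse("obsCode='M_BATTERY_CHARGE' and $number(value)>=100").evaluate(event)
    println(condition.isBoolean && condition.asBoolean()) // true

    // Parse and evaluation errors surface as ParseException / EvaluateException, which the
    // transformer maps to INVALID_EXPR_FUNCTION / ERR_EVAL_EXPR_FUNCTION respectively.
  }
}
// ---------------------------------------------------------------------------------------------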
= """{"obsCode":"M_BATTERY_CHARGE","accountEmail":"firstname.lastname@gmail.com","accountPhone":"123456","codeComponents":[{"componentCode":"CC_METADATA_DEVICE_FIRMWARE_VER","componentType":"METADATA_DEVICE","selector":"FIRMWARE_VERSION","value":"2.3"}],"phenTime":"2022-06-17T07:12:02Z","valueUoM":"prcnt","value":"100","id":"df4c7aa4-65df-4463-b92a-7a29835f9c4d","parentCollectionRef":"41e9b7a4-5b6f-11ed-8fd5-a6a5696c2aaa","created":"2022-11-03T12:01:32Z","modified":1667476892000,"integrationAccountRef":"zzz11120-f0c8-4064-8d00-a73e58939ce0_mtgc203d-2478-4679-a0ef-d736a7a406fd","assetRef":"9422f7ac-c6e9-5c72-b605-5a7655863866","assetRef2":"","assetRef4":123124,"testBool":false,"contextItems":[{"code":"SYN_SYSTEM","value":"VALENCO"}],"status":"ACTIVE","xMin":3.356701,"xMax":3.356701,"yMin":51.01653,"yMax":51.01653,"spatialExtent":"{\"type\": \"Point\", \"coordinates\": [3.356701, 51.016530]}","phenEndTime":"2022-06-17T07:12:02Z","value_double_type":100.0}""" + val mapper = new ObjectMapper() + val jsonNode: JsonNode = mapper.readTree(jsonStr) + + "TransformerFunctionHelper" should "mask the events for the given transformation config" in { + + val json = parse(jsonStr) + val dtList = Option(List( + DatasetTransformation("tf1", "obs2.0", "spatialExtent", TransformationFunction("mask", None, "spatialExtent"), "active"), + DatasetTransformation("tf1", "obs2.0", "assetRef", TransformationFunction("mask", None, "assetRef"), "active"), + DatasetTransformation("tf1", "obs2.0", "accountEmail", TransformationFunction("mask", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "accountEmail"), "active"), + DatasetTransformation("tf1", "obs2.0", "accountPhone2", TransformationFunction("mask", None, "accountPhone"), "active"), + DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponents)"), "active"), + DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "$number(value)"), "active"), + DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]"), "active"), + DatasetTransformation("tf1", "obs2.0", "optionalValue", TransformationFunction("jsonata", None, "$number(optionValue)"), "active") + )) + + val result = TransformerFunctionHelper.processTransformations(json, dtList) + result.status should be(StatusCode.success) + result.fieldStatus.size should be(8) + assert(result.resultJson.customExtract[String]("spatialExtent").equals("{type: ***********************************1.016530]}")) + assert(result.resultJson.customExtract[String]("assetRef").equals("9422f7***********************5863866")) + assert(result.resultJson.customExtract[String]("accountEmail").equals("fi***************e@gmail.com")) + assert(result.resultJson.customExtract[String]("accountPhone2").equals("1***56")) + assert(JSONUtil.getKey("optionalValue", JSONUtil.serialize(result.resultJson)).isMissingNode.equals(true)) + + val dtList2 = Option(List( + DatasetTransformation("tf1", "obs2.0", "accountPhone", TransformationFunction("mask", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE1'")), "accountPhone"), "active", Some(TransformMode.Lenient)), + DatasetTransformation("tf4", "obs2.0", "asset.assetRef2", TransformationFunction("mask", None, "assetRef2"), "Live", 
Some(TransformMode.Lenient)), + DatasetTransformation("tf5", "obs2.0", "asset.assetRef3", TransformationFunction("mask", None, "assetRef3"), "Live", Some(TransformMode.Lenient)), + DatasetTransformation("tf6", "obs2.0", "asset.assetRef4", TransformationFunction("mask", None, "assetRef4"), "Live", Some(TransformMode.Lenient)), + DatasetTransformation("tf7", "obs2.0", "asset.assetRef5", TransformationFunction("custom", None, "join(d2.assetRef4)"), "Live", Some(TransformMode.Lenient)) + )) + val result2 = TransformerFunctionHelper.processTransformations(json, dtList2) + result2.status should be(StatusCode.partial) + result2.fieldStatus.size should be(4) + result2.resultJson.customExtract[String]("asset.assetRef2") should be("") + result2.resultJson.customExtract[String]("asset.assetRef3") should be(null) + result2.resultJson.customExtract[String]("asset.assetRef4") should be("1***24") + result.resultJson.customExtract[String]("accountPhone") should be ("123456") + + val result3 = TransformerFunctionHelper.processTransformations(json, None) + result3.status should be (StatusCode.skipped) + result3.fieldStatus.size should be(0) + } + + it should "validate the jsonata expressions" in { + + val json = parse(jsonStr) + val dtList = Option(List( + DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponents).length"), "active", Some(TransformMode.Lenient)), + DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "$number(value)"), "active"), + DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]"), "active") + )) + val result = TransformerFunctionHelper.processTransformations(json, dtList) + result.status should be(StatusCode.partial) + result.fieldStatus.size should be(3) + assert(result.resultJson.customExtract[String]("firmwareComponent.componentCode").equals("CC_METADATA_DEVICE_FIRMWARE_VER")) + assert(result.resultJson.customExtract[Int]("valueAsInt").equals(100)) + } + + it should "handle the jsonata parse and eval exceptions including transformation modes" in { + + val json = parse(jsonStr) + val dtList = Option(List( + DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponent).length"), "active"), + DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "number(value)"), "active"), + DatasetTransformation("tf1", "obs2.0", "valueAsInt2", TransformationFunction("jsonata", None, null), "Live", Some(TransformMode.Lenient)), + DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]"), "active") + )) + val result = TransformerFunctionHelper.processTransformations(json, dtList) + result.status should be(StatusCode.failed) + result.fieldStatus.size should be(4) + result.fieldStatus.count(f => f.error.isDefined && f.error.get.equals(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION)) should be(1) + result.fieldStatus.count(f => f.error.isDefined && f.error.get.equals(ErrorConstants.ERR_EVAL_EXPR_FUNCTION)) should be(1) + result.fieldStatus.count(f => f.error.isDefined && f.error.get.equals(ErrorConstants.INVALID_EXPR_FUNCTION)) 
should be(1) + result.fieldStatus.foreach { status: TransformFieldStatus => { + status.fieldKey match { + case "codeComponentsList" => + status.expr should be("$keys(codeComponent).length") + status.success should be(false) + status.mode should be(TransformMode.Strict) + status.error.get should be(ErrorConstants.ERR_EVAL_EXPR_FUNCTION) + case "valueAsInt" => + status.expr should be("number(value)") + status.success should be(false) + status.mode should be(TransformMode.Strict) + status.error.get should be(ErrorConstants.INVALID_EXPR_FUNCTION) + case "firmwareComponent" => + status.expr should be("codeComponents[0]") + status.success should be(true) + status.mode should be(TransformMode.Strict) + status.error should be(None) + case "valueAsInt2" => + status.expr should be(null) + status.success should be(false) + status.mode should be(TransformMode.Lenient) + status.error.get should be(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION) + } + } + } + } + + it should "encrypt the fields in the event" in { + val json = parse(jsonStr) + val dtList = Option(List( + DatasetTransformation("tf1", "obs2.0", "accountEmail", TransformationFunction("encrypt", None, "accountEmail"), "Live"), + DatasetTransformation("tf2", "obs2.0", "accountPhone", TransformationFunction("encrypt", None, "accountPhone"), "Live"), + DatasetTransformation("tf3", "obs2.0", "assetRef", TransformationFunction("encrypt", None, "assetRef"), "Live"), + DatasetTransformation("tf4", "obs2.0", "asset.assetRef2", TransformationFunction("encrypt", None, "assetRef2"), "Live"), + DatasetTransformation("tf5", "obs2.0", "asset.assetRef3", TransformationFunction("encrypt", None, "assetRef3"), "Live"), + DatasetTransformation("tf6", "obs2.0", "asset.assetRef4", TransformationFunction("encrypt", None, "assetRef4"), "Live") + )) + val result = TransformerFunctionHelper.processTransformations(json, dtList) + val jsonData = compact(render(result.resultJson)) + result.status should be(StatusCode.failed) + result.fieldStatus.size should be(6) + assert(result.resultJson.customExtract[String]("accountEmail").equals("jyx7+dUfzHgODno2jcp67/rfCvOecaLLWICRnSCNvzY=")) + assert(result.resultJson.customExtract[String]("accountPhone").equals("qqyhkaWkPR3t1k0swyQ7Ow==")) + assert(result.resultJson.customExtract[String]("assetRef").equals("e+YNIi1FebmPPI7D8k3/idlQ8XX0AIhuplwcRLbPb3nkS25gt/HyUQkWeuj6KPxf")) + result.resultJson.customExtract[String]("asset.assetRef2") should be("") + result.resultJson.customExtract[String]("asset.assetRef4") should be("D2ySyi1WGqJsM4mbIjbtJA==") + result.resultJson.customExtract[String]("asset.assetRef3") should be(null) + + JSONUtil.getKey("asset.assetRef3", jsonData).isEmpty should be(true) + + assert(CipherUtil.decrypt(result.resultJson.customExtract[String]("accountEmail")).equals("firstname.lastname@gmail.com")) + assert(CipherUtil.decrypt(result.resultJson.customExtract[String]("accountPhone")).equals("123456")) + assert(CipherUtil.decrypt(result.resultJson.customExtract[String]("assetRef")).equals("9422f7ac-c6e9-5c72-b605-5a7655863866")) + } + + it should "validate all scenarios of condition evaluator" in { + val status1 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("custom", "testExpr")), Some(TransformMode.Strict)) + status1.expr should be("") + status1.success should be(false) + status1.mode.get should be(TransformMode.Strict) + status1.error.get should be(ErrorConstants.NO_IMPLEMENTATION_FOUND) + + val status2 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("jsonata", 
"number(value)")), Some(TransformMode.Strict)) + status2.expr should be("number(value)") + status2.success should be(false) + status2.mode.get should be(TransformMode.Strict) + status2.error.get should be(ErrorConstants.INVALID_EXPR_FUNCTION) + + val status3 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("jsonata", "$keys(codeComponent).length")), Some(TransformMode.Strict)) + status3.expr should be("$keys(codeComponent).length") + status3.success should be(false) + status3.mode.get should be(TransformMode.Strict) + status3.error.get should be(ErrorConstants.ERR_EVAL_EXPR_FUNCTION) + + val status4 = ConditionEvaluator.evalCondition("d1", jsonNode, Some(Condition("jsonata", null)), Some(TransformMode.Strict)) + status4.expr should be(null) + status4.success should be(false) + status4.mode.get should be(TransformMode.Strict) + status4.error.get should be(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION) + + val status5 = ConditionEvaluator.evalCondition("d1", null, Some(Condition("jsonata", "$number(value)")), Some(TransformMode.Lenient)) + status5.expr should be("$number(value)") + status5.success should be(false) + status5.mode.get should be(TransformMode.Lenient) + status5.error.get should be(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION) + } + + it should "cover the unreachable code block in ITransformer" in { + val testTransformer = new TestTransformer() + val res1 = testTransformer.getJSON("event.key", null.asInstanceOf[JsonNode]) + compact(render(res1)) should be("""{"event":{"key":null}}""") + val res2 = testTransformer.getJSON("event.key.x", JSONUtil.getKey("obsCode", jsonStr)) + compact(render(res2)) should be("""{"event":{"key":{"x":"M_BATTERY_CHARGE"}}}""") + val res3 = testTransformer.getJSON("event.key.y", JSONUtil.getKey("testBool", jsonStr)) + compact(render(res3)) should be("""{"event":{"key":{"y":false}}}""") + + val res4 = testTransformer.transform(parse(jsonStr), jsonNode, List[DatasetTransformation]()) + res4.json should be(JNothing) + res4.fieldStatus.size should be(0) + } + +} + +class TestTransformer extends ITransformer { + override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) = { + (JNothing, TransformFieldStatus("", "", success = false, TransformMode.Lenient)) + } + +} \ No newline at end of file diff --git a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala new file mode 100644 index 00000000..4e204ed0 --- /dev/null +++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala @@ -0,0 +1,229 @@ +package org.sunbird.obsrv.transformer + +import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} +import org.apache.flink.configuration.Configuration +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.test.util.MiniClusterWithClientResource +import org.apache.kafka.common.serialization.StringDeserializer +import org.scalatest.Matchers._ +import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.model.Models.SystemEvent +import org.sunbird.obsrv.core.model._ +import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry 
+import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask} + +import scala.collection.mutable +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ + +class TransformerStreamTestSpec extends BaseSpecWithDatasetRegistry { + + val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() + .setConfiguration(testConfiguration()) + .setNumberSlotsPerTaskManager(1) + .setNumberTaskManagers(1) + .build) + + val transformerConfig = new TransformerConfig(config) + val redisPort: Int = transformerConfig.redisPort + val kafkaConnector = new FlinkKafkaConnector(transformerConfig) + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9093, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + + def testConfiguration(): Configuration = { + val config = new Configuration() + config.setString("metrics.reporter", "job_metrics_reporter") + config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) + config + } + + override def beforeAll(): Unit = { + super.beforeAll() + BaseMetricsReporter.gaugeMetrics.clear() + EmbeddedKafka.start()(embeddedKafkaConfig) + insertTestData() + createTestTopics() + publishMessagesToKafka() + flinkCluster.before() + } + + private def publishMessagesToKafka(): Unit = { + EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.SUCCESS_TRANSFORM) + EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.FAILED_TRANSFORM) + EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.SKIPPED_TRANSFORM) + EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.PARTIAL_TRANSFORM) + EmbeddedKafka.publishStringMessageToKafka(transformerConfig.inputTopic(), EventFixture.FAILED_TRANSFORM_2) + } + + private def insertTestData(): Unit = { + val postgresConnect = new PostgresConnect(postgresConfig) + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into datasets(id, type, 
data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d4', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into dataset_transformations values('tf3', 'd2', 'tfdata.valueAsInt', '{\"type\":\"jsonata\",\"expr\":\"$number(id)\"}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf4', 'd2', 'tfdata.encryptEmail', '{\"type\":\"encrypt\",\"expr\": \"dealer.email\"}', 'Live', 'Lenient', 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf5', 'd4', 'tfdata.expr1', '{\"type\":\"jsonata\",\"expr\":null}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf6', 'd4', 'tfdata.expr2', '{\"type\":\"jsonata\",\"expr\":\"$keys(dealer).length\"}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf7', 'd4', 'tfdata.expr3', '{\"type\":\"jsonata\",\"expr\":\"number(id)\"}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.closeConnection() + } + + override def afterAll(): Unit = { + super.afterAll() + flinkCluster.after() + EmbeddedKafka.stop() + } + + def createTestTopics(): Unit = { + List( + transformerConfig.inputTopic(), transformerConfig.kafkaFailedTopic, transformerConfig.kafkaSystemTopic, transformerConfig.kafkaTransformTopic, transformerConfig.kafkaTransformFailedTopic + ).foreach(EmbeddedKafka.createCustomTopic(_)) + } + + "TransformerStreamTestSpec" should "validate the transform stream task" in { + + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(transformerConfig) + val task = new TransformerStreamTask(transformerConfig, kafkaConnector) + task.process(env) + Future { + env.execute(transformerConfig.jobName) + } + + val outputs = EmbeddedKafka.consumeNumberMessagesFrom[String](transformerConfig.kafkaTransformTopic, 3, timeout = 30.seconds) + validateOutputs(outputs) + + val failedEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](transformerConfig.kafkaTransformFailedTopic, 2, timeout = 30.seconds) + validateFailedEvents(failedEvents) + + val systemEvents = EmbeddedKafka.consumeNumberMessagesFrom[String](transformerConfig.kafkaSystemTopic, 5, timeout = 30.seconds) + validateSystemEvents(systemEvents) + + val mutableMetricsMap = mutable.Map[String, Long]() + BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) 
+ Console.println("### DenormalizerStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + validateMetrics(mutableMetricsMap) + + transformerConfig.successTag().getId should be("transformed-events") + } + + private def validateOutputs(outputs: List[String]): Unit = { + outputs.size should be(3) + outputs.zipWithIndex.foreach { + case (elem, idx) => + val msg = JSONUtil.deserialize[Map[String, AnyRef]](elem) + val event = JSONUtil.serialize(msg(Constants.EVENT)) + val obsrvMeta = msg(Constants.OBSRV_META).asInstanceOf[Map[String, AnyRef]] + obsrvMeta("timespans").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[Int] should be > 0 + idx match { + case 0 => + event should be("""{"dealer":{"email":"de****1@gmail.com","maskedPhone":"98******45","locationId":"KUN1","dealerCode":"D123","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be("success") + case 1 => + event should be("""{"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be("skipped") + case 2 => + event should be("""{"tfdata":{"valueAsInt":1235},"dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1235","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}""") + obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be("partial") + } + } + /* + (Output Event,{"obsrv_meta":{"flags":{"transformer":"success"},"syncts":1701863209956,"prevProcessingTime":1701863215734,"error":{},"processingStartTime":1701863215322,"timespans":{"transformer":412}},"event":{"dealer":{"email":"de****1@gmail.com","maskedPhone":"98******45","locationId":"KUN1","dealerCode":"D123","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"dataset":"d1"},0) + (Output Event,{"obsrv_meta":{"flags":{"transformer":"skipped"},"syncts":1701863210084,"prevProcessingTime":1701863216141,"error":{},"processingStartTime":1701863215476,"timespans":{"transformer":665}},"event":{"dealer":{"dealerCode":"D123","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1234","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"dataset":"d3"},1) + (Output Event,{"obsrv_meta":{"flags":{"transformer":"partial"},"syncts":1701863210111,"prevProcessingTime":1701863216378,"error":{},"processingStartTime":1701863215477,"timespans":{"transformer":901}},"event":{"tfdata":{"valueAsInt":1235},"dealer":{"dealerCode":"D123","locationId":"KUN1","phone":"9849012345"},"vehicleCode":"HYUN-CRE-D6","id":"1235","date":"2023-03-01","metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}},"dataset":"d2"},2) + */ + } + + private def validateFailedEvents(failedEvents: List[String]): Unit = { + failedEvents.size should be(2) + failedEvents.zipWithIndex.foreach { + case (elem, idx) => + val msg = JSONUtil.deserialize[Map[String, 
AnyRef]](elem) + val event = msg(Constants.EVENT).asInstanceOf[String] + val obsrvMeta = msg(Constants.OBSRV_META).asInstanceOf[Map[String, AnyRef]] + obsrvMeta("timespans").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[Int] should be > 0 + obsrvMeta("flags").asInstanceOf[Map[String, AnyRef]]("transformer").asInstanceOf[String] should be (StatusCode.failed.toString) + obsrvMeta("error").asInstanceOf[Map[String, AnyRef]]("src").asInstanceOf[String] should be (Producer.transformer.toString) + obsrvMeta("error").asInstanceOf[Map[String, AnyRef]]("error_code").asInstanceOf[String] should be (ErrorConstants.ERR_TRANSFORMATION_FAILED.errorCode) + idx match { + case 0 => + event should be("{\"event\":{\"dealer\":{\"maskedPhone\":\"98******45\",\"locationId\":\"KUN1\",\"dealerCode\":\"D123\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1235\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d1\"}") + case 1 => + event should be("{\"event\":{\"tfdata\":{},\"dealer\":{\"dealerCode\":\"D123\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d4\"}") + } + } + /* + (Failed Event,{"event":"{\"event\":{\"dealer\":{\"maskedPhone\":\"98******45\",\"locationId\":\"KUN1\",\"dealerCode\":\"D123\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1235\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d1\"}","obsrv_meta":{"flags":{"transformer":"failed"},"syncts":1701863210058,"prevProcessingTime":1701863215948,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"transformer"},"error_code":"ERR_TRANSFORM_1023","error_msg":"Atleast one mandatory transformation has failed"},"processingStartTime":1701863215475,"timespans":{"transformer":473}},"dataset":"d1"},0) + (Failed Event,{"event":"{\"event\":{\"tfdata\":{},\"dealer\":{\"dealerCode\":\"D123\",\"locationId\":\"KUN1\",\"email\":\"dealer1@gmail.com\",\"phone\":\"9849012345\"},\"vehicleCode\":\"HYUN-CRE-D6\",\"id\":\"1234\",\"date\":\"2023-03-01\",\"metrics\":{\"bookingsTaken\":50,\"deliveriesPromised\":20,\"deliveriesDone\":19}},\"dataset\":\"d4\"}","obsrv_meta":{"flags":{"transformer":"failed"},"syncts":1701863210150,"prevProcessingTime":1701863216421,"error":{"src":{"enumClass":"org.sunbird.obsrv.core.model.Producer","value":"transformer"},"error_code":"ERR_TRANSFORM_1023","error_msg":"Atleast one mandatory transformation has failed"},"processingStartTime":1701863215477,"timespans":{"transformer":944}},"dataset":"d4"},1) + */ + } + + private def validateSystemEvents(systemEvents: List[String]): Unit = { + systemEvents.size should be(5) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + FunctionalError.TransformFieldMissing.equals(event.data.error.get.error_type) + }) should be(2) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + FunctionalError.TransformFailedError.equals(event.data.error.get.error_type) + }) should be(1) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + FunctionalError.TransformEvalError.equals(event.data.error.get.error_type) + }) should be(1) + systemEvents.count(f => { + val event = JSONUtil.deserialize[SystemEvent](f) + 
FunctionalError.TransformParseError.equals(event.data.error.get.error_type) + }) should be(1) + + systemEvents.foreach(se => { + val event = JSONUtil.deserialize[SystemEvent](se) + val error = event.data.error + if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) + event.ctx.dataset_type should be(None) + else if (error.isDefined) { + val errorCode = error.get.error_code + if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) || + errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) || + errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) { + event.ctx.dataset_type should be(None) + } + } + else + event.ctx.dataset_type should be(Some("dataset")) + }) + // TODO: Add more assertions + /* + (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d1"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformFieldMissing","error_code":"ERR_TRANSFORM_1023","error_message":"Transformation field is either missing or blank","error_level":"critical","error_count":1}},"ets":1701863215985},0) + (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d2"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformFieldMissing","error_code":"ERR_TRANSFORM_1023","error_message":"Transformation field is either missing or blank","error_level":"critical","error_count":1}},"ets":1701863216391},1) + (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d4"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformFailedError","error_code":"ERR_TRANSFORM_1022","error_message":"Unable to evaluate the transformation expression function","error_level":"critical","error_count":1}},"ets":1701863216431},2) + (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d4"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformEvalError","error_code":"ERR_TRANSFORM_1021","error_message":"Unable to evaluate the transformation expression function","error_level":"critical","error_count":1}},"ets":1701863216433},3) + (Sys Event,{"etype":"METRIC","ctx":{"module":"processing","pdata":{"id":"TransformerJob","type":"flink","pid":"denorm"},"dataset":"d4"},"data":{"error":{"pdata_id":"denorm","pdata_status":"failed","error_type":"TransformParseError","error_code":"ERR_TRANSFORM_1020","error_message":"Transformation expression function is not valid","error_level":"critical","error_count":1}},"ets":1701863216433},4) + */ + } + + private def validateMetrics(mutableMetricsMap: mutable.Map[String, Long]): Unit = { + mutableMetricsMap(s"${transformerConfig.jobName}.d1.${transformerConfig.totalEventCount}") should be(2) + mutableMetricsMap(s"${transformerConfig.jobName}.d1.${transformerConfig.transformSuccessCount}") should be(1) + mutableMetricsMap(s"${transformerConfig.jobName}.d1.${transformerConfig.transformFailedCount}") should be(1) + + mutableMetricsMap(s"${transformerConfig.jobName}.d2.${transformerConfig.totalEventCount}") should be(1) + mutableMetricsMap(s"${transformerConfig.jobName}.d2.${transformerConfig.transformPartialCount}") should be(1) + + mutableMetricsMap(s"${transformerConfig.jobName}.d3.${transformerConfig.totalEventCount}") should be(1) + 
mutableMetricsMap(s"${transformerConfig.jobName}.d3.${transformerConfig.transformSkippedCount}") should be(1) + + mutableMetricsMap(s"${transformerConfig.jobName}.d4.${transformerConfig.totalEventCount}") should be(1) + mutableMetricsMap(s"${transformerConfig.jobName}.d4.${transformerConfig.transformFailedCount}") should be(1) + } + +} \ No newline at end of file diff --git a/pipeline/pipeline-merged/pom.xml b/pipeline/unified-pipeline/pom.xml similarity index 88% rename from pipeline/pipeline-merged/pom.xml rename to pipeline/unified-pipeline/pom.xml index f3db71fe..a14eb0f7 100644 --- a/pipeline/pipeline-merged/pom.xml +++ b/pipeline/unified-pipeline/pom.xml @@ -4,6 +4,9 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 + + 3.0.1 + org.sunbird.obsrv @@ -12,12 +15,12 @@ org.sunbird.obsrv.pipeline - pipeline-merged + unified-pipeline 1.0.0 jar - Merged Pipeline + Unified Pipeline - Entire pipeline merged into a single processing job + Entire pipeline merged into a single processing job @@ -67,36 +70,6 @@ druid-router 1.0.0 - - com.github.java-json-tools - json-schema-validator - 2.2.14 - - - joda-time - joda-time - - - com.fasterxml.jackson.core - jackson-databind - - - com.google.guava - guava - - - - - com.google.guava - guava - 32.1.2-jre - - - org.apache.kafka - kafka-clients - ${kafka.version} - test - org.apache.kafka kafka_${scala.maj.version} @@ -173,7 +146,6 @@ 2.0.3 test - @@ -220,7 +192,7 @@ - org.sunbird.obsrv.pipeline.task.MergedPipelineStreamTask + in.sanketika.obsrv.pipeline.task.UnifiedPipelineStreamTask diff --git a/pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf b/pipeline/unified-pipeline/src/main/resources/unified-pipeline.conf similarity index 89% rename from pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf rename to pipeline/unified-pipeline/src/main/resources/unified-pipeline.conf index 75f43376..9b1e1bdf 100644 --- a/pipeline/pipeline-merged/src/main/resources/merged-pipeline.conf +++ b/pipeline/unified-pipeline/src/main/resources/unified-pipeline.conf @@ -12,8 +12,9 @@ kafka { output.denorm.topic = ${job.env}".denorm" output.denorm.failed.topic = ${job.env}".failed" output.transform.topic = ${job.env}".transform" + output.transform.failed.topic = ${job.env}".failed" stats.topic = ${job.env}".stats" - groupId = ${job.env}"-single-pipeline-group" + groupId = ${job.env}"-unified-pipeline-group" producer { max-request-size = 5242880 } diff --git a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineConfig.scala b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineConfig.scala similarity index 84% rename from pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineConfig.scala rename to pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineConfig.scala index c6df88d3..75322bc2 100644 --- a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineConfig.scala +++ b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineConfig.scala @@ -8,17 +8,13 @@ import org.sunbird.obsrv.core.streaming.BaseJobConfig import scala.collection.mutable -class MergedPipelineConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "MergedPipelineJob") { +class UnifiedPipelineConfig(override val config: Config) extends 
BaseJobConfig[mutable.Map[String, AnyRef]](config, "UnifiedPipelineJob") { private val serialVersionUID = 2905979434303791379L implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) - // Kafka Topics Configuration override def inputTopic(): String = config.getString("kafka.input.topic") - override def inputConsumer(): String = "pipeline-consumer" - override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("processing_stats") - override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") } diff --git a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala similarity index 76% rename from pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala rename to pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala index f7d8dce9..ed03b88b 100644 --- a/pipeline/pipeline-merged/src/main/scala/org/sunbird/obsrv/pipeline/task/MergedPipelineStreamTask.scala +++ b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala @@ -19,26 +19,21 @@ import scala.collection.mutable * Druid Router stream task routes every event into its respective topic configured at dataset level */ -class MergedPipelineStreamTask(config: Config, mergedPipelineConfig: MergedPipelineConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { +class UnifiedPipelineStreamTask(config: Config, pipelineConfig: UnifiedPipelineConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { private val serialVersionUID = 146697324640926024L // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster def process(): Unit = { - implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(mergedPipelineConfig) + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(pipelineConfig) process(env) - env.execute(mergedPipelineConfig.jobName) + env.execute(pipelineConfig.jobName) } // $COVERAGE-ON$ - /** - * Created an overloaded process function to enable unit testing - * @param env StreamExecutionEnvironment - */ def process(env: StreamExecutionEnvironment): Unit = { - - val dataStream = getMapDataStream(env, mergedPipelineConfig, kafkaConnector) + val dataStream = getMapDataStream(env, pipelineConfig, kafkaConnector) processStream(dataStream) } @@ -63,18 +58,17 @@ class MergedPipelineStreamTask(config: Config, mergedPipelineConfig: MergedPipel } // $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster -object MergedPipelineStreamTask { +object UnifiedPipelineStreamTask { def main(args: Array[String]): Unit = { val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) val config = configFilePath.map { path => ConfigFactory.parseFile(new File(path)).resolve() - }.getOrElse(ConfigFactory.load("merged-pipeline.conf").withFallback(ConfigFactory.systemEnvironment())) - val mergedPipelineConfig = new MergedPipelineConfig(config) - val kafkaUtil = new FlinkKafkaConnector(mergedPipelineConfig) - val task = new MergedPipelineStreamTask(config, 
mergedPipelineConfig, kafkaUtil) + }.getOrElse(ConfigFactory.load("unified-pipeline.conf").withFallback(ConfigFactory.systemEnvironment())) + val pipelineConfig = new UnifiedPipelineConfig(config) + val kafkaUtil = new FlinkKafkaConnector(pipelineConfig) + val task = new UnifiedPipelineStreamTask(config, pipelineConfig, kafkaUtil) task.process() } } - // $COVERAGE-ON$ \ No newline at end of file diff --git a/pipeline/pipeline-merged/src/test/resources/base-config.conf b/pipeline/unified-pipeline/src/test/resources/base-config.conf similarity index 100% rename from pipeline/pipeline-merged/src/test/resources/base-config.conf rename to pipeline/unified-pipeline/src/test/resources/base-config.conf diff --git a/pipeline/pipeline-merged/src/test/resources/test.conf b/pipeline/unified-pipeline/src/test/resources/test.conf similarity index 93% rename from pipeline/pipeline-merged/src/test/resources/test.conf rename to pipeline/unified-pipeline/src/test/resources/test.conf index d2b959c3..aa514d54 100644 --- a/pipeline/pipeline-merged/src/test/resources/test.conf +++ b/pipeline/unified-pipeline/src/test/resources/test.conf @@ -16,6 +16,7 @@ kafka { output.denorm.topic = ${job.env}".denorm" output.denorm.failed.topic = ${job.env}".failed" output.transform.topic = ${job.env}".transform" + output.transform.failed.topic = ${job.env}".transform.failed" stats.topic = ${job.env}".stats" groupId = ${job.env}"-single-pipeline-group" producer { @@ -38,4 +39,4 @@ redis { preprocessor.duplication.store.id = 2 key.expiry.seconds = 3600 } -} +} \ No newline at end of file diff --git a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/EventFixture.scala similarity index 98% rename from pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala rename to pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/EventFixture.scala index dee90323..a5e623b6 100644 --- a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala +++ b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/EventFixture.scala @@ -1,4 +1,4 @@ -package org.sunbird.obsrv.fixture +package org.sunbird.obsrv.pipeline object EventFixture { @@ -11,6 +11,4 @@ object EventFixture { val VALID_BATCH_EVENT_D2 = """{"dataset":"d2","id":"event4","event":{"id":"4567","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" val INVALID_BATCH_EVENT_D2 = """{"dataset":"d2","id":"event5","event1":{"id":"1234","vehicleCode":"HYUN-CRE-D6","date":"2023-03-01","dealer":{"dealerCode":"KUNUnited","locationId":"KUN1","email":"dealer1@gmail.com","phone":"9849012345"},"metrics":{"bookingsTaken":50,"deliveriesPromised":20,"deliveriesDone":19}}}""" - - -} +} \ No newline at end of file diff --git a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala similarity index 82% rename from pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala rename to pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala index f3cf86b2..8d3ba2d6 100644 --- 
a/pipeline/pipeline-merged/src/test/scala/org/sunbird/obsrv/pipeline/MergedPipelineStreamTaskTestSpec.scala +++ b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala @@ -11,18 +11,15 @@ import org.sunbird.obsrv.BaseMetricsReporter import org.sunbird.obsrv.core.cache.RedisConnect import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil} -import org.sunbird.obsrv.extractor.task.ExtractorConfig -import org.sunbird.obsrv.fixture.EventFixture -import org.sunbird.obsrv.pipeline.task.{MergedPipelineConfig, MergedPipelineStreamTask} +import org.sunbird.obsrv.pipeline.task.{UnifiedPipelineConfig, UnifiedPipelineStreamTask} import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry -import org.sunbird.obsrv.transformer.task.TransformerConfig import scala.collection.mutable import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.Future import scala.concurrent.duration._ -class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { +class UnifiedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() .setConfiguration(testConfiguration()) @@ -30,8 +27,8 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { .setNumberTaskManagers(1) .build) - val mergedPipelineConfig = new MergedPipelineConfig(config) - val kafkaConnector = new FlinkKafkaConnector(mergedPipelineConfig) + val unifiedPipelineConfig = new UnifiedPipelineConfig(config) + val kafkaConnector = new FlinkKafkaConnector(unifiedPipelineConfig) val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = EmbeddedKafkaConfig( @@ -65,7 +62,7 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { } override def afterAll(): Unit = { - val redisConnection = new RedisConnect(mergedPipelineConfig.redisHost, mergedPipelineConfig.redisPort, mergedPipelineConfig.redisConnectionTimeout) + val redisConnection = new RedisConnect(unifiedPipelineConfig.redisHost, unifiedPipelineConfig.redisPort, unifiedPipelineConfig.redisConnectionTimeout) redisConnection.getConnection(config.getInt("redis.database.extractor.duplication.store.id")).flushAll() redisConnection.getConnection(config.getInt("redis.database.preprocessor.duplication.store.id")).flushAll() super.afterAll() @@ -83,20 +80,20 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { ).foreach(EmbeddedKafka.createCustomTopic(_)) } - "MergedPipelineStreamTaskTestSpec" should "validate the entire pipeline" in { + "UnifiedPipelineStreamTaskTestSpec" should "validate the entire pipeline" in { - implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(mergedPipelineConfig) - val task = new MergedPipelineStreamTask(config, mergedPipelineConfig, kafkaConnector) + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(unifiedPipelineConfig) + val task = new UnifiedPipelineStreamTask(config, unifiedPipelineConfig, kafkaConnector) task.process(env) Future { - env.execute(mergedPipelineConfig.jobName) + env.execute(unifiedPipelineConfig.jobName) } try { val d1Events = EmbeddedKafka.consumeNumberMessagesFrom[String]("d1-events", 1, timeout = 30.seconds) - d1Events.size should be (1) + d1Events.size 
should be(1) val d2Events = EmbeddedKafka.consumeNumberMessagesFrom[String]("d2-events", 1, timeout = 30.seconds) - d2Events.size should be (1) + d2Events.size should be(1) } catch { case ex: Exception => ex.printStackTrace() } @@ -144,16 +141,8 @@ class MergedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { mutableMetricsMap("DruidRouterJob.d2.router-total-count") should be(1) mutableMetricsMap("DruidRouterJob.d2.router-success-count") should be(1) - val extractorConfig = new ExtractorConfig(config) - extractorConfig.inputTopic() should be (config.getString("kafka.input.topic")) - extractorConfig.inputConsumer() should be ("extractor-consumer") - - val transformerConfig = new TransformerConfig(config) - transformerConfig.inputTopic() should be(config.getString("kafka.input.topic")) - transformerConfig.inputConsumer() should be("transformer-consumer") - - mergedPipelineConfig.successTag().getId should be ("processing_stats") - mergedPipelineConfig.failedEventsOutputTag().getId should be ("failed-events") + unifiedPipelineConfig.successTag().getId should be("processing_stats") + unifiedPipelineConfig.failedEventsOutputTag().getId should be("failed-events") } } From 5c04d6729b973010ed77f586ca95bf119d637343 Mon Sep 17 00:00:00 2001 From: Santhosh Vasabhaktula Date: Thu, 20 Jun 2024 18:14:41 +0530 Subject: [PATCH 34/37] feat #0001 - Ability to generate denorm field via jsonata transformation --- .github/workflows/build_and_deploy.yaml | 6 +- .github/workflows/upload_artifact.yaml | 2 +- Dockerfile | 5 +- .../sunbird/obsrv/model/DatasetModels.scala | 4 +- framework/pom.xml | 10 +- .../obsrv/core/model/ErrorConstants.scala | 2 +- .../sunbird/obsrv/core/util/JSONUtil.scala | 4 + pipeline/denormalizer/pom.xml | 5 + .../task/DenormalizerConfig.scala | 3 +- .../obsrv/denormalizer/util/DenormCache.scala | 17 +- pipeline/transformer/pom.xml | 28 +-- .../UnifiedPipelineStreamTaskTestSpec.scala | 2 +- pom.xml | 1 + transformation-sdk/pom.xml | 180 ++++++++++++++++++ .../main/resources/transformation-sdk.conf | 0 .../types/EncryptTransformer.scala | 12 +- .../transformer/types/ITransformer.scala | 5 +- .../types/JSONAtaTransformer.scala | 23 +++ .../transformer/types/MaskTransformer.scala | 2 +- .../obsrv/transformer/util/CipherUtil.scala | 2 +- .../transformer/util/ConditionEvaluator.scala | 3 +- .../src/test/resources/test.conf | 0 22 files changed, 260 insertions(+), 56 deletions(-) create mode 100644 transformation-sdk/pom.xml create mode 100644 transformation-sdk/src/main/resources/transformation-sdk.conf rename {pipeline/transformer => transformation-sdk}/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala (82%) rename {pipeline/transformer => transformation-sdk}/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala (95%) rename {pipeline/transformer => transformation-sdk}/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala (69%) rename {pipeline/transformer => transformation-sdk}/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala (97%) rename {pipeline/transformer => transformation-sdk}/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala (99%) rename {pipeline/transformer => transformation-sdk}/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala (93%) create mode 100644 transformation-sdk/src/test/resources/test.conf diff --git a/.github/workflows/build_and_deploy.yaml b/.github/workflows/build_and_deploy.yaml index 601c8f5e..05defdf1 100644 --- 
a/.github/workflows/build_and_deploy.yaml +++ b/.github/workflows/build_and_deploy.yaml @@ -27,8 +27,8 @@ jobs: target: "transformer-image" - image: "druid-router" target: "router-image" - - image: "merged-pipeline" - target: "merged-image" + - image: "unified-pipeline" + target: "unified-image" - image: "master-data-processor" target: "master-data-processor-image" - image: "lakehouse-connector" @@ -97,7 +97,7 @@ jobs: run: | cd deploy/terraform/aws terragrunt init - terragrunt apply -auto-approve -var merged_pipeline_enabled={{ vars.MERGED_PIPELINE || 'true' }} --replace='module.flink.helm_release.flink' \ + terragrunt apply -auto-approve -var unified_pipeline_enabled={{ vars.MERGED_PIPELINE || 'true' }} --replace='module.flink.helm_release.flink' \ -var flink_image_tag=${{ github.ref_name }} azure-deploy: diff --git a/.github/workflows/upload_artifact.yaml b/.github/workflows/upload_artifact.yaml index 38cb7ec8..07943fd1 100644 --- a/.github/workflows/upload_artifact.yaml +++ b/.github/workflows/upload_artifact.yaml @@ -56,7 +56,7 @@ jobs: - image: "denormalizer" - image: "transformer" - image: "druid-router" - - image: "pipeline-merged" + - image: "unified-pipeline" - image: "master-data-processor" steps: - name: Get Tag Name diff --git a/Dockerfile b/Dockerfile index fd4002be..e392b619 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM --platform=linux/x86_64 maven:3.9.4-eclipse-temurin-11-focal AS build-core COPY . /app RUN mvn clean install -DskipTests -f /app/framework/pom.xml RUN mvn clean install -DskipTests -f /app/dataset-registry/pom.xml +RUN mvn clean install -DskipTests -f /app/transformation-sdk/pom.xml FROM --platform=linux/x86_64 maven:3.9.4-eclipse-temurin-11-focal AS build-pipeline COPY --from=build-core /root/.m2 /root/.m2 @@ -28,9 +29,9 @@ FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as rout USER flink COPY --from=build-pipeline /app/pipeline/druid-router/target/druid-router-1.0.0.jar $FLINK_HOME/lib/ -FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as merged-image +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as unified-image USER flink -COPY --from=build-pipeline /app/pipeline/pipeline-merged/target/pipeline-merged-1.0.0.jar $FLINK_HOME/lib/ +COPY --from=build-pipeline /app/pipeline/unified-pipeline/target/unified-pipeline-1.0.0.jar $FLINK_HOME/lib/ FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as master-data-processor-image USER flink diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index 3aebe8bd..e8affc0a 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -25,8 +25,8 @@ object DatasetModels { case class ValidationConfig(@JsonProperty("validate") validate: Option[Boolean] = Some(true), @JsonProperty("mode") @JsonScalaEnumeration(classOf[ValidationModeType]) mode: Option[ValidationMode]) - case class DenormFieldConfig(@JsonProperty("denorm_key") denormKey: String, @JsonProperty("redis_db") redisDB: Int, - @JsonProperty("denorm_out_field") denormOutField: String) + case class DenormFieldConfig(@JsonProperty("denorm_key") denormKey: Option[String], @JsonProperty("redis_db") redisDB: Int, + @JsonProperty("denorm_out_field") denormOutField: String, @JsonProperty("jsonata_expr") jsonAtaExpr: Option[String]) case class 
DenormConfig(@JsonProperty("redis_db_host") redisDBHost: String, @JsonProperty("redis_db_port") redisDBPort: Int, @JsonProperty("denorm_fields") denormFields: List[DenormFieldConfig]) diff --git a/framework/pom.xml b/framework/pom.xml index 52ced63f..263a52a7 100644 --- a/framework/pom.xml +++ b/framework/pom.xml @@ -44,7 +44,7 @@ org.apache.httpcomponents httpclient - 4.5.1 + 4.5.13 com.google.code.gson @@ -98,7 +98,7 @@ junit junit - 4.12 + 4.13.1 test @@ -144,12 +144,6 @@ 1.0.0 test - - org.cassandraunit - cassandra-unit - 3.11.2.0 - test - diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala index b5e57d87..6b9fcc08 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/ErrorConstants.scala @@ -24,7 +24,7 @@ object ErrorConstants extends Enumeration { val JSON_SCHEMA_NOT_FOUND = ErrorInternalValue("ERR_PP_1011", "Json schema not found for the dataset") val INVALID_JSON_SCHEMA = ErrorInternalValue("ERR_PP_1012", "Invalid json schema") val SCHEMA_VALIDATION_FAILED = ErrorInternalValue("ERR_PP_1013", "Event failed the schema validation") - val DENORM_KEY_MISSING = ErrorInternalValue("ERR_DENORM_1014", "No denorm key found or missing data for the specified key") + val DENORM_KEY_MISSING = ErrorInternalValue("ERR_DENORM_1014", "No denorm key or transformation expr found or missing data for the specified key") val DENORM_KEY_NOT_A_STRING_OR_NUMBER = ErrorInternalValue("ERR_DENORM_1015", "Denorm key value is not a String or Number") val DENORM_DATA_NOT_FOUND = ErrorInternalValue("ERR_DENORM_1016", "Denorm data not found for the given key") val MISSING_DATASET_CONFIG_KEY = ErrorInternalValue("ERR_MASTER_DATA_1017", "Master dataset configuration key is missing") diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala b/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala index 67156256..550e99d8 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/util/JSONUtil.scala @@ -57,6 +57,10 @@ object JSONUtil { root.at(path); } + def getJsonNode(json: String): JsonNode = { + mapper.readTree(json); + } + private[this] def typeReference[T: Manifest] = new TypeReference[T] { override def getType: Type = typeFromManifest(manifest[T]) } diff --git a/pipeline/denormalizer/pom.xml b/pipeline/denormalizer/pom.xml index 2df98cd3..b67aed62 100644 --- a/pipeline/denormalizer/pom.xml +++ b/pipeline/denormalizer/pom.xml @@ -42,6 +42,11 @@ dataset-registry 1.0.0 + + org.sunbird.obsrv + transformation-sdk + 1.0.0 + org.apache.kafka kafka-clients diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala index 118c0307..1fe24d68 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/task/DenormalizerConfig.scala @@ -16,7 +16,6 @@ class DenormalizerConfig(override val config: Config) extends BaseJobConfig[muta implicit val anyTypeInfo: TypeInformation[String] = TypeExtractor.getForClass(classOf[String]) // Kafka Topics Configuration - val kafkaInputTopic: String = config.getString("kafka.input.topic") val 
denormOutputTopic: String = config.getString("kafka.output.denorm.topic") // Windows @@ -41,7 +40,7 @@ class DenormalizerConfig(override val config: Config) extends BaseJobConfig[muta // Functions val denormalizationFunction = "DenormalizationFunction" - override def inputTopic(): String = kafkaInputTopic + override def inputTopic(): String = config.getString("kafka.input.topic") override def inputConsumer(): String = denormalizationConsumer override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = denormEventsTag override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("failed-events") diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala index db0da7d5..5550748a 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/util/DenormCache.scala @@ -1,11 +1,14 @@ package org.sunbird.obsrv.denormalizer.util +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.MissingNode import org.sunbird.obsrv.core.cache.RedisConnect import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.model.ErrorConstants.Error import org.sunbird.obsrv.core.util.{JSONUtil, Util} import org.sunbird.obsrv.denormalizer.task.DenormalizerConfig -import org.sunbird.obsrv.model.DatasetModels.{Dataset, DenormFieldConfig} +import org.sunbird.obsrv.model.DatasetModels.{Dataset, DenormFieldConfig, TransformationFunction} +import org.sunbird.obsrv.transformer.types.JSONAtaTransformer import redis.clients.jedis.{Pipeline, Response} import scala.collection.mutable @@ -75,7 +78,7 @@ class DenormCache(val config: DenormalizerConfig) { } private def extractField(fieldConfig: DenormFieldConfig, eventStr: String): DenormFieldStatus = { - val denormFieldNode = JSONUtil.getKey(fieldConfig.denormKey, eventStr) + val denormFieldNode = getDenormFieldValue(fieldConfig, eventStr) if (denormFieldNode.isMissingNode) { DenormFieldStatus("", success = false, Some(ErrorConstants.DENORM_KEY_MISSING)) } else { @@ -87,6 +90,16 @@ class DenormCache(val config: DenormalizerConfig) { } } + private def getDenormFieldValue(fieldConfig: DenormFieldConfig, eventStr: String): JsonNode = { + if(fieldConfig.denormKey.isDefined) { + JSONUtil.getKey(fieldConfig.denormKey.get, eventStr) + } else if(fieldConfig.jsonAtaExpr.isDefined) { + JSONAtaTransformer.evaluate(JSONUtil.getJsonNode(eventStr), TransformationFunction("jsonata", None, fieldConfig.jsonAtaExpr.get)) + } else { + MissingNode.getInstance() + } + } + private def getFromCache(pipeline: Pipeline, denormField: String, fieldConfig: DenormFieldConfig): Response[String] = { pipeline.select(fieldConfig.redisDB) pipeline.get(denormField) diff --git a/pipeline/transformer/pom.xml b/pipeline/transformer/pom.xml index ba2d8f87..959e549e 100644 --- a/pipeline/transformer/pom.xml +++ b/pipeline/transformer/pom.xml @@ -42,31 +42,9 @@ 1.0.0 - org.json4s - json4s-native_${scala.maj.version} - 4.0.6 - - - com.ibm.jsonata4java - JSONata4Java - 2.2.6 - - - com.fasterxml.jackson.core - jackson-databind - - - - - com.github.bancolombia - data-mask-core - 1.0.1 - - - com.fasterxml.jackson.core - jackson-databind - - + org.sunbird.obsrv + transformation-sdk + 1.0.0 org.sunbird.obsrv diff --git 
a/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala index 8d3ba2d6..879abeec 100644 --- a/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala +++ b/pipeline/unified-pipeline/src/test/scala/org/sunbird/obsrv/pipeline/UnifiedPipelineStreamTaskTestSpec.scala @@ -106,7 +106,7 @@ class UnifiedPipelineStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { val mutableMetricsMap = mutable.Map[String, Long](); BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) - Console.println("### MergedPipelineStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) + Console.println("### UnifiedPipelineStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) mutableMetricsMap("ExtractorJob.d1.extractor-total-count") should be(4) mutableMetricsMap("ExtractorJob.d1.extractor-duplicate-count") should be(1) diff --git a/pom.xml b/pom.xml index c8f53bd8..4ecdc676 100644 --- a/pom.xml +++ b/pom.xml @@ -18,6 +18,7 @@ framework dataset-registry + transformation-sdk pipeline data-products diff --git a/transformation-sdk/pom.xml b/transformation-sdk/pom.xml new file mode 100644 index 00000000..10d393ce --- /dev/null +++ b/transformation-sdk/pom.xml @@ -0,0 +1,180 @@ + + + 4.0.0 + transformation-sdk + org.sunbird.obsrv + 1.0.0 + jar + Obsrv Transformation Library as a SDK + + UTF-8 + UTF-8 + 2.12 + 2.12.11 + 1.15.2 + 2.8.1 + 11 + 1.9.13 + 1.4.0 + 2.14.1 + + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + + + org.json4s + json4s-native_${scala.maj.version} + 4.0.6 + + + com.ibm.jsonata4java + JSONata4Java + 2.2.6 + + + com.fasterxml.jackson.core + jackson-databind + + + + + com.github.bancolombia + data-mask-core + 1.0.1 + + + com.fasterxml.jackson.core + jackson-databind + + + + + org.scalatest + scalatest_2.12 + 3.0.6 + test + + + org.mockito + mockito-core + 3.3.3 + test + + + org.sunbird.obsrv + framework + 1.0.0 + test-jar + test + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + + + + + src/main/scala + src/test/scala + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 11 + + + + maven-surefire-plugin + 2.20 + + true + + + + + org.scalatest + scalatest-maven-plugin + 1.0 + + ${project.build.directory}/surefire-reports + . 
+ dp-core-testsuite.txt + + + + test + + test + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.2.0 + + + + test-jar + + + + + + + org.scoverage + scoverage-maven-plugin + ${scoverage.plugin.version} + + ${scala.version} + true + true + + + + + net.alchim31.maven + scala-maven-plugin + 4.4.0 + + ${java.target.runtime} + ${java.target.runtime} + ${scala.version} + false + + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + + + + diff --git a/transformation-sdk/src/main/resources/transformation-sdk.conf b/transformation-sdk/src/main/resources/transformation-sdk.conf new file mode 100644 index 00000000..e69de29b diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala similarity index 82% rename from pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala rename to transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala index 4cd37623..125872d0 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/EncryptTransformer.scala @@ -2,18 +2,24 @@ package org.sunbird.obsrv.transformer.types import com.fasterxml.jackson.databind.JsonNode import com.fasterxml.jackson.databind.node.MissingNode -import org.sunbird.obsrv.transformer.functions.TransformerFunctionHelper.JsonHelper -import org.sunbird.obsrv.transformer.util.CipherUtil -import org.json4s.{JValue, MappingException} +import org.json4s.{DefaultFormats, Formats, JValue, MappingException} import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.util.JSONUtil import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation +import org.sunbird.obsrv.transformer.util.CipherUtil class EncryptTransformer extends ITransformer { + implicit val jsonFormats: Formats = DefaultFormats.withLong private val logger = LoggerFactory.getLogger(classOf[EncryptTransformer]) + implicit class JsonHelper(json: JValue) { + def customExtract[T](path: String)(implicit mf: Manifest[T]): T = { + path.split('.').foldLeft(json)({ case (acc: JValue, node: String) => acc \ node }).extract[T] + } + } + override def transformField(json: JValue, jsonNode: JsonNode, dt: DatasetTransformation): (JValue, TransformFieldStatus) = { val emptyNode = getJSON(dt.fieldKey, MissingNode.getInstance()) try { diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala similarity index 95% rename from pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala rename to transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala index 2da86478..7fee60ca 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/ITransformer.scala @@ -2,17 +2,16 @@ package org.sunbird.obsrv.transformer.types import com.fasterxml.jackson.databind.JsonNode import com.fasterxml.jackson.databind.node.JsonNodeType -import org.sunbird.obsrv.transformer.util.ConditionEvaluator -import 
org.json4s.{JNothing, JObject, JValue} import org.json4s.native.JsonMethods.parse +import org.json4s.{JNothing, JObject, JValue} import org.sunbird.obsrv.core.model.ErrorConstants.Error import org.sunbird.obsrv.model.DatasetModels.DatasetTransformation import org.sunbird.obsrv.model.TransformMode.TransformMode +import org.sunbird.obsrv.transformer.util.ConditionEvaluator import scala.collection.mutable.ListBuffer case class TransformFieldStatus(fieldKey: String, expr: String, success: Boolean, mode: TransformMode, error: Option[Error] = None) -case class ConditionStatus(expr: String, success: Boolean, mode: Option[TransformMode] = None, error: Option[Error] = None) case class TransformationResult(json: JValue, fieldStatus: List[TransformFieldStatus]) abstract class ITransformer[T] { diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala similarity index 69% rename from pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala rename to transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala index ee7a4858..217f5c9c 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala @@ -8,6 +8,7 @@ import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.util.JSONUtil import org.sunbird.obsrv.model.DatasetModels +import org.sunbird.obsrv.model.DatasetModels.TransformationFunction class JSONAtaTransformer extends ITransformer { @@ -31,6 +32,24 @@ class JSONAtaTransformer extends ITransformer { (emptyNode, TransformFieldStatus(dt.fieldKey, dt.transformationFunction.expr, success = false, dt.mode.get, Some(ErrorConstants.ERR_UNKNOWN_TRANSFORM_EXCEPTION))) } } + + def evaluate(jsonNode: JsonNode, tf: TransformationFunction): JsonNode = { + + try { + val expr = Expressions.parse(tf.expr) + expr.evaluate(jsonNode) + } catch { + case ex1: ParseException => + logger.error(s"Transformer(JSONATA) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex1.getMessage}", ex1) + MissingNode.getInstance() + case ex2: EvaluateException => + logger.error(s"Transformer(JSONATA) | Exception evaluating transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex2.getMessage}", ex2) + MissingNode.getInstance() + case ex3: Exception => + logger.error(s"Transformer(JSONATA) | Unknown error | Data=${JSONUtil.serialize(dt)} | error=${ex3.getMessage}", ex3) + MissingNode.getInstance() + } + } } object JSONAtaTransformer { @@ -41,4 +60,8 @@ object JSONAtaTransformer { jsonAtaTransformer.transform(json, jsonNode, dtList) } + def evaluate(jsonNode: JsonNode, transformation: TransformationFunction): JsonNode = { + jsonAtaTransformer.evaluate(jsonNode, transformation) + } + } \ No newline at end of file diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala similarity index 97% rename from pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala rename to transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala index 8df5f889..045e224f 100644 
--- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/MaskTransformer.scala @@ -3,7 +3,7 @@ package org.sunbird.obsrv.transformer.types import co.com.bancolombia.datamask.{MaskUtils => CustomMaskUtils} import com.fasterxml.jackson.databind.JsonNode import com.fasterxml.jackson.databind.node.MissingNode -import org.json4s._ +import org.json4s.{DefaultFormats, Formats, JValue, MappingException} import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.util.JSONUtil diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala similarity index 99% rename from pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala rename to transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala index 3f7a93e0..58d489f6 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/CipherUtil.scala @@ -33,4 +33,4 @@ object CipherUtil { cipher } -} +} \ No newline at end of file diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala similarity index 93% rename from pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala rename to transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala index ee6a18a4..0a892b8f 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/util/ConditionEvaluator.scala @@ -2,13 +2,14 @@ package org.sunbird.obsrv.transformer.util import com.api.jsonata4java.expressions.{EvaluateException, Expressions, ParseException} import com.fasterxml.jackson.databind.JsonNode -import org.sunbird.obsrv.transformer.types.ConditionStatus import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.ErrorConstants.Error import org.sunbird.obsrv.core.model.ErrorConstants import org.sunbird.obsrv.core.util.JSONUtil import org.sunbird.obsrv.model.DatasetModels.Condition import org.sunbird.obsrv.model.TransformMode.TransformMode +case class ConditionStatus(expr: String, success: Boolean, mode: Option[TransformMode] = None, error: Option[Error] = None) object ConditionEvaluator { private val logger = LoggerFactory.getLogger(ConditionEvaluator.getClass) diff --git a/transformation-sdk/src/test/resources/test.conf b/transformation-sdk/src/test/resources/test.conf new file mode 100644 index 00000000..e69de29b From ded2fde8e12900ee1be6cb2c6b6bc196319fe04d Mon Sep 17 00:00:00 2001 From: Santhosh Vasabhaktula Date: Thu, 20 Jun 2024 19:18:17 +0530 Subject: [PATCH 35/37] feat #0001 - Ability to generate denorm field via jsonata transformation --- .../obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala | 4 ++-- pipeline/master-data-processor/src/test/resources/test.conf | 2 +- .../pipeline/MasterDataProcessorStreamTaskTestSpec.scala | 3 ++- .../obsrv/transformer/task/TransformerStreamTask.scala | 1 - .../obsrv/transformer/types/JSONAtaTransformer.scala | 6 +++--- 5 files changed, 8 
insertions(+), 8 deletions(-) diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala index bd9658eb..d4d5bc30 100644 --- a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala +++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala @@ -71,7 +71,7 @@ class DenormalizerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"denorm_key":"dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';") + postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"jsonata_expr":"$$.dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';") val redisConnection = new RedisConnect(denormConfig.redisHost, denormConfig.redisPort, denormConfig.redisConnectionTimeout) redisConnection.getConnection(3).set("HYUN-CRE-D6", EventFixture.DENORM_DATA_1) redisConnection.getConnection(4).set("D123", EventFixture.DENORM_DATA_2) @@ -118,7 +118,7 @@ class DenormalizerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { val denormCache = new DenormCache(denormConfig) noException should be thrownBy { denormCache.open(Dataset(id = "d123", datasetType = "dataset", extractionConfig = None, dedupConfig = None, validationConfig = None, jsonSchema = None, - denormConfig = Some(DenormConfig(redisDBHost = "localhost", redisDBPort = redisPort, denormFields = List(DenormFieldConfig(denormKey = "vehicleCode", redisDB = 3, denormOutField = "vehicle_data")))), routerConfig = RouterConfig(""), + denormConfig = Some(DenormConfig(redisDBHost = "localhost", redisDBPort = redisPort, denormFields = List(DenormFieldConfig(denormKey = Some("vehicleCode"), redisDB = 3, denormOutField = "vehicle_data", jsonAtaExpr = None)))), routerConfig = RouterConfig(""), datasetConfig = DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest"), status = DatasetStatus.Live)) } } diff --git a/pipeline/master-data-processor/src/test/resources/test.conf b/pipeline/master-data-processor/src/test/resources/test.conf index dfb54e4b..a20636d0 100644 --- a/pipeline/master-data-processor/src/test/resources/test.conf +++ b/pipeline/master-data-processor/src/test/resources/test.conf @@ -26,7 +26,7 @@ kafka { } task { - window.time.in.seconds = 5 + window.time.in.seconds = 2 window.count = 2 window.shards = 1400 consumer.parallelism = 1 diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala index 1bafb519..5d4545c4 100644 --- a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala +++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala @@ -96,6 +96,7 @@ 
class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry task.process(env) Future { env.execute(masterDataConfig.jobName) + Thread.sleep(5000) } val input = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 7, timeout = 30.seconds) @@ -120,7 +121,7 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry val mutableMetricsMap = mutable.Map[String, Long](); BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) - Console.println("### MasterDataProcessorStreamTaskTestSpec:metrics ###", getPrintableMetrics(mutableMetricsMap)) + Console.println("### MasterDataProcessorStreamTaskTestSpec:metrics ###", JSONUtil.serialize(getPrintableMetrics(mutableMetricsMap))) masterDataConfig.successTag().getId should be ("processing_stats") diff --git a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala index 1bd31fb1..eee8cee2 100644 --- a/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala +++ b/pipeline/transformer/src/main/scala/org/sunbird/obsrv/transformer/task/TransformerStreamTask.scala @@ -6,7 +6,6 @@ import org.apache.flink.streaming.api.datastream.DataStream import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} import org.sunbird.obsrv.core.util.FlinkUtil -import org.sunbird.obsrv.model.TransformType import org.sunbird.obsrv.transformer.functions.TransformerFunction import java.io.File diff --git a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala index 217f5c9c..d6a55b54 100644 --- a/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala +++ b/transformation-sdk/src/main/scala/org/sunbird/obsrv/transformer/types/JSONAtaTransformer.scala @@ -40,13 +40,13 @@ class JSONAtaTransformer extends ITransformer { expr.evaluate(jsonNode) } catch { case ex1: ParseException => - logger.error(s"Transformer(JSONATA) | Exception parsing transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex1.getMessage}", ex1) + logger.error(s"Transformer(JSONATA) | Exception parsing transformation expression | Data=${JSONUtil.serialize(tf)} | error=${ex1.getMessage}", ex1) MissingNode.getInstance() case ex2: EvaluateException => - logger.error(s"Transformer(JSONATA) | Exception evaluating transformation expression | Data=${JSONUtil.serialize(dt)} | error=${ex2.getMessage}", ex2) + logger.error(s"Transformer(JSONATA) | Exception evaluating transformation expression | Data=${JSONUtil.serialize(tf)} | error=${ex2.getMessage}", ex2) MissingNode.getInstance() case ex3: Exception => - logger.error(s"Transformer(JSONATA) | Unknown error | Data=${JSONUtil.serialize(dt)} | error=${ex3.getMessage}", ex3) + logger.error(s"Transformer(JSONATA) | Unknown error | Data=${JSONUtil.serialize(tf)} | error=${ex3.getMessage}", ex3) MissingNode.getInstance() } } From 49043d71769e1f761b0d905d9132b0a36aec16ac Mon Sep 17 00:00:00 2001 From: Santhosh Vasabhaktula Date: Thu, 27 Jun 2024 16:49:25 +0530 Subject: [PATCH 36/37] feat: v2 Refactoring. Following are the changes done: 1. 
removed the separation logic of master-dataset and dataset 2. Merged both pipelines into one. 3. Created cache indexer to index data into redis for master datasets similar to Hudi 4. Upgraded the dataset config to the newer version 5. Move the entry_topic as a separate field. This is to enable creation of multiple pipelines in the future --- .github/workflows/build_and_deploy.yaml | 2 + Dockerfile | 4 + .../MasterDataProcessorIndexer.scala | 7 +- .../sunbird/spec/MasterDataIndexerSpec.scala | 10 +- .../sunbird/obsrv/model/DatasetModels.scala | 41 ++- .../obsrv/registry/DatasetRegistry.scala | 13 +- .../service/DatasetRegistryService.scala | 29 +- .../BaseDatasetProcessFunction.scala | 4 +- .../spec/BaseSpecWithDatasetRegistry.scala | 12 +- .../obsrv/spec/TestDatasetRegistrySpec.scala | 6 +- .../sunbird/obsrv/core/model/Constants.scala | 1 - .../obsrv/core/streaming/BaseJobConfig.scala | 2 - .../spec/BaseProcessFunctionTestSpec.scala | 1 - .../org/sunbird/spec/ModelsTestSpec.scala | 1 - pipeline/cache-indexer/pom.xml | 260 ++++++++++++++++++ .../src/main/resources/cache-indexer.conf | 15 + .../MasterDataProcessorFunction.scala | 56 ++++ .../obsrv/streaming/CacheIndexerConfig.scala | 33 +++ .../streaming/CacheIndexerStreamTask.scala | 61 ++++ .../sunbird/obsrv/util/MasterDataCache.scala | 58 ++++ .../src/test/resources/base-config.conf | 8 + .../src/test/resources/test.conf | 20 ++ .../sunbird/obsrv/fixture/EventFixture.scala | 10 + .../CacheIndexerStreamTaskTestSpec.scala | 142 ++++++++++ .../{druid-router => dataset-router}/pom.xml | 8 +- .../src/main/resources/dataset-router.conf} | 0 .../functions/DynamicRouterFunction.scala | 15 +- .../router/task/DynamicRouterConfig.scala} | 2 +- .../router/task/DynamicRouterStreamTask.scala | 6 +- .../src/test/resources/test.conf | 0 .../DynamicRouterStreamTaskTestSpec.scala | 6 +- .../sunbird/obsrv/router/EventFixture.scala | 0 .../obsrv/router/TestTimestampKeyParser.scala | 57 ++-- .../functions/DenormalizerFunction.scala | 2 +- .../DenormalizerWindowFunction.scala | 2 +- .../DenormalizerStreamTaskTestSpec.scala | 2 +- ...DenormalizerWindowStreamTaskTestSpec.scala | 2 +- .../functions/DruidRouterFunction.scala | 58 ---- .../router/task/DruidRouterStreamTask.scala | 72 ----- .../functions/ExtractionFunction.scala | 2 +- .../extractor/ExtractorStreamTestSpec.scala | 2 +- .../RowDataConverterFunction.scala | 2 +- .../streaming/HudiConnectorStreamTask.scala | 2 +- pipeline/master-data-processor/pom.xml | 2 +- .../MasterDataProcessorFunction.scala | 7 +- .../task/MasterDataProcessorStreamTask.scala | 4 +- .../obsrv/pipeline/util/MasterDataCache.scala | 12 +- .../src/test/resources/test.conf | 2 +- ...asterDataProcessorStreamTaskTestSpec.scala | 10 +- pipeline/pom.xml | 3 +- .../functions/EventValidationFunction.scala | 2 +- .../PipelinePreprocessorStreamTestSpec.scala | 12 +- .../preprocessor/TestSchemaValidator.scala | 26 +- .../TestTransformerFunctionHelper.scala | 52 ++-- .../TransformerStreamTestSpec.scala | 14 +- pipeline/unified-pipeline/pom.xml | 2 +- .../task/UnifiedPipelineStreamTask.scala | 4 +- 57 files changed, 886 insertions(+), 302 deletions(-) create mode 100644 pipeline/cache-indexer/pom.xml create mode 100644 pipeline/cache-indexer/src/main/resources/cache-indexer.conf create mode 100644 pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/function/MasterDataProcessorFunction.scala create mode 100644 pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerConfig.scala create mode 100644 
pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala create mode 100644 pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala create mode 100644 pipeline/cache-indexer/src/test/resources/base-config.conf create mode 100644 pipeline/cache-indexer/src/test/resources/test.conf create mode 100644 pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala create mode 100644 pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/pipeline/CacheIndexerStreamTaskTestSpec.scala rename pipeline/{druid-router => dataset-router}/pom.xml (97%) rename pipeline/{druid-router/src/main/resources/druid-router.conf => dataset-router/src/main/resources/dataset-router.conf} (100%) rename pipeline/{druid-router => dataset-router}/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala (90%) rename pipeline/{druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala => dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterConfig.scala} (92%) rename pipeline/{druid-router => dataset-router}/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala (89%) rename pipeline/{druid-router => dataset-router}/src/test/resources/test.conf (100%) rename pipeline/{druid-router => dataset-router}/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala (97%) rename pipeline/{druid-router => dataset-router}/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala (100%) rename pipeline/{druid-router => dataset-router}/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala (51%) delete mode 100644 pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DruidRouterFunction.scala delete mode 100644 pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala rename pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/{functions => function}/RowDataConverterFunction.scala (98%) diff --git a/.github/workflows/build_and_deploy.yaml b/.github/workflows/build_and_deploy.yaml index 05defdf1..7f285253 100644 --- a/.github/workflows/build_and_deploy.yaml +++ b/.github/workflows/build_and_deploy.yaml @@ -33,6 +33,8 @@ jobs: target: "master-data-processor-image" - image: "lakehouse-connector" target: "lakehouse-connector-image" + - image: "cache-indexer" + target: "cache-indexer-image" steps: - uses: actions/checkout@v4 with: diff --git a/Dockerfile b/Dockerfile index e392b619..bb103b2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,3 +41,7 @@ FROM --platform=linux/x86_64 sanketikahub/flink:1.15.0-scala_2.12-lakehouse as l USER flink RUN mkdir $FLINK_HOME/custom-lib COPY ./pipeline/hudi-connector/target/hudi-connector-1.0.0.jar $FLINK_HOME/custom-lib + +FROM --platform=linux/x86_64 sanketikahub/flink:1.15.2-scala_2.12-jdk-11 as cache-indexer-image +USER flink +COPY --from=build-pipeline /app/pipeline/cache-indexer/target/cache-indexer-1.0.0.jar $FLINK_HOME/lib \ No newline at end of file diff --git a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala index 22729aa0..781b916a 100644 --- a/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala +++ b/data-products/src/main/scala/org/sunbird/obsrv/dataproducts/MasterDataProcessorIndexer.scala @@ -69,7 +69,8 @@ object MasterDataProcessorIndexer { 
logger.info(s"createDataFile() | START | dataset=${dataset.id} ") import spark.implicits._ val readWriteConf = ReadWriteConfig(scanCount = config.getInt("redis.scan.count"), maxPipelineSize = config.getInt("redis.max.pipeline.size")) - val redisConfig = new RedisConfig(initialHost = RedisEndpoint(host = dataset.datasetConfig.redisDBHost.get, port = dataset.datasetConfig.redisDBPort.get, dbNum = dataset.datasetConfig.redisDB.get)) + val cacheConfig = dataset.datasetConfig.cacheConfig.get + val redisConfig = new RedisConfig(initialHost = RedisEndpoint(host = cacheConfig.redisDBHost.get, port = cacheConfig.redisDBPort.get, dbNum = cacheConfig.redisDB.get)) val ts: Long = new DateTime(DateTimeZone.UTC).withTimeAtStartOfDay().getMillis val rdd = spark.sparkContext.fromRedisKV("*")(redisConfig = redisConfig, readWriteConfig = readWriteConf).map( f => CommonUtil.processEvent(f._2, ts) @@ -83,9 +84,9 @@ object MasterDataProcessorIndexer { } private def getDatasets(): List[Dataset] = { - val datasets: List[Dataset] = DatasetRegistry.getAllDatasets("master-dataset") + val datasets: List[Dataset] = DatasetRegistry.getAllDatasets(Some("master")) datasets.filter(dataset => { - dataset.datasetConfig.indexData.nonEmpty && dataset.datasetConfig.indexData.get && dataset.status == DatasetStatus.Live + dataset.datasetConfig.indexingConfig.olapStoreEnabled && dataset.status == DatasetStatus.Live }) } diff --git a/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala b/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala index 0d54050e..6ef1458e 100644 --- a/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala +++ b/data-products/src/test/scala/org/sunbird/spec/MasterDataIndexerSpec.scala @@ -108,16 +108,16 @@ class MasterDataIndexerSpec extends FlatSpec with BeforeAndAfterAll with Matcher } private def insertTestData(postgresConnect: PostgresConnect) = { - postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md1','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on 
this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md1','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 
'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md1_md1.1_DAY', 'md1', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md1.1_DAY', 'md1.1_DAY');") postgresConnect.execute("insert into dataset_transformations values('tf1', 'md1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Live', 'Strict', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md2','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\":6340, \"index_data\": true, \"redis_db\": 5}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', 'now()', 'now()');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md2','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", 
\"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\":6340, \"index_data\": true, \"redis_db\": 5}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', 'now()', 'now()');") postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md2_md1.1_DAY', 'md2', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md2.1_DAY', 'md2.1_DAY');") - postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md3','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, 
\"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 6}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md3','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, 
\"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 6}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md3_md3.1_DAY', 'md3', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md3.1_DAY', 'md3.1_DAY');") postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md3_md3.2_DAY', 'md3', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": 
\"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md3.2_DAY', 'md3.2_DAY');") - postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, updated_by, created_date, updated_date) VALUES('md5','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 9}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") - postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, created_by, 
updated_by, created_date, updated_date) VALUES('md4','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md5','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": 
{\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 9}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") + postgresConnect.execute("insert into datasets(id, type, validation_config, extraction_config, dedup_config, data_schema, denorm_config, router_config, dataset_config, tags, data_version, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES('md4','master-dataset', '{\"validate\": true, \"mode\": \"Strict\", \"validation_mode\": {}}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 1036800}, \"batch_id\": \"id\"}', '{\"drop_duplicates\": true, \"dedup_key\": \"device_id\", \"dedup_period\": 1036800}', '{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"fcm_token\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"device_id\": {\"type\": \"string\"}, \"device_spec\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\"}, \"uaspec\": {\"type\": \"object\", \"properties\": {\"agent\": {\"type\": \"string\"}, \"ver\": {\"type\": \"string\"}, \"system\": {\"type\": \"string\"}, \"raw\": {\"type\": \"string\"}}}, \"country\": {\"type\": \"string\"}, \"country_code\": {\"type\": \"string\"}, \"producer_id\": {\"type\": \"string\"}, \"state_code_custom\": {\"type\": \"integer\"}, \"state_code\": {\"type\": \"string\"}, \"state_custom\": {\"type\": \"string\"}, \"district_custom\": {\"type\": \"string\"}, \"first_access\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''first_access'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": \"LOW\"}]}, \"api_last_updated_on\": {\"type\": \"integer\", \"format\": \"date-time\", \"suggestions\": [{\"message\": \"The Property ''api_last_updated_on'' appears to be ''date-time'' format type.\", \"advice\": \"The System can index all data on this column\", \"resolutionType\": \"INDEX\", \"severity\": 
\"LOW\"}]}, \"user_declared_district\": {\"type\": \"string\"}, \"user_declared_state\": {\"type\": \"string\"}}, \"required\": [\"first_access\", \"api_last_updated_on\", \"device_id\"]}', '{\"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"denorm_fields\": []}', '{\"topic\": \",d1\"}', '{\"data_key\": \"device_id\", \"timestamp_key\": \"\", \"exclude_fields\": [], \"entry_topic\": \"local.masterdata.ingest\", \"redis_db_host\": \"localhost\", \"redis_db_port\": 6340, \"index_data\": true, \"redis_db\": 3}', NULL, NULL, 'Live', 'v1', 'local.masterdata.ingest', 'SYSTEM', 'SYSTEM', '2023-10-04 06:44:11.600', '2023-10-04 06:44:11.600');") postgresConnect.execute("insert into datasources(id, dataset_id, ingestion_spec, datasource, datasource_ref) VALUES('md4_md4.1_DAY', 'md4', '{\"type\": \"kafka\",\"spec\": {\"dataSchema\": {\"dataSource\": \"telemetry-device-data.1_DAY\",\"dimensionsSpec\": {\"dimensions\": [{\"type\": \"string\",\"name\": \"fcm_token\"},{\"type\": \"string\",\"name\": \"city\"},{\"type\": \"string\",\"name\": \"device_id\"},{\"type\": \"string\",\"name\": \"device_spec\"},{\"type\": \"string\",\"name\": \"state\"},{\"type\": \"string\",\"name\": \"uaspec_agent\"}]},\"timestampSpec\": {\"column\": \"syncts\",\"format\": \"auto\"},\"metricsSpec\": [{\"type\": \"doubleSum\",\"name\": \"state_code_custom\",\"fieldName\": \"state_code_custom\"}],\"granularitySpec\": {\"type\": \"uniform\",\"segmentGranularity\": \"DAY\",\"rollup\": false}},\"tuningConfig\": {\"type\": \"kafka\",\"maxBytesInMemory\": 134217728,\"maxRowsPerSegment\": 500000,\"logParseExceptions\": true},\"ioConfig\": {\"type\": \"kafka\",\"topic\": \"telemetry-device-data\",\"consumerProperties\": {\"bootstrap.servers\": \"localhost:9092\"},\"taskCount\": 1,\"replicas\": 1,\"taskDuration\": \"PT1H\",\"useEarliestOffset\": true,\"completionTimeout\": \"PT1H\",\"inputFormat\": {\"type\": \"json\",\"flattenSpec\": {\"useFieldDiscovery\": true,\"fields\": [{ \"type\": \"path\",\"expr\": \"$.fcm_token\",\"name\": \"fcm_token\"},{\"type\": \"path\",\"expr\": \"$.city\",\"name\": \"city\"},{\"type\": \"path\",\"expr\": \"$.device_id\",\"name\": \"device_id\"},{\"type\": \"path\",\"expr\": \"$.device_spec\",\"name\": \"device_spec\"},{\"type\": \"path\",\"expr\": \"$.state\",\"name\": \"state\"},{\"type\": \"path\",\"expr\": \"$.uaspec.agent\",\"name\": \"uaspec_agent\"}]}},\"appendToExisting\": false}}}', 'md4.1_DAY', 'md4.1_DAY');") } diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala index e8affc0a..8bc1623b 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/model/DatasetModels.scala @@ -4,7 +4,7 @@ import com.fasterxml.jackson.annotation.JsonProperty import com.fasterxml.jackson.core.`type`.TypeReference import com.fasterxml.jackson.module.scala.JsonScalaEnumeration import org.sunbird.obsrv.core.model.SystemConfig -import org.sunbird.obsrv.model.DatasetStatus.DatasetStatus +import org.sunbird.obsrv.model.DatasetStatus.{DatasetStatus, Value} import org.sunbird.obsrv.model.TransformMode.TransformMode import org.sunbird.obsrv.model.ValidationMode.ValidationMode @@ -33,17 +33,34 @@ object DatasetModels { case class RouterConfig(@JsonProperty("topic") topic: String) - case class DatasetConfig(@JsonProperty("data_key") key: String, @JsonProperty("timestamp_key") tsKey: String, @JsonProperty("entry_topic") 
entryTopic: String, - @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None, @JsonProperty("redis_db_host") redisDBHost: Option[String] = None, - @JsonProperty("redis_db_port") redisDBPort: Option[Int] = None, @JsonProperty("redis_db") redisDB: Option[Int] = None, - @JsonProperty("index_data") indexData: Option[Boolean] = None, @JsonProperty("timestamp_format") tsFormat: Option[String] = None, - @JsonProperty("dataset_tz") datasetTimezone: Option[String] = None) + case class IndexingConfig(@JsonProperty("olap_store_enabled") olapStoreEnabled: Boolean, @JsonProperty("lakehouse_enabled") lakehouseEnabled: Boolean, + @JsonProperty("cache_enabled") cacheEnabled: Boolean) + + case class KeysConfig(@JsonProperty("data_key") dataKey: Option[String], @JsonProperty("partition_key") partitionKey: Option[String], + @JsonProperty("timestamp_key") tsKey: Option[String], @JsonProperty("timestamp_format") tsFormat: Option[String]) + + case class CacheConfig(@JsonProperty("redis_db_host") redisDBHost: Option[String], @JsonProperty("redis_db_port") redisDBPort: Option[Int], + @JsonProperty("redis_db") redisDB: Option[Int]) + + case class DatasetConfigV1(@JsonProperty("data_key") key: String, @JsonProperty("timestamp_key") tsKey: String, @JsonProperty("entry_topic") entryTopic: String, + @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None, @JsonProperty("redis_db_host") redisDBHost: Option[String] = None, + @JsonProperty("redis_db_port") redisDBPort: Option[Int] = None, @JsonProperty("redis_db") redisDB: Option[Int] = None, + @JsonProperty("index_data") indexData: Option[Boolean] = None, @JsonProperty("timestamp_format") tsFormat: Option[String] = None, + @JsonProperty("dataset_tz") datasetTimezone: Option[String] = None) + + case class DatasetConfig(@JsonProperty("indexing_config") indexingConfig: IndexingConfig, + @JsonProperty("keys_config") keysConfig: KeysConfig, + @JsonProperty("exclude_fields") excludeFields: Option[List[String]] = None, + @JsonProperty("dataset_tz") datasetTimezone: Option[String] = None, + @JsonProperty("cache_config") cacheConfig: Option[CacheConfig] = None) case class Dataset(@JsonProperty("id") id: String, @JsonProperty("type") datasetType: String, @JsonProperty("extraction_config") extractionConfig: Option[ExtractionConfig], @JsonProperty("dedup_config") dedupConfig: Option[DedupConfig], @JsonProperty("validation_config") validationConfig: Option[ValidationConfig], @JsonProperty("data_schema") jsonSchema: Option[String], @JsonProperty("denorm_config") denormConfig: Option[DenormConfig], - @JsonProperty("router_config") routerConfig: RouterConfig, datasetConfig: DatasetConfig, @JsonProperty("status") @JsonScalaEnumeration(classOf[DatasetStatusType]) status: DatasetStatus, - @JsonProperty("tags") tags: Option[Array[String]] = None, @JsonProperty("data_version") dataVersion: Option[Int] = None) + @JsonProperty("router_config") routerConfig: RouterConfig, datasetConfig: DatasetConfig, + @JsonProperty("status") @JsonScalaEnumeration(classOf[DatasetStatusType]) status: DatasetStatus, + @JsonProperty("entry_topic") entryTopic: String, @JsonProperty("tags") tags: Option[Array[String]] = None, + @JsonProperty("data_version") dataVersion: Option[Int] = None, @JsonProperty("api_version") apiVersion: Option[String] = None) case class Condition(@JsonProperty("type") `type`: String, @JsonProperty("expr") expr: String) @@ -51,7 +68,7 @@ object DatasetModels { case class DatasetTransformation(@JsonProperty("id") id: String, 
@JsonProperty("dataset_id") datasetId: String, @JsonProperty("field_key") fieldKey: String, @JsonProperty("transformation_function") transformationFunction: TransformationFunction, - @JsonProperty("status") status: String, @JsonProperty("mode") @JsonScalaEnumeration(classOf[TransformModeType]) mode: Option[TransformMode] = Some(TransformMode.Strict)) + @JsonProperty("mode") @JsonScalaEnumeration(classOf[TransformModeType]) mode: Option[TransformMode] = Some(TransformMode.Strict)) case class ConnectorConfig(@JsonProperty("kafkaBrokers") kafkaBrokers: String, @JsonProperty("topic") topic: String, @JsonProperty("type") databaseType: String, @JsonProperty("connection") connection: Connection, @JsonProperty("tableName") tableName: String, @JsonProperty("databaseName") databaseName: String, @@ -94,4 +111,10 @@ class DatasetStatusType extends TypeReference[DatasetStatus.type] object DatasetStatus extends Enumeration { type DatasetStatus = Value val Draft, Publish, Live, Retired, Purged = Value +} + +class DatasetTypeType extends TypeReference[DatasetType.type] +object DatasetType extends Enumeration { + type DatasetType = Value + val event, transaction, master = Value } \ No newline at end of file diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala index 08921adc..0945fa58 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/registry/DatasetRegistry.scala @@ -12,9 +12,14 @@ object DatasetRegistry { datasets ++= DatasetRegistryService.readAllDatasets() lazy private val datasetTransformations: Map[String, List[DatasetTransformation]] = DatasetRegistryService.readAllDatasetTransformations() - def getAllDatasets(datasetType: String): List[Dataset] = { + def getAllDatasets(datasetType: Option[String]): List[Dataset] = { val datasetList = DatasetRegistryService.readAllDatasets() - datasetList.filter(f => f._2.datasetType.equals(datasetType)).values.toList + if(datasetType.isDefined) { + datasetList.filter(f => f._2.datasetType.equals(datasetType.get)).values.toList + } else { + datasetList.values.toList + } + } def getDataset(id: String): Option[Dataset] = { @@ -47,8 +52,8 @@ object DatasetRegistry { datasourceList.getOrElse(List()) } - def getDataSetIds(datasetType: String): List[String] = { - datasets.filter(f => f._2.datasetType.equals(datasetType)).keySet.toList + def getDataSetIds(): List[String] = { + datasets.keySet.toList } def updateDatasourceRef(datasource: DataSource, datasourceRef: String): Int = { diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala index e5206118..d22a6128 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/service/DatasetRegistryService.scala @@ -119,6 +119,8 @@ object DatasetRegistryService { Option(Iterator.continually((rs, rs.next)).takeWhile(f => f._2).map(f => f._1).map(result => { parseDatasource(result) }).toList) + } finally { + postgresConnect.closeConnection() } } @@ -130,7 +132,7 @@ object DatasetRegistryService { def updateConnectorStats(id: String, lastFetchTimestamp: Timestamp, records: Long): Int = { val query = s"UPDATE dataset_source_config SET connector_stats = coalesce(connector_stats, '{}')::jsonb 
|| " + s"jsonb_build_object('records', COALESCE(connector_stats->>'records', '0')::int + '$records'::int) || " + - s"jsonb_build_object('last_fetch_timestamp', '${lastFetchTimestamp}'::timestamp) || " + + s"jsonb_build_object('last_fetch_timestamp', '$lastFetchTimestamp'::timestamp) || " + s"jsonb_build_object('last_run_timestamp', '${new Timestamp(System.currentTimeMillis())}'::timestamp) WHERE id = '$id';" updateRegistry(query) } @@ -163,11 +165,25 @@ object DatasetRegistryService { val jsonSchema = rs.getString("data_schema") val denormConfig = rs.getString("denorm_config") val routerConfig = rs.getString("router_config") - val datasetConfig = rs.getString("dataset_config") + val datasetConfigStr = rs.getString("dataset_config") val status = rs.getString("status") val tagArray = rs.getArray("tags") val tags = if (tagArray != null) tagArray.getArray.asInstanceOf[Array[String]] else null val dataVersion = rs.getInt("data_version") + val apiVersion = rs.getString("api_version") + val entryTopic = rs.getString("entry_topic") + + val datasetConfig: DatasetConfig = if ("v2".equalsIgnoreCase(apiVersion)) { + JSONUtil.deserialize[DatasetConfig](datasetConfigStr) + } else { + val v1Config = JSONUtil.deserialize[DatasetConfigV1](datasetConfigStr) + DatasetConfig( + indexingConfig = IndexingConfig(olapStoreEnabled = true, lakehouseEnabled = false, cacheEnabled = if ("master".equalsIgnoreCase(datasetType)) true else false), + keysConfig = KeysConfig(dataKey = Some(v1Config.key), None, tsKey = Some(v1Config.tsKey), None), + excludeFields = v1Config.excludeFields, datasetTimezone = v1Config.datasetTimezone, + cacheConfig = Some(CacheConfig(redisDBHost = v1Config.redisDBHost, redisDBPort = v1Config.redisDBPort, redisDB = v1Config.redisDB)) + ) + } Dataset(datasetId, datasetType, if (extractionConfig == null) None else Some(JSONUtil.deserialize[ExtractionConfig](extractionConfig)), @@ -176,10 +192,12 @@ object DatasetRegistryService { Option(jsonSchema), if (denormConfig == null) None else Some(JSONUtil.deserialize[DenormConfig](denormConfig)), JSONUtil.deserialize[RouterConfig](routerConfig), - JSONUtil.deserialize[DatasetConfig](datasetConfig), + datasetConfig, DatasetStatus.withName(status), + entryTopic, Option(tags), - Option(dataVersion) + Option(dataVersion), + Option(apiVersion) ) } @@ -214,10 +232,9 @@ object DatasetRegistryService { val datasetId = rs.getString("dataset_id") val fieldKey = rs.getString("field_key") val transformationFunction = rs.getString("transformation_function") - val status = rs.getString("status") val mode = rs.getString("mode") - DatasetTransformation(id, datasetId, fieldKey, JSONUtil.deserialize[TransformationFunction](transformationFunction), status, Some(if (mode != null) TransformMode.withName(mode) else TransformMode.Strict)) + DatasetTransformation(id, datasetId, fieldKey, JSONUtil.deserialize[TransformationFunction](transformationFunction), Some(if (mode != null) TransformMode.withName(mode) else TransformMode.Strict)) } } \ No newline at end of file diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala index 4e992eba..fede2b54 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala +++ b/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala @@ -74,7 +74,7 @@ abstract class BaseDatasetProcessFunction(config: 
BaseJobConfig[mutable.Map[Stri override def getMetricsList(): MetricsList = { val metrics = getMetrics() ++ List(config.eventFailedMetricsCount) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + MetricsList(DatasetRegistry.getDataSetIds(), metrics) } private def initMetrics(datasetId: String): Unit = { @@ -138,7 +138,7 @@ abstract class BaseDatasetWindowProcessFunction(config: BaseJobConfig[mutable.Ma override def getMetricsList(): MetricsList = { val metrics = getMetrics() ++ List(config.eventFailedMetricsCount) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + MetricsList(DatasetRegistry.getDataSetIds(), metrics) } private def initMetrics(datasetId: String): Unit = { diff --git a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala index 1b3edea0..53a40ddd 100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/BaseSpecWithDatasetRegistry.scala @@ -35,18 +35,18 @@ class BaseSpecWithDatasetRegistry extends BaseSpecWithPostgres { private def createSchema(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasets ( id text PRIMARY KEY, type text NOT NULL, validation_config json, extraction_config json, dedup_config json, data_schema json, denorm_config json, router_config json NOT NULL, dataset_config json NOT NULL, status text NOT NULL, tags text[], data_version INT, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL );") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasets ( id text PRIMARY KEY, type text NOT NULL, validation_config json, extraction_config json, dedup_config json, data_schema json, denorm_config json, router_config json NOT NULL, dataset_config json NOT NULL, status text NOT NULL, tags text[], data_version INT, api_version VARCHAR(255) NOT NULL, entry_topic TEXT NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date timestamp NOT NULL, updated_date timestamp NOT NULL );") postgresConnect.execute("CREATE TABLE IF NOT EXISTS datasources ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), type text NOT NULL, ingestion_spec json NOT NULL, datasource text NOT NULL, datasource_ref text NOT NULL, retention_period json, archival_policy json, purge_policy json, backup_config json NOT NULL, status text NOT NULL, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL );") - postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function json NOT NULL, status text NOT NULL, mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );") + postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_transformations ( id text PRIMARY KEY, dataset_id text REFERENCES datasets (id), field_key text NOT NULL, transformation_function json NOT NULL, mode text, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(field_key, dataset_id) );") postgresConnect.execute("CREATE TABLE IF NOT EXISTS dataset_source_config ( id text PRIMARY 
KEY, dataset_id text NOT NULL REFERENCES datasets (id), connector_type text NOT NULL, connector_config json NOT NULL, status text NOT NULL, connector_stats json, created_by text NOT NULL, updated_by text NOT NULL, created_date Date NOT NULL, updated_date Date NOT NULL, UNIQUE(connector_type, dataset_id) );") } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d1', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d1', 'event', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'v1', 'ingest', 'System', 'System', now(), now());") 
postgresConnect.execute("update datasets set denorm_config = '{\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"denorm_fields\":[{\"denorm_key\":\"vehicleCode\",\"redis_db\":2,\"denorm_out_field\":\"vehicleData\"}]}' where id='d1';") - postgresConnect.execute("insert into dataset_transformations values('tf1', 'd1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Live', 'Strict', 'System', 'System', now(), now());") - postgresConnect.execute("insert into dataset_transformations values('tf2', 'd1', 'dealer.maskedPhone', '{\"type\":\"mask\",\"expr\": \"dealer.phone\"}', 'Live', null, 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d2', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into dataset_transformations values('tf1', 'd1', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Strict', 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf2', 'd1', 'dealer.maskedPhone', '{\"type\":\"mask\",\"expr\": \"dealer.phone\"}', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d2', 'event', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") } def getPrintableMetrics(metricsMap: mutable.Map[String, Long]): Map[String, Map[String, Map[String, Long]]] = { diff --git 
a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala index 3d83552d..dcdcf402 100644 --- a/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala +++ b/dataset-registry/src/test/scala/org/sunbird/obsrv/spec/TestDatasetRegistrySpec.scala @@ -23,7 +23,7 @@ class TestDatasetRegistrySpec extends BaseSpecWithDatasetRegistry with Matchers d2Opt.get.denormConfig should be(None) val postgresConnect = new PostgresConnect(postgresConfig) - postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'event', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") postgresConnect.closeConnection() val d3Opt = DatasetRegistry.getDataset("d3") @@ -34,14 +34,14 @@ class TestDatasetRegistrySpec extends BaseSpecWithDatasetRegistry with Matchers val d4Opt = DatasetRegistry.getDataset("d4") d4Opt should be (None) - val allDatasets = DatasetRegistry.getAllDatasets("dataset") + val allDatasets = DatasetRegistry.getAllDatasets(Some("event")) allDatasets.size should be(3) val d1Tfs = DatasetRegistry.getDatasetTransformations("d1") d1Tfs should not be None d1Tfs.get.size should be(2) - val ids = DatasetRegistry.getDataSetIds("dataset").sortBy(f => f) + val ids = 
DatasetRegistry.getDataSetIds().sortBy(f => f) ids.head should be("d1") ids.apply(1) should be("d2") ids.apply(2) should be("d3") diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala index 466552dd..c20bb925 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/model/Constants.scala @@ -16,5 +16,4 @@ object Constants { val TOPIC = "topic" val MESSAGE = "message" val DATALAKE_TYPE = "datalake" - val MASTER_DATASET_TYPE = "master-dataset" } diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala index 9cbfe1e9..8b29b8c6 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseJobConfig.scala @@ -53,8 +53,6 @@ abstract class BaseJobConfig[T](val config: Config, val jobName: String) extends val checkpointingBaseUrl: Option[String] = if (config.hasPath("job.statebackend.base.url")) Option(config.getString("job.statebackend.base.url")) else None // Base Methods - def datasetType(): String = if (config.hasPath("dataset.type")) config.getString("dataset.type") else "dataset" - def inputTopic(): String def inputConsumer(): String diff --git a/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala index bac2b0ae..cdffa023 100644 --- a/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala +++ b/framework/src/test/scala/org/sunbird/spec/BaseProcessFunctionTestSpec.scala @@ -142,7 +142,6 @@ class BaseProcessFunctionTestSpec extends BaseSpecWithPostgres with Matchers { val metrics = Metrics(mutable.Map("test" -> new ConcurrentHashMap[String, AtomicLong]())) metrics.reset("test1", "m1") - bsConfig.datasetType() should be ("dataset") } "TestBaseStreamTask" should "validate the getMapDataStream method" in { diff --git a/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala b/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala index 4ca0ad5e..f85347dd 100644 --- a/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala +++ b/framework/src/test/scala/org/sunbird/spec/ModelsTestSpec.scala @@ -104,7 +104,6 @@ class ModelsTestSpec extends FlatSpec with Matchers { bsMapConfig.kafkaConsumerProperties() bsMapConfig.enableDistributedCheckpointing should be (None) bsMapConfig.checkpointingBaseUrl should be (None) - bsMapConfig.datasetType() should be ("master-dataset") val dsk = new DatasetKeySelector() dsk.getKey(mutable.Map("dataset" -> "d1".asInstanceOf[AnyRef])) should be ("d1") diff --git a/pipeline/cache-indexer/pom.xml b/pipeline/cache-indexer/pom.xml new file mode 100644 index 00000000..7d9ed5a8 --- /dev/null +++ b/pipeline/cache-indexer/pom.xml @@ -0,0 +1,260 @@ + + + 4.0.0 + + pipeline + org.sunbird.obsrv + 1.0 + + cache-indexer + 1.0.0 + Cache Indexer + + UTF-8 + 1.4.0 + + + + + org.apache.flink + flink-streaming-scala_${scala.maj.version} + ${flink.version} + provided + + + com.fasterxml.jackson.core + jackson-databind + + + + + org.sunbird.obsrv + framework + 1.0.0 + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + + + org.apache.kafka + kafka-clients + + + + + org.json4s + json4s-native_${scala.maj.version} + 4.0.6 + + + org.apache.kafka + kafka-clients + 
${kafka.version} + test + + + org.apache.kafka + kafka_${scala.maj.version} + ${kafka.version} + test + + + org.sunbird.obsrv + framework + 1.0.0 + test-jar + test + + + org.sunbird.obsrv + dataset-registry + 1.0.0 + test-jar + test + + + org.apache.flink + flink-test-utils + ${flink.version} + test + + + org.apache.flink + flink-runtime + ${flink.version} + test + tests + + + com.github.codemonstur + embedded-redis + 1.0.0 + test + + + org.apache.flink + flink-streaming-java + ${flink.version} + test + tests + + + org.scalatest + scalatest_2.12 + 3.0.6 + test + + + org.mockito + mockito-core + 3.3.3 + test + + + com.fiftyonred + mock-jedis + 0.4.0 + test + + + io.github.embeddedkafka + embedded-kafka_2.12 + 3.4.0 + test + + + io.zonky.test + embedded-postgres + 2.0.3 + test + + + + + src/main/scala + src/test/scala + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 11 + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.1 + + + + package + + shade + + + false + + + com.google.code.findbugs:jsr305 + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + core-site.xml + + + + + + org.sunbird.obsrv.streaming.CacheIndexerStreamTask + + + + reference.conf + + + + + + + + + net.alchim31.maven + scala-maven-plugin + 4.4.0 + + ${java.target.runtime} + ${java.target.runtime} + ${scala.version} + false + + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + + maven-surefire-plugin + 2.22.2 + + true + + + + + org.scalatest + scalatest-maven-plugin + 1.0 + + ${project.build.directory}/surefire-reports + . + cache-indexer-testsuite.txt + + + + test + + test + + + + + + org.scoverage + scoverage-maven-plugin + ${scoverage.plugin.version} + + ${scala.version} + true + true + + + + + diff --git a/pipeline/cache-indexer/src/main/resources/cache-indexer.conf b/pipeline/cache-indexer/src/main/resources/cache-indexer.conf new file mode 100644 index 00000000..58a9c9d1 --- /dev/null +++ b/pipeline/cache-indexer/src/main/resources/cache-indexer.conf @@ -0,0 +1,15 @@ +include "baseconfig.conf" + +kafka { + output.failed.topic = ${job.env}".masterdata.failed" + groupId = ${job.env}"-cache-indexer-group" + producer { + max-request-size = 5242880 + } +} + +task { + window.time.in.seconds = 5 + window.count = 30 + window.shards = 1400 +} \ No newline at end of file diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/function/MasterDataProcessorFunction.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/function/MasterDataProcessorFunction.scala new file mode 100644 index 00000000..bbab5307 --- /dev/null +++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/function/MasterDataProcessorFunction.scala @@ -0,0 +1,56 @@ +package org.sunbird.obsrv.function + +import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.functions.ProcessFunction +import org.json4s.native.JsonMethods._ +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.model.{ErrorConstants, FunctionalError, Producer} +import org.sunbird.obsrv.core.streaming.Metrics +import org.sunbird.obsrv.core.util.JSONUtil +import org.sunbird.obsrv.model.DatasetModels.Dataset +import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig +import org.sunbird.obsrv.registry.DatasetRegistry +import org.sunbird.obsrv.streaming.BaseDatasetProcessFunction +import org.sunbird.obsrv.util.MasterDataCache + +import scala.collection.mutable + +class 
MasterDataProcessorFunction(config: CacheIndexerConfig) extends BaseDatasetProcessFunction(config) { + + private[this] val logger = LoggerFactory.getLogger(classOf[MasterDataProcessorFunction]) + private[this] var masterDataCache: MasterDataCache = _ + + override def open(parameters: Configuration): Unit = { + super.open(parameters) + masterDataCache = new MasterDataCache(config) + masterDataCache.open(DatasetRegistry.getAllDatasets(Some("master"))) + } + + override def close(): Unit = { + super.close() + masterDataCache.close() + } + + override def getMetrics(): List[String] = { + List(config.successEventCount, config.systemEventCount, config.totalEventCount, config.successInsertCount, config.successUpdateCount) + } + + override def processElement(dataset: Dataset, msg: mutable.Map[String, AnyRef], context: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, metrics: Metrics): Unit = { + + metrics.incCounter(dataset.id, config.totalEventCount) + masterDataCache.open(dataset) + val event = JSONUtil.serialize(msg(config.CONST_EVENT)) + val json = parse(event, useBigIntForLong = false) + val node = JSONUtil.getKey(dataset.datasetConfig.keysConfig.dataKey.get, event) + if (node.isMissingNode) { + markFailure(Some(dataset.id), msg, context, metrics, ErrorConstants.MISSING_DATASET_CONFIG_KEY, Producer.masterdataprocessor, FunctionalError.MissingMasterDatasetKey, datasetType = Some(dataset.datasetType)) + } else { + val result = masterDataCache.process(dataset, node.asText(), json) + metrics.incCounter(dataset.id, config.successInsertCount, result._1) + metrics.incCounter(dataset.id, config.successUpdateCount, result._2) + metrics.incCounter(dataset.id, config.successEventCount) + } + + } + +} \ No newline at end of file diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerConfig.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerConfig.scala new file mode 100644 index 00000000..c6a49f57 --- /dev/null +++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerConfig.scala @@ -0,0 +1,33 @@ +package org.sunbird.obsrv.pipeline.task + +import com.typesafe.config.Config +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.streaming.api.scala.OutputTag +import org.sunbird.obsrv.core.streaming.BaseJobConfig + +import scala.collection.mutable + +class CacheIndexerConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "CacheIndexerJob") { + + private val serialVersionUID = 2905979434303791379L + implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + + // Metric List + val totalEventCount = "total-event-count" + val successEventCount = "success-event-count" + val successInsertCount = "success-insert-count" + val successUpdateCount = "success-update-count" + + val windowTime: Int = config.getInt("task.window.time.in.seconds") + val windowCount: Int = config.getInt("task.window.count") + + // Functions + val cacheIndexerFunction = "CacheIndexerFunction" + + override def inputTopic(): String = null + override def inputConsumer(): String = "cache-indexer" + override def successTag(): OutputTag[mutable.Map[String, AnyRef]] = OutputTag[mutable.Map[String, AnyRef]]("processing_stats") + + override def failedEventsOutputTag(): OutputTag[mutable.Map[String, AnyRef]] = 
OutputTag[mutable.Map[String, AnyRef]]("failed-events") +} diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala new file mode 100644 index 00000000..61b9ddec --- /dev/null +++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala @@ -0,0 +1,61 @@ +package org.sunbird.obsrv.streaming + +import com.typesafe.config.ConfigFactory +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.typeutils.TypeExtractor +import org.apache.flink.api.java.utils.ParameterTool +import org.apache.flink.streaming.api.datastream.DataStream +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} +import org.sunbird.obsrv.core.util.FlinkUtil +import org.sunbird.obsrv.function.MasterDataProcessorFunction +import org.sunbird.obsrv.model.DatasetType +import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig +import org.sunbird.obsrv.registry.DatasetRegistry + +import java.io.File +import scala.collection.mutable + +class CacheIndexerStreamTask(config: CacheIndexerConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { + + implicit val mutableMapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + private val logger = LoggerFactory.getLogger(classOf[CacheIndexerStreamTask]) + + def process(): Unit = { + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) + process(env) + env.execute(config.jobName) + } + + def process(env: StreamExecutionEnvironment): Unit = { + + val datasets = DatasetRegistry.getAllDatasets(Some(DatasetType.master.toString)) + val datasetIds = datasets.map(f => f.id) + val dataStream = getMapDataStream(env, config, datasetIds, config.kafkaConsumerProperties(), consumerSourceName = s"cache-indexer-consumer", kafkaConnector) + processStream(dataStream) + } + + override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { + val processedStream = dataStream.process(new MasterDataProcessorFunction(config)).name(config.cacheIndexerFunction) + .uid(config.cacheIndexerFunction).setParallelism(config.downstreamOperatorsParallelism) + addDefaultSinks(processedStream, config, kafkaConnector) + processedStream.getSideOutput(config.successTag()) + } + +} + +object CacheIndexerStreamTask { + + def main(args: Array[String]): Unit = { + val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) + val config = configFilePath.map { + path => ConfigFactory.parseFile(new File(path)).resolve() + }.getOrElse(ConfigFactory.load("cache-indexer.conf").withFallback(ConfigFactory.systemEnvironment())) + val cacheConfig = new CacheIndexerConfig(config) + val kafkaUtil = new FlinkKafkaConnector(cacheConfig) + val task = new CacheIndexerStreamTask(cacheConfig, kafkaUtil) + task.process() + } + +} \ No newline at end of file diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala new file mode 100644 index 00000000..c3365255 --- /dev/null +++ 
b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala @@ -0,0 +1,58 @@ +package org.sunbird.obsrv.util + +import org.json4s.native.JsonMethods._ +import org.json4s.{JNothing, JValue} +import org.slf4j.LoggerFactory +import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.model.DatasetModels.Dataset +import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig +import redis.clients.jedis.Jedis + +import scala.collection.mutable + +class MasterDataCache(val config: CacheIndexerConfig) { + + private[this] val logger = LoggerFactory.getLogger(classOf[MasterDataCache]) + private val datasetPipelineMap: mutable.Map[String, Jedis] = mutable.Map[String, Jedis]() + + def close(): Unit = { + datasetPipelineMap.values.foreach(pipeline => pipeline.close()) + } + + def open(datasets: List[Dataset]): Unit = { + datasets.foreach(dataset => { + open(dataset) + }) + } + + def open(dataset: Dataset): Unit = { + if (!datasetPipelineMap.contains(dataset.id)) { + val redisConfig = dataset.datasetConfig.cacheConfig.get + val redisConnect = new RedisConnect(redisConfig.redisDBHost.get, redisConfig.redisDBPort.get, config.redisConnectionTimeout) + val jedis: Jedis = redisConnect.getConnection(0) + datasetPipelineMap.put(dataset.id, jedis) + } + } + + def process(dataset: Dataset, key: String, event: JValue): (Int, Int) = { + val jedis = this.datasetPipelineMap(dataset.id) + val dataFromCache = getDataFromCache(dataset, key, jedis) + updateCache(dataset, dataFromCache, key, event, jedis) + (if (dataFromCache == null) 1 else 0, if (dataFromCache == null) 0 else 1) + } + + private def getDataFromCache(dataset: Dataset, key: String, jedis: Jedis): String = { + + jedis.select(dataset.datasetConfig.cacheConfig.get.redisDB.get) + jedis.get(key) + } + + private def updateCache(dataset: Dataset, dataFromCache: String, key: String, event: JValue, jedis: Jedis): Unit = { + + jedis.select(dataset.datasetConfig.cacheConfig.get.redisDB.get) + val existingJson = if (dataFromCache != null) parse(dataFromCache) else JNothing + val mergedJson = existingJson merge event + jedis.set(key, compact(render(mergedJson))) + } + +} diff --git a/pipeline/cache-indexer/src/test/resources/base-config.conf b/pipeline/cache-indexer/src/test/resources/base-config.conf new file mode 100644 index 00000000..3ade36f7 --- /dev/null +++ b/pipeline/cache-indexer/src/test/resources/base-config.conf @@ -0,0 +1,8 @@ +postgres { + host = localhost + port = 5432 + maxConnections = 2 + user = "postgres" + password = "postgres" + database="postgres" +} \ No newline at end of file diff --git a/pipeline/cache-indexer/src/test/resources/test.conf b/pipeline/cache-indexer/src/test/resources/test.conf new file mode 100644 index 00000000..7861c8d0 --- /dev/null +++ b/pipeline/cache-indexer/src/test/resources/test.conf @@ -0,0 +1,20 @@ +include "base-test.conf" + +kafka { + + output.failed.topic = ${job.env}".masterdata.failed" + groupId = ${job.env}"-cache-indexer-group" + producer { + max-request-size = 5242880 + } +} + +task { + window.time.in.seconds = 2 + window.count = 2 + window.shards = 1400 +} + +redis { + port = 6340 +} \ No newline at end of file diff --git a/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala new file mode 100644 index 00000000..cf28aec5 --- /dev/null +++ b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala @@ -0,0 +1,10 @@ +package 
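// --- Minimal stand-alone sketch (sample values assumed) of the json4s merge that the cache update in
// --- MasterDataCache above relies on: fields from the incoming event extend or override whatever is
// --- already cached for the key.
import org.json4s._
import org.json4s.native.JsonMethods._

object JsonMergeSketch {
  def main(args: Array[String]): Unit = {
    val cached   = parse("""{"code":"HYUN-CRE-D6","price":"2200000"}""")
    val incoming = parse("""{"code":"HYUN-CRE-D6","safety":"3 Star (Global NCAP)","seatingCapacity":5}""")
    val merged   = cached merge incoming
    // prints {"code":"HYUN-CRE-D6","price":"2200000","safety":"3 Star (Global NCAP)","seatingCapacity":5}
    // (field order may vary)
    println(compact(render(merged)))
  }
}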
org.sunbird.obsrv.fixture + +object EventFixture { + + val VALID_BATCH_EVENT_D3_INSERT = """{"dataset":"dataset3","event":{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}}""" + val VALID_BATCH_EVENT_D3_INSERT_2 = """{"dataset":"dataset3","event":{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"}}}""" + val VALID_BATCH_EVENT_D3_UPDATE = """{"dataset":"dataset3","event":{"code":"HYUN-CRE-D6","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"safety":"3 Star (Global NCAP)","seatingCapacity":5}}""" + val VALID_BATCH_EVENT_D4 = """{"dataset":"dataset4","event":{"code":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}""" + val INVALID_BATCH_EVENT_D4 = """{"dataset":"dataset4","event":{"code1":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}""" +} diff --git a/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/pipeline/CacheIndexerStreamTaskTestSpec.scala b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/pipeline/CacheIndexerStreamTaskTestSpec.scala new file mode 100644 index 00000000..b95d754d --- /dev/null +++ b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/pipeline/CacheIndexerStreamTaskTestSpec.scala @@ -0,0 +1,142 @@ +package org.sunbird.obsrv.pipeline + +import io.github.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig} +import org.apache.flink.configuration.Configuration +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.test.util.MiniClusterWithClientResource +import org.apache.kafka.common.serialization.StringDeserializer +import org.scalatest.Matchers._ +import org.sunbird.obsrv.BaseMetricsReporter +import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.ErrorConstants +import org.sunbird.obsrv.core.model.Models.SystemEvent +import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector +import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} +import org.sunbird.obsrv.fixture.EventFixture +import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig +import org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry +import org.sunbird.obsrv.streaming.CacheIndexerStreamTask + +import scala.collection.mutable +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ + +class CacheIndexerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { + + val flinkCluster = new MiniClusterWithClientResource(new MiniClusterResourceConfiguration.Builder() + .setConfiguration(testConfiguration()) + .setNumberSlotsPerTaskManager(1) + 
.setNumberTaskManagers(1) + .build) + + val cacheIndexerConfig = new CacheIndexerConfig(config) + val kafkaConnector = new FlinkKafkaConnector(cacheIndexerConfig) + val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") + implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = + EmbeddedKafkaConfig( + kafkaPort = 9093, + zooKeeperPort = 2183, + customConsumerProperties = customKafkaConsumerProperties + ) + implicit val deserializer: StringDeserializer = new StringDeserializer() + + def testConfiguration(): Configuration = { + val config = new Configuration() + config.setString("metrics.reporter", "job_metrics_reporter") + config.setString("metrics.reporter.job_metrics_reporter.class", classOf[BaseMetricsReporter].getName) + config + } + + override def beforeAll(): Unit = { + super.beforeAll() + BaseMetricsReporter.gaugeMetrics.clear() + EmbeddedKafka.start()(embeddedKafkaConfig) + val postgresConnect = new PostgresConnect(postgresConfig) + insertTestData(postgresConnect) + createTestTopics() + EmbeddedKafka.publishStringMessageToKafka("dataset3", EventFixture.VALID_BATCH_EVENT_D3_INSERT) + EmbeddedKafka.publishStringMessageToKafka("dataset3", EventFixture.VALID_BATCH_EVENT_D3_INSERT_2) + EmbeddedKafka.publishStringMessageToKafka("dataset4", EventFixture.VALID_BATCH_EVENT_D4) + EmbeddedKafka.publishStringMessageToKafka("dataset3", EventFixture.VALID_BATCH_EVENT_D3_UPDATE) + EmbeddedKafka.publishStringMessageToKafka("dataset4", EventFixture.INVALID_BATCH_EVENT_D4) + flinkCluster.before() + } + + private def insertTestData(postgresConnect: PostgresConnect): Unit = { + postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, extraction_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('dataset3', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"is_batch_event\":true,\"extraction_key\":\"events\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"}, \"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":3,\"redis_db_host\":\"localhost\",\"redis_db_port\":" + cacheIndexerConfig.redisPort + "}', 'Live', 'v1', 'local.masterdata.ingest', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('dataset4', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', 
'{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"},\"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"seatingCapacity\": {\"type\": \"integer\"}, \"safety\": {\"type\": \"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d34-events\"}', '{\"indexing_config\":{\"olap_store_enabled\":false,\"lakehouse_enabled\":false,\"cache_enabled\":true},\"keys_config\":{\"data_key\":\"code\",\"timestamp_key\":\"date\"},\"cache_config\":{\"redis_db\":4,\"redis_db_host\":\"localhost\",\"redis_db_port\":" + cacheIndexerConfig.redisPort + "}}', 'Live', 'v2', 'local.masterdata.ingest', 'System', 'System', now(), now());") + } + + override def afterAll(): Unit = { + + super.afterAll() + flinkCluster.after() + EmbeddedKafka.stop() + } + + def createTestTopics(): Unit = { + List(config.getString("kafka.output.system.event.topic"), "dataset3", "dataset4").foreach(EmbeddedKafka.createCustomTopic(_)) + } + + "CacheIndexerStreamTaskTestSpec" should "validate the cache indexer job for master datasets" in { + + implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(cacheIndexerConfig) + val task = new CacheIndexerStreamTask(cacheIndexerConfig, kafkaConnector) + task.process(env) + Future { + env.execute(cacheIndexerConfig.jobName) + } + + val input = EmbeddedKafka.consumeNumberMessagesFrom[String](config.getString("kafka.output.system.event.topic"), 1, timeout = 30.seconds) + input.size should be(1) + + input.foreach(se => { + val event = JSONUtil.deserialize[SystemEvent](se) + val error = event.data.error + if (event.ctx.dataset.getOrElse("ALL").equals("ALL")) + event.ctx.dataset_type should be(None) + else if (error.isDefined) { + val errorCode = error.get.error_code + if (errorCode.equals(ErrorConstants.MISSING_DATASET_ID.errorCode) || + errorCode.equals(ErrorConstants.MISSING_DATASET_CONFIGURATION.errorCode) || + errorCode.equals(ErrorConstants.EVENT_MISSING.errorCode)) { + event.ctx.dataset_type should be(None) + } + } + else + event.ctx.dataset_type should be(Some("master")) + }) + + val mutableMetricsMap = mutable.Map[String, Long](); + BaseMetricsReporter.gaugeMetrics.toMap.mapValues(f => f.getValue()).map(f => mutableMetricsMap.put(f._1, f._2)) + + cacheIndexerConfig.successTag().getId should be("processing_stats") + + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.totalEventCount}") should be(3) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.successEventCount}") should be(3) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.successInsertCount}") should be(2) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset3.${cacheIndexerConfig.successUpdateCount}") should be(1) + + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.totalEventCount}") should be(2) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.successEventCount}") should be(1) + mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.successInsertCount}") should be(1) + 
mutableMetricsMap(s"${cacheIndexerConfig.jobName}.dataset4.${cacheIndexerConfig.eventFailedMetricsCount}") should be(1) + + val redisConnection = new RedisConnect(cacheIndexerConfig.redisHost, cacheIndexerConfig.redisPort, cacheIndexerConfig.redisConnectionTimeout) + val jedis1 = redisConnection.getConnection(3) + val event1 = jedis1.get("HYUN-CRE-D6") + event1 should be("""{"dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"model":"Creta","price":"2200000","variant":"SX(O)","fuel":"Diesel","code":"HYUN-CRE-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic","safety":"3 Star (Global NCAP)","seatingCapacity":5}""") + val event3 = jedis1.get("HYUN-TUC-D6") + event3 should be("""{"dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"},"model":"Tucson","price":"4000000","variant":"Signature","fuel":"Diesel","code":"HYUN-TUC-D6","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Hyundai","modelYear":"2023","transmission":"automatic"}""") + jedis1.close() + + val jedis2 = redisConnection.getConnection(4) + val event2 = jedis2.get("JEEP-CP-D3") + event2 should be("""{"model":"Compass","price":"3800000","variant":"Model S (O) Diesel 4x4 AT","fuel":"Diesel","seatingCapacity":5,"code":"JEEP-CP-D3","currencyCode":"INR","currency":"Indian Rupee","manufacturer":"Jeep","safety":"5 Star (Euro NCAP)","modelYear":"2023","transmission":"automatic"}""") + jedis2.close() + } + + +} diff --git a/pipeline/druid-router/pom.xml b/pipeline/dataset-router/pom.xml similarity index 97% rename from pipeline/druid-router/pom.xml rename to pipeline/dataset-router/pom.xml index 41e2e390..5c6b5d23 100644 --- a/pipeline/druid-router/pom.xml +++ b/pipeline/dataset-router/pom.xml @@ -12,12 +12,12 @@ org.sunbird.obsrv.pipeline - druid-router + dataset-router 1.0.0 jar - Druid Events Router + Dataset Events Router - Validate and Route Datasets for Druid Indexing + Validate and Route Datasets for Indexing into OLAP Store or a Lakehouse @@ -198,7 +198,7 @@ - reference.conf + dataset-router.conf diff --git a/pipeline/druid-router/src/main/resources/druid-router.conf b/pipeline/dataset-router/src/main/resources/dataset-router.conf similarity index 100% rename from pipeline/druid-router/src/main/resources/druid-router.conf rename to pipeline/dataset-router/src/main/resources/dataset-router.conf diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala similarity index 90% rename from pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala rename to pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala index 9d40db5c..9f2c7907 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala +++ b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/functions/DynamicRouterFunction.scala @@ -11,7 +11,8 @@ import org.sunbird.obsrv.core.model.{Constants, ErrorConstants, FunctionalError, import org.sunbird.obsrv.core.streaming.Metrics import org.sunbird.obsrv.core.util.{JSONUtil, Util} import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig} -import org.sunbird.obsrv.router.task.DruidRouterConfig +import org.sunbird.obsrv.model.DatasetType +import org.sunbird.obsrv.router.task.DynamicRouterConfig import 
org.sunbird.obsrv.streaming.BaseDatasetProcessFunction import java.util.TimeZone @@ -19,7 +20,7 @@ import scala.collection.mutable case class TimestampKey(isValid: Boolean, value: AnyRef) -class DynamicRouterFunction(config: DruidRouterConfig) extends BaseDatasetProcessFunction(config) { +class DynamicRouterFunction(config: DynamicRouterConfig) extends BaseDatasetProcessFunction(config) { private[this] val logger = LoggerFactory.getLogger(classOf[DynamicRouterFunction]) @@ -44,7 +45,7 @@ class DynamicRouterFunction(config: DruidRouterConfig) extends BaseDatasetProces event.put(config.CONST_OBSRV_META, msg(config.CONST_OBSRV_META).asInstanceOf[Map[String, AnyRef]]) val tsKeyData = TimestampKeyParser.parseTimestampKey(dataset.datasetConfig, event) event.put("indexTS", tsKeyData.value) - if (tsKeyData.isValid || dataset.datasetType.equalsIgnoreCase(Constants.MASTER_DATASET_TYPE)) { + if (tsKeyData.isValid || dataset.datasetType.equalsIgnoreCase(DatasetType.master.toString)) { val routerConfig = dataset.routerConfig val topicEventMap = mutable.Map(Constants.TOPIC -> routerConfig.topic, Constants.MESSAGE -> event) ctx.output(config.routerOutputTag, topicEventMap) @@ -60,7 +61,7 @@ class DynamicRouterFunction(config: DruidRouterConfig) extends BaseDatasetProces object TimestampKeyParser { def parseTimestampKey(datasetConfig: DatasetConfig, event: mutable.Map[String, AnyRef]): TimestampKey = { - val indexKey = datasetConfig.tsKey + val indexKey = datasetConfig.keysConfig.tsKey.get val node = JSONUtil.getKey(indexKey, JSONUtil.serialize(event)) node.getNodeType match { case JsonNodeType.NUMBER => onNumber(datasetConfig, node) @@ -83,7 +84,7 @@ object TimestampKeyParser { private def onText(datasetConfig: DatasetConfig, node: JsonNode): TimestampKey = { val value = node.textValue() - if (datasetConfig.tsFormat.isDefined) { + if (datasetConfig.keysConfig.tsFormat.isDefined) { parseDateTime(datasetConfig, value) } else { TimestampKey(isValid = true, value) @@ -92,10 +93,10 @@ object TimestampKeyParser { private def parseDateTime(datasetConfig: DatasetConfig, value: String): TimestampKey = { try { - datasetConfig.tsFormat.get match { + datasetConfig.keysConfig.tsFormat.get match { case "epoch" => TimestampKey(isValid = true, addTimeZone(datasetConfig, new DateTime(value.toLong)).asInstanceOf[AnyRef]) case _ => - val dtf = DateTimeFormat.forPattern(datasetConfig.tsFormat.get) + val dtf = DateTimeFormat.forPattern(datasetConfig.keysConfig.tsFormat.get) TimestampKey(isValid = true, addTimeZone(datasetConfig, dtf.parseDateTime(value)).asInstanceOf[AnyRef]) } } catch { diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterConfig.scala similarity index 92% rename from pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala rename to pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterConfig.scala index 31106b00..a9309016 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterConfig.scala +++ b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterConfig.scala @@ -8,7 +8,7 @@ import org.sunbird.obsrv.core.streaming.BaseJobConfig import scala.collection.mutable -class DruidRouterConfig(override val config: Config) extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "DruidRouterJob") { +class DynamicRouterConfig(override val config: Config) 
extends BaseJobConfig[mutable.Map[String, AnyRef]](config, "DruidRouterJob") { private val serialVersionUID = 2905979434303791379L implicit val eventTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala similarity index 89% rename from pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala rename to pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala index 9e17a974..5ac1067f 100644 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala +++ b/pipeline/dataset-router/src/main/scala/org/sunbird/obsrv/router/task/DynamicRouterStreamTask.scala @@ -17,7 +17,7 @@ import scala.collection.mutable * Druid Router stream task routes every event into its respective topic configured at dataset level */ -class DynamicRouterStreamTask(config: DruidRouterConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { +class DynamicRouterStreamTask(config: DynamicRouterConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { private val serialVersionUID = 146697324640926024L @@ -56,8 +56,8 @@ object DynamicRouterStreamTask { val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) val config = configFilePath.map { path => ConfigFactory.parseFile(new File(path)).resolve() - }.getOrElse(ConfigFactory.load("druid-router.conf").withFallback(ConfigFactory.systemEnvironment())) - val druidRouterConfig = new DruidRouterConfig(config) + }.getOrElse(ConfigFactory.load("dataset-router.conf").withFallback(ConfigFactory.systemEnvironment())) + val druidRouterConfig = new DynamicRouterConfig(config) val kafkaUtil = new FlinkKafkaConnector(druidRouterConfig) val task = new DynamicRouterStreamTask(druidRouterConfig, kafkaUtil) task.process() diff --git a/pipeline/druid-router/src/test/resources/test.conf b/pipeline/dataset-router/src/test/resources/test.conf similarity index 100% rename from pipeline/druid-router/src/test/resources/test.conf rename to pipeline/dataset-router/src/test/resources/test.conf diff --git a/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala similarity index 97% rename from pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala rename to pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala index 0c45a555..98370128 100644 --- a/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala +++ b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/DynamicRouterStreamTaskTestSpec.scala @@ -12,7 +12,7 @@ import org.sunbird.obsrv.core.model.Models.SystemEvent import org.sunbird.obsrv.core.model._ import org.sunbird.obsrv.core.streaming.FlinkKafkaConnector import org.sunbird.obsrv.core.util.{FlinkUtil, JSONUtil, PostgresConnect} -import org.sunbird.obsrv.router.task.{DruidRouterConfig, DynamicRouterStreamTask} +import org.sunbird.obsrv.router.task.{DynamicRouterConfig, DynamicRouterStreamTask} import 
org.sunbird.obsrv.spec.BaseSpecWithDatasetRegistry import scala.collection.mutable @@ -28,7 +28,7 @@ class DynamicRouterStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { .setNumberTaskManagers(1) .build) - val routerConfig = new DruidRouterConfig(config) + val routerConfig = new DynamicRouterConfig(config) val kafkaConnector = new FlinkKafkaConnector(routerConfig) val customKafkaConsumerProperties: Map[String, String] = Map[String, String]("auto.offset.reset" -> "earliest", "group.id" -> "test-event-schema-group") implicit val embeddedKafkaConfig: EmbeddedKafkaConfig = @@ -132,7 +132,7 @@ class DynamicRouterStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { } } else - event.ctx.dataset_type should be(Some("dataset")) + event.ctx.dataset_type should be(Some("event")) }) systemEvents.foreach(f => { diff --git a/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala similarity index 100% rename from pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala rename to pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/EventFixture.scala diff --git a/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala similarity index 51% rename from pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala rename to pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala index f35567f0..7bf5dfa6 100644 --- a/pipeline/druid-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala +++ b/pipeline/dataset-router/src/test/scala/org/sunbird/obsrv/router/TestTimestampKeyParser.scala @@ -2,7 +2,7 @@ package org.sunbird.obsrv.router import org.scalatest.{FlatSpec, Matchers} import org.sunbird.obsrv.core.util.JSONUtil -import org.sunbird.obsrv.model.DatasetModels.DatasetConfig +import org.sunbird.obsrv.model.DatasetModels.{DatasetConfig, IndexingConfig, KeysConfig} import org.sunbird.obsrv.router.functions.TimestampKeyParser import scala.collection.mutable @@ -14,108 +14,109 @@ class TestTimestampKeyParser extends FlatSpec with Matchers { // Validate text date field without providing dateformat and timezone val result1 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) - result1.isValid should be (true) - result1.value.asInstanceOf[String] should be ("2023-03-01") + result1.isValid should be(true) + result1.value.asInstanceOf[String] should be("2023-03-01") // Validate missing timestamp key scenario val result2 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date1", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date1"), None)), 
JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) result2.isValid should be(false) result2.value should be(null) // Validate number date field which is not epoch val result3 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":20232201}""")) result3.isValid should be(false) result3.value.asInstanceOf[Int] should be(0) // Validate number date field which is epoch in seconds val result4 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165}""")) result4.isValid should be(true) - result4.value.asInstanceOf[Long] should be(1701373165000l) + result4.value.asInstanceOf[Long] should be(1701373165000L) // Validate number date field which is epoch in milli-seconds val result5 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123}""")) result5.isValid should be(true) - result5.value.asInstanceOf[Long] should be(1701373165123l) + result5.value.asInstanceOf[Long] should be(1701373165123L) // Validate number date field which is epoch in micro-seconds val result6 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123111}""")) result6.isValid should be(true) - result6.value.asInstanceOf[Long] should be(1701373165123l) + result6.value.asInstanceOf[Long] should be(1701373165123L) // Validate number date field which is epoch in nano-seconds val result7 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123111000}""")) result7.isValid should be(true) - 
result7.value.asInstanceOf[Long] should be(1701373165123l) + result7.value.asInstanceOf[Long] should be(1701373165123L) // Validate number date field which is not an epoch in milli, micro or nano seconds val result8 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = None), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":170137316512}""")) result8.isValid should be(false) result8.value.asInstanceOf[Int] should be(0) // Validate number date field which is an epoch with timezone present val result9 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = None, datasetTimezone = Some("GMT+05:30")), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None), datasetTimezone = Some("GMT+05:30")), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":1701373165123}""")) result9.isValid should be(true) - result9.value.asInstanceOf[Long] should be(1701392965123l) + result9.value.asInstanceOf[Long] should be(1701392965123L) } it should "validate all scenarios of timestamp key in text format" in { // Validate epoch data in text format val result1 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("epoch"), datasetTimezone = Some("GMT+05:30")), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("epoch")), datasetTimezone = Some("GMT+05:30")), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"1701373165123"}""")) result1.isValid should be(true) - result1.value.asInstanceOf[Long] should be(1701392965123l) + result1.value.asInstanceOf[Long] should be(1701392965123L) // Validate invalid epoch data in text format (would reset to millis from 1970-01-01 if not epoch in millis) val result2 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("epoch"), datasetTimezone = Some("GMT+05:30")), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("epoch")), datasetTimezone = Some("GMT+05:30")), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"170137316512"}""")) result2.isValid should be(true) - result2.value.asInstanceOf[Long] should be(170157116512l) + result2.value.asInstanceOf[Long] should be(170157116512L) // Validate date parser without timezone val result3 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("yyyy-MM-dd"), datasetTimezone = 
None), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd")), datasetTimezone = None), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) result3.isValid should be(true) - result3.value.asInstanceOf[Long] should be(1677609000000l) + result3.value.asInstanceOf[Long] should be(1677609000000L) // Validate date parser with timezone val result4 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("yyyy-MM-dd"), datasetTimezone = Some("GMT+05:30")), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd")), datasetTimezone = Some("GMT+05:30")), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01"}""")) result4.isValid should be(true) - result4.value.asInstanceOf[Long] should be(1677628800000l) + result4.value.asInstanceOf[Long] should be(1677628800000L) // Validate date parser with date time in nano seconds val result5 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS"), datasetTimezone = Some("GMT+05:30")), + DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS")), datasetTimezone = Some("GMT+05:30")), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01T12:45:32.123456789"}""")) result5.isValid should be(true) - result5.value.asInstanceOf[Long] should be(1677674732123l) + result5.value.asInstanceOf[Long] should be(1677674732123L) // Validate date parser with data in invalid format val result6 = TimestampKeyParser.parseTimestampKey( - DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest", excludeFields = None, redisDBHost = None, redisDBPort = None, redisDB = None, indexData = None, tsFormat = Some("yyyy-MM-dd'T'HH:mm:ss.SSS"), datasetTimezone = Some("GMT+05:30")), + DatasetConfig( + IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), Some("yyyy-MM-dd'T'HH:mm:ss.SSS")), datasetTimezone = Some("GMT+05:30")), JSONUtil.deserialize[mutable.Map[String, AnyRef]]("""{"id":1234, "date":"2023-03-01T12:45:32.123456"}""")) result6.isValid should be(false) result6.value should be(null) diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala index 45a41c67..699ba75e 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerFunction.scala @@ -30,7 +30,7 @@ class DenormalizerFunction(config: DenormalizerConfig) extends BaseDatasetProces override def open(parameters: Configuration): Unit = { super.open(parameters) denormCache = new DenormCache(config) - 
denormCache.open(DatasetRegistry.getAllDatasets(config.datasetType())) + denormCache.open(DatasetRegistry.getAllDatasets(None)) } override def close(): Unit = { diff --git a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala index ce603520..8d188838 100644 --- a/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala +++ b/pipeline/denormalizer/src/main/scala/org/sunbird/obsrv/denormalizer/functions/DenormalizerWindowFunction.scala @@ -33,7 +33,7 @@ class DenormalizerWindowFunction(config: DenormalizerConfig)(implicit val eventT override def open(parameters: Configuration): Unit = { super.open(parameters) denormCache = new DenormCache(config) - denormCache.open(DatasetRegistry.getAllDatasets(config.datasetType())) + denormCache.open(DatasetRegistry.getAllDatasets(None)) } override def close(): Unit = { diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala index d4d5bc30..89256390 100644 --- a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala +++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerStreamTaskTestSpec.scala @@ -119,7 +119,7 @@ class DenormalizerStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { noException should be thrownBy { denormCache.open(Dataset(id = "d123", datasetType = "dataset", extractionConfig = None, dedupConfig = None, validationConfig = None, jsonSchema = None, denormConfig = Some(DenormConfig(redisDBHost = "localhost", redisDBPort = redisPort, denormFields = List(DenormFieldConfig(denormKey = Some("vehicleCode"), redisDB = 3, denormOutField = "vehicle_data", jsonAtaExpr = None)))), routerConfig = RouterConfig(""), - datasetConfig = DatasetConfig(key = "id", tsKey = "date", entryTopic = "ingest"), status = DatasetStatus.Live)) + datasetConfig = DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), status = DatasetStatus.Live, "ingest")) } } diff --git a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala index 52d06e8b..e5bbaa24 100644 --- a/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala +++ b/pipeline/denormalizer/src/test/scala/org/sunbird/obsrv/denormalizer/DenormalizerWindowStreamTaskTestSpec.scala @@ -70,7 +70,7 @@ class DenormalizerWindowStreamTaskTestSpec extends BaseSpecWithDatasetRegistry { } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test 
Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'v1', 'ingest', 'System', 'System', now(), now());") postgresConnect.execute("update datasets set denorm_config = '" + s"""{"redis_db_host":"localhost","redis_db_port":$redisPort,"denorm_fields":[{"denorm_key":"vehicleCode","redis_db":3,"denorm_out_field":"vehicle_data"},{"denorm_key":"dealer.dealerCode","redis_db":4,"denorm_out_field":"dealer_data"}]}""" + "' where id='d1';") val redisConnection = new RedisConnect(denormConfig.redisHost, denormConfig.redisPort, denormConfig.redisConnectionTimeout) redisConnection.getConnection(3).set("HYUN-CRE-D6", EventFixture.DENORM_DATA_1) diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DruidRouterFunction.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DruidRouterFunction.scala deleted file mode 100644 index d1f2c5e6..00000000 
--- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/functions/DruidRouterFunction.scala +++ /dev/null @@ -1,58 +0,0 @@ -package org.sunbird.obsrv.router.functions - -import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.api.java.typeutils.TypeExtractor -import org.apache.flink.configuration.Configuration -import org.apache.flink.streaming.api.functions.ProcessFunction -import org.apache.flink.streaming.api.scala.OutputTag -import org.slf4j.LoggerFactory -import org.sunbird.obsrv.core.streaming.{BaseProcessFunction, Metrics, MetricsList} -import org.sunbird.obsrv.core.util.Util -import org.sunbird.obsrv.registry.DatasetRegistry -import org.sunbird.obsrv.router.task.DruidRouterConfig - -import scala.collection.mutable - -// $COVERAGE-OFF$ Disabling scoverage as the below function is deprecated -@Deprecated -class DruidRouterFunction(config: DruidRouterConfig) extends BaseProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]](config) { - - private[this] val logger = LoggerFactory.getLogger(classOf[DruidRouterFunction]) - - override def open(parameters: Configuration): Unit = { - super.open(parameters) - } - - override def close(): Unit = { - super.close() - } - - override def getMetricsList(): MetricsList = { - val metrics = List(config.routerTotalCount, config.routerSuccessCount) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) - } - - override def processElement(msg: mutable.Map[String, AnyRef], - ctx: ProcessFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef]]#Context, - metrics: Metrics): Unit = { - try { - implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) - val datasetId = msg(config.CONST_DATASET).asInstanceOf[String] // DatasetId cannot be empty at this stage - metrics.incCounter(datasetId, config.routerTotalCount) - val dataset = DatasetRegistry.getDataset(datasetId).get - val event = Util.getMutableMap(msg(config.CONST_EVENT).asInstanceOf[Map[String, AnyRef]]) - event.put(config.CONST_OBSRV_META, msg(config.CONST_OBSRV_META)) - val routerConfig = dataset.routerConfig - ctx.output(OutputTag[mutable.Map[String, AnyRef]](routerConfig.topic), event) - metrics.incCounter(datasetId, config.routerSuccessCount) - - msg.remove(config.CONST_EVENT) - ctx.output(config.statsOutputTag, markComplete(msg, dataset.dataVersion)) - } catch { - case ex: Exception => - logger.error("DruidRouterFunction:processElement() - Exception: ", ex.getMessage) - ex.printStackTrace() - } - } -} -// $COVERAGE-ON$ \ No newline at end of file diff --git a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala b/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala deleted file mode 100644 index b77e110a..00000000 --- a/pipeline/druid-router/src/main/scala/org/sunbird/obsrv/router/task/DruidRouterStreamTask.scala +++ /dev/null @@ -1,72 +0,0 @@ -package org.sunbird.obsrv.router.task - -import com.typesafe.config.ConfigFactory -import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.api.java.typeutils.TypeExtractor -import org.apache.flink.api.java.utils.ParameterTool -import org.apache.flink.streaming.api.datastream.DataStream -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment -import org.apache.flink.streaming.api.scala.OutputTag -import org.sunbird.obsrv.core.streaming.{BaseStreamTask, 
FlinkKafkaConnector} -import org.sunbird.obsrv.core.util.FlinkUtil -import org.sunbird.obsrv.registry.DatasetRegistry -import org.sunbird.obsrv.router.functions.DruidRouterFunction - -import java.io.File -import scala.collection.mutable - -/** - * Druid Router stream task routes every event into its respective topic configured at dataset level - */ -// $COVERAGE-OFF$ Disabling scoverage as this stream task is deprecated -@Deprecated -class DruidRouterStreamTask(config: DruidRouterConfig, kafkaConnector: FlinkKafkaConnector) extends BaseStreamTask[mutable.Map[String, AnyRef]] { - - private val serialVersionUID = 146697324640926024L - - def process(): Unit = { - implicit val env: StreamExecutionEnvironment = FlinkUtil.getExecutionContext(config) - val dataStream = getMapDataStream(env, config, kafkaConnector) - processStream(dataStream) - env.execute(config.jobName) - } - - override def processStream(dataStream: DataStream[mutable.Map[String, AnyRef]]): DataStream[mutable.Map[String, AnyRef]] = { - - implicit val mapTypeInfo: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) - val datasets = DatasetRegistry.getAllDatasets(config.datasetType()) - - val routerStream = dataStream.process(new DruidRouterFunction(config)).name(config.druidRouterFunction).uid(config.druidRouterFunction) - .setParallelism(config.downstreamOperatorsParallelism) - datasets.map(dataset => { - routerStream.getSideOutput(OutputTag[mutable.Map[String, AnyRef]](dataset.routerConfig.topic)) - .sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](dataset.routerConfig.topic)) - .name(dataset.id + "-" + config.druidRouterProducer).uid(dataset.id + "-" + config.druidRouterProducer) - .setParallelism(config.downstreamOperatorsParallelism) - }) - - routerStream.getSideOutput(config.statsOutputTag).sinkTo(kafkaConnector.kafkaSink[mutable.Map[String, AnyRef]](config.kafkaStatsTopic)) - .name(config.processingStatsProducer).uid(config.processingStatsProducer).setParallelism(config.downstreamOperatorsParallelism) - - addDefaultSinks(routerStream, config, kafkaConnector) - routerStream.getSideOutput(config.successTag()) - - } -} -// $COVERAGE-ON$ -// $COVERAGE-OFF$ Disabling scoverage as the below code can only be invoked within flink cluster -@Deprecated -object DruidRouterStreamTask { - - def main(args: Array[String]): Unit = { - val configFilePath = Option(ParameterTool.fromArgs(args).get("config.file.path")) - val config = configFilePath.map { - path => ConfigFactory.parseFile(new File(path)).resolve() - }.getOrElse(ConfigFactory.load("druid-router.conf").withFallback(ConfigFactory.systemEnvironment())) - val druidRouterConfig = new DruidRouterConfig(config) - val kafkaUtil = new FlinkKafkaConnector(druidRouterConfig) - val task = new DruidRouterStreamTask(druidRouterConfig, kafkaUtil) - task.process() - } -} -// $COVERAGE-ON$ \ No newline at end of file diff --git a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala index 0e79b08c..46c1e68c 100644 --- a/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala +++ b/pipeline/extractor/src/main/scala/org/sunbird/obsrv/extractor/functions/ExtractionFunction.scala @@ -27,7 +27,7 @@ class ExtractionFunction(config: ExtractorConfig) override def getMetricsList(): MetricsList = { val metrics = List(config.successEventCount, 
config.systemEventCount, config.eventFailedMetricsCount, config.failedExtractionCount, config.skippedExtractionCount, config.duplicateExtractionCount, config.totalEventCount, config.successExtractionCount) - MetricsList(DatasetRegistry.getDataSetIds(config.datasetType()), metrics) + MetricsList(DatasetRegistry.getDataSetIds(), metrics) } override def open(parameters: Configuration): Unit = { diff --git a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala index 6ada824b..3574249a 100644 --- a/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala +++ b/pipeline/extractor/src/test/scala/org/sunbird/obsrv/extractor/ExtractorStreamTestSpec.scala @@ -140,7 +140,7 @@ class ExtractorStreamTestSpec extends BaseSpecWithDatasetRegistry { if(event.ctx.dataset.getOrElse("ALL").equals("ALL")) event.ctx.dataset_type should be(None) else - event.ctx.dataset_type should be(Some("dataset")) + event.ctx.dataset_type should be(Some("event")) }) //TODO: Add assertions for all 6 events diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/function/RowDataConverterFunction.scala similarity index 98% rename from pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala rename to pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/function/RowDataConverterFunction.scala index aec00117..4aadb60a 100644 --- a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/functions/RowDataConverterFunction.scala +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/function/RowDataConverterFunction.scala @@ -1,4 +1,4 @@ -package org.sunbird.obsrv.functions +package org.sunbird.obsrv.function import org.apache.flink.api.common.functions.RichMapFunction import org.apache.flink.configuration.Configuration diff --git a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala index fd160820..3bde66bd 100644 --- a/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala +++ b/pipeline/hudi-connector/src/main/scala/org/sunbird/obsrv/streaming/HudiConnectorStreamTask.scala @@ -16,7 +16,7 @@ import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.model.Constants import org.sunbird.obsrv.core.streaming.{BaseStreamTask, FlinkKafkaConnector} import org.sunbird.obsrv.core.util.FlinkUtil -import org.sunbird.obsrv.functions.RowDataConverterFunction +import org.sunbird.obsrv.function.RowDataConverterFunction import org.sunbird.obsrv.registry.DatasetRegistry import org.sunbird.obsrv.util.HudiSchemaParser import org.apache.hudi.config.HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP diff --git a/pipeline/master-data-processor/pom.xml b/pipeline/master-data-processor/pom.xml index 0dc1cc60..f97287af 100644 --- a/pipeline/master-data-processor/pom.xml +++ b/pipeline/master-data-processor/pom.xml @@ -64,7 +64,7 @@ org.sunbird.obsrv.pipeline - druid-router + dataset-router 1.0.0 diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala 
b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala index a7ca7471..dcf96a0f 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/function/MasterDataProcessorFunction.scala @@ -9,6 +9,7 @@ import org.sunbird.obsrv.core.model.{ErrorConstants, FunctionalError, Producer} import org.sunbird.obsrv.core.streaming.Metrics import org.sunbird.obsrv.core.util.JSONUtil import org.sunbird.obsrv.model.DatasetModels.Dataset +import org.sunbird.obsrv.model.DatasetType import org.sunbird.obsrv.pipeline.task.MasterDataProcessorConfig import org.sunbird.obsrv.pipeline.util.MasterDataCache import org.sunbird.obsrv.registry.DatasetRegistry @@ -24,7 +25,7 @@ class MasterDataProcessorFunction(config: MasterDataProcessorConfig) extends Bas override def open(parameters: Configuration): Unit = { super.open(parameters) masterDataCache = new MasterDataCache(config) - masterDataCache.open(DatasetRegistry.getAllDatasets(config.datasetType())) + masterDataCache.open(DatasetRegistry.getAllDatasets(Some(DatasetType.master.toString))) } override def close(): Unit = { @@ -37,13 +38,13 @@ class MasterDataProcessorFunction(config: MasterDataProcessorConfig) extends Bas } override def processWindow(dataset: Dataset, context: ProcessWindowFunction[mutable.Map[String, AnyRef], mutable.Map[String, AnyRef], String, TimeWindow]#Context, elements: List[mutable.Map[String, AnyRef]], metrics: Metrics): Unit = { - + Console.println("dataset.id", dataset.id, dataset.datasetConfig.cacheConfig) metrics.incCounter(dataset.id, config.totalEventCount, elements.size.toLong) masterDataCache.open(dataset) val eventsMap = elements.map(msg => { val event = JSONUtil.serialize(msg(config.CONST_EVENT)) val json = parse(event, useBigIntForLong = false) - val node = JSONUtil.getKey(dataset.datasetConfig.key, event) + val node = JSONUtil.getKey(dataset.datasetConfig.keysConfig.dataKey.get, event) if (node.isMissingNode) { markFailure(Some(dataset.id), msg, context, metrics, ErrorConstants.MISSING_DATASET_CONFIG_KEY, Producer.masterdataprocessor, FunctionalError.MissingMasterDatasetKey, datasetType = Some(dataset.datasetType)) } diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala index 65847bbd..b5cfebef 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/task/MasterDataProcessorStreamTask.scala @@ -12,7 +12,7 @@ import org.sunbird.obsrv.extractor.task.{ExtractorConfig, ExtractorStreamTask} import org.sunbird.obsrv.pipeline.function.MasterDataProcessorFunction import org.sunbird.obsrv.preprocessor.task.{PipelinePreprocessorConfig, PipelinePreprocessorStreamTask} import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask} -import org.sunbird.obsrv.router.task.{DruidRouterConfig, DynamicRouterStreamTask} +import org.sunbird.obsrv.router.task.{DynamicRouterConfig, DynamicRouterStreamTask} import java.io.File import scala.collection.mutable @@ -51,7 +51,7 @@ class MasterDataProcessorStreamTask(config: Config, masterDataConfig: MasterData val preprocessorTask = new 
PipelinePreprocessorStreamTask(new PipelinePreprocessorConfig(config), kafkaConnector) val denormalizerTask = new DenormalizerStreamTask(new DenormalizerConfig(config), kafkaConnector) val transformerTask = new TransformerStreamTask(new TransformerConfig(config), kafkaConnector) - val routerTask = new DynamicRouterStreamTask(new DruidRouterConfig(config), kafkaConnector) + val routerTask = new DynamicRouterStreamTask(new DynamicRouterConfig(config), kafkaConnector) val transformedStream = transformerTask.processStream( denormalizerTask.processStream( diff --git a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala index e07f4399..930595d3 100644 --- a/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala +++ b/pipeline/master-data-processor/src/main/scala/org/sunbird/obsrv/pipeline/util/MasterDataCache.scala @@ -20,15 +20,15 @@ class MasterDataCache(val config: MasterDataProcessorConfig) { } def open(datasets: List[Dataset]): Unit = { - datasets.map(dataset => { + datasets.foreach(dataset => { open(dataset) }) } def open(dataset: Dataset): Unit = { if (!datasetPipelineMap.contains(dataset.id)) { - val datasetConfig = dataset.datasetConfig - val redisConnect = new RedisConnect(datasetConfig.redisDBHost.get, datasetConfig.redisDBPort.get, config.redisConnectionTimeout) + val redisConfig = dataset.datasetConfig.cacheConfig.get + val redisConnect = new RedisConnect(redisConfig.redisDBHost.get, redisConfig.redisDBPort.get, config.redisConnectionTimeout) val pipeline: Pipeline = redisConnect.getConnection(0).pipelined() datasetPipelineMap.put(dataset.id, pipeline) } @@ -37,7 +37,7 @@ class MasterDataCache(val config: MasterDataProcessorConfig) { def process(dataset: Dataset, eventMap: Map[String, JValue]): (Int, Int) = { val pipeline = this.datasetPipelineMap(dataset.id) val dataFromCache = getDataFromCache(dataset, eventMap.keySet, pipeline) - val insertCount = dataFromCache.filter(f => f._2 == null).size + val insertCount = dataFromCache.count(f => f._2 == null) val updCount = dataFromCache.size - insertCount updateCache(dataset, dataFromCache, eventMap, pipeline) (insertCount, updCount) @@ -45,7 +45,7 @@ class MasterDataCache(val config: MasterDataProcessorConfig) { private def getDataFromCache(dataset: Dataset, keys: Set[String], pipeline: Pipeline): mutable.Map[String, String] = { pipeline.clear() - pipeline.select(dataset.datasetConfig.redisDB.get) + pipeline.select(dataset.datasetConfig.cacheConfig.get.redisDB.get) val responses: mutable.Map[String, Response[String]] = mutable.Map[String, Response[String]]() keys.foreach(key => { responses.put(key, pipeline.get(key)) @@ -56,7 +56,7 @@ class MasterDataCache(val config: MasterDataProcessorConfig) { private def updateCache(dataset: Dataset, dataFromCache: mutable.Map[String, String], eventMap: Map[String, JValue], pipeline: Pipeline): Unit = { pipeline.clear() - pipeline.select(dataset.datasetConfig.redisDB.get) + pipeline.select(dataset.datasetConfig.cacheConfig.get.redisDB.get) eventMap.foreach(f => { val key = f._1 val newJson = f._2 diff --git a/pipeline/master-data-processor/src/test/resources/test.conf b/pipeline/master-data-processor/src/test/resources/test.conf index a20636d0..3533006c 100644 --- a/pipeline/master-data-processor/src/test/resources/test.conf +++ b/pipeline/master-data-processor/src/test/resources/test.conf @@ -42,4 +42,4 @@ 
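// A condensed sketch of the pipelined read path used by MasterDataCache above, written
// against the Jedis Pipeline/Response API that class already uses. The real class keeps one
// pipeline per dataset; here a Jedis connection is passed in directly, and the sync() that
// must run before the responses are read (not visible in this hunk) is shown explicitly.
import redis.clients.jedis.{Jedis, Pipeline, Response}
import scala.collection.mutable

object MasterDataCacheReadSketch {
  def readBatch(jedis: Jedis, redisDB: Int, keys: Set[String]): mutable.Map[String, String] = {
    val pipeline: Pipeline = jedis.pipelined()
    pipeline.clear()
    pipeline.select(redisDB)                       // same DB selection as getDataFromCache/updateCache
    val responses = mutable.Map[String, Response[String]]()
    keys.foreach(key => responses.put(key, pipeline.get(key)))
    pipeline.sync()                                // flush queued commands so Response.get() is safe
    responses.map { case (k, r) => (k, r.get()) }  // null value => key not in cache
  }

  // A null cached value means the key is new (insert); anything else is an update,
  // which is how process() derives its (insertCount, updCount) pair.
  def counts(dataFromCache: mutable.Map[String, String]): (Int, Int) = {
    val insertCount = dataFromCache.count(_._2 == null)
    (insertCount, dataFromCache.size - insertCount)
  }
}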
redis { } } -dataset.type = "master-dataset" +dataset.type = "master" diff --git a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala index 5d4545c4..7d8ed0ec 100644 --- a/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala +++ b/pipeline/master-data-processor/src/test/scala/org/sunbird/obsrv/pipeline/MasterDataProcessorStreamTaskTestSpec.scala @@ -63,10 +63,10 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry } private def insertTestData(postgresConnect: PostgresConnect): Unit = { - postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, extraction_config, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) VALUES ('d3', 'master-dataset', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"is_batch_event\":true,\"extraction_key\":\"events\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"}, \"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":3,\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) VALUES ('d4', 'master-dataset', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"},\"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"seatingCapacity\": {\"type\": \"integer\"}, \"safety\": {\"type\": \"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d34-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":4,\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("INSERT INTO dataset_transformations (id, dataset_id, field_key, transformation_function, status, created_by, updated_by, created_date, updated_date) VALUES ('tf3', 'd3', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("INSERT INTO dataset_transformations (id, dataset_id, field_key, transformation_function, status, created_by, 
updated_by, created_date, updated_date) VALUES ('tf4', 'd3', 'dealer.locationId', '{\"type\":\"encrypt\",\"expr\":\"dealer.locationId\"}', 'Live', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, extraction_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('d3', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"is_batch_event\":true,\"extraction_key\":\"events\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"}, \"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d3-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":3,\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+"}', 'Live', 'v1', 'local.masterdata.ingest', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO datasets (id, type, validation_config, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) VALUES ('d4', 'master', '{\"validate\":true,\"mode\":\"Strict\"}', '{\"type\":\"object\",\"$schema\":\"http://json-schema.org/draft-04/schema#\",\"properties\":{\"code\":{\"type\":\"string\"},\"manufacturer\":{\"type\":\"string\"},\"model\":{\"type\":\"string\"},\"variant\":{\"type\":\"string\"},\"modelYear\":{\"type\":\"string\"},\"price\":{\"type\":\"string\"},\"currencyCode\":{\"type\":\"string\"},\"currency\":{\"type\":\"string\"},\"seatingCapacity\": {\"type\": \"integer\"}, \"safety\": {\"type\": \"string\"},\"transmission\":{\"type\":\"string\"},\"fuel\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"email\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"}}}}}', '{\"topic\":\"d34-events\"}', '{\"data_key\":\"code\",\"timestamp_key\":\"date\",\"entry_topic\":\"local.masterdata.ingest\",\"redis_db\":4,\"redis_db_host\":\"localhost\",\"redis_db_port\":"+masterDataConfig.redisPort+"}', 'Live', 'v1', 'local.masterdata.ingest', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO dataset_transformations (id, dataset_id, field_key, transformation_function, created_by, updated_by, created_date, updated_date) VALUES ('tf3', 'd3', 'dealer.email', '{\"type\":\"mask\",\"expr\":\"dealer.email\"}', 'System', 'System', now(), now());") + postgresConnect.execute("INSERT INTO dataset_transformations (id, dataset_id, field_key, transformation_function, created_by, updated_by, created_date, updated_date) VALUES ('tf4', 'd3', 'dealer.locationId', '{\"type\":\"encrypt\",\"expr\":\"dealer.locationId\"}', 'System', 'System', now(), now());") } override def afterAll(): Unit = { @@ -116,7 +116,7 @@ class MasterDataProcessorStreamTaskTestSpec extends BaseSpecWithDatasetRegistry } } else - event.ctx.dataset_type should be(Some("master-dataset")) + event.ctx.dataset_type should be(Some("master")) }) val mutableMetricsMap = mutable.Map[String, Long](); diff --git 
a/pipeline/pom.xml b/pipeline/pom.xml index 1bfcb9ce..d2128647 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -20,10 +20,11 @@ preprocessor denormalizer transformer - druid-router + dataset-router unified-pipeline master-data-processor hudi-connector + cache-indexer diff --git a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala index 93cfefef..f4f34789 100644 --- a/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala +++ b/pipeline/preprocessor/src/main/scala/org/sunbird/obsrv/preprocessor/functions/EventValidationFunction.scala @@ -32,7 +32,7 @@ class EventValidationFunction(config: PipelinePreprocessorConfig)(implicit val e override def open(parameters: Configuration): Unit = { super.open(parameters) schemaValidator = new SchemaValidator() - schemaValidator.loadDataSchemas(DatasetRegistry.getAllDatasets(config.datasetType())) + schemaValidator.loadDataSchemas(DatasetRegistry.getAllDatasets(None)) } override def close(): Unit = { diff --git a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala index d111543b..226d87ec 100644 --- a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala +++ b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/PipelinePreprocessorStreamTestSpec.scala @@ -75,12 +75,12 @@ class PipelinePreprocessorStreamTestSpec extends BaseSpecWithDatasetRegistry { private def prepareTestData(): Unit = { val postgresConnect = new PostgresConnect(postgresConfig) - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Draft', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d4', 'dataset', '" + 
"""{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d5', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"IgnoreNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d6', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string","maxLength":5},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"DiscardNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date) values ('d7', 'dataset', '"+EventFixtures.INVALID_SCHEMA+"', '{\"validate\": true, \"mode\": \"Strict\"}','{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now());") - postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, 
extraction_config, dedup_config, router_config, dataset_config, status, data_version, created_by, updated_by, created_date, updated_date) values ('d8', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": false, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d3', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Draft', 'v1', 'ingest', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d4', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"Strict\"}', 
'{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d5', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string"},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"IgnoreNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d6', 'dataset', '" + """{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"id":{"type":"string","maxLength":5},"vehicleCode":{"type":"string"},"date":{"type":"string"},"dealer":{"type":"object","properties":{"dealerCode":{"type":"string"},"locationId":{"type":"string"},"email":{"type":"string"},"phone":{"type":"string"}},"additionalProperties":false,"required":["dealerCode","locationId"]},"metrics":{"type":"object","properties":{"bookingsTaken":{"type":"integer"},"deliveriesPromised":{"type":"integer"},"deliveriesDone":{"type":"integer"}},"additionalProperties":false}},"additionalProperties":false,"required":["id","vehicleCode","date"]}""" + "', '{\"validate\": true, \"mode\": \"DiscardNewFields\"}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d7', 'dataset', '"+EventFixtures.INVALID_SCHEMA+"', '{\"validate\": true, \"mode\": \"Strict\"}','{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, validation_config, extraction_config, dedup_config, router_config, dataset_config, status, data_version, api_version, entry_topic, created_by, updated_by, created_date, updated_date) values ('d8', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test 
Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"validate\": false, \"mode\": \"Strict\"}', '{\"is_batch_event\": true, \"extraction_key\": \"events\", \"dedup_config\": {\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}}', '{\"drop_duplicates\": true, \"dedup_key\": \"id\", \"dedup_period\": 3}', '{\"topic\":\"d1-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\",\"redis_db_host\":\"localhost\",\"redis_db_port\":"+config.getInt("redis.port")+",\"redis_db\":2}', 'Live', 2, 'v1', 'ingest', 'System', 'System', now(), now());") postgresConnect.closeConnection() } diff --git a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala index 0ba13d65..c05c185d 100644 --- a/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala +++ b/pipeline/preprocessor/src/test/scala/org/sunbird/obsrv/preprocessor/TestSchemaValidator.scala @@ -3,7 +3,7 @@ package org.sunbird.obsrv.preprocessor import com.typesafe.config.{Config, ConfigFactory} import org.scalatest.{FlatSpec, Matchers} import org.sunbird.obsrv.core.util.JSONUtil -import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig, RouterConfig} +import org.sunbird.obsrv.model.DatasetModels.{Dataset, DatasetConfig, IndexingConfig, KeysConfig, RouterConfig} import org.sunbird.obsrv.model.DatasetStatus import org.sunbird.obsrv.preprocessor.fixture.EventFixtures import org.sunbird.obsrv.preprocessor.task.PipelinePreprocessorConfig @@ -17,7 +17,7 @@ class TestSchemaValidator extends FlatSpec with Matchers { "SchemaValidator" should "return a success report for a valid event" in { - val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") schemaValidator.loadDataSchema(dataset) val event = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.VALID_SCHEMA_EVENT) @@ -27,7 +27,7 @@ class TestSchemaValidator extends FlatSpec with Matchers { it should "return a failed validation report for a invalid event" in { - val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + val dataset = Dataset("d1", "dataset", None, None, None, Option(EventFixtures.VALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, 
Some("date"), None)), DatasetStatus.Live, "ingest") schemaValidator.loadDataSchema(dataset) val event1 = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.INVALID_SCHEMA_EVENT) @@ -37,7 +37,7 @@ class TestSchemaValidator extends FlatSpec with Matchers { assert(messages1.size == 1) messages1.head.message should be("object has missing required properties ([\"vehicleCode\"])") messages1.head.keyword should be("required") - messages1.head.missing.get.head should be ("vehicleCode") + messages1.head.missing.get.head should be("vehicleCode") val event2 = JSONUtil.deserialize[Map[String, AnyRef]](EventFixtures.INVALID_SCHEMA_EVENT2) val report2 = schemaValidator.validate("d1", event2) @@ -51,7 +51,7 @@ class TestSchemaValidator extends FlatSpec with Matchers { f.instance.pointer should be("/id") case "array" => f.message should be("instance type (array) does not match any allowed primitive type (allowed: [\"string\"])") - f.instance.pointer should be ("/vehicleCode") + f.instance.pointer should be("/vehicleCode") } }) @@ -65,7 +65,7 @@ class TestSchemaValidator extends FlatSpec with Matchers { case "type" => f.message should be("instance type (integer) does not match any allowed primitive type (allowed: [\"string\"])") f.instance.pointer should be("/id") - f.found.get should be ("integer") + f.found.get should be("integer") f.expected.get.head should be("string") case "additionalProperties" => f.message should be("object instance has properties which are not allowed by the schema: [\"deliveriesRejected\"]") @@ -76,24 +76,24 @@ class TestSchemaValidator extends FlatSpec with Matchers { } it should "validate the negative and missing scenarios" in { - val dataset = Dataset("d4", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA_JSON), None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + val dataset = Dataset("d4", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA_JSON), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") schemaValidator.loadDataSchema(dataset) - schemaValidator.schemaFileExists(dataset) should be (false) + schemaValidator.schemaFileExists(dataset) should be(false) schemaValidator.loadDataSchema(dataset) schemaValidator.schemaFileExists(dataset) should be(false) - val dataset2 = Dataset("d5", "dataset", None, None, None, None, None, RouterConfig(""), DatasetConfig("id","date","ingest"), DatasetStatus.Live) + val dataset2 = Dataset("d5", "dataset", None, None, None, None, None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") schemaValidator.loadDataSchemas(List[Dataset](dataset2)) - schemaValidator.schemaFileExists(dataset2) should be (false) + schemaValidator.schemaFileExists(dataset2) should be(false) - val dataset3 = Dataset("d6", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id", "date", "ingest"), DatasetStatus.Live) + val dataset3 = Dataset("d6", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") 
schemaValidator.loadDataSchemas(List[Dataset](dataset3)) schemaValidator.schemaFileExists(dataset3) should be(false) - val dataset4 = Dataset("d7", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig("id", "date", "ingest"), DatasetStatus.Live) - schemaValidator.schemaFileExists(dataset4) should be (false) + val dataset4 = Dataset("d7", "dataset", None, None, None, Option(EventFixtures.INVALID_SCHEMA), None, RouterConfig(""), DatasetConfig(IndexingConfig(olapStoreEnabled = false, lakehouseEnabled = false, cacheEnabled = false), KeysConfig(Some("id"), None, Some("date"), None)), DatasetStatus.Live, "ingest") + schemaValidator.schemaFileExists(dataset4) should be(false) } } diff --git a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala index 197cf614..13bd1b40 100644 --- a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala +++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TestTransformerFunctionHelper.scala @@ -31,14 +31,14 @@ class TestTransformerFunctionHelper extends BaseSpecWithDatasetRegistry with Mat val json = parse(jsonStr) val dtList = Option(List( - DatasetTransformation("tf1", "obs2.0", "spatialExtent", TransformationFunction("mask", None, "spatialExtent"), "active"), - DatasetTransformation("tf1", "obs2.0", "assetRef", TransformationFunction("mask", None, "assetRef"), "active"), - DatasetTransformation("tf1", "obs2.0", "accountEmail", TransformationFunction("mask", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "accountEmail"), "active"), - DatasetTransformation("tf1", "obs2.0", "accountPhone2", TransformationFunction("mask", None, "accountPhone"), "active"), - DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponents)"), "active"), - DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "$number(value)"), "active"), - DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]"), "active"), - DatasetTransformation("tf1", "obs2.0", "optionalValue", TransformationFunction("jsonata", None, "$number(optionValue)"), "active") + DatasetTransformation("tf1", "obs2.0", "spatialExtent", TransformationFunction("mask", None, "spatialExtent")), + DatasetTransformation("tf1", "obs2.0", "assetRef", TransformationFunction("mask", None, "assetRef")), + DatasetTransformation("tf1", "obs2.0", "accountEmail", TransformationFunction("mask", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "accountEmail")), + DatasetTransformation("tf1", "obs2.0", "accountPhone2", TransformationFunction("mask", None, "accountPhone")), + DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponents)")), + DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "$number(value)")), 
+ DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]")), + DatasetTransformation("tf1", "obs2.0", "optionalValue", TransformationFunction("jsonata", None, "$number(optionValue)")) )) val result = TransformerFunctionHelper.processTransformations(json, dtList) @@ -51,11 +51,11 @@ class TestTransformerFunctionHelper extends BaseSpecWithDatasetRegistry with Mat assert(JSONUtil.getKey("optionalValue", JSONUtil.serialize(result.resultJson)).isMissingNode.equals(true)) val dtList2 = Option(List( - DatasetTransformation("tf1", "obs2.0", "accountPhone", TransformationFunction("mask", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE1'")), "accountPhone"), "active", Some(TransformMode.Lenient)), - DatasetTransformation("tf4", "obs2.0", "asset.assetRef2", TransformationFunction("mask", None, "assetRef2"), "Live", Some(TransformMode.Lenient)), - DatasetTransformation("tf5", "obs2.0", "asset.assetRef3", TransformationFunction("mask", None, "assetRef3"), "Live", Some(TransformMode.Lenient)), - DatasetTransformation("tf6", "obs2.0", "asset.assetRef4", TransformationFunction("mask", None, "assetRef4"), "Live", Some(TransformMode.Lenient)), - DatasetTransformation("tf7", "obs2.0", "asset.assetRef5", TransformationFunction("custom", None, "join(d2.assetRef4)"), "Live", Some(TransformMode.Lenient)) + DatasetTransformation("tf1", "obs2.0", "accountPhone", TransformationFunction("mask", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE1'")), "accountPhone"), Some(TransformMode.Lenient)), + DatasetTransformation("tf4", "obs2.0", "asset.assetRef2", TransformationFunction("mask", None, "assetRef2"), Some(TransformMode.Lenient)), + DatasetTransformation("tf5", "obs2.0", "asset.assetRef3", TransformationFunction("mask", None, "assetRef3"), Some(TransformMode.Lenient)), + DatasetTransformation("tf6", "obs2.0", "asset.assetRef4", TransformationFunction("mask", None, "assetRef4"), Some(TransformMode.Lenient)), + DatasetTransformation("tf7", "obs2.0", "asset.assetRef5", TransformationFunction("custom", None, "join(d2.assetRef4)"), Some(TransformMode.Lenient)) )) val result2 = TransformerFunctionHelper.processTransformations(json, dtList2) result2.status should be(StatusCode.partial) @@ -74,9 +74,9 @@ class TestTransformerFunctionHelper extends BaseSpecWithDatasetRegistry with Mat val json = parse(jsonStr) val dtList = Option(List( - DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponents).length"), "active", Some(TransformMode.Lenient)), - DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "$number(value)"), "active"), - DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]"), "active") + DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponents).length"), Some(TransformMode.Lenient)), + DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "$number(value)")), + DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]")) )) val result = 
TransformerFunctionHelper.processTransformations(json, dtList) result.status should be(StatusCode.partial) @@ -89,10 +89,10 @@ class TestTransformerFunctionHelper extends BaseSpecWithDatasetRegistry with Mat val json = parse(jsonStr) val dtList = Option(List( - DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponent).length"), "active"), - DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "number(value)"), "active"), - DatasetTransformation("tf1", "obs2.0", "valueAsInt2", TransformationFunction("jsonata", None, null), "Live", Some(TransformMode.Lenient)), - DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]"), "active") + DatasetTransformation("tf1", "obs2.0", "codeComponentsList", TransformationFunction("jsonata", Some(Condition("jsonata", "obsCode='M_BATTERY_CHARGE' and accountEmail='firstname.lastname@gmail.com' and $number(value)>=100")), "$keys(codeComponent).length")), + DatasetTransformation("tf1", "obs2.0", "valueAsInt", TransformationFunction("jsonata", None, "number(value)")), + DatasetTransformation("tf1", "obs2.0", "valueAsInt2", TransformationFunction("jsonata", None, null), Some(TransformMode.Lenient)), + DatasetTransformation("tf1", "obs2.0", "firmwareComponent", TransformationFunction("jsonata", None, "codeComponents[0]")) )) val result = TransformerFunctionHelper.processTransformations(json, dtList) result.status should be(StatusCode.failed) @@ -130,12 +130,12 @@ class TestTransformerFunctionHelper extends BaseSpecWithDatasetRegistry with Mat it should "encrypt the fields in the event" in { val json = parse(jsonStr) val dtList = Option(List( - DatasetTransformation("tf1", "obs2.0", "accountEmail", TransformationFunction("encrypt", None, "accountEmail"), "Live"), - DatasetTransformation("tf2", "obs2.0", "accountPhone", TransformationFunction("encrypt", None, "accountPhone"), "Live"), - DatasetTransformation("tf3", "obs2.0", "assetRef", TransformationFunction("encrypt", None, "assetRef"), "Live"), - DatasetTransformation("tf4", "obs2.0", "asset.assetRef2", TransformationFunction("encrypt", None, "assetRef2"), "Live"), - DatasetTransformation("tf5", "obs2.0", "asset.assetRef3", TransformationFunction("encrypt", None, "assetRef3"), "Live"), - DatasetTransformation("tf6", "obs2.0", "asset.assetRef4", TransformationFunction("encrypt", None, "assetRef4"), "Live") + DatasetTransformation("tf1", "obs2.0", "accountEmail", TransformationFunction("encrypt", None, "accountEmail")), + DatasetTransformation("tf2", "obs2.0", "accountPhone", TransformationFunction("encrypt", None, "accountPhone")), + DatasetTransformation("tf3", "obs2.0", "assetRef", TransformationFunction("encrypt", None, "assetRef")), + DatasetTransformation("tf4", "obs2.0", "asset.assetRef2", TransformationFunction("encrypt", None, "assetRef2")), + DatasetTransformation("tf5", "obs2.0", "asset.assetRef3", TransformationFunction("encrypt", None, "assetRef3")), + DatasetTransformation("tf6", "obs2.0", "asset.assetRef4", TransformationFunction("encrypt", None, "assetRef4")) )) val result = TransformerFunctionHelper.processTransformations(json, dtList) val jsonData = compact(render(result.resultJson)) diff --git a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala 
b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala index 4e204ed0..76500f19 100644 --- a/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala +++ b/pipeline/transformer/src/test/scala/org/sunbird/obsrv/transformer/TransformerStreamTestSpec.scala @@ -67,13 +67,13 @@ class TransformerStreamTestSpec extends BaseSpecWithDatasetRegistry { private def insertTestData(): Unit = { val postgresConnect = new PostgresConnect(postgresConfig) - postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") - postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, created_by, updated_by, created_date, updated_date, tags) values ('d4', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") - postgresConnect.execute("insert into dataset_transformations values('tf3', 'd2', 'tfdata.valueAsInt', '{\"type\":\"jsonata\",\"expr\":\"$number(id)\"}', 'Live', null, 'System', 'System', now(), now());") - postgresConnect.execute("insert into dataset_transformations values('tf4', 'd2', 'tfdata.encryptEmail', '{\"type\":\"encrypt\",\"expr\": \"dealer.email\"}', 'Live', 'Lenient', 'System', 'System', now(), now());") - postgresConnect.execute("insert into dataset_transformations values('tf5', 'd4', 'tfdata.expr1', '{\"type\":\"jsonata\",\"expr\":null}', 'Live', null, 'System', 'System', now(), now());") - postgresConnect.execute("insert into dataset_transformations values('tf6', 'd4', 'tfdata.expr2', 
'{\"type\":\"jsonata\",\"expr\":\"$keys(dealer).length\"}', 'Live', null, 'System', 'System', now(), now());") - postgresConnect.execute("insert into dataset_transformations values('tf7', 'd4', 'tfdata.expr3', '{\"type\":\"jsonata\",\"expr\":\"number(id)\"}', 'Live', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d3', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into datasets(id, type, data_schema, router_config, dataset_config, status, api_version, entry_topic, created_by, updated_by, created_date, updated_date, tags) values ('d4', 'dataset', '{\"$schema\":\"https://json-schema.org/draft/2020-12/schema\",\"id\":\"https://sunbird.obsrv.com/test.json\",\"title\":\"Test Schema\",\"description\":\"Test Schema\",\"type\":\"object\",\"properties\":{\"id\":{\"type\":\"string\"},\"vehicleCode\":{\"type\":\"string\"},\"date\":{\"type\":\"string\"},\"dealer\":{\"type\":\"object\",\"properties\":{\"dealerCode\":{\"type\":\"string\"},\"locationId\":{\"type\":\"string\"},\"email\":{\"type\":\"string\"},\"phone\":{\"type\":\"string\"}},\"required\":[\"dealerCode\",\"locationId\"]},\"metrics\":{\"type\":\"object\",\"properties\":{\"bookingsTaken\":{\"type\":\"number\"},\"deliveriesPromised\":{\"type\":\"number\"},\"deliveriesDone\":{\"type\":\"number\"}}}},\"required\":[\"id\",\"vehicleCode\",\"date\",\"dealer\",\"metrics\"]}', '{\"topic\":\"d2-events\"}', '{\"data_key\":\"id\",\"timestamp_key\":\"date\",\"entry_topic\":\"ingest\"}', 'Live', 'v1', 'ingest', 'System', 'System', now(), now(), ARRAY['Tag1','Tag2']);") + postgresConnect.execute("insert into dataset_transformations values('tf3', 'd2', 'tfdata.valueAsInt', '{\"type\":\"jsonata\",\"expr\":\"$number(id)\"}', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf4', 'd2', 'tfdata.encryptEmail', '{\"type\":\"encrypt\",\"expr\": \"dealer.email\"}', 'Lenient', 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf5', 'd4', 'tfdata.expr1', '{\"type\":\"jsonata\",\"expr\":null}', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations values('tf6', 'd4', 'tfdata.expr2', '{\"type\":\"jsonata\",\"expr\":\"$keys(dealer).length\"}', null, 'System', 'System', now(), now());") + postgresConnect.execute("insert into dataset_transformations 
values('tf7', 'd4', 'tfdata.expr3', '{\"type\":\"jsonata\",\"expr\":\"number(id)\"}', null, 'System', 'System', now(), now());") postgresConnect.closeConnection() } diff --git a/pipeline/unified-pipeline/pom.xml b/pipeline/unified-pipeline/pom.xml index a14eb0f7..37d06fe3 100644 --- a/pipeline/unified-pipeline/pom.xml +++ b/pipeline/unified-pipeline/pom.xml @@ -67,7 +67,7 @@ org.sunbird.obsrv.pipeline - druid-router + dataset-router 1.0.0 diff --git a/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala index ed03b88b..f24bb256 100644 --- a/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala +++ b/pipeline/unified-pipeline/src/main/scala/org/sunbird/obsrv/pipeline/task/UnifiedPipelineStreamTask.scala @@ -9,7 +9,7 @@ import org.sunbird.obsrv.core.util.FlinkUtil import org.sunbird.obsrv.denormalizer.task.{DenormalizerConfig, DenormalizerStreamTask} import org.sunbird.obsrv.extractor.task.{ExtractorConfig, ExtractorStreamTask} import org.sunbird.obsrv.preprocessor.task.{PipelinePreprocessorConfig, PipelinePreprocessorStreamTask} -import org.sunbird.obsrv.router.task.{DruidRouterConfig, DynamicRouterStreamTask} +import org.sunbird.obsrv.router.task.{DynamicRouterConfig, DynamicRouterStreamTask} import org.sunbird.obsrv.transformer.task.{TransformerConfig, TransformerStreamTask} import java.io.File @@ -43,7 +43,7 @@ class UnifiedPipelineStreamTask(config: Config, pipelineConfig: UnifiedPipelineC val preprocessorTask = new PipelinePreprocessorStreamTask(new PipelinePreprocessorConfig(config), kafkaConnector) val denormalizerTask = new DenormalizerStreamTask(new DenormalizerConfig(config), kafkaConnector) val transformerTask = new TransformerStreamTask(new TransformerConfig(config), kafkaConnector) - val routerTask = new DynamicRouterStreamTask(new DruidRouterConfig(config), kafkaConnector) + val routerTask = new DynamicRouterStreamTask(new DynamicRouterConfig(config), kafkaConnector) routerTask.processStream( transformerTask.processStream( From c8b7d48ba98a715c8b3e8ae28129d9af71433885 Mon Sep 17 00:00:00 2001 From: Manjunath Davanam Date: Tue, 3 Sep 2024 10:59:27 +0530 Subject: [PATCH 37/37] Obsrv V2 Release (#19) * #OBS-I182: Fix the issue with cache indexer * #OBS-I182: Cache Indexer fix |Removed the kafka-client and casting the number to long value * #OBS-I182: Cache Indexer fix | Removing the obsrv_meta information before indexing into cache --------- Co-authored-by: Santhosh Vasabhaktula --- .../BaseDatasetProcessFunction.scala | 4 +-- .../sunbird/obsrv/core/serde/SerdeUtil.scala | 34 +++++++++++++++++++ .../obsrv/core/streaming/BaseStreamTask.scala | 7 ++++ .../core/streaming/FlinkKafkaConnector.scala | 9 +++++ pipeline/cache-indexer/pom.xml | 12 ------- .../streaming/CacheIndexerStreamTask.scala | 2 +- .../sunbird/obsrv/util/MasterDataCache.scala | 9 +++-- .../sunbird/obsrv/fixture/EventFixture.scala | 10 +++--- 8 files changed, 65 insertions(+), 22 deletions(-) diff --git a/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala b/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala index fede2b54..9c454ec0 100644 --- a/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala +++ 
b/dataset-registry/src/main/scala/org/sunbird/obsrv/streaming/BaseDatasetProcessFunction.scala @@ -28,11 +28,11 @@ trait SystemEventHandler { } private def getTime(timespans: Map[String, AnyRef], producer: Producer): Option[Long] = { - timespans.get(producer.toString).map(f => f.asInstanceOf[Long]) + timespans.get(producer.toString).map(f => f.asInstanceOf[Number].longValue()) } private def getStat(obsrvMeta: Map[String, AnyRef], stat: Stats): Option[Long] = { - obsrvMeta.get(stat.toString).map(f => f.asInstanceOf[Long]) + obsrvMeta.get(stat.toString).map(f => f.asInstanceOf[Number].longValue()) } def getError(error: ErrorConstants.Error, producer: Producer, functionalError: FunctionalError): Option[ErrorLog] = { diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala index 370353c7..d68b924a 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/serde/SerdeUtil.scala @@ -46,6 +46,40 @@ class MapDeserializationSchema extends KafkaRecordDeserializationSchema[mutable. } +class TopicDeserializationSchema extends KafkaRecordDeserializationSchema[mutable.Map[String, AnyRef]] { + + private val serialVersionUID = -3224825136576915426L + + override def getProducedType: TypeInformation[mutable.Map[String, AnyRef]] = TypeExtractor.getForClass(classOf[mutable.Map[String, AnyRef]]) + + override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]], out: Collector[mutable.Map[String, AnyRef]]): Unit = { + val msg = try { + val event = JSONUtil.deserialize[Map[String, AnyRef]](record.value()) + mutable.Map[String, AnyRef]( + "dataset" -> record.topic(), + "event" -> event + ) + } catch { + case _: Exception => + mutable.Map[String, AnyRef](Constants.INVALID_JSON -> new String(record.value, "UTF-8")) + } + initObsrvMeta(msg, record) + out.collect(msg) + } + + private def initObsrvMeta(msg: mutable.Map[String, AnyRef], record: ConsumerRecord[Array[Byte], Array[Byte]]): Unit = { + if (!msg.contains("obsrv_meta")) { + msg.put("obsrv_meta", Map( + "syncts" -> record.timestamp(), + "processingStartTime" -> System.currentTimeMillis(), + "flags" -> Map(), + "timespans" -> Map(), + "error" -> Map() + )) + } + } +} + class StringDeserializationSchema extends KafkaRecordDeserializationSchema[String] { private val serialVersionUID = -3224825136576915426L diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala index 8ebdb8a7..bdc897da 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/BaseStreamTask.scala @@ -38,6 +38,13 @@ abstract class BaseStreamTask[T] extends BaseStreamTaskSink[T] { .rebalance() } + def getTopicMapDataStream(env: StreamExecutionEnvironment, config: BaseJobConfig[T], kafkaTopics: List[String], + consumerSourceName: String, kafkaConnector: FlinkKafkaConnector): DataStream[mutable.Map[String, AnyRef]] = { + env.fromSource(kafkaConnector.kafkaTopicMapSource(kafkaTopics), WatermarkStrategy.noWatermarks[mutable.Map[String, AnyRef]](), consumerSourceName) + .uid(consumerSourceName).setParallelism(config.kafkaConsumerParallelism) + .rebalance() + } + def getStringDataStream(env: StreamExecutionEnvironment, config: BaseJobConfig[T], kafkaConnector: FlinkKafkaConnector): DataStream[String] = { 
env.fromSource(kafkaConnector.kafkaStringSource(config.inputTopic()), WatermarkStrategy.noWatermarks[String](), config.inputConsumer()) .uid(config.inputConsumer()).setParallelism(config.kafkaConsumerParallelism) diff --git a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala index 508e1e7c..39552dd7 100644 --- a/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala +++ b/framework/src/main/scala/org/sunbird/obsrv/core/streaming/FlinkKafkaConnector.scala @@ -47,6 +47,15 @@ class FlinkKafkaConnector(config: BaseJobConfig[_]) extends Serializable { .build() } + def kafkaTopicMapSource(kafkaTopics: List[String]): KafkaSource[mutable.Map[String, AnyRef]] = { + KafkaSource.builder[mutable.Map[String, AnyRef]]() + .setTopics(kafkaTopics.asJava) + .setDeserializer(new TopicDeserializationSchema) + .setProperties(config.kafkaConsumerProperties()) + .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.EARLIEST)) + .build() + } + def kafkaMapDynamicSink(): KafkaSink[mutable.Map[String, AnyRef]] = { KafkaSink.builder[mutable.Map[String, AnyRef]]() .setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE) diff --git a/pipeline/cache-indexer/pom.xml b/pipeline/cache-indexer/pom.xml index 7d9ed5a8..36d76208 100644 --- a/pipeline/cache-indexer/pom.xml +++ b/pipeline/cache-indexer/pom.xml @@ -37,24 +37,12 @@ org.sunbird.obsrv dataset-registry 1.0.0 - - - org.apache.kafka - kafka-clients - - org.json4s json4s-native_${scala.maj.version} 4.0.6 - - org.apache.kafka - kafka-clients - ${kafka.version} - test - org.apache.kafka kafka_${scala.maj.version} diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala index 61b9ddec..735440b7 100644 --- a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala +++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/streaming/CacheIndexerStreamTask.scala @@ -32,7 +32,7 @@ class CacheIndexerStreamTask(config: CacheIndexerConfig, kafkaConnector: FlinkKa val datasets = DatasetRegistry.getAllDatasets(Some(DatasetType.master.toString)) val datasetIds = datasets.map(f => f.id) - val dataStream = getMapDataStream(env, config, datasetIds, config.kafkaConsumerProperties(), consumerSourceName = s"cache-indexer-consumer", kafkaConnector) + val dataStream = getTopicMapDataStream(env, config, datasetIds, consumerSourceName = s"cache-indexer-consumer", kafkaConnector) processStream(dataStream) } diff --git a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala index c3365255..c5f95f32 100644 --- a/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala +++ b/pipeline/cache-indexer/src/main/scala/org/sunbird/obsrv/util/MasterDataCache.scala @@ -1,9 +1,10 @@ package org.sunbird.obsrv.util import org.json4s.native.JsonMethods._ -import org.json4s.{JNothing, JValue} +import org.json4s.{JField, JNothing, JValue} import org.slf4j.LoggerFactory import org.sunbird.obsrv.core.cache.RedisConnect +import org.sunbird.obsrv.core.model.Constants.OBSRV_META import org.sunbird.obsrv.model.DatasetModels.Dataset import org.sunbird.obsrv.pipeline.task.CacheIndexerConfig import redis.clients.jedis.Jedis @@ 
-37,7 +38,11 @@ class MasterDataCache(val config: CacheIndexerConfig) { def process(dataset: Dataset, key: String, event: JValue): (Int, Int) = { val jedis = this.datasetPipelineMap(dataset.id) val dataFromCache = getDataFromCache(dataset, key, jedis) - updateCache(dataset, dataFromCache, key, event, jedis) + val updatedEvent = event.removeField { + case JField(OBSRV_META, _) => true + case _ => false + } + updateCache(dataset, dataFromCache, key, updatedEvent, jedis) (if (dataFromCache == null) 1 else 0, if (dataFromCache == null) 0 else 1) } diff --git a/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala index cf28aec5..078cde33 100644 --- a/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala +++ b/pipeline/cache-indexer/src/test/scala/org/sunbird/obsrv/fixture/EventFixture.scala @@ -2,9 +2,9 @@ package org.sunbird.obsrv.fixture object EventFixture { - val VALID_BATCH_EVENT_D3_INSERT = """{"dataset":"dataset3","event":{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}}""" - val VALID_BATCH_EVENT_D3_INSERT_2 = """{"dataset":"dataset3","event":{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"}}}""" - val VALID_BATCH_EVENT_D3_UPDATE = """{"dataset":"dataset3","event":{"code":"HYUN-CRE-D6","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"safety":"3 Star (Global NCAP)","seatingCapacity":5}}""" - val VALID_BATCH_EVENT_D4 = """{"dataset":"dataset4","event":{"code":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}""" - val INVALID_BATCH_EVENT_D4 = """{"dataset":"dataset4","event":{"code1":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}}""" + val VALID_BATCH_EVENT_D3_INSERT = """{"code":"HYUN-CRE-D6","manufacturer":"Hyundai","model":"Creta","variant":"SX(O)","modelYear":"2023","price":"2200000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"}}""" + val VALID_BATCH_EVENT_D3_INSERT_2 = """{"code":"HYUN-TUC-D6","manufacturer":"Hyundai","model":"Tucson","variant":"Signature","modelYear":"2023","price":"4000000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","dealer":{"email":"admin.hyun@gmail.com","locationId":"KUN134567"}}""" + val VALID_BATCH_EVENT_D3_UPDATE = """{"code":"HYUN-CRE-D6","dealer":{"email":"john.doe@example.com","locationId":"KUN12345"},"safety":"3 Star (Global NCAP)","seatingCapacity":5}""" + val VALID_BATCH_EVENT_D4 = """{"code":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model 
S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}""" + val INVALID_BATCH_EVENT_D4 = """{"code1":"JEEP-CP-D3","manufacturer":"Jeep","model":"Compass","variant":"Model S (O) Diesel 4x4 AT","modelYear":"2023","price":"3800000","currencyCode":"INR","currency":"Indian Rupee","transmission":"automatic","fuel":"Diesel","safety":"5 Star (Euro NCAP)","seatingCapacity":5}""" }