
add ability to switch off/on generation of Parquet Files #1074

Merged: 29 commits, Jan 16, 2025

Commits
9ffab3a
add ability to switch off/on creation of parquet dwh
mozzy11 May 29, 2024
ef22c53
Merge branch 'master' into isolate_fhir_sync
mozzy11 May 31, 2024
f37bbeb
run e2e tests for parquet and fhir sink independently
mozzy11 May 31, 2024
fbcf38c
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jun 3, 2024
564d68a
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jun 3, 2024
f2da0ea
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jun 6, 2024
66ad736
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jul 8, 2024
e296d6c
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jul 9, 2024
effc19d
Update pipelines/controller/config/application.yaml
mozzy11 Jan 2, 2025
162c89b
Update pipelines/batch/src/main/java/com/google/fhir/analytics/FhirEt…
mozzy11 Jan 2, 2025
104990a
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jan 2, 2025
345d321
Update PR and adress review comments
mozzy11 Jan 5, 2025
a3068a1
re-triger
mozzy11 Jan 6, 2025
a543f25
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jan 6, 2025
9cc8d65
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jan 7, 2025
0ee09d6
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jan 7, 2025
1f0acf3
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jan 8, 2025
d0a8d88
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jan 13, 2025
154ec79
fix typo
mozzy11 Jan 13, 2025
e93f8e5
Update e2e-tests/controller-spark/controller_spark_sql_validation.sh
mozzy11 Jan 13, 2025
e732412
Update docker/compose-controller-spark-sql-single.yaml
mozzy11 Jan 13, 2025
b7fdb9a
Update cloudbuild.yaml
mozzy11 Jan 13, 2025
11ae8d1
Update cloudbuild.yaml
mozzy11 Jan 13, 2025
a951117
Update cloudbuild.yaml
mozzy11 Jan 13, 2025
8489378
Update cloudbuild.yaml
mozzy11 Jan 13, 2025
8750385
Merge branch 'master' into isolate_fhir_sync
mozzy11 Jan 13, 2025
7cea4ae
addres comments
mozzy11 Jan 13, 2025
47c81d8
fix typo
mozzy11 Jan 13, 2025
e9933e4
minor update
mozzy11 Jan 13, 2025
51 changes: 38 additions & 13 deletions cloudbuild.yaml
@@ -189,23 +189,14 @@ steps:
'-c', 'CREATE DATABASE views;']
waitFor: ['Turn down FHIR Sink Server Search']

- name: 'docker/compose'
id: 'Launch HAPI FHIR Sink Server Controller'
args: [ '-f', './docker/sink-compose.yml', '-p', 'sink-server-controller', 'up','--force-recreate', '-d' ]
env:
- SINK_SERVER_NAME=sink-server-controller
- SINK_SERVER_PORT=9001
waitFor: ['Create views database']

- name: 'docker/compose'
id: 'Bring up controller and Spark containers'
env:
- PIPELINE_CONFIG=/workspace/docker/config
- DWH_ROOT=/workspace/e2e-tests/controller-spark/dwh
- FHIRDATA_SINKFHIRSERVERURL=http://sink-server-controller:8080/fhir
args: [ '-f', './docker/compose-controller-spark-sql-single.yaml', 'up',
'--force-recreate', '-d' ]
waitFor: ['Launch HAPI FHIR Sink Server Controller']
waitFor: ['Create views database']

- name: '${_REPOSITORY}/e2e-tests/controller-spark:${_TAG}'
id: 'Run E2E Test for Dockerized Controller and Spark Thriftserver'
@@ -223,17 +214,51 @@

- name: 'docker/compose'
id: 'Bring down controller and Spark containers'
args: [ '-f', './docker/compose-controller-spark-sql-single.yaml', 'down' ]
args: [ '-f', './docker/compose-controller-spark-sql-single.yaml', 'down' ,'-v']
waitFor: ['Run E2E Test for Dockerized Controller and Spark Thriftserver']

- name: 'docker/compose'
id: 'Launch HAPI FHIR Sink Server'
args: [ '-f', './docker/sink-compose.yml', '-p', 'sink-server', 'up','--force-recreate', '-d' ]
env:
- SINK_SERVER_NAME=sink-server-controller
- SINK_SERVER_PORT=9001
waitFor: ['Bring down controller and Spark containers']

# Spinning up only the pipeline controller for FHIR server to FHIR server sync
- name: 'docker/compose'
id: 'Bring up the pipeline controller for FHIR server to FHIR server sync'
env:
- PIPELINE_CONFIG=/workspace/docker/config
- DWH_ROOT=/workspace/e2e-tests/controller-spark/dwh
- FHIRDATA_SINKFHIRSERVERURL=http://sink-server-controller:8080/fhir
- FHIRDATA_GENERATEPARQUETFILES=false
- FHIRDATA_CREATEHIVERESOURCETABLES=false
- FHIRDATA_CREATEPARQUETVIEWS=false
- FHIRDATA_SINKDBCONFIGPATH=
args: [ '-f', './docker/compose-controller-spark-sql-single.yaml', 'up',
'--force-recreate', '--no-deps' , '-d' ,'pipeline-controller' ]
waitFor: ['Launch HAPI FHIR Sink Server']

- name: '${_REPOSITORY}/e2e-tests/controller-spark:${_TAG}'
id: 'Run E2E Test for Dockerized Controller in FHIR server to FHIR server sync mode'
waitFor: ['Bring up the pipeline controller for FHIR server to FHIR server sync']
env:
- DWH_TYPE="FHIR"

- name: 'docker/compose'
id: 'Bring down the pipeline controller'
args: [ '-f', './docker/compose-controller-spark-sql-single.yaml', 'down' ,'-v']
waitFor: ['Run E2E Test for Dockerized Controller in FHIR server to FHIR server sync mode']

- name: 'docker/compose'
id: 'Turn down HAPI Source Server'
args: [ '-f', './docker/hapi-compose.yml', 'down' ]
waitFor: ['Bring down controller and Spark containers']
waitFor: ['Bring down the pipeline controller']

- name: 'docker/compose'
id: 'Turn down FHIR Sink Server Controller for e2e tests'
args: [ '-f', './docker/sink-compose.yml', '-p', 'sink-server-controller', 'down' ,'-v']
args: [ '-f', './docker/sink-compose.yml', '-p', 'sink-server', 'down' ,'-v']
env:
- SINK_SERVER_NAME=sink-server-controller
- SINK_SERVER_PORT=9001
8 changes: 6 additions & 2 deletions docker/compose-controller-spark-sql-single.yaml
@@ -62,8 +62,12 @@ services:
- ${DWH_ROOT}:/dwh
environment:
- JAVA_OPTS=$JAVA_OPTS
# This is to turn this on in e2e but leave it off in the default config.
- FHIRDATA_SINKFHIRSERVERURL=$FHIRDATA_SINKFHIRSERVERURL
# This is to override the values in the default config.
- FHIRDATA_SINKFHIRSERVERURL=${FHIRDATA_SINKFHIRSERVERURL:-}
- FHIRDATA_GENERATEPARQUETFILES=${FHIRDATA_GENERATEPARQUETFILES:-true}
- FHIRDATA_CREATEHIVERESOURCETABLES=${FHIRDATA_CREATEHIVERESOURCETABLES:-true}
- FHIRDATA_CREATEPARQUETVIEWS=${FHIRDATA_CREATEPARQUETVIEWS:-true}
- FHIRDATA_SINKDBCONFIGPATH=${FHIRDATA_SINKDBCONFIGPATH:-config/hapi-postgres-config_local_views.json}
ports:
- '8090:8080'
networks:
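To exercise these overrides locally, the sketch below mirrors the FHIR-server-to-FHIR-server sync step from cloudbuild.yaml. It assumes the repository root as the working directory and the Compose v2 CLI; the config and DWH paths are illustrative.

  # Start only the pipeline controller, with Parquet generation switched off.
  export PIPELINE_CONFIG="$PWD/docker/config"
  export DWH_ROOT="$PWD/e2e-tests/controller-spark/dwh"
  export FHIRDATA_SINKFHIRSERVERURL="http://sink-server-controller:8080/fhir"
  export FHIRDATA_GENERATEPARQUETFILES=false
  export FHIRDATA_CREATEHIVERESOURCETABLES=false
  export FHIRDATA_CREATEPARQUETVIEWS=false
  export FHIRDATA_SINKDBCONFIGPATH=
  docker compose -f ./docker/compose-controller-spark-sql-single.yaml up \
    --force-recreate --no-deps -d pipeline-controller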
1 change: 1 addition & 0 deletions docker/config/application.yaml
@@ -26,6 +26,7 @@ fhirdata:
# fhirServerUrl: "http://hapi-server:8080/fhir"
dbConfig: "config/hapi-postgres-config_local.json"
dwhRootPrefix: "/dwh/controller_DWH"
generateParquetFiles: true
incrementalSchedule: "0 0 * * * *"
purgeSchedule: "0 30 * * * *"
numOfDwhSnapshotsToRetain: 2
3 changes: 2 additions & 1 deletion e2e-tests/controller-spark/Dockerfile
@@ -23,5 +23,6 @@ COPY parquet-tools-1.11.1.jar parquet-tools-1.11.1.jar
ENV PARQUET_SUBDIR="dwh"
ENV DOCKER_NETWORK="--use_docker_network"
ENV HOME_DIR="/workspace/e2e-tests/controller-spark"
ENV DWH_TYPE="PARQUET"

ENTRYPOINT cd ${HOME_DIR}; ./controller_spark_sql_validation.sh ${HOME_DIR} ${PARQUET_SUBDIR} ${DOCKER_NETWORK}
ENTRYPOINT cd ${HOME_DIR}; ./controller_spark_sql_validation.sh ${HOME_DIR} ${PARQUET_SUBDIR} ${DOCKER_NETWORK} ${DWH_TYPE}
63 changes: 49 additions & 14 deletions e2e-tests/controller-spark/controller_spark_sql_validation.sh
@@ -65,7 +65,13 @@ function validate_args() {
# anything that needs printing
#################################################
function print_message() {
local print_prefix="E2E TEST FOR CONTROLLER SPARK DEPLOYMENT:"
local print_prefix=""
if [[ "${DWH_TYPE}" == "PARQUET" ]]
then
print_prefix="E2E TEST FOR CONTROLLER PARQUET BASED DEPLOYMENT:"
else
print_prefix="E2E TEST FOR CONTROLLER FHIR SERVER TO FHIR SERVER SYNC:"
fi
echo "${print_prefix} $*"
}

@@ -88,6 +94,7 @@ function print_message() {
function setup() {
HOME_PATH=$1
PARQUET_SUBDIR=$2
DWH_TYPE=$4
SOURCE_FHIR_SERVER_URL='http://localhost:8091'
SINK_FHIR_SERVER_URL='http://localhost:8098'
PIPELINE_CONTROLLER_URL='http://localhost:8090'
@@ -410,6 +417,26 @@ function validate_updated_resource() {
}


function validate_updated_resource_in_fhir_sink() {
local fhir_username="hapi"
local fhir_password="hapi"
local fhir_url_extension="/fhir"

# Fetch the patient resource using the Patient ID
local updated_family_name=$(curl -X GET -H "Content-Type: application/json; charset=utf-8" -u $fhir_username:$fhir_password \
--connect-timeout 5 --max-time 20 "${SINK_FHIR_SERVER_URL}${fhir_url_extension}/Patient/${PATIENT_ID}" \
| jq -r '.name[0].family')

if [[ "${updated_family_name}" == "Anderson" ]]
then
print_message "Updated Patient data for ${PATIENT_ID} in FHIR sink verified successfully."
else
print_message "Updated Patient data verification for ${PATIENT_ID} in FHIR sink failed."
exit 6
fi
}


#################################################
# Function that counts resources in FHIR server and compares output to what is
# in the source FHIR server
@@ -451,10 +478,15 @@ validate_args "$@"
setup "$@"
fhir_source_query
sleep 30
# Full run.
run_pipeline "FULL"
wait_for_completion
check_parquet false
test_fhir_sink "FULL"
if [[ "${DWH_TYPE}" == "PARQUET" ]]
then
check_parquet false
else
test_fhir_sink "FULL"
fi

clear

@@ -463,16 +495,19 @@ update_resource
# Incremental run.
run_pipeline "INCREMENTAL"
wait_for_completion
check_parquet true
fhir_source_query
test_fhir_sink "INCREMENTAL"

validate_resource_tables
validate_resource_tables_data
validate_updated_resource

# View recreation run
# TODO add validation for the views as well
run_pipeline "VIEWS"
if [[ "${DWH_TYPE}" == "PARQUET" ]]
then
check_parquet true
validate_resource_tables
validate_resource_tables_data
validate_updated_resource
# View recreation run
# TODO add validation for the views as well
run_pipeline "VIEWS"
else
fhir_source_query
test_fhir_sink "INCREMENTAL"
validate_updated_resource_in_fhir_sink
fi

print_message "END!!"
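Following the updated Dockerfile ENTRYPOINT, the script now receives the DWH type as a fourth positional argument. A sketch of the two invocations, using the paths baked into the e2e image:

  # Parquet-based validation (the default in the e2e image):
  ./controller_spark_sql_validation.sh /workspace/e2e-tests/controller-spark dwh --use_docker_network PARQUET

  # FHIR-server-to-FHIR-server sync validation, skipping the Parquet/Spark checks:
  ./controller_spark_sql_validation.sh /workspace/e2e-tests/controller-spark dwh --use_docker_network FHIR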
@@ -1,5 +1,5 @@
/*
* Copyright 2020-2024 Google LLC
* Copyright 2020-2025 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -143,7 +143,7 @@ public void writeResource(HapiRowDescriptor element)

numFetchedResourcesMap.get(resourceType).inc(1);

if (!parquetFile.isEmpty()) {
if (parquetUtil != null) {
startTime = System.currentTimeMillis();
parquetUtil.write(resource);
totalGenerateTimeMillisMap.get(resourceType).inc(System.currentTimeMillis() - startTime);
@@ -1,5 +1,5 @@
/*
* Copyright 2020-2024 Google LLC
* Copyright 2020-2025 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -88,6 +88,8 @@ abstract class FetchSearchPageFn<T> extends DoFn<T, KV<String, Integer>> {

protected final String parquetFile;

protected final Boolean generateParquetFiles;

private final int secondsToFlush;

private final int rowGroupSize;
@@ -135,6 +137,7 @@ abstract class FetchSearchPageFn<T> extends DoFn<T, KV<String, Integer>> {
this.oAuthClientSecret = options.getFhirServerOAuthClientSecret();
this.stageIdentifier = stageIdentifier;
this.parquetFile = options.getOutputParquetPath();
this.generateParquetFiles = options.isGenerateParquetFiles();
this.secondsToFlush = options.getSecondsToFlushParquetFiles();
this.rowGroupSize = options.getRowGroupSizeForParquetFiles();
if (DATAFLOW_RUNNER.equals(options.getRunner().getSimpleName())) {
@@ -210,7 +213,7 @@ public void setup() throws SQLException, ProfileException {
oAuthClientSecret,
fhirContext);
fhirSearchUtil = new FhirSearchUtil(fetchUtil);
if (!Strings.isNullOrEmpty(parquetFile)) {
if (generateParquetFiles && !Strings.isNullOrEmpty(parquetFile)) {
parquetUtil =
new ParquetUtil(
fhirContext.getVersion().getVersion(),
@@ -1,5 +1,5 @@
/*
* Copyright 2020-2024 Google LLC
* Copyright 2020-2025 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -264,5 +264,13 @@ public interface FhirEtlOptions extends BasePipelineOptions {
@Default.String("")
String getSourceNdjsonFilePatternList();

@Description(
"Flag to switch off/on generation of parquet files; can be turned off when syncing from a"
+ " FHIR server to another.")
@Default.Boolean(true)
Boolean isGenerateParquetFiles();

void setGenerateParquetFiles(Boolean value);

void setSourceNdjsonFilePatternList(String value);
}
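Since Beam's PipelineOptions conventionally derives a --generateParquetFiles flag from the isGenerateParquetFiles() getter, a standalone batch run could plausibly disable Parquet output as sketched below; the jar path is illustrative and only the last flag is the option added in this PR.

  # Hedged sketch: run the batch pipeline against a source FHIR server with Parquet output disabled.
  java -jar pipelines/batch/target/batch-bundled.jar \
    --fhirServerUrl=http://localhost:8091/fhir \
    --generateParquetFiles=false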
3 changes: 3 additions & 0 deletions pipelines/controller/config/application.yaml
@@ -77,6 +77,9 @@ fhirdata:
# that directory too, such that files created by the pipelines are readable by
# the Thrift Server, e.g., `setfacl -d -m o::rx dwh/`.
dwhRootPrefix: "dwh/controller_DEV_DWH"
# Whether to generate Parquet Files or not. In case of syncing from a FHIR server to another,
# if Parquet files are not needed, their generation can be switched off by this flag.
generateParquetFiles: true

# The schedule for automatic incremental pipeline runs.
# Uses the Spring CronExpression format, i.e.,
@@ -1,5 +1,5 @@
/*
* Copyright 2020-2024 Google LLC
* Copyright 2020-2025 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -119,6 +119,8 @@ public class DataProperties {

private int recursiveDepth;

private boolean generateParquetFiles;

@PostConstruct
void validateProperties() {
CronExpression.parse(incrementalSchedule);
@@ -133,8 +135,12 @@ void validateProperties() {
!Strings.isNullOrEmpty(fhirServerUrl) || !Strings.isNullOrEmpty(dbConfig),
"At least one of fhirServerUrl or dbConfig should be set!");

Preconditions.checkArgument(
!Strings.isNullOrEmpty(dwhRootPrefix), "dwhRootPrefix is required!");
Preconditions.checkState(fhirVersion != null, "FhirVersion cannot be empty");
Preconditions.checkState(!createHiveResourceTables || !thriftserverHiveConfig.isEmpty());
Preconditions.checkState(!createHiveResourceTables || generateParquetFiles);
Preconditions.checkState(!createParquetViews || generateParquetFiles);
}

private PipelineConfig.PipelineConfigBuilder addFlinkOptions(FhirEtlOptions options) {
@@ -212,6 +218,8 @@ PipelineConfig createBatchOptions() {
String timestampSuffix = DwhFiles.safeTimestampSuffix();
options.setOutputParquetPath(dwhRootPrefix + DwhFiles.TIMESTAMP_PREFIX + timestampSuffix);

options.setGenerateParquetFiles(generateParquetFiles);

PipelineConfig.PipelineConfigBuilder pipelineConfigBuilder = addFlinkOptions(options);

// Get hold of thrift server parquet directory from dwhRootPrefix config.
Expand All @@ -231,6 +239,8 @@ List<ConfigFields> getConfigParams() {
"fhirdata.fhirFetchMode", fhirFetchMode != null ? fhirFetchMode.name() : "", "", ""),
new ConfigFields("fhirdata.fhirServerUrl", fhirServerUrl, "", ""),
new ConfigFields("fhirdata.dwhRootPrefix", dwhRootPrefix, "", ""),
new ConfigFields(
"fhirdata.generateParquetFiles", String.valueOf(generateParquetFiles), "", ""),
new ConfigFields("fhirdata.incrementalSchedule", incrementalSchedule, "", ""),
new ConfigFields("fhirdata.purgeSchedule", purgeSchedule, "", ""),
new ConfigFields(