From 22e7636e33d42d7363be6b52c5c9a5fb3979e610 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Wed, 21 Aug 2024 12:58:27 +0100 Subject: [PATCH] refactoring code to parquet to zip2parquet --- .make.versions | 4 +- kfp/README.md | 2 +- tools/ingest2parquet/README.md | 2 +- .../python/test-data/expected/metadata.json | 40 ---------- .../ray/test-data/expected/metadata.json | 40 ---------- .../zip2parquet}/Makefile | 0 .../zip2parquet}/README.md | 0 .../zip2parquet}/kfp_ray/Makefile | 0 .../zip2parquet}/kfp_ray/README.md | 2 +- .../zip2parquet/kfp_ray/zip2parquet_wf.py} | 74 ++++++++++-------- .../zip2parquet}/python/.dockerignore | 0 .../zip2parquet/python}/.gitignore | 2 +- .../zip2parquet}/python/Dockerfile | 10 +-- .../zip2parquet}/python/Makefile | 8 +- .../zip2parquet}/python/README.md | 66 +++++++++++----- .../zip2parquet}/python/pyproject.toml | 6 +- .../python/src/zip2parquet_local.py} | 10 ++- .../python/src/zip2parquet_local_python.py} | 4 +- .../python/src/zip2parquet_s3_python.py} | 4 +- .../python/src/zip2parquet_transform.py} | 35 +++++++-- .../src/zip2parquet_transform_python.py} | 7 +- .../expected/application-java.parquet | Bin 9225 -> 9223 bytes .../expected/data-processing-lib.parquet | Bin 23951 -> 23942 bytes ...ronments_archive_refs_heads_master.parquet | Bin 36991 -> 37004 bytes .../python/test-data/expected/metadata.json | 49 ++++++++++++ .../test-data/input/application-java.zip | Bin .../test-data/input/data-processing-lib.zip | Bin ...environments_archive_refs_heads_master.zip | Bin .../test-data/languages/lang_extensions.json | 0 .../python/test/test_zip2parquet.py} | 2 +- .../python/test/test_zip2parquet_python.py} | 4 +- .../zip2parquet}/ray/.dockerignore | 0 .../zip2parquet/ray}/.gitignore | 0 .../zip2parquet}/ray/Dockerfile | 10 +-- .../zip2parquet}/ray/Makefile | 8 +- .../zip2parquet}/ray/README.md | 12 +-- .../zip2parquet}/ray/pyproject.toml | 8 +- .../ray/src/zip2parquet_local_ray.py} | 5 +- .../ray/src/zip2parquet_s3_ray.py} | 4 +- 
.../ray/src/zip2parquet_transform_ray.py} | 2 +- .../expected/application-java.parquet | Bin 9225 -> 9223 bytes .../expected/data-processing-lib.parquet | Bin 23951 -> 23942 bytes ...ronments_archive_refs_heads_master.parquet | Bin 36991 -> 37004 bytes .../ray/test-data/expected/metadata.json | 49 ++++++++++++ .../ray/test-data/input/application-java.zip | Bin .../test-data/input/data-processing-lib.zip | Bin ...environments_archive_refs_heads_master.zip | Bin .../test-data/languages/lang_extensions.json | 0 .../ray/test/test_zip2parquet_ray.py} | 4 +- 49 files changed, 276 insertions(+), 197 deletions(-) delete mode 100644 transforms/code/code2parquet/python/test-data/expected/metadata.json delete mode 100644 transforms/code/code2parquet/ray/test-data/expected/metadata.json rename transforms/{code/code2parquet => universal/zip2parquet}/Makefile (100%) rename transforms/{code/code2parquet => universal/zip2parquet}/README.md (100%) rename transforms/{code/code2parquet => universal/zip2parquet}/kfp_ray/Makefile (100%) rename transforms/{code/code2parquet => universal/zip2parquet}/kfp_ray/README.md (93%) rename transforms/{code/code2parquet/kfp_ray/code2parquet_wf.py => universal/zip2parquet/kfp_ray/zip2parquet_wf.py} (78%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/.dockerignore (100%) rename transforms/{code/code2parquet/ray => universal/zip2parquet/python}/.gitignore (95%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/Dockerfile (85%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/Makefile (82%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/README.md (61%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/pyproject.toml (87%) rename transforms/{code/code2parquet/python/src/code2parquet_local.py => universal/zip2parquet/python/src/zip2parquet_local.py} (91%) rename transforms/{code/code2parquet/python/src/code2parquet_local_python.py => 
universal/zip2parquet/python/src/zip2parquet_local_python.py} (93%) rename transforms/{code/code2parquet/python/src/code2parquet_s3_python.py => universal/zip2parquet/python/src/zip2parquet_s3_python.py} (93%) rename transforms/{code/code2parquet/python/src/code2parquet_transform.py => universal/zip2parquet/python/src/zip2parquet_transform.py} (87%) rename transforms/{code/code2parquet/python/src/code2parquet_transform_python.py => universal/zip2parquet/python/src/zip2parquet_transform_python.py} (89%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/test-data/expected/application-java.parquet (60%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/test-data/expected/data-processing-lib.parquet (88%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet (90%) create mode 100644 transforms/universal/zip2parquet/python/test-data/expected/metadata.json rename transforms/{code/code2parquet => universal/zip2parquet}/python/test-data/input/application-java.zip (100%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/test-data/input/data-processing-lib.zip (100%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip (100%) rename transforms/{code/code2parquet => universal/zip2parquet}/python/test-data/languages/lang_extensions.json (100%) rename transforms/{code/code2parquet/python/test/test_code2parquet.py => universal/zip2parquet/python/test/test_zip2parquet.py} (97%) rename transforms/{code/code2parquet/python/test/test_code2parquet_python.py => universal/zip2parquet/python/test/test_zip2parquet_python.py} (93%) rename transforms/{code/code2parquet => universal/zip2parquet}/ray/.dockerignore (100%) rename transforms/{code/code2parquet/python => universal/zip2parquet/ray}/.gitignore (100%) rename 
transforms/{code/code2parquet => universal/zip2parquet}/ray/Dockerfile (88%) rename transforms/{code/code2parquet => universal/zip2parquet}/ray/Makefile (82%) rename transforms/{code/code2parquet => universal/zip2parquet}/ray/README.md (77%) rename transforms/{code/code2parquet => universal/zip2parquet}/ray/pyproject.toml (84%) rename transforms/{code/code2parquet/ray/src/code2parquet_local_ray.py => universal/zip2parquet/ray/src/zip2parquet_local_ray.py} (95%) rename transforms/{code/code2parquet/ray/src/code2parquet_s3_ray.py => universal/zip2parquet/ray/src/zip2parquet_s3_ray.py} (95%) rename transforms/{code/code2parquet/ray/src/code2parquet_transform_ray.py => universal/zip2parquet/ray/src/zip2parquet_transform_ray.py} (99%) rename transforms/{code/code2parquet => universal/zip2parquet}/ray/test-data/expected/application-java.parquet (60%) rename transforms/{code/code2parquet => universal/zip2parquet}/ray/test-data/expected/data-processing-lib.parquet (88%) rename transforms/{code/code2parquet => universal/zip2parquet}/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet (90%) create mode 100644 transforms/universal/zip2parquet/ray/test-data/expected/metadata.json rename transforms/{code/code2parquet => universal/zip2parquet}/ray/test-data/input/application-java.zip (100%) rename transforms/{code/code2parquet => universal/zip2parquet}/ray/test-data/input/data-processing-lib.zip (100%) rename transforms/{code/code2parquet => universal/zip2parquet}/ray/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip (100%) rename transforms/{code/code2parquet => universal/zip2parquet}/ray/test-data/languages/lang_extensions.json (100%) rename transforms/{code/code2parquet/ray/test/test_code2parquet_ray.py => universal/zip2parquet/ray/test/test_zip2parquet_ray.py} (95%) diff --git a/.make.versions b/.make.versions index 3ea2ae2d1..92822959e 100644 --- a/.make.versions +++ b/.make.versions @@ -82,8 
+82,8 @@ DOC_QUALITY_RAY_VERSION=$(DPK_VERSION) CODE_QUALITY_RAY_VERSION=$(DPK_VERSION) CODE_QUALITY_PYTHON_VERSION=$(DPK_VERSION) -CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION) -CODE2PARQUET_RAY_VERSION=$(DPK_VERSION) +ZIP2PARQUET_PYTHON_VERSION=$(DPK_VERSION) +ZIP2PARQUET_RAY_VERSION=$(DPK_VERSION) INGEST_TO_PARQUET_VERSION=$(DPK_VERSION) REPO_LVL_ORDER_RAY_VERSION=$(DPK_VERSION) diff --git a/kfp/README.md b/kfp/README.md index 2468e2429..cadec4e92 100644 --- a/kfp/README.md +++ b/kfp/README.md @@ -6,7 +6,7 @@ |-------------------------------------|:----------------------------------------------------------------------------------:| | language/lang_id | [lang_id_wf.py](../transforms/language/lang_id/kfp_ray/lang_id_wf.py) | | code/malware | [malware_wf.py](../transforms/code/malware/kfp_ray/malware_wf.py) | -| code/code2parquet | [code2parquet_wf.py](../transforms/code/code2parquet/kfp_ray/code2parquet_wf.py) | +| universal/zip2parquet | [zip2parquet_wf.py](../transforms/universal/zip2parquet/kfp_ray/zip2parquet_wf.py) | | code/code_quality | [code_quality_wf.py](../transforms/code/code_quality/kfp_ray/code_quality_wf.py) | | code/proglang_select | [proglang_select_wf.py](../transforms/code/proglang_select/kfp_ray/proglang_select_wf.py) | | universal/doc_id | [doc_id_wf.py](../transforms/universal/doc_id/kfp_ray/doc_id_wf.py) | diff --git a/tools/ingest2parquet/README.md b/tools/ingest2parquet/README.md index dce3d042f..7fbe2ff02 100644 --- a/tools/ingest2parquet/README.md +++ b/tools/ingest2parquet/README.md @@ -2,7 +2,7 @@ **Please note: This tool is deprecated and will be removed soon. 
It is superseded by the transform-based implementation, -[code2parquet](../../transforms/code/code2parquet), providing identical capability, +[zip2parquet](../../transforms/universal/zip2parquet), providing identical capability, but with support for ray-based scalability.** ## Summary diff --git a/transforms/code/code2parquet/python/test-data/expected/metadata.json b/transforms/code/code2parquet/python/test-data/expected/metadata.json deleted file mode 100644 index 5c2c6d0a0..000000000 --- a/transforms/code/code2parquet/python/test-data/expected/metadata.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "code2parquet", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-07-25 15:38:20", - "end_time": "2024-07-25 15:38:21", - "status": "success" - }, - "code": null, - "job_input_params": { - "supported_langs_file": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json", - "detect_programming_lang": true, - "snapshot": null, - "domain": null, - "s3_cred": null, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [".zip"] - }, - "job_output_stats": { - "source_files": 3, - "source_size": 33885652, - "result_files": 3, - "result_size": 70167, - "processing_time": 1.5678541660308838, - "number of rows": 74 - }, - "source": { - "name": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/tmp/code2parquetbl3prm61", - "type": "path" - } -} diff --git a/transforms/code/code2parquet/ray/test-data/expected/metadata.json b/transforms/code/code2parquet/ray/test-data/expected/metadata.json deleted file mode 100644 index 5c2c6d0a0..000000000 --- a/transforms/code/code2parquet/ray/test-data/expected/metadata.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job 
category": "preprocessing", - "job name": "code2parquet", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-07-25 15:38:20", - "end_time": "2024-07-25 15:38:21", - "status": "success" - }, - "code": null, - "job_input_params": { - "supported_langs_file": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json", - "detect_programming_lang": true, - "snapshot": null, - "domain": null, - "s3_cred": null, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [".zip"] - }, - "job_output_stats": { - "source_files": 3, - "source_size": 33885652, - "result_files": 3, - "result_size": 70167, - "processing_time": 1.5678541660308838, - "number of rows": 74 - }, - "source": { - "name": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/tmp/code2parquetbl3prm61", - "type": "path" - } -} diff --git a/transforms/code/code2parquet/Makefile b/transforms/universal/zip2parquet/Makefile similarity index 100% rename from transforms/code/code2parquet/Makefile rename to transforms/universal/zip2parquet/Makefile diff --git a/transforms/code/code2parquet/README.md b/transforms/universal/zip2parquet/README.md similarity index 100% rename from transforms/code/code2parquet/README.md rename to transforms/universal/zip2parquet/README.md diff --git a/transforms/code/code2parquet/kfp_ray/Makefile b/transforms/universal/zip2parquet/kfp_ray/Makefile similarity index 100% rename from transforms/code/code2parquet/kfp_ray/Makefile rename to transforms/universal/zip2parquet/kfp_ray/Makefile diff --git a/transforms/code/code2parquet/kfp_ray/README.md b/transforms/universal/zip2parquet/kfp_ray/README.md similarity index 93% rename from transforms/code/code2parquet/kfp_ray/README.md rename to transforms/universal/zip2parquet/kfp_ray/README.md index d2c34e449..e500ebcf3 100644 --- 
a/transforms/code/code2parquet/kfp_ray/README.md +++ b/transforms/universal/zip2parquet/kfp_ray/README.md @@ -2,7 +2,7 @@ ## Summary -This project allows execution of the [noop Ray transform](../ray) as a +This project allows execution of the [zip2parquet Ray transform](../ray) as a [KubeFlow Pipeline](https://www.kubeflow.org/docs/components/pipelines/overview/) The detail pipeline is presented in the [Simplest Transform pipeline tutorial](../../../../kfp/doc/simple_transform_pipeline.md) diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/universal/zip2parquet/kfp_ray/zip2parquet_wf.py similarity index 78% rename from transforms/code/code2parquet/kfp_ray/code2parquet_wf.py rename to transforms/universal/zip2parquet/kfp_ray/zip2parquet_wf.py index 7cc12fd60..c2accbb98 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/universal/zip2parquet/kfp_ray/zip2parquet_wf.py @@ -19,9 +19,9 @@ # the name of the job script -EXEC_SCRIPT_NAME: str = "code2parquet_transform_ray.py" +EXEC_SCRIPT_NAME: str = "zip2parquet_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:latest" +task_image = "quay.io/dataprep1/data-prep-kit/zip2parquet-ray:latest" # components @@ -42,10 +42,12 @@ def compute_exec_params_func( runtime_pipeline_id: str, runtime_job_id: str, runtime_code_location: dict, - code2parquet_supported_langs_file: str, - code2parquet_domain: str, - code2parquet_snapshot: str, - code2parquet_detect_programming_lang: bool, + zip2parquet_code_data: bool, + zip2parquet_programming_language_column: str, + zip2parquet_supported_langs_file: str, + zip2parquet_domain: str, + zip2parquet_snapshot: str, + zip2parquet_detect_programming_lang: bool, ) -> dict: from runtime_utils import KFPUtils @@ -59,10 +61,12 @@ def compute_exec_params_func( "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), - 
"code2parquet_supported_langs_file": code2parquet_supported_langs_file, - "code2parquet_domain": code2parquet_domain, - "code2parquet_snapshot": code2parquet_snapshot, - "code2parquet_detect_programming_lang": code2parquet_detect_programming_lang, + "zip2parquet_code_data": zip2parquet_code_data, + "zip2parquet_programming_language_column": zip2parquet_programming_language_column, + "zip2parquet_supported_langs_file": zip2parquet_supported_langs_file, + "zip2parquet_domain": zip2parquet_domain, + "zip2parquet_snapshot": zip2parquet_snapshot, + "zip2parquet_detect_programming_lang": zip2parquet_detect_programming_lang, } @@ -97,22 +101,22 @@ def compute_exec_params_func( # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "code2parquet" -PREFIX: str = "code2parquet" +TASK_NAME: str = "zip2parquet" +PREFIX: str = "zip2parquet" @dsl.pipeline( name=TASK_NAME + "-ray-pipeline", description="Pipeline for converting zip files to parquet", ) -def code2parquet( - ray_name: str = "code2parquet-kfp-ray", # name of Ray cluster +def zip2parquet( + ray_name: str = "zip2parquet-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access - data_s3_config: str = "{'input_folder': 'test/code2parquet/input', 'output_folder': 'test/code2parquet/output/'}", + data_s3_config: str = "{'input_folder': 'test/zip2parquet/input', 'output_folder': 'test/zip2parquet/output/'}", data_s3_access_secret: str = "s3-secret", data_max_files: int = -1, data_num_samples: int = -1, @@ -121,12 +125,14 
@@ def code2parquet( runtime_actor_options: dict = {'num_cpus': 0.8}, runtime_pipeline_id: str = "pipeline_id", runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, - # code to parquet - code2parquet_supported_langs_file: str = "test/code2parquet/languages/lang_extensions.json", - code2parquet_detect_programming_lang: bool = True, - code2parquet_domain: str = "code", - code2parquet_snapshot: str = "github", - code2parquet_s3_access_secret: str = "s3-secret", + # zip to parquet + zip2parquet_code_data: bool = True, + zip2parquet_programming_language_column: str = "programming_language", + zip2parquet_supported_langs_file: str = "test/zip2parquet/languages/lang_extensions.json", + zip2parquet_detect_programming_lang: bool = True, + zip2parquet_domain: str = "code", + zip2parquet_snapshot: str = "github", + zip2parquet_s3_access_secret: str = "s3-secret", # additional parameters additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', ) -> None: @@ -162,11 +168,13 @@ def code2parquet( :param runtime_actor_options - actor options :param runtime_pipeline_id - pipeline id :param runtime_code_location - code location - :param code2parquet_supported_langs_file - file to store allowed languages - :param code2parquet_detect_programming_lang - detect programming language flag - :param code2parquet_domain: domain - :param code2parquet_snapshot: snapshot - :param code2parquet_s3_access_secret - ingest to parquet s3 access secret + :param zip2parquet_code_data - flag that data is code + :param zip2parquet_programming_language_column - name for programming language column + :param zip2parquet_supported_langs_file - file to store allowed languages + :param zip2parquet_detect_programming_lang - detect programming language flag + :param zip2parquet_domain: domain + :param zip2parquet_snapshot: snapshot + :param 
zip2parquet_s3_access_secret - ingest to parquet s3 access secret (here we are assuming that select language info is in S3, but potentially in the different bucket) :return: None """ @@ -186,10 +194,12 @@ def code2parquet( runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, - code2parquet_supported_langs_file=code2parquet_supported_langs_file, - code2parquet_domain=code2parquet_domain, - code2parquet_snapshot=code2parquet_snapshot, - code2parquet_detect_programming_lang=code2parquet_detect_programming_lang, + zip2parquet_code_data=zip2parquet_code_data, + zip2parquet_programming_language_column=zip2parquet_programming_language_column, + zip2parquet_supported_langs_file=zip2parquet_supported_langs_file, + zip2parquet_domain=zip2parquet_domain, + zip2parquet_snapshot=zip2parquet_snapshot, + zip2parquet_detect_programming_lang=zip2parquet_detect_programming_lang, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster @@ -216,10 +226,10 @@ def code2parquet( ) ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - ComponentUtils.set_s3_env_vars_to_component(execute_job, code2parquet_s3_access_secret, prefix=PREFIX) + ComponentUtils.set_s3_env_vars_to_component(execute_job, zip2parquet_s3_access_secret, prefix=PREFIX) execute_job.after(ray_cluster) if __name__ == "__main__": # Compiling the pipeline - compiler.Compiler().compile(code2parquet, __file__.replace(".py", ".yaml")) + compiler.Compiler().compile(zip2parquet, __file__.replace(".py", ".yaml")) diff --git a/transforms/code/code2parquet/python/.dockerignore b/transforms/universal/zip2parquet/python/.dockerignore similarity index 100% rename from transforms/code/code2parquet/python/.dockerignore rename to transforms/universal/zip2parquet/python/.dockerignore diff --git a/transforms/code/code2parquet/ray/.gitignore 
b/transforms/universal/zip2parquet/python/.gitignore similarity index 95% rename from transforms/code/code2parquet/ray/.gitignore rename to transforms/universal/zip2parquet/python/.gitignore index 17cee1df3..de14528f0 100644 --- a/transforms/code/code2parquet/ray/.gitignore +++ b/transforms/universal/zip2parquet/python/.gitignore @@ -1,5 +1,5 @@ test-data/output -output/* +output/metadata.json /output/ data-processing-lib/ diff --git a/transforms/code/code2parquet/python/Dockerfile b/transforms/universal/zip2parquet/python/Dockerfile similarity index 85% rename from transforms/code/code2parquet/python/Dockerfile rename to transforms/universal/zip2parquet/python/Dockerfile index b36b6a6c4..918de9aff 100644 --- a/transforms/code/code2parquet/python/Dockerfile +++ b/transforms/universal/zip2parquet/python/Dockerfile @@ -17,19 +17,19 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . # END OF STEPS destined for a data-prep-kit base image -COPY --chown=dpk:root src/ src/ +COPY --chown=dpk:root src src/ COPY --chown=dpk:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/code2parquet_transform_python.py . +COPY src/zip2parquet_transform_python.py . # copy some of the samples in -COPY ./src/code2parquet_local.py local/ +COPY src/zip2parquet_local.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY test test/ +COPY test-data test-data/ # Set environment ENV PYTHONPATH /home/dpk diff --git a/transforms/code/code2parquet/python/Makefile b/transforms/universal/zip2parquet/python/Makefile similarity index 82% rename from transforms/code/code2parquet/python/Makefile rename to transforms/universal/zip2parquet/python/Makefile index d0403e601..7937c4c25 100644 --- a/transforms/code/code2parquet/python/Makefile +++ b/transforms/universal/zip2parquet/python/Makefile @@ -7,7 +7,7 @@ REPOROOT=../../../.. 
# $(REPOROOT)/.make.versions file contains the versions -TRANSFORM_NAME=code2parquet +TRANSFORM_NAME=zip2parquet include $(REPOROOT)/transforms/.make.transforms @@ -33,7 +33,7 @@ setup:: .transforms.setup # distribution versions is the same as image version. set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE2PARQUET_PYTHON_VERSION) TOML_VERSION=$(CODE2PARQUET_PYTHON_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(ZIP2PARQUET_PYTHON_VERSION) TOML_VERSION=$(ZIP2PARQUET_PYTHON_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist @@ -46,8 +46,8 @@ run-cli-sample: RUN_ARGS=" \ --data_local_config \" { 'input_folder' : '../test-data/input', 'output_folder' : '../output' } \" \ --data_files_to_use \"['.zip']\" \ - --code2parquet_supported_langs_file ../test-data/languages/lang_extensions.json \ - --code2parquet_detect_programming_lang True " \ + --zip2parquet_supported_langs_file ../test-data/languages/lang_extensions.json \ + --zip2parquet_detect_programming_lang True " \ .transforms.run-src-file run-local-sample: .transforms.run-local-sample diff --git a/transforms/code/code2parquet/python/README.md b/transforms/universal/zip2parquet/python/README.md similarity index 61% rename from transforms/code/code2parquet/python/README.md rename to transforms/universal/zip2parquet/python/README.md index b93ff3717..8e843a443 100644 --- a/transforms/code/code2parquet/python/README.md +++ b/transforms/universal/zip2parquet/python/README.md @@ -1,10 +1,10 @@ -# Code2Parquet +# Zip2Parquet ## Summary -This code2parquet transform is designed to convert raw particularly ZIP files contain programming files (.py, .c, .java, etc) , -into Parquet format. -As a transform It is built to handle concurrent processing of Ray-based -multiple files using multiprocessing for efficient execution. +This zip2parquet transform is designed to convert ZIP files containing a set of files +into arrow table. 
If these files contain code data (`code_data` flag) we additionally +determine programming language (.py, .c, .java, etc). + Each file contained within the ZIP is transformed into a distinct row within the Parquet dataset, adhering to the below schema. **title:** (string) @@ -57,25 +57,28 @@ Each file contained within the ZIP is transformed into a distinct row within the - **Description:** Name indicating which dataset it belong to. - **Example:** `"snapshot": "github"` -**programming_language:** (string)(optional) - -- **Description:** Programming language detected using the file extension. -- **Example:** `"programming_language": "Java"` - **domain:** (string)(optional) - **Description:** Name indicating which domain it belong to, whether code, natural language etc.. - **Example:** `"domain": "code"` +**programming_language:** (string)(optional) - only if code_data is set to True + +- **Description:** Programming language detected using the file extension. +- **Example:** `"programming_language": "Java"` ## Configuration -The set of dictionary keys holding [code2parquet](src/code2parquet_transform.py) +The set of dictionary keys holding [zip2parquet](src/zip2parquet_transform.py) configuration for values are as follows: The transform can be configured with the following key/value pairs from the configuration dictionary. +* `code_data` - a flag defining whether to treat data as code or plain context. Default + is code. +* `programming_language_column` - name of the column where programming language information +is stored - default `programming_language`. Only used if `code_data` is True * `supported_languages` - a dictionary mapping file extensions to language names. * `supported_langs_file` - used if `supported_languages` key is not provided, and specifies the path to a JSON file containing the mapping of languages @@ -93,21 +96,42 @@ the file specified in `supported_langs_file`. 
## Running ### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). +The following command line arguments are available in addition to +the options provided by +the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). + +``` + --zip2parquet_code_data ZIP2PARQUET_CODE_DATA + flag to process files as code + --zip2parquet_programming_language_column ZIP2PARQUET_PROGRAMMING_LANGUAGE_COLUMN + Name of the column where programming language information is stored (only used if code_data is True) + --zip2parquet_supported_langs_file ZIP2PARQUET_SUPPORTED_LANGS_FILE + Path to file containing the list of supported languages + --zip2parquet_detect_programming_lang ZIP2PARQUET_DETECT_PROGRAMMING_LANG + Infer the programming lang from the file extension using the file of supported languages + --zip2parquet_snapshot ZIP2PARQUET_SNAPSHOT + Snapshot value assigned to all imported documents. + --zip2parquet_domain ZIP2PARQUET_DOMAIN + Domain value assigned to all imported documents. + --zip2parquet_s3_cred ZIP2PARQUET_S3_CRED + AST string of options for s3 credentials. Only required for S3 data access. + access_key: access key help text + secret_key: secret key help text + url: optional s3 url + region: optional s3 region + Example: { 'access_key': 'access', 'secret_key': 'secret', + 'url': 'https://s3.us-east.cloud-object-storage.appdomain.cloud', + 'region': 'us-east-1' } +``` -* `--code2parquet_supported_langs_file` - set the `supported_langs_file` configuration key. -* `--code2parquet_detect_programming_lang` - set the `detect_programming_lang` configuration key. -* `--code2parquet_domain` - set the `domain` configuration key. -* `--code2parquet_snapshot` - set the `snapshot` configuration key. +These correspond to the configuration keys described above. 
### Running the samples To run the samples, use the following `make` targets -* `run-cli-sample` - runs src/code2parquet_transform_ray.py using command line args -* `run-local-sample` - runs src/code2parquet.py -* `run-s3-sample` - runs src/code2parquet.py +* `run-cli-sample` - runs src/zip2parquet_transform_ray.py using command line args +* `run-local-sample` - runs src/zip2parquet.py +* `run-s3-sample` - runs src/zip2parquet.py * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) and [here](https://min.io/docs/minio/linux/index.html) and invocation of `make minio-start` to load data into local minio for S3 access. diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/universal/zip2parquet/python/pyproject.toml similarity index 87% rename from transforms/code/code2parquet/python/pyproject.toml rename to transforms/universal/zip2parquet/python/pyproject.toml index b8c97541d..23451c0d7 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/universal/zip2parquet/python/pyproject.toml @@ -1,13 +1,13 @@ [project] -name = "dpk_code2parquet_transform_python" +name = "dpk_zip2parquet_transform_python" version = "0.2.1.dev0" requires-python = ">=3.10" -description = "code2parquet Python Transform" +description = "zip2parquet Python Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ "data-prep-toolkit==0.2.1.dev0", diff --git a/transforms/code/code2parquet/python/src/code2parquet_local.py b/transforms/universal/zip2parquet/python/src/zip2parquet_local.py similarity index 91% rename from transforms/code/code2parquet/python/src/code2parquet_local.py rename to 
transforms/universal/zip2parquet/python/src/zip2parquet_local.py index 8ebd4370b..15d0f7ae1 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_local.py +++ b/transforms/universal/zip2parquet/python/src/zip2parquet_local.py @@ -13,11 +13,14 @@ import ast import os -from code2parquet_transform import ( # domain_key,; snapshot_key, +from zip2parquet_transform import ( # domain_key,; snapshot_key, CodeToParquetTransform, data_factory_key, detect_programming_lang_key, supported_langs_file_key, + domain_key, + snapshot_key, + code_data_key ) from data_processing.data_access import DataAccessFactory, DataAccessLocal @@ -30,8 +33,9 @@ params = { supported_langs_file_key: supported_languages_file, detect_programming_lang_key: True, - # snapshot_key: "github", - # domain_key: "code", + snapshot_key: "github", + domain_key: "code", + #code_data_key: False, "data_files_to_use": ast.literal_eval("['.zip']"), data_factory_key: DataAccessFactory(), # Expect to create DataAccessLocal } diff --git a/transforms/code/code2parquet/python/src/code2parquet_local_python.py b/transforms/universal/zip2parquet/python/src/zip2parquet_local_python.py similarity index 93% rename from transforms/code/code2parquet/python/src/code2parquet_local_python.py rename to transforms/universal/zip2parquet/python/src/zip2parquet_local_python.py index 66713a02f..94afa5ee7 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_local_python.py +++ b/transforms/universal/zip2parquet/python/src/zip2parquet_local_python.py @@ -14,11 +14,11 @@ import os import sys -from code2parquet_transform import ( # domain_key,; snapshot_key, +from zip2parquet_transform import ( # domain_key,; snapshot_key, detect_programming_lang_cli_key, supported_langs_file_cli_key, ) -from code2parquet_transform_python import CodeToParquetPythonConfiguration +from zip2parquet_transform_python import CodeToParquetPythonConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from 
data_processing.utils import ParamsUtils diff --git a/transforms/code/code2parquet/python/src/code2parquet_s3_python.py b/transforms/universal/zip2parquet/python/src/zip2parquet_s3_python.py similarity index 93% rename from transforms/code/code2parquet/python/src/code2parquet_s3_python.py rename to transforms/universal/zip2parquet/python/src/zip2parquet_s3_python.py index ca26b19cd..2dad7a854 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_s3_python.py +++ b/transforms/universal/zip2parquet/python/src/zip2parquet_s3_python.py @@ -13,11 +13,11 @@ import ast import sys -from code2parquet_transform import ( # domain_key,; snapshot_key, +from zip2parquet_transform import ( # domain_key,; snapshot_key, detect_programming_lang_cli_key, supported_langs_file_cli_key, ) -from code2parquet_transform_python import CodeToParquetPythonConfiguration +from zip2parquet_transform_python import CodeToParquetPythonConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import GB, ParamsUtils diff --git a/transforms/code/code2parquet/python/src/code2parquet_transform.py b/transforms/universal/zip2parquet/python/src/zip2parquet_transform.py similarity index 87% rename from transforms/code/code2parquet/python/src/code2parquet_transform.py rename to transforms/universal/zip2parquet/python/src/zip2parquet_transform.py index 7caf4c6eb..a1a7c841c 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_transform.py +++ b/transforms/universal/zip2parquet/python/src/zip2parquet_transform.py @@ -30,7 +30,7 @@ from data_processing.utils import CLIArgumentProvider, TransformUtils, str2bool -shortname = "code2parquet" +shortname = "zip2parquet" cli_prefix = f"{shortname}_" supported_langs_file_key = "supported_langs_file" @@ -43,6 +43,14 @@ detect_programming_lang_cli_key = f"{cli_prefix}{detect_programming_lang_key}" detect_programming_lang_default = True +code_data_key = "code_data" +code_data_cli_key = 
f"{cli_prefix}{code_data_key}" +code_data_default = True + +programming_language_column_key = "programming_language_column" +programming_language_column_cli_key = f"{cli_prefix}{programming_language_column_key}" +programming_language_column_default = "programming_language" + data_factory_key = "data_factory" domain_key = "domain" @@ -91,6 +99,9 @@ def __init__(self, config: dict): raise RuntimeError( "Programming language detection requested without providing a mapping of extensions to languages" ) + self.programming_language_column = config.get(programming_language_column_key, + programming_language_column_default) + self.treat_as_code = config.get(code_data_key, code_data_default) domain = config.get(domain_key, None) snapshot = config.get(domain_key, None) self.shared_columns = {} @@ -138,9 +149,11 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl "date_acquired": datetime.now().isoformat(), "repo_name": os.path.splitext(os.path.basename(file_name))[0], } | self.shared_columns - if self.detect_programming_lang: + # extra processing for code + if self.treat_as_code and self.detect_programming_lang: lang = self._get_lang_from_ext(ext) - row_data["programming_language"] = lang # TODO column name should be configurable + if lang is not None: + row_data[self.programming_language_column] = lang data.append(row_data) number_of_rows += 1 else: @@ -178,13 +191,25 @@ def add_input_params(self, parser: ArgumentParser) -> None: (e.g, noop_, pii_, etc.) 
""" parser.add_argument( - f"--{cli_prefix}{supported_langs_file_key}", + f"--{code_data_cli_key}", + type=lambda x: bool(str2bool(x)), + default=code_data_default, + help="flag to process files as code" + ) + parser.add_argument( + f"--{programming_language_column_cli_key}", + type=str, + default=programming_language_column_default, + help="Path to file containing the list of supported languages", + ) + parser.add_argument( + f"--{supported_langs_file_cli_key}", type=str, default=None, help="Path to file containing the list of supported languages", ) parser.add_argument( - f"--{cli_prefix}{detect_programming_lang_key}", + f"--{detect_programming_lang_cli_key}", type=lambda x: bool(str2bool(x)), default=detect_programming_lang_default, help="Infer the programming lang from the file extension using the file of supported languages", diff --git a/transforms/code/code2parquet/python/src/code2parquet_transform_python.py b/transforms/universal/zip2parquet/python/src/zip2parquet_transform_python.py similarity index 89% rename from transforms/code/code2parquet/python/src/code2parquet_transform_python.py rename to transforms/universal/zip2parquet/python/src/zip2parquet_transform_python.py index ea09a1808..b35b9b6e7 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_transform_python.py +++ b/transforms/universal/zip2parquet/python/src/zip2parquet_transform_python.py @@ -10,12 +10,9 @@ # limitations under the License. 
################################################################################ -from code2parquet_transform import ( +from zip2parquet_transform import ( CodeToParquetTransform, CodeToParquetTransformConfiguration, - data_factory_key, - get_supported_languages, - supported_langs_file_key, ) from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.runtime.pure_python.runtime_configuration import ( @@ -35,5 +32,5 @@ def __init__(self): if __name__ == "__main__": # launcher = NOOPRayLauncher() launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration()) - logger.info("Launching noop transform") + logger.info("Launching zip2parquet transform") launcher.launch() diff --git a/transforms/code/code2parquet/python/test-data/expected/application-java.parquet b/transforms/universal/zip2parquet/python/test-data/expected/application-java.parquet similarity index 60% rename from transforms/code/code2parquet/python/test-data/expected/application-java.parquet rename to transforms/universal/zip2parquet/python/test-data/expected/application-java.parquet index 68be60a2558a0d2d631bfe2fa10fc2715b4064f4..aea898a1f441410311db972bce5d453277d1cc39 100644 GIT binary patch delta 843 zcmeD5X!qElDN;Xy(MOa;ltGArL1U}_Uxk3bj0{Xi`2;p><#qdeSVwL5ZQZ}m-5zR& zToO2;u;W(p+_Qm_gD8Wjkf@3d z&{P3YHc>ViGX@P6vm_&Pi$voTU4s-e3tf{$19M#qBco*96vJdA3yU;^By)o_QwbHr zv_uorG@~?K14CnTU6W)}LtP6~Lo?kpL(|k$b4xQL^CW}G7lp+b?@azEA|F1DS?yb( z+OYuqR*-JPK}oU6wW0xx(HCbQY-WKTd3ehB#V6|~&WME;EVqjn}GB7gHHL%b%G7K@W zurfBZGB(#U2Zn)xsj(#^1Do6AUlKA{q>U{lq;P8mhsmbN1@ekcdzjR|B&&T(0)+`q z9e7Qfd_c0Bal&LDDFv2a>1sdISig2EGF$6+=OUNev|!teMnM!oh^`z~*%d(aZo{g7XFd delta 874 zcmZqo==9j2DN;X)(MOa;ltGArL1U}_UxffxCI%)}e!&f@{4HyR4(HC^8?78;;pxZ3 z*m*o3?x*{ zQVh(@O;gi!6O$7Ybxq8YlXa7fOp|oYQ&LQg%*@PF%u*63UlbN&yf^uShl0wQ9QYeWMWXG}gTD#fxZOzq2LLpk})Tw)50j8`UWi%YZoi&UF7xlnHQ zDp9Sy$fP4(Ra;(G-+mwee+S5Mh#HU|=vZFf!3KFxNFQ4KXye zGPbZXGSD-zFf=eTH8eJ1WM~VR{6j(pQywTOA%$BnI83%p&XZSk+Q+2!BU$Z75-3b? 
z>A-K=LNqe~#hy%hyr<~2>yx%GwowuY{R4DMsJ+aqUd4=f6BDgikXo?;LFrW7apBDRL_3)p4Hp1V83IM z(vA!aO>4R=lU{5W`7?1V|Fw!d|Mr00rM63VefXjB{Ogr{+AlIKcIB$%J4oLNcvre} zxx<^IPWkOy7tLvj?)j-AQkVIr+WNd@V8>ckW4^UKDK(vMbGPj6{5I9KI`52t*_xHi z;nFPAHdcrq>(8p*_HAn3Y>w0QU8hx@G|%+Eu$%mL(UiCwZCgZ?T$~d5cuQ_?ln;7S z-m4#Ts;Qgfcuw0S;q#k*$!wjj`ZM>eVL*@Kk&bJEH|8DrowF%DKlX-!t)G-kTXNL8 z@+!S|pE@$*3%^AlEOlbn?6Q<<(?4ACEpE%wL-ca7) zp3r|luF;mMn&lhE8^IgQIcI$Hn`&8pb6gXiaQynxGolQldZG?Gz-Sc^WfNtSF=NnB zNli^OG%!g{)iq2sHq$jpwzSkuN=z};H8LURM|&O|I~k zVO_C?No~*Mnci;o7Z_2JbOhs9CWfRI9)Sf*E&p@oER85nTXZm3Hjhzl*_Mwii#q>` zoQt3D`l0Kj|KB{*1G3j1a`ilxYFQ+p#39VkaIisA;YfpHf$@QqgyaJW1<3+Q2NVP( zIb;)95)2a9I1CduGzR*igt#pv$fbZmYocplp=)FqVqjrqY-nX{u4is+WN2b8fkV#F z(AWYLx=f!oO@0%h=z5A-ZN^4m@U903FAhxv4QzB0o9yUY%lKmQeqTk_V_TWjUQOPihQ|m4Ku}Ay_FRi7?>U>t$4)Z_$T4r z>NRuApJjx9abjRlcrn3jZL(K?{>6PVx6_yHPux8rk%57wIg9gTJ!kUoB^!_DReyZs z{e9CG1_l9^<<|Fhe{;N6*O(nVds>lB@1|n)RkoH}*ts5QR|tgNRczWCDONf4)y6Y= z854@zHp{X2N4GpO`gf-{d9H1X!M@wAo3d`M;Mv-CB=p0)3pWfO$ZnjuF5==UY1t!R zw$9+M6$b>zHT*hBF{na#9@BTv8}qCx zo(i7m(oj~LSbO<-ajd^=<1`yVm*@ zq4!@bsxgo5vdCN*?z=nk>%?7E?25V_UR+KkITQI$f4g9F?&*$P)~OsP9KD$Z8LB6^ z2=I5X9#J}S`NSRN7w#MS6XXxtHPy2G=6IdS^@-!1;0^0_VV;bl45E6X4m!ZN6%b_; zWs@;u&`?QBG`28HN;K9rPf9V?HAzi12eK>;Ep(Gi42%pcl9Q8+4J{>943o`M)6COM zbuBE+jdV>cQ&M#k&6CV@fhv-cOfAgQQWKLW$JwYbKA8MIOd)ayr`o-VOlm8-m~dG_ zt{sg|Vv}oJ^BM0=t__!F{WOh9?a$=7;fkAe-8M5bo|yd5Q-<~05+=1LlOKl9nLN+y z3ger}^${|xo7OO?9hcUNU7lU} zzJC#`{ARD`&xLncf4S;hc7*fnm8HVdO;aoPuu90>JE)w%W#VYTrPw6Q&>*RB1cDF5 z9N-a1I>3{lm%zqh8o{x_Fo7+>fWt7MO)K!VOmxxZj4cdJEkPm6)UtW<9v?;5OU!DkHZrMgS`Q9d?3xH02nt3E-&)3B zlj9;4S#N9w1>+sxZ>b*I#-x$LkQ!ZVQon z>?`FqzFFe5V}^)Q3U6Th$qUDS7A|>yq0H&vzLqV$GmP9M{i3JVg}=zWRmb{!YU5F} zv{=Q~w$l6gEb`OeZd(!;A+c~-s?}5JJg4WuwXIu)b$^JN%v$_n(h5(`ABj`ci@pUd z;&)rInVG>Mpr^y{k)wK^*N^%$8RvCAI@wrFXJFu95}YPJ!SJoOl6~;E2|KnQJoJo% zp+P}8AoD@cyFIhlhwP3!p|7sTuFB{5(arFkXTsjLME$O>iE|uEm!;m;E6!i%{yQW| zzUJ{m4UXN*^khrI?4 
zwn}T(t=PscWv6QCn51)1d5*2g@1@o96W*?!xu3fwI%Pr7i6xd(=J9NL&QLl(=Kd!Y@-x^K-tbs{Y~W_EL1cHQDuKOy!Al%jeF0^R9V@ z%92G#*Iv7&_bp(>e8V>#caQ7bSW+Fnt3yfYTPySY1slQ@pKjG)J9+S$?2O2o^^P@S zDeH|tN;TE@SQy<|dgmHfUEfpJ&41fosB?7(-cmWra$7KZf9Aeh8}9pAt_(F5jn$nV zSoJ-{B%XU?@uxk9&X)0NJzZmBn*Yb`!RC#zDmS&S8EVdIbenrpR!Al?Owk#>CVFAGfv!ye$=k~TQTjp z&hJ@w{5FeTwft}n!6v;&g}fr#UNIuiIz4u)on_s$G$>D=P9hiMj~OBq}SnIF_}z-QeiGz|lKFP-@{s(}k)r zhg4&nymz#0?-1nNpmO<)Z5B}Akm|eGQ$WglhfA7ZO`ch ziY#f!n#ed|vP_pW>x|XRYSSj$bjfcH%>KsC^ykdvxGrh-D<_%NCaqk``}h1JBPV-UUTX^SULl^TqoB z^_GLPZ7t^?`I9)aCuP^wSeGdYtJ8QMNSYm1PEfL8@$wL0X*5=7Fs|pC#eU#{+UcG) ziJFUXUmh03Y!TCy-Lz16;qJS9r@PHX)MY~VxGyRwxoE2)(|KrPhS^NtkB4>_RPD&` zFrIOc=Uf6?vKxpcaVFsmhasD>@)5~1JkN|Qnk9YSCvI3Qj3uh2j0}uSbPX(YjSNGy z3@ogS4XupL^~_Do4NWa1kR=gv#>SSGlh>9xFn*d`->u|&j#cf^O=h(v*T9jAT?--O z%qBNviEYj%Pl0h*DZKed zf;ApoQp7}O@LVU`i)ku+ss6Ofc(POT^rovho%1y6Mf u*b|&EdRl`9UKCXV51TttgK<7)5KjJ4Da0kVt&?3WLybf1#%AT-7-j&AhgXaM delta 2073 zcmcJPc~H{_7RP_&fBD1w3u3KS6OxYk-`J3F&>c4q(S&YOAfk2mkVneX>A zPhJjrR|aYLKq#&<+mDJy0owuKquaaQ9>*F4*#qIpQWMh#CzEBB9oWz$UG}^pNPunv z>syh=iW}3emgikAI>SRqB5uCQzZIU3$sgDc4Zg2KQ zHX}e=n51}{+*tHks}PA!M1MCPimJBAm}8E6!0b93HEu8fU>Q1Gu16R@#jwv^Tnf^^ zDtW#@000O(t6*@RAVTM9oHlt)wq0~*1HcRW;EB#@^x%g^>GlK1_Pv;51ZxUH=rl2_ z`pW8pQZpc(ODTS&g8*I)U;i1g&G^herqhQS51~2^WP7s!K;qteEP2Hthy2s|+Ny!( zfj35@f4lnD`Sqxz4vJ@R+-qkpO*W{eCX5HpUWk(BjH-}dU&w4Pd)A}xKC~h#QAakY zS{vkJ5#%!mr8d3=xqIuo_=VC;yp6c@;b842Z3`jS_T?3rmIv6?bj6BW&afnJWo5&c zDo8NE>&viS7vJLk2z;s8vdzf!jfJq;C+5!;$W7W*>0dh32-_%nVRUaxrY6U3C%YTT zaeq3ZY#xlgmA|d>M$mG#)br#rGU|%F(EI@F7Ef{*yFr#7KreZ}x|L~OVk*uE)O`-? 
zeN(Wq&+@c0T2)Dp_daet>T<-s`gv@%x)nA#cg&F#jpn{JY;#Epl={X^SBSq`@pEgeQ{ z!UAhvK>k7ky6>#H`bYexc(8J(`jc90)5q|TY|eUKv2BLV;9eVcXNGYfDYU{h;h%1o zcflt5=G*0ok~K)!7c(E%dfa^E8sZ*2GpxHC-NyN}YGl7a9^#8_@^PSxEE z(sZVc=XX!C?w)jCNBVhIEIV#^Kn!F?000}|_@=x1cr>k7PI_G5+}m5@*VZE>;P zqifvobtwY&(8BTIfFF0hc9f0nrKq>z_dE!o=7|#58)at&GZ7Js(Q|o+B{BZjs1}~t zx|jOL@Hx3<`Jdky64zg~jd?Q|b;54-(0wbIRVIVGt>r_}>-&*oUx`j-)MaVwum5%6 zho6U)R8Z7&GAO2~6|bM@-;RIP))nlxcBm*COLv;eNBg{A&Ohwr?O_2$*6#YOMhKx`%c%L!UVPKNnmo`> zXOv=Pn$(fhrF<)6+T3=RSto9nQ@b?qj?_YN$%f}ZV>`sdn%h|32`SQ`JJO;r4ae*+ zvklFZ`?7(7>3j3bFI5D4i!CXqpn6cA}lp&QXnz~>XmOg@Q5;S0!A3LRWa zwgM+&93irr{8-ztUywNM4Jb}t0JUZSA2~+-ad*V0(BQ>bkzN^S6lVpmsfFUIl+JM| z1VsKl$V#$;&-FlY6-r*x-}UrXK=LVT{a>_DoNN#b%=HwbV5kNRY6J4#cg4Un;Q&O2 zHY;;_sQ=C|$Xns9SeUu#lAt%=(KanC?N({{)oFoq7dBen)t4JRI{GL79rLS;3dGF` zE61EuXf204mP#~%pJj(PTL_vcq0>k_aJmfB4;(5s)Y1_WFDiD1o<&xO0;l%K2PNcS9maxz$C0>qX+;ijHc?l+--a&FDQs|*%`fjS*ZVK6j$sjSv6wD3^i9#d(zDJ{x=!}1x zq^ASrx%S?ba9sB|6xTfZFO&ZJ-2d9C<#qdeSVwL5ZQZ}m-5zR& zToO2;u;W(p+_Qm_gD8Wjkf@3d z&{P3YHc>ViGX@P6vm_&Pi$voTU4s-e3tf{$19M#qBco*96vJdA3yU;^By)o_QwbHr zv_uorG@~?K14CnTU6W)}LtP6~Lo?kpL(|k$b4xQL^CW}G7lp+b?@azEA|F1DS?yb( z+OYuqR*-JPK}oU6wW0xx(HCbQY-WKTd3ehB#V6|~&WME;EVqjn}GB7gHHL%b%G7K@W zurfBZGB(#U2Zn)xsj(#^1Do6AUlKA{q>U{lq;P8mhsmbN1@ekcdzjR|B&&T(0)+`q z9e7Qfd_c0Bal&LDDFv2a>1sdISig2EGF$6+=OUNev|!teMnM!oh^`z~*%d(aZo{g7XFd delta 874 zcmZqo==9j2DN;X)(MOa;ltGArL1U}_UxffxCI%)}e!&f@{4HyR4(HC^8?78;;pxZ3 z*m*o3?x*{ zQVh(@O;gi!6O$7Ybxq8YlXa7fOp|oYQ&LQg%*@PF%u*63UlbN&yf^uShl0wQ9QYeWMWXG}gTD#fxZOzq2LLpk})Tw)50j8`UWi%YZoi&UF7xlnHQ zDp9Sy$fP4(Ra;(G-+mwee+S5Mh#HU|=vZFf!3KFxNFQ4KXye zGPbZXGSD-zFf=eTH8eJ1WM~VR{6j(pQywTOA%$BnI83%p&XZSk+Q+2!BU$Z75-3b? 
z>A-K=LNqe~#hy%hyr<~2>yx%GwowuY{R4DMsJ+aqUd4=f6BDgikXo?;LFrW7apBDRL_3)p4Hp1V83IM z(vA!aO>4R=lU{5W`7?1V|Fw!d|Mr00rM63VefXjB{Ogr{+AlIKcIB$%J4oLNcvre} zxx<^IPWkOy7tLvj?)j-AQkVIr+WNd@V8>ckW4^UKDK(vMbGPj6{5I9KI`52t*_xHi z;nFPAHdcrq>(8p*_HAn3Y>w0QU8hx@G|%+Eu$%mL(UiCwZCgZ?T$~d5cuQ_?ln;7S z-m4#Ts;Qgfcuw0S;q#k*$!wjj`ZM>eVL*@Kk&bJEH|8DrowF%DKlX-!t)G-kTXNL8 z@+!S|pE@$*3%^AlEOlbn?6Q<<(?4ACEpE%wL-ca7) zp3r|luF;mMn&lhE8^IgQIcI$Hn`&8pb6gXiaQynxGolQldZG?Gz-Sc^WfNtSF=NnB zNli^OG%!g{)iq2sHq$jpwzSkuN=z};H8LURM|&O|I~k zVO_C?No~*Mnci;o7Z_2JbOhs9CWfRI9)Sf*E&p@oER85nTXZm3Hjhzl*_Mwii#q>` zoQt3D`l0Kj|KB{*1G3j1a`ilxYFQ+p#39VkaIisA;YfpHf$@QqgyaJW1<3+Q2NVP( zIb;)95)2a9I1CduGzR*igt#pv$fbZmYocplp=)FqVqjrqY-nX{u4is+WN2b8fkV#F z(AWYLx=f!oO@0%h=z5A-ZN^4m@U903FAhxv4QzB0o9yUY%lKmQeqTk_V_TWjUQOPihQ|m4Ku}Ay_FRi7?>U>t$4)Z_$T4r z>NRuApJjx9abjRlcrn3jZL(K?{>6PVx6_yHPux8rk%57wIg9gTJ!kUoB^!_DReyZs z{e9CG1_l9^<<|Fhe{;N6*O(nVds>lB@1|n)RkoH}*ts5QR|tgNRczWCDONf4)y6Y= z854@zHp{X2N4GpO`gf-{d9H1X!M@wAo3d`M;Mv-CB=p0)3pWfO$ZnjuF5==UY1t!R zw$9+M6$b>zHT*hBF{na#9@BTv8}qCx zo(i7m(oj~LSbO<-ajd^=<1`yVm*@ zq4!@bsxgo5vdCN*?z=nk>%?7E?25V_UR+KkITQI$f4g9F?&*$P)~OsP9KD$Z8LB6^ z2=I5X9#J}S`NSRN7w#MS6XXxtHPy2G=6IdS^@-!1;0^0_VV;bl45E6X4m!ZN6%b_; zWs@;u&`?QBG`28HN;K9rPf9V?HAzi12eK>;Ep(Gi42%pcl9Q8+4J{>943o`M)6COM zbuBE+jdV>cQ&M#k&6CV@fhv-cOfAgQQWKLW$JwYbKA8MIOd)ayr`o-VOlm8-m~dG_ zt{sg|Vv}oJ^BM0=t__!F{WOh9?a$=7;fkAe-8M5bo|yd5Q-<~05+=1LlOKl9nLN+y z3ger}^${|xo7OO?9hcUNU7lU} zzJC#`{ARD`&xLncf4S;hc7*fnm8HVdO;aoPuu90>JE)w%W#VYTrPw6Q&>*RB1cDF5 z9N-a1I>3{lm%zqh8o{x_Fo7+>fWt7MO)K!VOmxxZj4cdJEkPm6)UtW<9v?;5OU!DkHZrMgS`Q9d?3xH02nt3E-&)3B zlj9;4S#N9w1>+sxZ>b*I#-x$LkQ!ZVQon z>?`FqzFFe5V}^)Q3U6Th$qUDS7A|>yq0H&vzLqV$GmP9M{i3JVg}=zWRmb{!YU5F} zv{=Q~w$l6gEb`OeZd(!;A+c~-s?}5JJg4WuwXIu)b$^JN%v$_n(h5(`ABj`ci@pUd z;&)rInVG>Mpr^y{k)wK^*N^%$8RvCAI@wrFXJFu95}YPJ!SJoOl6~;E2|KnQJoJo% zp+P}8AoD@cyFIhlhwP3!p|7sTuFB{5(arFkXTsjLME$O>iE|uEm!;m;E6!i%{yQW| zzUJ{m4UXN*^khrI?4 
zwn}T(t=PscWv6QCn51)1d5*2g@1@o96W*?!xu3fwI%Pr7i6xd(=J9NL&QLl(=Kd!Y@-x^K-tbs{Y~W_EL1cHQDuKOy!Al%jeF0^R9V@ z%92G#*Iv7&_bp(>e8V>#caQ7bSW+Fnt3yfYTPySY1slQ@pKjG)J9+S$?2O2o^^P@S zDeH|tN;TE@SQy<|dgmHfUEfpJ&41fosB?7(-cmWra$7KZf9Aeh8}9pAt_(F5jn$nV zSoJ-{B%XU?@uxk9&X)0NJzZmBn*Yb`!RC#zDmS&S8EVdIbenrpR!Al?Owk#>CVFAGfv!ye$=k~TQTjp z&hJ@w{5FeTwft}n!6v;&g}fr#UNIuiIz4u)on_s$G$>D=P9hiMj~OBq}SnIF_}z-QeiGz|lKFP-@{s(}k)r zhg4&nymz#0?-1nNpmO<)Z5B}Akm|eGQ$WglhfA7ZO`ch ziY#f!n#ed|vP_pW>x|XRYSSj$bjfcH%>KsC^ykdvxGrh-D<_%NCaqk``}h1JBPV-UUTX^SULl^TqoB z^_GLPZ7t^?`I9)aCuP^wSeGdYtJ8QMNSYm1PEfL8@$wL0X*5=7Fs|pC#eU#{+UcG) ziJFUXUmh03Y!TCy-Lz16;qJS9r@PHX)MY~VxGyRwxoE2)(|KrPhS^NtkB4>_RPD&` zFrIOc=Uf6?vKxpcaVFsmhasD>@)5~1JkN|Qnk9YSCvI3Qj3uh2j0}uSbPX(YjSNGy z3@ogS4XupL^~_Do4NWa1kR=gv#>SSGlh>9xFn*d`->u|&j#cf^O=h(v*T9jAT?--O z%qBNviEYj%Pl0h*DZKed zf;ApoQp7}O@LVU`i)ku+ss6Ofc(POT^rovho%1y6Mf u*b|&EdRl`9UKCXV51TttgK<7)5KjJ4Da0kVt&?3WLybf1#%AT-7-j&AhgXaM delta 2073 zcmcJPc~H{_7RP_&fBD1w3u3KS6OxYk-`J3F&>c4q(S&YOAfk2mkVneX>A zPhJjrR|aYLKq#&<+mDJy0owuKquaaQ9>*F4*#qIpQWMh#CzEBB9oWz$UG}^pNPunv z>syh=iW}3emgikAI>SRqB5uCQzZIU3$sgDc4Zg2KQ zHX}e=n51}{+*tHks}PA!M1MCPimJBAm}8E6!0b93HEu8fU>Q1Gu16R@#jwv^Tnf^^ zDtW#@000O(t6*@RAVTM9oHlt)wq0~*1HcRW;EB#@^x%g^>GlK1_Pv;51ZxUH=rl2_ z`pW8pQZpc(ODTS&g8*I)U;i1g&G^herqhQS51~2^WP7s!K;qteEP2Hthy2s|+Ny!( zfj35@f4lnD`Sqxz4vJ@R+-qkpO*W{eCX5HpUWk(BjH-}dU&w4Pd)A}xKC~h#QAakY zS{vkJ5#%!mr8d3=xqIuo_=VC;yp6c@;b842Z3`jS_T?3rmIv6?bj6BW&afnJWo5&c zDo8NE>&viS7vJLk2z;s8vdzf!jfJq;C+5!;$W7W*>0dh32-_%nVRUaxrY6U3C%YTT zaeq3ZY#xlgmA|d>M$mG#)br#rGU|%F(EI@F7Ef{*yFr#7KreZ}x|L~OVk*uE)O`-? 
zeN(Wq&+@c0T2)Dp_daet>T<-s`gv@%x)nA#cg&F#jpn{JY;#Epl={X^SBSq`@pEgeQ{ z!UAhvK>k7ky6>#H`bYexc(8J(`jc90)5q|TY|eUKv2BLV;9eVcXNGYfDYU{h;h%1o zcflt5=G*0ok~K)!7c(E%dfa^E8sZ*2GpxHC-NyN}YGl7a9^#8_@^PSxEE z(sZVc=XX!C?w)jCNBVhIEIV#^Kn!F?000}|_@=x1cr>k7PI_G5+}m5@*VZE>;P zqifvobtwY&(8BTIfFF0hc9f0nrKq>z_dE!o=7|#58)at&GZ7Js(Q|o+B{BZjs1}~t zx|jOL@Hx3<`Jdky64zg~jd?Q|b;54-(0wbIRVIVGt>r_}>-&*oUx`j-)MaVwum5%6 zho6U)R8Z7&GAO2~6|bM@-;RIP))nlxcBm*COLv;eNBg{A&Ohwr?O_2$*6#YOMhKx`%c%L!UVPKNnmo`> zXOv=Pn$(fhrF<)6+T3=RSto9nQ@b?qj?_YN$%f}ZV>`sdn%h|32`SQ`JJO;r4ae*+ zvklFZ`?7(7>3j3bFI5D4i!CXqpn6cA}lp&QXnz~>XmOg@Q5;S0!A3LRWa zwgM+&93irr{8-ztUywNM4Jb}t0JUZSA2~+-ad*V0(BQ>bkzN^S6lVpmsfFUIl+JM| z1VsKl$V#$;&-FlY6-r*x-}UrXK=LVT{a>_DoNN#b%=HwbV5kNRY6J4#cg4Un;Q&O2 zHY;;_sQ=C|$Xns9SeUu#lAt%=(KanC?N({{)oFoq7dBen)t4JRI{GL79rLS;3dGF` zE61EuXf204mP#~%pJj(PTL_vcq0>k_aJmfB4;(5s)Y1_WFDiD1o<&xO0;l%K2PNcS9maxz$C0>qX+;ijHc?l+--a&FDQs|*%`fjS*ZVK6j$sjSv6wD3^i9#d(zDJ{x=!}1x zq^ASrx%S?ba9sB|6xTfZFO&ZJ-2d9C