Merge pull request #13 from ciaran28/featureone
Featureone
Showing 56 changed files with 525 additions and 2 deletions.
138 changes: 138 additions & 0 deletions
...ps/MLOps_Engineer/Infrastructure/Azure_ML_Pipelines/components/databricks/listclusters.py
@@ -0,0 +1,138 @@
import os
import argparse
import logging

import pandas as pd
import requests
import mlflow
from sklearn.model_selection import train_test_split


def main():
    """Main function of the script."""

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--train_data", type=str, help="path to train data")
    parser.add_argument("--test_data", type=str, help="path to test data")
    args = parser.parse_args()

    # Start Logging
    mlflow.start_run()

    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))
    print("input data:", args.data)

    credit_df = pd.read_excel(args.data, header=1, index_col=0)

    mlflow.log_metric("num_samples", credit_df.shape[0])
    mlflow.log_metric("num_features", credit_df.shape[1] - 1)

    credit_train_df, credit_test_df = train_test_split(
        credit_df,
        test_size=args.test_train_ratio,
    )

    # output paths are mounted as folders, so a filename is appended to each path
    credit_train_df.to_csv(os.path.join(args.train_data, "data.csv"), index=False)
    credit_test_df.to_csv(os.path.join(args.test_data, "data.csv"), index=False)

    # Stop Logging
    mlflow.end_run()


# Retrieve tokens


def createManagementToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL):
    """
    Uses our service principal credentials to generate an Azure Active Directory
    token scoped to the Azure management API.
    """
    tokenRequestBody['resource'] = 'https://management.core.windows.net/'

    # The AAD OAuth2 token endpoint expects a POST with a form-encoded body
    response = requests.post(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody)

    if response.status_code == 200:
        print(response.status_code)
    else:
        raise Exception(response.text)

    return response.json()['access_token']


def createBearerToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL):
    """
    Uses our service principal credentials to generate an Azure Active Directory
    token scoped to the Azure Databricks resource (its well-known application ID).
    """
    tokenRequestBody['resource'] = '2ff814a6-3304-4ab8-85cb-cd0e6f879c1d'

    # The AAD OAuth2 token endpoint expects a POST with a form-encoded body
    response = requests.post(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody)

    if response.status_code == 200:
        print(response.status_code)
    else:
        raise Exception(response.text)

    return response.json()['access_token']


def listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE):
    """
    Returns a JSON object containing a list of existing Databricks clusters.
    """
    response = requests.get('https://' + DATABRICKS_INSTANCE + '/api/2.0/clusters/list', headers=DBRKS_REQ_HEADERS)

    if response.status_code != 200:
        raise Exception(response.content)

    return response.json()


if __name__ == "__main__":

    # The service principal credentials need to come in from Key Vault
    tokenRequestBody = {
        'grant_type': 'client_credentials',
        'client_id': '841ba6d9-a509-44ee-bf40-c0876b4ac6bb',
        'client_secret': 'IQG8Q~hQDGO5eFcRos~YN9waI0gE-Gsx8sMx5bJQ'
    }
    tokenRequestHeaders = {'Content-Type': 'application/x-www-form-urlencoded'}
    tokenBaseURL = 'https://login.microsoftonline.com/' + '16b3c013-d300-468d-ac64-7eda0820b6d3' + '/oauth2/token'

    DBRKS_BEARER_TOKEN = createBearerToken(tokenRequestBody=tokenRequestBody,
                                           tokenRequestHeaders=tokenRequestHeaders,
                                           tokenBaseURL=tokenBaseURL)

    DBRKS_MANAGEMENT_TOKEN = createManagementToken(tokenRequestBody=tokenRequestBody,
                                                   tokenRequestHeaders=tokenRequestHeaders,
                                                   tokenBaseURL=tokenBaseURL)

    DBRKS_REQ_HEADERS = {
        'Authorization': f'Bearer {DBRKS_BEARER_TOKEN}',
        'X-Databricks-Azure-SP-Management-Token': f'{DBRKS_MANAGEMENT_TOKEN}',
        'X-Databricks-Azure-Workspace-Resource-Id': '/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourceGroups/databricks-sandbox-rg/providers/Microsoft.Databricks/workspaces/dbxwssandbox-eco3',
        'Content-Type': 'application/json'
    }
    DATABRICKS_INSTANCE = "adb-2041102092454885.5.azuredatabricks.net"

    existingClusters = listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE)

    print(existingClusters)
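Note: the service principal secret above is committed in plain text, which is exactly what the "credentials need to come in from Key Vault" comment flags. A minimal sketch of resolving the credentials at run time instead, assuming the script's identity can read a Key Vault (the vault URL and secret names below are illustrative, not taken from this repository):

from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

# Hypothetical vault URL and secret names -- replace with the real ones.
secret_client = SecretClient(
    vault_url="https://<your-key-vault-name>.vault.azure.net",
    credential=DefaultAzureCredential(),
)

tokenRequestBody = {
    'grant_type': 'client_credentials',
    'client_id': secret_client.get_secret("sp-client-id").value,
    'client_secret': secret_client.get_secret("sp-client-secret").value,
}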
17 changes: 17 additions & 0 deletions
...evOps/MLOps_Engineer/Infrastructure/Azure_ML_Pipelines/components/dependencies/conda.yaml
@@ -0,0 +1,17 @@
name: model-env
channels:
  - conda-forge
dependencies:
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - xlrd==2.0.1
    - mlflow==1.26.1
    - azureml-mlflow==1.42.0
    - pandas
    - requests
339 changes: 339 additions & 0 deletions
...ps/MLOps_Engineer/Infrastructure/Azure_ML_Pipelines/components/pipelines/databricks.ipynb
@@ -0,0 +1,339 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7f395be3bf40>,\n",
      " subscription_id=2a834239-8f89-42e1-8cf1-c3c10090f51c,\n",
      " resource_group_name=databricks-sandbox-rg,\n",
      " workspace_name=amlsandbox-eco3)\n"
     ]
    }
   ],
   "source": [
    "from azure.identity import DefaultAzureCredential\n",
    "from azure.ai.ml import MLClient\n",
    "\n",
    "ml_client = MLClient(\n",
    "    credential=DefaultAzureCredential(),\n",
    "    subscription_id=\"2a834239-8f89-42e1-8cf1-c3c10090f51c\",\n",
    "    resource_group_name=\"databricks-sandbox-rg\",\n",
    "    workspace_name=\"amlsandbox-eco3\",\n",
    ")\n",
    "\n",
    "print(ml_client)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "You already have a cluster named ciaranh1, we'll reuse it as is.\n",
      "AMLCompute with name ciaranh1 is created, the compute size is STANDARD_DS12_V2\n"
     ]
    }
   ],
   "source": [
    "from azure.ai.ml.entities import AmlCompute\n",
    "\n",
    "cpu_compute_target = \"ciaranh1\"\n",
    "\n",
    "try:\n",
    "    # check whether the compute target already exists\n",
    "    cpu_cluster = ml_client.compute.get(cpu_compute_target)\n",
    "    print(\n",
    "        f\"You already have a cluster named {cpu_compute_target}, we'll reuse it as is.\"\n",
    "    )\n",
    "\n",
    "except Exception:\n",
    "    print(\"Creating a new cpu compute target...\")\n",
    "\n",
    "    # create the Azure ML compute object with the intended parameters\n",
    "    cpu_cluster = AmlCompute(\n",
    "        # name assigned to the compute cluster\n",
    "        name=\"cpu-cluster\",\n",
    "        # Azure ML Compute is the on-demand VM service\n",
    "        type=\"amlcompute\",\n",
    "        # VM family\n",
    "        size=\"STANDARD_DS3_V2\",\n",
    "        # minimum running nodes when there is no job running\n",
    "        min_instances=0,\n",
    "        # maximum number of nodes in the cluster\n",
    "        max_instances=4,\n",
    "        # how many seconds the node keeps running after job termination\n",
    "        idle_time_before_scale_down=180,\n",
    "        # Dedicated or LowPriority. The latter is cheaper but carries a risk of job termination\n",
    "        tier=\"Dedicated\",\n",
    "    )\n",
    "\n",
    "    # pass the object to MLClient's create_or_update method\n",
    "    cpu_cluster = ml_client.begin_create_or_update(cpu_cluster)\n",
    "\n",
    "print(\n",
    "    f\"AMLCompute with name {cpu_cluster.name} is created, the compute size is {cpu_cluster.size}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Environment with name aml-scikit-learn is registered to workspace, the environment version is 0.2.0\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from azure.ai.ml.entities import Environment\n",
    "\n",
    "custom_env_name = \"aml-scikit-learn\"\n",
    "\n",
    "pipeline_job_env = Environment(\n",
    "    name=custom_env_name,\n",
    "    description=\"Custom environment for Credit Card Defaults pipeline\",\n",
    "    tags={\"scikit-learn\": \"0.24.2\"},\n",
    "    conda_file=os.path.join(\"../dependencies/\", \"conda.yaml\"),\n",
    "    image=\"mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest\",\n",
    "    version=\"0.2.0\",\n",
    ")\n",
    "pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)\n",
    "\n",
    "print(\n",
    "    f\"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "../components/databricks\n"
     ]
    }
   ],
   "source": [
    "data_prep_src_dir = \"../components/databricks\"\n",
    "print(data_prep_src_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from azure.ai.ml import command\n",
    "from azure.ai.ml import Input, Output\n",
    "\n",
    "list_dbx_clusters = command(\n",
    "    name=\"Databricks E2E ML Workflow\",\n",
    "    display_name=\"Databricks E2E ML Workflow\",\n",
    "    description=\"Invoke a Databricks Pipeline\",\n",
    "    inputs={\n",
    "        \"test_train_ratio\": Input(type=\"number\")\n",
    "    },\n",
    "    outputs=dict(\n",
    "        train_data=Output(type=\"uri_folder\", mode=\"rw_mount\"),\n",
    "        test_data=Output(type=\"uri_folder\", mode=\"rw_mount\"),\n",
    "    ),\n",
    "    # the source folder of the component\n",
    "    code=data_prep_src_dir,\n",
    "    command=\"\"\"python listclusters.py --test_train_ratio ${{inputs.test_train_ratio}} \\\n",
    "    --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \\\n",
    "    \"\"\",\n",
    "    environment=f\"{pipeline_job_env.name}:{pipeline_job_env.version}\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "type: command\n",
      "outputs:\n",
      "  train_data:\n",
      "    mode: rw_mount\n",
      "    type: uri_folder\n",
      "  test_data:\n",
      "    mode: rw_mount\n",
      "    type: uri_folder\n",
      "environment: azureml:aml-scikit-learn:0.2.0\n",
      "code: /mnt/batch/tasks/shared/LS_root/mounts/clusters/ciaranh1/code/Users/ciaranh/components/databricks\n",
      "component:\n",
      "  name: Databricks E2E ML Workflow\n",
      "  display_name: Databricks E2E ML Workflow\n",
      "  description: Invoke a Databricks Pipeline\n",
      "  type: command\n",
      "  inputs:\n",
      "    test_train_ratio:\n",
      "      type: number\n",
      "  outputs:\n",
      "    train_data:\n",
      "      type: uri_folder\n",
      "      mode: rw_mount\n",
      "    test_data:\n",
      "      type: uri_folder\n",
      "      mode: rw_mount\n",
      "  command: 'python listclusters.py --test_train_ratio ${{inputs.test_train_ratio}} --train_data\n",
      "    ${{outputs.train_data}} --test_data ${{outputs.test_data}} '\n",
      "  environment: azureml:aml-scikit-learn:0.2.0\n",
      "  code: /mnt/batch/tasks/shared/LS_root/mounts/clusters/ciaranh1/code/Users/ciaranh/components/databricks\n",
      "  is_deterministic: true\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(list_dbx_clusters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the dsl decorator tells the sdk that we are defining an Azure ML pipeline\n",
    "from azure.ai.ml import dsl, Input, Output\n",
    "\n",
    "pipeline_job_test_train_ratio = 0.2\n",
    "\n",
    "@dsl.pipeline(\n",
    "    compute=cpu_compute_target,\n",
    "    description=\"E2E data_prep-train pipeline\",\n",
    ")\n",
    "def dbx_pipeline(\n",
    "    pipeline_job_test_train_ratio,\n",
    "):\n",
    "    # use the component like a python function call with its own inputs\n",
    "    data_prep_job = list_dbx_clusters(\n",
    "        test_train_ratio=pipeline_job_test_train_ratio,\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline = dbx_pipeline(\n",
    "    pipeline_job_test_train_ratio=0.3,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# submit the pipeline job\n",
    "pipeline_job = ml_client.jobs.create_or_update(\n",
    "    pipeline,\n",
    "    # project's name\n",
    "    experiment_name=\"dbx_list_clusters\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "name: witty_grass_hhzx8v120l\n",
      "display_name: dbx_pipeline\n",
      "description: E2E data_prep-train pipeline\n",
      "type: pipeline\n",
      "inputs:\n",
      "  pipeline_job_test_train_ratio: '0.3'\n",
      "jobs:\n",
      "  data_prep_job:\n",
      "    type: command\n",
      "    inputs:\n",
      "      test_train_ratio:\n",
      "        path: ${{parent.inputs.pipeline_job_test_train_ratio}}\n",
      "    outputs:\n",
      "      train_data:\n",
      "        mode: rw_mount\n",
      "        type: uri_folder\n",
      "      test_data:\n",
      "        mode: rw_mount\n",
      "        type: uri_folder\n",
      "    component: azureml:azureml_anonymous:32bdee22-c855-46f7-9f0f-38725a6253c2\n",
      "services:\n",
      "  Tracking:\n",
      "    job_service_type: Tracking\n",
      "    endpoint: azureml://eastus.api.azureml.ms/mlflow/v1.0/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourceGroups/databricks-sandbox-rg/providers/Microsoft.MachineLearningServices/workspaces/amlsandbox-eco3?\n",
      "  Studio:\n",
      "    job_service_type: Studio\n",
      "    endpoint: https://ml.azure.com/runs/witty_grass_hhzx8v120l?wsid=/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourcegroups/databricks-sandbox-rg/workspaces/amlsandbox-eco3&tid=16b3c013-d300-468d-ac64-7eda0820b6d3\n",
      "experiment_name: dbx_list_clusters\n",
      "id: azureml:/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourceGroups/databricks-sandbox-rg/providers/Microsoft.MachineLearningServices/workspaces/amlsandbox-eco3/jobs/witty_grass_hhzx8v120l\n",
      "compute: azureml:ciaranh1\n",
      "properties:\n",
      "  azureml.DevPlatv2: 'true'\n",
      "  azureml.runsource: azureml.PipelineRun\n",
      "  runSource: MFE\n",
      "  runType: HTTP\n",
      "  azureml.parameters: '{\"pipeline_job_test_train_ratio\":\"0.3\"}'\n",
      "  azureml.continue_on_step_failure: 'True'\n",
      "  azureml.continue_on_failed_optional_input: 'True'\n",
      "  azureml.defaultComputeName: ciaranh1\n",
      "  azureml.defaultDataStoreName: workspaceblobstore\n",
      "  azureml.pipelineComponent: pipelinerun\n",
      "status: Preparing\n",
      "creation_context:\n",
      "  created_at: '2023-03-31T12:30:43.049783+00:00'\n",
      "  created_by: Ciaran Hamill Diamond\n",
      "  created_by_type: User\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(pipeline_job)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10 - SDK v2",
   "language": "python",
   "name": "python310-sdkv2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
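The submitted job is printed while still in the Preparing state, so the output above is a snapshot, not the final result. One way to block until the run reaches a terminal state is the v2 SDK's streaming helper; a short sketch, reusing ml_client and pipeline_job from the cells above:

# Stream the pipeline job's logs until it finishes, then re-fetch it for its final status.
ml_client.jobs.stream(pipeline_job.name)

finished_job = ml_client.jobs.get(pipeline_job.name)
print(finished_job.status)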
@@ -0,0 +1,24 @@
{
    "ciaran_components": [
        {
            "script": "train.py",
            "parameters": {
                "test_data": {"type": "uri_folder"},
                "train_data": {"type": "uri_folder"},
                "test_labels": {"type": "uri_folder"},
                "train_labels": {"type": "uri_folder"},
                "model": {"type": "uri_folder"},
                "model_name": {"type": "string"},
                "model_type": {"type": "string"},
                "model_params": {"type": "string"},
                "model_params_file": {"type": "uri_file"}
            },
            "outputs": {
                "model": {"type": "uri_folder"}
            },
            "environments": "test"
        }
    ]
}
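This manifest describes a component declaratively: a script plus typed parameters and outputs. A sketch of how such a file might be consumed to build v2 command components dynamically; the file path, source folder, and environment string are assumptions for illustration, and this loader is not part of the PR:

import json

from azure.ai.ml import Input, Output, command

# Hypothetical path to the manifest shown above.
with open("ciaran_components.json") as f:
    manifest = json.load(f)

components = []
for spec in manifest["ciaran_components"]:
    components.append(
        command(
            code=".",  # assumed source folder containing train.py
            command=f"python {spec['script']}",
            inputs={name: Input(type=p["type"]) for name, p in spec["parameters"].items()},
            outputs={name: Output(type=o["type"]) for name, o in spec["outputs"].items()},
            environment="azureml:aml-scikit-learn:0.2.0",  # assumed; the manifest only says "test"
        )
    )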
50 files renamed without changes.