Commit

Merge pull request #13 from ciaran28/featureone
Featureone
ciaran28 authored Apr 17, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
2 parents 9e683f7 + 64bb312 commit d476cdf
Showing 56 changed files with 525 additions and 2 deletions.
@@ -0,0 +1,138 @@
import os
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split
import mlflow
import requests

def main():
    """Main function of the script."""

    # Input and output arguments
    parser = argparse.ArgumentParser()

    parser.add_argument("--data", type=str, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--train_data", type=str, help="path to train data")
    parser.add_argument("--test_data", type=str, help="path to test data")

    args = parser.parse_args()

    # Start logging
    mlflow.start_run()

    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

    print("input data:", args.data)

    credit_df = pd.read_excel(args.data, header=1, index_col=0)

    mlflow.log_metric("num_samples", credit_df.shape[0])
    mlflow.log_metric("num_features", credit_df.shape[1] - 1)

    credit_train_df, credit_test_df = train_test_split(
        credit_df,
        test_size=args.test_train_ratio,
    )

    # Output paths are mounted as folders, so append a filename to each path
    credit_train_df.to_csv(os.path.join(args.train_data, "data.csv"), index=False)

    credit_test_df.to_csv(os.path.join(args.test_data, "data.csv"), index=False)

    # Stop logging
    mlflow.end_run()



# Retrieve tokens


def createManagementToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL):
    """
    Uses our service principal credentials to generate an Azure Active Directory
    token for the Azure management resource.
    """

    tokenRequestBody['resource'] = 'https://management.core.windows.net/'

    # The AAD /oauth2/token endpoint expects a POST with a form-encoded body
    response = requests.post(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody)

    if response.status_code == 200:
        print(response.status_code)
    else:
        raise Exception(response.text)

    return response.json()['access_token']


def createBearerToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL):
    """
    Uses our service principal credentials to generate an Azure Active Directory
    token for the Azure Databricks resource.
    """

    # Well-known programmatic ID of the Azure Databricks first-party application
    tokenRequestBody['resource'] = '2ff814a6-3304-4ab8-85cb-cd0e6f879c1d'

    response = requests.post(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody)

    if response.status_code == 200:
        print(response.status_code)
    else:
        raise Exception(response.text)

    return response.json()['access_token']



def listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE):
    """
    Returns a JSON object containing a list of existing Databricks clusters.
    """

    response = requests.get('https://' + DATABRICKS_INSTANCE + '/api/2.0/clusters/list', headers=DBRKS_REQ_HEADERS)

    if response.status_code != 200:
        raise Exception(response.content)

    return response.json()



if __name__ == "__main__":

    # The service principal credentials need to come in from Key Vault.
    # As an interim step they are read from environment variables here
    # (the variable names are placeholders) rather than being hardcoded.
    tokenRequestBody = {
        'grant_type': 'client_credentials',
        'client_id': os.environ['SP_CLIENT_ID'],
        'client_secret': os.environ['SP_CLIENT_SECRET']
    }
    tokenRequestHeaders = {'Content-Type': 'application/x-www-form-urlencoded'}
    tokenBaseURL = 'https://login.microsoftonline.com/' + '16b3c013-d300-468d-ac64-7eda0820b6d3' + '/oauth2/token'

    DBRKS_BEARER_TOKEN = createBearerToken(tokenRequestBody=tokenRequestBody,
                                           tokenRequestHeaders=tokenRequestHeaders,
                                           tokenBaseURL=tokenBaseURL
                                           )

    DBRKS_MANAGEMENT_TOKEN = createManagementToken(tokenRequestBody=tokenRequestBody,
                                                   tokenRequestHeaders=tokenRequestHeaders,
                                                   tokenBaseURL=tokenBaseURL
                                                   )

    DBRKS_REQ_HEADERS = {
        'Authorization': f'Bearer {DBRKS_BEARER_TOKEN}',
        'X-Databricks-Azure-SP-Management-Token': f'{DBRKS_MANAGEMENT_TOKEN}',
        'X-Databricks-Azure-Workspace-Resource-Id': '/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourceGroups/databricks-sandbox-rg/providers/Microsoft.Databricks/workspaces/dbxwssandbox-eco3',
        'Content-Type': 'application/json'
    }
    DATABRICKS_INSTANCE = "adb-2041102092454885.5.azuredatabricks.net"

    existingClusters = listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE)

    print(existingClusters)
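
Note: the inline comment above says the service principal credentials need to come in from Key Vault. A minimal sketch of that, using azure-identity plus azure-keyvault-secrets; the vault URL and secret names below are placeholders, not values from this repository:

# Sketch only: fetch the service principal credentials from Azure Key Vault
# instead of environment variables or hardcoded strings.
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

vault_url = "https://<your-key-vault-name>.vault.azure.net"  # placeholder
secret_client = SecretClient(vault_url=vault_url, credential=DefaultAzureCredential())

tokenRequestBody = {
    'grant_type': 'client_credentials',
    'client_id': secret_client.get_secret("sp-client-id").value,          # hypothetical secret name
    'client_secret': secret_client.get_secret("sp-client-secret").value,  # hypothetical secret name
}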
@@ -0,0 +1,17 @@
name: model-env
channels:
  - conda-forge
dependencies:
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  - pip:
      - inference-schema[numpy-support]==1.3.0
      - xlrd==2.0.1
      - mlflow==1.26.1
      - azureml-mlflow==1.42.0
      - pandas
      - requests
@@ -0,0 +1,339 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7f395be3bf40>,\n",
" subscription_id=2a834239-8f89-42e1-8cf1-c3c10090f51c,\n",
" resource_group_name=databricks-sandbox-rg,\n",
" workspace_name=amlsandbox-eco3)\n"
]
}
],
"source": [
"from azure.identity import DefaultAzureCredential\n",
"from azure.ai.ml import MLClient\n",
"\n",
"ml_client = MLClient(\n",
" credential=DefaultAzureCredential(),\n",
" subscription_id=\"2a834239-8f89-42e1-8cf1-c3c10090f51c\",\n",
" resource_group_name=\"databricks-sandbox-rg\",\n",
" workspace_name=\"amlsandbox-eco3\",\n",
" )\n",
"\n",
"print(ml_client)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"You already have a cluster named ciaranh1, we'll reuse it as is.\n",
"AMLCompute with name ciaranh1 is created, the compute size is STANDARD_DS12_V2\n"
]
}
],
"source": [
"from azure.ai.ml.entities import AmlCompute\n",
"\n",
"cpu_compute_target = \"ciaranh1\"\n",
"\n",
"try:\n",
" # let's see if the compute target already exists\n",
" cpu_cluster = ml_client.compute.get(cpu_compute_target)\n",
" print(\n",
" f\"You already have a cluster named {cpu_compute_target}, we'll reuse it as is.\"\n",
" )\n",
"\n",
"except Exception:\n",
" print(\"Creating a new cpu compute target...\")\n",
"\n",
" # Let's create the Azure ML compute object with the intended parameters\n",
" cpu_cluster = AmlCompute(\n",
" # Name assigned to the compute cluster\n",
" name=\"cpu-cluster\",\n",
" # Azure ML Compute is the on-demand VM service\n",
" type=\"amlcompute\",\n",
" # VM Family\n",
" size=\"STANDARD_DS3_V2\",\n",
" # Minimum running nodes when there is no job running\n",
" min_instances=0,\n",
" # Nodes in cluster\n",
" max_instances=4,\n",
" # How many seconds will the node running after the job termination\n",
" idle_time_before_scale_down=180,\n",
" # Dedicated or LowPriority. The latter is cheaper but there is a chance of job termination\n",
" tier=\"Dedicated\",\n",
" )\n",
"\n",
" # Now, we pass the object to MLClient's create_or_update method\n",
" cpu_cluster = ml_client.begin_create_or_update(cpu_cluster)\n",
"\n",
"print(\n",
" f\"AMLCompute with name {cpu_cluster.name} is created, the compute size is {cpu_cluster.size}\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Environment with name aml-scikit-learn is registered to workspace, the environment version is 0.2.0\n"
]
}
],
"source": [
"from azure.ai.ml.entities import Environment\n",
"\n",
"custom_env_name = \"aml-scikit-learn\"\n",
"\n",
"pipeline_job_env = Environment(\n",
" name=custom_env_name,\n",
" description=\"Custom environment for Credit Card Defaults pipeline\",\n",
" tags={\"scikit-learn\": \"0.24.2\"},\n",
" conda_file=os.path.join(\"../dependencies/\", \"conda.yaml\"),\n",
" image=\"mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest\",\n",
" version=\"0.2.0\",\n",
"\n",
")\n",
"pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)\n",
"\n",
"print(\n",
" f\"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"../components/databricks\n"
]
}
],
"source": [
"data_prep_src_dir = \"../components/databricks\"\n",
"print(data_prep_src_dir)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from azure.ai.ml import command\n",
"from azure.ai.ml import Input, Output\n",
"\n",
"list_dbx_clusters = command(\n",
" name=\"Databricks E2E ML Workflow\",\n",
" display_name=\"Databricks E2E ML Workflow\",\n",
" description=\"Invoke a Databricks Pipeline\",\n",
" inputs={\n",
" \"test_train_ratio\": Input(type=\"number\")\n",
" },\n",
" outputs=dict(\n",
" train_data=Output(type=\"uri_folder\", mode=\"rw_mount\"),\n",
" test_data=Output(type=\"uri_folder\", mode=\"rw_mount\"),\n",
" ),\n",
" # The source folder of the component\n",
" code=data_prep_src_dir,\n",
" command=\"\"\"python listclusters.py --test_train_ratio ${{inputs.test_train_ratio}} \\\n",
" --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \\\n",
" \"\"\",\n",
" environment=f\"{pipeline_job_env.name}:{pipeline_job_env.version}\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"type: command\n",
"outputs:\n",
" train_data:\n",
" mode: rw_mount\n",
" type: uri_folder\n",
" test_data:\n",
" mode: rw_mount\n",
" type: uri_folder\n",
"environment: azureml:aml-scikit-learn:0.2.0\n",
"code: /mnt/batch/tasks/shared/LS_root/mounts/clusters/ciaranh1/code/Users/ciaranh/components/databricks\n",
"component:\n",
" name: Databricks E2E ML Workflow\n",
" display_name: Databricks E2E ML Workflow\n",
" description: Invoke a Databricks Pipeline\n",
" type: command\n",
" inputs:\n",
" test_train_ratio:\n",
" type: number\n",
" outputs:\n",
" train_data:\n",
" type: uri_folder\n",
" mode: rw_mount\n",
" test_data:\n",
" type: uri_folder\n",
" mode: rw_mount\n",
" command: 'python listclusters.py --test_train_ratio ${{inputs.test_train_ratio}} --train_data\n",
" ${{outputs.train_data}} --test_data ${{outputs.test_data}} '\n",
" environment: azureml:aml-scikit-learn:0.2.0\n",
" code: /mnt/batch/tasks/shared/LS_root/mounts/clusters/ciaranh1/code/Users/ciaranh/components/databricks\n",
" is_deterministic: true\n",
"\n"
]
}
],
"source": [
"print(list_dbx_clusters)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# the dsl decorator tells the sdk that we are defining an Azure ML pipeline\n",
"from azure.ai.ml import dsl, Input, Output\n",
"\n",
"pipeline_job_test_train_ratio = 0.2\n",
"@dsl.pipeline(\n",
" compute=cpu_compute_target,\n",
" description=\"E2E data_perp-train pipeline\",\n",
")\n",
"def dbx_pipeline(\n",
" pipeline_job_test_train_ratio,\n",
"):\n",
" # using data_prep_function like a python call with its own inputs\n",
" data_prep_job = list_dbx_clusters(\n",
" test_train_ratio=pipeline_job_test_train_ratio,\n",
" )\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"pipeline = dbx_pipeline(\n",
" pipeline_job_test_train_ratio=0.3,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# submit the pipeline job\n",
"pipeline_job = ml_client.jobs.create_or_update(\n",
" pipeline,\n",
" # Project's name\n",
" experiment_name=\"dbx_list_clusters\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"name: witty_grass_hhzx8v120l\n",
"display_name: dbx_pipeline\n",
"description: E2E data_perp-train pipeline\n",
"type: pipeline\n",
"inputs:\n",
" pipeline_job_test_train_ratio: '0.3'\n",
"jobs:\n",
" data_prep_job:\n",
" type: command\n",
" inputs:\n",
" test_train_ratio:\n",
" path: ${{parent.inputs.pipeline_job_test_train_ratio}}\n",
" outputs:\n",
" train_data:\n",
" mode: rw_mount\n",
" type: uri_folder\n",
" test_data:\n",
" mode: rw_mount\n",
" type: uri_folder\n",
" component: azureml:azureml_anonymous:32bdee22-c855-46f7-9f0f-38725a6253c2\n",
"services:\n",
" Tracking:\n",
" job_service_type: Tracking\n",
" endpoint: azureml://eastus.api.azureml.ms/mlflow/v1.0/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourceGroups/databricks-sandbox-rg/providers/Microsoft.MachineLearningServices/workspaces/amlsandbox-eco3?\n",
" Studio:\n",
" job_service_type: Studio\n",
" endpoint: https://ml.azure.com/runs/witty_grass_hhzx8v120l?wsid=/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourcegroups/databricks-sandbox-rg/workspaces/amlsandbox-eco3&tid=16b3c013-d300-468d-ac64-7eda0820b6d3\n",
"experiment_name: dbx_list_clusters\n",
"id: azureml:/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourceGroups/databricks-sandbox-rg/providers/Microsoft.MachineLearningServices/workspaces/amlsandbox-eco3/jobs/witty_grass_hhzx8v120l\n",
"compute: azureml:ciaranh1\n",
"properties:\n",
" azureml.DevPlatv2: 'true'\n",
" azureml.runsource: azureml.PipelineRun\n",
" runSource: MFE\n",
" runType: HTTP\n",
" azureml.parameters: '{\"pipeline_job_test_train_ratio\":\"0.3\"}'\n",
" azureml.continue_on_step_failure: 'True'\n",
" azureml.continue_on_failed_optional_input: 'True'\n",
" azureml.defaultComputeName: ciaranh1\n",
" azureml.defaultDataStoreName: workspaceblobstore\n",
" azureml.pipelineComponent: pipelinerun\n",
"status: Preparing\n",
"creation_context:\n",
" created_at: '2023-03-31T12:30:43.049783+00:00'\n",
" created_by: Ciaran Hamill Diamond\n",
" created_by_type: User\n",
"\n"
]
}
],
"source": [
"print(pipeline_job)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10 - SDK v2",
"language": "python",
"name": "python310-sdkv2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
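
The run above is recorded with status: Preparing. To block until the pipeline finishes and stream its logs, the SDK v2 client exposes jobs.stream; a one-line sketch, using the pipeline_job handle returned by create_or_update in the notebook above:

# Sketch only: wait for the submitted pipeline job and stream its driver logs.
ml_client.jobs.stream(pipeline_job.name)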
24 changes: 24 additions & 0 deletions .azureDevOps/delete.json
@@ -0,0 +1,24 @@

{
    "ciaran_components": [

        {
            "script": "train.py",
            "parameters": {
                "test_data": {"type": "uri_folder"},
                "train_data": {"type": "uri_folder"},
                "test_labels": {"type": "uri_folder"},
                "train_labels": {"type": "uri_folder"},
                "model": {"type": "uri_folder"},
                "model_name": {"type": "string"},
                "model_type": {"type": "string"},
                "model_params": {"type": "string"},
                "model_params_file": {"type": "uri_file"}
            },
            "outputs": {
                "model": {"type": "uri_folder"}
            },
            "environments": "test"
        }
    ]
}
7 changes: 6 additions & 1 deletion Data_Scientist/housePriceModelling.py
@@ -3,6 +3,11 @@
%pip install sklearn_pandas
%pip install azureml-sdk
%pip install azureml-mlflow
# COMMAND ----------

print("test again")
print("new test")

# COMMAND ----------
import os
import numpy as np
@@ -57,7 +62,7 @@


ws = Workspace(
-   subscription_id=subscription_id,
+   subscription_id="2a834239-8f89-42e1-8cf1-c3c10090f51c",
    resource_group=resource_group,
    workspace_name=workspace_name,
    auth=svc_pr
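
The hunk above authenticates with auth=svc_pr, but the construction of svc_pr sits outside this diff. For reference, a sketch of how such an object is typically built with azureml-core's ServicePrincipalAuthentication; the environment variable names are placeholders, not values from this repository:

# Sketch only: svc_pr as passed to Workspace(...) above is usually built like this.
import os
from azureml.core.authentication import ServicePrincipalAuthentication

svc_pr = ServicePrincipalAuthentication(
    tenant_id=os.environ["AZ_TENANT_ID"],                        # placeholder env var
    service_principal_id=os.environ["AZ_CLIENT_ID"],             # placeholder env var
    service_principal_password=os.environ["AZ_CLIENT_SECRET"],   # placeholder env var
)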
@@ -7,7 +7,7 @@ jobs:
    name: Master Deployment
    strategy:
      matrix:
-       targetEnvironment: [ Sandbox, Development, UAT, Production ]
+       targetEnvironment: [ Sandbox ] #[ Sandbox, Development, UAT, Production ]
    uses: ./.github/workflows/taskDatabricks.yaml
    with:
      ENVIRONMENT: ${{ matrix.targetEnvironment }}
File renamed without changes.
File renamed without changes.
