Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mamokari/observability #307

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
07f985a
Adds Observability Modules
Jul 2, 2020
5adb1bc
Fixes YML file
Jul 2, 2020
5bc9778
Fixes flake8 errors
Jul 2, 2020
cb2694a
removes python-dotenv==0.10.3 dependency
Jul 2, 2020
534eaec
Changes the AML_REBUILD_ENVIRONMENT to false
Jul 3, 2020
56a20bf
Fixes the AML image snapshot path
Jul 3, 2020
0f614c4
Satisfies Flake8
Jul 3, 2020
bb0d158
Integrates Observability onto Pipeline and one script
Jul 3, 2020
3a619da
Changes factory to register_loggers
Jul 3, 2020
ff513b3
Adds app_insights_connection_string to Env class
Jul 3, 2020
f61d444
Integrates observability with ml_service
Jul 3, 2020
b2d81d1
Satisfies Flake8
Jul 3, 2020
0904c59
Integrates diabetes_regression scripts with observability
Jul 3, 2020
cbd100a
Changes Observability to add AppInsightsLoggers to list of loggers on…
Jul 3, 2020
704d345
Adds APP_INSIGHTS_CONNECTION_STRING to logger_interface
Jul 3, 2020
421c14c
Removes the dependency to Env object
Jul 6, 2020
bbc871a
Satisfies Flake8
Jul 6, 2020
cc95eaf
Adds dataclasses to conda_dependencies.yml
Jul 6, 2020
e1a76f8
Fixes YAML indentation
Jul 6, 2020
07e432f
Changes log_metric to log
Jul 6, 2020
b48da8e
Adds more scripts to use Observability and get app_insight_connection_
Jul 6, 2020
b25c7e2
Remove AML_REBUILD_ENVIRONMENT flag
Jul 6, 2020
b814d47
Resolves the conflicts
Jul 6, 2020
13841e2
Adds diabetes_regression to scoring path
Jul 6, 2020
66cffef
Adds dataclasses
Jul 6, 2020
8328b88
Adds dataclasses to conda_dependencies_scorecopy.yml
Jul 6, 2020
8cac588
Adds python-dotenv and opencensus to conda_dependencies_scorecopy.yml
Jul 6, 2020
c113423
Merges with master
Jul 9, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .env.example
Original file line number Diff line number Diff line change
@@ -69,13 +69,16 @@ AML_CLUSTER_MAX_NODES_SCORING = '4'
AML_CLUSTER_MIN_NODES_SCORING = '0'
AML_CLUSTER_PRIORITY_SCORING = 'lowpriority'
AML_REBUILD_ENVIRONMENT_SCORING = 'true'
BATCHSCORE_SCRIPT_PATH = 'scoring/parallel_batchscore.py'
BATCHSCORE_COPY_SCRIPT_PATH = 'scoring/parallel_batchscore_copyoutput.py'
BATCHSCORE_SCRIPT_PATH = 'diabetes_regression/scoring/parallel_batchscore.py'
BATCHSCORE_COPY_SCRIPT_PATH = 'diabetes_regression/scoring/parallel_batchscore_copyoutput.py'


SCORING_DATASTORE_INPUT_CONTAINER = 'input'
SCORING_DATASTORE_INPUT_FILENAME = 'diabetes_scoring_input.csv'
SCORING_DATASTORE_OUTPUT_CONTAINER = 'output'
SCORING_DATASTORE_OUTPUT_FILENAME = 'diabetes_scoring_output.csv'
SCORING_DATASET_NAME = 'diabetes_scoring_ds'
SCORING_PIPELINE_NAME = 'diabetes-scoring-pipeline'
SCORING_PIPELINE_NAME = 'diabetes-scoring-pipeline'

# Observability
APP_INSIGHTS_CONNECTION_STRING = ''
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -109,3 +109,6 @@ condaenv.*
.mypy_cache/

.DS_Store

#pycharm
.idea
2 changes: 2 additions & 0 deletions .pipelines/diabetes_regression-batchscoring-ci.yml
Original file line number Diff line number Diff line change
@@ -68,6 +68,8 @@ stages:
python -m ml_service.pipelines.diabetes_regression_build_parallel_batchscore_pipeline
env:
SCORING_DATASTORE_ACCESS_KEY: $(SCORING_DATASTORE_ACCESS_KEY)
APP_INSIGHTS_CONNECTION_STRING: $(APP_INSIGHTS_CONNECTION_STRING)


- job: "Run_Batch_Score_Pipeline"
displayName: "Run Batch Scoring Pipeline"
14 changes: 10 additions & 4 deletions .pipelines/diabetes_regression-cd.yml
Original file line number Diff line number Diff line change
@@ -55,7 +55,7 @@ stages:
inputs:
azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
scriptLocation: inlineScript
workingDirectory: $(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring
workingDirectory: $(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/diabetes_regression/scoring
inlineScript: |
set -e # fail on error

@@ -101,7 +101,7 @@ stages:
inputs:
azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
scriptLocation: inlineScript
workingDirectory: $(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring
workingDirectory: $(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/diabetes_regression/scoring
inlineScript: |
set -e # fail on error

@@ -111,6 +111,8 @@ stages:
--dc deployment_config_aks.yml \
-g $(RESOURCE_GROUP) --workspace-name $(WORKSPACE_NAME) \
--overwrite -v
env:
APP_INSIGHTS_CONNECTION_STRING: $(APP_INSIGHTS_CONNECTION_STRING)
- task: AzureCLI@1
displayName: 'Smoke test'
inputs:
@@ -120,6 +122,8 @@ stages:
set -e # fail on error
export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
python -m ml_service.util.smoke_test_scoring_service --type AKS --service "$(AKS_DEPLOYMENT_NAME)"
env:
APP_INSIGHTS_CONNECTION_STRING: $(APP_INSIGHTS_CONNECTION_STRING)

- stage: 'Deploy_Webapp'
displayName: 'Deploy to Webapp'
@@ -138,8 +142,8 @@ stages:
- template: diabetes_regression-package-model-template.yml
parameters:
modelId: $(MODEL_NAME):$(get_model.MODEL_VERSION)
scoringScriptPath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring/score.py'
condaFilePath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/conda_dependencies.yml'
scoringScriptPath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/diabetes_regression/scoring/score.py'
condaFilePath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/diabetes_regression/conda_dependencies.yml'
- script: echo $(IMAGE_LOCATION) >image_location.txt
displayName: "Write image location file"
- task: AzureWebAppContainer@1
@@ -159,3 +163,5 @@ stages:
set -e # fail on error
export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
python -m ml_service.util.smoke_test_scoring_service --type Webapp --service "$(WebAppDeploy.AppServiceApplicationUrl)/score"
env:
APP_INSIGHTS_CONNECTION_STRING: $(APP_INSIGHTS_CONNECTION_STRING)
6 changes: 6 additions & 0 deletions .pipelines/diabetes_regression-ci.yml
Original file line number Diff line number Diff line change
@@ -45,6 +45,8 @@ stages:
# Invoke the Python building and publishing a training pipeline
python -m ml_service.pipelines.diabetes_regression_build_train_pipeline
displayName: 'Publish Azure Machine Learning Pipeline'
env:
APP_INSIGHTS_CONNECTION_STRING: $(APP_INSIGHTS_CONNECTION_STRING)

- stage: 'Trigger_AML_Pipeline'
displayName: 'Train and evaluate model'
@@ -70,6 +72,8 @@ stages:
# Set AMLPIPELINEID variable for next AML Pipeline task in next job
AMLPIPELINEID="$(cat pipeline_id.txt)"
echo "##vso[task.setvariable variable=AMLPIPELINEID;isOutput=true]$AMLPIPELINEID"
env:
APP_INSIGHTS_CONNECTION_STRING: $(APP_INSIGHTS_CONNECTION_STRING)
name: 'getpipelineid'
displayName: 'Get Pipeline ID'
- job: "Run_ML_Pipeline"
@@ -87,6 +91,8 @@ stages:
PipelineId: '$(AMLPIPELINE_ID)'
ExperimentName: '$(EXPERIMENT_NAME)'
PipelineParameters: '"ParameterAssignments": {"model_name": "$(MODEL_NAME)"}, "tags": {"BuildId": "$(Build.BuildId)", "BuildUri": "$(BUILD_URI)"}, "StepTags": {"BuildId": "$(Build.BuildId)", "BuildUri": "$(BUILD_URI)"}'
env:
APP_INSIGHTS_CONNECTION_STRING: $(APP_INSIGHTS_CONNECTION_STRING)
- job: "Training_Run_Report"
dependsOn: "Run_ML_Pipeline"
condition: always()
Original file line number Diff line number Diff line change
@@ -27,6 +27,8 @@ steps:
runId: '${{ parameters.artifactBuildId }}'
runBranch: '$(Build.SourceBranch)'
path: $(Build.SourcesDirectory)/bin
env:
APP_INSIGHTS_CONNECTION_STRING: $(APP_INSIGHTS_CONNECTION_STRING)
- task: Bash@3
name: get_model
displayName: Parse Json for Model Name and Version
Original file line number Diff line number Diff line change
@@ -25,5 +25,7 @@ steps:
echo $FOUND_MODEL >model.json
name: 'getversion'
displayName: "Determine if evaluation succeeded and new model is registered (CLI)"
env:
APP_INSIGHTS_CONNECTION_STRING: $(APP_INSIGHTS_CONNECTION_STRING)
- publish: model.json
artifact: model
23 changes: 13 additions & 10 deletions .pipelines/diabetes_regression-variables-template.yml
Original file line number Diff line number Diff line change
@@ -3,19 +3,19 @@ variables:
# Source Config
# The directory containing the scripts for training, evaluating, and registering the model
- name: SOURCES_DIR_TRAIN
value: diabetes_regression
value: .
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed? This environment variable is used to keep the name of the folder, since the name will likely change during bootstrap. For this reason, we should avoid hardcoding this name in the pipeline and code.

# The path to the model training script under SOURCES_DIR_TRAIN
- name: TRAIN_SCRIPT_PATH
value: training/train_aml.py
value: diabetes_regression/training/train_aml.py
# The path to the model evaluation script under SOURCES_DIR_TRAIN
- name: EVALUATE_SCRIPT_PATH
value: evaluate/evaluate_model.py
value: diabetes_regression/evaluate/evaluate_model.py
# The path to the model registration script under SOURCES_DIR_TRAIN
- name: REGISTER_SCRIPT_PATH
value: register/register_model.py
value: diabetes_regression/register/register_model.py
# The path to the model scoring script relative to SOURCES_DIR_TRAIN
- name: SCORE_SCRIPT
value: scoring/score.py
value: diabetes_regression/scoring/score.py


# Azure ML Variables
@@ -66,8 +66,8 @@ variables:
# value: "true"

# Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yaml.
# - name: AML_REBUILD_ENVIRONMENT
# value: "false"
# - name: AML_REBUILD_ENVIRONMENT
# value: "true"

# Variables below are used for controlling various aspects of batch scoring
- name: USE_GPU_FOR_SCORING
@@ -95,9 +95,9 @@ variables:
value: lowpriority
# The path to the batch scoring script relative to SOURCES_DIR_TRAIN
- name: BATCHSCORE_SCRIPT_PATH
value: scoring/parallel_batchscore.py
value: diabetes_regression/scoring/parallel_batchscore.py
- name: BATCHSCORE_COPY_SCRIPT_PATH
value: scoring/parallel_batchscore_copyoutput.py
value: diabetes_regression/scoring/parallel_batchscore_copyoutput.py
# Flag to allow rebuilding the AML Environment after it was built for the first time.
# This enables dependency updates from the conda dependencies yaml for scoring activities.
- name: AML_REBUILD_ENVIRONMENT_SCORING
@@ -126,4 +126,7 @@ variables:
# Scoring pipeline name
- name: SCORING_PIPELINE_NAME
value: "diabetes-scoring-pipeline"


#Observability
- name: APP_INSIGHTS_CONNECTION_STRING
value: ""
3 changes: 3 additions & 0 deletions diabetes_regression/ci_dependencies.yml
Original file line number Diff line number Diff line change
@@ -27,3 +27,6 @@ dependencies:
- flake8==3.7.*
- flake8_formatter_junit_xml==0.0.*
- azure-cli==2.3.*
- opencensus==0.7.7
- opencensus-context==0.1.1
- opencensus-ext-azure==1.0.2
9 changes: 9 additions & 0 deletions diabetes_regression/conda_dependencies.yml
Original file line number Diff line number Diff line change
@@ -37,3 +37,12 @@ dependencies:

# MLOps with R
- azure-storage-blob

# Observability
- opencensus==0.7.7
- opencensus-context==0.1.1
- opencensus-ext-azure==1.0.2
- python-dotenv==0.10.3

# Data Classes
- dataclasses
9 changes: 9 additions & 0 deletions diabetes_regression/conda_dependencies_scorecopy.yml
Original file line number Diff line number Diff line change
@@ -29,3 +29,12 @@ dependencies:

# Score copying deps
- azure-storage-blob

# Observability
- opencensus==0.7.7
- opencensus-context==0.1.1
- opencensus-ext-azure==1.0.2
- python-dotenv==0.10.3

# Data Classes
- dataclasses
9 changes: 9 additions & 0 deletions diabetes_regression/conda_dependencies_scoring.yml
Original file line number Diff line number Diff line change
@@ -30,3 +30,12 @@ dependencies:
# Scoring deps
- scikit-learn
- pandas

# Observability
- opencensus==0.7.7
- opencensus-context==0.1.1
- opencensus-ext-azure==1.0.2
- python-dotenv==0.10.3

# Data Classes
- dataclasses
48 changes: 29 additions & 19 deletions diabetes_regression/evaluate/evaluate_model.py
Original file line number Diff line number Diff line change
@@ -26,7 +26,11 @@
from azureml.core import Run
import argparse
import traceback
from util.model_helper import get_model
from diabetes_regression.util.model_helper import get_model
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As @j-so mentioned, all the references to diabetes_regression by name will not work when someone bootstraps and renames the project. We should find a way to reference this using a variable, both in the Python files and in the pipelines.

from utils.logger.logger_interface import Severity
from utils.logger.observability import Observability

observability = Observability()

run = Run.get_context()

@@ -42,7 +46,7 @@
# load_dotenv()
# sources_dir = os.environ.get("SOURCES_DIR_TRAIN")
# if (sources_dir is None):
# sources_dir = 'diabetes_regression'
# sources_dir = '.'
# path_to_util = os.path.join(".", sources_dir, "util")
# sys.path.append(os.path.abspath(path_to_util)) # NOQA: E402
# from model_helper import get_model
@@ -89,7 +93,8 @@
parser.add_argument(
"--allow_run_cancel",
type=str,
help="Set this to false to avoid evaluation step from cancelling run after an unsuccessful evaluation", # NOQA: E501
help="Set this to false to avoid evaluation step from cancelling "
"run after an unsuccessful evaluation",
default="true",
)

@@ -109,42 +114,47 @@
tag_name = 'experiment_name'

model = get_model(
model_name=model_name,
tag_name=tag_name,
tag_value=exp.name,
aml_workspace=ws)
model_name=model_name,
tag_name=tag_name,
tag_value=exp.name,
aml_workspace=ws)

if (model is not None):
production_model_mse = 10000
if (metric_eval in model.tags):
production_model_mse = float(model.tags[metric_eval])
new_model_mse = float(run.parent.get_metrics().get(metric_eval))
if (production_model_mse is None or new_model_mse is None):
print("Unable to find", metric_eval, "metrics, "
"exiting evaluation")
if((allow_run_cancel).lower() == 'true'):
observability.log("Unable to find" +
metric_eval + "metrics, exiting evaluation")
if ((allow_run_cancel).lower() == 'true'):
run.parent.cancel()
else:
print(
observability.log(
"Current Production model mse: {}, "
"New trained model mse: {}".format(
production_model_mse, new_model_mse
)
)

if (new_model_mse < production_model_mse):
print("New trained model performs better, "
"thus it should be registered")
observability.log("New trained model performs better, "
"thus it should be registered")
else:
print("New trained model metric is worse than or equal to "
"production model so skipping model registration.")
if((allow_run_cancel).lower() == 'true'):
observability.log("New trained model metric is worse "
"than or equal to "
"production model so skipping "
"model registration.")
if ((allow_run_cancel).lower() == 'true'):
run.parent.cancel()
else:
print("This is the first model, "
"thus it should be registered")
observability.log("This is the first model, "
"thus it should be registered")

except Exception:
traceback.print_exc(limit=None, file=None, chain=True)
print("Something went wrong trying to evaluate. Exiting.")
observability.log(
description="Something went wrong trying to evaluate. Exiting.",
severity=Severity.ERROR)

raise
Loading