From 093f7649ff3020735f4135a94ced69132f71153b Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 9 Jun 2022 14:58:16 +0900 Subject: [PATCH 01/47] Added automl workflows --- machine-learning-box/automl/README.md | 15 +++++ .../automl/config/params.yaml | 4 ++ machine-learning-box/automl/ml_datasets.dig | 14 ++++ machine-learning-box/automl/ml_experiment.dig | 64 +++++++++++++++++++ 4 files changed, 97 insertions(+) create mode 100644 machine-learning-box/automl/README.md create mode 100644 machine-learning-box/automl/config/params.yaml create mode 100644 machine-learning-box/automl/ml_datasets.dig create mode 100644 machine-learning-box/automl/ml_experiment.dig diff --git a/machine-learning-box/automl/README.md b/machine-learning-box/automl/README.md new file mode 100644 index 00000000..98025bc4 --- /dev/null +++ b/machine-learning-box/automl/README.md @@ -0,0 +1,15 @@ +## How to use + +Workflow example of AutoML operator. + +Note: this feature is still in Beta and available to limited customers. + + +```sh +# Push project +$ td -c ~/.td/td.conf wf push --project . + +# Setting td.apikey secret is required for automl operator. + +$ td -c ~/.td/td.conf wf secrets --project --set td.apikey +``` \ No newline at end of file diff --git a/machine-learning-box/automl/config/params.yaml b/machine-learning-box/automl/config/params.yaml new file mode 100644 index 00000000..45ad77e7 --- /dev/null +++ b/machine-learning-box/automl/config/params.yaml @@ -0,0 +1,4 @@ +input_database: ml_datasets +output_database: automl_test + +expr_tracking_table: automl_experiments diff --git a/machine-learning-box/automl/ml_datasets.dig b/machine-learning-box/automl/ml_datasets.dig new file mode 100644 index 00000000..b89ee140 --- /dev/null +++ b/machine-learning-box/automl/ml_datasets.dig @@ -0,0 +1,14 @@ +timezone: Asia/Tokyo +#timezone: PST + +_export: + td: + engine: presto + ++load_datasets: + ipynb>: + notebook: ml_datasets + input_table: dummy.removed_later # temporary workaround. 
+ output_database: ml_datasets + datasets: all +# datasets: gluon, bank_marketing \ No newline at end of file diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig new file mode 100644 index 00000000..78cc12a7 --- /dev/null +++ b/machine-learning-box/automl/ml_experiment.dig @@ -0,0 +1,64 @@ +timezone: Asia/Tokyo +#timezone: PST + +_export: + !include : config/params.yaml + td: + engine: presto + database: ${output_database} + ++create_tbl_if_not_exists: + td_ddl>: + create_tables: ["${expr_tracking_table}"] + ++load_datasets: + ipynb>: + notebook: ml_datasets + input_table: dummy.removed_later # temporary workaround. + output_database: ${input_database} +# datasets: gluon, bank_marketing + datasets: gluon + ++gluon_train: + ml_train>: + notebook: gluon_train + model_name: gluon_model_${session_id} + input_table: ${input_database}.gluon_train # expect database_name.table_name + target_column: class + # The following options are optional ones + #problem_type: binary # ‘binary’, ‘multiclass’, ‘regression’, or ‘quantile’. autolugon automatically detect problem types + #eval_metric: roc_auc # autolugon automatically select a right eval_metric for a given setting if not specified. + ignore_columns: time,rowid # Note time column is ignored by the default. + time_limit: 60 * 3 # fit timeout. 3 min just for training time. Recommend 60 * 60 or so for production purposes (Note 24 hours at max). Note this is a soft limit, not hard limit. + # timeout: 60 * 3 # timeout for notebook cell-level execution. This is a hard limit. Note it's cell-level timeout. No timeout if not specified. 
+ export_leaderboard: ${output_database}.leaderboard_gluon_train + export_feature_importance: ${output_database}.feature_importance_gluon_train + ++print_train_result: + echo>: "executed ${automl.last_executed_notebook}.ipynb" + ++track_experiment: + td>: queries/track_experiment.sql + insert_into: automl_experiments + last_executed_notebook: ${automl.last_executed_notebook} + user_id: ${automl.last_executed_user_id} + user_email: ${automl.last_executed_user_email} + model_name: gluon_model_${session_id} + task_attempt_id: ${attempt_id} + session_time: ${session_local_time} + engine: presto + ++gluon_predict: + ml_predict>: + notebook: gluon_predict + model_name: gluon_model_${session_id} + input_table: ${input_database}.gluon_test # expect database_name.table_name + output_table: ${output_database}.gluon_predicted # expect database_name.table_name. DB will be created if not exists. table is overwrite'd. + # optional + #rowid_column: rowid # Note when rowid_column is specified, only rowid column + prediction result columns are resulted in the output table + #ignore_columns: time # target column should not be in test data + export_leaderboard: ${output_database}.leaderboard_gluon_predict + export_feature_importance: ${output_database}.feature_importance_gluon_predict + ++print_predict_result: + echo>: "executed ${automl.last_executed_notebook}.ipynb" \ No newline at end of file From 799f99e614a5f2795aebe50b41696170cb8abc3b Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 13 Jun 2022 11:22:39 +0900 Subject: [PATCH 02/47] Added eda workflow --- machine-learning-box/automl/eda.dig | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 machine-learning-box/automl/eda.dig diff --git a/machine-learning-box/automl/eda.dig b/machine-learning-box/automl/eda.dig new file mode 100644 index 00000000..ca8d057e --- /dev/null +++ b/machine-learning-box/automl/eda.dig @@ -0,0 +1,23 @@ +timezone: Asia/Tokyo +#timezone: PST + +_export: + !include : 
config/params.yaml + ++datasets: + for_each>: + table: [gluon_train, bank_marketing_train, vehicle_coupon_train, online_retail_train, telco_churn_train, boston_house_train] + _parallel: + limit: 3 + _do: + +run_eda: + ipynb>: + docker: + task_mem: 256g + notebook: EDA + input_table: ${input_database}.${table} + # The following options are optional ones + eda: all + # eda: pandas-profiling, sweetviz + # target_column: label + sampling_threshold: 1000000 From c4bc7eaf5ba351474843e5d96e0533faa0705998 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 13 Jun 2022 11:48:46 +0900 Subject: [PATCH 03/47] Fixed table name --- machine-learning-box/automl/eda.dig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/automl/eda.dig b/machine-learning-box/automl/eda.dig index ca8d057e..45285020 100644 --- a/machine-learning-box/automl/eda.dig +++ b/machine-learning-box/automl/eda.dig @@ -6,7 +6,7 @@ _export: +datasets: for_each>: - table: [gluon_train, bank_marketing_train, vehicle_coupon_train, online_retail_train, telco_churn_train, boston_house_train] + table: [gluon_train, bank_marketing_train, vehicle_coupon_train, online_retail_ltv_train, telco_churn_train, boston_house_train] _parallel: limit: 3 _do: From cc3993f1041dec529b7feee3d6fd3dd08ccaf05b Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 13 Jun 2022 14:28:21 +0900 Subject: [PATCH 04/47] Fixed EDA workflow to load sample datasets --- machine-learning-box/automl/eda.dig | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/machine-learning-box/automl/eda.dig b/machine-learning-box/automl/eda.dig index 45285020..e2662cd4 100644 --- a/machine-learning-box/automl/eda.dig +++ b/machine-learning-box/automl/eda.dig @@ -1,8 +1,13 @@ timezone: Asia/Tokyo #timezone: PST -_export: - !include : config/params.yaml ++load_datasets: + ipynb>: + notebook: ml_datasets + input_table: dummy.removed_later # temporary workaround. 
+ output_database: ml_datasets + datasets: all +# datasets: gluon, bank_marketing, vehicle_coupon, online_retail, telco_churn, boston_house +datasets: for_each>: @@ -15,7 +20,7 @@ _export: docker: task_mem: 256g notebook: EDA - input_table: ${input_database}.${table} + input_table: ml_datasets.${table} # The following options are optional ones eda: all # eda: pandas-profiling, sweetviz From d316f7b88bafcaef8ffcef9561f194e0d4876948 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Sat, 18 Jun 2022 02:55:04 +0900 Subject: [PATCH 05/47] Revised options --- machine-learning-box/automl/ml_datasets.dig | 1 - machine-learning-box/automl/ml_experiment.dig | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/machine-learning-box/automl/ml_datasets.dig b/machine-learning-box/automl/ml_datasets.dig index b89ee140..f5c495be 100644 --- a/machine-learning-box/automl/ml_datasets.dig +++ b/machine-learning-box/automl/ml_datasets.dig @@ -8,7 +8,6 @@ _export: +load_datasets: ipynb>: notebook: ml_datasets - input_table: dummy.removed_later # temporary workaround. output_database: ml_datasets datasets: all # datasets: gluon, bank_marketing \ No newline at end of file diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index 78cc12a7..2b5a6be3 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -14,7 +14,6 @@ _export: +load_datasets: ipynb>: notebook: ml_datasets - input_table: dummy.removed_later # temporary workaround. output_database: ${input_database} # datasets: gluon, bank_marketing datasets: gluon @@ -33,6 +32,7 @@ _export: # timeout: 60 * 3 # timeout for notebook cell-level execution. This is a hard limit. Note it's cell-level timeout. No timeout if not specified. 
export_leaderboard: ${output_database}.leaderboard_gluon_train export_feature_importance: ${output_database}.feature_importance_gluon_train + # hide_table_contents: true +print_train_result: echo>: "executed ${automl.last_executed_notebook}.ipynb" @@ -59,6 +59,7 @@ _export: #ignore_columns: time # target column should not be in test data export_leaderboard: ${output_database}.leaderboard_gluon_predict export_feature_importance: ${output_database}.feature_importance_gluon_predict + # hide_table_contents: true +print_predict_result: echo>: "executed ${automl.last_executed_notebook}.ipynb" \ No newline at end of file From 57922bedc49aac293551ba424e85a11f1a5306fb Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Sat, 18 Jun 2022 12:05:25 +0900 Subject: [PATCH 06/47] Updated comments --- machine-learning-box/automl/ml_experiment.dig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index 2b5a6be3..db644c91 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -28,7 +28,7 @@ _export: #problem_type: binary # ‘binary’, ‘multiclass’, ‘regression’, or ‘quantile’. autolugon automatically detect problem types #eval_metric: roc_auc # autolugon automatically select a right eval_metric for a given setting if not specified. ignore_columns: time,rowid # Note time column is ignored by the default. - time_limit: 60 * 3 # fit timeout. 3 min just for training time. Recommend 60 * 60 or so for production purposes (Note 24 hours at max). Note this is a soft limit, not hard limit. + time_limit: 60 * 3 # fit timeout. 3 min just for training time. Default: 60 * 60 (1hr). 1hr or more is recommended for production purposes (Note 24 hours at max). Note this is a soft limit, not hard limit. # timeout: 60 * 3 # timeout for notebook cell-level execution. This is a hard limit. Note it's cell-level timeout. 
No timeout if not specified. export_leaderboard: ${output_database}.leaderboard_gluon_train export_feature_importance: ${output_database}.feature_importance_gluon_train From cdf77a2cc75b6a937aacfb74fd4864c1f5e7a703 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 23 Jun 2022 17:24:24 +0900 Subject: [PATCH 07/47] Revised options --- machine-learning-box/automl/eda.dig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/machine-learning-box/automl/eda.dig b/machine-learning-box/automl/eda.dig index e2662cd4..d6edcf9d 100644 --- a/machine-learning-box/automl/eda.dig +++ b/machine-learning-box/automl/eda.dig @@ -4,7 +4,6 @@ timezone: Asia/Tokyo +load_datasets: ipynb>: notebook: ml_datasets - input_table: dummy.removed_later # temporary workaround. output_database: ml_datasets datasets: all # datasets: gluon, bank_marketing, vehicle_coupon, online_retail, telco_churn, boston_house @@ -18,7 +17,7 @@ timezone: Asia/Tokyo +run_eda: ipynb>: docker: - task_mem: 256g + task_mem: 128g notebook: EDA input_table: ml_datasets.${table} # The following options are optional ones From 44eb67ce3b3c9b84079968d181bf773dba749aa9 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 1 Jul 2022 21:04:59 +0900 Subject: [PATCH 08/47] Copied from ml_experiment.dig --- .../automl/ml_experiment_demo.dig | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 machine-learning-box/automl/ml_experiment_demo.dig diff --git a/machine-learning-box/automl/ml_experiment_demo.dig b/machine-learning-box/automl/ml_experiment_demo.dig new file mode 100644 index 00000000..db644c91 --- /dev/null +++ b/machine-learning-box/automl/ml_experiment_demo.dig @@ -0,0 +1,65 @@ +timezone: Asia/Tokyo +#timezone: PST + +_export: + !include : config/params.yaml + td: + engine: presto + database: ${output_database} + ++create_tbl_if_not_exists: + td_ddl>: + create_tables: ["${expr_tracking_table}"] + ++load_datasets: + ipynb>: + notebook: ml_datasets + output_database: 
${input_database} +# datasets: gluon, bank_marketing + datasets: gluon + ++gluon_train: + ml_train>: + notebook: gluon_train + model_name: gluon_model_${session_id} + input_table: ${input_database}.gluon_train # expect database_name.table_name + target_column: class + # The following options are optional ones + #problem_type: binary # ‘binary’, ‘multiclass’, ‘regression’, or ‘quantile’. autolugon automatically detect problem types + #eval_metric: roc_auc # autolugon automatically select a right eval_metric for a given setting if not specified. + ignore_columns: time,rowid # Note time column is ignored by the default. + time_limit: 60 * 3 # fit timeout. 3 min just for training time. Default: 60 * 60 (1hr). 1hr or more is recommended for production purposes (Note 24 hours at max). Note this is a soft limit, not hard limit. + # timeout: 60 * 3 # timeout for notebook cell-level execution. This is a hard limit. Note it's cell-level timeout. No timeout if not specified. + export_leaderboard: ${output_database}.leaderboard_gluon_train + export_feature_importance: ${output_database}.feature_importance_gluon_train + # hide_table_contents: true + ++print_train_result: + echo>: "executed ${automl.last_executed_notebook}.ipynb" + ++track_experiment: + td>: queries/track_experiment.sql + insert_into: automl_experiments + last_executed_notebook: ${automl.last_executed_notebook} + user_id: ${automl.last_executed_user_id} + user_email: ${automl.last_executed_user_email} + model_name: gluon_model_${session_id} + task_attempt_id: ${attempt_id} + session_time: ${session_local_time} + engine: presto + ++gluon_predict: + ml_predict>: + notebook: gluon_predict + model_name: gluon_model_${session_id} + input_table: ${input_database}.gluon_test # expect database_name.table_name + output_table: ${output_database}.gluon_predicted # expect database_name.table_name. DB will be created if not exists. table is overwrite'd. 
+ # optional + #rowid_column: rowid # Note when rowid_column is specified, only rowid column + prediction result columns are resulted in the output table + #ignore_columns: time # target column should not be in test data + export_leaderboard: ${output_database}.leaderboard_gluon_predict + export_feature_importance: ${output_database}.feature_importance_gluon_predict + # hide_table_contents: true + ++print_predict_result: + echo>: "executed ${automl.last_executed_notebook}.ipynb" \ No newline at end of file From 422683d6374a2282fc5b772474c54279a73302c8 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 1 Jul 2022 21:08:31 +0900 Subject: [PATCH 09/47] Added a missing file --- machine-learning-box/automl/queries/track_experiment.sql | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 machine-learning-box/automl/queries/track_experiment.sql diff --git a/machine-learning-box/automl/queries/track_experiment.sql b/machine-learning-box/automl/queries/track_experiment.sql new file mode 100644 index 00000000..d781a9f4 --- /dev/null +++ b/machine-learning-box/automl/queries/track_experiment.sql @@ -0,0 +1,9 @@ +-- DIGDAG_INSERT_LINE +select + '${task_attempt_id}' as task_attempt_id, + '${session_time}' as session_time, + '${user_id}' as user_id, + '${user_email}' as user_email, + '${model_name}' as model_name, + '${last_executed_notebook}.ipynb' as ipynb_url, + '${last_executed_notebook}.html' as html_url From 41d05ff1e9d1e9a874df6bbe173ce1a11de800af Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 1 Jul 2022 21:14:57 +0900 Subject: [PATCH 10/47] Added parameterized automl workflow --- .../automl/config/params.yaml | 6 +++ machine-learning-box/automl/ml_experiment.dig | 45 +++++-------------- 2 files changed, 17 insertions(+), 34 deletions(-) diff --git a/machine-learning-box/automl/config/params.yaml b/machine-learning-box/automl/config/params.yaml index 45ad77e7..8e5aa835 100644 --- a/machine-learning-box/automl/config/params.yaml +++ 
b/machine-learning-box/automl/config/params.yaml @@ -1,4 +1,10 @@ input_database: ml_datasets output_database: automl_test +train_data_table: gluon_train +target_column: class +test_data_table: gluon_test + expr_tracking_table: automl_experiments + +fit_time_limit: 60 * 3 # # fit timeout. 3 min just for training time. Default: 60 * 60 (1hr). \ No newline at end of file diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index db644c91..2967c38a 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -1,45 +1,28 @@ -timezone: Asia/Tokyo -#timezone: PST - _export: !include : config/params.yaml td: engine: presto - database: ${output_database} +create_tbl_if_not_exists: td_ddl>: - create_tables: ["${expr_tracking_table}"] - -+load_datasets: - ipynb>: - notebook: ml_datasets - output_database: ${input_database} -# datasets: gluon, bank_marketing - datasets: gluon + create_tables: ["${expr_tracking_table}", "${output_database}"] +gluon_train: ml_train>: notebook: gluon_train model_name: gluon_model_${session_id} - input_table: ${input_database}.gluon_train # expect database_name.table_name - target_column: class - # The following options are optional ones - #problem_type: binary # ‘binary’, ‘multiclass’, ‘regression’, or ‘quantile’. autolugon automatically detect problem types - #eval_metric: roc_auc # autolugon automatically select a right eval_metric for a given setting if not specified. - ignore_columns: time,rowid # Note time column is ignored by the default. - time_limit: 60 * 3 # fit timeout. 3 min just for training time. Default: 60 * 60 (1hr). 1hr or more is recommended for production purposes (Note 24 hours at max). Note this is a soft limit, not hard limit. - # timeout: 60 * 3 # timeout for notebook cell-level execution. This is a hard limit. Note it's cell-level timeout. No timeout if not specified. 
- export_leaderboard: ${output_database}.leaderboard_gluon_train - export_feature_importance: ${output_database}.feature_importance_gluon_train - # hide_table_contents: true + input_table: ${input_database}.${train_data_table} + target_column: ${target_column} + time_limit: ${fit_time_limit} + export_leaderboard: ${output_database}.leaderboard_${train_input_table} + export_feature_importance: ${output_database}.feature_importance_${train_input_table} +print_train_result: - echo>: "executed ${automl.last_executed_notebook}.ipynb" + echo>: "Executed training and built gluon_model_${session_id}: ${automl.last_executed_notebook}.ipynb" +track_experiment: td>: queries/track_experiment.sql - insert_into: automl_experiments + insert_into: ${output_database}.automl_experiments last_executed_notebook: ${automl.last_executed_notebook} user_id: ${automl.last_executed_user_id} user_email: ${automl.last_executed_user_email} @@ -52,14 +35,8 @@ _export: ml_predict>: notebook: gluon_predict model_name: gluon_model_${session_id} - input_table: ${input_database}.gluon_test # expect database_name.table_name - output_table: ${output_database}.gluon_predicted # expect database_name.table_name. DB will be created if not exists. table is overwrite'd. 
- # optional - #rowid_column: rowid # Note when rowid_column is specified, only rowid column + prediction result columns are resulted in the output table - #ignore_columns: time # target column should not be in test data - export_leaderboard: ${output_database}.leaderboard_gluon_predict - export_feature_importance: ${output_database}.feature_importance_gluon_predict - # hide_table_contents: true + input_table: ${input_database}.${test_data_table} + output_table: ${output_database}.predicted_${test_data_table}_${session_id} +print_predict_result: - echo>: "executed ${automl.last_executed_notebook}.ipynb" \ No newline at end of file + echo>: "Run prediction and resulted to ${output_database}.predicted_${test_data_table}_${session_id}: ${automl.last_executed_notebook}.ipynb" \ No newline at end of file From 250c32d7961d5a5e4725c0159025282b57e48895 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 1 Jul 2022 21:18:22 +0900 Subject: [PATCH 11/47] td.database is required --- machine-learning-box/automl/.ruby-version | 1 + machine-learning-box/automl/ml_experiment.dig | 1 + 2 files changed, 2 insertions(+) create mode 100644 machine-learning-box/automl/.ruby-version diff --git a/machine-learning-box/automl/.ruby-version b/machine-learning-box/automl/.ruby-version new file mode 100644 index 00000000..ec1cf33c --- /dev/null +++ b/machine-learning-box/automl/.ruby-version @@ -0,0 +1 @@ +2.6.3 diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index 2967c38a..79d46ade 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -2,6 +2,7 @@ _export: !include : config/params.yaml td: engine: presto + database: ${output_database} +create_tbl_if_not_exists: td_ddl>: From 61c6b8e9fd8d210050e504015b1fac5810ccbf89 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 1 Jul 2022 21:22:28 +0900 Subject: [PATCH 12/47] Fixed var ref --- 
machine-learning-box/automl/ml_experiment.dig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index 79d46ade..a638ec45 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -15,8 +15,8 @@ _export: input_table: ${input_database}.${train_data_table} target_column: ${target_column} time_limit: ${fit_time_limit} - export_leaderboard: ${output_database}.leaderboard_${train_input_table} - export_feature_importance: ${output_database}.feature_importance_${train_input_table} + export_leaderboard: ${output_database}.leaderboard_${train_data_table} + export_feature_importance: ${output_database}.feature_importance_${train_data_table} +print_train_result: echo>: "Executed training and built gluon_model_${session_id}: ${automl.last_executed_notebook}.ipynb" From 03a6963751d6a370418c873cb26969177ba38100 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 7 Jul 2022 21:59:45 +0900 Subject: [PATCH 13/47] Fixed to properly create output database if missing --- machine-learning-box/automl/ml_experiment.dig | 6 +++++- machine-learning-box/automl/ml_experiment_demo.dig | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index a638ec45..20ab4c60 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -4,9 +4,13 @@ _export: engine: presto database: ${output_database} ++create_database_if_not_exists: + td_ddl>: + create_databases: ["${output_database}"] + +create_tbl_if_not_exists: td_ddl>: - create_tables: ["${expr_tracking_table}", "${output_database}"] + create_tables: ["${expr_tracking_table}"] +gluon_train: ml_train>: diff --git a/machine-learning-box/automl/ml_experiment_demo.dig b/machine-learning-box/automl/ml_experiment_demo.dig index 
db644c91..57c2dc41 100644 --- a/machine-learning-box/automl/ml_experiment_demo.dig +++ b/machine-learning-box/automl/ml_experiment_demo.dig @@ -7,6 +7,10 @@ _export: engine: presto database: ${output_database} ++create_database_if_not_exists: + td_ddl>: + create_databases: ["${output_database}"] + +create_tbl_if_not_exists: td_ddl>: create_tables: ["${expr_tracking_table}"] From 4ea4661671ac1ab6e4018840b8e163931fadd4e8 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 7 Jul 2022 21:59:57 +0900 Subject: [PATCH 14/47] Minor comment format change --- machine-learning-box/automl/config/params.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/automl/config/params.yaml b/machine-learning-box/automl/config/params.yaml index 8e5aa835..19fedd79 100644 --- a/machine-learning-box/automl/config/params.yaml +++ b/machine-learning-box/automl/config/params.yaml @@ -7,4 +7,4 @@ test_data_table: gluon_test expr_tracking_table: automl_experiments -fit_time_limit: 60 * 3 # # fit timeout. 3 min just for training time. Default: 60 * 60 (1hr). \ No newline at end of file +fit_time_limit: 60 * 3 # fit timeout in sec. 3 min just for demo. Default: 60 * 60 (1hr). 
\ No newline at end of file From b0f5a8729a18477480e91c53a6fd007f993cc602 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 8 Jul 2022 10:35:13 +0900 Subject: [PATCH 15/47] Fixed td_ddl --- machine-learning-box/automl/ml_experiment.dig | 9 +++------ machine-learning-box/automl/ml_experiment_demo.dig | 9 +++------ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index 20ab4c60..14d45fe5 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -4,13 +4,10 @@ _export: engine: presto database: ${output_database} -+create_database_if_not_exists: ++create_db_tbl_if_not_exists: td_ddl>: - create_databases: ["${output_database}"] - -+create_tbl_if_not_exists: - td_ddl>: - create_tables: ["${expr_tracking_table}"] + create_databases: ["${output_database}"] + create_tables: ["${expr_tracking_table}"] +gluon_train: ml_train>: diff --git a/machine-learning-box/automl/ml_experiment_demo.dig b/machine-learning-box/automl/ml_experiment_demo.dig index 57c2dc41..95fa75d2 100644 --- a/machine-learning-box/automl/ml_experiment_demo.dig +++ b/machine-learning-box/automl/ml_experiment_demo.dig @@ -7,13 +7,10 @@ _export: engine: presto database: ${output_database} -+create_database_if_not_exists: ++create_db_tbl_if_not_exists: td_ddl>: - create_databases: ["${output_database}"] - -+create_tbl_if_not_exists: - td_ddl>: - create_tables: ["${expr_tracking_table}"] + create_databases: ["${output_database}"] + create_tables: ["${expr_tracking_table}"] +load_datasets: ipynb>: From 37537116ee3eb50c61c1989a5f59ebecbd2ade7b Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 12 Jul 2022 17:44:46 +0900 Subject: [PATCH 16/47] Add a workaround for input_table is required --- machine-learning-box/automl/ml_experiment_demo.dig | 1 + 1 file changed, 1 insertion(+) diff --git a/machine-learning-box/automl/ml_experiment_demo.dig 
b/machine-learning-box/automl/ml_experiment_demo.dig index 95fa75d2..d8fd4d18 100644 --- a/machine-learning-box/automl/ml_experiment_demo.dig +++ b/machine-learning-box/automl/ml_experiment_demo.dig @@ -16,6 +16,7 @@ _export: ipynb>: notebook: ml_datasets output_database: ${input_database} + input_table: ${input_database}.dummy # datasets: gluon, bank_marketing datasets: gluon From 2faf3d3c296141af9f65e34db96c0a2445ae727c Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 1 Dec 2022 19:03:48 +0900 Subject: [PATCH 17/47] Added NBA and network_analysis notebook sample workflows --- machine-learning-box/automl/nba.dig | 51 +++++++++++++++++++ .../automl/network_analysis.dig | 10 ++++ 2 files changed, 61 insertions(+) create mode 100644 machine-learning-box/automl/nba.dig create mode 100644 machine-learning-box/automl/network_analysis.dig diff --git a/machine-learning-box/automl/nba.dig b/machine-learning-box/automl/nba.dig new file mode 100644 index 00000000..c8df7c14 --- /dev/null +++ b/machine-learning-box/automl/nba.dig @@ -0,0 +1,51 @@ +_export: + !include : config/params.yaml + td: + engine: presto + database: ${output_database} + ++create_db_tbl_if_not_exists: + td_ddl>: + create_databases: ["${output_database}"] + ++load_datasets: + ipynb>: + notebook: ml_datasets + output_database: ml_datasets + datasets: nba + ++nba_only_qtable: + ipynb>: + notebook: NBA + train_table: ml_datasets.nba_train + # optional + export_q_table: ${output_database}.rl_qtable_${session_id} + export_state_action: ${output_database}.rl_state_action_${session_id} + ++nba_with_eval: + ipynb>: + notebook: NBA + train_table: ml_datasets.nba_train + test_table: ml_datasets.nba_test + budget: 10000 + value_per_cv: 100 + # optional + # export_q_table: ${output_database}.rl_qtable_${session_id} + export_channel_ratio: ${output_database}.rl_channel_ratio_${session_id} + export_predictions: ${output_database}.rl_predictions_${session_id} + export_model_performance: 
${output_database}.rl_model_performance_${session_id} + ignore_actions: client_domain_organic_visit, organic_search + action_cost: | + { + "display": 2, + "social-social": 1.4, + "social": 2, + "social-paid": 5, + "organic_search": 1, + "emai": 3.2, + "cpc": 3, + "referral": 2, + "linkedin": 3, + "search-paid": 2, + "twitter": 1 + } \ No newline at end of file diff --git a/machine-learning-box/automl/network_analysis.dig b/machine-learning-box/automl/network_analysis.dig new file mode 100644 index 00000000..c214264f --- /dev/null +++ b/machine-learning-box/automl/network_analysis.dig @@ -0,0 +1,10 @@ ++load_datasets: + ipynb>: + notebook: ml_datasets + output_database: ml_datasets + datasets: transition_matrix + ++network_analysis: + ipynb>: + notebook: network_analysis + input_table: ml_datasets.transition_matrix From 8d474201b3ec8bf71b36daaf9d52456f139ad75e Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 7 Feb 2023 13:28:09 +0900 Subject: [PATCH 18/47] Added timeseries forecasting example workflow --- machine-learning-box/automl/ts_forecast.dig | 32 +++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 machine-learning-box/automl/ts_forecast.dig diff --git a/machine-learning-box/automl/ts_forecast.dig b/machine-learning-box/automl/ts_forecast.dig new file mode 100644 index 00000000..d76f433b --- /dev/null +++ b/machine-learning-box/automl/ts_forecast.dig @@ -0,0 +1,32 @@ +#timezone: Asia/Tokyo +#timezone: PST + +_export: + !include : config/params.yaml + td: + engine: presto + database: sample_datasets # dummy to avoid error on create_databases + ++create_db_tbl_if_not_exists: + td_ddl>: + create_databases: ["ml_datasets", "ml_test"] + ++load_datasets: + ipynb>: + docker: + task_mem: 64g + notebook: ml_datasets + output_database: ml_datasets + datasets: ts_airline + ++run_ts_forecast: + ipynb>: + docker: + task_mem: 256g # 64g/128g/256g/384g/512g + notebook: ts_forecast + train_table: ml_datasets.ts_airline + tstamp_column: period + 
target_column: number_of_airline_passengers + forecast_length: 30 + output_table: ml_test.ts_airline_predicted + From 612fe5eb7666978b6ab1e697ce827dd1515c2b8e Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 7 Feb 2023 18:26:13 +0900 Subject: [PATCH 19/47] Set default time_limit --- machine-learning-box/automl/ts_forecast.dig | 1 + 1 file changed, 1 insertion(+) diff --git a/machine-learning-box/automl/ts_forecast.dig b/machine-learning-box/automl/ts_forecast.dig index d76f433b..d065c6dc 100644 --- a/machine-learning-box/automl/ts_forecast.dig +++ b/machine-learning-box/automl/ts_forecast.dig @@ -29,4 +29,5 @@ _export: target_column: number_of_airline_passengers forecast_length: 30 output_table: ml_test.ts_airline_predicted + time_limit: 10 * 60 # 10 min by the default From cc32839f366b6d2073e1548c39587d2d3163c001 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 9 Feb 2023 12:23:13 +0900 Subject: [PATCH 20/47] Add Shapley workflow --- machine-learning-box/automl/ml_experiment.dig | 10 ++++------ .../automl/queries/track_experiment.sql | 6 +++--- machine-learning-box/automl/shapley.dig | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+), 9 deletions(-) create mode 100644 machine-learning-box/automl/shapley.dig diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index 14d45fe5..1680dab6 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -11,6 +11,8 @@ _export: +gluon_train: ml_train>: + docker: + task_mem: 128g # 64g/128g/256g/384g/512g notebook: gluon_train model_name: gluon_model_${session_id} input_table: ${input_database}.${train_data_table} @@ -19,9 +21,6 @@ _export: export_leaderboard: ${output_database}.leaderboard_${train_data_table} export_feature_importance: ${output_database}.feature_importance_${train_data_table} -+print_train_result: - echo>: "Executed training and built gluon_model_${session_id}: 
${automl.last_executed_notebook}.ipynb" - +track_experiment: td>: queries/track_experiment.sql insert_into: ${output_database}.automl_experiments @@ -35,10 +34,9 @@ _export: +gluon_predict: ml_predict>: + docker: + task_mem: 64g # 64g/128g/256g/384g/512g notebook: gluon_predict model_name: gluon_model_${session_id} input_table: ${input_database}.${test_data_table} output_table: ${output_database}.predicted_${test_data_table}_${session_id} - -+print_predict_result: - echo>: "Run prediction and resulted to ${output_database}.predicted_${test_data_table}_${session_id}: ${automl.last_executed_notebook}.ipynb" \ No newline at end of file diff --git a/machine-learning-box/automl/queries/track_experiment.sql b/machine-learning-box/automl/queries/track_experiment.sql index d781a9f4..0fe30327 100644 --- a/machine-learning-box/automl/queries/track_experiment.sql +++ b/machine-learning-box/automl/queries/track_experiment.sql @@ -4,6 +4,6 @@ select '${session_time}' as session_time, '${user_id}' as user_id, '${user_email}' as user_email, - '${model_name}' as model_name, - '${last_executed_notebook}.ipynb' as ipynb_url, - '${last_executed_notebook}.html' as html_url + '${model_name}' as model_name, + '${last_executed_notebook}' as notebook_url + diff --git a/machine-learning-box/automl/shapley.dig b/machine-learning-box/automl/shapley.dig new file mode 100644 index 00000000..da8c73f1 --- /dev/null +++ b/machine-learning-box/automl/shapley.dig @@ -0,0 +1,19 @@ +timezone: Asia/Tokyo +#timezone: PST + +_export: + !include : config/params.yaml + td: + engine: presto + database: ${output_database} + ++run_ml_experiment_demo: + call>: ml_experiment_demo.dig + ++explain_predictions_by_shap: + ipynb>: + docker: + task_mem: 128g # 64g/128g/256g/384g/512g + notebook: shapley + model_name: gluon_model_${session_id} # model used for prediction + input_table: ${input_database}.gluon_test # test data used for prediction From eae700209af1f908fbdd976fad63587be8318f4e Mon Sep 17 00:00:00 2001 
From: Makoto Yui Date: Fri, 10 Feb 2023 15:39:46 +0900 Subject: [PATCH 21/47] Add experimental MTA workflow --- machine-learning-box/automl/mta.dig | 40 +++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 machine-learning-box/automl/mta.dig diff --git a/machine-learning-box/automl/mta.dig b/machine-learning-box/automl/mta.dig new file mode 100644 index 00000000..1601f586 --- /dev/null +++ b/machine-learning-box/automl/mta.dig @@ -0,0 +1,40 @@ +#timezone: Asia/Tokyo +#timezone: PST + +_export: + !include : config/params.yaml + td: + engine: presto + database: sample_datasets # dummy to avoid error on create_databases + output_db: ml_test + ++create_db_tbl_if_not_exists: + td_ddl>: + create_databases: ["ml_datasets", "${output_db}"] + ++load_datasets: + ipynb>: + docker: + task_mem: 64g + notebook: ml_datasets + output_database: ml_datasets + datasets: mta + ++run_mta: + ipynb>: + branch: ATML-18_mta + docker: + task_mem: 128g # 64g/128g/256g/384g/512g + notebook: MTA + # required param + input_table: ml_datasets.mta + # optional param + tstamp_column: tstamp + user_column: user + channel_column: channel + conversion_column: conversion + ignore_channels: Facebook + overwrite_channel: Direct + export_channel_interactions: ${output_db}.channel_interactions + export_shapley_attributions: ${output_db}.shapley_attributions + export_attributed_conversions: ${output_db}.attributed_conversions From 74892f472aa5e06223ce3d0542945b4887916133 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 16 Feb 2023 11:06:04 +0900 Subject: [PATCH 22/47] Added a new option --- machine-learning-box/automl/mta.dig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/machine-learning-box/automl/mta.dig b/machine-learning-box/automl/mta.dig index 1601f586..358b78fb 100644 --- a/machine-learning-box/automl/mta.dig +++ b/machine-learning-box/automl/mta.dig @@ -22,7 +22,6 @@ _export: +run_mta: ipynb>: - branch: ATML-18_mta docker: task_mem: 128g # 
64g/128g/256g/384g/512g notebook: MTA @@ -33,6 +32,8 @@ _export: user_column: user channel_column: channel conversion_column: conversion + # optional columns (usually not needed) + analyze_topk_channels: 50 ignore_channels: Facebook overwrite_channel: Direct export_channel_interactions: ${output_db}.channel_interactions From db9a098a4b0ffb92328807d56d03e9baace58904 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 18 May 2023 11:47:20 +0900 Subject: [PATCH 23/47] Added shared_model option --- machine-learning-box/automl/ml_experiment.dig | 2 ++ machine-learning-box/automl/queries/track_experiment.sql | 1 + 2 files changed, 3 insertions(+) diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index 1680dab6..f414a063 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -18,6 +18,7 @@ _export: input_table: ${input_database}.${train_data_table} target_column: ${target_column} time_limit: ${fit_time_limit} + share_model: true export_leaderboard: ${output_database}.leaderboard_${train_data_table} export_feature_importance: ${output_database}.feature_importance_${train_data_table} @@ -28,6 +29,7 @@ _export: user_id: ${automl.last_executed_user_id} user_email: ${automl.last_executed_user_email} model_name: gluon_model_${session_id} + shared_mdoel: ${automl.shared_model} task_attempt_id: ${attempt_id} session_time: ${session_local_time} engine: presto diff --git a/machine-learning-box/automl/queries/track_experiment.sql b/machine-learning-box/automl/queries/track_experiment.sql index 0fe30327..0d8b3d9d 100644 --- a/machine-learning-box/automl/queries/track_experiment.sql +++ b/machine-learning-box/automl/queries/track_experiment.sql @@ -5,5 +5,6 @@ select '${user_id}' as user_id, '${user_email}' as user_email, '${model_name}' as model_name, + '${shared_mdoel} as shared_mdoel, '${last_executed_notebook}' as notebook_url From 
6261a344b85e6307d2cd65692ae2ff3e96e7bb6b Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 18 May 2023 14:40:14 +0900 Subject: [PATCH 24/47] Added missing ' --- machine-learning-box/automl/queries/track_experiment.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/automl/queries/track_experiment.sql b/machine-learning-box/automl/queries/track_experiment.sql index 0d8b3d9d..a9dc9522 100644 --- a/machine-learning-box/automl/queries/track_experiment.sql +++ b/machine-learning-box/automl/queries/track_experiment.sql @@ -5,6 +5,6 @@ select '${user_id}' as user_id, '${user_email}' as user_email, '${model_name}' as model_name, - '${shared_mdoel} as shared_mdoel, + '${shared_mdoel}' as shared_mdoel, '${last_executed_notebook}' as notebook_url From dcd73ad40933276871a53857a620148516b9885f Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 18 May 2023 15:21:44 +0900 Subject: [PATCH 25/47] Revised to record AUC --- machine-learning-box/automl/ml_experiment.dig | 19 +++++++++++++++++-- machine-learning-box/automl/queries/auc.sql | 8 ++++++++ .../automl/queries/record_evaluation.sql | 5 +++++ 3 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 machine-learning-box/automl/queries/auc.sql create mode 100644 machine-learning-box/automl/queries/record_evaluation.sql diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index f414a063..99cac771 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -9,7 +9,7 @@ _export: create_databases: ["${output_database}"] create_tables: ["${expr_tracking_table}"] -+gluon_train: ++train: ml_train>: docker: task_mem: 128g # 64g/128g/256g/384g/512g @@ -34,7 +34,8 @@ _export: session_time: ${session_local_time} engine: presto -+gluon_predict: +# Note: If input_table contains target labels, ml_predict shows evaluation results ++predict: ml_predict>: docker: task_mem: 64g # 
64g/128g/256g/384g/512g @@ -42,3 +43,17 @@ _export: model_name: gluon_model_${session_id} input_table: ${input_database}.${test_data_table} output_table: ${output_database}.predicted_${test_data_table}_${session_id} + ++evaluation: + td>: queries/auc.sql + table: ${output_database}.predicted_${test_data_table}_${session_id} + target_column: y + store_last_results: true + ++record_evaluation: + td>: queries/auc.sql + insert_into: ${output_database}.automl_eval_results + engine: presto + model_name: gluon_model_${session_id} + session_time: ${session_local_time} + auc: ${td.last_results.auc} diff --git a/machine-learning-box/automl/queries/auc.sql b/machine-learning-box/automl/queries/auc.sql new file mode 100644 index 00000000..ee64cd39 --- /dev/null +++ b/machine-learning-box/automl/queries/auc.sql @@ -0,0 +1,8 @@ +-- DIGDAG_INSERT_LINE +select + auc(prob, label) as auc +from ( + select predicted_proba as prob, ${target_column} as label + from ${table} + ORDER BY prob DESC +) t diff --git a/machine-learning-box/automl/queries/record_evaluation.sql b/machine-learning-box/automl/queries/record_evaluation.sql new file mode 100644 index 00000000..e08b6916 --- /dev/null +++ b/machine-learning-box/automl/queries/record_evaluation.sql @@ -0,0 +1,5 @@ +-- DIGDAG_INSERT_LINE +select + '${session_time}' as session_time, + '${model_name}' as model_name, + '${auc}' as auroc From e8f28e5b83f5b126610749c6be1aa216e84814f3 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 18 May 2023 15:35:52 +0900 Subject: [PATCH 26/47] Fixed y is missing --- machine-learning-box/automl/ml_experiment.dig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index 99cac771..b8b228a3 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -47,7 +47,7 @@ _export: +evaluation: td>: queries/auc.sql table: 
${output_database}.predicted_${test_data_table}_${session_id} - target_column: y + target_column: ${target_column} store_last_results: true +record_evaluation: From a5ffeaa30c2e0bd5152380d32da4306d04d78857 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 18 May 2023 16:00:56 +0900 Subject: [PATCH 27/47] Fixed a bug --- machine-learning-box/automl/config/params.yaml | 2 -- machine-learning-box/automl/ml_experiment.dig | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/machine-learning-box/automl/config/params.yaml b/machine-learning-box/automl/config/params.yaml index 19fedd79..fcf30438 100644 --- a/machine-learning-box/automl/config/params.yaml +++ b/machine-learning-box/automl/config/params.yaml @@ -5,6 +5,4 @@ train_data_table: gluon_train target_column: class test_data_table: gluon_test -expr_tracking_table: automl_experiments - fit_time_limit: 60 * 3 # fit timeout in sec. 3 min just for demo. Default: 60 * 60 (1hr). \ No newline at end of file diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index b8b228a3..040c6540 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -7,7 +7,7 @@ _export: +create_db_tbl_if_not_exists: td_ddl>: create_databases: ["${output_database}"] - create_tables: ["${expr_tracking_table}"] + create_tables: ["automl_experiments", "automl_eval_results"] +train: ml_train>: @@ -49,9 +49,10 @@ _export: table: ${output_database}.predicted_${test_data_table}_${session_id} target_column: ${target_column} store_last_results: true + engine: hive +record_evaluation: - td>: queries/auc.sql + td>: queries/record_evaluation.sql insert_into: ${output_database}.automl_eval_results engine: presto model_name: gluon_model_${session_id} From 6d52e9bb9c211d73275588c588b877857fe9fbb2 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 18 May 2023 16:17:54 +0900 Subject: [PATCH 28/47] Fixed a bug --- 
machine-learning-box/automl/ml_experiment.dig | 1 + machine-learning-box/automl/queries/auc.sql | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index 040c6540..bb8e8701 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -48,6 +48,7 @@ _export: td>: queries/auc.sql table: ${output_database}.predicted_${test_data_table}_${session_id} target_column: ${target_column} + positive_class: ' >50K' store_last_results: true engine: hive diff --git a/machine-learning-box/automl/queries/auc.sql b/machine-learning-box/automl/queries/auc.sql index ee64cd39..06d35e75 100644 --- a/machine-learning-box/automl/queries/auc.sql +++ b/machine-learning-box/automl/queries/auc.sql @@ -1,8 +1,7 @@ --- DIGDAG_INSERT_LINE select auc(prob, label) as auc from ( - select predicted_proba as prob, ${target_column} as label + select predicted_proba as prob, if(cast(${target_column} as string)=="${positive_class}", 1, 0) as label from ${table} ORDER BY prob DESC ) t From e95cba492d7b5418ec630ab4c6a0e6bdd395ce50 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 25 May 2023 18:47:23 +0900 Subject: [PATCH 29/47] Added vehicle coupon workflow to demonstrate adding an attribute table to audience --- .../automl/queries/assign_rowid.sql | 2 + .../automl/scripts/audience.py | 246 ++++++++++++++++++ .../automl/vehicle_coupon.dig | 66 +++++ 3 files changed, 314 insertions(+) create mode 100644 machine-learning-box/automl/queries/assign_rowid.sql create mode 100644 machine-learning-box/automl/scripts/audience.py create mode 100644 machine-learning-box/automl/vehicle_coupon.dig diff --git a/machine-learning-box/automl/queries/assign_rowid.sql b/machine-learning-box/automl/queries/assign_rowid.sql new file mode 100644 index 00000000..a07119b3 --- /dev/null +++ b/machine-learning-box/automl/queries/assign_rowid.sql @@ -0,0 +1,2 @@ +-- 
DIGDAG_INSERT_LINE +select rownum() as ${rowid_column}, * from ${table} diff --git a/machine-learning-box/automl/scripts/audience.py b/machine-learning-box/automl/scripts/audience.py new file mode 100644 index 00000000..80e214b7 --- /dev/null +++ b/machine-learning-box/automl/scripts/audience.py @@ -0,0 +1,246 @@ +__all__ = ['CdpAudience'] + +import sys, os +import requests +import json +import pytd +import re +from typing import Tuple + +from requests.models import Response +from requests.packages.urllib3.util.retry import Retry +from requests.adapters import HTTPAdapter +from requests import Session + + +class CdpApiClient: + def __init__(self, endpoint, headers: dict) -> None: + retry_strategy = Retry( + total=3, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2 + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + s = Session() + s.headers = headers + s.mount("http://", adapter) + s.mount("https://", adapter) + self.endpoint = f"https://{endpoint}" + self.client: Session = s + + def get(self, path, **kwargs) -> Response: + return self.client.get(url=self.endpoint+path, **kwargs) + + def put(self, path: str, data=None, **kwargs) -> Response: + return self.client.put(url=self.endpoint+path, data=data, **kwargs) + + def post(self, path: str, data=None, json=None, **kwargs) -> Response: + return self.client.post(url=self.endpoint+path, data=data, json=json, **kwargs) + + +def to_boolean(o) -> bool: + if o == None: + return False + s = str(o) + + try: + from distutils.util import strtobool + return bool(strtobool(s)) + except ValueError as e: + return False + + +def validate_db_resource_name(name: str) -> str: + ''' + Validate DB_NAME or TABLE_NAME + ''' + # https://docs.treasuredata.com/display/public/PD/Naming+Requirements+and+Restrictions+for+Treasure+Data+Entities + TD_DB_RESOURCE_REGEX = "[a-z0-9_]+" + assert re.fullmatch(rf"^{TD_DB_RESOURCE_REGEX}$", name) is not None, f"Invalid DB resource name: {name}" + return name + + +def parse_table(table: 
str) -> Tuple[str, str]: + ''' + Parse DB_NAME.TABLE_NAME to DB_NAME, TABLE_NAME + ''' + assert table.count(".") == 1, f"Invalid table name {table}, DB_NAME.TABLE_NAME is expected." + database, table = table.split(".") + validate_db_resource_name(database) + validate_db_resource_name(table) + return database, table + + +def resolve_type(table, column_name: str): + # workaround for ValueError: not enough values to unpack (expected 3, got 2) + schema = [c if len(c) == 3 else [c[0], c[1], ""] for c in table.schema] + # column_name:str, column_type:str, alias:str + for (c_name, c_type, _) in schema: + if c_name == column_name: + # Note: Only string, number, timestamp, string_array, or number_array is accepted for attr_type + # https://github.com/treasure-data/td-cdp-api/blob/master/app/models/audience_attribute.rb#L9 + # https://docs.treasuredata.com/display/PD/Using+TD+CLI+to+Annotate+Schema+-+Legacy + if c_type in ['int', 'long', 'double', 'float']: + return 'number' + else: + return 'string' + raise KeyError(f"column {column_name} not found in {table.schema}") + + +class CdpAudience: + ''' + Usage: + cdp = CdpAudience() + cdp.add_attribute(audience_name=audience_name, attr_db=attr_db, attr_table=attr_table, attr_column=attr_column, join_key=join_key, foreign_key=foreign_key, replace_attr_if_exists=True) + ''' + + def __init__(self): + TD_API_KEY = os.environ["TD_API_KEY"] + TD_ENDPOINT = os.environ["TD_API_SERVER"] + + CDP_ENDPOINT = TD_ENDPOINT.replace('api', 'api-cdp') + HEADERRS = {'Authorization': f'TD1 {TD_API_KEY}', 'Content-Type': 'application/json'} + self.cdp_api = CdpApiClient(endpoint=CDP_ENDPOINT, headers=HEADERRS) + self.td_api = pytd.Client(retry_post_requests=True).api_client + + def add_attribute( + self, *, audience_id: str=None, audience_name: str=None, attr_db: str=None, attr_table: str, attr_column: str, join_key: str, foreign_key: str, + attr_alias: str=None, attr_group: str="AutoML", rerun_master_segment: bool=True, replace_attr_if_exists: 
bool=False, + **kwargs + ): + if attr_alias is None: + attr_alias = attr_column + + if attr_db is None: + attr_db, attr_table = parse_table(attr_table) + + if audience_id is None: + assert audience_name is not None, "Either audience_id or audience_name argument is required" + audience_id = self.get_parent_segment_id(audience_name) + + table = self.td_api.table(attr_db, attr_table) + attr_type = resolve_type(table, attr_column) + + res = self.cdp_api.put(f"/audiences/{audience_id}") + if not res.ok: + raise RuntimeError(res.text) + audience = res.json() + attributes = audience['attributes'] if 'attributes' in audience else [] + + new_attr = { + 'audienceId': audience_id, # ID of Master Segment for this attribute + 'name': attr_column, # Column name to be defined on Master Segment + 'type': attr_type, # Type of the column + 'parentDatabaseName': attr_db, # Database name of the attribute table + 'parentTableName': attr_table, # Table name of the attribute table + 'parentColumn': attr_column, # Column name of the attribute table which is imported into customer table + 'parentKey': join_key, # Join key of the attribute table + 'foreignKey': foreign_key, # Foreign key of the master table + 'groupingName': attr_group, # Group name of the attribute + } + + append_attr = False + for i, attr in enumerate(attributes): + if 'name' in attr and attr['name'] == attr_column: + if replace_attr_if_exists: + attributes[i] = new_attr + append_attr = False + print(f"⚠ Repalce '{attr_column}' in Master Segment {audience_id}", file=sys.stderr) + break + else: + print(f"⚠ skip adding an attribute because the attribute column '{attr_column}' already exists", file=sys.stderr) + return + if append_attr == True: + attributes.append(new_attr) + + res = self.cdp_api.put(f"/audiences/{audience_id}", json=audience) + if res.ok: + print(f"ⓘ Successfully added an attribute table '{attr_table}' to master segment {audience_id}", file=sys.stderr) + else: + try: + 'not unique' in res.json()['base'][0] 
+ print(f"⚠ Attribute '{attr_column}' already exists in Parent Segment and thus skip adding an attribue.", file=sys.stderr) + return + except: + print(f"failed to PUT /audiences/{audience_id}: {new_attr}") + raise RuntimeError(f"{res.status_code} error on PUT /audiences/{audience_id}: {res.json()}") + + if rerun_master_segment: + res = self.cdp_api.post(f"/audiences/{audience_id}/run") + if res.ok: + print(f"ⓘ Successfully triggered rerun of Master Segment: {audience_id}", file=sys.stderr) + else: + raise RuntimeError(f"{res.status_code} error on POST /audiences/{audience_id}/run: {res.json()}") + + + def get_parent_segment_id(self, name: str) -> str: + ''' + Retrive parent segment ID if exists. Otherwise, return None + ''' + + assert name is not None + + # Get all the audience configurations + res = self.cdp_api.get('/audiences') + if not res.ok: + raise RuntimeError(res.text) + audiences = json.loads(res.text) + + for audience in audiences: + if 'name' in audience and name == audience['name']: + if 'id' in audience: + return audience['id'] + + raise ValueError(f"Cannot find parent segment: {name}") + + +def parse_arguments(kwargs: dict) -> dict: + assert os.environ.get('TD_API_KEY') is not None, "TD_API_KEY ENV variable is required" + assert os.environ.get('TD_API_SERVER') is not None, "TD_API_SERVER ENV variable is required" + + ret = {} + + audience = kwargs.pop('audience', None) + assert audience is not None, "audience argument is required" + audience_id = audience.pop('id', None) + if audience_id is not None: ret['audience_id'] = audience_id + audience_name = audience.pop('name', None) + if audience_name is not None: ret['audience_name'] = audience_name + foreign_key = audience.pop('foreign_key', None) + assert foreign_key is not None, "foreign_key argument is required" + ret['foreign_key'] = foreign_key + ret['rerun_master_segment'] = to_boolean(audience.pop('rerun', 'False')) + + attribute = kwargs.pop('attribute', None) + assert attribute is not None, 
"attribute argument is required" + attr_table = attribute.pop('table', None) + assert attr_table is not None, "attr_table argument is required" + ret['attr_table'] = attr_table + attr_column = attribute.pop('attr_column', None) + assert attr_column is not None, "attr_column argument is required" + ret['attr_column'] = attr_column + join_key = attribute.pop('join_key', None) + assert join_key is not None, "join_key argument is required" + ret['join_key'] = join_key + attr_db = attribute.pop('database', None) + if attr_db is not None: ret['attr_db'] = attr_db + attr_alias = attribute.pop('attr_alias', None) + if attr_alias is not None: ret['attr_alias'] = attr_alias + attr_group = attribute.pop('attr_group', "AutoML") + ret['attr_group'] = attr_group + replace_attr_if_exists = to_boolean(attribute.pop('replace_if_exists', 'False')) + ret['replace_attr_if_exists'] = replace_attr_if_exists + + return ret + + +def add_attribute(**kwargs): + import faulthandler + faulthandler.enable() + + try: + params = parse_arguments(kwargs) + cdp = CdpAudience() + cdp.add_attribute(**params) + finally: + # force flush + sys.stdout.flush() + sys.stderr.flush() diff --git a/machine-learning-box/automl/vehicle_coupon.dig b/machine-learning-box/automl/vehicle_coupon.dig new file mode 100644 index 00000000..c450de8c --- /dev/null +++ b/machine-learning-box/automl/vehicle_coupon.dig @@ -0,0 +1,66 @@ +_export: + output_database: ml_test + audience_name: "vehicle coupon test" + foreign_key: userid + td: + engine: presto + database: ${output_database} + ++create_db_tbl_if_not_exists: + td_ddl>: + create_databases: ["ml_test"] + ++load_datasets: + ipynb>: + docker: + task_mem: 64g # 64g/128g/256g/384g/512g + notebook: ml_datasets + output_database: ml_datasets + datasets: vehicle_coupon + ++train: + ml_train>: + docker: + task_mem: 256g # 64g/128g/256g/384g/512g + notebook: gluon_train + model_name: gluon_model_${session_id} + input_table: ml_datasets.vehicle_coupon_train + target_column: y + 
time_limit: 3 * 60 # 3 min + ++prepare_input: + td>: queries/assign_rowid.sql + table: ml_datasets.vehicle_coupon_test + rowid_column: userid + create_table: ml_datasets.vehicle_coupon_test_with_rowid + engine: hive + ++predict: + ml_predict>: + docker: + task_mem: 128g # 64g/128g/256g/384g/512g + notebook: gluon_predict + model_name: gluon_model_${session_id} + input_table: ml_datasets.vehicle_coupon_test_with_rowid + output_table: ${output_database}.predicted__${session_id} + ++add_attribute: + py>: scripts.audience.add_attribute + audience: + name: ${audience_name} # segment name or segment id + # id: 1111 + foreign_key: ${foreign_key} + ### optional + rerun: true + attribute: + table: ${output_database}.predicted__${session_id} + attr_column: "predicted_proba" + join_key: "userid" + ### optional + attr_group: "AutoML" + replace_if_exists: true + docker: + image: "digdag/digdag-python:3.9" + _env: + TD_API_KEY: ${secret:td.apikey} + TD_API_SERVER: "api.treasuredata.com" From f21dd81a1f0041b890daa6bc8e4739c276873413 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Sat, 27 May 2023 00:52:55 +0900 Subject: [PATCH 30/47] Updated audience script --- .../automl/scripts/audience.py | 287 ++++++++++++------ 1 file changed, 197 insertions(+), 90 deletions(-) diff --git a/machine-learning-box/automl/scripts/audience.py b/machine-learning-box/automl/scripts/audience.py index 80e214b7..9c09e878 100644 --- a/machine-learning-box/automl/scripts/audience.py +++ b/machine-learning-box/automl/scripts/audience.py @@ -1,11 +1,14 @@ __all__ = ['CdpAudience'] import sys, os -import requests +import requests import json import pytd import re -from typing import Tuple +import faulthandler +import warnings + +from typing import List, Tuple from requests.models import Response from requests.packages.urllib3.util.retry import Retry @@ -13,6 +16,28 @@ from requests import Session +def deprecated(func): + """This is a decorator which can be used to mark functions + as deprecated. 
It will result in a warning being emitted + when the function is used.""" + import functools + + @functools.wraps(func) + def new_func(*args, **kwargs): + warnings.simplefilter('always', DeprecationWarning) # turn off filter + warnings.warn("Call to deprecated function {}.".format(func.__name__), + category=DeprecationWarning, + stacklevel=2) + warnings.simplefilter('default', DeprecationWarning) # reset filter + return func(*args, **kwargs) + return new_func + +class ApiRequestError(Exception): + def __init__(self, response: requests.Response, msg: str=None): + if msg is None: + msg = f"{response.status_code} ERROR\n{response.text}" + super().__init__(msg) + class CdpApiClient: def __init__(self, endpoint, headers: dict) -> None: retry_strategy = Retry( @@ -20,7 +45,7 @@ def __init__(self, endpoint, headers: dict) -> None: ) adapter = HTTPAdapter(max_retries=retry_strategy) s = Session() - s.headers = headers + s.headers = headers s.mount("http://", adapter) s.mount("https://", adapter) self.endpoint = f"https://{endpoint}" @@ -68,7 +93,7 @@ def parse_table(table: str) -> Tuple[str, str]: validate_db_resource_name(table) return database, table - +@deprecated def resolve_type(table, column_name: str): # workaround for ValueError: not enough values to unpack (expected 3, got 2) schema = [c if len(c) == 3 else [c[0], c[1], ""] for c in table.schema] @@ -87,7 +112,7 @@ def resolve_type(table, column_name: str): class CdpAudience: ''' - Usage: + Usage: cdp = CdpAudience() cdp.add_attribute(audience_name=audience_name, attr_db=attr_db, attr_table=attr_table, attr_column=attr_column, join_key=join_key, foreign_key=foreign_key, replace_attr_if_exists=True) ''' @@ -101,13 +126,38 @@ def __init__(self): self.cdp_api = CdpApiClient(endpoint=CDP_ENDPOINT, headers=HEADERRS) self.td_api = pytd.Client(retry_post_requests=True).api_client + def create_master_segment(self, *, name: str, database: str, table: str, description: str=None, run:bool=False): + payload = {} + 
payload['name'] = name + payload['description'] = "" if description is None else description + payload['master'] = {} + payload['master']['parentDatabaseName'] = database + payload['master']['parentTableName'] = table + + res = self.cdp_api.post('/audiences', data=json.dumps(payload)) + if not res.ok: + raise ApiRequestError(res) + + audience = json.loads(res.text) + audience_id = audience['id'] + print(f"ⓘ Successfully created Master Segment '{name}': {audience_id}", file=sys.stderr) + + if run: + res = self.cdp_api.post(f"/audiences/{audience_id}/run") + print(f"ⓘ Run Master Segment {name}", file=sys.stderr) + + return audience_id + def add_attribute( - self, *, audience_id: str=None, audience_name: str=None, attr_db: str=None, attr_table: str, attr_column: str, join_key: str, foreign_key: str, - attr_alias: str=None, attr_group: str="AutoML", rerun_master_segment: bool=True, replace_attr_if_exists: bool=False, + self, *, audience_id: str=None, audience_name: str=None, attr_db: str=None, attr_table: str, attr_columns: List[str], join_key: str, foreign_key: str, + attr_aliases: List[str]=None, attr_group: str="AutoML", rerun_master_segment: bool=True, replace_attr_if_exists: bool=False, **kwargs ): - if attr_alias is None: - attr_alias = attr_column + assert len(attr_columns) >= 1, "At least one element in attr_columns but it was empty" + if attr_aliases is None: + attr_aliases = attr_columns + else: + assert len(attr_aliases) == len(attr_columns), f"len(attr_aliases) {len(attr_aliases)} is expected to be equals to len(attr_columns) {len(attr_columns)}" if attr_db is None: attr_db, attr_table = parse_table(attr_table) @@ -116,126 +166,143 @@ def add_attribute( assert audience_name is not None, "Either audience_id or audience_name argument is required" audience_id = self.get_parent_segment_id(audience_name) - table = self.td_api.table(attr_db, attr_table) - attr_type = resolve_type(table, attr_column) + # table = self.td_api.table(attr_db, attr_table) + # attr_type 
= resolve_type(table, "predicted_proba") res = self.cdp_api.put(f"/audiences/{audience_id}") if not res.ok: - raise RuntimeError(res.text) + raise ApiRequestError(res) audience = res.json() - attributes = audience['attributes'] if 'attributes' in audience else [] - new_attr = { - 'audienceId': audience_id, # ID of Master Segment for this attribute - 'name': attr_column, # Column name to be defined on Master Segment - 'type': attr_type, # Type of the column - 'parentDatabaseName': attr_db, # Database name of the attribute table - 'parentTableName': attr_table, # Table name of the attribute table - 'parentColumn': attr_column, # Column name of the attribute table which is imported into customer table - 'parentKey': join_key, # Join key of the attribute table - 'foreignKey': foreign_key, # Foreign key of the master table - 'groupingName': attr_group, # Group name of the attribute - } - - append_attr = False - for i, attr in enumerate(attributes): - if 'name' in attr and attr['name'] == attr_column: + attributes = audience['attributes'] if 'attributes' in audience else [] + existing_attr_names = [attr['name'] for attr in attributes] + + for i, attr_column in enumerate(attr_columns): + attr_alias = attr_aliases[i] + + new_attr = { + #'audienceId': audience_id, # ID of Master Segment for this attribute + 'name': attr_alias, # Column name to be defined on Master Segment + #'type': attr_type, # Type of the column + 'parentDatabaseName': attr_db, # Database name of the attribute table + 'parentTableName': attr_table, # Table name of the attribute table + 'parentColumn': attr_column, # Column name of the attribute table which is imported into customer table + 'parentKey': join_key, # Join key of the attribute table + 'foreignKey': foreign_key, # Foreign key of the master table + 'groupingName': attr_group, # Group name of the attribute + } + + if attr_alias in existing_attr_names: if replace_attr_if_exists: - attributes[i] = new_attr - append_attr = False - print(f"⚠ Repalce 
'{attr_column}' in Master Segment {audience_id}", file=sys.stderr) - break + attributes[existing_attr_names.index(attr_alias)] = new_attr + print(f"⚠ Replace an attribute column '{attr_alias}' in Master Segment {audience_id}", file=sys.stderr) else: - print(f"⚠ skip adding an attribute because the attribute column '{attr_column}' already exists", file=sys.stderr) - return - if append_attr == True: - attributes.append(new_attr) + print(f"⚠ Skip adding an attribute because the attribute column '{attr_alias}' already exists", file=sys.stderr) + else: + attributes.append(new_attr) + # from IPython.core.debugger import Pdb; Pdb().set_trace() res = self.cdp_api.put(f"/audiences/{audience_id}", json=audience) if res.ok: print(f"ⓘ Successfully added an attribute table '{attr_table}' to master segment {audience_id}", file=sys.stderr) else: - try: + try: 'not unique' in res.json()['base'][0] - print(f"⚠ Attribute '{attr_column}' already exists in Parent Segment and thus skip adding an attribue.", file=sys.stderr) + print(f"⚠ Attribute '{attr_column}' already exists in Parent Segment and thus skip adding an attribue.", file=sys.stderr) return except: print(f"failed to PUT /audiences/{audience_id}: {new_attr}") - raise RuntimeError(f"{res.status_code} error on PUT /audiences/{audience_id}: {res.json()}") + raise ApiRequestError(res, f"{res.status_code} error on PUT /audiences/{audience_id}: {res.json()}") if rerun_master_segment: res = self.cdp_api.post(f"/audiences/{audience_id}/run") if res.ok: - print(f"ⓘ Successfully triggered rerun of Master Segment: {audience_id}", file=sys.stderr) + print(f"ⓘ Successfully triggered rerun of Master Segment: {audience_id}", file=sys.stderr) else: - raise RuntimeError(f"{res.status_code} error on POST /audiences/{audience_id}/run: {res.json()}") + raise ApiRequestError(res, f"{res.status_code} error on POST /audiences/{audience_id}/run: {res.json()}") def get_parent_segment_id(self, name: str) -> str: ''' - Retrive parent segment ID if 
exists. Otherwise, return None + Retrive parent segment ID if exists. ''' - assert name is not None - # Get all the audience configurations + # Note: console-next (v5) uses different endpoints for listing audience + res = self.cdp_api.get('/entities/parent_segments') + if res.ok: + v5_res = res.json() + for audience in v5_res.get('data',{}): + if audience.get('attributes',{}).get('name') == name: + return audience['id'] + + # Fall back to v4 res = self.cdp_api.get('/audiences') if not res.ok: - raise RuntimeError(res.text) - audiences = json.loads(res.text) + raise ApiRequestError(res) + audiences = res.json() for audience in audiences: - if 'name' in audience and name == audience['name']: - if 'id' in audience: - return audience['id'] + if name == audience.get('name'): + return audience['id'] raise ValueError(f"Cannot find parent segment: {name}") -def parse_arguments(kwargs: dict) -> dict: - assert os.environ.get('TD_API_KEY') is not None, "TD_API_KEY ENV variable is required" - assert os.environ.get('TD_API_SERVER') is not None, "TD_API_SERVER ENV variable is required" - - ret = {} - - audience = kwargs.pop('audience', None) - assert audience is not None, "audience argument is required" - audience_id = audience.pop('id', None) - if audience_id is not None: ret['audience_id'] = audience_id - audience_name = audience.pop('name', None) - if audience_name is not None: ret['audience_name'] = audience_name - foreign_key = audience.pop('foreign_key', None) - assert foreign_key is not None, "foreign_key argument is required" - ret['foreign_key'] = foreign_key - ret['rerun_master_segment'] = to_boolean(audience.pop('rerun', 'False')) - - attribute = kwargs.pop('attribute', None) - assert attribute is not None, "attribute argument is required" - attr_table = attribute.pop('table', None) - assert attr_table is not None, "attr_table argument is required" - ret['attr_table'] = attr_table - attr_column = attribute.pop('attr_column', None) - assert attr_column is not None, 
"attr_column argument is required" - ret['attr_column'] = attr_column - join_key = attribute.pop('join_key', None) - assert join_key is not None, "join_key argument is required" - ret['join_key'] = join_key - attr_db = attribute.pop('database', None) - if attr_db is not None: ret['attr_db'] = attr_db - attr_alias = attribute.pop('attr_alias', None) - if attr_alias is not None: ret['attr_alias'] = attr_alias - attr_group = attribute.pop('attr_group', "AutoML") - ret['attr_group'] = attr_group - replace_attr_if_exists = to_boolean(attribute.pop('replace_if_exists', 'False')) - ret['replace_attr_if_exists'] = replace_attr_if_exists - - return ret - - def add_attribute(**kwargs): - import faulthandler faulthandler.enable() + def parse_arguments(kwargs: dict) -> dict: + assert os.environ.get('TD_API_KEY') is not None, "TD_API_KEY ENV variable is required" + assert os.environ.get('TD_API_SERVER') is not None, "TD_API_SERVER ENV variable is required" + + ret = {} + + audience = kwargs.pop('audience', None) + assert audience is not None, "audience argument is required" + audience_id = audience.pop('id', None) + if audience_id is not None: ret['audience_id'] = audience_id + audience_name = audience.pop('name', None) + if audience_name is not None: ret['audience_name'] = audience_name + foreign_key = audience.pop('foreign_key', None) + assert foreign_key is not None, "foreign_key argument is required" + ret['foreign_key'] = foreign_key + ret['rerun_master_segment'] = to_boolean(audience.pop('rerun', 'False')) + + attribute = kwargs.pop('attribute', None) + assert attribute is not None, "attribute argument is required" + attr_table = attribute.pop('table', None) + assert attr_table is not None, "attr_table argument is required" + ret['attr_table'] = attr_table + join_key = attribute.pop('join_key', None) + assert join_key is not None, "join_key argument is required" + ret['join_key'] = join_key + attr_db = attribute.pop('database', None) + if attr_db is not None: 
ret['attr_db'] = attr_db + + attr_columns = attribute.pop('attr_columns', None) + if attr_columns is None: + attr_column = attribute.pop('attr_column', None) + assert attr_column is not None, "Either attr_columns or attr_column is required" + ret['attr_columns'] = [attr_column] + else: + ret['attr_columns'] = [s.strip() for s in attr_columns.split(',')] + + attr_aliases = attribute.pop('attr_aliases', None) + if attr_aliases is None: + attr_alias = attribute.pop('attr_alias', None) + assert attr_alias is not None, "Either attr_aliases or attr_alias is required" + ret['attr_aliases'] = [attr_alias] + else: + ret['attr_aliases'] = [s.strip() for s in attr_aliases.split(',')] + + attr_group = attribute.pop('attr_group', "AutoML") + ret['attr_group'] = attr_group + replace_attr_if_exists = to_boolean(attribute.pop('replace_if_exists', 'False')) + ret['replace_attr_if_exists'] = replace_attr_if_exists + + return ret + try: params = parse_arguments(kwargs) cdp = CdpAudience() @@ -244,3 +311,43 @@ def add_attribute(**kwargs): # force flush sys.stdout.flush() sys.stderr.flush() + + +def create_master_segment(**kwargs): + faulthandler.enable() + + def parse_arguments(kwargs: dict) -> dict: + assert os.environ.get('TD_API_KEY') is not None, "TD_API_KEY ENV variable is required" + assert os.environ.get('TD_API_SERVER') is not None, "TD_API_SERVER ENV variable is required" + + ret = {} + + name = kwargs.pop('name', None) + assert name is not None, "name argument is required" + ret['name'] = name + description = kwargs.pop('description', None) + if description is not None: ret['description'] = description + + master = kwargs.pop('master', None) + assert master is not None, "audience argument is required" + database = master.pop('database', None) + assert database is not None, "master.database argument is required" + ret['database'] = database + table = master.pop('table', None) + assert table is not None, "master.table argument is required" + ret['table'] = table + + ret['run'] 
= to_boolean(kwargs.pop('run', None)) + return ret + + try: + params = parse_arguments(kwargs) + cdp = CdpAudience() + audience_id = cdp.create_master_segment(**params) + + import digdag + digdag.env.store({'audience_id': audience_id}) + finally: + # force flush + sys.stdout.flush() + sys.stderr.flush() From 711718cccb4a49d502eda9fc97ce39ddc427a7af Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Sat, 27 May 2023 00:53:21 +0900 Subject: [PATCH 31/47] Revised a workflow --- .../automl/vehicle_coupon.dig | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/machine-learning-box/automl/vehicle_coupon.dig b/machine-learning-box/automl/vehicle_coupon.dig index c450de8c..a58fb8da 100644 --- a/machine-learning-box/automl/vehicle_coupon.dig +++ b/machine-learning-box/automl/vehicle_coupon.dig @@ -1,7 +1,5 @@ _export: output_database: ml_test - audience_name: "vehicle coupon test" - foreign_key: userid td: engine: presto database: ${output_database} @@ -42,18 +40,32 @@ _export: notebook: gluon_predict model_name: gluon_model_${session_id} input_table: ml_datasets.vehicle_coupon_test_with_rowid - output_table: ${output_database}.predicted__${session_id} + output_table: ${output_database}.predicted_${session_id} + ++create_master_segment: + py>: scripts.audience.create_master_segment + name: vehicle_coupon_${session_id} + # description: xxx + master: + database: ml_datasets + table: vehicle_coupon_test_with_rowid + run: false + docker: + image: "digdag/digdag-python:3.9" + _env: + TD_API_KEY: ${secret:td.apikey} + TD_API_SERVER: "api.treasuredata.com" +add_attribute: py>: scripts.audience.add_attribute audience: - name: ${audience_name} # segment name or segment id + name: vehicle_coupon_${session_id} # id: 1111 - foreign_key: ${foreign_key} + foreign_key: userid ### optional rerun: true attribute: - table: ${output_database}.predicted__${session_id} + table: ${output_database}.predicted_${session_id} attr_column: "predicted_proba" join_key: 
"userid" ### optional From f849caf8beb1027b0023889e8f87fede7e3b971a Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Sat, 27 May 2023 01:08:45 +0900 Subject: [PATCH 32/47] Changed parameters to accept multiple attribute columns --- machine-learning-box/automl/vehicle_coupon.dig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/machine-learning-box/automl/vehicle_coupon.dig b/machine-learning-box/automl/vehicle_coupon.dig index a58fb8da..17084be0 100644 --- a/machine-learning-box/automl/vehicle_coupon.dig +++ b/machine-learning-box/automl/vehicle_coupon.dig @@ -66,9 +66,11 @@ _export: rerun: true attribute: table: ${output_database}.predicted_${session_id} - attr_column: "predicted_proba" + attr_columns: "predicted_proba, y" + # attr_column: predicted_proba join_key: "userid" ### optional + attr_aliases: "predicted_proba, y2" attr_group: "AutoML" replace_if_exists: true docker: From ac28fad8af7994d26610b2a536d57bc3db1919b9 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Sun, 28 May 2023 00:45:26 +0900 Subject: [PATCH 33/47] Added two variations for adding attribute to CDP master segment --- .../automl/vehicle_coupon.dig | 42 +++------- .../vehicle_coupon_custom_script_version.dig | 80 +++++++++++++++++++ 2 files changed, 92 insertions(+), 30 deletions(-) create mode 100644 machine-learning-box/automl/vehicle_coupon_custom_script_version.dig diff --git a/machine-learning-box/automl/vehicle_coupon.dig b/machine-learning-box/automl/vehicle_coupon.dig index 17084be0..2fed4590 100644 --- a/machine-learning-box/automl/vehicle_coupon.dig +++ b/machine-learning-box/automl/vehicle_coupon.dig @@ -18,6 +18,7 @@ _export: +train: ml_train>: + branch: ATML-109_attr docker: task_mem: 256g # 64g/128g/256g/384g/512g notebook: gluon_train @@ -33,15 +34,6 @@ _export: create_table: ml_datasets.vehicle_coupon_test_with_rowid engine: hive -+predict: - ml_predict>: - docker: - task_mem: 128g # 64g/128g/256g/384g/512g - notebook: gluon_predict - model_name: 
gluon_model_${session_id} - input_table: ml_datasets.vehicle_coupon_test_with_rowid - output_table: ${output_database}.predicted_${session_id} - +create_master_segment: py>: scripts.audience.create_master_segment name: vehicle_coupon_${session_id} @@ -56,25 +48,15 @@ _export: TD_API_KEY: ${secret:td.apikey} TD_API_SERVER: "api.treasuredata.com" -+add_attribute: - py>: scripts.audience.add_attribute - audience: - name: vehicle_coupon_${session_id} - # id: 1111 ++predict: + ml_predict>: + branch: ATML-109_attr + docker: + task_mem: 128g # 64g/128g/256g/384g/512g + notebook: gluon_predict + model_name: gluon_model_${session_id} + input_table: ml_datasets.vehicle_coupon_test_with_rowid + rowid_column: userid + output_table: ${output_database}.predicted_${session_id} + audience_name: vehicle_coupon_${session_id} foreign_key: userid - ### optional - rerun: true - attribute: - table: ${output_database}.predicted_${session_id} - attr_columns: "predicted_proba, y" - # attr_column: predicted_proba - join_key: "userid" - ### optional - attr_aliases: "predicted_proba, y2" - attr_group: "AutoML" - replace_if_exists: true - docker: - image: "digdag/digdag-python:3.9" - _env: - TD_API_KEY: ${secret:td.apikey} - TD_API_SERVER: "api.treasuredata.com" diff --git a/machine-learning-box/automl/vehicle_coupon_custom_script_version.dig b/machine-learning-box/automl/vehicle_coupon_custom_script_version.dig new file mode 100644 index 00000000..1db3b0b6 --- /dev/null +++ b/machine-learning-box/automl/vehicle_coupon_custom_script_version.dig @@ -0,0 +1,80 @@ +_export: + output_database: ml_test + td: + engine: presto + database: ${output_database} + ++create_db_tbl_if_not_exists: + td_ddl>: + create_databases: ["ml_test"] + ++load_datasets: + ipynb>: + docker: + task_mem: 64g # 64g/128g/256g/384g/512g + notebook: ml_datasets + output_database: ml_datasets + datasets: vehicle_coupon + ++train: + ml_train>: + docker: + task_mem: 256g # 64g/128g/256g/384g/512g + notebook: gluon_train + 
model_name: gluon_model_${session_id} + input_table: ml_datasets.vehicle_coupon_train + target_column: y + time_limit: 3 * 60 # 3 min + ++prepare_input: + td>: queries/assign_rowid.sql + table: ml_datasets.vehicle_coupon_test + rowid_column: userid + create_table: ml_datasets.vehicle_coupon_test_with_rowid + engine: hive + ++predict: + ml_predict>: + docker: + task_mem: 128g # 64g/128g/256g/384g/512g + notebook: gluon_predict + model_name: gluon_model_${session_id} + input_table: ml_datasets.vehicle_coupon_test_with_rowid + output_table: ${output_database}.predicted_${session_id} + ++create_master_segment: + py>: scripts.audience.create_master_segment + name: vehicle_coupon_${session_id} + # description: xxx + master: + database: ml_datasets + table: vehicle_coupon_test_with_rowid + run: false + docker: + image: "digdag/digdag-python:3.9" + _env: + TD_API_KEY: ${secret:td.apikey} + TD_API_SERVER: "api.treasuredata.com" + ++add_attribute: + py>: scripts.audience.add_attribute + audience: + name: vehicle_coupon_${session_id} + # id: 1111 + foreign_key: userid + ### optional + rerun: true + attribute: + table: ${output_database}.predicted_${session_id} + attr_columns: predicted_proba, y + # attr_column: predicted_proba + join_key: "userid" + ### optional + attr_aliases: predicted_proba, y2 + attr_group: "AutoML" + replace_if_exists: true + docker: + image: "digdag/digdag-python:3.9" + _env: + TD_API_KEY: ${secret:td.apikey} + TD_API_SERVER: "api.treasuredata.com" From 03ac63312e501c14b659f638c5b72ed71938f814 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Sun, 28 May 2023 18:08:22 +0900 Subject: [PATCH 34/47] Added an example to add next_action to CDP master segment --- machine-learning-box/automl/nba_cdp.dig | 59 +++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 machine-learning-box/automl/nba_cdp.dig diff --git a/machine-learning-box/automl/nba_cdp.dig b/machine-learning-box/automl/nba_cdp.dig new file mode 100644 index 00000000..48ee9b0a 
--- /dev/null +++ b/machine-learning-box/automl/nba_cdp.dig @@ -0,0 +1,59 @@ +_export: + !include : config/params.yaml + td: + engine: presto + database: ${output_database} + ++create_db_tbl_if_not_exists: + td_ddl>: + create_databases: ["${output_database}"] + ++load_datasets: + ipynb>: + notebook: ml_datasets + output_database: ml_datasets + datasets: nba + ++create_master_segment: + py>: scripts.audience.create_master_segment + name: nba_${session_id} + description: NBA test audience + master: + database: ml_datasets + table: nba_test + run: false + docker: + image: "digdag/digdag-python:3.9" + _env: + TD_API_KEY: ${secret:td.apikey} + TD_API_SERVER: "api.treasuredata.com" + ++nba_with_eval: + ipynb>: + branch: ATML-109_attr + notebook: NBA + train_table: ml_datasets.nba_train + test_table: ml_datasets.nba_test + budget: 10000 + value_per_cv: 100 + # optional + audience_name: nba_${session_id} + # export_q_table: ${output_database}.rl_qtable_${session_id} + export_channel_ratio: ${output_database}.rl_channel_ratio_${session_id} + export_predictions: ${output_database}.rl_predictions_${session_id} + export_model_performance: ${output_database}.rl_model_performance_${session_id} + ignore_actions: client_domain_organic_visit, organic_search + action_cost: | + { + "display": 2, + "social-social": 1.4, + "social": 2, + "social-paid": 5, + "organic_search": 1, + "emai": 3.2, + "cpc": 3, + "referral": 2, + "linkedin": 3, + "search-paid": 2, + "twitter": 1 + } \ No newline at end of file From cae911e807387132164b0528b15cb068a1ad0cd0 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 30 May 2023 12:21:09 +0900 Subject: [PATCH 35/47] Removed branch setting --- machine-learning-box/automl/nba_cdp.dig | 1 - machine-learning-box/automl/vehicle_coupon.dig | 2 -- 2 files changed, 3 deletions(-) diff --git a/machine-learning-box/automl/nba_cdp.dig b/machine-learning-box/automl/nba_cdp.dig index 48ee9b0a..40602deb 100644 --- a/machine-learning-box/automl/nba_cdp.dig +++ 
b/machine-learning-box/automl/nba_cdp.dig @@ -30,7 +30,6 @@ _export: +nba_with_eval: ipynb>: - branch: ATML-109_attr notebook: NBA train_table: ml_datasets.nba_train test_table: ml_datasets.nba_test diff --git a/machine-learning-box/automl/vehicle_coupon.dig b/machine-learning-box/automl/vehicle_coupon.dig index 2fed4590..a849bc93 100644 --- a/machine-learning-box/automl/vehicle_coupon.dig +++ b/machine-learning-box/automl/vehicle_coupon.dig @@ -18,7 +18,6 @@ _export: +train: ml_train>: - branch: ATML-109_attr docker: task_mem: 256g # 64g/128g/256g/384g/512g notebook: gluon_train @@ -50,7 +49,6 @@ _export: +predict: ml_predict>: - branch: ATML-109_attr docker: task_mem: 128g # 64g/128g/256g/384g/512g notebook: gluon_predict From d93474f7594ea02922f72478756f31c36ad32624 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 15 Jun 2023 16:20:41 +0900 Subject: [PATCH 36/47] Fixed a typo --- machine-learning-box/automl/ml_experiment.dig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index bb8e8701..a861ba2c 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -29,7 +29,7 @@ _export: user_id: ${automl.last_executed_user_id} user_email: ${automl.last_executed_user_email} model_name: gluon_model_${session_id} - shared_mdoel: ${automl.shared_model} + shared_model: ${automl.shared_model} task_attempt_id: ${attempt_id} session_time: ${session_local_time} engine: presto From edb33dc775a09f29c70c9ec08860c9890e3c874c Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 15 Jun 2023 16:25:56 +0900 Subject: [PATCH 37/47] Fixed a typo --- machine-learning-box/automl/queries/track_experiment.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/automl/queries/track_experiment.sql b/machine-learning-box/automl/queries/track_experiment.sql index a9dc9522..7db45323 100644 --- 
a/machine-learning-box/automl/queries/track_experiment.sql +++ b/machine-learning-box/automl/queries/track_experiment.sql @@ -5,6 +5,6 @@ select '${user_id}' as user_id, '${user_email}' as user_email, '${model_name}' as model_name, - '${shared_mdoel}' as shared_mdoel, + '${shared_model}' as shared_model, '${last_executed_notebook}' as notebook_url From 35f343c03a205f064076e786107bfcbc1176554e Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 15 Jun 2023 17:06:34 +0900 Subject: [PATCH 38/47] Removed to record test table name --- machine-learning-box/automl/ml_experiment.dig | 1 + machine-learning-box/automl/queries/record_evaluation.sql | 1 + 2 files changed, 2 insertions(+) diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index a861ba2c..a95bdd24 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -57,5 +57,6 @@ _export: insert_into: ${output_database}.automl_eval_results engine: presto model_name: gluon_model_${session_id} + test_table: ${output_database}.predicted_${test_data_table}_${session_id} session_time: ${session_local_time} auc: ${td.last_results.auc} diff --git a/machine-learning-box/automl/queries/record_evaluation.sql b/machine-learning-box/automl/queries/record_evaluation.sql index e08b6916..88070aec 100644 --- a/machine-learning-box/automl/queries/record_evaluation.sql +++ b/machine-learning-box/automl/queries/record_evaluation.sql @@ -2,4 +2,5 @@ select '${session_time}' as session_time, '${model_name}' as model_name, + '${test_table}' as test_table, '${auc}' as auroc From b24b27dc0f981118bcc7e26692cc5c670b6e806f Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 15 Jun 2023 17:10:57 +0900 Subject: [PATCH 39/47] Fixed test_table value --- machine-learning-box/automl/ml_experiment.dig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/automl/ml_experiment.dig 
b/machine-learning-box/automl/ml_experiment.dig index a95bdd24..eef93c1c 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -57,6 +57,6 @@ _export: insert_into: ${output_database}.automl_eval_results engine: presto model_name: gluon_model_${session_id} - test_table: ${output_database}.predicted_${test_data_table}_${session_id} + test_table: ${input_database}.${test_data_table} session_time: ${session_local_time} auc: ${td.last_results.auc} From 2e80b31c8422b5110f76a416d2ae5fb12958719e Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 15 Jun 2023 17:55:55 +0900 Subject: [PATCH 40/47] Added a drift detection example --- machine-learning-box/automl/config/params.yaml | 4 +++- machine-learning-box/automl/ml_experiment.dig | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/machine-learning-box/automl/config/params.yaml b/machine-learning-box/automl/config/params.yaml index fcf30438..8eb7d609 100644 --- a/machine-learning-box/automl/config/params.yaml +++ b/machine-learning-box/automl/config/params.yaml @@ -5,4 +5,6 @@ train_data_table: gluon_train target_column: class test_data_table: gluon_test -fit_time_limit: 60 * 3 # fit timeout in sec. 3 min just for demo. Default: 60 * 60 (1hr). \ No newline at end of file +fit_time_limit: 60 * 3 # fit timeout in sec. 3 min just for demo. Default: 60 * 60 (1hr). + +drift_auc_threshold: 0.93 diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig index eef93c1c..abf9b3a8 100644 --- a/machine-learning-box/automl/ml_experiment.dig +++ b/machine-learning-box/automl/ml_experiment.dig @@ -52,6 +52,15 @@ _export: store_last_results: true engine: hive ++alert_if_drift_detected: + if>: ${td.last_results.auc < drift_auc_threshold} + _do: + mail>: + data: Detect drift in model performance. AUC was ${td.last_results.auc}. 
+ subject: Drift detected + to: [me+alerts@example.com] + # bcc: [foo@example.com,bar@example.com] + +record_evaluation: td>: queries/record_evaluation.sql insert_into: ${output_database}.automl_eval_results From b6f758ed87b9b2efd94cebd547080b335e0e9b4b Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 10 Jul 2023 17:33:06 +0900 Subject: [PATCH 41/47] Fixed a bug in cdp endpoints --- .../automl/scripts/audience.py | 163 +++++++++++++++++- 1 file changed, 160 insertions(+), 3 deletions(-) diff --git a/machine-learning-box/automl/scripts/audience.py b/machine-learning-box/automl/scripts/audience.py index 9c09e878..2c17d725 100644 --- a/machine-learning-box/automl/scripts/audience.py +++ b/machine-learning-box/automl/scripts/audience.py @@ -8,7 +8,7 @@ import faulthandler import warnings -from typing import List, Tuple +from typing import List, Tuple, Optional from requests.models import Response from requests.packages.urllib3.util.retry import Retry @@ -121,7 +121,7 @@ def __init__(self): TD_API_KEY = os.environ["TD_API_KEY"] TD_ENDPOINT = os.environ["TD_API_SERVER"] - CDP_ENDPOINT = TD_ENDPOINT.replace('api', 'api-cdp') + CDP_ENDPOINT = TD_ENDPOINT.replace('.treasuredata', '-cdp.treasuredata') HEADERRS = {'Authorization': f'TD1 {TD_API_KEY}', 'Content-Type': 'application/json'} self.cdp_api = CdpApiClient(endpoint=CDP_ENDPOINT, headers=HEADERRS) self.td_api = pytd.Client(retry_post_requests=True).api_client @@ -146,6 +146,9 @@ def create_master_segment(self, *, name: str, database: str, table: str, descrip res = self.cdp_api.post(f"/audiences/{audience_id}/run") print(f"ⓘ Run Master Segment {name}", file=sys.stderr) + TD_ENDPOINT = os.environ["TD_API_SERVER"] + ms_url = f"https://{TD_ENDPOINT.replace('api', 'console')}/app/ms/{audience_id}" + print(f"💎 Created a Master Segment: {ms_url}", file=sys.stderr) return audience_id def add_attribute( @@ -207,7 +210,7 @@ def add_attribute( print(f"ⓘ Successfully added an attribute table '{attr_table}' to master segment 
{audience_id}", file=sys.stderr) else: try: - 'not unique' in res.json()['base'][0] + assert 'not unique' in res.json()['base'][0] print(f"⚠ Attribute '{attr_column}' already exists in Parent Segment and thus skip adding an attribue.", file=sys.stderr) return except: @@ -249,6 +252,120 @@ def get_parent_segment_id(self, name: str) -> str: raise ValueError(f"Cannot find parent segment: {name}") + def create_folder(self, name: str, audience_id: str) -> str: + folder = self.cdp_api.post(f'/audiences/{audience_id}/folders', json={ + 'name': name, + 'description': 'AutoML Segments' + }) + + if folder.ok: + return folder.json()['id'] + else: + res = self.cdp_api.get(f'/audiences/{audience_id}/folders') + if not res.ok: + raise ApiRequestError(res, f"{res.status_code} error on GET /audiences/{audience_id}/folders: {res.json()}") + + folders = res.json() + for f in folders: + if f.get('name') == name: + print(f"Reuse folder `{name}` already existing in audience `{audience_id}`") + return f['id'] + + raise ApiRequestError(folder, f"{folder.status_code} error on POST /audiences/{audience_id}/folders: {folder.json()}") + + + def create_segments(self, *, column_name: str, column_values: List[str], folder: Optional[str]="AutoML", audience_id: Optional[str]=None, audience_name: Optional[str]=None, rerun_master_segment: Optional[bool]=False): + assert len(column_values) >= 1, "At least 1 column_values are required." 
+ if audience_id is None: + assert audience_name is not None, "Either audience_id or audience_name argument is required" + audience_id = self.get_parent_segment_id(audience_name) + + res = self.cdp_api.get(f"/entities/parent_segments/{audience_id}") + use_v4_api = False + if res.ok: + print(f"ⓘ Successfully retrieved the audience", file=sys.stderr) + folder_id = res.json()['data']['relationships']['parentFolder']['data']['id'] + else: + try: + assert res.json()['errors'].split(':')[0] == 'v5 endpoints flag should be enabled for audience' + print("v5 API is not enabled. Fall back to v4 API") + use_v4_api = True + except: + raise ApiRequestError(res, f"{res.status_code} error on GET /entities/parent_segments/{audience_id}: {res.json()}") + + if folder: + folder_id = self.create_folder(folder, audience_id) + + for value in column_values: + attribute_name = column_name.replace('_', ' ').title() + ' = ' + str(value).title() + rule = { + 'type': 'And', + 'conditions': [{ + 'conditions': [{ + 'type': 'Value', + 'leftValue': {'name': column_name, 'visibility': 'clear'}, + 'operator': {'not': False, 'rightValue': value, 'type': 'Equal'}, + 'arrayMatching': None, + 'exclude': False + }], + 'type': 'And', + }], + 'expr': '', + } + + if use_v4_api: + segment = { + 'name': attribute_name, + 'kind': 0, # batch, + 'description': f'{column_name} = {value}', + 'countPopulation': True, + 'rule': rule, + } + if folder: + segment['segmentFolderId'] = folder_id + res = self.cdp_api.post(f"/audiences/{audience_id}/segments", json=segment) + if res.ok: + print(f"ⓘ Successfully created a segment '{attribute_name}' to master segment {audience_id}", file=sys.stderr) + else: + try: + assert res.json()['errors']['name'][0] == 'has already been taken' + print(f"Segment `{attribute_name}` already exists") + except: + raise ApiRequestError(res, f"{res.status_code} error on POST /entities/segments: {res.json()}") + else: # v5 API + segment = { + 'attributes': { + 'name': attribute_name, + 
'description': f'{column_name} = {value}', + 'rule': rule, + }, + 'relationships': {'parentFolder': {'data': {'id': folder_id, 'type': 'folder-segment'}}} + } + res = self.cdp_api.post("/entities/segments", json=segment) + if res.ok: + print(f"ⓘ Successfully created a segment '{attribute_name}' to master segment {audience_id}", file=sys.stderr) + else: + try: + assert res.json()['errors']['name'][0] == 'has already been taken' + print(f"Segment `{attribute_name}` already exists") + except: + raise ApiRequestError(res, f"{res.status_code} error on POST /entities/segments: {res.json()}") + + if rerun_master_segment: + res = self.cdp_api.post(f"/audiences/{audience_id}/run") + if res.ok: + print(f"ⓘ Successfully triggered rerun of Master Segment: {audience_id}", file=sys.stderr) + else: + raise ApiRequestError(res, f"{res.status_code} error on POST /audiences/{audience_id}/run: {res.json()}") + + TD_ENDPOINT = os.environ["TD_API_SERVER"] + if use_v4_api: + s_url = f"https://{TD_ENDPOINT.replace('api', 'console')}/app/ms/{audience_id}/se" + else: + s_url = f"https://{TD_ENDPOINT.replace('api', 'console').replace('.treasuredata', '-next.treasuredata')}/app/ps/{audience_id}" + print(f"💎 Created new segments: {s_url}", file=sys.stderr) + + def add_attribute(**kwargs): faulthandler.enable() @@ -351,3 +468,43 @@ def parse_arguments(kwargs: dict) -> dict: # force flush sys.stdout.flush() sys.stderr.flush() + + +def create_segments(**kwargs): + faulthandler.enable() + + def parse_arguments(kwargs: dict) -> dict: + assert os.environ.get('TD_API_KEY') is not None, "TD_API_KEY ENV variable is required" + assert os.environ.get('TD_API_SERVER') is not None, "TD_API_SERVER ENV variable is required" + + ret = {} + + column_name = kwargs.pop('column_name', None) + assert column_name is not None, "column_name argument is required" + ret['column_name'] = column_name + + column_values = kwargs.pop('column_values', None) + assert column_values is not None, "column_values argument is 
required" + ret['column_values'] = [s.strip() for s in column_values.split(',')] + + folder = kwargs.pop('folder', None) + if folder is not None: ret['folder'] = folder + + audience = kwargs.pop('audience', None) + assert audience is not None, "audience argument is required" + audience_id = audience.pop('id', None) + if audience_id is not None: ret['audience_id'] = audience_id + audience_name = audience.pop('name', None) + if audience_name is not None: ret['audience_name'] = audience_name + ret['rerun_master_segment'] = to_boolean(audience.pop('rerun', 'False')) + + return ret + + try: + params = parse_arguments(kwargs) + cdp = CdpAudience() + cdp.create_segments(**params) + finally: + # force flush + sys.stdout.flush() + sys.stderr.flush() From 7192e85ddb9ac5e34ff2c19aa02029d813f245dc Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 3 Aug 2023 16:40:30 +0900 Subject: [PATCH 42/47] Added rfm workflow --- machine-learning-box/automl/rfm.dig | 46 +++++++++++++++++++ .../automl/scripts/audience.py | 33 +++++++++---- 2 files changed, 71 insertions(+), 8 deletions(-) create mode 100644 machine-learning-box/automl/rfm.dig diff --git a/machine-learning-box/automl/rfm.dig b/machine-learning-box/automl/rfm.dig new file mode 100644 index 00000000..e9442533 --- /dev/null +++ b/machine-learning-box/automl/rfm.dig @@ -0,0 +1,46 @@ +_export: + !include : config/params.yaml + td: + engine: presto + database: ${output_database} + ++create_db_tbl_if_not_exists: + td_ddl>: + create_databases: ["${output_database}"] + ++load_datasets: + ipynb>: + notebook: ml_datasets + output_database: ${input_database} + datasets: cosmetics_store + ++create_users_table: + td>: + query: "select distinct user_id from ${input_database}.cosmetics_store" + create_table: cosmetics_users + ++create_master_segment: + py>: scripts.audience.create_master_segment + name: cosmetics_${session_id} + description: Cosmetics store audience + master: + database: ${output_database} + table: cosmetics_users + run: 
false + docker: + image: "digdag/digdag-python:3.9" + _env: + TD_API_KEY: ${secret:td.apikey} + TD_API_SERVER: "api.treasuredata.com" + ++rfm_orders: + ipynb>: + notebook: RFM + input_table: ${input_database}.cosmetics_store + output_table: ${output_database}.rfm_output_cosmetics_store + user_column: user_id + # tstamp_column: event_time + # tstamp_column: tstamp + amount_column: price + audience_name: cosmetics_${session_id} + diff --git a/machine-learning-box/automl/scripts/audience.py b/machine-learning-box/automl/scripts/audience.py index 2c17d725..e8445369 100644 --- a/machine-learning-box/automl/scripts/audience.py +++ b/machine-learning-box/automl/scripts/audience.py @@ -169,17 +169,32 @@ def add_attribute( assert audience_name is not None, "Either audience_id or audience_name argument is required" audience_id = self.get_parent_segment_id(audience_name) - # table = self.td_api.table(attr_db, attr_table) - # attr_type = resolve_type(table, "predicted_proba") - res = self.cdp_api.put(f"/audiences/{audience_id}") if not res.ok: raise ApiRequestError(res) audience = res.json() - attributes = audience['attributes'] if 'attributes' in audience else [] + if 'attributes' in audience: + attributes = audience['attributes'] + else: + attributes = [] + audience['attributes'] = attributes + existing_attr_names = [attr['name'] for attr in attributes] + # Workaround for attribute column does not exists in the attribute table + if len(attributes) >= 1: + table = self.td_api.table(attr_db, attr_table) + existing_column_names = [col[2] if len(col) == 3 else col[0] for col in table.schema] + + def remove_attribute(attr) -> bool: + if attr['parentDatabaseName'] == attr_db and attr['parentTableName'] == attr_table: + if attr['parentColumn'] not in existing_column_names: + print(f"⚠ Remove an attribute column '{attr['name']}' in Master Segment {audience_id} because '{attr['parentColumn']}' column does not exists in the Atrribute table '{attr_db}.{attr_table}'", file=sys.stderr) + 
return True + return False + audience['attributes'] = [attr for attr in attributes if not remove_attribute(attr)] + for i, attr_column in enumerate(attr_columns): attr_alias = attr_aliases[i] @@ -198,19 +213,19 @@ def add_attribute( if attr_alias in existing_attr_names: if replace_attr_if_exists: attributes[existing_attr_names.index(attr_alias)] = new_attr - print(f"⚠ Replace an attribute column '{attr_alias}' in Master Segment {audience_id}", file=sys.stderr) + print(f"⚠ Replace an existing attribute column '{attr_alias}' in Master Segment {audience_id}", file=sys.stderr) else: print(f"⚠ Skip adding an attribute because the attribute column '{attr_alias}' already exists", file=sys.stderr) else: attributes.append(new_attr) - # from IPython.core.debugger import Pdb; Pdb().set_trace() res = self.cdp_api.put(f"/audiences/{audience_id}", json=audience) if res.ok: print(f"ⓘ Successfully added an attribute table '{attr_table}' to master segment {audience_id}", file=sys.stderr) else: try: - assert 'not unique' in res.json()['base'][0] + res_value = res.json()['base'][0] + assert 'not unique' in res_value, f"Unexpected error: {res_value}" print(f"⚠ Attribute '{attr_column}' already exists in Parent Segment and thus skip adding an attribue.", file=sys.stderr) return except: @@ -274,7 +289,9 @@ def create_folder(self, name: str, audience_id: str) -> str: raise ApiRequestError(folder, f"{folder.status_code} error on POST /audiences/{audience_id}/folders: {folder.json()}") - def create_segments(self, *, column_name: str, column_values: List[str], folder: Optional[str]="AutoML", audience_id: Optional[str]=None, audience_name: Optional[str]=None, rerun_master_segment: Optional[bool]=False): + def create_segments(self, *, column_name: str, column_values: List[str], folder: Optional[str]="AutoML", + audience_id: Optional[str]=None, audience_name: Optional[str]=None, rerun_master_segment: Optional[bool]=False + ): assert len(column_values) >= 1, "At least 1 column_values are 
required." if audience_id is None: assert audience_name is not None, "Either audience_id or audience_name argument is required" From 7b8065c3d0be9c37a9f12f316b5cffc722fe8e43 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 3 Aug 2023 17:07:50 +0900 Subject: [PATCH 43/47] Revised not to use custom script --- machine-learning-box/automl/rfm.dig | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/machine-learning-box/automl/rfm.dig b/machine-learning-box/automl/rfm.dig index e9442533..645c2f8f 100644 --- a/machine-learning-box/automl/rfm.dig +++ b/machine-learning-box/automl/rfm.dig @@ -14,26 +14,7 @@ _export: output_database: ${input_database} datasets: cosmetics_store -+create_users_table: - td>: - query: "select distinct user_id from ${input_database}.cosmetics_store" - create_table: cosmetics_users - -+create_master_segment: - py>: scripts.audience.create_master_segment - name: cosmetics_${session_id} - description: Cosmetics store audience - master: - database: ${output_database} - table: cosmetics_users - run: false - docker: - image: "digdag/digdag-python:3.9" - _env: - TD_API_KEY: ${secret:td.apikey} - TD_API_SERVER: "api.treasuredata.com" - -+rfm_orders: ++run_rfm: ipynb>: notebook: RFM input_table: ${input_database}.cosmetics_store @@ -43,4 +24,3 @@ _export: # tstamp_column: tstamp amount_column: price audience_name: cosmetics_${session_id} - From 04eb2048d50293c4c9c1086f73061ac69d4e080d Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 26 Sep 2023 16:50:44 +0900 Subject: [PATCH 44/47] Added clustering example --- machine-learning-box/automl/clustering.dig | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 machine-learning-box/automl/clustering.dig diff --git a/machine-learning-box/automl/clustering.dig b/machine-learning-box/automl/clustering.dig new file mode 100644 index 00000000..b204cb5a --- /dev/null +++ b/machine-learning-box/automl/clustering.dig @@ -0,0 +1,23 @@ +_export: + 
!include : config/params.yaml + td: + engine: presto + database: ${output_database} + ++create_db_tbl_if_not_exists: + td_ddl>: + create_databases: ["${output_database}"] + ++load_datasets: + ipynb>: + notebook: ml_datasets + output_database: ml_datasets + datasets: dermatology + ++clustering_gluon_new_model: + ipynb>: + notebook: clustering + input_table: ml_datasets.dermatology + output_table: ${output_database}.dermatology_clusters_${session_id} + export_feature_importance: ${output_database}.feature_importance_${session_id} + export_shap_values: ${output_database}.shap_values_${session_id} From 44f72f42e599a1175baecabcfeb2e9713a94c2dd Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 4 Dec 2023 23:42:56 +0900 Subject: [PATCH 45/47] Added CLTV notebook --- machine-learning-box/automl/cltv.dig | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 machine-learning-box/automl/cltv.dig diff --git a/machine-learning-box/automl/cltv.dig b/machine-learning-box/automl/cltv.dig new file mode 100644 index 00000000..b1a6ddcc --- /dev/null +++ b/machine-learning-box/automl/cltv.dig @@ -0,0 +1,26 @@ +_export: + !include : config/params.yaml + td: + engine: presto + database: ${output_database} + ++create_db_tbl_if_not_exists: + td_ddl>: + create_databases: ["${output_database}"] + ++load_datasets: + ipynb>: + notebook: ml_datasets + output_database: ${input_database} + datasets: online_retail + ++run_cltv: + ipynb>: + branch: ATML-174-cltv + notebook: CLTV + input_table: ${input_database}.online_retail_txn + output_table: ${output_database}.online_retail_cltv_result + user_column: customerid + tstamp_column: invoicedate + amount_column: purchaseamount + audience_name: online_retail_cltv From 6d45d53347f6a12399904b0cf1cc4627ef098754 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 5 Dec 2023 13:57:44 +0900 Subject: [PATCH 46/47] Added branch option --- machine-learning-box/automl/cltv.dig | 1 + 1 file changed, 1 insertion(+) diff --git 
a/machine-learning-box/automl/cltv.dig b/machine-learning-box/automl/cltv.dig index b1a6ddcc..be1e629d 100644 --- a/machine-learning-box/automl/cltv.dig +++ b/machine-learning-box/automl/cltv.dig @@ -10,6 +10,7 @@ _export: +load_datasets: ipynb>: + branch: ATML-174-cltv notebook: ml_datasets output_database: ${input_database} datasets: online_retail From 708c760ce5b38f81558ef809f4692e1171558ca7 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 18 Dec 2023 17:54:07 +0900 Subject: [PATCH 47/47] Removed branch --- machine-learning-box/automl/cltv.dig | 2 -- 1 file changed, 2 deletions(-) diff --git a/machine-learning-box/automl/cltv.dig b/machine-learning-box/automl/cltv.dig index be1e629d..7137a69a 100644 --- a/machine-learning-box/automl/cltv.dig +++ b/machine-learning-box/automl/cltv.dig @@ -10,14 +10,12 @@ _export: +load_datasets: ipynb>: - branch: ATML-174-cltv notebook: ml_datasets output_database: ${input_database} datasets: online_retail +run_cltv: ipynb>: - branch: ATML-174-cltv notebook: CLTV input_table: ${input_database}.online_retail_txn output_table: ${output_database}.online_retail_cltv_result