From 093f7649ff3020735f4135a94ced69132f71153b Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 9 Jun 2022 14:58:16 +0900
Subject: [PATCH 01/47] Added automl workflows

---
 machine-learning-box/automl/README.md         | 15 +++++
 .../automl/config/params.yaml                 |  4 ++
 machine-learning-box/automl/ml_datasets.dig   | 14 ++++
 machine-learning-box/automl/ml_experiment.dig | 64 +++++++++++++++++++
 4 files changed, 97 insertions(+)
 create mode 100644 machine-learning-box/automl/README.md
 create mode 100644 machine-learning-box/automl/config/params.yaml
 create mode 100644 machine-learning-box/automl/ml_datasets.dig
 create mode 100644 machine-learning-box/automl/ml_experiment.dig

diff --git a/machine-learning-box/automl/README.md b/machine-learning-box/automl/README.md
new file mode 100644
index 00000000..98025bc4
--- /dev/null
+++ b/machine-learning-box/automl/README.md
@@ -0,0 +1,15 @@
+## How to use
+
+Workflow example of AutoML operator. 
+
+Note: this feature is still in Beta and available to limited customers.
+
+
+```sh
+# Push project
+$ td -c ~/.td/td.conf wf push <project_name> --project .
+
+# Setting td.apikey secret is required for automl operator.
+
+$ td -c ~/.td/td.conf wf secrets --project <project_name> --set td.apikey
+```
\ No newline at end of file
diff --git a/machine-learning-box/automl/config/params.yaml b/machine-learning-box/automl/config/params.yaml
new file mode 100644
index 00000000..45ad77e7
--- /dev/null
+++ b/machine-learning-box/automl/config/params.yaml
@@ -0,0 +1,4 @@
+input_database: ml_datasets
+output_database: automl_test
+
+expr_tracking_table: automl_experiments
diff --git a/machine-learning-box/automl/ml_datasets.dig b/machine-learning-box/automl/ml_datasets.dig
new file mode 100644
index 00000000..b89ee140
--- /dev/null
+++ b/machine-learning-box/automl/ml_datasets.dig
@@ -0,0 +1,14 @@
+timezone: Asia/Tokyo
+#timezone: PST
+
+_export:
+  td:
+    engine: presto
+
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    input_table: dummy.removed_later     # temporary workaround.
+    output_database: ml_datasets
+    datasets: all
+#   datasets: gluon, bank_marketing
\ No newline at end of file
diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
new file mode 100644
index 00000000..78cc12a7
--- /dev/null
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -0,0 +1,64 @@
+timezone: Asia/Tokyo
+#timezone: PST
+
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
++create_tbl_if_not_exists:
+  td_ddl>:
+    create_tables: ["${expr_tracking_table}"]
+
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    input_table: dummy.removed_later     # temporary workaround.
+    output_database: ${input_database}
+#   datasets: gluon, bank_marketing
+    datasets: gluon
+
++gluon_train:
+  ml_train>:
+    notebook: gluon_train
+    model_name: gluon_model_${session_id}
+    input_table: ${input_database}.gluon_train # expect database_name.table_name
+    target_column: class
+    # The following options are optional ones
+    #problem_type: binary                # ‘binary’, ‘multiclass’, ‘regression’, or ‘quantile’. autolugon automatically detect problem types
+    #eval_metric: roc_auc                # autolugon automatically select a right eval_metric for a given setting if not specified.
+    ignore_columns: time,rowid           # Note time column is ignored by the default.
+    time_limit: 60 * 3                   # fit timeout. 3 min just for training time. Recommend 60 * 60 or so for production purposes (Note 24 hours at max). Note this is a soft limit, not hard limit.
+    # timeout: 60 * 3                    # timeout for notebook cell-level execution. This is a hard limit. Note it's cell-level timeout. No timeout if not specified.
+    export_leaderboard: ${output_database}.leaderboard_gluon_train
+    export_feature_importance: ${output_database}.feature_importance_gluon_train
+
++print_train_result:
+  echo>: "executed ${automl.last_executed_notebook}.ipynb"
+
++track_experiment:
+  td>: queries/track_experiment.sql
+  insert_into: automl_experiments
+  last_executed_notebook: ${automl.last_executed_notebook}
+  user_id: ${automl.last_executed_user_id}
+  user_email: ${automl.last_executed_user_email}
+  model_name: gluon_model_${session_id}
+  task_attempt_id: ${attempt_id}
+  session_time: ${session_local_time}
+  engine: presto
+
++gluon_predict:
+  ml_predict>:
+    notebook: gluon_predict
+    model_name: gluon_model_${session_id}
+    input_table: ${input_database}.gluon_test # expect database_name.table_name
+    output_table: ${output_database}.gluon_predicted  # expect database_name.table_name. DB will be created if not exists. table is overwrite'd.
+    # optional
+    #rowid_column: rowid                # Note when rowid_column is specified, only rowid column + prediction result columns are resulted in the output table
+    #ignore_columns: time               # target column should not be in test data
+    export_leaderboard: ${output_database}.leaderboard_gluon_predict
+    export_feature_importance: ${output_database}.feature_importance_gluon_predict
+
++print_predict_result:
+  echo>: "executed ${automl.last_executed_notebook}.ipynb"
\ No newline at end of file

From 799f99e614a5f2795aebe50b41696170cb8abc3b Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Mon, 13 Jun 2022 11:22:39 +0900
Subject: [PATCH 02/47] Added eda workflow

---
 machine-learning-box/automl/eda.dig | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 machine-learning-box/automl/eda.dig

diff --git a/machine-learning-box/automl/eda.dig b/machine-learning-box/automl/eda.dig
new file mode 100644
index 00000000..ca8d057e
--- /dev/null
+++ b/machine-learning-box/automl/eda.dig
@@ -0,0 +1,23 @@
+timezone: Asia/Tokyo
+#timezone: PST
+
+_export:
+  !include : config/params.yaml
+
++datasets:
+  for_each>:
+    table: [gluon_train, bank_marketing_train, vehicle_coupon_train, online_retail_train, telco_churn_train, boston_house_train]
+  _parallel:
+    limit: 3
+  _do:
+    +run_eda:
+      ipynb>:
+        docker:
+          task_mem: 256g
+        notebook: EDA
+        input_table: ${input_database}.${table}
+        # The following options are optional ones
+        eda: all
+        # eda: pandas-profiling, sweetviz
+        # target_column: label
+        sampling_threshold: 1000000

From c4bc7eaf5ba351474843e5d96e0533faa0705998 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Mon, 13 Jun 2022 11:48:46 +0900
Subject: [PATCH 03/47] Fixed table name

---
 machine-learning-box/automl/eda.dig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/eda.dig b/machine-learning-box/automl/eda.dig
index ca8d057e..45285020 100644
--- a/machine-learning-box/automl/eda.dig
+++ b/machine-learning-box/automl/eda.dig
@@ -6,7 +6,7 @@ _export:
 
 +datasets:
   for_each>:
-    table: [gluon_train, bank_marketing_train, vehicle_coupon_train, online_retail_train, telco_churn_train, boston_house_train]
+    table: [gluon_train, bank_marketing_train, vehicle_coupon_train, online_retail_ltv_train, telco_churn_train, boston_house_train]
   _parallel:
     limit: 3
   _do:

From cc3993f1041dec529b7feee3d6fd3dd08ccaf05b Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Mon, 13 Jun 2022 14:28:21 +0900
Subject: [PATCH 04/47] Fixed EDA workflow to load sample datasets

---
 machine-learning-box/automl/eda.dig | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/machine-learning-box/automl/eda.dig b/machine-learning-box/automl/eda.dig
index 45285020..e2662cd4 100644
--- a/machine-learning-box/automl/eda.dig
+++ b/machine-learning-box/automl/eda.dig
@@ -1,8 +1,13 @@
 timezone: Asia/Tokyo
 #timezone: PST
 
-_export:
-  !include : config/params.yaml
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    input_table: dummy.removed_later     # temporary workaround.
+    output_database: ml_datasets
+    datasets: all
+#   datasets: gluon, bank_marketing, vehicle_coupon, online_retail, telco_churn, boston_house
 
 +datasets:
   for_each>:
@@ -15,7 +20,7 @@ _export:
         docker:
           task_mem: 256g
         notebook: EDA
-        input_table: ${input_database}.${table}
+        input_table: ml_datasets.${table}
         # The following options are optional ones
         eda: all
         # eda: pandas-profiling, sweetviz

From d316f7b88bafcaef8ffcef9561f194e0d4876948 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Sat, 18 Jun 2022 02:55:04 +0900
Subject: [PATCH 05/47] Revised options

---
 machine-learning-box/automl/ml_datasets.dig   | 1 -
 machine-learning-box/automl/ml_experiment.dig | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine-learning-box/automl/ml_datasets.dig b/machine-learning-box/automl/ml_datasets.dig
index b89ee140..f5c495be 100644
--- a/machine-learning-box/automl/ml_datasets.dig
+++ b/machine-learning-box/automl/ml_datasets.dig
@@ -8,7 +8,6 @@ _export:
 +load_datasets:
   ipynb>:
     notebook: ml_datasets
-    input_table: dummy.removed_later     # temporary workaround.
     output_database: ml_datasets
     datasets: all
 #   datasets: gluon, bank_marketing
\ No newline at end of file
diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index 78cc12a7..2b5a6be3 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -14,7 +14,6 @@ _export:
 +load_datasets:
   ipynb>:
     notebook: ml_datasets
-    input_table: dummy.removed_later     # temporary workaround.
     output_database: ${input_database}
 #   datasets: gluon, bank_marketing
     datasets: gluon
@@ -33,6 +32,7 @@ _export:
     # timeout: 60 * 3                    # timeout for notebook cell-level execution. This is a hard limit. Note it's cell-level timeout. No timeout if not specified.
     export_leaderboard: ${output_database}.leaderboard_gluon_train
     export_feature_importance: ${output_database}.feature_importance_gluon_train
+    # hide_table_contents: true
 
 +print_train_result:
   echo>: "executed ${automl.last_executed_notebook}.ipynb"
@@ -59,6 +59,7 @@ _export:
     #ignore_columns: time               # target column should not be in test data
     export_leaderboard: ${output_database}.leaderboard_gluon_predict
     export_feature_importance: ${output_database}.feature_importance_gluon_predict
+    # hide_table_contents: true
 
 +print_predict_result:
   echo>: "executed ${automl.last_executed_notebook}.ipynb"
\ No newline at end of file

From 57922bedc49aac293551ba424e85a11f1a5306fb Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Sat, 18 Jun 2022 12:05:25 +0900
Subject: [PATCH 06/47] Updated comments

---
 machine-learning-box/automl/ml_experiment.dig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index 2b5a6be3..db644c91 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -28,7 +28,7 @@ _export:
     #problem_type: binary                # ‘binary’, ‘multiclass’, ‘regression’, or ‘quantile’. autolugon automatically detect problem types
     #eval_metric: roc_auc                # autolugon automatically select a right eval_metric for a given setting if not specified.
     ignore_columns: time,rowid           # Note time column is ignored by the default.
-    time_limit: 60 * 3                   # fit timeout. 3 min just for training time. Recommend 60 * 60 or so for production purposes (Note 24 hours at max). Note this is a soft limit, not hard limit.
+    time_limit: 60 * 3                   # fit timeout. 3 min just for training time. Default: 60 * 60 (1hr). 1hr or more is recommended for production purposes (Note 24 hours at max). Note this is a soft limit, not hard limit.
     # timeout: 60 * 3                    # timeout for notebook cell-level execution. This is a hard limit. Note it's cell-level timeout. No timeout if not specified.
     export_leaderboard: ${output_database}.leaderboard_gluon_train
     export_feature_importance: ${output_database}.feature_importance_gluon_train

From cdf77a2cc75b6a937aacfb74fd4864c1f5e7a703 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 23 Jun 2022 17:24:24 +0900
Subject: [PATCH 07/47] Revised options

---
 machine-learning-box/automl/eda.dig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/machine-learning-box/automl/eda.dig b/machine-learning-box/automl/eda.dig
index e2662cd4..d6edcf9d 100644
--- a/machine-learning-box/automl/eda.dig
+++ b/machine-learning-box/automl/eda.dig
@@ -4,7 +4,6 @@ timezone: Asia/Tokyo
 +load_datasets:
   ipynb>:
     notebook: ml_datasets
-    input_table: dummy.removed_later     # temporary workaround.
     output_database: ml_datasets
     datasets: all
 #   datasets: gluon, bank_marketing, vehicle_coupon, online_retail, telco_churn, boston_house
@@ -18,7 +17,7 @@ timezone: Asia/Tokyo
     +run_eda:
       ipynb>:
         docker:
-          task_mem: 256g
+          task_mem: 128g
         notebook: EDA
         input_table: ml_datasets.${table}
         # The following options are optional ones

From 44eb67ce3b3c9b84079968d181bf773dba749aa9 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Fri, 1 Jul 2022 21:04:59 +0900
Subject: [PATCH 08/47] Copied from ml_experiment.dig

---
 .../automl/ml_experiment_demo.dig             | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 machine-learning-box/automl/ml_experiment_demo.dig

diff --git a/machine-learning-box/automl/ml_experiment_demo.dig b/machine-learning-box/automl/ml_experiment_demo.dig
new file mode 100644
index 00000000..db644c91
--- /dev/null
+++ b/machine-learning-box/automl/ml_experiment_demo.dig
@@ -0,0 +1,65 @@
+timezone: Asia/Tokyo
+#timezone: PST
+
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
++create_tbl_if_not_exists:
+  td_ddl>:
+    create_tables: ["${expr_tracking_table}"]
+
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ${input_database}
+#   datasets: gluon, bank_marketing
+    datasets: gluon
+
++gluon_train:
+  ml_train>:
+    notebook: gluon_train
+    model_name: gluon_model_${session_id}
+    input_table: ${input_database}.gluon_train # expect database_name.table_name
+    target_column: class
+    # The following options are optional ones
+    #problem_type: binary                # ‘binary’, ‘multiclass’, ‘regression’, or ‘quantile’. autolugon automatically detect problem types
+    #eval_metric: roc_auc                # autolugon automatically select a right eval_metric for a given setting if not specified.
+    ignore_columns: time,rowid           # Note time column is ignored by the default.
+    time_limit: 60 * 3                   # fit timeout. 3 min just for training time. Default: 60 * 60 (1hr). 1hr or more is recommended for production purposes (Note 24 hours at max). Note this is a soft limit, not hard limit.
+    # timeout: 60 * 3                    # timeout for notebook cell-level execution. This is a hard limit. Note it's cell-level timeout. No timeout if not specified.
+    export_leaderboard: ${output_database}.leaderboard_gluon_train
+    export_feature_importance: ${output_database}.feature_importance_gluon_train
+    # hide_table_contents: true
+
++print_train_result:
+  echo>: "executed ${automl.last_executed_notebook}.ipynb"
+
++track_experiment:
+  td>: queries/track_experiment.sql
+  insert_into: automl_experiments
+  last_executed_notebook: ${automl.last_executed_notebook}
+  user_id: ${automl.last_executed_user_id}
+  user_email: ${automl.last_executed_user_email}
+  model_name: gluon_model_${session_id}
+  task_attempt_id: ${attempt_id}
+  session_time: ${session_local_time}
+  engine: presto
+
++gluon_predict:
+  ml_predict>:
+    notebook: gluon_predict
+    model_name: gluon_model_${session_id}
+    input_table: ${input_database}.gluon_test # expect database_name.table_name
+    output_table: ${output_database}.gluon_predicted  # expect database_name.table_name. DB will be created if not exists. table is overwrite'd.
+    # optional
+    #rowid_column: rowid                # Note when rowid_column is specified, only rowid column + prediction result columns are resulted in the output table
+    #ignore_columns: time               # target column should not be in test data
+    export_leaderboard: ${output_database}.leaderboard_gluon_predict
+    export_feature_importance: ${output_database}.feature_importance_gluon_predict
+    # hide_table_contents: true
+
++print_predict_result:
+  echo>: "executed ${automl.last_executed_notebook}.ipynb"
\ No newline at end of file

From 422683d6374a2282fc5b772474c54279a73302c8 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Fri, 1 Jul 2022 21:08:31 +0900
Subject: [PATCH 09/47] Added a missing file

---
 machine-learning-box/automl/queries/track_experiment.sql | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 machine-learning-box/automl/queries/track_experiment.sql

diff --git a/machine-learning-box/automl/queries/track_experiment.sql b/machine-learning-box/automl/queries/track_experiment.sql
new file mode 100644
index 00000000..d781a9f4
--- /dev/null
+++ b/machine-learning-box/automl/queries/track_experiment.sql
@@ -0,0 +1,9 @@
+-- DIGDAG_INSERT_LINE
+select
+   '${task_attempt_id}' as task_attempt_id,   
+   '${session_time}' as session_time,
+   '${user_id}' as user_id,
+   '${user_email}' as user_email,
+   '${model_name}' as model_name,   
+   '${last_executed_notebook}.ipynb' as ipynb_url,
+   '${last_executed_notebook}.html' as html_url

From 41d05ff1e9d1e9a874df6bbe173ce1a11de800af Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Fri, 1 Jul 2022 21:14:57 +0900
Subject: [PATCH 10/47] Added parameterized automl workflow

---
 .../automl/config/params.yaml                 |  6 +++
 machine-learning-box/automl/ml_experiment.dig | 45 +++++--------------
 2 files changed, 17 insertions(+), 34 deletions(-)

diff --git a/machine-learning-box/automl/config/params.yaml b/machine-learning-box/automl/config/params.yaml
index 45ad77e7..8e5aa835 100644
--- a/machine-learning-box/automl/config/params.yaml
+++ b/machine-learning-box/automl/config/params.yaml
@@ -1,4 +1,10 @@
 input_database: ml_datasets
 output_database: automl_test
 
+train_data_table: gluon_train
+target_column: class
+test_data_table: gluon_test
+
 expr_tracking_table: automl_experiments
+
+fit_time_limit: 60 * 3 # # fit timeout. 3 min just for training time. Default: 60 * 60 (1hr).
\ No newline at end of file
diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index db644c91..2967c38a 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -1,45 +1,28 @@
-timezone: Asia/Tokyo
-#timezone: PST
-
 _export:
   !include : config/params.yaml
   td:
     engine: presto
-    database: ${output_database}
 
 +create_tbl_if_not_exists:
   td_ddl>:
-    create_tables: ["${expr_tracking_table}"]
-
-+load_datasets:
-  ipynb>:
-    notebook: ml_datasets
-    output_database: ${input_database}
-#   datasets: gluon, bank_marketing
-    datasets: gluon
+    create_tables: ["${expr_tracking_table}", "${output_database}"]
 
 +gluon_train:
   ml_train>:
     notebook: gluon_train
     model_name: gluon_model_${session_id}
-    input_table: ${input_database}.gluon_train # expect database_name.table_name
-    target_column: class
-    # The following options are optional ones
-    #problem_type: binary                # ‘binary’, ‘multiclass’, ‘regression’, or ‘quantile’. autolugon automatically detect problem types
-    #eval_metric: roc_auc                # autolugon automatically select a right eval_metric for a given setting if not specified.
-    ignore_columns: time,rowid           # Note time column is ignored by the default.
-    time_limit: 60 * 3                   # fit timeout. 3 min just for training time. Default: 60 * 60 (1hr). 1hr or more is recommended for production purposes (Note 24 hours at max). Note this is a soft limit, not hard limit.
-    # timeout: 60 * 3                    # timeout for notebook cell-level execution. This is a hard limit. Note it's cell-level timeout. No timeout if not specified.
-    export_leaderboard: ${output_database}.leaderboard_gluon_train
-    export_feature_importance: ${output_database}.feature_importance_gluon_train
-    # hide_table_contents: true
+    input_table: ${input_database}.${train_data_table}
+    target_column: ${target_column}
+    time_limit: ${fit_time_limit}
+    export_leaderboard: ${output_database}.leaderboard_${train_input_table}
+    export_feature_importance: ${output_database}.feature_importance_${train_input_table}
 
 +print_train_result:
-  echo>: "executed ${automl.last_executed_notebook}.ipynb"
+  echo>: "Executed training and built gluon_model_${session_id}: ${automl.last_executed_notebook}.ipynb"
 
 +track_experiment:
   td>: queries/track_experiment.sql
-  insert_into: automl_experiments
+  insert_into: ${output_database}.automl_experiments
   last_executed_notebook: ${automl.last_executed_notebook}
   user_id: ${automl.last_executed_user_id}
   user_email: ${automl.last_executed_user_email}
@@ -52,14 +35,8 @@ _export:
   ml_predict>:
     notebook: gluon_predict
     model_name: gluon_model_${session_id}
-    input_table: ${input_database}.gluon_test # expect database_name.table_name
-    output_table: ${output_database}.gluon_predicted  # expect database_name.table_name. DB will be created if not exists. table is overwrite'd.
-    # optional
-    #rowid_column: rowid                # Note when rowid_column is specified, only rowid column + prediction result columns are resulted in the output table
-    #ignore_columns: time               # target column should not be in test data
-    export_leaderboard: ${output_database}.leaderboard_gluon_predict
-    export_feature_importance: ${output_database}.feature_importance_gluon_predict
-    # hide_table_contents: true
+    input_table: ${input_database}.${test_data_table}
+    output_table: ${output_database}.predicted_${test_data_table}_${session_id}
 
 +print_predict_result:
-  echo>: "executed ${automl.last_executed_notebook}.ipynb"
\ No newline at end of file
+  echo>: "Run prediction and resulted to ${output_database}.predicted_${test_data_table}_${session_id}: ${automl.last_executed_notebook}.ipynb"
\ No newline at end of file

From 250c32d7961d5a5e4725c0159025282b57e48895 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Fri, 1 Jul 2022 21:18:22 +0900
Subject: [PATCH 11/47] td.database is required

---
 machine-learning-box/automl/.ruby-version     | 1 +
 machine-learning-box/automl/ml_experiment.dig | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 machine-learning-box/automl/.ruby-version

diff --git a/machine-learning-box/automl/.ruby-version b/machine-learning-box/automl/.ruby-version
new file mode 100644
index 00000000..ec1cf33c
--- /dev/null
+++ b/machine-learning-box/automl/.ruby-version
@@ -0,0 +1 @@
+2.6.3
diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index 2967c38a..79d46ade 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -2,6 +2,7 @@ _export:
   !include : config/params.yaml
   td:
     engine: presto
+    database: ${output_database}
 
 +create_tbl_if_not_exists:
   td_ddl>:

From 61c6b8e9fd8d210050e504015b1fac5810ccbf89 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Fri, 1 Jul 2022 21:22:28 +0900
Subject: [PATCH 12/47] Fixed var ref

---
 machine-learning-box/automl/ml_experiment.dig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index 79d46ade..a638ec45 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -15,8 +15,8 @@ _export:
     input_table: ${input_database}.${train_data_table}
     target_column: ${target_column}
     time_limit: ${fit_time_limit}
-    export_leaderboard: ${output_database}.leaderboard_${train_input_table}
-    export_feature_importance: ${output_database}.feature_importance_${train_input_table}
+    export_leaderboard: ${output_database}.leaderboard_${train_data_table}
+    export_feature_importance: ${output_database}.feature_importance_${train_data_table}
 
 +print_train_result:
   echo>: "Executed training and built gluon_model_${session_id}: ${automl.last_executed_notebook}.ipynb"

From 03a6963751d6a370418c873cb26969177ba38100 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 7 Jul 2022 21:59:45 +0900
Subject: [PATCH 13/47] Fixed to properly create output database if missing

---
 machine-learning-box/automl/ml_experiment.dig      | 6 +++++-
 machine-learning-box/automl/ml_experiment_demo.dig | 4 ++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index a638ec45..20ab4c60 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -4,9 +4,13 @@ _export:
     engine: presto
     database: ${output_database}
 
++create_database_if_not_exists:
+  td_ddl>:
+    create_databases: ["${output_database}"]
+
 +create_tbl_if_not_exists:
   td_ddl>:
-    create_tables: ["${expr_tracking_table}", "${output_database}"]
+    create_tables: ["${expr_tracking_table}"]
 
 +gluon_train:
   ml_train>:
diff --git a/machine-learning-box/automl/ml_experiment_demo.dig b/machine-learning-box/automl/ml_experiment_demo.dig
index db644c91..57c2dc41 100644
--- a/machine-learning-box/automl/ml_experiment_demo.dig
+++ b/machine-learning-box/automl/ml_experiment_demo.dig
@@ -7,6 +7,10 @@ _export:
     engine: presto
     database: ${output_database}
 
++create_database_if_not_exists:
+  td_ddl>:
+    create_databases: ["${output_database}"]
+
 +create_tbl_if_not_exists:
   td_ddl>:
     create_tables: ["${expr_tracking_table}"]

From 4ea4661671ac1ab6e4018840b8e163931fadd4e8 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 7 Jul 2022 21:59:57 +0900
Subject: [PATCH 14/47] Minor comment format change

---
 machine-learning-box/automl/config/params.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/config/params.yaml b/machine-learning-box/automl/config/params.yaml
index 8e5aa835..19fedd79 100644
--- a/machine-learning-box/automl/config/params.yaml
+++ b/machine-learning-box/automl/config/params.yaml
@@ -7,4 +7,4 @@ test_data_table: gluon_test
 
 expr_tracking_table: automl_experiments
 
-fit_time_limit: 60 * 3 # # fit timeout. 3 min just for training time. Default: 60 * 60 (1hr).
\ No newline at end of file
+fit_time_limit: 60 * 3   # fit timeout in sec. 3 min just for demo. Default: 60 * 60 (1hr).
\ No newline at end of file

From b0f5a8729a18477480e91c53a6fd007f993cc602 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Fri, 8 Jul 2022 10:35:13 +0900
Subject: [PATCH 15/47] Fixed td_ddl

---
 machine-learning-box/automl/ml_experiment.dig      | 9 +++------
 machine-learning-box/automl/ml_experiment_demo.dig | 9 +++------
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index 20ab4c60..14d45fe5 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -4,13 +4,10 @@ _export:
     engine: presto
     database: ${output_database}
 
-+create_database_if_not_exists:
++create_db_tbl_if_not_exists:
   td_ddl>:
-    create_databases: ["${output_database}"]
-
-+create_tbl_if_not_exists:
-  td_ddl>:
-    create_tables: ["${expr_tracking_table}"]
+  create_databases: ["${output_database}"]
+  create_tables: ["${expr_tracking_table}"]
 
 +gluon_train:
   ml_train>:
diff --git a/machine-learning-box/automl/ml_experiment_demo.dig b/machine-learning-box/automl/ml_experiment_demo.dig
index 57c2dc41..95fa75d2 100644
--- a/machine-learning-box/automl/ml_experiment_demo.dig
+++ b/machine-learning-box/automl/ml_experiment_demo.dig
@@ -7,13 +7,10 @@ _export:
     engine: presto
     database: ${output_database}
 
-+create_database_if_not_exists:
++create_db_tbl_if_not_exists:
   td_ddl>:
-    create_databases: ["${output_database}"]
-
-+create_tbl_if_not_exists:
-  td_ddl>:
-    create_tables: ["${expr_tracking_table}"]
+  create_databases: ["${output_database}"]
+  create_tables: ["${expr_tracking_table}"]
 
 +load_datasets:
   ipynb>:

From 37537116ee3eb50c61c1989a5f59ebecbd2ade7b Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Tue, 12 Jul 2022 17:44:46 +0900
Subject: [PATCH 16/47] Add a workaround for the required input_table option

---
 machine-learning-box/automl/ml_experiment_demo.dig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/machine-learning-box/automl/ml_experiment_demo.dig b/machine-learning-box/automl/ml_experiment_demo.dig
index 95fa75d2..d8fd4d18 100644
--- a/machine-learning-box/automl/ml_experiment_demo.dig
+++ b/machine-learning-box/automl/ml_experiment_demo.dig
@@ -16,6 +16,7 @@ _export:
   ipynb>:
     notebook: ml_datasets
     output_database: ${input_database}
+    input_table: ${input_database}.dummy
 #   datasets: gluon, bank_marketing
     datasets: gluon
 

From 2faf3d3c296141af9f65e34db96c0a2445ae727c Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 1 Dec 2022 19:03:48 +0900
Subject: [PATCH 17/47] Added NBA and network_analysis notebook sample
 workflows

---
 machine-learning-box/automl/nba.dig           | 51 +++++++++++++++++++
 .../automl/network_analysis.dig               | 10 ++++
 2 files changed, 61 insertions(+)
 create mode 100644 machine-learning-box/automl/nba.dig
 create mode 100644 machine-learning-box/automl/network_analysis.dig

diff --git a/machine-learning-box/automl/nba.dig b/machine-learning-box/automl/nba.dig
new file mode 100644
index 00000000..c8df7c14
--- /dev/null
+++ b/machine-learning-box/automl/nba.dig
@@ -0,0 +1,51 @@
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["${output_database}"]
+
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: nba
+
++nba_only_qtable:
+  ipynb>:
+    notebook: NBA
+    train_table: ml_datasets.nba_train
+    # optional
+    export_q_table: ${output_database}.rl_qtable_${session_id}
+    export_state_action: ${output_database}.rl_state_action_${session_id}
+
++nba_with_eval:
+  ipynb>:
+    notebook: NBA
+    train_table: ml_datasets.nba_train
+    test_table: ml_datasets.nba_test
+    budget: 10000
+    value_per_cv: 100
+    # optional
+    # export_q_table: ${output_database}.rl_qtable_${session_id}
+    export_channel_ratio: ${output_database}.rl_channel_ratio_${session_id}
+    export_predictions: ${output_database}.rl_predictions_${session_id}
+    export_model_performance: ${output_database}.rl_model_performance_${session_id}
+    ignore_actions: client_domain_organic_visit, organic_search
+    action_cost: |
+     {
+       "display": 2,
+       "social-social": 1.4,
+       "social": 2,
+       "social-paid": 5,
+       "organic_search": 1,
+       "emai": 3.2,
+       "cpc": 3,
+       "referral": 2,
+       "linkedin": 3,
+       "search-paid": 2,
+       "twitter": 1
+     }
\ No newline at end of file
diff --git a/machine-learning-box/automl/network_analysis.dig b/machine-learning-box/automl/network_analysis.dig
new file mode 100644
index 00000000..c214264f
--- /dev/null
+++ b/machine-learning-box/automl/network_analysis.dig
@@ -0,0 +1,10 @@
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: transition_matrix
+
++network_analysis:
+  ipynb>:
+    notebook: network_analysis
+    input_table: ml_datasets.transition_matrix

From 8d474201b3ec8bf71b36daaf9d52456f139ad75e Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Tue, 7 Feb 2023 13:28:09 +0900
Subject: [PATCH 18/47] Added timeseries forecasting example workflow

---
 machine-learning-box/automl/ts_forecast.dig | 32 +++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 machine-learning-box/automl/ts_forecast.dig

diff --git a/machine-learning-box/automl/ts_forecast.dig b/machine-learning-box/automl/ts_forecast.dig
new file mode 100644
index 00000000..d76f433b
--- /dev/null
+++ b/machine-learning-box/automl/ts_forecast.dig
@@ -0,0 +1,32 @@
+#timezone: Asia/Tokyo
+#timezone: PST
+
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: sample_datasets # dummy to avoid error on create_databases
+
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["ml_datasets", "ml_test"]
+
++load_datasets:
+  ipynb>:
+    docker:
+      task_mem: 64g
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: ts_airline
+
++run_ts_forecast:
+  ipynb>:
+    docker:
+      task_mem: 256g # 64g/128g/256g/384g/512g
+    notebook: ts_forecast
+    train_table: ml_datasets.ts_airline
+    tstamp_column: period
+    target_column: number_of_airline_passengers
+    forecast_length: 30
+    output_table: ml_test.ts_airline_predicted
+

From 612fe5eb7666978b6ab1e697ce827dd1515c2b8e Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Tue, 7 Feb 2023 18:26:13 +0900
Subject: [PATCH 19/47] Set default time_limit

---
 machine-learning-box/automl/ts_forecast.dig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/machine-learning-box/automl/ts_forecast.dig b/machine-learning-box/automl/ts_forecast.dig
index d76f433b..d065c6dc 100644
--- a/machine-learning-box/automl/ts_forecast.dig
+++ b/machine-learning-box/automl/ts_forecast.dig
@@ -29,4 +29,5 @@ _export:
     target_column: number_of_airline_passengers
     forecast_length: 30
     output_table: ml_test.ts_airline_predicted
+    time_limit: 10 * 60 # 10 min by the default
 

From cc32839f366b6d2073e1548c39587d2d3163c001 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 9 Feb 2023 12:23:13 +0900
Subject: [PATCH 20/47] Add Shapley workflow

---
 machine-learning-box/automl/ml_experiment.dig | 10 ++++------
 .../automl/queries/track_experiment.sql       |  6 +++---
 machine-learning-box/automl/shapley.dig       | 19 +++++++++++++++++++
 3 files changed, 26 insertions(+), 9 deletions(-)
 create mode 100644 machine-learning-box/automl/shapley.dig

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index 14d45fe5..1680dab6 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -11,6 +11,8 @@ _export:
 
 +gluon_train:
   ml_train>:
+    docker:
+      task_mem: 128g # 64g/128g/256g/384g/512g
     notebook: gluon_train
     model_name: gluon_model_${session_id}
     input_table: ${input_database}.${train_data_table}
@@ -19,9 +21,6 @@ _export:
     export_leaderboard: ${output_database}.leaderboard_${train_data_table}
     export_feature_importance: ${output_database}.feature_importance_${train_data_table}
 
-+print_train_result:
-  echo>: "Executed training and built gluon_model_${session_id}: ${automl.last_executed_notebook}.ipynb"
-
 +track_experiment:
   td>: queries/track_experiment.sql
   insert_into: ${output_database}.automl_experiments
@@ -35,10 +34,9 @@ _export:
 
 +gluon_predict:
   ml_predict>:
+    docker:
+      task_mem: 64g # 64g/128g/256g/384g/512g
     notebook: gluon_predict
     model_name: gluon_model_${session_id}
     input_table: ${input_database}.${test_data_table}
     output_table: ${output_database}.predicted_${test_data_table}_${session_id}
-
-+print_predict_result:
-  echo>: "Run prediction and resulted to ${output_database}.predicted_${test_data_table}_${session_id}: ${automl.last_executed_notebook}.ipynb"
\ No newline at end of file
diff --git a/machine-learning-box/automl/queries/track_experiment.sql b/machine-learning-box/automl/queries/track_experiment.sql
index d781a9f4..0fe30327 100644
--- a/machine-learning-box/automl/queries/track_experiment.sql
+++ b/machine-learning-box/automl/queries/track_experiment.sql
@@ -4,6 +4,6 @@ select
    '${session_time}' as session_time,
    '${user_id}' as user_id,
    '${user_email}' as user_email,
-   '${model_name}' as model_name,   
-   '${last_executed_notebook}.ipynb' as ipynb_url,
-   '${last_executed_notebook}.html' as html_url
+   '${model_name}' as model_name,
+   '${last_executed_notebook}' as notebook_url
+ 
diff --git a/machine-learning-box/automl/shapley.dig b/machine-learning-box/automl/shapley.dig
new file mode 100644
index 00000000..da8c73f1
--- /dev/null
+++ b/machine-learning-box/automl/shapley.dig
@@ -0,0 +1,19 @@
+timezone: Asia/Tokyo
+#timezone: PST
+
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+  
++run_ml_experiment_demo:
+  call>: ml_experiment_demo.dig
+
++explain_predictions_by_shap:
+  ipynb>:
+    docker:
+      task_mem: 128g # 64g/128g/256g/384g/512g
+    notebook: shapley
+    model_name: gluon_model_${session_id}     # model used for prediction
+    input_table: ${input_database}.gluon_test # test data used for prediction

From eae700209af1f908fbdd976fad63587be8318f4e Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Fri, 10 Feb 2023 15:39:46 +0900
Subject: [PATCH 21/47] Add experimental MTA workflow

---
 machine-learning-box/automl/mta.dig | 40 +++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 machine-learning-box/automl/mta.dig

diff --git a/machine-learning-box/automl/mta.dig b/machine-learning-box/automl/mta.dig
new file mode 100644
index 00000000..1601f586
--- /dev/null
+++ b/machine-learning-box/automl/mta.dig
@@ -0,0 +1,40 @@
+#timezone: Asia/Tokyo
+#timezone: PST
+
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: sample_datasets # dummy to avoid error on create_databases
+  output_db: ml_test
+
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["ml_datasets", "${output_db}"]
+
++load_datasets:
+  ipynb>:
+    docker:
+      task_mem: 64g
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: mta
+
++run_mta:
+  ipynb>:
+    branch: ATML-18_mta
+    docker:
+      task_mem: 128g # 64g/128g/256g/384g/512g
+    notebook: MTA
+    # required param
+    input_table: ml_datasets.mta
+    # optional param
+    tstamp_column: tstamp
+    user_column: user
+    channel_column: channel
+    conversion_column: conversion
+    ignore_channels: Facebook
+    overwrite_channel: Direct
+    export_channel_interactions: ${output_db}.channel_interactions
+    export_shapley_attributions: ${output_db}.shapley_attributions
+    export_attributed_conversions: ${output_db}.attributed_conversions

From 74892f472aa5e06223ce3d0542945b4887916133 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 16 Feb 2023 11:06:04 +0900
Subject: [PATCH 22/47] Added a new option

---
 machine-learning-box/automl/mta.dig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/mta.dig b/machine-learning-box/automl/mta.dig
index 1601f586..358b78fb 100644
--- a/machine-learning-box/automl/mta.dig
+++ b/machine-learning-box/automl/mta.dig
@@ -22,7 +22,6 @@ _export:
 
 +run_mta:
   ipynb>:
-    branch: ATML-18_mta
     docker:
       task_mem: 128g # 64g/128g/256g/384g/512g
     notebook: MTA
@@ -33,6 +32,8 @@ _export:
     user_column: user
     channel_column: channel
     conversion_column: conversion
+    # optional columns (usually not needed)
+    analyze_topk_channels: 50
     ignore_channels: Facebook
     overwrite_channel: Direct
     export_channel_interactions: ${output_db}.channel_interactions

From db9a098a4b0ffb92328807d56d03e9baace58904 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 18 May 2023 11:47:20 +0900
Subject: [PATCH 23/47] Added shared_model option

---
 machine-learning-box/automl/ml_experiment.dig            | 2 ++
 machine-learning-box/automl/queries/track_experiment.sql | 1 +
 2 files changed, 3 insertions(+)

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index 1680dab6..f414a063 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -18,6 +18,7 @@ _export:
     input_table: ${input_database}.${train_data_table}
     target_column: ${target_column}
     time_limit: ${fit_time_limit}
+    share_model: true
     export_leaderboard: ${output_database}.leaderboard_${train_data_table}
     export_feature_importance: ${output_database}.feature_importance_${train_data_table}
 
@@ -28,6 +29,7 @@ _export:
   user_id: ${automl.last_executed_user_id}
   user_email: ${automl.last_executed_user_email}
   model_name: gluon_model_${session_id}
+  shared_mdoel: ${automl.shared_model}
   task_attempt_id: ${attempt_id}
   session_time: ${session_local_time}
   engine: presto
diff --git a/machine-learning-box/automl/queries/track_experiment.sql b/machine-learning-box/automl/queries/track_experiment.sql
index 0fe30327..0d8b3d9d 100644
--- a/machine-learning-box/automl/queries/track_experiment.sql
+++ b/machine-learning-box/automl/queries/track_experiment.sql
@@ -5,5 +5,6 @@ select
    '${user_id}' as user_id,
    '${user_email}' as user_email,
    '${model_name}' as model_name,
+   '${shared_mdoel} as shared_mdoel,
    '${last_executed_notebook}' as notebook_url
  

From 6261a344b85e6307d2cd65692ae2ff3e96e7bb6b Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 18 May 2023 14:40:14 +0900
Subject: [PATCH 24/47] Added missing '

---
 machine-learning-box/automl/queries/track_experiment.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/queries/track_experiment.sql b/machine-learning-box/automl/queries/track_experiment.sql
index 0d8b3d9d..a9dc9522 100644
--- a/machine-learning-box/automl/queries/track_experiment.sql
+++ b/machine-learning-box/automl/queries/track_experiment.sql
@@ -5,6 +5,6 @@ select
    '${user_id}' as user_id,
    '${user_email}' as user_email,
    '${model_name}' as model_name,
-   '${shared_mdoel} as shared_mdoel,
+   '${shared_mdoel}' as shared_mdoel,
    '${last_executed_notebook}' as notebook_url
  

From dcd73ad40933276871a53857a620148516b9885f Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 18 May 2023 15:21:44 +0900
Subject: [PATCH 25/47] Revised to record AUC

---
 machine-learning-box/automl/ml_experiment.dig | 19 +++++++++++++++++--
 machine-learning-box/automl/queries/auc.sql   |  8 ++++++++
 .../automl/queries/record_evaluation.sql      |  5 +++++
 3 files changed, 30 insertions(+), 2 deletions(-)
 create mode 100644 machine-learning-box/automl/queries/auc.sql
 create mode 100644 machine-learning-box/automl/queries/record_evaluation.sql

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index f414a063..99cac771 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -9,7 +9,7 @@ _export:
   create_databases: ["${output_database}"]
   create_tables: ["${expr_tracking_table}"]
 
-+gluon_train:
++train:
   ml_train>:
     docker:
       task_mem: 128g # 64g/128g/256g/384g/512g
@@ -34,7 +34,8 @@ _export:
   session_time: ${session_local_time}
   engine: presto
 
-+gluon_predict:
+# Note: If input_table contains target labels, ml_predict shows evaluation results
++predict:
   ml_predict>:
     docker:
       task_mem: 64g # 64g/128g/256g/384g/512g
@@ -42,3 +43,17 @@ _export:
     model_name: gluon_model_${session_id}
     input_table: ${input_database}.${test_data_table}
     output_table: ${output_database}.predicted_${test_data_table}_${session_id}
+
++evaluation:
+  td>: queries/auc.sql
+  table: ${output_database}.predicted_${test_data_table}_${session_id}
+  target_column: y
+  store_last_results: true
+
++record_evaluation:
+  td>: queries/auc.sql
+  insert_into: ${output_database}.automl_eval_results
+  engine: presto
+  model_name: gluon_model_${session_id}
+  session_time: ${session_local_time}
+  auc: ${td.last_results.auc}
diff --git a/machine-learning-box/automl/queries/auc.sql b/machine-learning-box/automl/queries/auc.sql
new file mode 100644
index 00000000..ee64cd39
--- /dev/null
+++ b/machine-learning-box/automl/queries/auc.sql
@@ -0,0 +1,8 @@
+-- DIGDAG_INSERT_LINE
+select
+  auc(prob, label) as auc
+from (
+  select predicted_proba as prob, ${target_column} as label
+  from ${table}
+  ORDER BY prob DESC
+) t
diff --git a/machine-learning-box/automl/queries/record_evaluation.sql b/machine-learning-box/automl/queries/record_evaluation.sql
new file mode 100644
index 00000000..e08b6916
--- /dev/null
+++ b/machine-learning-box/automl/queries/record_evaluation.sql
@@ -0,0 +1,5 @@
+-- DIGDAG_INSERT_LINE
+select
+   '${session_time}' as session_time,
+   '${model_name}' as model_name,
+   '${auc}' as auroc

From e8f28e5b83f5b126610749c6be1aa216e84814f3 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 18 May 2023 15:35:52 +0900
Subject: [PATCH 26/47] Fixed hard-coded target column y in evaluation

---
 machine-learning-box/automl/ml_experiment.dig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index 99cac771..b8b228a3 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -47,7 +47,7 @@ _export:
 +evaluation:
   td>: queries/auc.sql
   table: ${output_database}.predicted_${test_data_table}_${session_id}
-  target_column: y
+  target_column: ${target_column}
   store_last_results: true
 
 +record_evaluation:

From a5ffeaa30c2e0bd5152380d32da4306d04d78857 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 18 May 2023 16:00:56 +0900
Subject: [PATCH 27/47] Fixed record_evaluation query and eval table creation

---
 machine-learning-box/automl/config/params.yaml | 2 --
 machine-learning-box/automl/ml_experiment.dig  | 5 +++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/machine-learning-box/automl/config/params.yaml b/machine-learning-box/automl/config/params.yaml
index 19fedd79..fcf30438 100644
--- a/machine-learning-box/automl/config/params.yaml
+++ b/machine-learning-box/automl/config/params.yaml
@@ -5,6 +5,4 @@ train_data_table: gluon_train
 target_column: class
 test_data_table: gluon_test
 
-expr_tracking_table: automl_experiments
-
 fit_time_limit: 60 * 3   # fit timeout in sec. 3 min just for demo. Default: 60 * 60 (1hr).
\ No newline at end of file
diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index b8b228a3..040c6540 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -7,7 +7,7 @@ _export:
 +create_db_tbl_if_not_exists:
   td_ddl>:
   create_databases: ["${output_database}"]
-  create_tables: ["${expr_tracking_table}"]
+  create_tables: ["automl_experiments", "automl_eval_results"]
 
 +train:
   ml_train>:
@@ -49,9 +49,10 @@ _export:
   table: ${output_database}.predicted_${test_data_table}_${session_id}
   target_column: ${target_column}
   store_last_results: true
+  engine: hive
 
 +record_evaluation:
-  td>: queries/auc.sql
+  td>: queries/record_evaluation.sql
   insert_into: ${output_database}.automl_eval_results
   engine: presto
   model_name: gluon_model_${session_id}

From 6d52e9bb9c211d73275588c588b877857fe9fbb2 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 18 May 2023 16:17:54 +0900
Subject: [PATCH 28/47] Fixed AUC query to binarize labels by positive_class

---
 machine-learning-box/automl/ml_experiment.dig | 1 +
 machine-learning-box/automl/queries/auc.sql   | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index 040c6540..bb8e8701 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -48,6 +48,7 @@ _export:
   td>: queries/auc.sql
   table: ${output_database}.predicted_${test_data_table}_${session_id}
   target_column: ${target_column}
+  positive_class: ' >50K'
   store_last_results: true
   engine: hive
 
diff --git a/machine-learning-box/automl/queries/auc.sql b/machine-learning-box/automl/queries/auc.sql
index ee64cd39..06d35e75 100644
--- a/machine-learning-box/automl/queries/auc.sql
+++ b/machine-learning-box/automl/queries/auc.sql
@@ -1,8 +1,7 @@
--- DIGDAG_INSERT_LINE
 select
   auc(prob, label) as auc
 from (
-  select predicted_proba as prob, ${target_column} as label
+  select predicted_proba as prob, if(cast(${target_column} as string)=="${positive_class}", 1, 0) as label
   from ${table}
   ORDER BY prob DESC
 ) t

From e95cba492d7b5418ec630ab4c6a0e6bdd395ce50 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 25 May 2023 18:47:23 +0900
Subject: [PATCH 29/47] Added vehicle coupon workflow to demonstrate adding an
 attribute table to audience

---
 .../automl/queries/assign_rowid.sql           |   2 +
 .../automl/scripts/audience.py                | 246 ++++++++++++++++++
 .../automl/vehicle_coupon.dig                 |  66 +++++
 3 files changed, 314 insertions(+)
 create mode 100644 machine-learning-box/automl/queries/assign_rowid.sql
 create mode 100644 machine-learning-box/automl/scripts/audience.py
 create mode 100644 machine-learning-box/automl/vehicle_coupon.dig

diff --git a/machine-learning-box/automl/queries/assign_rowid.sql b/machine-learning-box/automl/queries/assign_rowid.sql
new file mode 100644
index 00000000..a07119b3
--- /dev/null
+++ b/machine-learning-box/automl/queries/assign_rowid.sql
@@ -0,0 +1,2 @@
+-- DIGDAG_INSERT_LINE
+select rownum() as ${rowid_column}, * from ${table}
diff --git a/machine-learning-box/automl/scripts/audience.py b/machine-learning-box/automl/scripts/audience.py
new file mode 100644
index 00000000..80e214b7
--- /dev/null
+++ b/machine-learning-box/automl/scripts/audience.py
@@ -0,0 +1,246 @@
+__all__ = ['CdpAudience']
+
+import sys, os
+import requests 
+import json
+import pytd
+import re
+from typing import Tuple
+
+from requests.models import Response
+from requests.packages.urllib3.util.retry import Retry
+from requests.adapters import HTTPAdapter
+from requests import Session
+
+
+class CdpApiClient:
+    def __init__(self, endpoint, headers: dict) -> None:
+        retry_strategy = Retry(
+            total=3, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        s = Session()
+        s.headers = headers 
+        s.mount("http://", adapter)
+        s.mount("https://", adapter)
+        self.endpoint = f"https://{endpoint}"
+        self.client: Session = s
+
+    def get(self, path, **kwargs) -> Response:
+        return self.client.get(url=self.endpoint+path, **kwargs)
+
+    def put(self, path: str, data=None, **kwargs) -> Response:
+        return self.client.put(url=self.endpoint+path, data=data, **kwargs)
+
+    def post(self, path: str, data=None, json=None, **kwargs) -> Response:
+        return self.client.post(url=self.endpoint+path, data=data, json=json, **kwargs)
+
+
+def to_boolean(o) -> bool:
+    if o == None:
+        return False
+    s = str(o)
+
+    try:
+        from distutils.util import strtobool
+        return bool(strtobool(s))
+    except ValueError as e:
+        return False
+
+
+def validate_db_resource_name(name: str) -> str:
+    '''
+    Validate DB_NAME or TABLE_NAME
+    '''
+    # https://docs.treasuredata.com/display/public/PD/Naming+Requirements+and+Restrictions+for+Treasure+Data+Entities
+    TD_DB_RESOURCE_REGEX = "[a-z0-9_]+"
+    assert re.fullmatch(rf"^{TD_DB_RESOURCE_REGEX}$", name) is not None, f"Invalid DB resource name: {name}"
+    return name
+
+
+def parse_table(table: str) -> Tuple[str, str]:
+    '''
+    Parse DB_NAME.TABLE_NAME to DB_NAME, TABLE_NAME
+    '''
+    assert table.count(".") == 1, f"Invalid table name {table}, DB_NAME.TABLE_NAME is expected."
+    database, table = table.split(".")
+    validate_db_resource_name(database)
+    validate_db_resource_name(table)
+    return database, table
+
+
+def resolve_type(table, column_name: str):
+    # workaround for ValueError: not enough values to unpack (expected 3, got 2)
+    schema = [c if len(c) == 3 else [c[0], c[1], ""] for c in table.schema]
+    # column_name:str, column_type:str, alias:str
+    for (c_name, c_type, _) in schema:
+        if c_name == column_name:
+            # Note: Only string, number, timestamp, string_array, or number_array is accepted for attr_type
+            # https://github.com/treasure-data/td-cdp-api/blob/master/app/models/audience_attribute.rb#L9
+            # https://docs.treasuredata.com/display/PD/Using+TD+CLI+to+Annotate+Schema+-+Legacy
+            if c_type in ['int', 'long', 'double', 'float']:
+                return 'number'
+            else:
+                return 'string'
+    raise KeyError(f"column {column_name} not found in {table.schema}")
+
+
+class CdpAudience:
+    '''
+    Usage: 
+      cdp = CdpAudience()
+      cdp.add_attribute(audience_name=audience_name, attr_db=attr_db, attr_table=attr_table, attr_column=attr_column, join_key=join_key, foreign_key=foreign_key, replace_attr_if_exists=True)
+    '''
+
+    def __init__(self):
+        TD_API_KEY = os.environ["TD_API_KEY"]
+        TD_ENDPOINT = os.environ["TD_API_SERVER"]
+
+        CDP_ENDPOINT = TD_ENDPOINT.replace('api', 'api-cdp')
+        HEADERRS = {'Authorization': f'TD1 {TD_API_KEY}', 'Content-Type': 'application/json'}
+        self.cdp_api = CdpApiClient(endpoint=CDP_ENDPOINT, headers=HEADERRS)
+        self.td_api = pytd.Client(retry_post_requests=True).api_client
+
+    def add_attribute(
+        self, *, audience_id: str=None, audience_name: str=None, attr_db: str=None, attr_table: str, attr_column: str, join_key: str, foreign_key: str, 
+        attr_alias: str=None, attr_group: str="AutoML", rerun_master_segment: bool=True, replace_attr_if_exists: bool=False,
+        **kwargs
+    ):
+        if attr_alias is None:
+            attr_alias = attr_column
+
+        if attr_db is None:
+            attr_db, attr_table = parse_table(attr_table)
+
+        if audience_id is None:
+            assert audience_name is not None, "Either audience_id or audience_name argument is required"
+            audience_id = self.get_parent_segment_id(audience_name)
+
+        table = self.td_api.table(attr_db, attr_table)
+        attr_type = resolve_type(table, attr_column)
+
+        res = self.cdp_api.put(f"/audiences/{audience_id}")
+        if not res.ok:
+            raise RuntimeError(res.text)
+        audience = res.json()
+        attributes = audience['attributes'] if 'attributes' in audience else []
+
+        new_attr = {
+            'audienceId': audience_id,        # ID of Master Segment for this attribute
+            'name': attr_column,              # Column name to be defined on Master Segment
+            'type': attr_type,                # Type of the column 
+            'parentDatabaseName': attr_db,    # Database name of the attribute table
+            'parentTableName': attr_table,    # Table name of the attribute table
+            'parentColumn': attr_column,      # Column name of the attribute table which is imported into customer table
+            'parentKey': join_key,            # Join key of the attribute table
+            'foreignKey': foreign_key,        # Foreign key of the master table
+            'groupingName': attr_group,       # Group name of the attribute
+        }
+
+        append_attr = False
+        for i, attr in enumerate(attributes):
+            if 'name' in attr and attr['name'] == attr_column:
+                if replace_attr_if_exists:
+                    attributes[i] = new_attr
+                    append_attr = False
+                    print(f"⚠ Repalce '{attr_column}' in Master Segment {audience_id}", file=sys.stderr) 
+                    break
+                else:
+                    print(f"⚠ skip adding an attribute because the attribute column '{attr_column}' already exists", file=sys.stderr) 
+                    return
+        if append_attr == True:
+            attributes.append(new_attr)
+
+        res = self.cdp_api.put(f"/audiences/{audience_id}", json=audience)
+        if res.ok:
+            print(f"ⓘ Successfully added an attribute table '{attr_table}' to master segment {audience_id}", file=sys.stderr) 
+        else:
+            try: 
+                'not unique' in res.json()['base'][0]
+                print(f"⚠ Attribute '{attr_column}' already exists in Parent Segment and thus skip adding an attribue.", file=sys.stderr) 
+                return
+            except:
+                print(f"failed to PUT /audiences/{audience_id}: {new_attr}")
+                raise RuntimeError(f"{res.status_code} error on PUT /audiences/{audience_id}: {res.json()}")
+
+        if rerun_master_segment:
+            res = self.cdp_api.post(f"/audiences/{audience_id}/run")
+            if res.ok:
+                print(f"ⓘ Successfully triggered rerun of Master Segment: {audience_id}", file=sys.stderr) 
+            else:
+                raise RuntimeError(f"{res.status_code} error on POST /audiences/{audience_id}/run: {res.json()}")
+
+
+    def get_parent_segment_id(self, name: str) -> str:
+        '''
+            Retrive parent segment ID if exists. Otherwise, return None
+        '''
+
+        assert name is not None
+
+        # Get all the audience configurations 
+        res = self.cdp_api.get('/audiences')
+        if not res.ok:
+            raise RuntimeError(res.text)
+        audiences = json.loads(res.text)
+
+        for audience in audiences:
+            if 'name' in audience and name == audience['name']:
+                if 'id' in audience:
+                    return audience['id']
+
+        raise ValueError(f"Cannot find parent segment: {name}")
+
+
+def parse_arguments(kwargs: dict) -> dict:
+    assert os.environ.get('TD_API_KEY') is not None, "TD_API_KEY ENV variable is required"
+    assert os.environ.get('TD_API_SERVER') is not None, "TD_API_SERVER ENV variable is required"
+
+    ret = {}
+
+    audience = kwargs.pop('audience', None)
+    assert audience is not None, "audience argument is required"
+    audience_id = audience.pop('id', None)
+    if audience_id is not None: ret['audience_id'] = audience_id
+    audience_name = audience.pop('name', None)
+    if audience_name is not None: ret['audience_name'] = audience_name
+    foreign_key = audience.pop('foreign_key', None)
+    assert foreign_key is not None, "foreign_key argument is required"
+    ret['foreign_key'] = foreign_key
+    ret['rerun_master_segment'] = to_boolean(audience.pop('rerun', 'False'))
+
+    attribute = kwargs.pop('attribute', None)
+    assert attribute is not None, "attribute argument is required"
+    attr_table = attribute.pop('table', None)
+    assert attr_table is not None, "attr_table argument is required"
+    ret['attr_table'] = attr_table
+    attr_column =  attribute.pop('attr_column', None)
+    assert attr_column is not None, "attr_column argument is required"
+    ret['attr_column'] = attr_column
+    join_key =  attribute.pop('join_key', None)
+    assert join_key is not None, "join_key argument is required"
+    ret['join_key'] = join_key
+    attr_db = attribute.pop('database', None)
+    if attr_db is not None: ret['attr_db'] = attr_db
+    attr_alias = attribute.pop('attr_alias', None)
+    if attr_alias is not None: ret['attr_alias'] = attr_alias
+    attr_group = attribute.pop('attr_group', "AutoML")
+    ret['attr_group'] = attr_group
+    replace_attr_if_exists = to_boolean(attribute.pop('replace_if_exists', 'False'))
+    ret['replace_attr_if_exists'] = replace_attr_if_exists
+
+    return ret
+
+
+def add_attribute(**kwargs):
+    import faulthandler
+    faulthandler.enable()
+
+    try:
+        params = parse_arguments(kwargs)
+        cdp = CdpAudience()
+        cdp.add_attribute(**params)
+    finally:
+        # force flush
+        sys.stdout.flush()
+        sys.stderr.flush()
diff --git a/machine-learning-box/automl/vehicle_coupon.dig b/machine-learning-box/automl/vehicle_coupon.dig
new file mode 100644
index 00000000..c450de8c
--- /dev/null
+++ b/machine-learning-box/automl/vehicle_coupon.dig
@@ -0,0 +1,66 @@
+_export:
+  output_database: ml_test
+  audience_name: "vehicle coupon test"
+  foreign_key: userid
+  td:
+    engine: presto
+    database: ${output_database}
+
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["ml_test"]
+
++load_datasets:
+  ipynb>:
+    docker:
+      task_mem: 64g # 64g/128g/256g/384g/512g
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: vehicle_coupon
+
++train:
+  ml_train>:
+    docker:
+      task_mem: 256g # 64g/128g/256g/384g/512g
+    notebook: gluon_train
+    model_name: gluon_model_${session_id}
+    input_table: ml_datasets.vehicle_coupon_train
+    target_column: y
+    time_limit: 3 * 60 # 3 min
+
++prepare_input:
+  td>: queries/assign_rowid.sql
+  table: ml_datasets.vehicle_coupon_test
+  rowid_column: userid
+  create_table: ml_datasets.vehicle_coupon_test_with_rowid
+  engine: hive
+
++predict:
+  ml_predict>:
+    docker:
+      task_mem: 128g # 64g/128g/256g/384g/512g
+    notebook: gluon_predict
+    model_name: gluon_model_${session_id}
+    input_table: ml_datasets.vehicle_coupon_test_with_rowid
+    output_table: ${output_database}.predicted__${session_id}
+
++add_attribute:
+  py>: scripts.audience.add_attribute
+  audience:
+    name: ${audience_name} # segment name or segment id
+    # id: 1111
+    foreign_key: ${foreign_key}
+    ### optional
+    rerun: true
+  attribute:
+    table: ${output_database}.predicted__${session_id}
+    attr_column: "predicted_proba"
+    join_key: "userid"
+    ### optional
+    attr_group: "AutoML"
+    replace_if_exists: true
+  docker:
+    image: "digdag/digdag-python:3.9"
+  _env:
+    TD_API_KEY: ${secret:td.apikey}
+    TD_API_SERVER: "api.treasuredata.com"

From f21dd81a1f0041b890daa6bc8e4739c276873413 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Sat, 27 May 2023 00:52:55 +0900
Subject: [PATCH 30/47] Updated audience script

---
 .../automl/scripts/audience.py                | 287 ++++++++++++------
 1 file changed, 197 insertions(+), 90 deletions(-)

diff --git a/machine-learning-box/automl/scripts/audience.py b/machine-learning-box/automl/scripts/audience.py
index 80e214b7..9c09e878 100644
--- a/machine-learning-box/automl/scripts/audience.py
+++ b/machine-learning-box/automl/scripts/audience.py
@@ -1,11 +1,14 @@
 __all__ = ['CdpAudience']
 
 import sys, os
-import requests 
+import requests
 import json
 import pytd
 import re
-from typing import Tuple
+import faulthandler
+import warnings
+
+from typing import List, Tuple
 
 from requests.models import Response
 from requests.packages.urllib3.util.retry import Retry
@@ -13,6 +16,28 @@
 from requests import Session
 
 
+def deprecated(func):
+    """Decorator that marks a function as deprecated: every call emits a
+    DeprecationWarning attributed to the caller and then runs the function.
+    The process-wide warning filters are left as the application set them."""
+    import functools
+
+    @functools.wraps(func)
+    def new_func(*args, **kwargs):
+        # catch_warnings() restores the filter list on exit, so forcing 'always'
+        # here no longer overwrites filters configured by the application or -W
+        with warnings.catch_warnings():
+            warnings.simplefilter('always', DeprecationWarning)
+            warnings.warn(f"Call to deprecated function {func.__name__}.", category=DeprecationWarning, stacklevel=2)
+        return func(*args, **kwargs)
+    return new_func
+
+class ApiRequestError(Exception):
+    def __init__(self, response: requests.Response, msg: str=None):
+        if msg is None:
+            msg = f"{response.status_code} ERROR\n{response.text}"
+        super().__init__(msg)
+
 class CdpApiClient:
     def __init__(self, endpoint, headers: dict) -> None:
         retry_strategy = Retry(
@@ -20,7 +45,7 @@ def __init__(self, endpoint, headers: dict) -> None:
         )
         adapter = HTTPAdapter(max_retries=retry_strategy)
         s = Session()
-        s.headers = headers 
+        s.headers = headers
         s.mount("http://", adapter)
         s.mount("https://", adapter)
         self.endpoint = f"https://{endpoint}"
@@ -68,7 +93,7 @@ def parse_table(table: str) -> Tuple[str, str]:
     validate_db_resource_name(table)
     return database, table
 
-
+@deprecated
 def resolve_type(table, column_name: str):
     # workaround for ValueError: not enough values to unpack (expected 3, got 2)
     schema = [c if len(c) == 3 else [c[0], c[1], ""] for c in table.schema]
@@ -87,7 +112,7 @@ def resolve_type(table, column_name: str):
 
 class CdpAudience:
     '''
-    Usage: 
+    Usage:
       cdp = CdpAudience()
       cdp.add_attribute(audience_name=audience_name, attr_db=attr_db, attr_table=attr_table, attr_column=attr_column, join_key=join_key, foreign_key=foreign_key, replace_attr_if_exists=True)
     '''
@@ -101,13 +126,38 @@ def __init__(self):
         self.cdp_api = CdpApiClient(endpoint=CDP_ENDPOINT, headers=HEADERRS)
         self.td_api = pytd.Client(retry_post_requests=True).api_client
 
+    def create_master_segment(self, *, name: str, database: str, table: str, description: str=None, run:bool=False):
+        '''
+            Create a Master Segment backed by `database`.`table` and return its ID.
+            When `run` is true, also trigger its first run. Raises ApiRequestError if a request fails.
+        '''
+        payload = {
+            'name': name,
+            'description': "" if description is None else description,
+            'master': {'parentDatabaseName': database, 'parentTableName': table},
+        }
+        res = self.cdp_api.post('/audiences', data=json.dumps(payload))
+        if not res.ok:
+            raise ApiRequestError(res)
+        audience_id = json.loads(res.text)['id']
+        print(f"ⓘ Successfully created Master Segment '{name}':  {audience_id}", file=sys.stderr)
+        if run:
+            res = self.cdp_api.post(f"/audiences/{audience_id}/run")
+            if not res.ok:  # a failed trigger used to be reported as a started run
+                raise ApiRequestError(res)
+            print(f"ⓘ Run Master Segment {name}", file=sys.stderr)
+        return audience_id
+
     def add_attribute(
-        self, *, audience_id: str=None, audience_name: str=None, attr_db: str=None, attr_table: str, attr_column: str, join_key: str, foreign_key: str, 
-        attr_alias: str=None, attr_group: str="AutoML", rerun_master_segment: bool=True, replace_attr_if_exists: bool=False,
+        self, *, audience_id: str=None, audience_name: str=None, attr_db: str=None, attr_table: str, attr_columns: List[str], join_key: str, foreign_key: str,
+        attr_aliases: List[str]=None, attr_group: str="AutoML", rerun_master_segment: bool=True, replace_attr_if_exists: bool=False,
         **kwargs
     ):
-        if attr_alias is None:
-            attr_alias = attr_column
+        assert len(attr_columns) >= 1, "At least one element in attr_columns but it was empty"
+        if attr_aliases is None:
+            attr_aliases = attr_columns
+        else:
+            assert len(attr_aliases) == len(attr_columns), f"len(attr_aliases) {len(attr_aliases)} is expected to be equals to len(attr_columns) {len(attr_columns)}"
 
         if attr_db is None:
             attr_db, attr_table = parse_table(attr_table)
@@ -116,126 +166,143 @@ def add_attribute(
             assert audience_name is not None, "Either audience_id or audience_name argument is required"
             audience_id = self.get_parent_segment_id(audience_name)
 
-        table = self.td_api.table(attr_db, attr_table)
-        attr_type = resolve_type(table, attr_column)
+        # table = self.td_api.table(attr_db, attr_table)
+        # attr_type = resolve_type(table, "predicted_proba")
 
         res = self.cdp_api.put(f"/audiences/{audience_id}")
         if not res.ok:
-            raise RuntimeError(res.text)
+            raise ApiRequestError(res)
         audience = res.json()
-        attributes = audience['attributes'] if 'attributes' in audience else []
 
-        new_attr = {
-            'audienceId': audience_id,        # ID of Master Segment for this attribute
-            'name': attr_column,              # Column name to be defined on Master Segment
-            'type': attr_type,                # Type of the column 
-            'parentDatabaseName': attr_db,    # Database name of the attribute table
-            'parentTableName': attr_table,    # Table name of the attribute table
-            'parentColumn': attr_column,      # Column name of the attribute table which is imported into customer table
-            'parentKey': join_key,            # Join key of the attribute table
-            'foreignKey': foreign_key,        # Foreign key of the master table
-            'groupingName': attr_group,       # Group name of the attribute
-        }
-
-        append_attr = False
-        for i, attr in enumerate(attributes):
-            if 'name' in attr and attr['name'] == attr_column:
+        attributes = audience.setdefault('attributes', [])  # stay attached to `audience`, which is the payload PUT below
+        existing_attr_names = [attr.get('name') for attr in attributes]  # tolerate entries without a name
+
+        for i, attr_column in enumerate(attr_columns):
+            attr_alias = attr_aliases[i]
+
+            new_attr = {
+                #'audienceId': audience_id,       # ID of Master Segment for this attribute
+                'name': attr_alias,               # Column name to be defined on Master Segment
+                #'type': attr_type,               # Type of the column
+                'parentDatabaseName': attr_db,    # Database name of the attribute table
+                'parentTableName': attr_table,    # Table name of the attribute table
+                'parentColumn': attr_column,      # Column name of the attribute table which is imported into customer table
+                'parentKey': join_key,            # Join key of the attribute table
+                'foreignKey': foreign_key,        # Foreign key of the master table
+                'groupingName': attr_group,       # Group name of the attribute
+            }
+
+            if attr_alias in existing_attr_names:
                 if replace_attr_if_exists:
-                    attributes[i] = new_attr
-                    append_attr = False
-                    print(f"⚠ Repalce '{attr_column}' in Master Segment {audience_id}", file=sys.stderr) 
-                    break
+                    attributes[existing_attr_names.index(attr_alias)] = new_attr
+                    print(f"⚠ Replace an attribute column '{attr_alias}' in Master Segment {audience_id}", file=sys.stderr)
                 else:
-                    print(f"⚠ skip adding an attribute because the attribute column '{attr_column}' already exists", file=sys.stderr) 
-                    return
-        if append_attr == True:
-            attributes.append(new_attr)
+                    print(f"⚠ Skip adding an attribute because the attribute column '{attr_alias}' already exists", file=sys.stderr)
+            else:
+                attributes.append(new_attr)
 
+        # from IPython.core.debugger import Pdb; Pdb().set_trace()
         res = self.cdp_api.put(f"/audiences/{audience_id}", json=audience)
         if res.ok:
             print(f"ⓘ Successfully added an attribute table '{attr_table}' to master segment {audience_id}", file=sys.stderr) 
         else:
-            try: 
+            try:
                 'not unique' in res.json()['base'][0]
-                print(f"⚠ Attribute '{attr_column}' already exists in Parent Segment and thus skip adding an attribue.", file=sys.stderr) 
+                print(f"⚠ Attribute '{attr_column}' already exists in Parent Segment and thus skip adding an attribue.", file=sys.stderr)
                 return
             except:
                 print(f"failed to PUT /audiences/{audience_id}: {new_attr}")
-                raise RuntimeError(f"{res.status_code} error on PUT /audiences/{audience_id}: {res.json()}")
+                raise ApiRequestError(res, f"{res.status_code} error on PUT /audiences/{audience_id}: {res.json()}")
 
         if rerun_master_segment:
             res = self.cdp_api.post(f"/audiences/{audience_id}/run")
             if res.ok:
-                print(f"ⓘ Successfully triggered rerun of Master Segment: {audience_id}", file=sys.stderr) 
+                print(f"ⓘ Successfully triggered rerun of Master Segment: {audience_id}", file=sys.stderr)
             else:
-                raise RuntimeError(f"{res.status_code} error on POST /audiences/{audience_id}/run: {res.json()}")
+                raise ApiRequestError(res, f"{res.status_code} error on POST /audiences/{audience_id}/run: {res.json()}")
 
 
     def get_parent_segment_id(self, name: str) -> str:
         '''
-            Retrive parent segment ID if exists. Otherwise, return None
+            Retrieve the ID of the parent segment with the given name; raises ValueError if none matches.
         '''
-
         assert name is not None
 
-        # Get all the audience configurations 
+        # Note: console-next (v5) uses different endpoints for listing audience
+        res = self.cdp_api.get('/entities/parent_segments')
+        if res.ok:
+            v5_res = res.json()
+            for audience in v5_res.get('data',{}):
+                if audience.get('attributes',{}).get('name') == name:
+                    return audience['id']
+
+        # Fall back to v4
         res = self.cdp_api.get('/audiences')
         if not res.ok:
-            raise RuntimeError(res.text)
-        audiences = json.loads(res.text)
+            raise ApiRequestError(res)
 
+        audiences = res.json()
         for audience in audiences:
-            if 'name' in audience and name == audience['name']:
-                if 'id' in audience:
-                    return audience['id']
+            if name == audience.get('name'):
+                return audience['id']
 
         raise ValueError(f"Cannot find parent segment: {name}")
 
 
-def parse_arguments(kwargs: dict) -> dict:
-    assert os.environ.get('TD_API_KEY') is not None, "TD_API_KEY ENV variable is required"
-    assert os.environ.get('TD_API_SERVER') is not None, "TD_API_SERVER ENV variable is required"
-
-    ret = {}
-
-    audience = kwargs.pop('audience', None)
-    assert audience is not None, "audience argument is required"
-    audience_id = audience.pop('id', None)
-    if audience_id is not None: ret['audience_id'] = audience_id
-    audience_name = audience.pop('name', None)
-    if audience_name is not None: ret['audience_name'] = audience_name
-    foreign_key = audience.pop('foreign_key', None)
-    assert foreign_key is not None, "foreign_key argument is required"
-    ret['foreign_key'] = foreign_key
-    ret['rerun_master_segment'] = to_boolean(audience.pop('rerun', 'False'))
-
-    attribute = kwargs.pop('attribute', None)
-    assert attribute is not None, "attribute argument is required"
-    attr_table = attribute.pop('table', None)
-    assert attr_table is not None, "attr_table argument is required"
-    ret['attr_table'] = attr_table
-    attr_column =  attribute.pop('attr_column', None)
-    assert attr_column is not None, "attr_column argument is required"
-    ret['attr_column'] = attr_column
-    join_key =  attribute.pop('join_key', None)
-    assert join_key is not None, "join_key argument is required"
-    ret['join_key'] = join_key
-    attr_db = attribute.pop('database', None)
-    if attr_db is not None: ret['attr_db'] = attr_db
-    attr_alias = attribute.pop('attr_alias', None)
-    if attr_alias is not None: ret['attr_alias'] = attr_alias
-    attr_group = attribute.pop('attr_group', "AutoML")
-    ret['attr_group'] = attr_group
-    replace_attr_if_exists = to_boolean(attribute.pop('replace_if_exists', 'False'))
-    ret['replace_attr_if_exists'] = replace_attr_if_exists
-
-    return ret
-
-
 def add_attribute(**kwargs):
-    import faulthandler
     faulthandler.enable()
 
+    def parse_arguments(kwargs: dict) -> dict:
+        '''Validate the task parameters and flatten them into keyword arguments named after CdpAudience.add_attribute's parameters.'''
+        assert os.environ.get('TD_API_KEY') is not None, "TD_API_KEY ENV variable is required"
+        assert os.environ.get('TD_API_SERVER') is not None, "TD_API_SERVER ENV variable is required"
+
+        ret = {}
+
+        audience = kwargs.pop('audience', None)
+        assert audience is not None, "audience argument is required"
+        audience_id = audience.pop('id', None)
+        if audience_id is not None: ret['audience_id'] = audience_id
+        audience_name = audience.pop('name', None)
+        if audience_name is not None: ret['audience_name'] = audience_name
+        foreign_key = audience.pop('foreign_key', None)
+        assert foreign_key is not None, "foreign_key argument is required"
+        ret['foreign_key'] = foreign_key
+        ret['rerun_master_segment'] = to_boolean(audience.pop('rerun', 'False'))
+
+        attribute = kwargs.pop('attribute', None)
+        assert attribute is not None, "attribute argument is required"
+        attr_table = attribute.pop('table', None)
+        assert attr_table is not None, "attr_table argument is required"
+        ret['attr_table'] = attr_table
+        join_key =  attribute.pop('join_key', None)
+        assert join_key is not None, "join_key argument is required"
+        ret['join_key'] = join_key
+        attr_db = attribute.pop('database', None)
+        if attr_db is not None: ret['attr_db'] = attr_db
+
+        attr_columns =  attribute.pop('attr_columns', None)
+        if attr_columns is None:
+            attr_column =  attribute.pop('attr_column', None)
+            assert attr_column is not None, "Either attr_columns or attr_column is required"
+            ret['attr_columns'] = [attr_column]
+        else:
+            ret['attr_columns'] = [s.strip() for s in attr_columns.split(',')]
+
+        # aliases are optional: when neither key is given, CdpAudience.add_attribute falls back to the column names
+        attr_aliases =  attribute.pop('attr_aliases', None)
+        if attr_aliases is None:
+            attr_alias =  attribute.pop('attr_alias', None)
+            if attr_alias is not None:
+                ret['attr_aliases'] = [attr_alias]
+        else:
+            ret['attr_aliases'] = [s.strip() for s in attr_aliases.split(',')]
+
+        ret['attr_group'] = attribute.pop('attr_group', "AutoML")
+        ret['replace_attr_if_exists'] = to_boolean(attribute.pop('replace_if_exists', 'False'))
+
+        return ret
+
     try:
         params = parse_arguments(kwargs)
         cdp = CdpAudience()
@@ -244,3 +311,43 @@ def add_attribute(**kwargs):
         # force flush
         sys.stdout.flush()
         sys.stderr.flush()
+
+
+def create_master_segment(**kwargs):
+    '''
+        Workflow entry point: create a Master Segment from the task parameters
+        `name`, `description` (optional), `master.database`, `master.table` and
+        `run` (optional), then store its ID in the digdag env as `audience_id`.
+    '''
+    faulthandler.enable()
+
+    def parse_arguments(kwargs: dict) -> dict:
+        assert os.environ.get('TD_API_KEY') is not None, "TD_API_KEY ENV variable is required"
+        assert os.environ.get('TD_API_SERVER') is not None, "TD_API_SERVER ENV variable is required"
+
+        name = kwargs.pop('name', None)
+        assert name is not None, "name argument is required"
+        ret = {'name': name}
+        description = kwargs.pop('description', None)
+        if description is not None: ret['description'] = description
+
+        master = kwargs.pop('master', None)
+        assert master is not None, "master argument is required"
+        for key in ('database', 'table'):
+            ret[key] = master.pop(key, None)
+            assert ret[key] is not None, f"master.{key} argument is required"
+
+        # default to the string 'False', like the other to_boolean() call sites, instead of passing None
+        ret['run'] = to_boolean(kwargs.pop('run', 'False'))
+        return ret
+
+    try:
+        params = parse_arguments(kwargs)
+        audience_id = CdpAudience().create_master_segment(**params)
+
+        import digdag
+        digdag.env.store({'audience_id': audience_id})
+    finally:
+        # force flush
+        sys.stdout.flush()
+        sys.stderr.flush()

From 711718cccb4a49d502eda9fc97ce39ddc427a7af Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Sat, 27 May 2023 00:53:21 +0900
Subject: [PATCH 31/47] Revised a workflow

---
 .../automl/vehicle_coupon.dig                 | 24 ++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/machine-learning-box/automl/vehicle_coupon.dig b/machine-learning-box/automl/vehicle_coupon.dig
index c450de8c..a58fb8da 100644
--- a/machine-learning-box/automl/vehicle_coupon.dig
+++ b/machine-learning-box/automl/vehicle_coupon.dig
@@ -1,7 +1,5 @@
 _export:
   output_database: ml_test
-  audience_name: "vehicle coupon test"
-  foreign_key: userid
   td:
     engine: presto
     database: ${output_database}
@@ -42,18 +40,32 @@ _export:
     notebook: gluon_predict
     model_name: gluon_model_${session_id}
     input_table: ml_datasets.vehicle_coupon_test_with_rowid
-    output_table: ${output_database}.predicted__${session_id}
+    output_table: ${output_database}.predicted_${session_id}
+
++create_master_segment:
+  py>: scripts.audience.create_master_segment
+  name: vehicle_coupon_${session_id}
+  # description: xxx
+  master:
+    database: ml_datasets
+    table: vehicle_coupon_test_with_rowid
+  run: false
+  docker:
+    image: "digdag/digdag-python:3.9"
+  _env:
+    TD_API_KEY: ${secret:td.apikey}
+    TD_API_SERVER: "api.treasuredata.com"
 
 +add_attribute:
   py>: scripts.audience.add_attribute
   audience:
-    name: ${audience_name} # segment name or segment id
+    name: vehicle_coupon_${session_id}
     # id: 1111
-    foreign_key: ${foreign_key}
+    foreign_key: userid
     ### optional
     rerun: true
   attribute:
-    table: ${output_database}.predicted__${session_id}
+    table: ${output_database}.predicted_${session_id}
     attr_column: "predicted_proba"
     join_key: "userid"
     ### optional

From f849caf8beb1027b0023889e8f87fede7e3b971a Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Sat, 27 May 2023 01:08:45 +0900
Subject: [PATCH 32/47] Changed parameters to accept multiple attribute columns

---
 machine-learning-box/automl/vehicle_coupon.dig | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/vehicle_coupon.dig b/machine-learning-box/automl/vehicle_coupon.dig
index a58fb8da..17084be0 100644
--- a/machine-learning-box/automl/vehicle_coupon.dig
+++ b/machine-learning-box/automl/vehicle_coupon.dig
@@ -66,9 +66,11 @@ _export:
     rerun: true
   attribute:
     table: ${output_database}.predicted_${session_id}
-    attr_column: "predicted_proba"
+    attr_columns: "predicted_proba, y"
+    # attr_column: predicted_proba
     join_key: "userid"
     ### optional
+    attr_aliases: "predicted_proba, y2"
     attr_group: "AutoML"
     replace_if_exists: true
   docker:

From ac28fad8af7994d26610b2a536d57bc3db1919b9 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Sun, 28 May 2023 00:45:26 +0900
Subject: [PATCH 33/47] Added two variations for adding attribute to CDP master
 segment

---
 .../automl/vehicle_coupon.dig                 | 42 +++-------
 .../vehicle_coupon_custom_script_version.dig  | 80 +++++++++++++++++++
 2 files changed, 92 insertions(+), 30 deletions(-)
 create mode 100644 machine-learning-box/automl/vehicle_coupon_custom_script_version.dig

diff --git a/machine-learning-box/automl/vehicle_coupon.dig b/machine-learning-box/automl/vehicle_coupon.dig
index 17084be0..2fed4590 100644
--- a/machine-learning-box/automl/vehicle_coupon.dig
+++ b/machine-learning-box/automl/vehicle_coupon.dig
@@ -18,6 +18,7 @@ _export:
 
 +train:
   ml_train>:
+    branch: ATML-109_attr
     docker:
       task_mem: 256g # 64g/128g/256g/384g/512g
     notebook: gluon_train
@@ -33,15 +34,6 @@ _export:
   create_table: ml_datasets.vehicle_coupon_test_with_rowid
   engine: hive
 
-+predict:
-  ml_predict>:
-    docker:
-      task_mem: 128g # 64g/128g/256g/384g/512g
-    notebook: gluon_predict
-    model_name: gluon_model_${session_id}
-    input_table: ml_datasets.vehicle_coupon_test_with_rowid
-    output_table: ${output_database}.predicted_${session_id}
-
 +create_master_segment:
   py>: scripts.audience.create_master_segment
   name: vehicle_coupon_${session_id}
@@ -56,25 +48,15 @@ _export:
     TD_API_KEY: ${secret:td.apikey}
     TD_API_SERVER: "api.treasuredata.com"
 
-+add_attribute:
-  py>: scripts.audience.add_attribute
-  audience:
-    name: vehicle_coupon_${session_id}
-    # id: 1111
++predict:
+  ml_predict>:
+    branch: ATML-109_attr
+    docker:
+      task_mem: 128g # 64g/128g/256g/384g/512g
+    notebook: gluon_predict
+    model_name: gluon_model_${session_id}
+    input_table: ml_datasets.vehicle_coupon_test_with_rowid
+    rowid_column: userid
+    output_table: ${output_database}.predicted_${session_id}
+    audience_name: vehicle_coupon_${session_id}
     foreign_key: userid
-    ### optional
-    rerun: true
-  attribute:
-    table: ${output_database}.predicted_${session_id}
-    attr_columns: "predicted_proba, y"
-    # attr_column: predicted_proba
-    join_key: "userid"
-    ### optional
-    attr_aliases: "predicted_proba, y2"
-    attr_group: "AutoML"
-    replace_if_exists: true
-  docker:
-    image: "digdag/digdag-python:3.9"
-  _env:
-    TD_API_KEY: ${secret:td.apikey}
-    TD_API_SERVER: "api.treasuredata.com"
diff --git a/machine-learning-box/automl/vehicle_coupon_custom_script_version.dig b/machine-learning-box/automl/vehicle_coupon_custom_script_version.dig
new file mode 100644
index 00000000..1db3b0b6
--- /dev/null
+++ b/machine-learning-box/automl/vehicle_coupon_custom_script_version.dig
@@ -0,0 +1,80 @@
+_export:
+  output_database: ml_test
+  td:
+    engine: presto
+    database: ${output_database}
+
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["ml_test"]
+
++load_datasets:
+  ipynb>:
+    docker:
+      task_mem: 64g # 64g/128g/256g/384g/512g
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: vehicle_coupon
+
++train:
+  ml_train>:
+    docker:
+      task_mem: 256g # 64g/128g/256g/384g/512g
+    notebook: gluon_train
+    model_name: gluon_model_${session_id}
+    input_table: ml_datasets.vehicle_coupon_train
+    target_column: y
+    time_limit: 3 * 60 # 3 min
+
++prepare_input:
+  td>: queries/assign_rowid.sql
+  table: ml_datasets.vehicle_coupon_test
+  rowid_column: userid
+  create_table: ml_datasets.vehicle_coupon_test_with_rowid
+  engine: hive
+
++predict:
+  ml_predict>:
+    docker:
+      task_mem: 128g # 64g/128g/256g/384g/512g
+    notebook: gluon_predict
+    model_name: gluon_model_${session_id}
+    input_table: ml_datasets.vehicle_coupon_test_with_rowid
+    output_table: ${output_database}.predicted_${session_id}
+
++create_master_segment:
+  py>: scripts.audience.create_master_segment
+  name: vehicle_coupon_${session_id}
+  # description: xxx
+  master:
+    database: ml_datasets
+    table: vehicle_coupon_test_with_rowid
+  run: false
+  docker:
+    image: "digdag/digdag-python:3.9"
+  _env:
+    TD_API_KEY: ${secret:td.apikey}
+    TD_API_SERVER: "api.treasuredata.com"
+
++add_attribute:
+  py>: scripts.audience.add_attribute
+  audience:
+    name: vehicle_coupon_${session_id}
+    # id: 1111
+    foreign_key: userid
+    ### optional
+    rerun: true
+  attribute:
+    table: ${output_database}.predicted_${session_id}
+    attr_columns: predicted_proba, y
+    # attr_column: predicted_proba
+    join_key: "userid"
+    ### optional
+    attr_aliases: predicted_proba, y2
+    attr_group: "AutoML"
+    replace_if_exists: true
+  docker:
+    image: "digdag/digdag-python:3.9"
+  _env:
+    TD_API_KEY: ${secret:td.apikey}
+    TD_API_SERVER: "api.treasuredata.com"

From 03ac63312e501c14b659f638c5b72ed71938f814 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Sun, 28 May 2023 18:08:22 +0900
Subject: [PATCH 34/47] Added an example to add next_action to CDP master
 segment

---
 machine-learning-box/automl/nba_cdp.dig | 59 +++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 machine-learning-box/automl/nba_cdp.dig

diff --git a/machine-learning-box/automl/nba_cdp.dig b/machine-learning-box/automl/nba_cdp.dig
new file mode 100644
index 00000000..48ee9b0a
--- /dev/null
+++ b/machine-learning-box/automl/nba_cdp.dig
@@ -0,0 +1,59 @@
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["${output_database}"]
+
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: nba
+
++create_master_segment:
+  py>: scripts.audience.create_master_segment
+  name: nba_${session_id}
+  description: NBA test audience
+  master:
+    database: ml_datasets
+    table: nba_test
+  run: false
+  docker:
+    image: "digdag/digdag-python:3.9"
+  _env:
+    TD_API_KEY: ${secret:td.apikey}
+    TD_API_SERVER: "api.treasuredata.com"
+
++nba_with_eval:
+  ipynb>:
+    branch: ATML-109_attr
+    notebook: NBA
+    train_table: ml_datasets.nba_train
+    test_table: ml_datasets.nba_test
+    budget: 10000
+    value_per_cv: 100
+    # optional
+    audience_name: nba_${session_id}
+    # export_q_table: ${output_database}.rl_qtable_${session_id}
+    export_channel_ratio: ${output_database}.rl_channel_ratio_${session_id}
+    export_predictions: ${output_database}.rl_predictions_${session_id}
+    export_model_performance: ${output_database}.rl_model_performance_${session_id}
+    ignore_actions: client_domain_organic_visit, organic_search
+    action_cost: |
+     {
+       "display": 2,
+       "social-social": 1.4,
+       "social": 2,
+       "social-paid": 5,
+       "organic_search": 1,
+       "email": 3.2,
+       "cpc": 3,
+       "referral": 2,
+       "linkedin": 3,
+       "search-paid": 2,
+       "twitter": 1
+     }
\ No newline at end of file

From cae911e807387132164b0528b15cb068a1ad0cd0 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Tue, 30 May 2023 12:21:09 +0900
Subject: [PATCH 35/47] Removed branch setting

---
 machine-learning-box/automl/nba_cdp.dig        | 1 -
 machine-learning-box/automl/vehicle_coupon.dig | 2 --
 2 files changed, 3 deletions(-)

diff --git a/machine-learning-box/automl/nba_cdp.dig b/machine-learning-box/automl/nba_cdp.dig
index 48ee9b0a..40602deb 100644
--- a/machine-learning-box/automl/nba_cdp.dig
+++ b/machine-learning-box/automl/nba_cdp.dig
@@ -30,7 +30,6 @@ _export:
 
 +nba_with_eval:
   ipynb>:
-    branch: ATML-109_attr
     notebook: NBA
     train_table: ml_datasets.nba_train
     test_table: ml_datasets.nba_test
diff --git a/machine-learning-box/automl/vehicle_coupon.dig b/machine-learning-box/automl/vehicle_coupon.dig
index 2fed4590..a849bc93 100644
--- a/machine-learning-box/automl/vehicle_coupon.dig
+++ b/machine-learning-box/automl/vehicle_coupon.dig
@@ -18,7 +18,6 @@ _export:
 
 +train:
   ml_train>:
-    branch: ATML-109_attr
     docker:
       task_mem: 256g # 64g/128g/256g/384g/512g
     notebook: gluon_train
@@ -50,7 +49,6 @@ _export:
 
 +predict:
   ml_predict>:
-    branch: ATML-109_attr
     docker:
       task_mem: 128g # 64g/128g/256g/384g/512g
     notebook: gluon_predict

From d93474f7594ea02922f72478756f31c36ad32624 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 15 Jun 2023 16:20:41 +0900
Subject: [PATCH 36/47] Fixed a typo

---
 machine-learning-box/automl/ml_experiment.dig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index bb8e8701..a861ba2c 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -29,7 +29,7 @@ _export:
   user_id: ${automl.last_executed_user_id}
   user_email: ${automl.last_executed_user_email}
   model_name: gluon_model_${session_id}
-  shared_mdoel: ${automl.shared_model}
+  shared_model: ${automl.shared_model}
   task_attempt_id: ${attempt_id}
   session_time: ${session_local_time}
   engine: presto

From edb33dc775a09f29c70c9ec08860c9890e3c874c Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 15 Jun 2023 16:25:56 +0900
Subject: [PATCH 37/47] Fixed a typo

---
 machine-learning-box/automl/queries/track_experiment.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/queries/track_experiment.sql b/machine-learning-box/automl/queries/track_experiment.sql
index a9dc9522..7db45323 100644
--- a/machine-learning-box/automl/queries/track_experiment.sql
+++ b/machine-learning-box/automl/queries/track_experiment.sql
@@ -5,6 +5,6 @@ select
    '${user_id}' as user_id,
    '${user_email}' as user_email,
    '${model_name}' as model_name,
-   '${shared_mdoel}' as shared_mdoel,
+   '${shared_model}' as shared_model,
    '${last_executed_notebook}' as notebook_url
  

From 35f343c03a205f064076e786107bfcbc1176554e Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 15 Jun 2023 17:06:34 +0900
Subject: [PATCH 38/47] Revised to record test table name

---
 machine-learning-box/automl/ml_experiment.dig             | 1 +
 machine-learning-box/automl/queries/record_evaluation.sql | 1 +
 2 files changed, 2 insertions(+)

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index a861ba2c..a95bdd24 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -57,5 +57,6 @@ _export:
   insert_into: ${output_database}.automl_eval_results
   engine: presto
   model_name: gluon_model_${session_id}
+  test_table: ${output_database}.predicted_${test_data_table}_${session_id}
   session_time: ${session_local_time}
   auc: ${td.last_results.auc}
diff --git a/machine-learning-box/automl/queries/record_evaluation.sql b/machine-learning-box/automl/queries/record_evaluation.sql
index e08b6916..88070aec 100644
--- a/machine-learning-box/automl/queries/record_evaluation.sql
+++ b/machine-learning-box/automl/queries/record_evaluation.sql
@@ -2,4 +2,5 @@
 select
    '${session_time}' as session_time,
    '${model_name}' as model_name,
+   '${test_table}' as test_table,
    '${auc}' as auroc

From b24b27dc0f981118bcc7e26692cc5c670b6e806f Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 15 Jun 2023 17:10:57 +0900
Subject: [PATCH 39/47] Fixed test_table value

---
 machine-learning-box/automl/ml_experiment.dig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index a95bdd24..eef93c1c 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -57,6 +57,6 @@ _export:
   insert_into: ${output_database}.automl_eval_results
   engine: presto
   model_name: gluon_model_${session_id}
-  test_table: ${output_database}.predicted_${test_data_table}_${session_id}
+  test_table: ${input_database}.${test_data_table}
   session_time: ${session_local_time}
   auc: ${td.last_results.auc}

From 2e80b31c8422b5110f76a416d2ae5fb12958719e Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 15 Jun 2023 17:55:55 +0900
Subject: [PATCH 40/47] Added a drift detection example

---
 machine-learning-box/automl/config/params.yaml | 4 +++-
 machine-learning-box/automl/ml_experiment.dig  | 9 +++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/machine-learning-box/automl/config/params.yaml b/machine-learning-box/automl/config/params.yaml
index fcf30438..8eb7d609 100644
--- a/machine-learning-box/automl/config/params.yaml
+++ b/machine-learning-box/automl/config/params.yaml
@@ -5,4 +5,6 @@ train_data_table: gluon_train
 target_column: class
 test_data_table: gluon_test
 
-fit_time_limit: 60 * 3   # fit timeout in sec. 3 min just for demo. Default: 60 * 60 (1hr).
\ No newline at end of file
+fit_time_limit: 60 * 3   # fit timeout in sec. 3 min just for demo. Default: 60 * 60 (1hr).
+
+drift_auc_threshold: 0.93
diff --git a/machine-learning-box/automl/ml_experiment.dig b/machine-learning-box/automl/ml_experiment.dig
index eef93c1c..abf9b3a8 100644
--- a/machine-learning-box/automl/ml_experiment.dig
+++ b/machine-learning-box/automl/ml_experiment.dig
@@ -52,6 +52,15 @@ _export:
   store_last_results: true
   engine: hive
 
++alert_if_drift_detected:
+  if>: ${td.last_results.auc < drift_auc_threshold}
+  _do:
+    mail>:
+      data: Detect drift in model performance. AUC was ${td.last_results.auc}.
+    subject: Drift detected
+    to: [me+alerts@example.com]
+    # bcc: [foo@example.com,bar@example.com]
+
 +record_evaluation:
   td>: queries/record_evaluation.sql
   insert_into: ${output_database}.automl_eval_results

From b6f758ed87b9b2efd94cebd547080b335e0e9b4b Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Mon, 10 Jul 2023 17:33:06 +0900
Subject: [PATCH 41/47] Fixed a bug in cdp endpoints

---
 .../automl/scripts/audience.py                | 163 +++++++++++++++++-
 1 file changed, 160 insertions(+), 3 deletions(-)

diff --git a/machine-learning-box/automl/scripts/audience.py b/machine-learning-box/automl/scripts/audience.py
index 9c09e878..2c17d725 100644
--- a/machine-learning-box/automl/scripts/audience.py
+++ b/machine-learning-box/automl/scripts/audience.py
@@ -8,7 +8,7 @@
 import faulthandler
 import warnings
 
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 
 from requests.models import Response
 from requests.packages.urllib3.util.retry import Retry
@@ -121,7 +121,7 @@ def __init__(self):
         TD_API_KEY = os.environ["TD_API_KEY"]
         TD_ENDPOINT = os.environ["TD_API_SERVER"]
 
-        CDP_ENDPOINT = TD_ENDPOINT.replace('api', 'api-cdp')
+        CDP_ENDPOINT = TD_ENDPOINT.replace('.treasuredata', '-cdp.treasuredata')
         HEADERRS = {'Authorization': f'TD1 {TD_API_KEY}', 'Content-Type': 'application/json'}
         self.cdp_api = CdpApiClient(endpoint=CDP_ENDPOINT, headers=HEADERRS)
         self.td_api = pytd.Client(retry_post_requests=True).api_client
@@ -146,6 +146,9 @@ def create_master_segment(self, *, name: str, database: str, table: str, descrip
             res = self.cdp_api.post(f"/audiences/{audience_id}/run")
             print(f"ⓘ Run Master Segment {name}", file=sys.stderr)
 
+        TD_ENDPOINT = os.environ["TD_API_SERVER"]
+        ms_url = f"https://{TD_ENDPOINT.replace('api', 'console')}/app/ms/{audience_id}"
+        print(f"💎 Created a Master Segment: {ms_url}", file=sys.stderr)
         return audience_id
 
     def add_attribute(
@@ -207,7 +210,7 @@ def add_attribute(
             print(f"ⓘ Successfully added an attribute table '{attr_table}' to master segment {audience_id}", file=sys.stderr) 
         else:
             try:
-                'not unique' in res.json()['base'][0]
+                assert 'not unique' in res.json()['base'][0]
                 print(f"⚠ Attribute '{attr_column}' already exists in Parent Segment and thus skip adding an attribue.", file=sys.stderr)
                 return
             except:
@@ -249,6 +252,120 @@ def get_parent_segment_id(self, name: str) -> str:
         raise ValueError(f"Cannot find parent segment: {name}")
 
 
+    def create_folder(self, name: str, audience_id: str) -> str:
+        folder = self.cdp_api.post(f'/audiences/{audience_id}/folders', json={
+            'name': name,
+            'description': 'AutoML Segments'
+        })
+
+        if folder.ok:
+            return folder.json()['id']
+        else:
+            res = self.cdp_api.get(f'/audiences/{audience_id}/folders')
+            if not res.ok:
+                raise ApiRequestError(res, f"{res.status_code} error on GET /audiences/{audience_id}/folders: {res.json()}")
+
+            folders = res.json()
+            for f in folders:
+                if f.get('name') == name:
+                    print(f"Reuse folder `{name}` already existing in audience `{audience_id}`")
+                    return f['id']
+
+            raise ApiRequestError(folder, f"{folder.status_code} error on POST /audiences/{audience_id}/folders: {folder.json()}")
+
+
+    def create_segments(self, *, column_name: str, column_values: List[str], folder: Optional[str]="AutoML", audience_id: Optional[str]=None, audience_name: Optional[str]=None, rerun_master_segment: Optional[bool]=False):
+        assert len(column_values) >= 1, "At least 1 column_values are required."
+        if audience_id is None:
+            assert audience_name is not None, "Either audience_id or audience_name argument is required"
+            audience_id = self.get_parent_segment_id(audience_name)
+        # Creates one "<column_name> = <value>" segment per entry of column_values (v5 entities API, or v4 as a fallback).
+        res = self.cdp_api.get(f"/entities/parent_segments/{audience_id}")
+        use_v4_api = False  # set when the v5 lookup fails with the "v5 endpoints flag" error
+        if res.ok:
+            print(f"ⓘ Successfully retrieved the audience", file=sys.stderr)
+            folder_id = res.json()['data']['relationships']['parentFolder']['data']['id']
+        else:
+            try:
+                assert res.json()['errors'].split(':')[0] == 'v5 endpoints flag should be enabled for audience'
+                print("v5 API is not enabled. Fall back to v4 API")
+                use_v4_api = True
+            except Exception:  # any other error payload: surface the failed response to the caller
+                raise ApiRequestError(res, f"{res.status_code} error on GET /entities/parent_segments/{audience_id}: {res.json()}")
+
+        if folder:  # a named folder replaces the parent folder resolved above
+            folder_id = self.create_folder(folder, audience_id)
+
+        for value in column_values:
+            attribute_name = column_name.replace('_', ' ').title() + ' = ' + str(value).title()
+            rule = {
+                'type': 'And',
+                'conditions': [{
+                    'conditions': [{
+                        'type': 'Value',
+                        'leftValue': {'name': column_name, 'visibility': 'clear'},
+                        'operator': {'not': False, 'rightValue': value, 'type': 'Equal'},
+                        'arrayMatching': None,
+                        'exclude': False
+                    }],
+                    'type': 'And',
+                }],
+                'expr': '',
+            }
+
+            if use_v4_api:
+                segment = {
+                    'name': attribute_name,
+                    'kind': 0, # batch,
+                    'description': f'{column_name} = {value}',
+                    'countPopulation': True,
+                    'rule': rule,
+                }
+                if folder:
+                    segment['segmentFolderId'] = folder_id
+                res = self.cdp_api.post(f"/audiences/{audience_id}/segments", json=segment)
+                if res.ok:
+                    print(f"ⓘ Successfully created a segment '{attribute_name}' to master segment {audience_id}", file=sys.stderr)
+                else:
+                    try:
+                        assert res.json()['errors']['name'][0] == 'has already been taken'
+                        print(f"Segment `{attribute_name}` already exists")
+                    except Exception:  # not a duplicate-name error: report the endpoint that was actually called
+                        raise ApiRequestError(res, f"{res.status_code} error on POST /audiences/{audience_id}/segments: {res.json()}")
+            else: # v5 API
+                segment = {
+                    'attributes': {
+                        'name': attribute_name,
+                        'description': f'{column_name} = {value}',
+                        'rule': rule,
+                    },
+                    'relationships': {'parentFolder': {'data': {'id': folder_id, 'type': 'folder-segment'}}}
+                }
+                res = self.cdp_api.post("/entities/segments", json=segment)
+                if res.ok:
+                    print(f"ⓘ Successfully created a segment '{attribute_name}' to master segment {audience_id}", file=sys.stderr)
+                else:
+                    try:
+                        assert res.json()['errors']['name'][0] == 'has already been taken'
+                        print(f"Segment `{attribute_name}` already exists")
+                    except Exception:  # not a duplicate-name error
+                        raise ApiRequestError(res, f"{res.status_code} error on POST /entities/segments: {res.json()}")
+
+        if rerun_master_segment:
+            res = self.cdp_api.post(f"/audiences/{audience_id}/run")
+            if res.ok:
+                print(f"ⓘ Successfully triggered rerun of Master Segment: {audience_id}", file=sys.stderr)
+            else:
+                raise ApiRequestError(res, f"{res.status_code} error on POST /audiences/{audience_id}/run: {res.json()}")
+
+        TD_ENDPOINT = os.environ["TD_API_SERVER"]
+        if use_v4_api:
+            s_url = f"https://{TD_ENDPOINT.replace('api', 'console')}/app/ms/{audience_id}/se"
+        else:
+            s_url = f"https://{TD_ENDPOINT.replace('api', 'console').replace('.treasuredata', '-next.treasuredata')}/app/ps/{audience_id}"
+        print(f"💎 Created new segments: {s_url}", file=sys.stderr)
+
+
 def add_attribute(**kwargs):
     faulthandler.enable()
 
@@ -351,3 +468,43 @@ def parse_arguments(kwargs: dict) -> dict:
         # force flush
         sys.stdout.flush()
         sys.stderr.flush()
+
+
+def create_segments(**kwargs):
+    faulthandler.enable()
+    # Map the task kwargs onto CdpAudience.create_segments() keyword arguments; env vars are validated up front.
+    def parse_arguments(kwargs: dict) -> dict:
+        assert os.environ.get('TD_API_KEY') is not None, "TD_API_KEY ENV variable is required"
+        assert os.environ.get('TD_API_SERVER') is not None, "TD_API_SERVER ENV variable is required"
+
+        ret = {}
+        column_name = kwargs.pop('column_name', None)
+        assert column_name is not None, "column_name argument is required"
+        ret['column_name'] = column_name
+
+        column_values = kwargs.pop('column_values', None)
+        assert column_values is not None, "column_values argument is required"
+        if isinstance(column_values, str): column_values = column_values.split(',')  # comma-separated form; a list is used as-is
+        ret['column_values'] = [v.strip() if isinstance(v, str) else v for v in column_values]
+
+        folder = kwargs.pop('folder', None)
+        if folder is not None: ret['folder'] = folder
+
+        audience = kwargs.pop('audience', None)
+        assert audience is not None, "audience argument is required"
+        audience_id = audience.get('id')  # read-only access: do not mutate the caller's nested dict
+        if audience_id is not None: ret['audience_id'] = audience_id
+        audience_name = audience.get('name')
+        if audience_name is not None: ret['audience_name'] = audience_name
+        ret['rerun_master_segment'] = to_boolean(audience.get('rerun', 'False'))
+
+        return ret
+
+    try:
+        params = parse_arguments(kwargs)
+        cdp = CdpAudience()
+        cdp.create_segments(**params)
+    finally:
+        # force flush
+        sys.stdout.flush()
+        sys.stderr.flush()

From 7192e85ddb9ac5e34ff2c19aa02029d813f245dc Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 3 Aug 2023 16:40:30 +0900
Subject: [PATCH 42/47] Added rfm workflow

---
 machine-learning-box/automl/rfm.dig           | 46 +++++++++++++++++++
 .../automl/scripts/audience.py                | 33 +++++++++----
 2 files changed, 71 insertions(+), 8 deletions(-)
 create mode 100644 machine-learning-box/automl/rfm.dig

diff --git a/machine-learning-box/automl/rfm.dig b/machine-learning-box/automl/rfm.dig
new file mode 100644
index 00000000..e9442533
--- /dev/null
+++ b/machine-learning-box/automl/rfm.dig
@@ -0,0 +1,46 @@
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["${output_database}"]
+
++load_datasets:
+   ipynb>:
+     notebook: ml_datasets
+     output_database: ${input_database}
+     datasets: cosmetics_store
+
++create_users_table:
+  td>:
+  query: "select distinct user_id from ${input_database}.cosmetics_store"
+  create_table: cosmetics_users
+
++create_master_segment:
+  py>: scripts.audience.create_master_segment
+  name: cosmetics_${session_id}
+  description: Cosmetics store audience
+  master:
+    database: ${output_database}
+    table: cosmetics_users
+  run: false
+  docker:
+    image: "digdag/digdag-python:3.9"
+  _env:
+    TD_API_KEY: ${secret:td.apikey}
+    TD_API_SERVER: "api.treasuredata.com"
+
++rfm_orders:
+  ipynb>:
+    notebook: RFM
+    input_table: ${input_database}.cosmetics_store
+    output_table: ${output_database}.rfm_output_cosmetics_store
+    user_column: user_id
+    # tstamp_column: event_time
+    # tstamp_column: tstamp
+    amount_column: price
+    audience_name: cosmetics_${session_id}
+
diff --git a/machine-learning-box/automl/scripts/audience.py b/machine-learning-box/automl/scripts/audience.py
index 2c17d725..e8445369 100644
--- a/machine-learning-box/automl/scripts/audience.py
+++ b/machine-learning-box/automl/scripts/audience.py
@@ -169,17 +169,32 @@ def add_attribute(
             assert audience_name is not None, "Either audience_id or audience_name argument is required"
             audience_id = self.get_parent_segment_id(audience_name)
 
-        # table = self.td_api.table(attr_db, attr_table)
-        # attr_type = resolve_type(table, "predicted_proba")
-
         res = self.cdp_api.put(f"/audiences/{audience_id}")
         if not res.ok:
             raise ApiRequestError(res)
         audience = res.json()
 
-        attributes = audience['attributes'] if 'attributes' in audience else []
+        if 'attributes' in audience:
+            attributes = audience['attributes']
+        else:
+            attributes = []
+            audience['attributes'] = attributes
+
         existing_attr_names = [attr['name'] for attr in attributes]
 
+        # Workaround: drop attributes sourced from this attribute table whose column no longer exists in its schema
+        if len(attributes) >= 1:
+            table = self.td_api.table(attr_db, attr_table)
+            existing_column_names = [col[2] if len(col) == 3 else col[0] for col in table.schema]
+            def remove_attribute(attr) -> bool:
+                if attr['parentDatabaseName'] == attr_db and attr['parentTableName'] == attr_table:
+                    if attr['parentColumn'] not in existing_column_names:
+                        print(f"⚠ Remove an attribute column '{attr['name']}' in Master Segment {audience_id} because '{attr['parentColumn']}' column does not exists in the Atrribute table '{attr_db}.{attr_table}'", file=sys.stderr)
+                        return True
+                return False
+            attributes[:] = [attr for attr in attributes if not remove_attribute(attr)]  # in place: `attributes` must remain the list held by audience['attributes']
+            existing_attr_names = [attr['name'] for attr in attributes]  # re-sync names/indexes with the filtered list
+
         for i, attr_column in enumerate(attr_columns):
             attr_alias = attr_aliases[i]
 
@@ -198,19 +213,19 @@ def add_attribute(
             if attr_alias in existing_attr_names:
                 if replace_attr_if_exists:
                     attributes[existing_attr_names.index(attr_alias)] = new_attr
-                    print(f"⚠ Replace an attribute column '{attr_alias}' in Master Segment {audience_id}", file=sys.stderr)
+                    print(f"⚠ Replace an existing attribute column '{attr_alias}' in Master Segment {audience_id}", file=sys.stderr)
                 else:
                     print(f"⚠ Skip adding an attribute because the attribute column '{attr_alias}' already exists", file=sys.stderr)
             else:
                 attributes.append(new_attr)
 
-        # from IPython.core.debugger import Pdb; Pdb().set_trace()
         res = self.cdp_api.put(f"/audiences/{audience_id}", json=audience)
         if res.ok:
             print(f"ⓘ Successfully added an attribute table '{attr_table}' to master segment {audience_id}", file=sys.stderr) 
         else:
             try:
-                assert 'not unique' in res.json()['base'][0]
+                res_value = res.json()['base'][0]
+                assert 'not unique' in res_value, f"Unexpected error: {res_value}"
                 print(f"⚠ Attribute '{attr_column}' already exists in Parent Segment and thus skip adding an attribue.", file=sys.stderr)
                 return
             except:
@@ -274,7 +289,9 @@ def create_folder(self, name: str, audience_id: str) -> str:
             raise ApiRequestError(folder, f"{folder.status_code} error on POST /audiences/{audience_id}/folders: {folder.json()}")
 
 
-    def create_segments(self, *, column_name: str, column_values: List[str], folder: Optional[str]="AutoML", audience_id: Optional[str]=None, audience_name: Optional[str]=None, rerun_master_segment: Optional[bool]=False):
+    def create_segments(self, *, column_name: str, column_values: List[str], folder: Optional[str]="AutoML",
+                        audience_id: Optional[str]=None, audience_name: Optional[str]=None, rerun_master_segment: Optional[bool]=False
+    ):
         assert len(column_values) >= 1, "At least 1 column_values are required."
         if audience_id is None:
             assert audience_name is not None, "Either audience_id or audience_name argument is required"

From 7b8065c3d0be9c37a9f12f316b5cffc722fe8e43 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Thu, 3 Aug 2023 17:07:50 +0900
Subject: [PATCH 43/47] Revised not to use custom script

---
 machine-learning-box/automl/rfm.dig | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/machine-learning-box/automl/rfm.dig b/machine-learning-box/automl/rfm.dig
index e9442533..645c2f8f 100644
--- a/machine-learning-box/automl/rfm.dig
+++ b/machine-learning-box/automl/rfm.dig
@@ -14,26 +14,7 @@ _export:
      output_database: ${input_database}
      datasets: cosmetics_store
 
-+create_users_table:
-  td>:
-  query: "select distinct user_id from ${input_database}.cosmetics_store"
-  create_table: cosmetics_users
-
-+create_master_segment:
-  py>: scripts.audience.create_master_segment
-  name: cosmetics_${session_id}
-  description: Cosmetics store audience
-  master:
-    database: ${output_database}
-    table: cosmetics_users
-  run: false
-  docker:
-    image: "digdag/digdag-python:3.9"
-  _env:
-    TD_API_KEY: ${secret:td.apikey}
-    TD_API_SERVER: "api.treasuredata.com"
-
-+rfm_orders:
++run_rfm:
   ipynb>:
     notebook: RFM
     input_table: ${input_database}.cosmetics_store
@@ -43,4 +24,3 @@ _export:
     # tstamp_column: tstamp
     amount_column: price
     audience_name: cosmetics_${session_id}
-

From 04eb2048d50293c4c9c1086f73061ac69d4e080d Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Tue, 26 Sep 2023 16:50:44 +0900
Subject: [PATCH 44/47] Added clustering example

---
 machine-learning-box/automl/clustering.dig | 23 ++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 machine-learning-box/automl/clustering.dig

diff --git a/machine-learning-box/automl/clustering.dig b/machine-learning-box/automl/clustering.dig
new file mode 100644
index 00000000..b204cb5a
--- /dev/null
+++ b/machine-learning-box/automl/clustering.dig
@@ -0,0 +1,23 @@
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["${output_database}"]
+
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ${input_database}  # from config/params.yaml, as in the other workflows
+    datasets: dermatology
+
++clustering_gluon_new_model:
+  ipynb>:
+    notebook: clustering
+    input_table: ${input_database}.dermatology  # table loaded by +load_datasets
+    output_table: ${output_database}.dermatology_clusters_${session_id}
+    export_feature_importance: ${output_database}.feature_importance_${session_id}
+    export_shap_values: ${output_database}.shap_values_${session_id}

From 44f72f42e599a1175baecabcfeb2e9713a94c2dd Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Mon, 4 Dec 2023 23:42:56 +0900
Subject: [PATCH 45/47] Added CLTV notebook

---
 machine-learning-box/automl/cltv.dig | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 machine-learning-box/automl/cltv.dig

diff --git a/machine-learning-box/automl/cltv.dig b/machine-learning-box/automl/cltv.dig
new file mode 100644
index 00000000..b1a6ddcc
--- /dev/null
+++ b/machine-learning-box/automl/cltv.dig
@@ -0,0 +1,26 @@
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["${output_database}"]
+
++load_datasets:
+   ipynb>:
+     notebook: ml_datasets
+     output_database: ${input_database}
+     datasets: online_retail
+
++run_cltv:
+  ipynb>:
+    branch: ATML-174-cltv
+    notebook: CLTV
+    input_table: ${input_database}.online_retail_txn
+    output_table: ${output_database}.online_retail_cltv_result
+    user_column: customerid
+    tstamp_column: invoicedate
+    amount_column: purchaseamount
+    audience_name: online_retail_cltv

From 6d45d53347f6a12399904b0cf1cc4627ef098754 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Tue, 5 Dec 2023 13:57:44 +0900
Subject: [PATCH 46/47] Added branch option

---
 machine-learning-box/automl/cltv.dig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/machine-learning-box/automl/cltv.dig b/machine-learning-box/automl/cltv.dig
index b1a6ddcc..be1e629d 100644
--- a/machine-learning-box/automl/cltv.dig
+++ b/machine-learning-box/automl/cltv.dig
@@ -10,6 +10,7 @@ _export:
 
 +load_datasets:
    ipynb>:
+     branch: ATML-174-cltv
      notebook: ml_datasets
      output_database: ${input_database}
      datasets: online_retail

From 708c760ce5b38f81558ef809f4692e1171558ca7 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Mon, 18 Dec 2023 17:54:07 +0900
Subject: [PATCH 47/47] Removed branch

---
 machine-learning-box/automl/cltv.dig | 2 --
 1 file changed, 2 deletions(-)

diff --git a/machine-learning-box/automl/cltv.dig b/machine-learning-box/automl/cltv.dig
index be1e629d..7137a69a 100644
--- a/machine-learning-box/automl/cltv.dig
+++ b/machine-learning-box/automl/cltv.dig
@@ -10,14 +10,12 @@ _export:
 
 +load_datasets:
    ipynb>:
-     branch: ATML-174-cltv
      notebook: ml_datasets
      output_database: ${input_database}
      datasets: online_retail
 
 +run_cltv:
   ipynb>:
-    branch: ATML-174-cltv
     notebook: CLTV
     input_table: ${input_database}.online_retail_txn
     output_table: ${output_database}.online_retail_cltv_result