From c5e71d3a3856db12c7bc459080528702fba9a494 Mon Sep 17 00:00:00 2001
From: Wesley
Date: Thu, 31 Mar 2022 17:30:46 +0800
Subject: [PATCH 1/5] test pp + tp1d

---
 bert/colossalai_utils/model_zoo/colo_tp1dcol_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bert/colossalai_utils/model_zoo/colo_tp1dcol_bert.py b/bert/colossalai_utils/model_zoo/colo_tp1dcol_bert.py
index 8875ed5..6e869d6 100644
--- a/bert/colossalai_utils/model_zoo/colo_tp1dcol_bert.py
+++ b/bert/colossalai_utils/model_zoo/colo_tp1dcol_bert.py
@@ -195,8 +195,8 @@ class BertOutput(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.dense = col_nn.Linear(config.intermediate_size, config.hidden_size)
-        self.LayerNorm = col_nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = col_nn.Dropout(config.hidden_dropout_prob)
+        self.LayerNorm = col_nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)

From 582084791a1bebe350db27293bcc6bf224beeb7f Mon Sep 17 00:00:00 2001
From: Wesley
Date: Thu, 31 Mar 2022 17:51:47 +0800
Subject: [PATCH 2/5] add tp1dpp

---
 bert/colossalai_utils/bert_config_tp1dpp.json | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 bert/colossalai_utils/bert_config_tp1dpp.json

diff --git a/bert/colossalai_utils/bert_config_tp1dpp.json b/bert/colossalai_utils/bert_config_tp1dpp.json
new file mode 100644
index 0000000..fcc687c
--- /dev/null
+++ b/bert/colossalai_utils/bert_config_tp1dpp.json
@@ -0,0 +1,20 @@
+{
+    "method": "colossalai",
+    "model": {
+        "type": "bert_small"
+    },
+    "hyperparameter": {
+        "batch_size": 8,
+        "num_epochs": 20,
+        "steps_per_epoch": 10
+    },
+    "gradient_clipping": 1.0,
+    "parallel": {
+        "pipeline": 2,
+        "tensor": {
+            "mode": "1d",
+            "size": 2
+        }
+    },
+    "use_mem_monitor": true
+}

From 0031bfb553e50ca171eced9c58d9da22b2ede47b Mon Sep 17 00:00:00 2001
From: Wesley
Date: Fri, 1 Apr 2022 17:34:04 +0800
Subject: [PATCH 3/5] update vocab size to pass check in divide()

---
 bert/common/helper.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/bert/common/helper.py b/bert/common/helper.py
index 8a783aa..0ed946d 100644
--- a/bert/common/helper.py
+++ b/bert/common/helper.py
@@ -9,7 +9,7 @@
 
 _bert_small = dict(
     seq_length=512,
-    vocab_size=32400,
+    vocab_size=50304,
     hidden_size=768,
     num_heads=12,
     depth=12,
@@ -77,8 +77,6 @@ def tokenize(examples, mode='concat'):
                                   keep_in_memory=True,
                                   remove_columns='text')
 
-    CONFIG['model']['vocab_size'] = len(tokenizer)
-
     def seed_worker(_):
         worker_seed = 1024
         np.random.seed(worker_seed)

From 21ab7bcfe579487bc30353be5d65bc9a58fdf0e8 Mon Sep 17 00:00:00 2001
From: Wesley
Date: Sat, 2 Apr 2022 16:26:48 +0800
Subject: [PATCH 4/5] refine code

---
 bert/colossalai_utils/bert_config_pp.json     |  2 +-
 bert/colossalai_utils/bert_config_tp1d.json   |  7 ++--
 bert/colossalai_utils/bert_config_tp1dpp.json |  2 +-
 bert/colossalai_utils/bert_config_tp2d.json   |  2 +-
 bert/colossalai_utils/bert_config_tp2p5d.json |  2 +-
 bert/colossalai_utils/bert_config_tp3d.json   |  2 +-
 ...nfig_zerov2.json => bert_config_zero.json} |  2 +-
 .../bert_config_zerotppp.json                 | 35 +++++++++++++++++++
 bert/colossalai_utils/model_zoo/__init__.py   |  2 +-
 .../{colo_tp1dcol_bert.py => colo_bert.py}    |  5 +--
 bert/colossalai_utils/utils.py                |  6 ++--
 bert/common/helper.py                         | 20 ++++++++---
 bert/torch_utils/bert_config.json             | 20 -----------
 13 files changed, 66 insertions(+), 41 deletions(-)
 rename bert/colossalai_utils/{bert_config_zerov2.json => bert_config_zero.json} (95%)
 create mode 100644 bert/colossalai_utils/bert_config_zerotppp.json
 rename bert/colossalai_utils/model_zoo/{colo_tp1dcol_bert.py => colo_bert.py} (97%)
 delete mode 100644 bert/torch_utils/bert_config.json

diff --git a/bert/colossalai_utils/bert_config_pp.json b/bert/colossalai_utils/bert_config_pp.json
index e8e4734..e6eedec 100644
--- a/bert/colossalai_utils/bert_config_pp.json
+++ b/bert/colossalai_utils/bert_config_pp.json
@@ -1,7 +1,7 @@
 {
     "method": "colossalai",
     "model": {
-        "type": "bert_small"
+        "type": "bert_base"
     },
     "hyperparameter": {
         "batch_size": 8,
diff --git a/bert/colossalai_utils/bert_config_tp1d.json b/bert/colossalai_utils/bert_config_tp1d.json
index db1acd6..f78347c 100644
--- a/bert/colossalai_utils/bert_config_tp1d.json
+++ b/bert/colossalai_utils/bert_config_tp1d.json
@@ -1,19 +1,22 @@
 {
     "method": "colossalai",
     "model": {
-        "type": "bert_small"
+        "type": "bert_base"
     },
     "hyperparameter": {
         "batch_size": 8,
         "num_epochs": 20,
         "steps_per_epoch": 10
     },
+    "fp16": {
+        "mode": "AMP_TYPE.NAIVE"
+    },
     "gradient_clipping": 1.0,
     "parallel": {
         "pipeline": 1,
         "tensor": {
             "mode": "1d",
-            "size": 2
+            "size": 1
         }
     },
     "use_mem_monitor": true
diff --git a/bert/colossalai_utils/bert_config_tp1dpp.json b/bert/colossalai_utils/bert_config_tp1dpp.json
index fcc687c..11bbdb8 100644
--- a/bert/colossalai_utils/bert_config_tp1dpp.json
+++ b/bert/colossalai_utils/bert_config_tp1dpp.json
@@ -1,7 +1,7 @@
 {
     "method": "colossalai",
     "model": {
-        "type": "bert_small"
+        "type": "bert_base"
     },
     "hyperparameter": {
         "batch_size": 8,
diff --git a/bert/colossalai_utils/bert_config_tp2d.json b/bert/colossalai_utils/bert_config_tp2d.json
index b3cebbf..260dd20 100644
--- a/bert/colossalai_utils/bert_config_tp2d.json
+++ b/bert/colossalai_utils/bert_config_tp2d.json
@@ -1,7 +1,7 @@
 {
     "method": "colossalai",
     "model": {
-        "type": "bert_small"
+        "type": "bert_base"
     },
     "hyperparameter": {
         "batch_size": 8,
diff --git a/bert/colossalai_utils/bert_config_tp2p5d.json b/bert/colossalai_utils/bert_config_tp2p5d.json
index e1bf528..3cfebea 100644
--- a/bert/colossalai_utils/bert_config_tp2p5d.json
+++ b/bert/colossalai_utils/bert_config_tp2p5d.json
@@ -1,7 +1,7 @@
 {
     "method": "colossalai",
     "model": {
-        "type": "bert_small"
+        "type": "bert_base"
    },
     "hyperparameter": {
         "batch_size": 8,
diff --git a/bert/colossalai_utils/bert_config_tp3d.json b/bert/colossalai_utils/bert_config_tp3d.json
index daca835..f877f81 100644
--- a/bert/colossalai_utils/bert_config_tp3d.json
+++ b/bert/colossalai_utils/bert_config_tp3d.json
@@ -1,7 +1,7 @@
 {
     "method": "colossalai",
     "model": {
-        "type": "bert_small"
+        "type": "bert_base"
     },
     "hyperparameter": {
         "batch_size": 8,
diff --git a/bert/colossalai_utils/bert_config_zerov2.json b/bert/colossalai_utils/bert_config_zero.json
similarity index 95%
rename from bert/colossalai_utils/bert_config_zerov2.json
rename to bert/colossalai_utils/bert_config_zero.json
index 286f9df..b6626bf 100644
--- a/bert/colossalai_utils/bert_config_zerov2.json
+++ b/bert/colossalai_utils/bert_config_zero.json
@@ -1,7 +1,7 @@
 {
     "method": "colossalai",
     "model": {
-        "type": "bert_small"
+        "type": "bert_base"
     },
     "hyperparameter": {
         "batch_size": 8,
diff --git a/bert/colossalai_utils/bert_config_zerotppp.json b/bert/colossalai_utils/bert_config_zerotppp.json
new file mode 100644
index 0000000..46b1cdd
--- /dev/null
+++ b/bert/colossalai_utils/bert_config_zerotppp.json
@@ -0,0 +1,35 @@
+{
+    "method": "colossalai",
+    "model": {
+        "type": "bert_base"
+    },
+    "hyperparameter": {
+        "batch_size": 8,
+        "num_epochs": 100,
+        "steps_per_epoch": 10
+    },
+    "gradient_clipping": 1.0,
+    "zero": {
+        "model_config": {
+            "offload_config": {
+                "device": "cpu"
+            }
+        },
+        "optimizer_config": {
+            "cpu_offload": true,
+            "initial_scale": 256,
+            "min_scale": 1,
+            "growth_factor": 2.0,
+            "backoff_factor": 0.5,
+            "growth_interval": 1000
+        }
+    },
+    "parallel": {
+        "pipeline":1,
+        "tensor": {
+            "mode": "1d",
+            "size": 2
+        }
+    },
+    "use_mem_monitor": true
+}
diff --git a/bert/colossalai_utils/model_zoo/__init__.py b/bert/colossalai_utils/model_zoo/__init__.py
index 2799632..b1df787 100644
--- a/bert/colossalai_utils/model_zoo/__init__.py
+++ b/bert/colossalai_utils/model_zoo/__init__.py
@@ -1,3 +1,3 @@
-from .colo_tp1dcol_bert import create_colo_bert_pipeline_model, ColoBertForMaskedLM, ColoBertMaskedLMLoss
+from .colo_bert import create_colo_bert_pipeline_model, ColoBertForMaskedLM, ColoBertMaskedLMLoss
 
 __all__ = ['create_colo_bert_pipeline_model', 'ColoBertForMaskedLM', 'ColoBertMaskedLMLoss']
diff --git a/bert/colossalai_utils/model_zoo/colo_tp1dcol_bert.py b/bert/colossalai_utils/model_zoo/colo_bert.py
similarity index 97%
rename from bert/colossalai_utils/model_zoo/colo_tp1dcol_bert.py
rename to bert/colossalai_utils/model_zoo/colo_bert.py
index 6e869d6..3a48031 100644
--- a/bert/colossalai_utils/model_zoo/colo_tp1dcol_bert.py
+++ b/bert/colossalai_utils/model_zoo/colo_bert.py
@@ -682,7 +682,7 @@ def __init__(self, config):
 
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
-        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+        self.decoder = col_nn.Classifier(config.hidden_size, config.vocab_size, bias=True)
 
     def forward(self, hidden_states):
         ###print("BertLMPredictionHead:input:", hidden_states.shape)
@@ -712,9 +712,6 @@ def __init__(self, config):
         self.bert = BertModel(config, add_pooling_layer=False)
         self.cls = BertOnlyMLMHead(config)
 
-        # Initialize weights and apply final processing
-        self.post_init()
-
     def get_output_embeddings(self):
         return self.cls.predictions.decoder
 
diff --git a/bert/colossalai_utils/utils.py b/bert/colossalai_utils/utils.py
index 669819e..4084052 100644
--- a/bert/colossalai_utils/utils.py
+++ b/bert/colossalai_utils/utils.py
@@ -8,10 +8,8 @@ def init_w_col(builder):
     from colossalai.core import global_context as gpc
     from colossalai.nn.optimizer import CPUAdam
     from colossalai.zero.init_ctx import ZeroInitContext
-    from colossalai.zero.shard_utils import (BucketTensorShardStrategy,
-                                             TensorShardStrategy)
-    from colossalai.zero.sharded_model import ShardedModelV2
-    from colossalai.zero.sharded_optim import ShardedOptimizerV2
+    from colossalai.zero.shard_utils import (BucketTensorShardStrategy)
+    from colossalai.amp import AMP_TYPE
 
     colossalai.launch_from_torch(config=CONFIG)
 
diff --git a/bert/common/helper.py b/bert/common/helper.py
index 0ed946d..e726832 100644
--- a/bert/common/helper.py
+++ b/bert/common/helper.py
@@ -5,9 +5,9 @@
 from transformers import BertConfig, BertTokenizer
 
 from zero.common.utils import CONFIG, ModelFromHF, get_model_size
-from bert.colossalai_utils.model_zoo.colo_tp1dcol_bert import ColoBertMaskedLMLoss, ColoBertForMaskedLM, create_colo_bert_pipeline_model
+from bert.colossalai_utils.model_zoo.colo_bert import ColoBertMaskedLMLoss, ColoBertForMaskedLM, create_colo_bert_pipeline_model
 
-_bert_small = dict(
+_bert_base = dict(
     seq_length=512,
     vocab_size=50304,
     hidden_size=768,
@@ -18,9 +18,21 @@
     evaluation='ppl',
 )
 
+_bert_large = dict(
+    seq_length=512,
+    vocab_size=50304,
+    hidden_size=1024,
+    num_heads=16,
+    depth=24,
+    ff_size=3072,
+    checkpoint=False,
+    evaluation='ppl',
+)
+
 _bert_configurations = dict(
-    bert=_bert_small,
-    bert_small=_bert_small,
+    bert=_bert_base,
+    bert_base=_bert_base,
+    bert_large=_bert_large
 )
 
 _default_hyperparameters = dict(
diff --git a/bert/torch_utils/bert_config.json b/bert/torch_utils/bert_config.json
deleted file mode 100644
index dc0640a..0000000
--- a/bert/torch_utils/bert_config.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
-    "method": "torch",
-    "model": {
-        "type": "bert_small"
-    },
-    "hyperparameter": {
-        "batch_size": 8,
-        "num_epochs": 300,
-        "steps_per_epoch": 10
-    },
-    "fp16": {
-        "enabled": true,
-        "init_scale": 32768,
-        "growth_factor": 2.0,
-        "backoff_factor": 0.5,
-        "growth_interval": 1000
-    },
-    "gradient_clipping": 1.0,
-    "use_mem_monitor": true
-}

From 5ce8f1b09f8de68387a67d9e788797be76134c99 Mon Sep 17 00:00:00 2001
From: Wesley
Date: Sat, 2 Apr 2022 18:07:59 +0800
Subject: [PATCH 5/5] Add README

---
 bert/README.md                              | 22 +++++++++++++++++++
 bert/colossalai_utils/bert_config_tp1d.json |  7 ++-----
 bert/colossalai_utils/requirement.txt       |  7 +++++++
 bert/colossalai_utils/utils.py              |  4 +++-
 4 files changed, 34 insertions(+), 6 deletions(-)
 create mode 100644 bert/README.md
 create mode 100644 bert/colossalai_utils/requirement.txt

diff --git a/bert/README.md b/bert/README.md
new file mode 100644
index 0000000..4924ad9
--- /dev/null
+++ b/bert/README.md
@@ -0,0 +1,22 @@
+# Bert Benchmark
+Bert Benchmark with data parallel, tensor parallel(tp), pipeline parallel(pp) and ZeRO.
+
+## Setup
+1. Install dependencies if you do not have them
+```
+pip install -r requirement.txt
+```
+
+2. Add root dir into PYTHONPATH
+```
+export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+```
+
+## Bert Usage
+
+1. Prepare datasets and tokenizers from HuggingFace Hub if necessary (e.g. we provide an example of training `wikitext-2`).
+
+2. Run benchmark with one of the systems to evaluate
+```
+DATA=/PATH/TO/DATASET TOKENIZER=/PATH/TO/TOKENIZER LOG=/PATH/TO/LOG torchrun --nproc_per_node=NUM_GPUS run.py --config=CONFIG_FILE
+```
\ No newline at end of file
diff --git a/bert/colossalai_utils/bert_config_tp1d.json b/bert/colossalai_utils/bert_config_tp1d.json
index f78347c..a835a03 100644
--- a/bert/colossalai_utils/bert_config_tp1d.json
+++ b/bert/colossalai_utils/bert_config_tp1d.json
@@ -5,18 +5,15 @@
     },
     "hyperparameter": {
         "batch_size": 8,
-        "num_epochs": 20,
+        "num_epochs": 10,
         "steps_per_epoch": 10
     },
-    "fp16": {
-        "mode": "AMP_TYPE.NAIVE"
-    },
     "gradient_clipping": 1.0,
     "parallel": {
         "pipeline": 1,
         "tensor": {
             "mode": "1d",
-            "size": 1
+            "size": 2
         }
     },
     "use_mem_monitor": true
diff --git a/bert/colossalai_utils/requirement.txt b/bert/colossalai_utils/requirement.txt
new file mode 100644
index 0000000..24fcf65
--- /dev/null
+++ b/bert/colossalai_utils/requirement.txt
@@ -0,0 +1,7 @@
+
+torch>=1.10 -f https://download.pytorch.org/whl/cu113/torch_stable.html
+torchvision -f https://download.pytorch.org/whl/cu113/torch_stable.html
+transformers
+datasets
+colossalai
+rich
\ No newline at end of file
diff --git a/bert/colossalai_utils/utils.py b/bert/colossalai_utils/utils.py
index 4084052..abad460 100644
--- a/bert/colossalai_utils/utils.py
+++ b/bert/colossalai_utils/utils.py
@@ -9,7 +9,9 @@ def init_w_col(builder):
     from colossalai.nn.optimizer import CPUAdam
     from colossalai.zero.init_ctx import ZeroInitContext
     from colossalai.zero.shard_utils import (BucketTensorShardStrategy)
-    from colossalai.amp import AMP_TYPE
+
+    from colossalai.utils.memory_utils.utils import colo_set_process_memory_fraction
+    colo_set_process_memory_fraction(0.2)
 
     colossalai.launch_from_torch(config=CONFIG)
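
Reviewer note, appended for context and not part of the patch series: the `parallel` block that these configs introduce is what `colossalai.launch_from_torch(config=CONFIG)` in `bert/colossalai_utils/utils.py` consumes, and the product of `pipeline` and `tensor.size` is the number of ranks one model replica occupies, so for a single-node run the `--nproc_per_node` value in the README command has to be a multiple of that product (4 for `bert_config_tp1dpp.json`). The sketch below only illustrates that relationship; loading `CONFIG` with `json.load` is an assumption about how the `--config` file is read, since that code is not shown in this series.

```
# Illustrative sketch only. Reading the JSON config directly is an assumption;
# the patches above only show that utils.py calls launch_from_torch(config=CONFIG).
import json

import colossalai

with open('bert/colossalai_utils/bert_config_tp1dpp.json') as f:
    CONFIG = json.load(f)

# pipeline=2 and tensor size=2 -> each model replica spans 2 * 2 = 4 ranks,
# so torchrun must start a multiple of 4 processes for this config.
parallel = CONFIG['parallel']
print('ranks per replica:', parallel['pipeline'] * parallel['tensor']['size'])

# Reads RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT set by torchrun, then builds the
# data-, pipeline- and tensor-parallel process groups described in CONFIG['parallel'].
colossalai.launch_from_torch(config=CONFIG)
```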