This repository has been archived by the owner on Oct 16, 2023. It is now read-only.

refine code and add README to Bert #41

Merged · 6 commits · Apr 2, 2022
22 changes: 22 additions & 0 deletions bert/README.md
@@ -0,0 +1,22 @@
# BERT Benchmark
A BERT benchmark covering data parallelism, tensor parallelism (TP), pipeline parallelism (PP), and ZeRO.

## Setup
1. Install the dependencies if you do not already have them:
```
pip install -r requirement.txt
```

2. Add the root directory to `PYTHONPATH`:
```
export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
```

## BERT Usage

1. Prepare the datasets and tokenizers from the Hugging Face Hub if necessary (e.g., we provide an example of training on `wikitext-2`).

2. Run the benchmark with one of the systems to evaluate:
```
DATA=/PATH/TO/DATASET TOKENIZER=/PATH/TO/TOKENIZER LOG=/PATH/TO/LOG torchrun --nproc_per_node=NUM_GPUS run.py --config=CONFIG_FILE
```
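
For example, a hypothetical single-node run on 4 GPUs using the 1D tensor-parallel config added in this PR (the dataset, tokenizer, and log paths are placeholders):
```
DATA=./data/wikitext-2 TOKENIZER=./tokenizers/bert-base-uncased LOG=./logs/tp1d \
torchrun --nproc_per_node=4 run.py --config=colossalai_utils/bert_config_tp1d.json
```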
2 changes: 1 addition & 1 deletion bert/colossalai_utils/bert_config_pp.json
@@ -1,7 +1,7 @@
{
"method": "colossalai",
"model": {
"type": "bert_small"
"type": "bert_base"
},
"hyperparameter": {
"batch_size": 8,
4 changes: 2 additions & 2 deletions bert/colossalai_utils/bert_config_tp1d.json
@@ -1,11 +1,11 @@
{
"method": "colossalai",
"model": {
"type": "bert_small"
"type": "bert_base"
},
"hyperparameter": {
"batch_size": 8,
"num_epochs": 20,
"num_epochs": 10,
"steps_per_epoch": 10
},
"gradient_clipping": 1.0,
2 changes: 1 addition & 1 deletion bert/colossalai_utils/bert_config_tp1dpp.json
@@ -1,7 +1,7 @@
{
"method": "colossalai",
"model": {
"type": "bert_small"
"type": "bert_base"
},
"hyperparameter": {
"batch_size": 8,
2 changes: 1 addition & 1 deletion bert/colossalai_utils/bert_config_tp2d.json
@@ -1,7 +1,7 @@
{
"method": "colossalai",
"model": {
"type": "bert_small"
"type": "bert_base"
},
"hyperparameter": {
"batch_size": 8,
2 changes: 1 addition & 1 deletion bert/colossalai_utils/bert_config_tp2p5d.json
@@ -1,7 +1,7 @@
{
"method": "colossalai",
"model": {
"type": "bert_small"
"type": "bert_base"
},
"hyperparameter": {
"batch_size": 8,
2 changes: 1 addition & 1 deletion bert/colossalai_utils/bert_config_tp3d.json
@@ -1,7 +1,7 @@
{
"method": "colossalai",
"model": {
"type": "bert_small"
"type": "bert_base"
},
"hyperparameter": {
"batch_size": 8,
@@ -1,7 +1,7 @@
{
"method": "colossalai",
"model": {
"type": "bert_small"
"type": "bert_base"
},
"hyperparameter": {
"batch_size": 8,
35 changes: 35 additions & 0 deletions bert/colossalai_utils/bert_config_zerotppp.json
@@ -0,0 +1,35 @@
{
"method": "colossalai",
"model": {
"type": "bert_base"
},
"hyperparameter": {
"batch_size": 8,
"num_epochs": 100,
"steps_per_epoch": 10
},
"gradient_clipping": 1.0,
"zero": {
"model_config": {
"offload_config": {
"device": "cpu"
}
},
"optimizer_config": {
"cpu_offload": true,
"initial_scale": 256,
"min_scale": 1,
"growth_factor": 2.0,
"backoff_factor": 0.5,
"growth_interval": 1000
}
},
"parallel": {
"pipeline":1,
"tensor": {
"mode": "1d",
"size": 2
}
},
"use_mem_monitor": true
}
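
The `optimizer_config` block above describes a standard dynamic loss-scale schedule: back off when gradients overflow, grow again after a run of clean steps. A minimal sketch of the update rule these keys imply (an illustration of the technique, not ColossalAI's actual implementation):
```
# Illustrative dynamic loss scaling driven by the config keys above.
class DynamicLossScale:
    def __init__(self, initial_scale=256, min_scale=1, growth_factor=2.0,
                 backoff_factor=0.5, growth_interval=1000):
        self.scale = float(initial_scale)
        self.min_scale = min_scale
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self.good_steps = 0  # consecutive steps without overflow

    def update(self, overflow: bool):
        if overflow:
            # Back off, but never drop below min_scale.
            self.scale = max(self.scale * self.backoff_factor, self.min_scale)
            self.good_steps = 0
        else:
            self.good_steps += 1
            if self.good_steps >= self.growth_interval:
                # Grow the scale after growth_interval clean steps.
                self.scale *= self.growth_factor
                self.good_steps = 0
```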
2 changes: 1 addition & 1 deletion bert/colossalai_utils/model_zoo/__init__.py
@@ -1,3 +1,3 @@
- from .colo_tp1dcol_bert import create_colo_bert_pipeline_model, ColoBertForMaskedLM, ColoBertMaskedLMLoss
+ from .colo_bert import create_colo_bert_pipeline_model, ColoBertForMaskedLM, ColoBertMaskedLMLoss

__all__ = ['create_colo_bert_pipeline_model', 'ColoBertForMaskedLM', 'ColoBertMaskedLMLoss']
@@ -682,7 +682,7 @@ def __init__(self, config):

# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
- self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+ self.decoder = col_nn.Classifier(config.hidden_size, config.vocab_size, bias=True)

def forward(self, hidden_states):
###print("BertLMPredictionHead:input:", hidden_states.shape)
@@ -712,9 +712,6 @@ def __init__(self, config):
self.bert = BertModel(config, add_pooling_layer=False)
self.cls = BertOnlyMLMHead(config)

- # Initialize weights and apply final processing
- self.post_init()

def get_output_embeddings(self):
return self.cls.predictions.decoder

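For context, the `get_output_embeddings` hook kept above is what HuggingFace-style weight tying uses to share the decoder weight with the input embeddings. A minimal sketch of that standard pattern, assuming the usual `BertModel` attribute layout (not code from this PR):
```
# Standard HF-style weight tying: decoder and input embeddings share one tensor.
def tie_output_embeddings(model):
    decoder = model.get_output_embeddings()            # cls.predictions.decoder
    word_emb = model.bert.embeddings.word_embeddings   # input embedding layer
    decoder.weight = word_emb.weight                   # same Parameter object
```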
7 changes: 7 additions & 0 deletions bert/colossalai_utils/requirement.txt
@@ -0,0 +1,7 @@

torch>=1.10 -f https://download.pytorch.org/whl/cu113/torch_stable.html
torchvision -f https://download.pytorch.org/whl/cu113/torch_stable.html
transformers
datasets
colossalai
rich
8 changes: 4 additions & 4 deletions bert/colossalai_utils/utils.py
@@ -8,10 +8,10 @@ def init_w_col(builder):
from colossalai.core import global_context as gpc
from colossalai.nn.optimizer import CPUAdam
from colossalai.zero.init_ctx import ZeroInitContext
- from colossalai.zero.shard_utils import (BucketTensorShardStrategy,
- TensorShardStrategy)
- from colossalai.zero.sharded_model import ShardedModelV2
- from colossalai.zero.sharded_optim import ShardedOptimizerV2
+ from colossalai.zero.shard_utils import (BucketTensorShardStrategy)

+ from colossalai.utils.memory_utils.utils import colo_set_process_memory_fraction
+ colo_set_process_memory_fraction(0.2)

colossalai.launch_from_torch(config=CONFIG)

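For reference, a minimal sketch of how the imports that survive this change are typically wired together in ColossalAI of this era; the signatures here are assumptions inferred from the import names, not verified against this PR:
```
# Sketch only (assumed ColossalAI ~0.1.x API). Parameters are sharded as they
# are created inside ZeroInitContext; CPUAdam keeps optimizer state on the CPU.
shard_strategy = BucketTensorShardStrategy()
with ZeroInitContext(target_device=torch.device('cuda'),
                     shard_strategy=shard_strategy,
                     shard_param=True):
    model = builder()  # hypothetical model builder, as passed to init_w_col
optimizer = CPUAdam(model.parameters(), lr=1e-3)
```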
20 changes: 16 additions & 4 deletions bert/common/helper.py
@@ -5,9 +5,9 @@
from transformers import BertConfig, BertTokenizer

from zero.common.utils import CONFIG, ModelFromHF, get_model_size
- from bert.colossalai_utils.model_zoo.colo_tp1dcol_bert import ColoBertMaskedLMLoss, ColoBertForMaskedLM, create_colo_bert_pipeline_model
+ from bert.colossalai_utils.model_zoo.colo_bert import ColoBertMaskedLMLoss, ColoBertForMaskedLM, create_colo_bert_pipeline_model

- _bert_small = dict(
+ _bert_base = dict(
seq_length=512,
vocab_size=50304,
hidden_size=768,
@@ -18,9 +18,21 @@
evaluation='ppl',
)

+ _bert_large = dict(
+ seq_length=512,
+ vocab_size=50304,
+ hidden_size=1024,
+ num_heads=16,
+ depth=24,
+ ff_size=3072,
+ checkpoint=False,
+ evaluation='ppl',
+ )

_bert_configurations = dict(
- bert=_bert_small,
- bert_small=_bert_small,
+ bert=_bert_base,
+ bert_base=_bert_base,
+ bert_large=_bert_large
)

_default_hyperparameters = dict(
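With the table above, a config file selects a model by name. A minimal sketch of the lookup, assuming `CONFIG` is the parsed JSON config imported at the top of this file:
```
# Hypothetical lookup: map the config's "model.type" string to its dict.
model_type = CONFIG['model']['type']           # e.g. "bert_base" or "bert_large"
model_cfg = _bert_configurations[model_type]   # seq_length, hidden_size, depth, ...
```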
20 changes: 0 additions & 20 deletions bert/torch_utils/bert_config.json

This file was deleted.