mlcommons · hiwotadese · Jul 31, 2024 · Jun 13, 2024
@@ -0,0 +1,8 @@
+megatron/__pycache__/
+megatron/data/__pycache__/
+megatron/model/__pycache__/
+megatron/mpu/__pycache__/
+megatron/optimizer/__pycache__/
+megatron/tokenizer/__pycache__/
+megatron/fused_kernels/__pycache__/
+megatron/fused_kernels/build/
@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.04-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.04-py3
 FROM ${FROM_IMAGE_NAME}
 
 # Copy code

@@ -1,6 +1,6 @@
 The following applies to all files unless otherwise noted:
 
-# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions

@@ -7,7 +7,7 @@ Our codebase is capable of training large language models with both model and da
 
 ### Steps to configure machine
 
-To use this repository, please install a supported version of PyTorch with GPU support (python 3.8, pytorch 1.12, cuda 11.6.2, and nccl 2.12.10 and above) and NVIDIA [APEX](https://github.com/NVIDIA/apex#quick-start). We recommend using one of [NGC's PyTorch containers](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch). The latest tested compatible version is `nvcr.io/nvidia/pytorch:22.04-py3`).
+To use this repository, please install a supported version of PyTorch with GPU support (Python 3.8, PyTorch 1.12, CUDA 11.6.2, and NCCL 2.12.10, or later) and NVIDIA [APEX](https://github.com/NVIDIA/apex#quick-start). We recommend using one of [NGC's PyTorch containers](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch). The latest tested compatible version is `nvcr.io/nvidia/pytorch:24.04-py3`.
 
 ### Steps to run and time
 
@@ -256,3 +256,7 @@ cd scripts
 sbatch preprocess.sh <path to c4>
 sbatch preprocess_val.sh <path to c4> <path to validation json>
 ```
+
+# 4. Model
+### Publication/Attribution
+Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf) and [2](https://arxiv.org/pdf/2104.04473.pdf)) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA.
@@ -206,6 +206,7 @@ def validate_args(args, defaults={}):
     # Consumed tokens.
     args.consumed_train_samples = 0
     args.consumed_valid_samples = 0
+    args.tokens_per_batch = args.eval_interval * args.global_batch_size * args.seq_length
 
     # Iteration-based training.
     if args.train_iters:

@@ -95,7 +95,7 @@ def write_longs(f, a):
     3: np.int16,
     4: np.int32,
     5: np.int64,
-    6: np.float,
+    6: float,
     7: np.double,
     8: np.uint16
 }
@@ -268,7 +268,7 @@ class IndexedDatasetBuilder(object):
         np.int16: 2,
         np.int32: 4,
         np.int64: 8,
-        np.float: 4,
+        float: 4,
         np.double: 8
     }