Merge pull request #101 from dice-group/develop
Develop
Demirrr authored Apr 26, 2023
2 parents 736b5e0 + 13680c3 commit 14dcd84
Showing 4 changed files with 43 additions and 23 deletions.
16 changes: 10 additions & 6 deletions dicee/trainer/torch_trainer_ddp.py
@@ -67,7 +67,7 @@ def fit(self, *args, **kwargs):

# (2) Initialize OPTIMIZER.
optimizer = model.configure_optimizers()
# (3) Create a static DDB Trainer.
# (3) Start NodeTrainer.
NodeTrainer(model, train_dataset_loader, optimizer, self.callbacks, self.attributes.num_epochs).train()
torch.distributed.destroy_process_group()
self.on_fit_end(self, model)
@@ -100,22 +100,24 @@ def __init__(self,
optimizer: torch.optim.Optimizer,
callbacks,
num_epochs: int) -> None:
# (1) Local and Global Ranks.
self.local_rank = int(os.environ["LOCAL_RANK"])
self.global_rank = int(os.environ["RANK"])
# (2) Send model to the local rank. (Check whether this is unnecessary, since we wrap it with DDP below.)
self.model = model.to(self.local_rank)
self.train_dataset_loader = train_dataset_loader
self.loss_func = self.model.loss
self.optimizer = optimizer
self.callbacks = callbacks
# (1) Wrap the model with DDP() along with GPU ID that model lives on.
# (3) Wrap the model with DDP() along with GPU ID that model lives on.
self.model = DDP(model, device_ids=[self.local_rank])
self.num_epochs = num_epochs
print_peak_memory("Max memory allocated after creating DDP, local_rank:", self.local_rank)
print(f'Global Rank {self.global_rank}\t Local Rank:{self.local_rank}')
print(self.model)
print(self.optimizer)
print(
f'NumOfDataPoints:{len(self.train_dataset_loader.dataset)} | NumOfEpochs:{self.num_epochs} | LearningRate:{self.model.module.learning_rate} | BatchSize:{self.train_dataset_loader.batch_size} | EpochBatchsize:{len(self.train_dataset_loader)}')
f'Global:{self.global_rank} | Local:{self.local_rank} | NumOfDataPoints:{len(self.train_dataset_loader.dataset)} | NumOfEpochs:{self.num_epochs} | LearningRate:{self.model.module.learning_rate} | BatchSize:{self.train_dataset_loader.batch_size} | EpochBatchsize:{len(self.train_dataset_loader)}')

self.loss_history = []

@@ -162,16 +164,18 @@ def _run_epoch(self, epoch):
f"Global:{self.global_rank} | Local:{self.local_rank} | Epoch:{epoch + 1} | Batch:{i + 1} | Loss:{batch_loss} |ForwardBackwardUpdate:{(time.time() - start_time):.2f}sec | BatchConst.:{construct_mini_batch_time:.2f}sec")
else:
print(
f"Epoch:{epoch + 1} | Batch:{i + 1} | Loss:{batch_loss} |ForwardBackwardUpdate:{(time.time() - start_time):.2f}secs")
f"Global:{self.global_rank} | Local:{self.local_rank} | Epoch:{epoch + 1} | Batch:{i + 1} | Loss:{batch_loss} |ForwardBackwardUpdate:{(time.time() - start_time):.2f}secs")
construct_mini_batch_time = time.time()
return epoch_loss / (i + 1)

def train(self):
for epoch in range(self.num_epochs):
start_time = time.time()
epoch_loss = self._run_epoch(epoch)
if self.local_rank == self.global_rank == 0:
print(f"Epoch:{epoch + 1} | Loss:{epoch_loss:.8f} | Runtime:{(time.time() - start_time) / 60:.3f}mins")

print(f"Epoch:{epoch + 1} | Loss:{epoch_loss:.8f} | Runtime:{(time.time() - start_time) / 60:.3f}mins")
if True:#self.local_rank == self.global_rank == 0:
#print(f"Epoch:{epoch + 1} | Loss:{epoch_loss:.8f} | Runtime:{(time.time() - start_time) / 60:.3f}mins")
self.model.module.loss_history.append(epoch_loss)
for c in self.callbacks:
c.on_train_epoch_end(None, self.model.module)
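For orientation, the change above amounts to the standard torchrun + DistributedDataParallel training pattern: read LOCAL_RANK and RANK from the environment, move the model to its local GPU, wrap it with DDP, and let rank 0 do the epoch-level logging. The sketch below illustrates that pattern only; it assumes a generic (inputs, targets) data loader and a model exposing a loss callable, and is not the dicee NodeTrainer itself.

```python
# Minimal sketch of the pattern NodeTrainer follows (illustrative, not dicee code).
import os
from torch.nn.parallel import DistributedDataParallel as DDP

def train_ddp(model, train_loader, optimizer, num_epochs):
    # torchrun exports these variables for every spawned worker process.
    local_rank = int(os.environ["LOCAL_RANK"])
    global_rank = int(os.environ["RANK"])
    loss_fn = model.loss  # assumed: the model exposes its loss callable
    # Move the model to its GPU, then wrap it; DDP synchronises gradients across ranks.
    model = DDP(model.to(local_rank), device_ids=[local_rank])
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for x, y in train_loader:
            x, y = x.to(local_rank), y.to(local_rank)
            optimizer.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()  # gradients are all-reduced across processes here
            optimizer.step()
            epoch_loss += loss.item()
        if global_rank == 0:
            print(f"Epoch:{epoch + 1} | Loss:{epoch_loss / len(train_loader):.8f}")
    # The caller initialises the process group before and destroys it afterwards,
    # as fit() above does with torch.distributed.destroy_process_group().
```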
2 changes: 1 addition & 1 deletion examples/Train_and_Eval_KGE.ipynb
@@ -1405,4 +1405,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
32 changes: 24 additions & 8 deletions examples/Trainers.ipynb
@@ -10,7 +10,8 @@
},
"source": [
"# Training a KGE model with DDP on KINSHIP with torchDDP with 128 CPUs and NVIDIA GeForce RTX 3090\n",
"torchrun --standalone --nproc_per_node=gpu main.py --model 'ComplEx' --embedding_dim 32 --num_epochs 500 --path_dataset_folder 'KGs/KINSHIP' --trainer torchDDP --eval_mode 'test'\n",
"```torchrun --standalone --nproc_per_node=gpu main.py --model 'ComplEx' --embedding_dim 32 --num_epochs 500 --path_dataset_folder 'KGs/KINSHIP' --trainer torchDDP --eval_mode 'test'```\n",
"\n",
"### Here are the last few lines of the log file:\n",
"Global1 | Local1 | Epoch:500 | Batch:2 | Loss:0.036598145961761475 |ForwardBackwardUpdate:0.00sec | BatchConst.:0.01sec\n",
"Global0 | Local0 | Epoch:500 | Batch:2 | Loss:0.03864430636167526 |ForwardBackwardUpdate:0.00sec | BatchConst.:0.01sec\n",
@@ -31,28 +32,
"### We see two eval result as we have to GPUs.\n",
"\n",
"### Training a KGE model with CPU\n",
"torchrun --standalone --nproc_per_node=gpu main.py --model 'ComplEx' --embedding_dim 32 --num_epochs 500 --path_dataset_folder 'KGs/KINSHIP' --trainer torchCPU --eval_mode 'test'\n",
"```torchrun --standalone --nproc_per_node=gpu main.py --model 'ComplEx' --embedding_dim 32 --num_epochs 500 --path_dataset_folder 'KGs/KINSHIP' --trainer torchCPU --eval_mode 'test'```\n",
"\n",
"Epoch:500 | Batch:1 | Loss:0.05302952229976654 |ForwardBackwardUpdate:0.00secs | Mem. Usage 497.07MB\n",
"Epoch:500 | Batch:2 | Loss:0.0575931221 |ForwardBackwardUpdate:0.00sec | BatchConst.:0.01sec | Mem. Usage 497.07MB avail. 1.5 %\n",
"Epoch:500 | Batch:3 | Loss:0.0585726425 |ForwardBackwardUpdate:0.00sec | BatchConst.:0.01sec | Mem. Usage 497.07MB avail. 1.5 %\n",
"Epoch:500 | Batch:4 | Loss:0.0551908500 |ForwardBackwardUpdate:0.00sec | BatchConst.:0.00sec | Mem. Usage 497.07MB avail. 1.5 %\n",
"Done ! It took 23.230 seconds.\n",
"\n",
"*** Save Trained Model ***\n",
"Took 0.0006 seconds | Current Memory Usage 497.25 in MB\n",
"Total computation time: 23.308 seconds\n",
"Evaluate ComplEx on Test set: Evaluate ComplEx on Test set\n",
"{'H@1': 0.6014897579143389, 'H@3': 0.8212290502793296, 'H@10': 0.9599627560521415, 'MRR': 0.7271467645821516}\n",
"\n",
"\n",
"# Multi-node GPU training\n",
"Execute the following command on the node 1\n",
"torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 0 --rdzv_id 456 --rdzv_backend c10d --rdzv_endpoint=nebula main.py --model 'ComplEx' --embedding_dim 32 --num_epochs 10 --path_dataset_folder 'KGs/WN18RR' --trainer torchDDP --eval_mode 'test'\n",
"\n",
"```torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 0 --rdzv_id 456 --rdzv_backend c10d --rdzv_endpoint=nebula main.py --model 'ComplEx' --embedding_dim 32 --num_epochs 100 --path_dataset_folder 'KGs/WN18RR' --trainer torchDDP```\n",
"Execute the following command on the node 2\n",
"torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 1 --rdzv_id 456 --rdzv_backend c10d --rdzv_endpoint=nebula main.py --model 'ComplEx' --embedding_dim 32 --num_epochs 10 --path_dataset_folder 'KGs/WN18RR' --trainer torchDDP --eval_mode 'test'\n",
"\n",
"```torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 1 --rdzv_id 456 --rdzv_backend c10d --rdzv_endpoint=nebula main.py --model 'ComplEx' --embedding_dim 32 --num_epochs 100 --path_dataset_folder 'KGs/WN18RR' --trainer torchDDP```\n",
"\n",
"### Node 1\n",
"Global:3 | Local:1 | Epoch:100 | Loss:0.00011253 | Runtime:0.042mins\n",
"Global:2 | Local:0 | Epoch:100 | Batch:26 | Loss:0.00011469019227661192 |ForwardBackwardUpdate:0.01sec | BatchConst.:0.09sec\n",
"Global:2 | Local:0 | Epoch:100 | Loss:0.00011178 | Runtime:0.042mins\n",
"Done ! It took 4.440 minutes.\n",
"Done ! It took 4.441 minutes.\n",
"\n",
"### Node 2\n",
"```\n",
"Global:1 | Local:1 | Epoch:100 | Batch:25 | Loss:0.00011904298298759386 |ForwardBackwardUpdate:0.01sec | BatchConst.:0.09sec\n",
"Global:0 | Local:0 | Epoch:100 | Batch:25 | Loss:0.00011089341569459066 |ForwardBackwardUpdate:0.01sec | BatchConst.:0.09sec\n",
"Global:1 | Local:1 | Epoch:100 | Batch:26 | Loss:0.00011964481382165104 |ForwardBackwardUpdate:0.01sec | BatchConst.:0.08sec\n",
"Epoch:100 | Loss:0.00011271 | Runtime:0.042mins\n",
"Global:0 | Local:0 | Epoch:100 | Batch:26 | Loss:9.990083344746381e-05 |ForwardBackwardUpdate:0.01sec | BatchConst.:0.05sec\n",
"Epoch:100 | Loss:0.00010982 | Runtime:0.042mins\n",
"Done ! It took 4.421 minutes.\n",
"Done ! It took 4.419 minutes.\n",
"```\n",
"\n",
"# TODO:Pytorch-Lightning Trainer\n",
"\n",
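Before committing to a long multi-node run with the commands above, the rendezvous can be checked with a tiny standalone script launched with the same torchrun flags on both nodes. This is a sketch, not part of dicee; the gloo backend is an assumption, use nccl when every process owns a GPU.

```python
# rendezvous_check.py -- hypothetical helper, not part of this repository.
# Launch on both nodes with the same torchrun flags as above; every process
# should report the expected global rank, local rank and world size.
import os
import torch.distributed as dist

dist.init_process_group(backend="gloo")  # assumption; "nccl" for GPU-only setups
print(f"Global:{dist.get_rank()} | Local:{os.environ['LOCAL_RANK']} | "
      f"WorldSize:{dist.get_world_size()}")
dist.destroy_process_group()
```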
16 changes: 8 additions & 8 deletions main.py
@@ -13,21 +13,15 @@ def get_default_arguments(description=None):
parser.add_argument("--storage_path", type=str, default='Experiments',
help="Embeddings, model, and any other related data will be stored therein.")
parser.add_argument("--model", type=str,
default="Keci",
default="AConEx",
help="Available models: CMult, ConEx, ConvQ, ConvO, DistMult, QMult, OMult, "
"Shallom, AConEx, ConEx, ComplEx, DistMult, TransE, Keci")
parser.add_argument('--p', type=int, default=0,
help='P for Clifford Algebra')
parser.add_argument('--q', type=int, default=1,
help='Q for Clifford Algebra')
parser.add_argument('--optim', type=str, default='Adam',
help='[Adam, SGD]')
parser.add_argument('--embedding_dim', type=int, default=32,
help='Number of dimensions for an embedding vector. ')
parser.add_argument("--num_epochs", type=int, default=100, help='Number of epochs for training. ')
parser.add_argument('--batch_size', type=int, default=1024, help='Mini batch size')
parser.add_argument('--auto_batch_finder', type=bool, default=False,
help='Find a batch size w.r.t. computational budgets')
parser.add_argument("--lr", type=float, default=0.1)
parser.add_argument('--callbacks', '--list', nargs='+', default=[],
help='List of tuples representing a callback and values, e.g. [FPPE or PPE or PPE10 ,PPE20 or PPE, FPPE]')
@@ -56,7 +50,7 @@ def get_default_arguments(description=None):
help='At every X number of epochs model will be saved. If None, we save 4 times.')
parser.add_argument("--label_smoothing_rate", type=float, default=0.0, help='None for not using it.')
parser.add_argument("--kernel_size", type=int, default=3, help="Square kernel size for ConEx")
parser.add_argument("--num_of_output_channels", type=int, default=32,
parser.add_argument("--num_of_output_channels", type=int, default=2,
help="# of output channels in convolution")
parser.add_argument("--num_core", type=int, default=0,
help='Number of cores to be used. 0 implies using single CPU')
@@ -65,6 +59,12 @@
parser.add_argument("--sample_triples_ratio", type=float, default=None, help='Sample input data.')
parser.add_argument("--read_only_few", type=int, default=None,
help='READ only first N triples. If 0, read all.')
parser.add_argument('--p', type=int, default=0,
help='P for Clifford Algebra')
parser.add_argument('--q', type=int, default=0,
help='Q for Clifford Algebra')
parser.add_argument('--auto_batch_finder', type=bool, default=False,
help='Find a batch size w.r.t. computational budgets')
if description is None:
return parser.parse_args()
return parser.parse_args(description)
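Since get_default_arguments forwards a list of strings to parser.parse_args, the parser above can also be driven programmatically, for example from a notebook. A minimal sketch follows; the import path and the chosen values are assumptions for illustration.

```python
# Sketch: build an argument namespace without touching the command line.
# The list plays the role of sys.argv[1:]; the values are illustrative only.
from main import get_default_arguments  # assumed import path

args = get_default_arguments([
    "--model", "AConEx",
    "--embedding_dim", "32",
    "--num_epochs", "100",
    "--p", "0", "--q", "0",
])
print(args.model, args.embedding_dim, args.p, args.q)
```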
