From 132bbb423a3fe13d71a9081abcc722ab484fae5a Mon Sep 17 00:00:00 2001 From: Ng Boon Cheong Raymond Date: Thu, 9 Dec 2021 15:48:28 +0800 Subject: [PATCH 001/201] [#41] initial commit for asgcn --- sgnlp/models/asgcn/__init__.py | 0 sgnlp/models/asgcn/modules/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 sgnlp/models/asgcn/__init__.py create mode 100644 sgnlp/models/asgcn/modules/__init__.py diff --git a/sgnlp/models/asgcn/__init__.py b/sgnlp/models/asgcn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sgnlp/models/asgcn/modules/__init__.py b/sgnlp/models/asgcn/modules/__init__.py new file mode 100644 index 0000000..e69de29 From ae77dc76bc13c57562883a600ca2e2c27e625d99 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 9 Dec 2021 16:49:09 +0800 Subject: [PATCH 002/201] [#41] add dynamic_rnn module --- sgnlp/models/asgcn/modules/dynamic_rnn.py | 98 +++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 sgnlp/models/asgcn/modules/dynamic_rnn.py diff --git a/sgnlp/models/asgcn/modules/dynamic_rnn.py b/sgnlp/models/asgcn/modules/dynamic_rnn.py new file mode 100644 index 0000000..237875e --- /dev/null +++ b/sgnlp/models/asgcn/modules/dynamic_rnn.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn + + +class DynamicLSTM(nn.Module): + """ + A dynamic LSTM class which can hold variable length sequence + """ + def __init__( + self, + input_size, + hidden_size, + num_layers=1, + bias=True, + batch_first=True, + dropout=0, + bidirectional=False, + only_use_last_hidden_state=False, + rnn_type='LSTM') -> None: + super(DynamicLSTM, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = dropout + self.bidirectional = bidirectional + self.only_use_last_hidden_state = only_use_last_hidden_state + self.rnn_type = rnn_type + self.__init_rnn() + + def __init_rnn(self) -> None: + if self.rnn_type == 'LSTM': + self.rnn = nn.LSTM( + input_size=self.input_size, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + bias=self.bias, + batch_first=self.batch_first, + dropout=self.dropout, + bidirectional=self.bidirectional + ) + elif self.rnn_type == 'GRU': + self.rnn = nn.GRU( + input_size=self.input_size, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + bias=self.bias, + batch_first=self.batch_first, + dropout=self.dropout, + bidirectional=self.bidirectional + ) + elif self.rnn_type == 'RNN': + self.rnn = nn.RNN( + input_size=self.input_size, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + bias=self.bias, + batch_first=self.batch_first, + dropout=self.dropout, + bidirectional=self.bidirectional + ) + + def forward(self, x, x_len, h0=None): + # Sort + x_sort_idx = torch.argsort(-x_len) + x_unsort_idx = torch.argsort(x_sort_idx).long() + x_len = x_len[x_sort_idx] + x = x[x_sort_idx.long()] + + # Pack + x_emb_p = torch.nn.utils.rnn.pack_padded_sequence(x, x_len, batch_first=self.batch_first) + + if self.rnn_type == "LSTM": + out_pack, (ht, ct) = self.rnn(x_emb_p, None) if h0 is None else self.rnn(x_emb_p, (h0, h0)) + else: + out_pack, ht = self.rnn(x_emb_p, None) if h0 is None else self.rnn(x_emb_p, h0) + ct = None + + # Unsort + # (num_layers * num_directions, batch, hidden_size) -> (batch, ...) 
+ ht = torch.transpose(ht, 0, 1)[x_unsort_idx] + ht = torch.transpose(ht, 0, 1) + + if self.only_use_last_hidden_state: + return ht + else: + # Unpack: out + out = torch.nn.utils.rnn.pad_packed_sequence(out_pack, batch_first=self.batch_first) # (sequence, lengths) + out = out[0] + out = out[x_unsort_idx] + + # Unsort: out c + if self.rnn_type == 'LSTM': + # (num_layers * num_directions, batch, hidden_size) -> (batch, ...) + ct = torch.transpose(ct, 0, 1)[x_unsort_idx] + ct = torch.transpose(ct, 0, 1) + return out, (ht, ct) From d19add592fe0c2f5af8c70bbd00f2d41e31f3968 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 9 Dec 2021 16:59:18 +0800 Subject: [PATCH 003/201] [#41] slight refactor for code reuse --- sgnlp/models/asgcn/modules/dynamic_rnn.py | 42 ++++++++--------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/sgnlp/models/asgcn/modules/dynamic_rnn.py b/sgnlp/models/asgcn/modules/dynamic_rnn.py index 237875e..fa96702 100644 --- a/sgnlp/models/asgcn/modules/dynamic_rnn.py +++ b/sgnlp/models/asgcn/modules/dynamic_rnn.py @@ -30,36 +30,24 @@ def __init__( self.__init_rnn() def __init_rnn(self) -> None: + """ + Helper method to initalized RNN type + """ + input_args = { + "input_size": self.input_size, + "hidden_size": self.hidden_size, + "num_layers": self.num_layers, + "bias": self.bias, + "batch_first": self.batch_first, + "dropout": self.dropout, + "bidirectional": self.bidirectional + } if self.rnn_type == 'LSTM': - self.rnn = nn.LSTM( - input_size=self.input_size, - hidden_size=self.hidden_size, - num_layers=self.num_layers, - bias=self.bias, - batch_first=self.batch_first, - dropout=self.dropout, - bidirectional=self.bidirectional - ) + self.rnn = nn.LSTM(**input_args) elif self.rnn_type == 'GRU': - self.rnn = nn.GRU( - input_size=self.input_size, - hidden_size=self.hidden_size, - num_layers=self.num_layers, - bias=self.bias, - batch_first=self.batch_first, - dropout=self.dropout, - bidirectional=self.bidirectional - ) + self.rnn = nn.GRU(**input_args) elif self.rnn_type == 'RNN': - self.rnn = nn.RNN( - input_size=self.input_size, - hidden_size=self.hidden_size, - num_layers=self.num_layers, - bias=self.bias, - batch_first=self.batch_first, - dropout=self.dropout, - bidirectional=self.bidirectional - ) + self.rnn = nn.RNN(**input_args) def forward(self, x, x_len, h0=None): # Sort From 5292df778f9a67ad2e860ca37b5f957e629e4013 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 9 Dec 2021 17:17:17 +0800 Subject: [PATCH 004/201] [#41] add gcn class --- sgnlp/models/asgcn/modules/gcn.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 sgnlp/models/asgcn/modules/gcn.py diff --git a/sgnlp/models/asgcn/modules/gcn.py b/sgnlp/models/asgcn/modules/gcn.py new file mode 100644 index 0000000..58edec4 --- /dev/null +++ b/sgnlp/models/asgcn/modules/gcn.py @@ -0,0 +1,22 @@ +import torch +import torch.nn as nn + +class GraphConvolution(nn.Module): + """ + Simple GCN Layer, similar to https://arxiv.org/abs/1609.02907 + """ + def __init__(self, in_features, out_features, bias=True) -> None: + super(GraphConvolution, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features)) + if bias: + self.bias = nn.Parameter(torch.FloatTensor(out_features)) + else: + self.register_parameter('bias', None) + + def forward(self, text, adj): + hidden = torch.matmul(text, self.weight) + denom = torch.sum(adj, dim=2, keepdim=True) + 1 + output = torch.matmul(adj, 
hidden) / denom + return output + self.bias if self.bias is not None else output From 16007548a685cc9c02f932a730551f848c04ea4c Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 10 Dec 2021 09:34:30 +0800 Subject: [PATCH 005/201] [#41] add initial data_class --- sgnlp/models/asgcn/data_class.py | 53 ++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 sgnlp/models/asgcn/data_class.py diff --git a/sgnlp/models/asgcn/data_class.py b/sgnlp/models/asgcn/data_class.py new file mode 100644 index 0000000..d0cee64 --- /dev/null +++ b/sgnlp/models/asgcn/data_class.py @@ -0,0 +1,53 @@ +from dataclasses import dataclass, field + + +@dataclass +class SenticGCNTrainArgs: + initializer: str = field( + default="xavier_uniform", metadata={"help": "Type of initalizer to use."} + ) + optimizer: str = field( + default="adam", metadata={"help": "Type of optimizer to use."} + ) + learning_rate: float = field( + default=0.001, metadata={"help": "Default learning rate for training."} + ) + l2reg: float = field(default=0.00001, metadata={"help": "Default l2reg value."}) + epochs: int = field(default=100, metadata={"help": "Number of epochs to train."}) + batch_size: int = field(default=32, metadata={"help": "Training batch size."}) + log_step: int = field(default=5, metadata={"help": "Default log step."}) + embed_dim: int = field( + default=300, metadata={"help": "Number of neurons for embed layer."} + ) + hidden_dim: int = field( + default=300, metadata={"help": "Number of neurons for hidden layer."} + ) + polarities_dim: int = field( + default=3, metadata={"help": "Default dimension for polarities."} + ) + save: bool = field( + default=False, metadata={"help": "Flag to indicate if results should be saved."} + ) + seed: int = field( + default=776, metadata={"help": "Default random seed for training."} + ) + device: str = field( + default="cuda", metadata={"help": "Type of compute device to use for training."} + ) + + def __post_init__(self): + assert self.initializer in [ + "xavier_uniform", + "xavier_uniform", + "orthogonal", + ], "Invalid initializer type!" + assert self.optimizer in [ + "adadelta", + "adagrad", + "adam", + "adamax", + "asgd", + "rmsprop", + "sgd", + ], "Invalid optimizer" + assert self.device in ["cuda", "cpu"], "Invalid device type." 
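Taken together, patches 002-005 supply the two computational building blocks (DynamicLSTM, GraphConvolution) and the training-argument dataclass. A minimal smoke test of the two modules might look like the sketch below; it is an illustration rather than part of the series, it assumes only the files created so far (still under the pre-rename asgcn path) plus an installed PyTorch, and every shape is a toy value.

    import torch
    from sgnlp.models.asgcn.modules.dynamic_rnn import DynamicLSTM
    from sgnlp.models.asgcn.modules.gcn import GraphConvolution

    batch_size, seq_len, embed_dim, hidden_dim = 4, 10, 300, 300
    x = torch.randn(batch_size, seq_len, embed_dim)    # stand-in for embedded text
    x_len = torch.tensor([10, 8, 7, 5])                # true lengths; DynamicLSTM sorts and unsorts internally
    adj = torch.eye(seq_len).expand(batch_size, seq_len, seq_len)  # toy adjacency matrices

    lstm = DynamicLSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
    out, (ht, ct) = lstm(x, x_len)                     # out: (4, 10, 2 * hidden_dim)

    # GraphConvolution allocates its weight and bias uninitialized, so until a
    # training script initializes them only the output shape is meaningful here.
    gcn = GraphConvolution(2 * hidden_dim, 2 * hidden_dim)
    print(gcn(out, adj).shape)                         # torch.Size([4, 10, 600])

The next patch moves these files under sentic_asgcn, which is the package path assumed by the later sketches.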
From 9b8417b748cdae46d9e84ccfb7d3ba8f42d57b82 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 10 Dec 2021 09:56:20 +0800 Subject: [PATCH 006/201] [#41] rename asgcn to sentic_asgcn --- .../{asgcn => sentic_asgcn}/__init__.py | 0 .../{asgcn => sentic_asgcn}/data_class.py | 0 sgnlp/models/sentic_asgcn/modeling.py | 24 +++++++++++++++++++ .../modules/__init__.py | 0 .../modules/dynamic_rnn.py | 0 .../{asgcn => sentic_asgcn}/modules/gcn.py | 0 6 files changed, 24 insertions(+) rename sgnlp/models/{asgcn => sentic_asgcn}/__init__.py (100%) rename sgnlp/models/{asgcn => sentic_asgcn}/data_class.py (100%) create mode 100644 sgnlp/models/sentic_asgcn/modeling.py rename sgnlp/models/{asgcn => sentic_asgcn}/modules/__init__.py (100%) rename sgnlp/models/{asgcn => sentic_asgcn}/modules/dynamic_rnn.py (100%) rename sgnlp/models/{asgcn => sentic_asgcn}/modules/gcn.py (100%) diff --git a/sgnlp/models/asgcn/__init__.py b/sgnlp/models/sentic_asgcn/__init__.py similarity index 100% rename from sgnlp/models/asgcn/__init__.py rename to sgnlp/models/sentic_asgcn/__init__.py diff --git a/sgnlp/models/asgcn/data_class.py b/sgnlp/models/sentic_asgcn/data_class.py similarity index 100% rename from sgnlp/models/asgcn/data_class.py rename to sgnlp/models/sentic_asgcn/data_class.py diff --git a/sgnlp/models/sentic_asgcn/modeling.py b/sgnlp/models/sentic_asgcn/modeling.py new file mode 100644 index 0000000..e1206aa --- /dev/null +++ b/sgnlp/models/sentic_asgcn/modeling.py @@ -0,0 +1,24 @@ +from dataclasses import dataclass + +import torch +import torch.nn as nn +from transformers.file_utils import ModelOutput +from transformers.utils.dummy_pt_objects import PreTrainedModel + + +@dataclass +class SenticASGCNModelOutput(ModelOutput): + pass + + +class SenticASGCNPreTrainedModel(PreTrainedModel): + # config_class = + base_model_prefix = "sentic_asgcn" + + def _init_weights(self, module): + pass + + +class SenticASGCNModel(SenticASGCNPreTrainedModel): + def __init__(self, config): + pass diff --git a/sgnlp/models/asgcn/modules/__init__.py b/sgnlp/models/sentic_asgcn/modules/__init__.py similarity index 100% rename from sgnlp/models/asgcn/modules/__init__.py rename to sgnlp/models/sentic_asgcn/modules/__init__.py diff --git a/sgnlp/models/asgcn/modules/dynamic_rnn.py b/sgnlp/models/sentic_asgcn/modules/dynamic_rnn.py similarity index 100% rename from sgnlp/models/asgcn/modules/dynamic_rnn.py rename to sgnlp/models/sentic_asgcn/modules/dynamic_rnn.py diff --git a/sgnlp/models/asgcn/modules/gcn.py b/sgnlp/models/sentic_asgcn/modules/gcn.py similarity index 100% rename from sgnlp/models/asgcn/modules/gcn.py rename to sgnlp/models/sentic_asgcn/modules/gcn.py From e290ba4f064699385fe241177707d036217008b5 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 10 Dec 2021 10:03:56 +0800 Subject: [PATCH 007/201] [#41] fix wrong import --- sgnlp/models/sentic_asgcn/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_asgcn/modeling.py b/sgnlp/models/sentic_asgcn/modeling.py index e1206aa..7633b5b 100644 --- a/sgnlp/models/sentic_asgcn/modeling.py +++ b/sgnlp/models/sentic_asgcn/modeling.py @@ -2,8 +2,8 @@ import torch import torch.nn as nn +from transformers import PreTrainedModel from transformers.file_utils import ModelOutput -from transformers.utils.dummy_pt_objects import PreTrainedModel @dataclass From aeac3bdc51c0512c1d06c8c6544cc72c7f7f4945 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 10 Dec 2021 10:36:42 +0800 Subject: [PATCH 008/201] [#41] rename SenticGCN to 
SenticASGCN, add basic config --- .../config/sentic_asgcn_config.json | 15 +++++++++++ sgnlp/models/sentic_asgcn/data_class.py | 4 +-- sgnlp/models/sentic_asgcn/train.py | 11 ++++++++ sgnlp/models/sentic_asgcn/utils.py | 25 +++++++++++++++++++ 4 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json create mode 100644 sgnlp/models/sentic_asgcn/train.py create mode 100644 sgnlp/models/sentic_asgcn/utils.py diff --git a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json new file mode 100644 index 0000000..0f4e1ce --- /dev/null +++ b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json @@ -0,0 +1,15 @@ +{ + "initializer": "xavier_uniform", + "optimizer": "adam", + "learning_rate": 0.001, + "l2reg": 0.00001, + "epochs": 100, + "batch_size": 32, + "log_step": 5, + "embed_dim": 300, + "hidden_dim": 300, + "polarities_dim": 3, + "save": true, + "seed": 776, + "device": "cuda" +} \ No newline at end of file diff --git a/sgnlp/models/sentic_asgcn/data_class.py b/sgnlp/models/sentic_asgcn/data_class.py index d0cee64..a400ab9 100644 --- a/sgnlp/models/sentic_asgcn/data_class.py +++ b/sgnlp/models/sentic_asgcn/data_class.py @@ -2,7 +2,7 @@ @dataclass -class SenticGCNTrainArgs: +class SenticASGCNTrainArgs: initializer: str = field( default="xavier_uniform", metadata={"help": "Type of initalizer to use."} ) @@ -26,7 +26,7 @@ class SenticGCNTrainArgs: default=3, metadata={"help": "Default dimension for polarities."} ) save: bool = field( - default=False, metadata={"help": "Flag to indicate if results should be saved."} + default=True, metadata={"help": "Flag to indicate if results should be saved."} ) seed: int = field( default=776, metadata={"help": "Default random seed for training."} diff --git a/sgnlp/models/sentic_asgcn/train.py b/sgnlp/models/sentic_asgcn/train.py new file mode 100644 index 0000000..c77b13d --- /dev/null +++ b/sgnlp/models/sentic_asgcn/train.py @@ -0,0 +1,11 @@ +from .data_class import SenticASGCNTrainArgs +from .utils import parse_args_and_load_config + + +def train_model(cfg: SenticASGCNTrainArgs): + pass + + +if __name__ == "__main__": + cfg = parse_args_and_load_config() + train_model(cfg) diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py new file mode 100644 index 0000000..b8ebdaa --- /dev/null +++ b/sgnlp/models/sentic_asgcn/utils.py @@ -0,0 +1,25 @@ +import argparse +import json +import pathlib + +from .data_class import SenticASGCNTrainArgs + + +def parse_args_and_load_config( + config_path: str = "config/sentic_asgcn_config.json", +) -> SenticASGCNTrainArgs: + """Get config from config file using argparser + + Returns: + SenticASGCNTrainArgs: SenticASGCNTrainArgs instance populated from config + """ + parser = argparse.ArgumentParser(description="SenticASGCN Training") + parser.add_argument("--config", type=str, default=config_path) + args = parser.parse_args() + + cfg_path = pathlib.Path(__file__).parent / args.config + with open(cfg_path, "r") as cfg_file: + cfg = json.load(cfg_file) + + sentic_asgcn_args = SenticASGCNTrainArgs(**cfg) + return sentic_asgcn_args From a2b470104de12b79ff1ea378418cafd4493e8ba5 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 10 Dec 2021 11:04:30 +0800 Subject: [PATCH 009/201] [#41] add SenticASGCNConfig --- sgnlp/models/sentic_asgcn/config.py | 37 +++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 sgnlp/models/sentic_asgcn/config.py 
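Because the class in the diff below follows the standard Hugging Face configuration pattern, a short instantiation sketch may help; it is illustrative only, not part of the patch. Two caveats, both assumptions to verify against the pinned transformers version: the config base class has historically been exported as PretrainedConfig (so the PreTrainedConfig spelling used below may need adjusting), and the default device value is a torch.device object, which is not JSON-serializable, so round-tripping through save_pretrained() would need a plain-string device instead.

    import torch
    from sgnlp.models.sentic_asgcn.config import SenticASGCNConfig

    # Override the CUDA default so the sketch also runs on CPU-only machines.
    config = SenticASGCNConfig(device=torch.device("cpu"))
    assert config.embed_dim == 300 and config.hidden_dim == 300
    assert config.polarities_dim == 3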
diff --git a/sgnlp/models/sentic_asgcn/config.py b/sgnlp/models/sentic_asgcn/config.py new file mode 100644 index 0000000..fb063fc --- /dev/null +++ b/sgnlp/models/sentic_asgcn/config.py @@ -0,0 +1,37 @@ +import torch +from transformers import PreTrainedConfig + + +class SenticASGCNConfig(PreTrainedConfig): + """ + This is the configuration class to store the configuration of a + :class:`~sgnlp.models.sentic_asgcn.modeling.SenticASGCNModel`. + It is used to instantiate a SenticASGCN network according to the specific arguments, defining the mdoel architecture. + + Args: + embed_dim (:obj:`int`, defaults to 300): Embedding dimension size. + hidden_dim (:obj:`int`, defaults to 300): Size of hidden dimension. + polarities_dim (:obj:`int`, defaults to 3): Size of output dimension represeting available polarities (e.g. Positive, Negative, Neutral). + device (:obj:`torch.device`, defaults to torch.device('cuda`)): Type of torch device. + + Example: + + from sgnlp.models.sentic_asgcn import SenticASGCNConfig + + # Initialize with default values + config = SenticASGCNConfig() + """ + + def __init__( + self, + embed_dim=300, + hidden_dim=300, + polarities_dim=3, + device=torch.device("cuda"), + **kwargs + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.hidden_dim = hidden_dim + self.polarities_dim = polarities_dim + self.device = device From 0333babcbbcfdd406a83157e96bc1780ff48afc5 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 10 Dec 2021 13:49:04 +0800 Subject: [PATCH 010/201] [#41] add set random seed helper --- sgnlp/models/sentic_asgcn/utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py index b8ebdaa..784e83f 100644 --- a/sgnlp/models/sentic_asgcn/utils.py +++ b/sgnlp/models/sentic_asgcn/utils.py @@ -1,7 +1,11 @@ import argparse import json +import random import pathlib +import numpy as np +import torch + from .data_class import SenticASGCNTrainArgs @@ -23,3 +27,17 @@ def parse_args_and_load_config( sentic_asgcn_args = SenticASGCNTrainArgs(**cfg) return sentic_asgcn_args + + +def set_random_seed(seed: int = 776) -> None: + """Helper method to set random seeds for python, numpy and torch + + Args: + seed (int, optional): seed value to set. Defaults to 776. + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False From 436240ea0cdcb60fe5d843171c166b5993d6fd1a Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 10 Dec 2021 15:09:49 +0800 Subject: [PATCH 011/201] [#41] add model init --- sgnlp/models/sentic_asgcn/config.py | 3 +++ .../config/sentic_asgcn_config.json | 1 + sgnlp/models/sentic_asgcn/data_class.py | 3 +++ sgnlp/models/sentic_asgcn/modeling.py | 25 ++++++++++++++++--- sgnlp/models/sentic_asgcn/train.py | 4 ++- 5 files changed, 32 insertions(+), 4 deletions(-) diff --git a/sgnlp/models/sentic_asgcn/config.py b/sgnlp/models/sentic_asgcn/config.py index fb063fc..ff16a8b 100644 --- a/sgnlp/models/sentic_asgcn/config.py +++ b/sgnlp/models/sentic_asgcn/config.py @@ -11,6 +11,7 @@ class SenticASGCNConfig(PreTrainedConfig): Args: embed_dim (:obj:`int`, defaults to 300): Embedding dimension size. hidden_dim (:obj:`int`, defaults to 300): Size of hidden dimension. + dropout (:obj:`float`, defaults to 0.3): Droput percentage. polarities_dim (:obj:`int`, defaults to 3): Size of output dimension represeting available polarities (e.g. 
Positive, Negative, Neutral). device (:obj:`torch.device`, defaults to torch.device('cuda`)): Type of torch device. @@ -27,11 +28,13 @@ def __init__( embed_dim=300, hidden_dim=300, polarities_dim=3, + dropout=0.3, device=torch.device("cuda"), **kwargs ): super().__init__(**kwargs) self.embed_dim = embed_dim self.hidden_dim = hidden_dim + self.dropout = dropout self.polarities_dim = polarities_dim self.device = device diff --git a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json index 0f4e1ce..78f9294 100644 --- a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json +++ b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json @@ -9,6 +9,7 @@ "embed_dim": 300, "hidden_dim": 300, "polarities_dim": 3, + "dropout": 0.3, "save": true, "seed": 776, "device": "cuda" diff --git a/sgnlp/models/sentic_asgcn/data_class.py b/sgnlp/models/sentic_asgcn/data_class.py index a400ab9..d7e08c2 100644 --- a/sgnlp/models/sentic_asgcn/data_class.py +++ b/sgnlp/models/sentic_asgcn/data_class.py @@ -22,6 +22,9 @@ class SenticASGCNTrainArgs: hidden_dim: int = field( default=300, metadata={"help": "Number of neurons for hidden layer."} ) + dropout: float = field( + default=0.3, metadata={"help": "Default value for dropout percentages."} + ) polarities_dim: int = field( default=3, metadata={"help": "Default dimension for polarities."} ) diff --git a/sgnlp/models/sentic_asgcn/modeling.py b/sgnlp/models/sentic_asgcn/modeling.py index 7633b5b..f2c292e 100644 --- a/sgnlp/models/sentic_asgcn/modeling.py +++ b/sgnlp/models/sentic_asgcn/modeling.py @@ -5,6 +5,10 @@ from transformers import PreTrainedModel from transformers.file_utils import ModelOutput +from .modules.dynamic_rnn import DynamicLSTM +from .modules.gcn import GraphConvolution +from .config import SenticASGCNConfig + @dataclass class SenticASGCNModelOutput(ModelOutput): @@ -12,7 +16,11 @@ class SenticASGCNModelOutput(ModelOutput): class SenticASGCNPreTrainedModel(PreTrainedModel): - # config_class = + """ + An abstract class to handle weights initialization and a simple interface for download and loading pretrained models. 
+ """ + + config_class = SenticASGCNConfig base_model_prefix = "sentic_asgcn" def _init_weights(self, module): @@ -20,5 +28,16 @@ def _init_weights(self, module): class SenticASGCNModel(SenticASGCNPreTrainedModel): - def __init__(self, config): - pass + def __init__(self, config: SenticASGCNConfig): + super().__init__(config) + self.text_lstm = DynamicLSTM( + config.embed_dim, + config.hidden_dim, + num_layers=1, + batch_first=True, + bidirectional=True, + ) + self.gc1 = GraphConvolution(2 * config.hidden_dim, 2 * config.hidden_dim) + self.gc2 = GraphConvolution(2 * config.hidden_dim, 2 * config.hidden_dim) + self.fc = nn.Linear(2 * config.hidden_dim, config.polarities_dim) + self.text_embed_dropout = nn.Dropout(config.dropout) diff --git a/sgnlp/models/sentic_asgcn/train.py b/sgnlp/models/sentic_asgcn/train.py index c77b13d..6471800 100644 --- a/sgnlp/models/sentic_asgcn/train.py +++ b/sgnlp/models/sentic_asgcn/train.py @@ -1,5 +1,5 @@ from .data_class import SenticASGCNTrainArgs -from .utils import parse_args_and_load_config +from .utils import parse_args_and_load_config, set_random_seed def train_model(cfg: SenticASGCNTrainArgs): @@ -8,4 +8,6 @@ def train_model(cfg: SenticASGCNTrainArgs): if __name__ == "__main__": cfg = parse_args_and_load_config() + if cfg.seed is not None: + set_random_seed(cfg.seed) train_model(cfg) From 549e7fca311a062a055d9c8f447311b5a9d68ef4 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 13 Dec 2021 09:04:14 +0800 Subject: [PATCH 012/201] [#41] add draft asgcn model --- sgnlp/models/sentic_asgcn/modeling.py | 65 ++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_asgcn/modeling.py b/sgnlp/models/sentic_asgcn/modeling.py index f2c292e..a23b9e6 100644 --- a/sgnlp/models/sentic_asgcn/modeling.py +++ b/sgnlp/models/sentic_asgcn/modeling.py @@ -2,6 +2,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F from transformers import PreTrainedModel from transformers.file_utils import ModelOutput @@ -28,7 +29,7 @@ def _init_weights(self, module): class SenticASGCNModel(SenticASGCNPreTrainedModel): - def __init__(self, config: SenticASGCNConfig): + def __init__(self, config: SenticASGCNConfig) -> None: super().__init__(config) self.text_lstm = DynamicLSTM( config.embed_dim, @@ -41,3 +42,65 @@ def __init__(self, config: SenticASGCNConfig): self.gc2 = GraphConvolution(2 * config.hidden_dim, 2 * config.hidden_dim) self.fc = nn.Linear(2 * config.hidden_dim, config.polarities_dim) self.text_embed_dropout = nn.Dropout(config.dropout) + self.device = config.device + + def position_weight(self, x, aspect_double_idx, text_len, aspect_len): + batch_size = x.shape[0] + seq_len = x.shape[1] + aspect_double_idx = aspect_double_idx.cpu().numpy() + text_len = text_len.cpu().numpy() + aspect_len = aspect_len.cpu().numpy() + weight = [[] for i in range(batch_size)] + for i in range(batch_size): + context_len = text_len[i] - aspect_len[i] + for j in range(aspect_double_idx[i, 0]): + weight[i].append(1 - (aspect_double_idx[i, 0] - j) / context_len) + for j in range(aspect_double_idx[i, 0], aspect_double_idx[i, 1] + 1): + weight[i].append(0) + for j in range(aspect_double_idx[i, 1] + 1, text_len[i]): + weight[i].append(1 - (j - aspect_double_idx[i, 1] / context_len)) + for j in range(text_len[i], seq_len): + weight[i].append(0) + weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2).to(self.device) + return weight * x + + def mask(self, x, aspect_double_idx): + batch_size, seq_len = x.shape[0], x.shape[1] 
+ aspect_double_idx = aspect_double_idx.cpu().numpy() + mask = [[] for i in range(batch_size)] + for i in range(batch_size): + for j in range(aspect_double_idx[i, 0]): + mask[i].append(0) + for j in range(aspect_double_idx[i, 0], aspect_double_idx[i, 1] + 1): + mask[i].append(1) + for j in range(aspect_double_idx[i, 1] + 1, seq_len): + mask[i].append(0) + mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(self.device) + return mask * x + + def forward(self, inputs): + text_indices, aspect_indices, left_indices, adj = inputs + text_len = torch.sum(text_indices != 0, dim=-1) + aspect_len = torch.sum(aspect_indices != 0, dim=-1) + left_len = torch.sum(left_indices != 0, dim=-1) + aspect_double_idx = torch.cat( + [left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1 + ) + text = self.text_embed_dropout(self.embed(text_indices)) + text_out, (_, _) = self.text_lstm(text, text_len) + x = F.relu( + self.gc1( + self.position_weight(text_out, aspect_double_idx, text_len, aspect_len), + adj, + ) + ) + x = F.relu( + self.gc2( + self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj + ) + ) + alpha_mat = torch.matmul(x, text_out.transpose(1, 2)) + alpha = F.softmax(alpha_mat.sum(1, keepdim=True), dim=2) + x = torch.matmul(alpha, text_out).squeeze(1) # batch_size x 2 * hidden_dim + output = self.fc(x) + return output From 7812f660da3e1af1d7e8f4d663dccba04d26eee2 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 13 Dec 2021 09:55:10 +0800 Subject: [PATCH 013/201] [#41] add bucketiterator class --- sgnlp/models/sentic_asgcn/utils.py | 88 ++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py index 784e83f..8b67d94 100644 --- a/sgnlp/models/sentic_asgcn/utils.py +++ b/sgnlp/models/sentic_asgcn/utils.py @@ -1,5 +1,6 @@ import argparse import json +import math import random import pathlib @@ -41,3 +42,90 @@ def set_random_seed(seed: int = 776) -> None: torch.cuda.manual_seed(seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False + + +class BucketIterator(object): + def __init__( + self, data, batch_size, sort_key="text_indices", shuffle=True, sort=True + ): + self.shuffle = shuffle + self.sort = sort + self.sort_key = sort_key + self.batches = self.sort_and_pad(data, batch_size) + self.batch_len = len(self.batches) + + def sort_and_pad(self, data, batch_size): + num_batch = int(math.ceil(len(data) / batch_size)) + sorted_data = ( + sorted(data, key=lambda x: len(x[self.sort_key])) if self.sort else data + ) + batches = [ + self.pad_data(sorted_data[i * batch_size : (i + 1) * batch_size]) + for i in range(num_batch) + ] + return batches + + def pad_data(self, batch_data): + batch_text_indices = [] + batch_context_indices = [] + batch_aspect_indices = [] + batch_left_indices = [] + batch_polarity = [] + batch_dependency_graph = [] + batch_dependency_tree = [] + max_len = max([len(t[self.sort_key]) for t in batch_data]) + # [text_indices, context_indices, aspect_indices, left_indices, polarity, dependency_graph, dependency_tree] + for item in batch_data: + text_indices = item["text_indices"] + context_indices = item["context_indices"] + aspect_indices = item["aspect_indices"] + left_indices = item["left_indices"] + polarity = item["polarity"] + dependency_graph = item["dependency_graph"] + dependency_tree = item["dependency_tree"] + + text_padding = [0] * (max_len - len(text_indices)) + context_padding = [0] * (max_len - 
len(context_indices)) + aspect_padding = [0] * (max_len - len(aspect_indices)) + left_padding = [0] * (max_len - len(left_indices)) + + batch_text_indices.append(text_indices + text_padding) + batch_context_indices.append(context_indices + context_padding) + batch_aspect_indices.append(aspect_indices + aspect_padding) + batch_left_indices.append(left_indices + left_padding) + batch_polarity.append(polarity) + batch_dependency_graph.append( + np.pad( + dependency_graph, + ( + (0, max_len - len(text_indices)), + (0, max_len - len(text_indices)), + ), + "constant", + ) + ) + batch_dependency_tree.append( + np.pad( + dependency_tree, + ( + (0, max_len - len(text_indices)), + (0, max_len - len(text_indices)), + ), + "constant", + ) + ) + return { + "text_indices": torch.tensor(batch_text_indices), + "context_indices": torch.tensor(batch_context_indices), + "aspect_indices": torch.tensor(batch_aspect_indices), + "left_indices": torch.tensor(batch_left_indices), + "polarity": torch.tensor(batch_polarity), + "dependency_graph": torch.tensor(batch_dependency_graph), + "dependency_tree": torch.tensor(batch_dependency_tree), + } + + def __iter__(self): + if self.shuffle: + random.shuffle(self.batches) + for idx in range(self.batch_len): + yield self.batches[idx] From 8487ab31efe5c6ace1cb9247101e040c55989f42 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 13 Dec 2021 10:58:23 +0800 Subject: [PATCH 014/201] [#41] add preprocess dependency graph class --- .../preprocess_dependency_graph.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py diff --git a/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py b/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py new file mode 100644 index 0000000..ff2eb15 --- /dev/null +++ b/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py @@ -0,0 +1,43 @@ +import numpy as np +import spacy +import pickle +from spacy.tokens import Doc + + +class WhiteSpaceTokenizer(object): + def __init__(self, vocab): + self.vocab = vocab + + def __call__(self, text: str): + words = text.split() + spaces = [True] * len(words) + return Doc(self.vocab, words=words, spaces=spaces) + + +class DependencyGraphPreprocessor(object): + def __init__(self): + self.nlp = spacy.load('en_core_web_sm') + self.nlp.tokenizer = WhiteSpaceTokenizer(self.nlp.vocab) + + def __dependency_adj_matrix(self, text: str) -> np.ndarray: + tokens = self.nlp(text) + words = text.split() + matrix = np.zeros((len(words), len(words))).astype('float32') + + for token in tokens: + matrix[token.i][token.i] = 1 + for child in token.children: + matrix[token.i][child.i] = 1 + matrix[child.i][token.i] = 1 + return matrix + + def process(self, filename: str): + with open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin: + lines = fin.readlines() + idx2graph = {} + with open(f'{filename}.graph', 'wb') as fout: + for i in range(0, len(lines), 3): + text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] + aspect = lines[i + 1].lower().strip() + idx2graph[i] = self.__dependency_adj_matrix(f'{text_left} {aspect} {text_right}'}) + pickle.dump(idx2graph, fout) From 25554f5e0f0bc64a1f7cab0fa3528f2a580d0b38 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 13 Dec 2021 11:03:11 +0800 Subject: [PATCH 015/201] [#41] fix typo --- .../preprocess_dependency_graph.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git 
a/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py b/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py index ff2eb15..a3066e7 100644 --- a/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py +++ b/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py @@ -16,13 +16,13 @@ def __call__(self, text: str): class DependencyGraphPreprocessor(object): def __init__(self): - self.nlp = spacy.load('en_core_web_sm') + self.nlp = spacy.load("en_core_web_sm") self.nlp.tokenizer = WhiteSpaceTokenizer(self.nlp.vocab) def __dependency_adj_matrix(self, text: str) -> np.ndarray: tokens = self.nlp(text) words = text.split() - matrix = np.zeros((len(words), len(words))).astype('float32') + matrix = np.zeros((len(words), len(words))).astype("float32") for token in tokens: matrix[token.i][token.i] = 1 @@ -32,12 +32,18 @@ def __dependency_adj_matrix(self, text: str) -> np.ndarray: return matrix def process(self, filename: str): - with open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin: + with open( + filename, "r", encoding="utf-8", newline="\n", errors="ignore" + ) as fin: lines = fin.readlines() idx2graph = {} - with open(f'{filename}.graph', 'wb') as fout: + with open(f"{filename}.graph", "wb") as fout: for i in range(0, len(lines), 3): - text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] + text_left, _, text_right = [ + s.lower().strip() for s in lines[i].partition("$T$") + ] aspect = lines[i + 1].lower().strip() - idx2graph[i] = self.__dependency_adj_matrix(f'{text_left} {aspect} {text_right}'}) + idx2graph[i] = self.__dependency_adj_matrix( + f"{text_left} {aspect} {text_right}" + ) pickle.dump(idx2graph, fout) From 84dc698a50dc8480fdde4ecacb16e8ced1df4ae1 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 13 Dec 2021 12:06:40 +0800 Subject: [PATCH 016/201] [#41] add config for preprocess dependency graph --- .../sentic_asgcn/config/sentic_asgcn_config.json | 12 ++++++++++++ sgnlp/models/sentic_asgcn/data_class.py | 4 ++++ .../sentic_asgcn/preprocess_dependency_graph.py | 9 +++++++++ sgnlp/models/sentic_asgcn/utils.py | 2 +- 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json index 78f9294..d08aa9d 100644 --- a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json +++ b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json @@ -1,4 +1,16 @@ { + "dependency_graph_preprocess": [ + "./datasets/acl-14-short-data/train.raw", + "./datasets/acl-14-short-data/test.raw", + "./datasets/semeval14/restaurant_train.raw", + "./datasets/semeval14/restaurant_test.raw", + "./datasets/semeval14/laptop_train.raw", + "./datasets/semeval14/laptop_test.raw", + "./datasets/semeval15/restaurant_train.raw", + "./datasets/semeval15/restaurant_test.raw", + "./datasets/semeval16/restaurant_train.raw", + "./datasets/semeval16/restaurant_test.raw" + ], "initializer": "xavier_uniform", "optimizer": "adam", "learning_rate": 0.001, diff --git a/sgnlp/models/sentic_asgcn/data_class.py b/sgnlp/models/sentic_asgcn/data_class.py index d7e08c2..cdf4bcf 100644 --- a/sgnlp/models/sentic_asgcn/data_class.py +++ b/sgnlp/models/sentic_asgcn/data_class.py @@ -1,8 +1,12 @@ from dataclasses import dataclass, field +from typing import List @dataclass class SenticASGCNTrainArgs: + dependency_graph_preprocess: List[str] = field( + default=list, metadata={"help": "List of raw dataset to process."} + ) initializer: str = field( 
default="xavier_uniform", metadata={"help": "Type of initalizer to use."} ) diff --git a/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py b/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py index a3066e7..d78dc9d 100644 --- a/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py +++ b/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py @@ -3,6 +3,8 @@ import pickle from spacy.tokens import Doc +from utils import parse_args_and_load_config + class WhiteSpaceTokenizer(object): def __init__(self, vocab): @@ -47,3 +49,10 @@ def process(self, filename: str): f"{text_left} {aspect} {text_right}" ) pickle.dump(idx2graph, fout) + + +if __name__ == "__main__": + dgp = DependencyGraphPreprocessor() + cfg = parse_args_and_load_config() + for data_path in cfg.dependency_graph_preprocess: + dgp.process(data_path) diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py index 8b67d94..a23ca8e 100644 --- a/sgnlp/models/sentic_asgcn/utils.py +++ b/sgnlp/models/sentic_asgcn/utils.py @@ -7,7 +7,7 @@ import numpy as np import torch -from .data_class import SenticASGCNTrainArgs +from data_class import SenticASGCNTrainArgs def parse_args_and_load_config( From aa3e3d58742a49210aef5741cfe3056cc82b6a4d Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 13 Dec 2021 15:47:02 +0800 Subject: [PATCH 017/201] [#41] add draft tokenizer, add docstring for preprocess dependency graphy class --- .../preprocess_dependency_graph.py | 14 +++ sgnlp/models/sentic_asgcn/tokenization.py | 97 +++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 sgnlp/models/sentic_asgcn/tokenization.py diff --git a/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py b/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py index d78dc9d..d0b5a44 100644 --- a/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py +++ b/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py @@ -7,6 +7,10 @@ class WhiteSpaceTokenizer(object): + """ + Simple white space tokenizer + """ + def __init__(self, vocab): self.vocab = vocab @@ -17,6 +21,10 @@ def __call__(self, text: str): class DependencyGraphPreprocessor(object): + """ + Preprocessor wrapper class for processing dependency graph. + """ + def __init__(self): self.nlp = spacy.load("en_core_web_sm") self.nlp.tokenizer = WhiteSpaceTokenizer(self.nlp.vocab) @@ -34,6 +42,12 @@ def __dependency_adj_matrix(self, text: str) -> np.ndarray: return matrix def process(self, filename: str): + """ + Main processing method, takes in raw data file and convert to adj matrix. 
+ + Args: + filename (str): filename of raw dataset to process + """ with open( filename, "r", encoding="utf-8", newline="\n", errors="ignore" ) as fin: diff --git a/sgnlp/models/sentic_asgcn/tokenization.py b/sgnlp/models/sentic_asgcn/tokenization.py new file mode 100644 index 0000000..5e7aa00 --- /dev/null +++ b/sgnlp/models/sentic_asgcn/tokenization.py @@ -0,0 +1,97 @@ +import pathlib +import pickle +from typing import List + +from transformers import PreTrainedTokenizer + + +class SenticASGCNTokenizer(PreTrainedTokenizer): + def __init__( + self, + vocab_file: str = None, + train_files: List[str] = None, + train_vocab: bool = False, + do_lower_case: bool = True, + unk_token: str = "", + pad_token: str = "", + **kwargs, + ): + super().__init__( + do_lower_case=do_lower_case, + unk_token=unk_token, + pad_token=pad_token, + **kwargs, + ) + self.do_lower_case = do_lower_case + if train_vocab: + self.vocab = self.create_vocab(train_files) + else: + with open(vocab_file, "rb") as fin: + self.vocab = pickle.load(fin) + self.ids_to_tokens = {v: k for k, v in self.vocab.items()} + + @property + def do_lower_case(self): + return self.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab) + + @staticmethod + def __read_text_file(file_names: List[str]) -> str: + """ + Helper method to read contents of a list of text files. + + Args: + file_names (List[str]): list of text files to read. + + Returns: + str: return a concatenated string of text files contents. + """ + text = "" + for fname in file_names: + with open( + fname, "r", encoding="utf-8", newline="\n", errors="ignore" + ) as fin: + lines = fin.readlines() + for i in range(0, len(lines), 3): + text_left, _, text_right = [ + s.lower().strip() for s in lines[i].partition("$T$") + ] + aspect = lines[i + 1].lower().strip() + text += f"{text_left} {aspect} {text_right} " # Left a space at the end + return text + + def create_vocab(self, save_directory: str): + text = self.__read_text_file() + if self.do_lower_case: + text = text.lower() + vocab = {} + vocab[self.pad_token] = 0 + vocab[self.unk_token] = 1 + offset = len(vocab.keys()) + + words = text.split() + for word in words: + if word not in vocab: + vocab[word] = offset + offset += 1 + return vocab + + def _tokenize(self, text, **kwargs): + text = text.lower() + words = text.split() + unknownidx = 1 + sequence = [ + self.word2idx[w] if w in self.word2idx else unknownidx for w in words + ] + if len(sequence) == 0: + sequence = [0] + return sequence + + def _convert_id_to_token(self, index: int) -> str: + return super()._convert_id_to_token(index) From a519024a2641fe3cfbe71da5ce77ed771a85fc3a Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 13 Dec 2021 15:52:21 +0800 Subject: [PATCH 018/201] [#41] fix tokenize method to call right attribute --- sgnlp/models/sentic_asgcn/tokenization.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sgnlp/models/sentic_asgcn/tokenization.py b/sgnlp/models/sentic_asgcn/tokenization.py index 5e7aa00..4075fd1 100644 --- a/sgnlp/models/sentic_asgcn/tokenization.py +++ b/sgnlp/models/sentic_asgcn/tokenization.py @@ -83,12 +83,11 @@ def create_vocab(self, save_directory: str): return vocab def _tokenize(self, text, **kwargs): - text = text.lower() + if self.do_lower_case: + text = text.lower() words = text.split() unknownidx = 1 - sequence = [ - self.word2idx[w] if w in self.word2idx else unknownidx for w in words - ] + sequence = [self.vocab[w] if w in self.vocab 
else unknownidx for w in words] if len(sequence) == 0: sequence = [0] return sequence From bcbb06db3bcfb25be9d913461e4666f0999381a7 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 13 Dec 2021 16:09:03 +0800 Subject: [PATCH 019/201] [#41] add converstion helper method to be compliant with huggingface api --- sgnlp/models/sentic_asgcn/tokenization.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sgnlp/models/sentic_asgcn/tokenization.py b/sgnlp/models/sentic_asgcn/tokenization.py index 4075fd1..a0005ae 100644 --- a/sgnlp/models/sentic_asgcn/tokenization.py +++ b/sgnlp/models/sentic_asgcn/tokenization.py @@ -41,6 +41,12 @@ def vocab_size(self): def get_vocab(self): return dict(self.vocab) + def _convert_token_to_id(self, token: str) -> int: + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index: int) -> str: + return self.ids_to_tokens(index, self.unk_token) + @staticmethod def __read_text_file(file_names: List[str]) -> str: """ @@ -91,6 +97,3 @@ def _tokenize(self, text, **kwargs): if len(sequence) == 0: sequence = [0] return sequence - - def _convert_id_to_token(self, index: int) -> str: - return super()._convert_id_to_token(index) From 30bfe8ee1f14377cc18632b2aee334032d19f05c Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 13 Dec 2021 17:22:16 +0800 Subject: [PATCH 020/201] [#41] fix tokenization attribute error, add save vocab method --- sgnlp/models/sentic_asgcn/tokenization.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/sgnlp/models/sentic_asgcn/tokenization.py b/sgnlp/models/sentic_asgcn/tokenization.py index a0005ae..0e5136b 100644 --- a/sgnlp/models/sentic_asgcn/tokenization.py +++ b/sgnlp/models/sentic_asgcn/tokenization.py @@ -1,6 +1,6 @@ import pathlib import pickle -from typing import List +from typing import Dict, List, Optional, Tuple from transformers import PreTrainedTokenizer @@ -30,10 +30,6 @@ def __init__( self.vocab = pickle.load(fin) self.ids_to_tokens = {v: k for k, v in self.vocab.items()} - @property - def do_lower_case(self): - return self.do_lower_case - @property def vocab_size(self): return len(self.vocab) @@ -72,8 +68,8 @@ def __read_text_file(file_names: List[str]) -> str: text += f"{text_left} {aspect} {text_right} " # Left a space at the end return text - def create_vocab(self, save_directory: str): - text = self.__read_text_file() + def create_vocab(self, train_files: List[str]) -> Dict[str, int]: + text = self.__read_text_file(train_files) if self.do_lower_case: text = text.lower() vocab = {} @@ -97,3 +93,13 @@ def _tokenize(self, text, **kwargs): if len(sequence) == 0: sequence = [0] return sequence + + def save_vocabulary( + self, save_directory: str, filename_prefix: Optional[str] = None + ) -> Tuple[str]: + save_dir = pathlib.Path(save_directory) + save_dir.mkdir(exist_ok=True) + vocab_file_path = save_dir.joinpath("vocab.pkl") + with open(vocab_file_path, "wb") as fout: + pickle.dump(self.vocab, fout) + return (str(vocab_file_path),) From 61c762a6018994f9c5b40d3a9885dd0d36b5460a Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 14 Dec 2021 09:07:39 +0800 Subject: [PATCH 021/201] [#41] fix tokenize method performing the tokenization twice from PreTrainedTokenizer --- sgnlp/models/sentic_asgcn/tokenization.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sgnlp/models/sentic_asgcn/tokenization.py b/sgnlp/models/sentic_asgcn/tokenization.py index 0e5136b..2a20df7 100644 --- 
a/sgnlp/models/sentic_asgcn/tokenization.py +++ b/sgnlp/models/sentic_asgcn/tokenization.py @@ -38,9 +38,11 @@ def get_vocab(self): return dict(self.vocab) def _convert_token_to_id(self, token: str) -> int: + print("_convert_token_to_id") return self.vocab.get(token, self.vocab.get(self.unk_token)) def _convert_id_to_token(self, index: int) -> str: + print("_convert_id_to_token") return self.ids_to_tokens(index, self.unk_token) @staticmethod @@ -88,11 +90,7 @@ def _tokenize(self, text, **kwargs): if self.do_lower_case: text = text.lower() words = text.split() - unknownidx = 1 - sequence = [self.vocab[w] if w in self.vocab else unknownidx for w in words] - if len(sequence) == 0: - sequence = [0] - return sequence + return words def save_vocabulary( self, save_directory: str, filename_prefix: Optional[str] = None From 421835e8965a029b5b298d972018b5e5a442d082 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 14 Dec 2021 09:46:31 +0800 Subject: [PATCH 022/201] [#41] specify vocab file in tokenizer class, add draft preprocessor --- sgnlp/models/sentic_asgcn/preprocess.py | 25 +++++++++++++++++++++++ sgnlp/models/sentic_asgcn/tokenization.py | 7 +++++-- 2 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 sgnlp/models/sentic_asgcn/preprocess.py diff --git a/sgnlp/models/sentic_asgcn/preprocess.py b/sgnlp/models/sentic_asgcn/preprocess.py new file mode 100644 index 0000000..d159d6b --- /dev/null +++ b/sgnlp/models/sentic_asgcn/preprocess.py @@ -0,0 +1,25 @@ +from typing import List + +import torch +from transformers import PreTrainedTokenizer +from transformers.tokenization_utils_base import BatchEncoding + +from tokenization import SenticASGCNTokenizer + + +class SenticASGCNPreprocessor: + def __init__( + self, + tokenizer: PreTrainedTokenizer = None, + tokenizer_name: str = None, + device: torch.device = torch.device("cpu"), + ): + self.device = device + if tokenizer is not None: + self.tokenizer = tokenizer + else: + self.tokenizer = SenticASGCNTokenizer.from_pretrained(tokenizer_name) + + def __call__(self, data_batch: List[str]) -> BatchEncoding: + tokens = self.tokenizer(data_batch, padding=True, return_tensors="pt") + return tokens diff --git a/sgnlp/models/sentic_asgcn/tokenization.py b/sgnlp/models/sentic_asgcn/tokenization.py index 2a20df7..992866f 100644 --- a/sgnlp/models/sentic_asgcn/tokenization.py +++ b/sgnlp/models/sentic_asgcn/tokenization.py @@ -5,7 +5,12 @@ from transformers import PreTrainedTokenizer +VOCAB_FILES_NAMES = {"vocab_file": "vocab.pkl"} + + class SenticASGCNTokenizer(PreTrainedTokenizer): + vocab_files_names = VOCAB_FILES_NAMES + def __init__( self, vocab_file: str = None, @@ -38,11 +43,9 @@ def get_vocab(self): return dict(self.vocab) def _convert_token_to_id(self, token: str) -> int: - print("_convert_token_to_id") return self.vocab.get(token, self.vocab.get(self.unk_token)) def _convert_id_to_token(self, index: int) -> str: - print("_convert_id_to_token") return self.ids_to_tokens(index, self.unk_token) @staticmethod From ffc7d4d4040c63cfef7488581e3820515f98deb5 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 14 Dec 2021 11:44:03 +0800 Subject: [PATCH 023/201] [#41] add helper methods to load word vector and generate embedding matrix --- sgnlp/models/sentic_asgcn/utils.py | 70 ++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py index a23ca8e..5235119 100644 --- a/sgnlp/models/sentic_asgcn/utils.py +++ b/sgnlp/models/sentic_asgcn/utils.py @@ 
-1,8 +1,12 @@ import argparse import json +from logging import error import math +from pickle import load +import pickle import random import pathlib +from typing import Dict import numpy as np import torch @@ -44,6 +48,72 @@ def set_random_seed(seed: int = 776) -> None: torch.backends.cudnn.benchmark = False +def load_word_vec( + word_vec_file_path: str, vocab: Dict[str, int], embed_dim: int = 300 +) -> Dict[str, np.asarray]: + """ + Helper method to load word vectors from file (e.g. GloVe) for each word in vocab. + + Args: + word_vec_file_path (str): full file path to word vectors. + vocab (Dict[str, int]): dictionary of vocab word as key and word index as values. + embed_dim (int, optional): embedding dimension. Defaults to 300. + + Returns: + Dict[str, np.asarray]: dictionary with words as key and word vectors as values. + """ + with open( + word_vec_file_path, "r", encoding="utf-8", newline="\n", errors="ignore" + ) as fin: + word_vec = {} + for line in fin: + tokens = line.rstrip().split() + word, vec = " ".join(tokens[:-embed_dim]), tokens[-embed_dim:] + if word in vocab.keys(): + word_vec[word] = np.asarray(vec, dtype="float32") + return word_vec + + +def build_embedding_matrix( + word_vec_file_path: str, + vocab: Dict[str, int], + embed_dim: int = 300, + save_embed_matrix: bool = False, + save_embed_directory: str = None, +) -> np.ndarray: + """ + Helper method to generate an embedding matrix. + + Args: + word_vec_file_path (str): full file path to word vectors. + vocab (Dict[str, int]): dictionary of vocab word as key and word index as values. + embed_dim (int, optional): embedding dimensiion. Defaults to 300. + save_embed_matrix (bool, optional): flag to indicate if . Defaults to False. + save_embed_directory (str, optional): [description]. Defaults to None. 
+ + Returns: + np.array: numpy array of embedding matrix + """ + embedding_matrix = np.zeros(len(vocab), embed_dim) + embedding_matrix[1, :] = np.random.uniform( + -1 / np.sqrt(embed_dim), 1 / np.sqrt(embed_dim), (1, embed_dim) + ) + word_vec = load_word_vec(word_vec_file_path, vocab, embed_dim) + for word, idx in vocab.items(): + vec = word_vec.get(word) + if vec is not None: + embedding_matrix[idx] = vec + + if save_embed_matrix: + if save_embed_directory is not None: + save_dir = pathlib.Path(save_embed_directory) + save_dir.mkdir(exist_ok=True) + with open("embedding_matrix.pkl", "wb") as fout: + pickle.dump(embedding_matrix, fout) + + return embedding_matrix + + class BucketIterator(object): def __init__( self, data, batch_size, sort_key="text_indices", shuffle=True, sort=True From be7be52784c555a916589b459f1e7af54543a5c8 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 14 Dec 2021 13:22:19 +0800 Subject: [PATCH 024/201] [#41] flesh out docstrings and typehints for utils method --- sgnlp/models/sentic_asgcn/utils.py | 31 +++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py index 5235119..e4b7742 100644 --- a/sgnlp/models/sentic_asgcn/utils.py +++ b/sgnlp/models/sentic_asgcn/utils.py @@ -1,12 +1,10 @@ import argparse import json -from logging import error import math -from pickle import load import pickle import random import pathlib -from typing import Dict +from typing import Dict, Iterable, List import numpy as np import torch @@ -115,6 +113,10 @@ def build_embedding_matrix( class BucketIterator(object): + """ + Bucket iterator class which provides sorting and padding for input dataset, iterate thru dataset batches + """ + def __init__( self, data, batch_size, sort_key="text_indices", shuffle=True, sort=True ): @@ -124,7 +126,17 @@ def __init__( self.batches = self.sort_and_pad(data, batch_size) self.batch_len = len(self.batches) - def sort_and_pad(self, data, batch_size): + def sort_and_pad(self, data, batch_size: int) -> List[Dict[str, torch.tensor]]: + """ + Class method to sort and pad data batches + + Args: + data ([type]): input data + batch_size (int): batch size + + Returns: + List[Dict[str, torch.tensor]]: return a list of dictionaries of tensors + """ num_batch = int(math.ceil(len(data) / batch_size)) sorted_data = ( sorted(data, key=lambda x: len(x[self.sort_key])) if self.sort else data @@ -135,7 +147,16 @@ def sort_and_pad(self, data, batch_size): ] return batches - def pad_data(self, batch_data): + def pad_data(self, batch_data: Iterable) -> Dict[str, torch.tensor]: + """ + Class method to pad data batches + + Args: + batch_data (Iterable): An iterable for looping thru input dataset + + Returns: + Dict[str, torch.tensor]: return dictionary of tensors from data batches + """ batch_text_indices = [] batch_context_indices = [] batch_aspect_indices = [] From ed89c1d9d565c28955bf9873a5c51b4da8654cf8 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 14 Dec 2021 14:36:53 +0800 Subject: [PATCH 025/201] [#41] add draft ABSADatasetReader class --- sgnlp/models/sentic_asgcn/utils.py | 70 ++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py index e4b7742..333c305 100644 --- a/sgnlp/models/sentic_asgcn/utils.py +++ b/sgnlp/models/sentic_asgcn/utils.py @@ -1,5 +1,6 @@ import argparse import json +from logging import error import math import pickle import random @@ -8,6 
+9,7 @@ import numpy as np import torch +from transformers import PreTrainedTokenizer from data_class import SenticASGCNTrainArgs @@ -220,3 +222,71 @@ def __iter__(self): random.shuffle(self.batches) for idx in range(self.batch_len): yield self.batches[idx] + + +class ABSADataset(object): + """ + Data class to hold dataset for training. + """ + + def __init__(self, data): + self.data = data + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return len(self.data) + + +class ABSADatasetReader: + def __init__( + self, + dataset_file_names: List[str], + tokenizer: PreTrainedTokenizer, + embed_dim: int = 300, + ): + self.embed_dim = embed_dim + self.tokenizer = tokenizer + # TODO: Figure out how to include the embedding matrix here + # self.embedding_matrix = build_embedding_matrix() + + @staticmethod + def __read_data__(file_name: str, tokenizer: PreTrainedTokenizer): + # Read raw data, graph data and tree data + with open( + file_name, "r", encoding="utf-8", newline="\n", errors="ignore" + ) as fin: + lines = fin.readlines() + with open(f"{file_name}.graph", "rb") as fin_graph: + idx2graph = pickle.load(fin_graph) + with open(f"{file_name}.tree", "rb") as fin_tree: + idx2tree = pickle.load(fin_tree) + + # Prep all data + all_data = [] + for i in range(0, len(lines), 3): + text_left, _, text_right = [ + s.lower().strip() for s in lines[i].partition("$T$") + ] + aspect = lines[i + 1].lower().strip() + polarity = lines[i + 2].lower().strip() + text_indices = tokenizer(f"{text_left} {aspect} {text_right}") + context_indices = tokenizer(f"{text_left} {text_right}") + aspect_indices = tokenizer(aspect) + left_indices = tokenizer(text_left) + polarity = int(polarity) + 1 + dependency_graph = idx2graph[i] + dependency_tree = idx2tree[i] + + data = { + "text_indices": text_indices, + "context_indices": context_indices, + "aspect_indices": aspect_indices, + "left_indices": left_indices, + "polarity": polarity, + "dependency_graph": dependency_graph, + "dependency_tree": dependency_tree, + } + all_data.append(data) + return all_data From 90d2f6a54cd423c9381444d0fa8dfb2374fdddb3 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 15 Dec 2021 08:53:11 +0800 Subject: [PATCH 026/201] [#41] add raw, graph and tree dataset path to config and dataclass --- .../sentic_asgcn/config/sentic_asgcn_config.json | 10 ++++++++++ sgnlp/models/sentic_asgcn/data_class.py | 14 +++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json index d08aa9d..272d4b0 100644 --- a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json +++ b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json @@ -11,6 +11,16 @@ "./datasets/semeval16/restaurant_train.raw", "./datasets/semeval16/restaurant_test.raw" ], + "dataset_train": { + "raw": "./datasets/semeval14/restaurant_train.raw", + "graph": "./datasets/semeval14/restaurant_train.raw.graph", + "tree": "./datasets/semeval14/restaurant_train.raw.tree" + }, + "dataset_test": { + "raw": "./datasets/semeval14/restaurant_test.raw", + "graph": "./datasets/semeval14/restaurant_test.raw.graph", + "tree": "./datasets/semeval14/restaurant_test.raw.tree" + }, "initializer": "xavier_uniform", "optimizer": "adam", "learning_rate": 0.001, diff --git a/sgnlp/models/sentic_asgcn/data_class.py b/sgnlp/models/sentic_asgcn/data_class.py index cdf4bcf..7ace9cc 100644 --- a/sgnlp/models/sentic_asgcn/data_class.py +++ 
b/sgnlp/models/sentic_asgcn/data_class.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import List +from typing import Dict, List @dataclass @@ -7,6 +7,18 @@ class SenticASGCNTrainArgs: dependency_graph_preprocess: List[str] = field( default=list, metadata={"help": "List of raw dataset to process."} ) + dataset_train: Dict[str, str] = field( + default=dict, + metadata={ + "help": "Dictionary containing 3 file paths to the raw, graph and tree train datasets." + }, + ) + dataset_test: Dict[str, str] = field( + default=dict, + metadata={ + "help": "Dictionary containing 3 file paths to the raw, graph and tree test datasets." + }, + ) initializer: str = field( default="xavier_uniform", metadata={"help": "Type of initalizer to use."} ) From 50ae2f0f1e493cdff655111646406e92e1722165 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 15 Dec 2021 09:39:17 +0800 Subject: [PATCH 027/201] [#41] rework integration for embedding matrix, apply proper formatting with black --- sgnlp/models/sentic_asgcn/config.py | 8 +-- .../config/sentic_asgcn_config.json | 3 + sgnlp/models/sentic_asgcn/data_class.py | 60 +++++++---------- sgnlp/models/sentic_asgcn/modeling.py | 10 +-- .../sentic_asgcn/modules/dynamic_rnn.py | 32 +++++----- sgnlp/models/sentic_asgcn/modules/gcn.py | 4 +- .../preprocess_dependency_graph.py | 12 +--- sgnlp/models/sentic_asgcn/tokenization.py | 12 +--- sgnlp/models/sentic_asgcn/utils.py | 64 ++++++++----------- 9 files changed, 82 insertions(+), 123 deletions(-) diff --git a/sgnlp/models/sentic_asgcn/config.py b/sgnlp/models/sentic_asgcn/config.py index ff16a8b..225affa 100644 --- a/sgnlp/models/sentic_asgcn/config.py +++ b/sgnlp/models/sentic_asgcn/config.py @@ -24,13 +24,7 @@ class SenticASGCNConfig(PreTrainedConfig): """ def __init__( - self, - embed_dim=300, - hidden_dim=300, - polarities_dim=3, - dropout=0.3, - device=torch.device("cuda"), - **kwargs + self, embed_dim=300, hidden_dim=300, polarities_dim=3, dropout=0.3, device=torch.device("cuda"), **kwargs ): super().__init__(**kwargs) self.embed_dim = embed_dim diff --git a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json index 272d4b0..dadf835 100644 --- a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json +++ b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json @@ -21,6 +21,9 @@ "graph": "./datasets/semeval14/restaurant_test.raw.graph", "tree": "./datasets/semeval14/restaurant_test.raw.tree" }, + "word_vec_file_path": "./glove/glove.840B.300d.txt", + "save_embedding_matrix": true, + "saved_embedding_matrix_file_path": "./embedding/embeddings.pickle", "initializer": "xavier_uniform", "optimizer": "adam", "learning_rate": 0.001, diff --git a/sgnlp/models/sentic_asgcn/data_class.py b/sgnlp/models/sentic_asgcn/data_class.py index 7ace9cc..5ce38ab 100644 --- a/sgnlp/models/sentic_asgcn/data_class.py +++ b/sgnlp/models/sentic_asgcn/data_class.py @@ -4,55 +4,43 @@ @dataclass class SenticASGCNTrainArgs: - dependency_graph_preprocess: List[str] = field( - default=list, metadata={"help": "List of raw dataset to process."} - ) + dependency_graph_preprocess: List[str] = field(default=list, metadata={"help": "List of raw dataset to process."}) dataset_train: Dict[str, str] = field( default=dict, - metadata={ - "help": "Dictionary containing 3 file paths to the raw, graph and tree train datasets." 
-    },
+        metadata={"help": "Dictionary containing 3 file paths to the raw, graph and tree train datasets."},
     )
     dataset_test: Dict[str, str] = field(
         default=dict,
-        metadata={
-            "help": "Dictionary containing 3 file paths to the raw, graph and tree test datasets."
-        },
+        metadata={"help": "Dictionary containing 3 file paths to the raw, graph and tree test datasets."},
     )
-    initializer: str = field(
-        default="xavier_uniform", metadata={"help": "Type of initalizer to use."}
+    word_vec_file_path: str = field(
+        default="glove/glove.840B.300d.txt",
+        metadata={"help": "File path to word vector."},
     )
-    optimizer: str = field(
-        default="adam", metadata={"help": "Type of optimizer to use."}
+    save_embedding_matrix: bool = field(
+        default=True,
+        metadata={"help": "Flag to indicate if embedding matrix should be saved. Flag is ignored if 'saved_embedding_matrix_file_path' is populated and valid."},
     )
-    learning_rate: float = field(
-        default=0.001, metadata={"help": "Default learning rate for training."}
+    saved_embedding_matrix_file_path: str = field(
+        default="embedding/embeddings.pickle",
+        metadata={
+            "help": "Full path of saved embedding matrix, if file exists, embeddings will be generated from file instead of generated from word vector and vocab."
+        },
     )
+    initializer: str = field(default="xavier_uniform", metadata={"help": "Type of initializer to use."})
+    optimizer: str = field(default="adam", metadata={"help": "Type of optimizer to use."})
+    learning_rate: float = field(default=0.001, metadata={"help": "Default learning rate for training."})
     l2reg: float = field(default=0.00001, metadata={"help": "Default l2reg value."})
     epochs: int = field(default=100, metadata={"help": "Number of epochs to train."})
     batch_size: int = field(default=32, metadata={"help": "Training batch size."})
     log_step: int = field(default=5, metadata={"help": "Default log step."})
-    embed_dim: int = field(
-        default=300, metadata={"help": "Number of neurons for embed layer."}
-    )
-    hidden_dim: int = field(
-        default=300, metadata={"help": "Number of neurons for hidden layer."}
-    )
-    dropout: float = field(
-        default=0.3, metadata={"help": "Default value for dropout percentages."}
-    )
-    polarities_dim: int = field(
-        default=3, metadata={"help": "Default dimension for polarities."}
-    )
-    save: bool = field(
-        default=True, metadata={"help": "Flag to indicate if results should be saved."}
-    )
-    seed: int = field(
-        default=776, metadata={"help": "Default random seed for training."}
-    )
-    device: str = field(
-        default="cuda", metadata={"help": "Type of compute device to use for training."}
-    )
+    embed_dim: int = field(default=300, metadata={"help": "Size of embedding."})
+    hidden_dim: int = field(default=300, metadata={"help": "Number of neurons for hidden layer."})
+    dropout: float = field(default=0.3, metadata={"help": "Default value for dropout percentages."})
+    polarities_dim: int = field(default=3, metadata={"help": "Default dimension for polarities."})
+    save: bool = field(default=True, metadata={"help": "Flag to indicate if results should be saved."})
+    seed: int = field(default=776, metadata={"help": "Default random seed for training."})
+    device: str = field(default="cuda", metadata={"help": "Type of compute device to use for training."})
 
     def __post_init__(self):
         assert self.initializer in [
diff --git a/sgnlp/models/sentic_asgcn/modeling.py b/sgnlp/models/sentic_asgcn/modeling.py
index a23b9e6..604213d 100644
--- a/sgnlp/models/sentic_asgcn/modeling.py
+++ b/sgnlp/models/sentic_asgcn/modeling.py
@@ -83,9 +83,7 @@ def 
forward(self, inputs): text_len = torch.sum(text_indices != 0, dim=-1) aspect_len = torch.sum(aspect_indices != 0, dim=-1) left_len = torch.sum(left_indices != 0, dim=-1) - aspect_double_idx = torch.cat( - [left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1 - ) + aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1) text = self.text_embed_dropout(self.embed(text_indices)) text_out, (_, _) = self.text_lstm(text, text_len) x = F.relu( @@ -94,11 +92,7 @@ def forward(self, inputs): adj, ) ) - x = F.relu( - self.gc2( - self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj - ) - ) + x = F.relu(self.gc2(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj)) alpha_mat = torch.matmul(x, text_out.transpose(1, 2)) alpha = F.softmax(alpha_mat.sum(1, keepdim=True), dim=2) x = torch.matmul(alpha, text_out).squeeze(1) # batch_size x 2 * hidden_dim diff --git a/sgnlp/models/sentic_asgcn/modules/dynamic_rnn.py b/sgnlp/models/sentic_asgcn/modules/dynamic_rnn.py index fa96702..74b750e 100644 --- a/sgnlp/models/sentic_asgcn/modules/dynamic_rnn.py +++ b/sgnlp/models/sentic_asgcn/modules/dynamic_rnn.py @@ -6,17 +6,19 @@ class DynamicLSTM(nn.Module): """ A dynamic LSTM class which can hold variable length sequence """ + def __init__( - self, - input_size, - hidden_size, - num_layers=1, - bias=True, - batch_first=True, - dropout=0, - bidirectional=False, - only_use_last_hidden_state=False, - rnn_type='LSTM') -> None: + self, + input_size, + hidden_size, + num_layers=1, + bias=True, + batch_first=True, + dropout=0, + bidirectional=False, + only_use_last_hidden_state=False, + rnn_type="LSTM", + ) -> None: super(DynamicLSTM, self).__init__() self.input_size = input_size self.hidden_size = hidden_size @@ -40,13 +42,13 @@ def __init_rnn(self) -> None: "bias": self.bias, "batch_first": self.batch_first, "dropout": self.dropout, - "bidirectional": self.bidirectional + "bidirectional": self.bidirectional, } - if self.rnn_type == 'LSTM': + if self.rnn_type == "LSTM": self.rnn = nn.LSTM(**input_args) - elif self.rnn_type == 'GRU': + elif self.rnn_type == "GRU": self.rnn = nn.GRU(**input_args) - elif self.rnn_type == 'RNN': + elif self.rnn_type == "RNN": self.rnn = nn.RNN(**input_args) def forward(self, x, x_len, h0=None): @@ -79,7 +81,7 @@ def forward(self, x, x_len, h0=None): out = out[x_unsort_idx] # Unsort: out c - if self.rnn_type == 'LSTM': + if self.rnn_type == "LSTM": # (num_layers * num_directions, batch, hidden_size) -> (batch, ...) 
ct = torch.transpose(ct, 0, 1)[x_unsort_idx] ct = torch.transpose(ct, 0, 1) diff --git a/sgnlp/models/sentic_asgcn/modules/gcn.py b/sgnlp/models/sentic_asgcn/modules/gcn.py index 58edec4..f53ddbd 100644 --- a/sgnlp/models/sentic_asgcn/modules/gcn.py +++ b/sgnlp/models/sentic_asgcn/modules/gcn.py @@ -1,10 +1,12 @@ import torch import torch.nn as nn + class GraphConvolution(nn.Module): """ Simple GCN Layer, similar to https://arxiv.org/abs/1609.02907 """ + def __init__(self, in_features, out_features, bias=True) -> None: super(GraphConvolution, self).__init__() self.in_features = in_features @@ -13,7 +15,7 @@ def __init__(self, in_features, out_features, bias=True) -> None: if bias: self.bias = nn.Parameter(torch.FloatTensor(out_features)) else: - self.register_parameter('bias', None) + self.register_parameter("bias", None) def forward(self, text, adj): hidden = torch.matmul(text, self.weight) diff --git a/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py b/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py index d0b5a44..32afce8 100644 --- a/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py +++ b/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py @@ -48,20 +48,14 @@ def process(self, filename: str): Args: filename (str): filename of raw dataset to process """ - with open( - filename, "r", encoding="utf-8", newline="\n", errors="ignore" - ) as fin: + with open(filename, "r", encoding="utf-8", newline="\n", errors="ignore") as fin: lines = fin.readlines() idx2graph = {} with open(f"{filename}.graph", "wb") as fout: for i in range(0, len(lines), 3): - text_left, _, text_right = [ - s.lower().strip() for s in lines[i].partition("$T$") - ] + text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] aspect = lines[i + 1].lower().strip() - idx2graph[i] = self.__dependency_adj_matrix( - f"{text_left} {aspect} {text_right}" - ) + idx2graph[i] = self.__dependency_adj_matrix(f"{text_left} {aspect} {text_right}") pickle.dump(idx2graph, fout) diff --git a/sgnlp/models/sentic_asgcn/tokenization.py b/sgnlp/models/sentic_asgcn/tokenization.py index 992866f..a2a94a9 100644 --- a/sgnlp/models/sentic_asgcn/tokenization.py +++ b/sgnlp/models/sentic_asgcn/tokenization.py @@ -61,14 +61,10 @@ def __read_text_file(file_names: List[str]) -> str: """ text = "" for fname in file_names: - with open( - fname, "r", encoding="utf-8", newline="\n", errors="ignore" - ) as fin: + with open(fname, "r", encoding="utf-8", newline="\n", errors="ignore") as fin: lines = fin.readlines() for i in range(0, len(lines), 3): - text_left, _, text_right = [ - s.lower().strip() for s in lines[i].partition("$T$") - ] + text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] aspect = lines[i + 1].lower().strip() text += f"{text_left} {aspect} {text_right} " # Left a space at the end return text @@ -95,9 +91,7 @@ def _tokenize(self, text, **kwargs): words = text.split() return words - def save_vocabulary( - self, save_directory: str, filename_prefix: Optional[str] = None - ) -> Tuple[str]: + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: save_dir = pathlib.Path(save_directory) save_dir.mkdir(exist_ok=True) vocab_file_path = save_dir.joinpath("vocab.pkl") diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py index 333c305..20846d5 100644 --- a/sgnlp/models/sentic_asgcn/utils.py +++ b/sgnlp/models/sentic_asgcn/utils.py @@ -48,9 +48,7 @@ def set_random_seed(seed: int = 776) -> None: 
torch.backends.cudnn.benchmark = False -def load_word_vec( - word_vec_file_path: str, vocab: Dict[str, int], embed_dim: int = 300 -) -> Dict[str, np.asarray]: +def load_word_vec(word_vec_file_path: str, vocab: Dict[str, int], embed_dim: int = 300) -> Dict[str, np.asarray]: """ Helper method to load word vectors from file (e.g. GloVe) for each word in vocab. @@ -62,9 +60,7 @@ def load_word_vec( Returns: Dict[str, np.asarray]: dictionary with words as key and word vectors as values. """ - with open( - word_vec_file_path, "r", encoding="utf-8", newline="\n", errors="ignore" - ) as fin: + with open(word_vec_file_path, "r", encoding="utf-8", newline="\n", errors="ignore") as fin: word_vec = {} for line in fin: tokens = line.rstrip().split() @@ -79,7 +75,7 @@ def build_embedding_matrix( vocab: Dict[str, int], embed_dim: int = 300, save_embed_matrix: bool = False, - save_embed_directory: str = None, + save_embed_file_path: str = None, ) -> np.ndarray: """ Helper method to generate an embedding matrix. @@ -95,9 +91,7 @@ def build_embedding_matrix( np.array: numpy array of embedding matrix """ embedding_matrix = np.zeros(len(vocab), embed_dim) - embedding_matrix[1, :] = np.random.uniform( - -1 / np.sqrt(embed_dim), 1 / np.sqrt(embed_dim), (1, embed_dim) - ) + embedding_matrix[1, :] = np.random.uniform(-1 / np.sqrt(embed_dim), 1 / np.sqrt(embed_dim), (1, embed_dim)) word_vec = load_word_vec(word_vec_file_path, vocab, embed_dim) for word, idx in vocab.items(): vec = word_vec.get(word) @@ -105,10 +99,10 @@ def build_embedding_matrix( embedding_matrix[idx] = vec if save_embed_matrix: - if save_embed_directory is not None: - save_dir = pathlib.Path(save_embed_directory) - save_dir.mkdir(exist_ok=True) - with open("embedding_matrix.pkl", "wb") as fout: + save_file_path = pathlib.Path(save_embed_file_path) + if not save_file_path.exists(): + save_file_path.parent.mkdir(exist_ok=True) + with open(save_file_path, "wb") as fout: pickle.dump(embedding_matrix, fout) return embedding_matrix @@ -119,9 +113,7 @@ class BucketIterator(object): Bucket iterator class which provides sorting and padding for input dataset, iterate thru dataset batches """ - def __init__( - self, data, batch_size, sort_key="text_indices", shuffle=True, sort=True - ): + def __init__(self, data, batch_size, sort_key="text_indices", shuffle=True, sort=True): self.shuffle = shuffle self.sort = sort self.sort_key = sort_key @@ -140,13 +132,8 @@ def sort_and_pad(self, data, batch_size: int) -> List[Dict[str, torch.tensor]]: List[Dict[str, torch.tensor]]: return a list of dictionaries of tensors """ num_batch = int(math.ceil(len(data) / batch_size)) - sorted_data = ( - sorted(data, key=lambda x: len(x[self.sort_key])) if self.sort else data - ) - batches = [ - self.pad_data(sorted_data[i * batch_size : (i + 1) * batch_size]) - for i in range(num_batch) - ] + sorted_data = sorted(data, key=lambda x: len(x[self.sort_key])) if self.sort else data + batches = [self.pad_data(sorted_data[i * batch_size : (i + 1) * batch_size]) for i in range(num_batch)] return batches def pad_data(self, batch_data: Iterable) -> Dict[str, torch.tensor]: @@ -242,33 +229,34 @@ def __len__(self): class ABSADatasetReader: def __init__( self, - dataset_file_names: List[str], + config: SenticASGCNTrainArgs, tokenizer: PreTrainedTokenizer, - embed_dim: int = 300, ): - self.embed_dim = embed_dim + self.cfg = config self.tokenizer = tokenizer - # TODO: Figure out how to include the embedding matrix here - # self.embedding_matrix = build_embedding_matrix() + 
self.embedding_matrix = build_embedding_matrix(
+            config.word_vec_file_path,
+            tokenizer.vocab,
+            config.embed_dim,
+            config.save_embedding_matrix,
+            config.saved_embedding_matrix_file_path,
+        )
+        self.train_data = ABSADataset(ABSADatasetReader.__read_data__(self.cfg.dataset_train))
 
     @staticmethod
-    def __read_data__(file_name: str, tokenizer: PreTrainedTokenizer):
+    def __read_data__(datasets: Dict[str, str], tokenizer: PreTrainedTokenizer):
         # Read raw data, graph data and tree data
-        with open(
-            file_name, "r", encoding="utf-8", newline="\n", errors="ignore"
-        ) as fin:
+        with open(datasets["raw"], "r", encoding="utf-8", newline="\n", errors="ignore") as fin:
             lines = fin.readlines()
-        with open(f"{file_name}.graph", "rb") as fin_graph:
+        with open(datasets["graph"], "rb") as fin_graph:
             idx2graph = pickle.load(fin_graph)
-        with open(f"{file_name}.tree", "rb") as fin_tree:
+        with open(datasets["tree"], "rb") as fin_tree:
             idx2tree = pickle.load(fin_tree)
 
         # Prep all data
         all_data = []
         for i in range(0, len(lines), 3):
-            text_left, _, text_right = [
-                s.lower().strip() for s in lines[i].partition("$T$")
-            ]
+            text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
             aspect = lines[i + 1].lower().strip()
             polarity = lines[i + 2].lower().strip()
             text_indices = tokenizer(f"{text_left} {aspect} {text_right}")

From 21098dd9d4e83be0c17697a3993f9ea094dd6ace Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Wed, 15 Dec 2021 09:43:12 +0800
Subject: [PATCH 028/201] [#41] add missing tokenizer args

---
 sgnlp/models/sentic_asgcn/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py
index 20846d5..1aa6c20 100644
--- a/sgnlp/models/sentic_asgcn/utils.py
+++ b/sgnlp/models/sentic_asgcn/utils.py
@@ -241,7 +241,8 @@ def __init__(
             config.save_embedding_matrix,
             config.saved_embedding_matrix_file_path,
         )
-        self.train_data = ABSADataset(ABSADatasetReader.__read_data__(self.cfg.dataset_train))
+        self.train_data = ABSADataset(ABSADatasetReader.__read_data__(self.cfg.dataset_train, tokenizer))
+        self.test_data = ABSADataset(ABSADatasetReader.__read_data__(self.cfg.dataset_test, tokenizer))

From 8067150b92cc0b62a7f4cab37025f9c096f9176d Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Wed, 15 Dec 2021 10:50:27 +0800
Subject: [PATCH 029/201] [#41] bug fix for missing brackets

---
 sgnlp/models/sentic_asgcn/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py
index 1aa6c20..4738620 100644
--- a/sgnlp/models/sentic_asgcn/utils.py
+++ b/sgnlp/models/sentic_asgcn/utils.py
@@ -83,14 +83,14 @@ def build_embedding_matrix(
     Args:
         word_vec_file_path (str): full file path to word vectors.
         vocab (Dict[str, int]): dictionary of vocab word as key and word index as values.
-        embed_dim (int, optional): embedding dimensiion. Defaults to 300.
+        embed_dim (int, optional): embedding dimension. Defaults to 300.
         save_embed_matrix (bool, optional): flag to indicate if embedding matrix should be saved to file. Defaults to False.
         save_embed_file_path (str, optional): full file path to save the embedding matrix. Defaults to None. 
Returns: np.array: numpy array of embedding matrix """ - embedding_matrix = np.zeros(len(vocab), embed_dim) + embedding_matrix = np.zeros((len(vocab), embed_dim)) embedding_matrix[1, :] = np.random.uniform(-1 / np.sqrt(embed_dim), 1 / np.sqrt(embed_dim), (1, embed_dim)) word_vec = load_word_vec(word_vec_file_path, vocab, embed_dim) for word, idx in vocab.items(): From ec5deb04a933c3b836cc9a6cf72592e1ad33b0b6 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 16 Dec 2021 14:40:23 +0800 Subject: [PATCH 030/201] [#41] draft fix to integrate batch_encoding structure with BucketIterator --- .../config/sentic_asgcn_config.json | 36 ++++++------- sgnlp/models/sentic_asgcn/train.py | 13 +++-- sgnlp/models/sentic_asgcn/utils.py | 53 ++++++++++--------- 3 files changed, 57 insertions(+), 45 deletions(-) diff --git a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json index dadf835..fc4f619 100644 --- a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json +++ b/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json @@ -1,29 +1,29 @@ { "dependency_graph_preprocess": [ - "./datasets/acl-14-short-data/train.raw", - "./datasets/acl-14-short-data/test.raw", - "./datasets/semeval14/restaurant_train.raw", - "./datasets/semeval14/restaurant_test.raw", - "./datasets/semeval14/laptop_train.raw", - "./datasets/semeval14/laptop_test.raw", - "./datasets/semeval15/restaurant_train.raw", - "./datasets/semeval15/restaurant_test.raw", - "./datasets/semeval16/restaurant_train.raw", - "./datasets/semeval16/restaurant_test.raw" + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/acl-14-short-data/train.raw", + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/acl-14-short-data/test.raw", + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/laptop_train.raw", + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/laptop_test.raw", + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval15/restaurant_train.raw", + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval15/restaurant_test.raw", + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval16/restaurant_train.raw", + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval16/restaurant_test.raw" ], "dataset_train": { - "raw": "./datasets/semeval14/restaurant_train.raw", - "graph": "./datasets/semeval14/restaurant_train.raw.graph", - "tree": "./datasets/semeval14/restaurant_train.raw.tree" + "raw": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", + "graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.graph", + "tree": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.tree" }, "dataset_test": { - "raw": "./datasets/semeval14/restaurant_test.raw", - "graph": "./datasets/semeval14/restaurant_test.raw.graph", - "tree": "./datasets/semeval14/restaurant_test.raw.tree" + "raw": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", + 
"graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.graph", + "tree": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.tree" }, - "word_vec_file_path": "./glove/glove.840B.300d.txt", + "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", "save_embedding_matrix": true, - "saved_embedding_matrix_file_path": "./embedding/embeddings.pickle", + "saved_embedding_matrix_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/embedding/embeddings.pickle", "initializer": "xavier_uniform", "optimizer": "adam", "learning_rate": 0.001, diff --git a/sgnlp/models/sentic_asgcn/train.py b/sgnlp/models/sentic_asgcn/train.py index 6471800..b49ac36 100644 --- a/sgnlp/models/sentic_asgcn/train.py +++ b/sgnlp/models/sentic_asgcn/train.py @@ -1,9 +1,16 @@ -from .data_class import SenticASGCNTrainArgs -from .utils import parse_args_and_load_config, set_random_seed +from data_class import SenticASGCNTrainArgs +from utils import parse_args_and_load_config, set_random_seed, ABSADatasetReader, BucketIterator + +from tokenization import SenticASGCNTokenizer def train_model(cfg: SenticASGCNTrainArgs): - pass + tokenizer = SenticASGCNTokenizer.from_pretrained( + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/tokenizer/" + ) + absa_dataset = ABSADatasetReader(cfg, tokenizer) + train_dataloader = BucketIterator(data=absa_dataset.train_data, batch_size=cfg.batch_size, shuffle=True) + test_dataloader = BucketIterator(data=absa_dataset.test_data, batch_size=cfg.batch_size, shuffle=False) if __name__ == "__main__": diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/sentic_asgcn/utils.py index 4738620..688c69b 100644 --- a/sgnlp/models/sentic_asgcn/utils.py +++ b/sgnlp/models/sentic_asgcn/utils.py @@ -5,11 +5,12 @@ import pickle import random import pathlib -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Union import numpy as np import torch from transformers import PreTrainedTokenizer +from transformers.tokenization_utils_base import BatchEncoding from data_class import SenticASGCNTrainArgs @@ -113,7 +114,7 @@ class BucketIterator(object): Bucket iterator class which provides sorting and padding for input dataset, iterate thru dataset batches """ - def __init__(self, data, batch_size, sort_key="text_indices", shuffle=True, sort=True): + def __init__(self, data, batch_size: int, sort_key="text_indices", shuffle=True, sort=True): self.shuffle = shuffle self.sort = sort self.sort_key = sort_key @@ -125,7 +126,7 @@ def sort_and_pad(self, data, batch_size: int) -> List[Dict[str, torch.tensor]]: Class method to sort and pad data batches Args: - data ([type]): input data + data (ABSADataset): input data batch_size (int): batch size Returns: @@ -136,12 +137,21 @@ def sort_and_pad(self, data, batch_size: int) -> List[Dict[str, torch.tensor]]: batches = [self.pad_data(sorted_data[i * batch_size : (i + 1) * batch_size]) for i in range(num_batch)] return batches - def pad_data(self, batch_data: Iterable) -> Dict[str, torch.tensor]: + def pad_batch_encoding(self, data: BatchEncoding, max_len: int) -> BatchEncoding: + input_ids_pad = [0] * (max_len - len(data["input_ids"])) + token_type_ids_pad = input_ids_pad.copy() + attention_mask_pad = [1] * (max_len - len(data["attention_mask"])) + data["input_ids"] = torch.tensor(data["input_ids"] + input_ids_pad) + 
data["token_type_ids"] = torch.tensor(data["token_type_ids"] + token_type_ids_pad) + data["attention_mask_pad"] = torch.tensor(data["attention_mask"] + attention_mask_pad) + return data + + def pad_data(self, batch_data: List[Dict[str, Union[BatchEncoding, int, np.ndarray]]]) -> Dict[str, torch.tensor]: """ Class method to pad data batches Args: - batch_data (Iterable): An iterable for looping thru input dataset + batch_data (List[Dict[str, Union[BatchEncoding, int, np.ndarray]]]): List of dictionaries containing all batches of data Returns: Dict[str, torch.tensor]: return dictionary of tensors from data batches @@ -153,7 +163,7 @@ def pad_data(self, batch_data: Iterable) -> Dict[str, torch.tensor]: batch_polarity = [] batch_dependency_graph = [] batch_dependency_tree = [] - max_len = max([len(t[self.sort_key]) for t in batch_data]) + max_len = max([len(t[self.sort_key]["input_ids"]) for t in batch_data]) # [text_indices, context_indices, aspect_indices, left_indices, polarity, dependency_graph, dependency_tree] for item in batch_data: text_indices = item["text_indices"] @@ -164,22 +174,17 @@ def pad_data(self, batch_data: Iterable) -> Dict[str, torch.tensor]: dependency_graph = item["dependency_graph"] dependency_tree = item["dependency_tree"] - text_padding = [0] * (max_len - len(text_indices)) - context_padding = [0] * (max_len - len(context_indices)) - aspect_padding = [0] * (max_len - len(aspect_indices)) - left_padding = [0] * (max_len - len(left_indices)) - - batch_text_indices.append(text_indices + text_padding) - batch_context_indices.append(context_indices + context_padding) - batch_aspect_indices.append(aspect_indices + aspect_padding) - batch_left_indices.append(left_indices + left_padding) + batch_text_indices.append(self.pad_batch_encoding(text_indices, max_len)) + batch_context_indices.append(self.pad_batch_encoding(context_indices, max_len)) + batch_aspect_indices.append(self.pad_batch_encoding(aspect_indices, max_len)) + batch_left_indices.append(self.pad_batch_encoding(left_indices, max_len)) batch_polarity.append(polarity) batch_dependency_graph.append( np.pad( dependency_graph, ( - (0, max_len - len(text_indices)), - (0, max_len - len(text_indices)), + (0, max_len - len(text_indices["input_ids"])), + (0, max_len - len(text_indices["input_ids"])), ), "constant", ) @@ -188,18 +193,18 @@ def pad_data(self, batch_data: Iterable) -> Dict[str, torch.tensor]: np.pad( dependency_tree, ( - (0, max_len - len(text_indices)), - (0, max_len - len(text_indices)), + (0, max_len - len(text_indices["input_ids"])), + (0, max_len - len(text_indices["input_ids"])), ), "constant", ) ) return { - "text_indices": torch.tensor(batch_text_indices), - "context_indices": torch.tensor(batch_context_indices), - "aspect_indices": torch.tensor(batch_aspect_indices), - "left_indices": torch.tensor(batch_left_indices), - "polarity": torch.tensor(batch_polarity), + "text_indices": batch_text_indices, + "context_indices": batch_context_indices, + "aspect_indices": batch_aspect_indices, + "left_indices": batch_left_indices, + "polarity": batch_polarity, "dependency_graph": torch.tensor(batch_dependency_graph), "dependency_tree": torch.tensor(batch_dependency_tree), } From e756993f1d8a52ed56a6c0e3803093f86eeadf01 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 21 Dec 2021 14:17:17 +0800 Subject: [PATCH 031/201] [#41] cast input text sequence to float32 --- sgnlp/models/sentic_asgcn/modules/gcn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sgnlp/models/sentic_asgcn/modules/gcn.py 
b/sgnlp/models/sentic_asgcn/modules/gcn.py index f53ddbd..af6dc5b 100644 --- a/sgnlp/models/sentic_asgcn/modules/gcn.py +++ b/sgnlp/models/sentic_asgcn/modules/gcn.py @@ -18,6 +18,7 @@ def __init__(self, in_features, out_features, bias=True) -> None: self.register_parameter("bias", None) def forward(self, text, adj): + text = text.to(torch.float32) hidden = torch.matmul(text, self.weight) denom = torch.sum(adj, dim=2, keepdim=True) + 1 output = torch.matmul(adj, hidden) / denom From b7e2cadf07265ba97aa82c6e4b815bdf09a39f31 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 21 Dec 2021 14:46:19 +0800 Subject: [PATCH 032/201] [#41] rename filenames and classes to reflect proper name from paper --- .../{sentic_asgcn => senticnet_gcn}/__init__.py | 0 .../{sentic_asgcn => senticnet_gcn}/config.py | 10 +++++----- .../config/senticnet_gcn_config.json} | 0 .../{sentic_asgcn => senticnet_gcn}/data_class.py | 2 +- .../{sentic_asgcn => senticnet_gcn}/modeling.py | 14 +++++++------- .../modules/__init__.py | 0 .../modules/dynamic_rnn.py | 0 .../{sentic_asgcn => senticnet_gcn}/modules/gcn.py | 0 .../{sentic_asgcn => senticnet_gcn}/preprocess.py | 6 +++--- .../preprocess_dependency_graph.py | 0 .../tokenization.py | 2 +- .../{sentic_asgcn => senticnet_gcn}/train.py | 12 ++++++++++++ .../{sentic_asgcn => senticnet_gcn}/utils.py | 14 +++++++------- 13 files changed, 36 insertions(+), 24 deletions(-) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/__init__.py (100%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/config.py (74%) rename sgnlp/models/{sentic_asgcn/config/sentic_asgcn_config.json => senticnet_gcn/config/senticnet_gcn_config.json} (100%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/data_class.py (99%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/modeling.py (92%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/modules/__init__.py (100%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/modules/dynamic_rnn.py (100%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/modules/gcn.py (100%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/preprocess.py (78%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/preprocess_dependency_graph.py (100%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/tokenization.py (98%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/train.py (69%) rename sgnlp/models/{sentic_asgcn => senticnet_gcn}/utils.py (96%) diff --git a/sgnlp/models/sentic_asgcn/__init__.py b/sgnlp/models/senticnet_gcn/__init__.py similarity index 100% rename from sgnlp/models/sentic_asgcn/__init__.py rename to sgnlp/models/senticnet_gcn/__init__.py diff --git a/sgnlp/models/sentic_asgcn/config.py b/sgnlp/models/senticnet_gcn/config.py similarity index 74% rename from sgnlp/models/sentic_asgcn/config.py rename to sgnlp/models/senticnet_gcn/config.py index 225affa..491cd61 100644 --- a/sgnlp/models/sentic_asgcn/config.py +++ b/sgnlp/models/senticnet_gcn/config.py @@ -2,11 +2,11 @@ from transformers import PreTrainedConfig -class SenticASGCNConfig(PreTrainedConfig): +class SenticNetGCNConfig(PreTrainedConfig): """ This is the configuration class to store the configuration of a - :class:`~sgnlp.models.sentic_asgcn.modeling.SenticASGCNModel`. - It is used to instantiate a SenticASGCN network according to the specific arguments, defining the mdoel architecture. + :class:`~sgnlp.models.senticnet_gcn.modeling.SenticNetGCNConfig`. + It is used to instantiate a SenticNetGCNConfig network according to the specific arguments, defining the mdoel architecture. 
Args: embed_dim (:obj:`int`, defaults to 300): Embedding dimension size. @@ -17,10 +17,10 @@ class SenticASGCNConfig(PreTrainedConfig): Example: - from sgnlp.models.sentic_asgcn import SenticASGCNConfig + from sgnlp.models.senticnet_gcn import SenticNetGCNConfig # Initialize with default values - config = SenticASGCNConfig() + config = SenticNetGCNConfig() """ def __init__( diff --git a/sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json similarity index 100% rename from sgnlp/models/sentic_asgcn/config/sentic_asgcn_config.json rename to sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json diff --git a/sgnlp/models/sentic_asgcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py similarity index 99% rename from sgnlp/models/sentic_asgcn/data_class.py rename to sgnlp/models/senticnet_gcn/data_class.py index 5ce38ab..8c96347 100644 --- a/sgnlp/models/sentic_asgcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -3,7 +3,7 @@ @dataclass -class SenticASGCNTrainArgs: +class SenticNetGCNTrainArgs: dependency_graph_preprocess: List[str] = field(default=list, metadata={"help": "List of raw dataset to process."}) dataset_train: Dict[str, str] = field( default=dict, diff --git a/sgnlp/models/sentic_asgcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py similarity index 92% rename from sgnlp/models/sentic_asgcn/modeling.py rename to sgnlp/models/senticnet_gcn/modeling.py index 604213d..912f740 100644 --- a/sgnlp/models/sentic_asgcn/modeling.py +++ b/sgnlp/models/senticnet_gcn/modeling.py @@ -8,28 +8,28 @@ from .modules.dynamic_rnn import DynamicLSTM from .modules.gcn import GraphConvolution -from .config import SenticASGCNConfig +from .config import SenticNetGCNConfig @dataclass -class SenticASGCNModelOutput(ModelOutput): +class SenticNetGCNModelOutput(ModelOutput): pass -class SenticASGCNPreTrainedModel(PreTrainedModel): +class SenticNetGCNPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for download and loading pretrained models. 
""" - config_class = SenticASGCNConfig - base_model_prefix = "sentic_asgcn" + config_class = SenticNetGCNConfig + base_model_prefix = "senticnet_gcn" def _init_weights(self, module): pass -class SenticASGCNModel(SenticASGCNPreTrainedModel): - def __init__(self, config: SenticASGCNConfig) -> None: +class SenticNetGCNModel(SenticNetGCNPreTrainedModel): + def __init__(self, config: SenticNetGCNConfig) -> None: super().__init__(config) self.text_lstm = DynamicLSTM( config.embed_dim, diff --git a/sgnlp/models/sentic_asgcn/modules/__init__.py b/sgnlp/models/senticnet_gcn/modules/__init__.py similarity index 100% rename from sgnlp/models/sentic_asgcn/modules/__init__.py rename to sgnlp/models/senticnet_gcn/modules/__init__.py diff --git a/sgnlp/models/sentic_asgcn/modules/dynamic_rnn.py b/sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py similarity index 100% rename from sgnlp/models/sentic_asgcn/modules/dynamic_rnn.py rename to sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py diff --git a/sgnlp/models/sentic_asgcn/modules/gcn.py b/sgnlp/models/senticnet_gcn/modules/gcn.py similarity index 100% rename from sgnlp/models/sentic_asgcn/modules/gcn.py rename to sgnlp/models/senticnet_gcn/modules/gcn.py diff --git a/sgnlp/models/sentic_asgcn/preprocess.py b/sgnlp/models/senticnet_gcn/preprocess.py similarity index 78% rename from sgnlp/models/sentic_asgcn/preprocess.py rename to sgnlp/models/senticnet_gcn/preprocess.py index d159d6b..ec6ba2f 100644 --- a/sgnlp/models/sentic_asgcn/preprocess.py +++ b/sgnlp/models/senticnet_gcn/preprocess.py @@ -4,10 +4,10 @@ from transformers import PreTrainedTokenizer from transformers.tokenization_utils_base import BatchEncoding -from tokenization import SenticASGCNTokenizer +from tokenization import SenticNetGCNTokenizer -class SenticASGCNPreprocessor: +class SenticNetGCNPreprocessor: def __init__( self, tokenizer: PreTrainedTokenizer = None, @@ -18,7 +18,7 @@ def __init__( if tokenizer is not None: self.tokenizer = tokenizer else: - self.tokenizer = SenticASGCNTokenizer.from_pretrained(tokenizer_name) + self.tokenizer = SenticNetGCNTokenizer.from_pretrained(tokenizer_name) def __call__(self, data_batch: List[str]) -> BatchEncoding: tokens = self.tokenizer(data_batch, padding=True, return_tensors="pt") diff --git a/sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py b/sgnlp/models/senticnet_gcn/preprocess_dependency_graph.py similarity index 100% rename from sgnlp/models/sentic_asgcn/preprocess_dependency_graph.py rename to sgnlp/models/senticnet_gcn/preprocess_dependency_graph.py diff --git a/sgnlp/models/sentic_asgcn/tokenization.py b/sgnlp/models/senticnet_gcn/tokenization.py similarity index 98% rename from sgnlp/models/sentic_asgcn/tokenization.py rename to sgnlp/models/senticnet_gcn/tokenization.py index a2a94a9..891a327 100644 --- a/sgnlp/models/sentic_asgcn/tokenization.py +++ b/sgnlp/models/senticnet_gcn/tokenization.py @@ -8,7 +8,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.pkl"} -class SenticASGCNTokenizer(PreTrainedTokenizer): +class SenticNetGCNTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES def __init__( diff --git a/sgnlp/models/sentic_asgcn/train.py b/sgnlp/models/senticnet_gcn/train.py similarity index 69% rename from sgnlp/models/sentic_asgcn/train.py rename to sgnlp/models/senticnet_gcn/train.py index b49ac36..c645e1d 100644 --- a/sgnlp/models/sentic_asgcn/train.py +++ b/sgnlp/models/senticnet_gcn/train.py @@ -4,6 +4,18 @@ from tokenization import SenticASGCNTokenizer +class Trainer: + def __init__(self, cfg: 
SenticASGCNTrainArgs): + self.cfg = cfg + tokenizer = SenticASGCNTokenizer.from_pretrained( + "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/tokenizer/" + ) + dataset = ABSADatasetReader(self.cfg, tokenizer=tokenizer) + + def _train(self): + pass + + def train_model(cfg: SenticASGCNTrainArgs): tokenizer = SenticASGCNTokenizer.from_pretrained( "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/tokenizer/" diff --git a/sgnlp/models/sentic_asgcn/utils.py b/sgnlp/models/senticnet_gcn/utils.py similarity index 96% rename from sgnlp/models/sentic_asgcn/utils.py rename to sgnlp/models/senticnet_gcn/utils.py index 688c69b..f3101c7 100644 --- a/sgnlp/models/sentic_asgcn/utils.py +++ b/sgnlp/models/senticnet_gcn/utils.py @@ -5,23 +5,23 @@ import pickle import random import pathlib -from typing import Dict, Iterable, List, Union +from typing import Dict, List, Union import numpy as np import torch from transformers import PreTrainedTokenizer from transformers.tokenization_utils_base import BatchEncoding -from data_class import SenticASGCNTrainArgs +from data_class import SenticNetGCNTrainArgs def parse_args_and_load_config( - config_path: str = "config/sentic_asgcn_config.json", -) -> SenticASGCNTrainArgs: + config_path: str = "config/senticnet_gcn_config.json", +) -> SenticNetGCNTrainArgs: """Get config from config file using argparser Returns: - SenticASGCNTrainArgs: SenticASGCNTrainArgs instance populated from config + SenticNetGCNTrainArgs: SenticNetGCNTrainArgs instance populated from config """ parser = argparse.ArgumentParser(description="SenticASGCN Training") parser.add_argument("--config", type=str, default=config_path) @@ -31,7 +31,7 @@ def parse_args_and_load_config( with open(cfg_path, "r") as cfg_file: cfg = json.load(cfg_file) - sentic_asgcn_args = SenticASGCNTrainArgs(**cfg) + sentic_asgcn_args = SenticNetGCNTrainArgs(**cfg) return sentic_asgcn_args @@ -234,7 +234,7 @@ def __len__(self): class ABSADatasetReader: def __init__( self, - config: SenticASGCNTrainArgs, + config: SenticNetGCNTrainArgs, tokenizer: PreTrainedTokenizer, ): self.cfg = config From 48d6608e97425c9c57996bb46ec8dc3580ea8b64 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 21 Dec 2021 15:29:13 +0800 Subject: [PATCH 033/201] [#41] add SenticNetBertGCNConfig class --- sgnlp/models/senticnet_gcn/config.py | 36 +++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config.py b/sgnlp/models/senticnet_gcn/config.py index 491cd61..38ff9ec 100644 --- a/sgnlp/models/senticnet_gcn/config.py +++ b/sgnlp/models/senticnet_gcn/config.py @@ -5,14 +5,14 @@ class SenticNetGCNConfig(PreTrainedConfig): """ This is the configuration class to store the configuration of a - :class:`~sgnlp.models.senticnet_gcn.modeling.SenticNetGCNConfig`. - It is used to instantiate a SenticNetGCNConfig network according to the specific arguments, defining the mdoel architecture. + :class:`~sgnlp.models.senticnet_gcn.modeling.SenticNetGCNModel`. + It is used to instantiate a SenticNetGCNModel network according to the specific arguments, defining the model architecture. Args: embed_dim (:obj:`int`, defaults to 300): Embedding dimension size. hidden_dim (:obj:`int`, defaults to 300): Size of hidden dimension. dropout (:obj:`float`, defaults to 0.3): Droput percentage. - polarities_dim (:obj:`int`, defaults to 3): Size of output dimension represeting available polarities (e.g. Positive, Negative, Neutral). 
+ polarities_dim (:obj:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral). device (:obj:`torch.device`, defaults to torch.device('cuda`)): Type of torch device. Example: @@ -32,3 +32,33 @@ def __init__( self.dropout = dropout self.polarities_dim = polarities_dim self.device = device + + +class SenticNetGCNBertConfig(PreTrainedConfig): + """ + This is the configuration class to store the configuration of a :class:`~sgnlp.models.senticnet_gcn.modeling.SenticNetBertGCNModel`. + It is used to instantiate a SenticNetBertGCNModel network according to the specific arguments, defining the model architecture. + + Args: + hidden_dim (:obj:`int`, defaults to 768): The embedding dimension size for the Bert model as well as GCN dimension. + max_seq_len (:obj:`int`, defaults to 85): The max sequence length to pad and truncate. + dropout (:obj:`float`, defaults to 0.3): Dropout percentage. + polarities_dim (:ob:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral). + device (:obj:`torch.device`, defaults to torch.device('cuda')): Type of torch device + Example: + + from sgnlp.models.senticnet_gcn import SenticNetBertGCNConfig + + # Initialize with default values + config = SenticNetBertGCNConfig() + """ + + def __init__( + self, hidden_dim=768, max_seq_len=85, polarities_dim=3, dropout=0.3, device=torch.device("cuda"), **kwargs + ): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.max_seq_len = max_seq_len + self.dropout = dropout + self.polarities_dim = polarities_dim + self.device = device From 3e4667d40904ff766c7cb1a9b17b39622a8cec42 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 21 Dec 2021 16:39:18 +0800 Subject: [PATCH 034/201] [#41] add bert_model attributes to config, add skeleton SenticNetBertGCNModel --- sgnlp/models/senticnet_gcn/config.py | 26 +++++++++++++++----- sgnlp/models/senticnet_gcn/modeling.py | 33 ++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config.py b/sgnlp/models/senticnet_gcn/config.py index 38ff9ec..faa372a 100644 --- a/sgnlp/models/senticnet_gcn/config.py +++ b/sgnlp/models/senticnet_gcn/config.py @@ -1,4 +1,3 @@ -import torch from transformers import PreTrainedConfig @@ -13,7 +12,7 @@ class SenticNetGCNConfig(PreTrainedConfig): hidden_dim (:obj:`int`, defaults to 300): Size of hidden dimension. dropout (:obj:`float`, defaults to 0.3): Droput percentage. polarities_dim (:obj:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral). - device (:obj:`torch.device`, defaults to torch.device('cuda`)): Type of torch device. + device (:obj:`str`, defaults to 'cuda`): Type of torch device. Example: @@ -24,7 +23,13 @@ class SenticNetGCNConfig(PreTrainedConfig): """ def __init__( - self, embed_dim=300, hidden_dim=300, polarities_dim=3, dropout=0.3, device=torch.device("cuda"), **kwargs + self, + embed_dim: int = 300, + hidden_dim: int = 300, + polarities_dim: int = 3, + dropout: float = 0.3, + device: str = "cuda", + **kwargs ): super().__init__(**kwargs) self.embed_dim = embed_dim @@ -34,17 +39,18 @@ def __init__( self.device = device -class SenticNetGCNBertConfig(PreTrainedConfig): +class SenticNetBertGCNConfig(PreTrainedConfig): """ This is the configuration class to store the configuration of a :class:`~sgnlp.models.senticnet_gcn.modeling.SenticNetBertGCNModel`. 
It is used to instantiate a SenticNetBertGCNModel network according to the specific arguments, defining the model architecture. Args: + bert_model (:obj:`str`, defaults to 'bert-base-uncased'): The Bert model type to initalized from transformers package. hidden_dim (:obj:`int`, defaults to 768): The embedding dimension size for the Bert model as well as GCN dimension. max_seq_len (:obj:`int`, defaults to 85): The max sequence length to pad and truncate. dropout (:obj:`float`, defaults to 0.3): Dropout percentage. polarities_dim (:ob:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral). - device (:obj:`torch.device`, defaults to torch.device('cuda')): Type of torch device + device (:obj:`str`, defaults to 'cuda'): Type of torch device. Example: from sgnlp.models.senticnet_gcn import SenticNetBertGCNConfig @@ -54,9 +60,17 @@ class SenticNetGCNBertConfig(PreTrainedConfig): """ def __init__( - self, hidden_dim=768, max_seq_len=85, polarities_dim=3, dropout=0.3, device=torch.device("cuda"), **kwargs + self, + bert_model: str = "bert-base-uncased", + hidden_dim: int = 768, + max_seq_len: int = 85, + polarities_dim: int = 3, + dropout: float = 0.3, + device: str = "cuda", + **kwargs ): super().__init__(**kwargs) + self.bert_model = bert_model self.hidden_dim = hidden_dim self.max_seq_len = max_seq_len self.dropout = dropout diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py index 912f740..54d66ce 100644 --- a/sgnlp/models/senticnet_gcn/modeling.py +++ b/sgnlp/models/senticnet_gcn/modeling.py @@ -3,12 +3,12 @@ import torch import torch.nn as nn import torch.nn.functional as F -from transformers import PreTrainedModel +from transformers import PreTrainedModel, BertModel from transformers.file_utils import ModelOutput from .modules.dynamic_rnn import DynamicLSTM from .modules.gcn import GraphConvolution -from .config import SenticNetGCNConfig +from .config import SenticNetGCNConfig, SenticNetBertGCNConfig @dataclass @@ -98,3 +98,32 @@ def forward(self, inputs): x = torch.matmul(alpha, text_out).squeeze(1) # batch_size x 2 * hidden_dim output = self.fc(x) return output + + +class SenticNetBertGCNPreTrainedModel(PreTrainedModel): + config_class = SenticNetBertGCNConfig + base_model_prefix = "senticnetbert_gcn" + + def _init_weights(self, module): + pass + + +class SenticNetBertGCPModel(SenticNetBertGCNPreTrainedModel): + def __init__(self, config: SenticNetBertGCNConfig) -> None: + super().__init__() + self.bert = BertModel.from_pretrained(config.bert_model) + self.gc1 = GraphConvolution(config.hidden_dim, config.hidden_dim) + self.gc2 = GraphConvolution(config.hidden_dim, config.hidden_dim) + self.gc3 = GraphConvolution(config.hidden_dim, config.hidden_dim) + self.fc = nn.Linear(config.hidden_dim, config.polarities_dim) + self.text_embed_dropout = nn.Dropout(config.dropout) + self.device = config.device + + def position_weight(self, x, aspect_double_idx, text_len, aspect_len): + pass + + def mask(self, x, aspect_double_idx): + pass + + def forward(self, inputs): + pass From 7d88c25d6c7ad52cdd94da35d49f24feb8eaa2c3 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 22 Dec 2021 09:10:54 +0800 Subject: [PATCH 035/201] [#41] add draft SenticNetBertGCNModel --- sgnlp/models/senticnet_gcn/modeling.py | 59 +++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py index 54d66ce..d12e454 100644 
--- a/sgnlp/models/senticnet_gcn/modeling.py +++ b/sgnlp/models/senticnet_gcn/modeling.py @@ -45,8 +45,7 @@ def __init__(self, config: SenticNetGCNConfig) -> None: self.device = config.device def position_weight(self, x, aspect_double_idx, text_len, aspect_len): - batch_size = x.shape[0] - seq_len = x.shape[1] + batch_size, seq_len = x.shape[0], x.shape[1] aspect_double_idx = aspect_double_idx.cpu().numpy() text_len = text_len.cpu().numpy() aspect_len = aspect_len.cpu().numpy() @@ -111,19 +110,67 @@ def _init_weights(self, module): class SenticNetBertGCPModel(SenticNetBertGCNPreTrainedModel): def __init__(self, config: SenticNetBertGCNConfig) -> None: super().__init__() - self.bert = BertModel.from_pretrained(config.bert_model) + self._init_bert_model(config.bert_model) self.gc1 = GraphConvolution(config.hidden_dim, config.hidden_dim) self.gc2 = GraphConvolution(config.hidden_dim, config.hidden_dim) self.gc3 = GraphConvolution(config.hidden_dim, config.hidden_dim) self.fc = nn.Linear(config.hidden_dim, config.polarities_dim) self.text_embed_dropout = nn.Dropout(config.dropout) self.device = config.device + self.max_seq_len = config.max_seq_len + + def _init_bert_model(self, bert_model: str): + self.bert = BertModel.from_pretrained(bert_model) def position_weight(self, x, aspect_double_idx, text_len, aspect_len): - pass + batch_size, seq_len = x.shape[0], x.shape[1] + aspect_double_idx = aspect_double_idx.cpu().numpy() + text_len = text_len.cpu().numpy() + aspect_len = aspect_len.cpu().numpy() + weight = [[] for i in range(batch_size)] + for i in range(batch_size): + context_len = text_len[i] - aspect_len[i] + for j in range(aspect_double_idx[i, 0]): + weight[i].append(1 - (aspect_double_idx[i, 0] - j) / context_len) + for j in range(aspect_double_idx[i, 0], min(aspect_double_idx[i, 1] + 1, self.max_seq_len)): + weight[i].append(0) + for j in range(aspect_double_idx[i, 1] + 1, text_len[i]): + weight[i].append(1 - (j - aspect_double_idx[i, 1]) / context_len) + for j in range(text_len[i], seq_len): + weight[i].append(0) + weight = torch.tensor(weight).unsqueeze(2).to(self.device) + return weight * x def mask(self, x, aspect_double_idx): - pass + batch_size, seq_len = x.shape[0], x.shape[1] + aspect_double_idx = aspect_double_idx.cpu().numpy() + mask = [[] for i in range(batch_size)] + for i in range(batch_size): + for j in range(aspect_double_idx[i, 0]): + mask[i].append(0) + for j in range(aspect_double_idx[i, 0], min(aspect_double_idx[i, 1] + 1, self.max_seq_len)): + mask[i].append(1) + for j in range(min(aspect_double_idx[i, 1] + 1, self.max_seq_len), seq_len): + mask[i].append(0) + mask = torch.tensor(mask).unsqueeze(2).float().to(self.device) + return mask * x def forward(self, inputs): - pass + text_bert_indices, text_indices, aspect_indices, bert_segments_ids, left_indices, adj = inputs + text_len = torch.sum(text_indices != 0, dim=-1) + aspect_len = torch.sum(aspect_indices != 0, dim=-1) + left_len = torch.sum(left_indices != 0, dim=-1) + aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1) + encoder_layer, _ = self.bert( + text_bert_indices, token_type_ids=bert_segments_ids, output_all_encoded_layers=False + ) + text_out = encoder_layer + x = F.relu(self.gc1(self.position_weight(text_out, aspect_double_idx, text_len, aspect_len), adj)) + x = F.relu(self.gc2(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj)) + x = F.relu(self.gc3(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj)) + x = 
self.mask(x, aspect_double_idx)
+        alpha_mat = torch.matmul(x, text_out.transpose(1, 2))
+        alpha = F.softmax(alpha_mat.sum(1, keepdim=True), dim=2)
+        x = torch.matmul(alpha, text_out).squeeze(1)  # batch_size x 2*hidden_dim
+        output = self.fc(x)
+        return output

From 9b6da4c6fa78c3d3c586d8ad236be8d78a951c52 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Wed, 22 Dec 2021 10:32:24 +0800
Subject: [PATCH 036/201] [#41] draft combined steps for dependency processing

---
 sgnlp/models/senticnet_gcn/data_class.py      | 11 +++
 .../senticnet_gcn/preprocess_dependency.py    | 76 +++++++++++++++++++
 .../preprocess_dependency_graph.py            | 66 ----------------
 3 files changed, 87 insertions(+), 66 deletions(-)
 create mode 100644 sgnlp/models/senticnet_gcn/preprocess_dependency.py
 delete mode 100644 sgnlp/models/senticnet_gcn/preprocess_dependency_graph.py

diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py
index 8c96347..a6c00fb 100644
--- a/sgnlp/models/senticnet_gcn/data_class.py
+++ b/sgnlp/models/senticnet_gcn/data_class.py
@@ -58,3 +58,14 @@ def __post_init__(self):
             "sgd",
         ], "Invalid optimizer"
         assert self.device in ["cuda", "cpu"], "Invalid device type."
+
+
+@dataclass
+class DependencyProcessorArgs:
+    dependency_graph_preprocess: List[str] = field(default=list, metadata={"help": "List of raw dataset to process."})
+    senticnet_word_file_path: str = field(
+        default="./senticNet/senticnet_word.txt", metadata={"help": "SenticNet word file path."}
+    )
+    spacy_pipeline: str = field(
+        default="en_core_web_sm", metadata={"help": "Type of spacy pipeline to load for processor."}
+    )
diff --git a/sgnlp/models/senticnet_gcn/preprocess_dependency.py b/sgnlp/models/senticnet_gcn/preprocess_dependency.py
new file mode 100644
index 0000000..ef5c03b
--- /dev/null
+++ b/sgnlp/models/senticnet_gcn/preprocess_dependency.py
@@ -0,0 +1,76 @@
+import numpy as np
+import spacy
+import pickle
+from spacy.tokens import Doc
+
+from utils import parse_args_and_load_config
+from data_class import DependencyProcessorArgs
+
+
+class DependencyProcessor:
+    def __init__(self, config: DependencyProcessorArgs):
+        self.nlp = spacy.load(config.spacy_pipeline)
+        self.senticnet = self._load_senticnet(config.senticnet_word_file_path)
+
+    def _load_senticnet(self, senticnet_file_path: str):
+        senticNet = {}
+        with open(senticnet_file_path, "r") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                word, sentic = line.split("\t")
+                senticNet[word] = sentic
+        return senticNet
+
+    def _generate_dependency_adj_matrix(self, text: str) -> np.ndarray:
+        words_list = text.split()
+        seq_len = len(words_list)
+        matrix = np.zeros((seq_len, seq_len)).astype("float32")
+        for i in range(seq_len):
+            word = words_list[i]
+            sentic = float(self.senticnet[word]) + 1.0 if word in self.senticnet else 0.5
+            for j in range(seq_len):
+                matrix[i][j] += sentic
+            for k in range(seq_len):
+                matrix[k][i] += sentic
+            matrix[i][i] = 1
+        return matrix
+
+    def _generate_sentic_graph(self, text: str, aspect: str) -> np.ndarray:
+        words_list = text.split()
+        seq_len = len(words_list)
+        matrix = np.zeros((seq_len, seq_len)).astype("float32")
+        for i in range(seq_len):
+            word = words_list[i]
+            sentic = float(self.senticnet[word]) + 1.0 if word in self.senticnet else 0
+            if word in aspect:
+                sentic += 1.0
+            for j in range(seq_len):
+                matrix[i][j] += sentic
+                matrix[j][i] += sentic
+        for i in range(seq_len):
+            if matrix[i][i] == 0:
+                matrix[i][i] = 1
+        return matrix
+
+    def _generate_sentic_dependency_adj_matrix(self, 
text: str, aspect: str) -> np.ndarray: + doc = self.nlp(text) + seq_len = len(text.split()) + matrix = np.zeros((seq_len, seq_len)).astype("float32") + for token in doc: + sentic = float(self.senticnet[str(token)]) + 1 if str(token) in self.senticnet else 0 + if str(token) in aspect: + sentic += 1 + if token.i < seq_len: + matrix[token.i][token.i] = 1 * sentic + for child in token.children: + if str(child) in aspect: + sentic += 1 + if child.i < seq_len: + matrix[token.i][child.i] = 1 * sentic + matrix[child.i][token.i] = 1 * sentic + return matrix + + def process(self): + pass diff --git a/sgnlp/models/senticnet_gcn/preprocess_dependency_graph.py b/sgnlp/models/senticnet_gcn/preprocess_dependency_graph.py deleted file mode 100644 index 32afce8..0000000 --- a/sgnlp/models/senticnet_gcn/preprocess_dependency_graph.py +++ /dev/null @@ -1,66 +0,0 @@ -import numpy as np -import spacy -import pickle -from spacy.tokens import Doc - -from utils import parse_args_and_load_config - - -class WhiteSpaceTokenizer(object): - """ - Simple white space tokenizer - """ - - def __init__(self, vocab): - self.vocab = vocab - - def __call__(self, text: str): - words = text.split() - spaces = [True] * len(words) - return Doc(self.vocab, words=words, spaces=spaces) - - -class DependencyGraphPreprocessor(object): - """ - Preprocessor wrapper class for processing dependency graph. - """ - - def __init__(self): - self.nlp = spacy.load("en_core_web_sm") - self.nlp.tokenizer = WhiteSpaceTokenizer(self.nlp.vocab) - - def __dependency_adj_matrix(self, text: str) -> np.ndarray: - tokens = self.nlp(text) - words = text.split() - matrix = np.zeros((len(words), len(words))).astype("float32") - - for token in tokens: - matrix[token.i][token.i] = 1 - for child in token.children: - matrix[token.i][child.i] = 1 - matrix[child.i][token.i] = 1 - return matrix - - def process(self, filename: str): - """ - Main processing method, takes in raw data file and convert to adj matrix. 
- - Args: - filename (str): filename of raw dataset to process - """ - with open(filename, "r", encoding="utf-8", newline="\n", errors="ignore") as fin: - lines = fin.readlines() - idx2graph = {} - with open(f"{filename}.graph", "wb") as fout: - for i in range(0, len(lines), 3): - text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] - aspect = lines[i + 1].lower().strip() - idx2graph[i] = self.__dependency_adj_matrix(f"{text_left} {aspect} {text_right}") - pickle.dump(idx2graph, fout) - - -if __name__ == "__main__": - dgp = DependencyGraphPreprocessor() - cfg = parse_args_and_load_config() - for data_path in cfg.dependency_graph_preprocess: - dgp.process(data_path) From 73ad22cfc97d0ff28d10336c142ef1efe987f2f6 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 22 Dec 2021 11:08:00 +0800 Subject: [PATCH 037/201] [#41] merge preprocessor specific dataclass to training dataclass --- .../config/senticnet_gcn_config.json | 4 ++- sgnlp/models/senticnet_gcn/data_class.py | 19 +++++--------- .../senticnet_gcn/preprocess_dependency.py | 26 +++++++++++-------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json index fc4f619..fe1bcfb 100644 --- a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json @@ -1,5 +1,5 @@ { - "dependency_graph_preprocess": [ + "raw_dataset_files": [ "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/acl-14-short-data/train.raw", "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/acl-14-short-data/test.raw", "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", @@ -11,6 +11,8 @@ "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval16/restaurant_train.raw", "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval16/restaurant_test.raw" ], + "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", + "spacy_pipeline": "en_core_web_sm", "dataset_train": { "raw": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", "graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.graph", diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py index a6c00fb..074b4c0 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -4,7 +4,13 @@ @dataclass class SenticNetGCNTrainArgs: - dependency_graph_preprocess: List[str] = field(default=list, metadata={"help": "List of raw dataset to process."}) + raw_dataset_files: List[str] = field(default=list, metadata={"help": "List of raw dataset to process."}) + senticnet_word_file_path: str = field( + default="./senticNet/senticnet_word.txt", metadata={"help": "SenticNet word file path."} + ) + spacy_pipeline: str = field( + default="en_core_web_sm", metadata={"help": "Type of spacy pipeline to load for processor."} + ) dataset_train: Dict[str, str] = field( default=dict, metadata={"help": "Dictionary containing 3 file paths to the raw, graph and tree train datasets."}, @@ -58,14 +64,3 @@ def __post_init__(self): "sgd", ], "Invalid optimizer" assert self.device in ["cuda", "cpu"], "Invalid device type." 
- - -@dataclass -class DependencyProcessorArgs: - dependency_graph_preprocess: List[str] = field(default=list, metadata={"help": "List of raw dataset to process."}) - senticnet_word_file_path: str = field( - default="./senticNet/senticnet_word.txt", metadata={"help": "SenticNet word file path."} - ) - spacy_pipeline: str = field( - default="en_core_web_sm", metadata={"help": "Type of spacy pipeline to load for processor."} - ) diff --git a/sgnlp/models/senticnet_gcn/preprocess_dependency.py b/sgnlp/models/senticnet_gcn/preprocess_dependency.py index ef5c03b..de7d01f 100644 --- a/sgnlp/models/senticnet_gcn/preprocess_dependency.py +++ b/sgnlp/models/senticnet_gcn/preprocess_dependency.py @@ -1,14 +1,13 @@ import numpy as np import spacy import pickle -from spacy.tokens import Doc from utils import parse_args_and_load_config -from data_class import DependencyProcessorArgs +from data_class import SenticNetGCNTrainArgs class DependencyProcessor: - def __init__(self, config: DependencyProcessorArgs): + def __init__(self, config: SenticNetGCNTrainArgs): self.nlp = spacy.load(config.spacy_pipeline) self.senticnet = self._load_senticnet(config.senticnet_word_file_path) @@ -34,7 +33,7 @@ def _generate_dependency_adj_matrix(self, text: str) -> np.ndarray: matrix[i][j] += sentic for k in range(seq_len): matrix[k][i] += sentic - matrix[i][i] = 1 + matrix[i][i] = 1.0 return matrix def _generate_sentic_graph(self, text: str, aspect: str) -> np.ndarray: @@ -51,7 +50,7 @@ def _generate_sentic_graph(self, text: str, aspect: str) -> np.ndarray: matrix[j][i] += sentic for i in range(seq_len): if matrix[i][i] == 0: - matrix[i][i] = 1 + matrix[i][i] = 1.0 return matrix def _generate_sentic_dependency_adj_matrix(self, text: str, aspect: str) -> np.ndarray: @@ -59,18 +58,23 @@ def _generate_sentic_dependency_adj_matrix(self, text: str, aspect: str) -> np.n seq_len = len(text.split()) matrix = np.zeros((seq_len, seq_len)).astype("float32") for token in doc: - sentic = float(self.senticnet[str(token)]) + 1 if str(token) in self.senticnet else 0 + sentic = float(self.senticnet[str(token)]) + 1.0 if str(token) in self.senticnet else 0 if str(token) in aspect: - sentic += 1 + sentic += 1.0 if token.i < seq_len: - matrix[token.i][token.i] = 1 * sentic + matrix[token.i][token.i] = 1.0 * sentic for child in token.children: if str(child) in aspect: - sentic += 1 + sentic += 1.0 if child.i < seq_len: - matrix[token.i][child.i] = 1 * sentic - matrix[child.i][token.i] = 1 * sentic + matrix[token.i][child.i] = 1.0 * sentic + matrix[child.i][token.i] = 1.0 * sentic return matrix def process(self): pass + + +if __name__ == "__main__": + cfg = parse_args_and_load_config() + print(cfg) From 47bedcc7c4e7c626461a418c14c37c45d6d548dd Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 22 Dec 2021 15:05:51 +0800 Subject: [PATCH 038/201] [#41] rework workflow for dependency processor --- .../config/senticnet_gcn_config.json | 23 ++++--------- sgnlp/models/senticnet_gcn/data_class.py | 33 ++++++++++++++++--- .../senticnet_gcn/preprocess_dependency.py | 15 ++++++++- 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json index fe1bcfb..654d231 100644 --- a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json @@ -1,27 +1,18 @@ { - "raw_dataset_files": [ - 
"/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/acl-14-short-data/train.raw", - "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/acl-14-short-data/test.raw", - "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", - "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", - "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/laptop_train.raw", - "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/laptop_test.raw", - "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval15/restaurant_train.raw", - "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval15/restaurant_test.raw", - "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval16/restaurant_train.raw", - "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval16/restaurant_test.raw" - ], "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", "spacy_pipeline": "en_core_web_sm", + "save_preprocessed_dependency": true, "dataset_train": { "raw": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", - "graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.graph", - "tree": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.tree" + "dependency_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.dgraph", + "sentic_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.senticgraph", + "dependency_sencticnet_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.dsenticgraph" }, "dataset_test": { "raw": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", - "graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.graph", - "tree": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.tree" + "dependency_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.dgraph", + "sentic_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.senticgraph", + "dependency_sencticnet_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.dsenticgraph" }, "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", "save_embedding_matrix": true, diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py index 074b4c0..60e10f5 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -4,20 +4,37 @@ @dataclass class SenticNetGCNTrainArgs: - raw_dataset_files: List[str] = field(default=list, metadata={"help": "List of raw dataset to process."}) + dataset_files: List[str] = field(default=list, metadata={"help": "List of raw dataset to process."}) 
senticnet_word_file_path: str = field( default="./senticNet/senticnet_word.txt", metadata={"help": "SenticNet word file path."} ) spacy_pipeline: str = field( default="en_core_web_sm", metadata={"help": "Type of spacy pipeline to load for processor."} ) + save_preprocessed_dependency: bool = field( + default=True, + metadata={ + "help": """Flag to indicate if dependency preprocess should run, + if pickle files already present, it will be overwritten.""" + }, + ) dataset_train: Dict[str, str] = field( default=dict, - metadata={"help": "Dictionary containing 3 file paths to the raw, graph and tree train datasets."}, + metadata={ + "help": """Dictionary containing 3 file paths to the raw dataset file, + dependency_graph, sentic_graph and the dependency_senticnet_graph files for the train datasets. + Raw file path is mandatory, the graph files are optional. If graph files are not present, + it will be generated during preprocessing step.""" + }, ) dataset_test: Dict[str, str] = field( default=dict, - metadata={"help": "Dictionary containing 3 file paths to the raw, graph and tree test datasets."}, + metadata={ + "help": """Dictionary containing 3 file paths to the raw dataset file, + dependency_graph, sentic_graph and the dependency_senticnet_graph files for the test datasets. + Raw file path is mandatory, the graph files are optional. If graph files are not present, + it will be generated during preprocessing step.""" + }, ) word_vec_file_path: str = field( default="glove/glove.840B.300d.txt", @@ -25,12 +42,16 @@ class SenticNetGCNTrainArgs: ) save_embedding_matrix: bool = field( default=True, - metadata="Flag to indicate if embedding matrix should be saved. Flag is ignored if 'saved_embedding_matrix_file_path' is populated and valid.", + metadata={ + "help": """Flag to indicate if embedding matrix should be saved. + Flag is ignored if 'saved_embedding_matrix_file_path' is populated and valid.""" + }, ) saved_embedding_matrix_file_path: str = field( default="embedding/embeddings.pickle", metadata={ - "help": "Full path of saved embedding matrix, if file exists, embeddings will be generated from file instead of generated from word vector and vocab." + "help": """Full path of saved embedding matrix, if file exists, + embeddings will be generated from file instead of generated from word vector and vocab.""" }, ) initializer: str = field(default="xavier_uniform", metadata={"help": "Type of initalizer to use."}) @@ -64,3 +85,5 @@ def __post_init__(self): "sgd", ], "Invalid optimizer" assert self.device in ["cuda", "cpu"], "Invalid device type." + assert "raw" in self.dataset_train.keys(), "File path to raw dataset is required!" + assert "raw" in self.dataset_test.keys(), "File path to raw dataset is required!" 
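A side note on the dataclass pattern above: `field(default=dict)` stores the `dict` type itself as the default value rather than an empty dict; `field(default_factory=dict)` is the usual pattern for mutable defaults. A minimal sketch, using a trimmed, hypothetical stand-in for the full train-args dataclass and made-up paths, of how the raw-key validation behaves:

from dataclasses import dataclass, field
from typing import Dict


@dataclass
class _Args:  # trimmed stand-in for SenticNetGCNTrainArgs, for illustration only
    dataset_train: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        # mirrors the assertion added in the diff above
        assert "raw" in self.dataset_train.keys(), "File path to raw dataset is required!"


_Args(dataset_train={"raw": "restaurant_train.raw"})   # passes validation
_Args(dataset_train={"dependency_graph": "x.dgraph"})  # raises AssertionError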
diff --git a/sgnlp/models/senticnet_gcn/preprocess_dependency.py b/sgnlp/models/senticnet_gcn/preprocess_dependency.py index de7d01f..6096af1 100644 --- a/sgnlp/models/senticnet_gcn/preprocess_dependency.py +++ b/sgnlp/models/senticnet_gcn/preprocess_dependency.py @@ -1,5 +1,6 @@ import numpy as np import spacy +import pathlib import pickle from utils import parse_args_and_load_config @@ -8,6 +9,7 @@ class DependencyProcessor: def __init__(self, config: SenticNetGCNTrainArgs): + self.config = config self.nlp = spacy.load(config.spacy_pipeline) self.senticnet = self._load_senticnet(config.senticnet_word_file_path) @@ -71,10 +73,21 @@ def _generate_sentic_dependency_adj_matrix(self, text: str, aspect: str) -> np.n matrix[child.i][token.i] = 1.0 * sentic return matrix + def _check_saved_file(self, file_path: str) -> bool: + pl_file_path = pathlib.Path(file_path) + return pl_file_path.exists() + + def _load_save_file(self, file_path: str) -> dict[int, str]: + with open(file_path, "rb") as f: + data = pickle.load(f) + return data + def process(self): pass if __name__ == "__main__": cfg = parse_args_and_load_config() - print(cfg) + import pprint + + pprint.pprint(cfg.dataset_files) From 3b7b272cde988c6f6cfe1cb85794c5b9304c0f78 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 22 Dec 2021 16:29:59 +0800 Subject: [PATCH 039/201] [#41] implement draft workflow to preprocess all dependency graph --- sgnlp/models/senticnet_gcn/data_class.py | 12 ++++++- .../senticnet_gcn/preprocess_dependency.py | 32 ++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py index 60e10f5..acb4944 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -4,7 +4,6 @@ @dataclass class SenticNetGCNTrainArgs: - dataset_files: List[str] = field(default=list, metadata={"help": "List of raw dataset to process."}) senticnet_word_file_path: str = field( default="./senticNet/senticnet_word.txt", metadata={"help": "SenticNet word file path."} ) @@ -18,6 +17,10 @@ class SenticNetGCNTrainArgs: if pickle files already present, it will be overwritten.""" }, ) + dataset_keys: List[str] = field( + default_factory=lambda: ["raw", "dependency_graph", "sentic_graph", "dependency_sencticnet_graph"], + metadata={"help": "Default dataset keys."}, + ) dataset_train: Dict[str, str] = field( default=dict, metadata={ @@ -87,3 +90,10 @@ def __post_init__(self): assert self.device in ["cuda", "cpu"], "Invalid device type." assert "raw" in self.dataset_train.keys(), "File path to raw dataset is required!" assert "raw" in self.dataset_test.keys(), "File path to raw dataset is required!" 
+ # populate keys if not presents + train_diff_keys = set(self.dataset_keys).difference(set(self.dataset_train.keys())) + for key in train_diff_keys: + self.dataset_train[key] = "" + test_diff_keys = set(self.dataset_keys).difference(set(self.dataset_test.keys())) + for key in test_diff_keys: + self.dataset_test[key] = "" diff --git a/sgnlp/models/senticnet_gcn/preprocess_dependency.py b/sgnlp/models/senticnet_gcn/preprocess_dependency.py index 6096af1..cefafd5 100644 --- a/sgnlp/models/senticnet_gcn/preprocess_dependency.py +++ b/sgnlp/models/senticnet_gcn/preprocess_dependency.py @@ -12,6 +12,7 @@ def __init__(self, config: SenticNetGCNTrainArgs): self.config = config self.nlp = spacy.load(config.spacy_pipeline) self.senticnet = self._load_senticnet(config.senticnet_word_file_path) + self.dataset_keys = ["raw"] def _load_senticnet(self, senticnet_file_path: str): senticNet = {} @@ -82,8 +83,37 @@ def _load_save_file(self, file_path: str) -> dict[int, str]: data = pickle.load(f) return data + def _process_file(self, raw_file_path: str, file_path: str, process_function: function): + try: + with open(raw_file_path, "r", encoding="utf-8", newline="\n", errors="ignore") as f: + lines = f.readlines() + except: + raise Exception("Error opening raw dataset file!") + + graph = {} + for i in range(0, len(lines), 3): + text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] + aspect = lines[i + 1].lower().strip() + adj_matrix = process_function(text_left + " " + aspect + " " + text_right, aspect, self.senticnet) + graph[i] = adj_matrix + try: + if self.config.save_preprocessed_dependency: + with open(file_path, "wb") as f: + pickle.dump(graph, f) + except: + raise Exception("Error writing graph to file") + # return graph + def process(self): - pass + dependency_keys_map = { + "dependency_graph": self._generate_dependency_adj_matrix, + "sentic_graph": self._generate_sentic_graph, + "dependency_sencticnet_graph": self._generate_sentic_dependency_adj_matrix, + } + for dataset in [self.config.dataset_train, self.config.dataset_test]: + for key, func in dependency_keys_map.items(): + if not dataset[key]: + self._process_file(dataset["raw"], dataset[key], func) if __name__ == "__main__": From f1284c66d40593b4a35fc64aaeb2c22c541fec28 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 23 Dec 2021 10:32:50 +0800 Subject: [PATCH 040/201] [#41] add draft SenticNetBertTokenizer class with reverse function --- sgnlp/models/senticnet_gcn/tokenization.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sgnlp/models/senticnet_gcn/tokenization.py b/sgnlp/models/senticnet_gcn/tokenization.py index 891a327..0117976 100644 --- a/sgnlp/models/senticnet_gcn/tokenization.py +++ b/sgnlp/models/senticnet_gcn/tokenization.py @@ -2,7 +2,7 @@ import pickle from typing import Dict, List, Optional, Tuple -from transformers import PreTrainedTokenizer +from transformers import PreTrainedTokenizer, BertTokenizer VOCAB_FILES_NAMES = {"vocab_file": "vocab.pkl"} @@ -98,3 +98,15 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = with open(vocab_file_path, "wb") as fout: pickle.dump(self.vocab, fout) return (str(vocab_file_path),) + + +class SenticNetBertGCNTokenizer(BertTokenizer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, text, reverse=False, **kwargs): + encoding = super().__call__(text, add_special_tokens=False, truncation=True, max_length=85, **kwargs) + if reverse: + for key in 
encoding.keys():
+                encoding[key] = encoding[key][::-1]
+        return encoding

From 27e97c2ac89e28b5e2582ff648f093a63d1dc008 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Thu, 23 Dec 2021 11:02:03 +0800
Subject: [PATCH 041/201] [#41] trim unused functions after further review

---
 .../config/senticnet_gcn_config.json | 4 --
 sgnlp/models/senticnet_gcn/data_class.py | 2 +-
 .../senticnet_gcn/preprocess_dependency.py | 38 +------------------
 3 files changed, 2 insertions(+), 42 deletions(-)

diff --git a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json
index 654d231..9569570 100644
--- a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json
+++ b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json
@@ -4,14 +4,10 @@
     "save_preprocessed_dependency": true,
     "dataset_train": {
         "raw": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw",
-        "dependency_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.dgraph",
-        "sentic_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.senticgraph",
         "dependency_sencticnet_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.dsenticgraph"
     },
     "dataset_test": {
         "raw": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw",
-        "dependency_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.dgraph",
-        "sentic_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.senticgraph",
         "dependency_sencticnet_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.dsenticgraph"
     },
     "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt",
diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py
index acb4944..945370c 100644
--- a/sgnlp/models/senticnet_gcn/data_class.py
+++ b/sgnlp/models/senticnet_gcn/data_class.py
@@ -18,7 +18,7 @@ class SenticNetGCNTrainArgs:
         },
     )
     dataset_keys: List[str] = field(
-        default_factory=lambda: ["raw", "dependency_graph", "sentic_graph", "dependency_sencticnet_graph"],
+        default_factory=lambda: ["raw", "dependency_sencticnet_graph"],
         metadata={"help": "Default dataset keys."},
     )
     dataset_train: Dict[str, str] = field(
diff --git a/sgnlp/models/senticnet_gcn/preprocess_dependency.py b/sgnlp/models/senticnet_gcn/preprocess_dependency.py
index cefafd5..a54a7f9 100644
--- a/sgnlp/models/senticnet_gcn/preprocess_dependency.py
+++ b/sgnlp/models/senticnet_gcn/preprocess_dependency.py
@@ -25,37 +25,6 @@ def _load_senticnet(self, senticnet_file_path: str):
                 senticNet[word] = sentic
         return senticNet
 
-    def _generate_dependency_adj_matrix(self, text: str) -> np.ndarray:
-        words_list = text.split()
-        seq_len = len(words_list)
-        matrix = np.zeros((seq_len, seq_len)).astype("float32")
-        for i in range(seq_len):
-            word = words_list[i]
-            sentic = float(self.senticnet[word]) + 1.0 if word in self.senticnet else 0.5
-            for j in range(seq_len):
-                matrix[i][j] += sentic
-            for k in range(seq_len):
-                matrix[k][i] += sentic
-            matrix[i][i] = 1.0
-        return matrix
-
-    def _generate_sentic_graph(self, text: str, aspect: str) -> np.ndarray:
-        
words_list = text.split() - seq_len = len(words_list) - matrix = np.zeros((seq_len, seq_len)).astype("float32") - for i in range(seq_len): - word = words_list[i] - sentic = float(self.senticnet[word]) + 1.0 if word in self.senticnet else 0 - if word in aspect: - sentic += 1.0 - for j in range(seq_len): - matrix[i][j] += sentic - matrix[j][i] += sentic - for i in range(seq_len): - if matrix[i][i] == 0: - matrix[i][i] = 1.0 - return matrix - def _generate_sentic_dependency_adj_matrix(self, text: str, aspect: str) -> np.ndarray: doc = self.nlp(text) seq_len = len(text.split()) @@ -106,8 +75,6 @@ def _process_file(self, raw_file_path: str, file_path: str, process_function: fu def process(self): dependency_keys_map = { - "dependency_graph": self._generate_dependency_adj_matrix, - "sentic_graph": self._generate_sentic_graph, "dependency_sencticnet_graph": self._generate_sentic_dependency_adj_matrix, } for dataset in [self.config.dataset_train, self.config.dataset_test]: @@ -117,7 +84,4 @@ def process(self): if __name__ == "__main__": - cfg = parse_args_and_load_config() - import pprint - - pprint.pprint(cfg.dataset_files) + pass From 68aed25b954186705157fdef1b7640a298ebd0fa Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 23 Dec 2021 14:54:15 +0800 Subject: [PATCH 042/201] [#41] create draft base trainer class --- .../config/senticnet_gcn_config.json | 3 +- sgnlp/models/senticnet_gcn/data_class.py | 4 + sgnlp/models/senticnet_gcn/train.py | 141 +++++++++++++++--- 3 files changed, 126 insertions(+), 22 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json index 9569570..6e804c9 100644 --- a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json @@ -26,5 +26,6 @@ "dropout": 0.3, "save": true, "seed": 776, - "device": "cuda" + "device": "cuda", + "repeats": 10 } \ No newline at end of file diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py index 945370c..c5944e1 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -4,6 +4,7 @@ @dataclass class SenticNetGCNTrainArgs: + model: str = field(default="senticnet_gcn", metadata={"help": "Options to choose which model to train."}) senticnet_word_file_path: str = field( default="./senticNet/senticnet_word.txt", metadata={"help": "SenticNet word file path."} ) @@ -71,8 +72,10 @@ class SenticNetGCNTrainArgs: save: bool = field(default=True, metadata={"help": "Flag to indicate if results should be saved."}) seed: int = field(default=776, metadata={"help": "Default random seed for training."}) device: str = field(default="cuda", metadata={"help": "Type of compute device to use for training."}) + repeats: int = field(default=10, metadata={"help": "Number of times to repeat train loop."}) def __post_init__(self): + assert self.model in ["senticgcn", "senticgcn_bert"] assert self.initializer in [ "xavier_uniform", "xavier_uniform", @@ -97,3 +100,4 @@ def __post_init__(self): test_diff_keys = set(self.dataset_keys).difference(set(self.dataset_test.keys())) for key in test_diff_keys: self.dataset_test[key] = "" + assert self.repeats > 1, "Repeats num must be at least 1." 
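The trainer introduced in the train.py diff below accumulates predictions batch by batch and scores them with accuracy and scikit-learn's macro F1. A standalone sketch of that computation on dummy logits (three polarity classes, as in the config):

import torch
from sklearn.metrics import f1_score

# dummy logits for 3 samples over 3 polarity classes
outputs = torch.tensor([[2.0, 0.1, 0.3],
                        [0.2, 1.5, 0.1],
                        [0.1, 0.2, 2.2]])
targets = torch.tensor([0, 1, 0])
preds = torch.argmax(outputs, dim=-1)                            # tensor([0, 1, 2])
acc = (preds == targets).sum().item() / len(targets)             # 0.666...
f1 = f1_score(targets, preds, labels=[0, 1, 2], average="macro")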
diff --git a/sgnlp/models/senticnet_gcn/train.py b/sgnlp/models/senticnet_gcn/train.py
index c645e1d..cc94a41 100644
--- a/sgnlp/models/senticnet_gcn/train.py
+++ b/sgnlp/models/senticnet_gcn/train.py
@@ -1,32 +1,131 @@
-from data_class import SenticASGCNTrainArgs
+import logging
+
+import numpy as np
+from sklearn.metrics import f1_score
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+from data_class import SenticNetGCNTrainArgs
+from tokenization import SenticASGCNTokenizer, SenticNetBertGCNTokenizer
 from utils import parse_args_and_load_config, set_random_seed, ABSADatasetReader, BucketIterator
-from tokenization import SenticASGCNTokenizer
 
+logging.basicConfig(level=logging.DEBUG)
+
+
+class SenticNetGCNBaseTrainer:
+    def __init__(self, config: SenticNetGCNTrainArgs):
+        self.config = config
+        self.global_max_acc = 0.0
+        self.global_max_f1 = 0.0
+        self.device = (
+            torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            if not self.config.device
+            else torch.device(self.config.device)
+        )
+        # self.dataset_train = # dataloader
+        # self.dataset_test = # dataloader
+
+    def _create_initializers(self):
+        initializers = {
+            "xavier_uniform": nn.init.xavier_uniform_,
+            "xavier_normal": nn.init.xavier_normal_,
+            "orthogonal": nn.init.orthogonal_,
+        }
+        return initializers[self.config.initializer]
+
+    def _create_optimizer(self):
+        optimizers = {
+            "adadelta": optim.Adadelta,
+            "adagrad": optim.Adagrad,
+            "adam": optim.Adam,
+            "adamax": optim.Adamax,
+            "asgd": optim.ASGD,
+            "rmsprop": optim.RMSprop,
+            "sgd": optim.SGD,
+        }
+        return optimizers[self.config.optimizer]
+
+    def _reset_params(self):
+        raise NotImplementedError("Please call from derived class only.")
+
+    def _evaluate_acc_f1(self):
+        self.model.eval()
+        n_correct, n_total = 0, 0
+        t_targets_all, t_outputs_all = None, None
+        with torch.no_grad():
+            for _, t_batch in enumerate(self.dataset_test):
+                t_inputs = [t_batch[col].to(self.device) for col in t_batch.keys()]
+                t_targets = t_batch["polarity"].to(self.device)
+                t_outputs = self.model(t_inputs)
 
-class Trainer:
-    def __init__(self, cfg: SenticASGCNTrainArgs):
-        self.cfg = cfg
-        tokenizer = SenticASGCNTokenizer.from_pretrained(
-            "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/tokenizer/"
+                n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item()
+                n_total += len(t_outputs)
+
+                if t_targets_all is None:
+                    t_targets_all = t_targets
+                    t_outputs_all = t_outputs
+                else:
+                    t_targets_all = torch.cat((t_targets_all, t_targets), dim=0)
+                    t_outputs_all = torch.cat((t_outputs_all, t_outputs), dim=0)
+        test_acc = n_correct / n_total
+        f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro")
+        return test_acc, f1
+
+    def _train_epoch(self):
+        max_val_acc, max_val_f1 = 0, 0
+        max_val_epoch = 0
+        global_step = 0
+        path = 0
+
+        for epoch in range(self.config.epochs):
+            n_correct, n_total = 0, 0
+            # TODO: How to produce dataset for both model types? 
+ + def train(self): + criterion = nn.CrossEntropyLoss() + _params = filter(lambda p: p.requires_grad, self.model.parameters()) + optimizer = self._create_optimizer()(_params, lr=self.config.learning_rate, weight_decay=self.config.l2reg) + + test_accs, test_f1s = [], [] + for i in range(self.config.repeats): + logging.info(f"Start overall train loop : {i + 1}") + + self._reset_params() + test_acc, test_f1 = self._train(criterion, optimizer) + test_accs.append(test_acc) + test_f1s.append(test_f1) + + logging.info(f"Test_acc: {test_acc}, Test_f1: {test_f1}") + test_accs_avg = np.sum(test_accs) / self.config.repeats + test_f1s_avg = np.sum(test_f1s) / self.config.repeats + max_accs = np.max(test_accs) + max_f1s = np.max(test_f1s) + + logging.info( + f""" + Test acc average: {test_accs_avg} + Test f1 average: {test_f1s_avg} + Test acc max: {max_accs} + Test f1 max: {max_f1s} + """ ) - dataset = ABSADatasetReader(self.cfg, tokenizer=tokenizer) - def _train(self): - pass + +class SenticNetBertGCNTrainer(SenticNetGCNBaseTrainer): + def __init__(self, config: SenticNetGCNTrainArgs): + self.config = config -def train_model(cfg: SenticASGCNTrainArgs): - tokenizer = SenticASGCNTokenizer.from_pretrained( - "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/tokenizer/" - ) - absa_dataset = ABSADatasetReader(cfg, tokenizer) - train_dataloader = BucketIterator(data=absa_dataset.train_data, batch_size=cfg.batch_size, shuffle=True) - test_dataloader = BucketIterator(data=absa_dataset.test_data, batch_size=cfg.batch_size, shuffle=False) +class SenticNetGCNTrainer(SenticNetGCNBaseTrainer): + def __init__(self, config: SenticNetGCNTrainArgs): + self.config = config if __name__ == "__main__": - cfg = parse_args_and_load_config() - if cfg.seed is not None: - set_random_seed(cfg.seed) - train_model(cfg) + # cfg = parse_args_and_load_config() + # if cfg.seed is not None: + # set_random_seed(cfg.seed) + # train_model(cfg) + pass From 39975fe03b4b6437893f7a8f1a4add016645a264 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 23 Dec 2021 16:10:45 +0800 Subject: [PATCH 043/201] [#41] remove reverse feature from tokenizer, add save state_dict config and add trainer creation strategy --- sgnlp/models/senticnet_gcn/data_class.py | 6 ++++++ sgnlp/models/senticnet_gcn/tokenization.py | 5 +---- sgnlp/models/senticnet_gcn/train.py | 10 +++++----- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py index c5944e1..c8ed737 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -58,6 +58,12 @@ class SenticNetGCNTrainArgs: embeddings will be generated from file instead of generated from word vector and vocab.""" }, ) + save_state_dict: bool = field( + default=True, metadata={"help": "Flag to indicate if best model state_dict should be saved."} + ) + saved_state_dict_folder_path: str = field( + default="/state_dict", metadata={"help": "Folder to save model state_dict."} + ) initializer: str = field(default="xavier_uniform", metadata={"help": "Type of initalizer to use."}) optimizer: str = field(default="adam", metadata={"help": "Type of optimizer to use."}) learning_rate: float = field(default=0.001, metadata={"help": "Default learning rate for training."}) diff --git a/sgnlp/models/senticnet_gcn/tokenization.py b/sgnlp/models/senticnet_gcn/tokenization.py index 0117976..4db2fe1 100644 --- a/sgnlp/models/senticnet_gcn/tokenization.py +++ 
b/sgnlp/models/senticnet_gcn/tokenization.py
@@ -104,9 +104,6 @@ class SenticNetBertGCNTokenizer(BertTokenizer):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-    def __call__(self, text, reverse=False, **kwargs):
+    def __call__(self, text, **kwargs):
         encoding = super().__call__(text, add_special_tokens=False, truncation=True, max_length=85, **kwargs)
-        if reverse:
-            for key in encoding.keys():
-                encoding[key] = encoding[key][::-1]
         return encoding
diff --git a/sgnlp/models/senticnet_gcn/train.py b/sgnlp/models/senticnet_gcn/train.py
index cc94a41..e0dc2e4 100644
--- a/sgnlp/models/senticnet_gcn/train.py
+++ b/sgnlp/models/senticnet_gcn/train.py
@@ -124,8 +124,8 @@
 
 if __name__ == "__main__":
-    # cfg = parse_args_and_load_config()
-    # if cfg.seed is not None:
-    #     set_random_seed(cfg.seed)
-    # train_model(cfg)
-    pass
+    cfg = parse_args_and_load_config()
+    if cfg.seed is not None:
+        set_random_seed(cfg.seed)
+    trainer = SenticNetGCNTrainer(cfg) if cfg.model == "senticgcn" else SenticNetBertGCNTrainer(cfg)
+    trainer.train()

From 933f55317494860d20929f620884b3fa804278ca Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Fri, 24 Dec 2021 09:14:23 +0800
Subject: [PATCH 044/201] [#41] add model specific _reset_params method

---
 sgnlp/models/senticnet_gcn/train.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/sgnlp/models/senticnet_gcn/train.py b/sgnlp/models/senticnet_gcn/train.py
index e0dc2e4..829793a 100644
--- a/sgnlp/models/senticnet_gcn/train.py
+++ b/sgnlp/models/senticnet_gcn/train.py
@@ -1,4 +1,5 @@
 import logging
+import math
 
 import numpy as np
 from sklearn.metrics import f1_score
@@ -7,6 +8,7 @@
 import torch.optim as optim
 
 from data_class import SenticNetGCNTrainArgs
+from sgnlp.models.senticnet_gcn.modeling import SenticNetBertGCNPreTrainedModel
 from tokenization import SenticASGCNTokenizer, SenticNetBertGCNTokenizer
 from utils import parse_args_and_load_config, set_random_seed, ABSADatasetReader, BucketIterator
 
@@ -117,11 +119,31 @@ class SenticNetBertGCNTrainer(SenticNetGCNBaseTrainer):
     def __init__(self, config: SenticNetGCNTrainArgs):
         self.config = config
 
+    def _reset_params(self):
+        for child in self.model.children():
+            if type(child) != SenticNetBertGCNPreTrainedModel:
+                for param in child.parameters():
+                    if param.requires_grad:
+                        if len(param.shape) > 1:
+                            self._create_initializers()(param)
+                        else:
+                            stdv = 1.0 / math.sqrt(param.shape[0])
+                            nn.init.uniform_(param, a=-stdv, b=stdv)
+
 
 class SenticNetGCNTrainer(SenticNetGCNBaseTrainer):
     def __init__(self, config: SenticNetGCNTrainArgs):
         self.config = config
 
+    def _reset_params(self):
+        for param in self.model.parameters():
+            if param.requires_grad:
+                if len(param.shape) > 1:
+                    self._create_initializers()(param)
+                else:
+                    stdv = 1.0 / math.sqrt(param.shape[0])
+                    nn.init.uniform_(param, a=-stdv, b=stdv)
+
 
 if __name__ == "__main__":
     cfg = parse_args_and_load_config()

From 0cfc32a9f6c70dc5aa9f83f390b34df85fb5992a Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Fri, 24 Dec 2021 09:57:09 +0800
Subject: [PATCH 045/201] [#41] add save state_dict method

---
 sgnlp/models/senticnet_gcn/data_class.py |  2 +-
 sgnlp/models/senticnet_gcn/train.py      | 44 ++++++++++++++++++++----
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py
index c8ed737..e94a700 100644
--- a/sgnlp/models/senticnet_gcn/data_class.py
+++ b/sgnlp/models/senticnet_gcn/data_class.py
@@ -70,7 +70,7 
@@ class SenticNetGCNTrainArgs: l2reg: float = field(default=0.00001, metadata={"help": "Default l2reg value."}) epochs: int = field(default=100, metadata={"help": "Number of epochs to train."}) batch_size: int = field(default=32, metadata={"help": "Training batch size."}) - log_step: int = field(default=5, metadata={"help": "Default log step."}) + log_step: int = field(default=5, metadata={"help": "Number of train steps to log results."}) embed_dim: int = field(default=300, metadata={"help": "Size of embedding."}) hidden_dim: int = field(default=300, metadata={"help": "Number of neurons for hidden layer."}) dropout: float = field(default=0.3, metadata={"help": "Default value for dropout percentages."}) diff --git a/sgnlp/models/senticnet_gcn/train.py b/sgnlp/models/senticnet_gcn/train.py index 829793a..0015940 100644 --- a/sgnlp/models/senticnet_gcn/train.py +++ b/sgnlp/models/senticnet_gcn/train.py @@ -1,5 +1,7 @@ +import datetime import logging import math +import pathlib import numpy as np from sklearn.metrics import f1_score @@ -9,8 +11,7 @@ from data_class import SenticNetGCNTrainArgs from sgnlp.models.senticnet_gcn.modeling import SenticNetBertGCNPreTrainedModel -from tokenization import SenticASGCNTokenizer, SenticNetBertGCNTokenizer -from utils import parse_args_and_load_config, set_random_seed, ABSADatasetReader, BucketIterator +from utils import parse_args_and_load_config, set_random_seed logging.basicConfig(level=logging.DEBUG) @@ -26,8 +27,11 @@ def __init__(self, config: SenticNetGCNTrainArgs): if not self.config.device else torch.device(self.config.device) ) - # self.dataset_train = # dataloader - # self.dataset_test = # dataloader + # self.dataloader_train = # dataloader + # self.dataloader_test = # dataloader + if config.save_state_dict: + self.save_state_dict_folder = pathlib.Path(self.config.saved_state_dict_folder_path) + self.save_state_dict_folder.mkdir(exist_ok=True) def _create_initializers(self): initializers = { @@ -75,7 +79,17 @@ def _evaluate_acc_f1(self): f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro") return test_acc, f1 - def _train_epoch(self): + def _save_state_dict(self): + if self.config.save_state_dict: + curr_dt = datetime.datetime.now() + curr_dt_str = curr_dt.strftime("%Y-%m-%d_%H%M%S") + filename = f"{self.config.model}_{curr_dt_str}.pkl" + try: + torch.save(self.model.state_dict(), self.save_state_dict_folder.joinpath(filename)) + except: + raise Exception("Error saving model state dict!") + + def _train_epoch(self, criterion: function, optimizer: function): max_val_acc, max_val_f1 = 0, 0 max_val_epoch = 0 global_step = 0 @@ -83,7 +97,23 @@ def _train_epoch(self): for epoch in range(self.config.epochs): n_correct, n_total = 0, 0 - # TODO: How to produce dataset for both model types? 
+ self.model.train() + for _, batch in enumerate(self.dataloader_train): + global_step += 1 + optimizer.zero_grad() + + inputs = [batch[col].to(self.device) for col in batch.keys()] + targets = batch["polarity"].to(self.device) + outputs = self.model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + n_correct += (torch.argmax(outputs, -1) == targets).sum().item() + n_total += len(outputs) + + if global_step % self.config.log_step == 0: + pass # TODO: how to merge both calculate for bert and non-bert def train(self): criterion = nn.CrossEntropyLoss() @@ -95,7 +125,7 @@ def train(self): logging.info(f"Start overall train loop : {i + 1}") self._reset_params() - test_acc, test_f1 = self._train(criterion, optimizer) + test_acc, test_f1 = self._train_epoch(criterion, optimizer) test_accs.append(test_acc) test_f1s.append(test_f1) From d77c4e4e335611ac2daf33073fa97e9996d2ca27 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 27 Dec 2021 10:58:24 +0800 Subject: [PATCH 046/201] [#41] draft implementation of train_epoch --- .../config/senticnet_gcn_config.json | 3 +- sgnlp/models/senticnet_gcn/data_class.py | 6 +- sgnlp/models/senticnet_gcn/train.py | 60 ++++++++++++++----- 3 files changed, 53 insertions(+), 16 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json index 6e804c9..44e9155 100644 --- a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json @@ -27,5 +27,6 @@ "save": true, "seed": 776, "device": "cuda", - "repeats": 10 + "repeats": 10, + "patience": 5 } \ No newline at end of file diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py index e94a700..b41cb45 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -79,6 +79,9 @@ class SenticNetGCNTrainArgs: seed: int = field(default=776, metadata={"help": "Default random seed for training."}) device: str = field(default="cuda", metadata={"help": "Type of compute device to use for training."}) repeats: int = field(default=10, metadata={"help": "Number of times to repeat train loop."}) + patience: int = field( + default=5, metadata={"help": "Number of train epoch without improvements prior to early stopping."} + ) def __post_init__(self): assert self.model in ["senticgcn", "senticgcn_bert"] @@ -106,4 +109,5 @@ def __post_init__(self): test_diff_keys = set(self.dataset_keys).difference(set(self.dataset_test.keys())) for key in test_diff_keys: self.dataset_test[key] = "" - assert self.repeats > 1, "Repeats num must be at least 1." + assert self.repeats > 1, "Repeats value must be at least 1." + assert self.patience > 1, "Patience value must be at least 1." 
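The `patience` value added above drives the early-stopping rework in the next train.py diff: training stops once `patience` epochs pass without a new best validation accuracy. A minimal sketch of that stopping rule, with made-up validation accuracies:

# made-up per-epoch validation accuracies, patience of 5 (the config default)
max_val_acc, max_val_epoch, patience = 0.0, 0, 5
for epoch, val_acc in enumerate([0.61, 0.64, 0.63, 0.64, 0.62, 0.63, 0.61, 0.60, 0.62]):
    if val_acc > max_val_acc:
        max_val_acc, max_val_epoch = val_acc, epoch  # new best, reset the clock
    if epoch - max_val_epoch >= patience:
        break  # stops at epoch 6: no improvement since epoch 1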
diff --git a/sgnlp/models/senticnet_gcn/train.py b/sgnlp/models/senticnet_gcn/train.py
index 0015940..0c5b862 100644
--- a/sgnlp/models/senticnet_gcn/train.py
+++ b/sgnlp/models/senticnet_gcn/train.py
@@ -79,24 +79,25 @@ def _evaluate_acc_f1(self):
         f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro")
         return test_acc, f1
 
-    def _save_state_dict(self):
-        if self.config.save_state_dict:
-            curr_dt = datetime.datetime.now()
-            curr_dt_str = curr_dt.strftime("%Y-%m-%d_%H%M%S")
-            filename = f"{self.config.model}_{curr_dt_str}.pkl"
-            try:
-                torch.save(self.model.state_dict(), self.save_state_dict_folder.joinpath(filename))
-            except:
-                raise Exception("Error saving model state dict!")
-
-    def _train_epoch(self, criterion: function, optimizer: function):
+    def _save_state_dict(self, epoch: int) -> pathlib.Path:
+        curr_dt = datetime.datetime.now()
+        curr_dt_str = curr_dt.strftime("%Y-%m-%d_%H%M%S")
+        filename = f"{self.config.model}_epoch_{epoch}_{curr_dt_str}.pkl"
+        full_path = self.save_state_dict_folder.joinpath(filename)
+        try:
+            torch.save(self.model.state_dict(), full_path)
+        except:
+            raise Exception("Error saving model state dict!")
+        return full_path
+
+    def _train_epoch(self, criterion: function, optimizer: function) -> pathlib.Path:
         max_val_acc, max_val_f1 = 0, 0
         max_val_epoch = 0
         global_step = 0
-        path = 0
+        path = None
 
         for epoch in range(self.config.epochs):
-            n_correct, n_total = 0, 0
+            n_correct, n_total, loss_total = 0, 0, 0
             self.model.train()
             for _, batch in enumerate(self.dataloader_train):
                 global_step += 1
@@ -111,9 +112,40 @@
             n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
             n_total += len(outputs)
+            loss_total += loss.item() * len(outputs)
 
             if global_step % self.config.log_step == 0:
-                pass # TODO: how to merge both calculate for bert and non-bert
+                # pass # TODO: how to merge both calculate for bert and non-bert
+                train_acc = n_correct / n_total
+                train_loss = loss_total / n_total
+                logging.info(f"Train Acc: {train_acc:.4f}, Train Loss: {train_loss:.4f}")
+
+            val_acc, val_f1 = self._evaluate_acc_f1()
+            logging.info(
+                f"""
+                Epoch: {epoch}
+                Test Acc: {val_acc:.4f}
+                Test F1: {val_f1:.4f}
+                """
+            )
+            if val_f1 > max_val_f1:
+                max_val_f1 = val_f1
+
+            if val_acc > max_val_acc:
+                max_val_acc = val_acc
+                max_val_epoch = epoch
+                if self.config.save_state_dict:
+                    path = self._save_state_dict(epoch)
+                logging.info(
+                    f"""
+                    Best model saved. 
Acc: {max_val_acc:.4f}, F1: {max_val_f1}, Epoch: {max_val_epoch} + """ + ) + + if epoch - max_val_epoch >= self.config.patience: + logging.info(f"Early stopping") + break + return path def train(self): criterion = nn.CrossEntropyLoss() From b31beb9a0bfa58700763f9b4848f610af372cf48 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 27 Dec 2021 11:25:12 +0800 Subject: [PATCH 047/201] [#41] add model and tokenizer config options --- sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json | 2 ++ sgnlp/models/senticnet_gcn/data_class.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json index 44e9155..68bfc34 100644 --- a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json @@ -1,4 +1,6 @@ { + "model": "senticnet_gcn", + "tokenizer": "senticnet_gcn", "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", "spacy_pipeline": "en_core_web_sm", "save_preprocessed_dependency": true, diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py index b41cb45..6cb58e6 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -4,7 +4,11 @@ @dataclass class SenticNetGCNTrainArgs: - model: str = field(default="senticnet_gcn", metadata={"help": "Options to choose which model to train."}) + model: str = field(default="senticnet_gcn", metadata={"help": "Option to choose which model to train."}) + tokenizer: str = field( + default="senticnet_gcn", + metadata={"help": "Option to choose which tokenizer to use for training preprocessing."}, + ) senticnet_word_file_path: str = field( default="./senticNet/senticnet_word.txt", metadata={"help": "SenticNet word file path."} ) From 407d66dd06a88544553b2d086ed9dda3d5fb0b28 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 27 Dec 2021 11:27:47 +0800 Subject: [PATCH 048/201] [#41] standardize model and tokenizer name convention --- sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json | 4 ++-- sgnlp/models/senticnet_gcn/data_class.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json index 68bfc34..e91ec31 100644 --- a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json @@ -1,6 +1,6 @@ { - "model": "senticnet_gcn", - "tokenizer": "senticnet_gcn", + "model": "senticnetgcn", + "tokenizer": "senticnetgcn", "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", "spacy_pipeline": "en_core_web_sm", "save_preprocessed_dependency": true, diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py index 6cb58e6..1ef3519 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -4,9 +4,9 @@ @dataclass class SenticNetGCNTrainArgs: - model: str = field(default="senticnet_gcn", metadata={"help": "Option to choose which model to train."}) + model: str = field(default="senticnetgcn", metadata={"help": "Option to choose which model to train."}) tokenizer: str = field( - default="senticnet_gcn", + default="senticnetgcn", metadata={"help": "Option 
to choose which tokenizer to use for training preprocessing."},
     )
     senticnet_word_file_path: str = field(
@@ -88,7 +88,7 @@ class SenticNetGCNTrainArgs:
     )
 
     def __post_init__(self):
-        assert self.model in ["senticgcn", "senticgcn_bert"]
+        assert self.model in ["senticnetgcn", "senticnetgcn_bert"]
         assert self.initializer in [
             "xavier_uniform",
             "xavier_uniform",

From aa0aad4c5c266ef1de985db97e0eac931a53694a Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 27 Dec 2021 11:41:38 +0800
Subject: [PATCH 049/201] [#41] add option for validation split

---
 .../senticnet_gcn/config/senticnet_gcn_config.json | 1 +
 sgnlp/models/senticnet_gcn/data_class.py | 9 +++++++++
 sgnlp/models/senticnet_gcn/utils.py | 8 ++++++++
 3 files changed, 18 insertions(+)

diff --git a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json
index e91ec31..bdd6db8 100644
--- a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json
+++ b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json
@@ -12,6 +12,7 @@
         "raw": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw",
         "dependency_sencticnet_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.dsenticgraph"
     },
+    "valset_ratio": 0,
     "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt",
     "save_embedding_matrix": true,
     "saved_embedding_matrix_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/embedding/embeddings.pickle",
diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py
index 1ef3519..9328b12 100644
--- a/sgnlp/models/senticnet_gcn/data_class.py
+++ b/sgnlp/models/senticnet_gcn/data_class.py
@@ -44,6 +44,14 @@ class SenticNetGCNTrainArgs:
             it will be generated during preprocessing step."""
         },
     )
+    valset_ratio: float = field(
+        default=0.0,
+        metadata={
+            "help": """
+            Ratio of train dataset to be split for validation.
+            If value is set to 0, test dataset is set as validation dataset as well."""
+        },
+    )
     word_vec_file_path: str = field(
         default="glove/glove.840B.300d.txt",
         metadata={"help": "File path to word vector."},
@@ -115,3 +123,4 @@ def __post_init__(self):
             self.dataset_test[key] = ""
         assert self.repeats > 1, "Repeats value must be at least 1."
         assert self.patience > 1, "Patience value must be at least 1."
+        assert 0 <= self.valset_ratio < 1, "Valset_ratio must be greater than or equal to 0 and less than 1."
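The `valset_ratio` option is consumed in the utils.py diff below via `torch.utils.data.random_split`. A small sketch of that split on a dummy 100-sample dataset with a ratio of 0.2:

import torch
from torch.utils.data import TensorDataset, random_split

# dummy 100-sample dataset standing in for the ABSA train data
train_data = TensorDataset(torch.arange(100))
valset_ratio = 0.2
valset_len = int(len(train_data) * valset_ratio)
train_data, val_data = random_split(train_data, (len(train_data) - valset_len, valset_len))
print(len(train_data), len(val_data))  # 80 20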
diff --git a/sgnlp/models/senticnet_gcn/utils.py b/sgnlp/models/senticnet_gcn/utils.py index f3101c7..9c41949 100644 --- a/sgnlp/models/senticnet_gcn/utils.py +++ b/sgnlp/models/senticnet_gcn/utils.py @@ -9,6 +9,7 @@ import numpy as np import torch +from torch.utils.data import random_split from transformers import PreTrainedTokenizer from transformers.tokenization_utils_base import BatchEncoding @@ -248,6 +249,13 @@ def __init__( ) self.train_data = ABSADataset(ABSADatasetReader.__read_data__(self.cfg.dataset_train, tokenizer)) self.test_data = ABSADataset(ABSADatasetReader.__read_data__(self.cfg.dataset_test, tokenizer)) + if config.valset_ratio: + valset_len = int(len(self.train_data) * config.valset_ratio) + self.train_data, self.val_data = random_split( + self.train_data, (len(self.train_data) - valset_len, valset_len) + ) + else: + self.val_data = self.test_data @staticmethod def __read_data__(datasets: Dict[str, str], tokenizer: PreTrainedTokenizer): From 3f4bfd3048db2584d46f5fbd4b6ba4970f1056b7 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 27 Dec 2021 15:11:22 +0800 Subject: [PATCH 050/201] [#41] clean up and draft rework to implement both tokenizer --- .../senticnet_gcn/preprocess_dependency.py | 3 --- sgnlp/models/senticnet_gcn/tokenization.py | 4 ++-- sgnlp/models/senticnet_gcn/train.py | 17 ++++++++++++----- sgnlp/models/senticnet_gcn/utils.py | 5 +---- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/preprocess_dependency.py b/sgnlp/models/senticnet_gcn/preprocess_dependency.py index a54a7f9..687d731 100644 --- a/sgnlp/models/senticnet_gcn/preprocess_dependency.py +++ b/sgnlp/models/senticnet_gcn/preprocess_dependency.py @@ -82,6 +82,3 @@ def process(self): if not dataset[key]: self._process_file(dataset["raw"], dataset[key], func) - -if __name__ == "__main__": - pass diff --git a/sgnlp/models/senticnet_gcn/tokenization.py b/sgnlp/models/senticnet_gcn/tokenization.py index 4db2fe1..7a1cda3 100644 --- a/sgnlp/models/senticnet_gcn/tokenization.py +++ b/sgnlp/models/senticnet_gcn/tokenization.py @@ -104,6 +104,6 @@ class SenticNetBertGCNTokenizer(BertTokenizer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def __call__(self, text, **kwargs): - encoding = super().__call__(text, add_special_tokens=False, truncation=True, max_length=85, **kwargs) + def __call__(self, text, max_length: int = 85, **kwargs): + encoding = super().__call__(text, add_special_tokens=False, truncation=True, max_length=max_length, **kwargs) return encoding diff --git a/sgnlp/models/senticnet_gcn/train.py b/sgnlp/models/senticnet_gcn/train.py index 0c5b862..88989ea 100644 --- a/sgnlp/models/senticnet_gcn/train.py +++ b/sgnlp/models/senticnet_gcn/train.py @@ -8,10 +8,13 @@ import torch import torch.nn as nn import torch.optim as optim +from transformers.tokenization_utils import PreTrainedTokenizer from data_class import SenticNetGCNTrainArgs -from sgnlp.models.senticnet_gcn.modeling import SenticNetBertGCNPreTrainedModel -from utils import parse_args_and_load_config, set_random_seed +from modeling import SenticNetBertGCNPreTrainedModel +from preprocess_dependency import DependencyProcessor +from tokenization import SenticNetGCNTokenizer, SenticNetBertGCNTokenizer +from utils import parse_args_and_load_config, set_random_seed, ABSADatasetReader logging.basicConfig(level=logging.DEBUG) @@ -27,8 +30,8 @@ def __init__(self, config: SenticNetGCNTrainArgs): if not self.config.device else torch.device(self.config.device) ) - # 
self.dataloader_train = # dataloader
-        # self.dataloader_test = # dataloader
+        tokenizer = self._create_tokenizer()
+        # self.dataloader
         if config.save_state_dict:
             self.save_state_dict_folder = pathlib.Path(self.config.saved_state_dict_folder_path)
             self.save_state_dict_folder.mkdir(exist_ok=True)
@@ -53,6 +56,11 @@ def _create_optimizer(self):
         }
         return optimizers[self.config.optimizer]
 
+    def _create_tokenizer(self):
+        self.tokenizer = SenticNetGCNTokenizer.from_pretrained(self.config.tokenizer) \
+            if self.config.model == 'senticnetgcn' \
+            else SenticNetBertGCNTokenizer.from_pretrained(self.config.tokenizer)
+
     def _reset_params(self):
         raise NotImplementedError("Please call from derived class only.")
 
@@ -115,7 +123,6 @@ def _train_epoch(self, criterion: function, optimizer: function) -> pathlib.Path
             loss_total += loss.item() * len(outputs)
 
             if global_step % self.config.log_step == 0:
-                # pass # TODO: how to merge both calculate for bert and non-bert
                 train_acc = n_correct / n_total
                 train_loss = loss_total / n_total
                 logging.info(f"Train Acc: {train_acc:.4f}, Train Loss: {train_loss:.4f}")
diff --git a/sgnlp/models/senticnet_gcn/utils.py b/sgnlp/models/senticnet_gcn/utils.py
index 9c41949..df8d96f 100644
--- a/sgnlp/models/senticnet_gcn/utils.py
+++ b/sgnlp/models/senticnet_gcn/utils.py
@@ -9,7 +9,7 @@
 
 import numpy as np
 import torch
-from torch.utils.data import random_split
+from torch.utils.data import random_split, Dataset
 from transformers import PreTrainedTokenizer
 from transformers.tokenization_utils_base import BatchEncoding
 
@@ -264,8 +264,6 @@ def __read_data__(datasets: Dict[str, str], tokenizer: PreTrainedTokenizer):
             lines = fin.readlines()
         with open(datasets["graph"], "rb") as fin_graph:
             idx2graph = pickle.load(fin_graph)
-        with open(datasets["tree"], "rb") as fin_tree:
-            idx2tree = pickle.load(fin_tree)
 
         # Prep all data
         all_data = []
@@ -279,7 +277,6 @@
             left_indices = tokenizer(text_left)
             polarity = int(polarity) + 1
             dependency_graph = idx2graph[i]
-            dependency_tree = idx2tree[i]
 
             data = {
                 "text_indices": text_indices,

From 6b7c8ae1f1261d2b608947bd5d61d84ee1c5bc54 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 27 Dec 2021 15:29:59 +0800
Subject: [PATCH 051/201] [#41] add embedding layer to SenticNetGCN model and load pretrained embedding method

---
 sgnlp/models/senticnet_gcn/config.py   |  3 +++
 sgnlp/models/senticnet_gcn/modeling.py | 11 ++++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/sgnlp/models/senticnet_gcn/config.py b/sgnlp/models/senticnet_gcn/config.py
index faa372a..cdd6683 100644
--- a/sgnlp/models/senticnet_gcn/config.py
+++ b/sgnlp/models/senticnet_gcn/config.py
@@ -8,6 +8,7 @@ class SenticNetGCNConfig(PreTrainedConfig):
     It is used to instantiate a SenticNetGCNModel network according to the specific arguments, defining the model architecture.
 
     Args:
+        vocab_size (:obj:`int`, defaults to 3597): Vocab size derived from combined SemEval14/15/16 datasets.
         embed_dim (:obj:`int`, defaults to 300): Embedding dimension size.
         hidden_dim (:obj:`int`, defaults to 300): Size of hidden dimension.
         dropout (:obj:`float`, defaults to 0.3): Dropout percentage. 
@@ -24,6 +25,7 @@ class SenticNetGCNConfig(PreTrainedConfig):
 
     def __init__(
         self,
+        vocab_size: int = 3597,
         embed_dim: int = 300,
         hidden_dim: int = 300,
         polarities_dim: int = 3,
@@ -32,6 +34,7 @@ def __init__(
         **kwargs
     ):
         super().__init__(**kwargs)
+        self.vocab_size = vocab_size
         self.embed_dim = embed_dim
         self.hidden_dim = hidden_dim
         self.dropout = dropout
diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py
index d12e454..54e9593 100644
--- a/sgnlp/models/senticnet_gcn/modeling.py
+++ b/sgnlp/models/senticnet_gcn/modeling.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+import pickle
 
 import torch
 import torch.nn as nn
@@ -31,6 +32,7 @@ def _init_weights(self, module):
 class SenticNetGCNModel(SenticNetGCNPreTrainedModel):
     def __init__(self, config: SenticNetGCNConfig) -> None:
         super().__init__(config)
+        self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)
         self.text_lstm = DynamicLSTM(
             config.embed_dim,
             config.hidden_dim,
@@ -44,6 +46,12 @@ def __init__(self, config: SenticNetGCNConfig) -> None:
         self.text_embed_dropout = nn.Dropout(config.dropout)
         self.device = config.device
 
+    def _load_pretrained_embeddings(self, pretrained_embedding_path: str) -> None:
+        with open(pretrained_embedding_path, "rb") as f:
+            embedding_matrix = pickle.load(f)
+        embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float)
+        self.embedding.weight.data.copy_(embedding_tensor)
+
     def position_weight(self, x, aspect_double_idx, text_len, aspect_len):
         batch_size, seq_len = x.shape[0], x.shape[1]
         aspect_double_idx = aspect_double_idx.cpu().numpy()
@@ -83,7 +91,8 @@ def forward(self, inputs):
         aspect_len = torch.sum(aspect_indices != 0, dim=-1)
         left_len = torch.sum(left_indices != 0, dim=-1)
         aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1)
-        text = self.text_embed_dropout(self.embed(text_indices))
+        text = self.embedding(text_indices)
+        text = self.text_embed_dropout(text)
         text_out, (_, _) = self.text_lstm(text, text_len)
         x = F.relu(
             self.gc1(
From 67f7fa18479523817e2f9032f884d73eae9e1f2b Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Tue, 28 Dec 2021 10:00:51 +0800
Subject: [PATCH 052/201] [#41] standardise model naming
---
 sgnlp/models/senticnet_gcn/config.py     |  2 +-
 sgnlp/models/senticnet_gcn/data_class.py |  2 +-
 sgnlp/models/senticnet_gcn/modeling.py   | 16 ++++++++--------
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/sgnlp/models/senticnet_gcn/config.py b/sgnlp/models/senticnet_gcn/config.py
index cdd6683..b26b7f7 100644
--- a/sgnlp/models/senticnet_gcn/config.py
+++ b/sgnlp/models/senticnet_gcn/config.py
@@ -42,7 +42,7 @@ def __init__(
     self.device = device
 
 
-class SenticNetBertGCNConfig(PreTrainedConfig):
+class SenticNetGCNBertConfig(PreTrainedConfig):
     """
     This is the configuration class to store the configuration of a :class:`~sgnlp.models.senticnet_gcn.modeling.SenticNetBertGCNModel`.
     It is used to instantiate a SenticNetBertGCNModel network according to the specific arguments, defining the model architecture. 
diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py index 9328b12..5a0ee94 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -96,7 +96,7 @@ class SenticNetGCNTrainArgs: ) def __post_init__(self): - assert self.model in ["senticnetgcn", "senticnetgcn_bert"] + assert self.model in ["senticnetgcn", "senticnetgcnbert"] assert self.initializer in [ "xavier_uniform", "xavier_uniform", diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py index 54e9593..a147eb9 100644 --- a/sgnlp/models/senticnet_gcn/modeling.py +++ b/sgnlp/models/senticnet_gcn/modeling.py @@ -9,7 +9,7 @@ from .modules.dynamic_rnn import DynamicLSTM from .modules.gcn import GraphConvolution -from .config import SenticNetGCNConfig, SenticNetBertGCNConfig +from .config import SenticNetGCNConfig, SenticNetGCNBertConfig @dataclass @@ -23,7 +23,7 @@ class SenticNetGCNPreTrainedModel(PreTrainedModel): """ config_class = SenticNetGCNConfig - base_model_prefix = "senticnet_gcn" + base_model_prefix = "senticnetgcn" def _init_weights(self, module): pass @@ -108,16 +108,16 @@ def forward(self, inputs): return output -class SenticNetBertGCNPreTrainedModel(PreTrainedModel): - config_class = SenticNetBertGCNConfig - base_model_prefix = "senticnetbert_gcn" +class SenticNetGCNBertPreTrainedModel(PreTrainedModel): + config_class = SenticNetGCNBertConfig + base_model_prefix = "senticnetgcnbert" - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): pass -class SenticNetBertGCPModel(SenticNetBertGCNPreTrainedModel): - def __init__(self, config: SenticNetBertGCNConfig) -> None: +class SenticNetGCNBertPModel(SenticNetGCNBertPreTrainedModel): + def __init__(self, config: SenticNetGCNBertConfig) -> None: super().__init__() self._init_bert_model(config.bert_model) self.gc1 = GraphConvolution(config.hidden_dim, config.hidden_dim) From 6aacf630f6b912fc3002ee3508e69a7ec413a2e5 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 28 Dec 2021 10:48:58 +0800 Subject: [PATCH 053/201] [#41] updated models output to their respective model output class --- sgnlp/models/senticnet_gcn/config.py | 14 +++-- .../config/senticnet_gcn_config.json | 1 + sgnlp/models/senticnet_gcn/data_class.py | 1 + sgnlp/models/senticnet_gcn/modeling.py | 63 ++++++++++++++++--- 4 files changed, 65 insertions(+), 14 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config.py b/sgnlp/models/senticnet_gcn/config.py index b26b7f7..a7b54e8 100644 --- a/sgnlp/models/senticnet_gcn/config.py +++ b/sgnlp/models/senticnet_gcn/config.py @@ -8,12 +8,13 @@ class SenticNetGCNConfig(PreTrainedConfig): It is used to instantiate a SenticNetGCNModel network according to the specific arguments, defining the model architecture. Args: - vocab_size (:obj:`int`, defaults to 3597): Vocab size derived from combined SemeVal14/15/16 datasets. + vocab_size (:obj:`int`, defaults to 17662): Vocab size derived from combined Twitter and SemeVal14/15/16 datasets. embed_dim (:obj:`int`, defaults to 300): Embedding dimension size. hidden_dim (:obj:`int`, defaults to 300): Size of hidden dimension. dropout (:obj:`float`, defaults to 0.3): Droput percentage. polarities_dim (:obj:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral). device (:obj:`str`, defaults to 'cuda`): Type of torch device. + loss_function (:obj:`str`, defaults to 'cross_entropy'): Loss function for training/eval. 
Example: @@ -25,12 +26,13 @@ class SenticNetGCNConfig(PreTrainedConfig): def __init__( self, - vocab_size: int = 3597, + vocab_size: int = 17662, embed_dim: int = 300, hidden_dim: int = 300, polarities_dim: int = 3, dropout: float = 0.3, device: str = "cuda", + loss_function: str = "cross_entropy", **kwargs ): super().__init__(**kwargs) @@ -40,6 +42,7 @@ def __init__( self.dropout = dropout self.polarities_dim = polarities_dim self.device = device + self.loss_function = loss_function class SenticNetGCNBertConfig(PreTrainedConfig): @@ -54,12 +57,13 @@ class SenticNetGCNBertConfig(PreTrainedConfig): dropout (:obj:`float`, defaults to 0.3): Dropout percentage. polarities_dim (:ob:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral). device (:obj:`str`, defaults to 'cuda'): Type of torch device. + loss_function (:obj:`str`, defaults to 'cross_entropy'): Loss function for training/eval. Example: - from sgnlp.models.senticnet_gcn import SenticNetBertGCNConfig + from sgnlp.models.senticnet_gcn import SenticNetGCNBertConfig # Initialize with default values - config = SenticNetBertGCNConfig() + config = SenticNetGCNBertConfig() """ def __init__( @@ -70,6 +74,7 @@ def __init__( polarities_dim: int = 3, dropout: float = 0.3, device: str = "cuda", + loss_function: str = "cross_entropy", **kwargs ): super().__init__(**kwargs) @@ -79,3 +84,4 @@ def __init__( self.dropout = dropout self.polarities_dim = polarities_dim self.device = device + self.loss_function = loss_function diff --git a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json index bdd6db8..be8fb54 100644 --- a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json @@ -18,6 +18,7 @@ "saved_embedding_matrix_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/embedding/embeddings.pickle", "initializer": "xavier_uniform", "optimizer": "adam", + "loss_function": "cross_entropy", "learning_rate": 0.001, "l2reg": 0.00001, "epochs": 100, diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/senticnet_gcn/data_class.py index 5a0ee94..0d6b349 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/senticnet_gcn/data_class.py @@ -78,6 +78,7 @@ class SenticNetGCNTrainArgs: ) initializer: str = field(default="xavier_uniform", metadata={"help": "Type of initalizer to use."}) optimizer: str = field(default="adam", metadata={"help": "Type of optimizer to use."}) + loss_function: str = field(default="cross_entropy", metadata={"help": "Loss function for training/eval."}) learning_rate: float = field(default=0.001, metadata={"help": "Default learning rate for training."}) l2reg: float = field(default=0.00001, metadata={"help": "Default l2reg value."}) epochs: int = field(default=100, metadata={"help": "Number of epochs to train."}) diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py index a147eb9..3e4d7da 100644 --- a/sgnlp/models/senticnet_gcn/modeling.py +++ b/sgnlp/models/senticnet_gcn/modeling.py @@ -1,5 +1,7 @@ -from dataclasses import dataclass import pickle +from dataclasses import dataclass +from typing import Optional + import torch import torch.nn as nn @@ -14,7 +16,18 @@ @dataclass class SenticNetGCNModelOutput(ModelOutput): - pass + """ + Base class for outputs of SenticNetGCNModel. 
+ + Args: + loss (:obj:`torch.Tensor` of shape `(1,)`, `optional`, return when :obj:`labels` is provided): + classification loss, typically cross entropy. Loss function used is dependent on what is specified in SenticNetGCNConfig. + logits (:obj:`torch.Tensor` of shape :obj:`(batch_size, num_classes)`): + raw logits for each class. num_classes = 3 by default. + """ + + loss: Optional[torch.Tensor] = None + logits: torch.Tensor = None class SenticNetGCNPreTrainedModel(PreTrainedModel): @@ -25,7 +38,7 @@ class SenticNetGCNPreTrainedModel(PreTrainedModel): config_class = SenticNetGCNConfig base_model_prefix = "senticnetgcn" - def _init_weights(self, module): + def _init_weights(self, module: nn.Module) -> None: pass @@ -45,6 +58,8 @@ def __init__(self, config: SenticNetGCNConfig) -> None: self.fc = nn.Linear(2 * config.hidden_dim, config.polarities_dim) self.text_embed_dropout = nn.Dropout(config.dropout) self.device = config.device + if config.loss_function == "cross_entropy": + self.loss_function = nn.CrossEntropyLoss() def _load_pretrained_embeddings(self, pretrained_embedding_path: str) -> None: with open(pretrained_embedding_path, "rb") as f: @@ -85,7 +100,9 @@ def mask(self, x, aspect_double_idx): mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(self.device) return mask * x - def forward(self, inputs): + def forward( + self, inputs: dict[str, torch.Tensor], labels: Optional[torch.Tensor] = None + ) -> SenticNetGCNModelOutput: text_indices, aspect_indices, left_indices, adj = inputs text_len = torch.sum(text_indices != 0, dim=-1) aspect_len = torch.sum(aspect_indices != 0, dim=-1) @@ -104,15 +121,38 @@ def forward(self, inputs): alpha_mat = torch.matmul(x, text_out.transpose(1, 2)) alpha = F.softmax(alpha_mat.sum(1, keepdim=True), dim=2) x = torch.matmul(alpha, text_out).squeeze(1) # batch_size x 2 * hidden_dim - output = self.fc(x) - return output + logits = self.fc(x) + + loss = self.loss_function(logits, labels) if labels is not None else None + return SenticNetGCNModelOutput(loss=loss, logits=logits) + + +@dataclass +class SenticNetGCNBertModelOutput(ModelOutput): + """ + Base class for outputs of SenticNetGCNBertModel. + + Args: + loss (:obj:`torch.Tensor` of shape `(1,)`, `optional`, return when :obj:`labels` is provided): + classification loss, typically cross entropy. + Loss function used is dependent on what is specified in SenticNetGCNBertConfig. + logits (:obj:`torch.Tensor` of shape :obj:`(batch_size, num_classes)`): + raw logits for each class. num_classes = 3 by default. + """ + + loss: Optional[torch.Tensor] = None + logits: torch.Tensor = None class SenticNetGCNBertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for download and loading pretrained models. 
+ """ + config_class = SenticNetGCNBertConfig base_model_prefix = "senticnetgcnbert" - def _init_weights(self, module: nn.Module): + def _init_weights(self, module: nn.Module) -> None: pass @@ -127,6 +167,7 @@ def __init__(self, config: SenticNetGCNBertConfig) -> None: self.text_embed_dropout = nn.Dropout(config.dropout) self.device = config.device self.max_seq_len = config.max_seq_len + self.loss_function = config.loss_function def _init_bert_model(self, bert_model: str): self.bert = BertModel.from_pretrained(bert_model) @@ -164,7 +205,7 @@ def mask(self, x, aspect_double_idx): mask = torch.tensor(mask).unsqueeze(2).float().to(self.device) return mask * x - def forward(self, inputs): + def forward(self, inputs, labels: torch.Tensor): text_bert_indices, text_indices, aspect_indices, bert_segments_ids, left_indices, adj = inputs text_len = torch.sum(text_indices != 0, dim=-1) aspect_len = torch.sum(aspect_indices != 0, dim=-1) @@ -181,5 +222,7 @@ def forward(self, inputs): alpha_mat = torch.matmul(x, text_out.transpose(1, 2)) alpha = F.softmax(alpha_mat.sum(1, keepdim=True), dim=2) x = torch.matmul(alpha, text_out).squeeze(1) # batch_size x 2*hidden_dim - output = self.fc(x) - return output + logits = self.fc(x) + + loss = self.loss_function(logits, labels) if labels is not None else None + return SenticNetGCNBertModelOutput(loss=loss, logits=logits) From 6750538bd2d1aae9e15274a4dbb81e7d709b448b Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 28 Dec 2021 14:04:31 +0800 Subject: [PATCH 054/201] [#41] add SenticNetGCN Bert Embedding model and remove bert model layer from SenticNetGCNBertModel --- sgnlp/models/senticnet_gcn/config.py | 15 ++++++++++++++- sgnlp/models/senticnet_gcn/modeling.py | 26 +++++++++++++++++++++----- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config.py b/sgnlp/models/senticnet_gcn/config.py index a7b54e8..14f7281 100644 --- a/sgnlp/models/senticnet_gcn/config.py +++ b/sgnlp/models/senticnet_gcn/config.py @@ -1,4 +1,4 @@ -from transformers import PreTrainedConfig +from transformers import PreTrainedConfig, BertConfig class SenticNetGCNConfig(PreTrainedConfig): @@ -85,3 +85,16 @@ def __init__( self.polarities_dim = polarities_dim self.device = device self.loss_function = loss_function + + +class SenticNetGCNBertEmbeddingConfig(BertConfig): + """ + This is the configuration class to store the configuration of a :class:`~SenticNetGCNBertEmbeddingModel`. + It is used to instantiate a UFD Embedding model according to the specified arguments, defining the model architecture. 
+
+    Args:
+        BertConfig (:obj:`BertConfig`): transformer :obj:`BertConfig` base class
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py
index 3e4d7da..07bb0ef 100644
--- a/sgnlp/models/senticnet_gcn/modeling.py
+++ b/sgnlp/models/senticnet_gcn/modeling.py
@@ -11,7 +11,7 @@
 
 from .modules.dynamic_rnn import DynamicLSTM
 from .modules.gcn import GraphConvolution
-from .config import SenticNetGCNConfig, SenticNetGCNBertConfig
+from .config import SenticNetGCNConfig, SenticNetGCNBertConfig, SenticNetGCNBertEmbeddingConfig
 
 
 @dataclass
@@ -159,7 +159,6 @@ class SenticNetGCNBertPModel(SenticNetGCNBertPreTrainedModel):
     def __init__(self, config: SenticNetGCNBertConfig) -> None:
         super().__init__()
-        self._init_bert_model(config.bert_model)
         self.gc1 = GraphConvolution(config.hidden_dim, config.hidden_dim)
         self.gc2 = GraphConvolution(config.hidden_dim, config.hidden_dim)
         self.gc3 = GraphConvolution(config.hidden_dim, config.hidden_dim)
@@ -169,9 +168,6 @@ def __init__(self, config: SenticNetGCNBertConfig) -> None:
         self.text_embed_dropout = nn.Dropout(config.dropout)
         self.device = config.device
         self.max_seq_len = config.max_seq_len
         self.loss_function = nn.CrossEntropyLoss() if config.loss_function == "cross_entropy" else None
 
-    def _init_bert_model(self, bert_model: str):
-        self.bert = BertModel.from_pretrained(bert_model)
-
     def position_weight(self, x, aspect_double_idx, text_len, aspect_len):
         batch_size, seq_len = x.shape[0], x.shape[1]
         aspect_double_idx = aspect_double_idx.cpu().numpy()
@@ -211,6 +207,7 @@ def forward(self, inputs, labels: Optional[torch.Tensor] = None):
         aspect_len = torch.sum(aspect_indices != 0, dim=-1)
         left_len = torch.sum(left_indices != 0, dim=-1)
         aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1)
+        # TODO: How to embed in the preprocessor?
         encoder_layer, _ = self.bert(
             text_bert_indices, token_type_ids=bert_segments_ids, output_all_encoded_layers=False
         )
@@ -226,3 +223,22 @@ def forward(self, inputs, labels: Optional[torch.Tensor] = None):
 
         loss = self.loss_function(logits, labels) if labels is not None else None
         return SenticNetGCNBertModelOutput(loss=loss, logits=logits)
+
+
+class SenticNetGCNBertEmbeddingModel(BertModel):
+    """
+    The SenticNetGCN Bert Embedding Model used to generate embeddings for model inputs.
+
+    This class inherits from :obj:`BertModel` for weights initialization and utility functions
+    from transformers :obj:`PreTrainedModel` class.
+
+    Args:
+        config (:obj:`~SenticNetGCNBertEmbeddingConfig`):
+            Model configuration class with all parameters required for the model.
+            Initializing with a config file does not load
+            the weights associated with the model, only the configuration.
+            Use the :obj:`.from_pretrained` method to load the model weights. 
+ """ + + def __init__(self, config: SenticNetGCNBertEmbeddingConfig): + super().__init__(config) From f618f6bca314a085a00aef5bfc7cd666881f9dd5 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 28 Dec 2021 14:59:49 +0800 Subject: [PATCH 055/201] [#41] extract embedding layer from SenticNetGCNModel into standalone class implementation --- sgnlp/models/senticnet_gcn/config.py | 17 +++++++- sgnlp/models/senticnet_gcn/modeling.py | 56 +++++++++++++++++++++----- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config.py b/sgnlp/models/senticnet_gcn/config.py index 14f7281..282d084 100644 --- a/sgnlp/models/senticnet_gcn/config.py +++ b/sgnlp/models/senticnet_gcn/config.py @@ -87,10 +87,25 @@ def __init__( self.loss_function = loss_function +class SenticNetGCNEmbeddingConfig(PreTrainedConfig): + """ + This is the configuration class to store the configuration of a :class:`~SenticNetGCNEmbeddingModel`. + It is used to instantiate a SenticNetGCN Embedding model according to the specified arguments, defining the model architecture. + + Args: + PreTrainedConfig (:obj:`PretrainedConfig`): transformer :obj:`PreTrainedConfig` base class + """ + + def __init__(self, vocab_size: int = 17662, embed_dim: int = 300, **kwargs) -> None: + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.embed_dim = embed_dim + + class SenticNetGCNBertEmbeddingConfig(BertConfig): """ This is the configuration class to store the configuration of a :class:`~SenticNetGCNBertEmbeddingModel`. - It is used to instantiate a UFD Embedding model according to the specified arguments, defining the model architecture. + It is used to instantiate a SenticNetGCN Bert Embedding model according to the specified arguments, defining the model architecture. 
Args: BertConfig (:obj:`BertConfig`): transformer :obj:`BertConfig` base class diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py index 07bb0ef..d457179 100644 --- a/sgnlp/models/senticnet_gcn/modeling.py +++ b/sgnlp/models/senticnet_gcn/modeling.py @@ -1,6 +1,7 @@ +import pathlib import pickle from dataclasses import dataclass -from typing import Optional +from typing import Optional, Union import torch @@ -11,7 +12,13 @@ from .modules.dynamic_rnn import DynamicLSTM from .modules.gcn import GraphConvolution -from .config import SenticNetGCNConfig, SenticNetGCNBertConfig, SenticNetGCNBertEmbeddingConfig +from .config import ( + SenticNetGCNConfig, + SenticNetGCNBertConfig, + SenticNetGCNEmbeddingConfig, + SenticNetGCNBertEmbeddingConfig, +) +from .utils import build_embedding_matrix @dataclass @@ -45,7 +52,6 @@ def _init_weights(self, module: nn.Module) -> None: class SenticNetGCNModel(SenticNetGCNPreTrainedModel): def __init__(self, config: SenticNetGCNConfig) -> None: super().__init__(config) - self.embedding = nn.Embedding(config.vocab_size, config.embed_dim) self.text_lstm = DynamicLSTM( config.embed_dim, config.hidden_dim, @@ -61,12 +67,6 @@ def __init__(self, config: SenticNetGCNConfig) -> None: if config.loss_function == "cross_entropy": self.loss_function = nn.CrossEntropyLoss() - def _load_pretrained_embeddings(self, pretrained_embedding_path: str) -> None: - with open(pretrained_embedding_path, "rb") as f: - embedding_matrix = pickle.load(f) - embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float) - self.embedding.weight.data.copy_(embedding_tensor) - def position_weight(self, x, aspect_double_idx, text_len, aspect_len): batch_size, seq_len = x.shape[0], x.shape[1] aspect_double_idx = aspect_double_idx.cpu().numpy() @@ -108,6 +108,7 @@ def forward( aspect_len = torch.sum(aspect_indices != 0, dim=-1) left_len = torch.sum(left_indices != 0, dim=-1) aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1) + # TODO: How to replace embedding layer here? 
         text = self.embedding(text_indices)
         text = self.text_embed_dropout(text)
         text_out, (_, _) = self.text_lstm(text, text_len)
@@ -225,6 +226,43 @@ def forward(self, inputs, labels: Optional[torch.Tensor] = None):
         return SenticNetGCNBertModelOutput(loss=loss, logits=logits)
 
 
+class SenticNetGCNEmbeddingPreTrainedModel(PreTrainedModel):
+    config_class = SenticNetGCNEmbeddingConfig
+    base_model_prefix = "senticnetgcnembedding"
+
+    def _init_weights(self, module: nn.Module) -> None:
+        pass
+
+
+class SenticNetGCNEmbeddingPreTrainedModel(SenticNetGCNEmbeddingPreTrainedModel):
+    def __init__(self, config: SenticNetGCNEmbeddingConfig):
+        super().__init__()
+        self.vocab_size = config.vocab_size
+        self.embed = nn.Embedding(config.vocab_size, config.embed_dim)
+
+    def load_pretrained_embedding(self, pretrained_embedding_path: Union[str, pathlib.Path]):
+        with open(pretrained_embedding_path, "rb") as emb_f:
+            embedding_matrix = pickle.load(emb_f)
+        embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float)
+        self.embed.weight.data.copy_(embedding_tensor)
+
+    @classmethod
+    def build_embedding_matrix(
+        cls,
+        word_vec_file_path: str,
+        vocab: dict[str, int],
+        embed_dim: int = 300,
+    ):
+        embedding_matrix = build_embedding_matrix(
+            word_vec_file_path=word_vec_file_path, vocab=vocab, embed_dim=embed_dim
+        )
+        embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float)
+        config = SenticNetGCNEmbeddingConfig()
+        senticnetgcn_embed = cls(config)
+        senticnetgcn_embed.embed.weight.data.copy_(embedding_tensor)
+        return senticnetgcn_embed
+
+
 class SenticNetGCNBertEmbeddingModel(BertModel):
     """
     The SenticNetGCN Bert Embedding Model used to generate embeddings for model inputs.
From 80fe83c02eeeb0cae528936bdc42608f1f743d44 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Tue, 28 Dec 2021 15:01:02 +0800
Subject: [PATCH 056/201] [#41] add missing input args
---
 sgnlp/models/senticnet_gcn/modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py
index d457179..e80d667 100644
--- a/sgnlp/models/senticnet_gcn/modeling.py
+++ b/sgnlp/models/senticnet_gcn/modeling.py
@@ -257,7 +257,7 @@ def build_embedding_matrix(
             word_vec_file_path=word_vec_file_path, vocab=vocab, embed_dim=embed_dim
         )
         embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float)
-        config = SenticNetGCNEmbeddingConfig()
+        config = SenticNetGCNEmbeddingConfig(vocab_size=len(vocab), embed_dim=embed_dim)
         senticnetgcn_embed = cls(config)
         senticnetgcn_embed.embed.weight.data.copy_(embedding_tensor)
         return senticnetgcn_embed
From f2506d8151d9ccc49205b379e6fa723941b257b1 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Tue, 28 Dec 2021 16:23:26 +0800
Subject: [PATCH 057/201] [#41] clean up and add SenticNetGCNBertPreprocessor
---
 sgnlp/models/senticnet_gcn/modeling.py     |   2 +-
 sgnlp/models/senticnet_gcn/preprocess.py   |  43 +++++++-
 sgnlp/models/senticnet_gcn/tokenization.py |   2 +-
 sgnlp/models/senticnet_gcn/utils.py        | 108 ---------------------
 4 files changed, 44 insertions(+), 111 deletions(-)

diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py
index e80d667..820ddcb 100644
--- a/sgnlp/models/senticnet_gcn/modeling.py
+++ b/sgnlp/models/senticnet_gcn/modeling.py
@@ -234,7 +234,7 @@ def _init_weights(self, module: nn.Module) -> None:
     pass
 
 
-class SenticNetGCNEmbeddingPreTrainedModel(SenticNetGCNEmbeddingPreTrainedModel):
+class SenticNetGCNEmbeddingModel(SenticNetGCNEmbeddingPreTrainedModel):
     def __init__(self, 
config: SenticNetGCNEmbeddingConfig):
         super().__init__()
         self.vocab_size = config.vocab_size
diff --git a/sgnlp/models/senticnet_gcn/preprocess.py b/sgnlp/models/senticnet_gcn/preprocess.py
index ec6ba2f..51a7571 100644
--- a/sgnlp/models/senticnet_gcn/preprocess.py
+++ b/sgnlp/models/senticnet_gcn/preprocess.py
@@ -3,15 +3,20 @@
 import torch
 from transformers import PreTrainedTokenizer
 from transformers.tokenization_utils_base import BatchEncoding
+from transformers import PreTrainedModel
 
-from tokenization import SenticNetGCNTokenizer
+from config import SenticNetGCNEmbeddingConfig, SenticNetGCNBertEmbeddingConfig
+from modeling import SenticNetGCNEmbeddingModel, SenticNetGCNBertEmbeddingModel
+from tokenization import SenticNetGCNTokenizer, SenticNetGCNBertTokenizer
 
 
 class SenticNetGCNPreprocessor:
     def __init__(
         self,
         tokenizer: PreTrainedTokenizer = None,
+        embedding_model: PreTrainedModel = None,
         tokenizer_name: str = None,
+        embedding_model_name: str = None,
         device: torch.device = torch.device("cpu"),
     ):
         self.device = device
@@ -20,6 +25,42 @@ def __init__(
         else:
             self.tokenizer = SenticNetGCNTokenizer.from_pretrained(tokenizer_name)
 
+        if embedding_model is not None:
+            self.embedding_model = embedding_model
+        else:
+            embedding_config = SenticNetGCNEmbeddingConfig.from_pretrained(embedding_model_name)
+            self.embedding_model = SenticNetGCNEmbeddingModel.from_pretrained(
+                embedding_model_name, config=embedding_config
+            ).to(device)
+
+    def __call__(self, data_batch: List[str]) -> BatchEncoding:
+        tokens = self.tokenizer(data_batch, padding=True, return_tensors="pt")
+        return tokens
+
+
+class SenticNetGCNBertPreprocessor:
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizer = None,
+        embedding_model: PreTrainedModel = None,
+        tokenizer_name: str = None,
+        embedding_model_name: str = None,
+        device: torch.device = torch.device("cpu"),
+    ):
+        self.device = device
+        if tokenizer is not None:
+            self.tokenizer = tokenizer
+        else:
+            self.tokenizer = SenticNetGCNBertTokenizer.from_pretrained(tokenizer_name)
+
+        if embedding_model is not None:
+            self.embedding_model = embedding_model
+        else:
+            embedding_config = SenticNetGCNBertEmbeddingConfig.from_pretrained(embedding_model_name)
+            self.embedding_model = SenticNetGCNBertEmbeddingModel.from_pretrained(
+                embedding_model_name, config=embedding_config
+            ).to(device)
+
     def __call__(self, data_batch: List[str]) -> BatchEncoding:
         tokens = self.tokenizer(data_batch, padding=True, return_tensors="pt")
         return tokens
diff --git a/sgnlp/models/senticnet_gcn/tokenization.py b/sgnlp/models/senticnet_gcn/tokenization.py
index 7a1cda3..86cf3c2 100644
--- a/sgnlp/models/senticnet_gcn/tokenization.py
+++ b/sgnlp/models/senticnet_gcn/tokenization.py
@@ -100,7 +100,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         return (str(vocab_file_path),)
 
 
-class SenticNetBertGCNTokenizer(BertTokenizer):
+class SenticNetGCNBertTokenizer(BertTokenizer):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
diff --git a/sgnlp/models/senticnet_gcn/utils.py b/sgnlp/models/senticnet_gcn/utils.py
index df8d96f..2f7d698 100644
--- a/sgnlp/models/senticnet_gcn/utils.py
+++ b/sgnlp/models/senticnet_gcn/utils.py
@@ -110,113 +110,6 @@ def build_embedding_matrix(
     return embedding_matrix
 
 
-class BucketIterator(object):
-    """
-    Bucket iterator class which provides sorting and padding for input dataset, iterate thru dataset batches
-    """
-
-    def __init__(self, data, batch_size: int, sort_key="text_indices", 
shuffle=True, sort=True): - self.shuffle = shuffle - self.sort = sort - self.sort_key = sort_key - self.batches = self.sort_and_pad(data, batch_size) - self.batch_len = len(self.batches) - - def sort_and_pad(self, data, batch_size: int) -> List[Dict[str, torch.tensor]]: - """ - Class method to sort and pad data batches - - Args: - data (ABSADataset): input data - batch_size (int): batch size - - Returns: - List[Dict[str, torch.tensor]]: return a list of dictionaries of tensors - """ - num_batch = int(math.ceil(len(data) / batch_size)) - sorted_data = sorted(data, key=lambda x: len(x[self.sort_key])) if self.sort else data - batches = [self.pad_data(sorted_data[i * batch_size : (i + 1) * batch_size]) for i in range(num_batch)] - return batches - - def pad_batch_encoding(self, data: BatchEncoding, max_len: int) -> BatchEncoding: - input_ids_pad = [0] * (max_len - len(data["input_ids"])) - token_type_ids_pad = input_ids_pad.copy() - attention_mask_pad = [1] * (max_len - len(data["attention_mask"])) - data["input_ids"] = torch.tensor(data["input_ids"] + input_ids_pad) - data["token_type_ids"] = torch.tensor(data["token_type_ids"] + token_type_ids_pad) - data["attention_mask_pad"] = torch.tensor(data["attention_mask"] + attention_mask_pad) - return data - - def pad_data(self, batch_data: List[Dict[str, Union[BatchEncoding, int, np.ndarray]]]) -> Dict[str, torch.tensor]: - """ - Class method to pad data batches - - Args: - batch_data (List[Dict[str, Union[BatchEncoding, int, np.ndarray]]]): List of dictionaries containing all batches of data - - Returns: - Dict[str, torch.tensor]: return dictionary of tensors from data batches - """ - batch_text_indices = [] - batch_context_indices = [] - batch_aspect_indices = [] - batch_left_indices = [] - batch_polarity = [] - batch_dependency_graph = [] - batch_dependency_tree = [] - max_len = max([len(t[self.sort_key]["input_ids"]) for t in batch_data]) - # [text_indices, context_indices, aspect_indices, left_indices, polarity, dependency_graph, dependency_tree] - for item in batch_data: - text_indices = item["text_indices"] - context_indices = item["context_indices"] - aspect_indices = item["aspect_indices"] - left_indices = item["left_indices"] - polarity = item["polarity"] - dependency_graph = item["dependency_graph"] - dependency_tree = item["dependency_tree"] - - batch_text_indices.append(self.pad_batch_encoding(text_indices, max_len)) - batch_context_indices.append(self.pad_batch_encoding(context_indices, max_len)) - batch_aspect_indices.append(self.pad_batch_encoding(aspect_indices, max_len)) - batch_left_indices.append(self.pad_batch_encoding(left_indices, max_len)) - batch_polarity.append(polarity) - batch_dependency_graph.append( - np.pad( - dependency_graph, - ( - (0, max_len - len(text_indices["input_ids"])), - (0, max_len - len(text_indices["input_ids"])), - ), - "constant", - ) - ) - batch_dependency_tree.append( - np.pad( - dependency_tree, - ( - (0, max_len - len(text_indices["input_ids"])), - (0, max_len - len(text_indices["input_ids"])), - ), - "constant", - ) - ) - return { - "text_indices": batch_text_indices, - "context_indices": batch_context_indices, - "aspect_indices": batch_aspect_indices, - "left_indices": batch_left_indices, - "polarity": batch_polarity, - "dependency_graph": torch.tensor(batch_dependency_graph), - "dependency_tree": torch.tensor(batch_dependency_tree), - } - - def __iter__(self): - if self.shuffle: - random.shuffle(self.batches) - for idx in range(self.batch_len): - yield self.batches[idx] - - class 
ABSADataset(object): """ Data class to hold dataset for training. @@ -285,7 +178,6 @@ def __read_data__(datasets: Dict[str, str], tokenizer: PreTrainedTokenizer): "left_indices": left_indices, "polarity": polarity, "dependency_graph": dependency_graph, - "dependency_tree": dependency_tree, } all_data.append(data) return all_data From f8118baf177681d9d53d7aac782a2fa31e1e0822 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 29 Dec 2021 09:51:52 +0800 Subject: [PATCH 058/201] [#41] remove unused attributes --- sgnlp/models/senticnet_gcn/modules/gcn.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/modules/gcn.py b/sgnlp/models/senticnet_gcn/modules/gcn.py index af6dc5b..63398d6 100644 --- a/sgnlp/models/senticnet_gcn/modules/gcn.py +++ b/sgnlp/models/senticnet_gcn/modules/gcn.py @@ -7,10 +7,8 @@ class GraphConvolution(nn.Module): Simple GCN Layer, similar to https://arxiv.org/abs/1609.02907 """ - def __init__(self, in_features, out_features, bias=True) -> None: + def __init__(self, in_features: torch.tensor, out_features: torch.tensor, bias=True) -> None: super(GraphConvolution, self).__init__() - self.in_features = in_features - self.out_features = out_features self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features)) if bias: self.bias = nn.Parameter(torch.FloatTensor(out_features)) From 68ac63fcd9942efb7e2756f54bd130128d5d4d73 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 29 Dec 2021 09:56:50 +0800 Subject: [PATCH 059/201] [#41] small clean up for type hints --- .../senticnet_gcn/modules/dynamic_rnn.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py b/sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py index 74b750e..5cd69d7 100644 --- a/sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py +++ b/sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py @@ -9,15 +9,15 @@ class DynamicLSTM(nn.Module): def __init__( self, - input_size, - hidden_size, - num_layers=1, - bias=True, - batch_first=True, - dropout=0, - bidirectional=False, - only_use_last_hidden_state=False, - rnn_type="LSTM", + input_size: int, + hidden_size: int, + num_layers: int = 1, + bias: bool = True, + batch_first: bool = True, + dropout: float = 0, + bidirectional: bool = False, + only_use_last_hidden_state: bool = False, + rnn_type: str = "LSTM", ) -> None: super(DynamicLSTM, self).__init__() self.input_size = input_size @@ -51,7 +51,7 @@ def __init_rnn(self) -> None: elif self.rnn_type == "RNN": self.rnn = nn.RNN(**input_args) - def forward(self, x, x_len, h0=None): + def forward(self, x: torch.tensor, x_len: torch.tensor, h0: torch.tensor = None) -> torch.tensor: # Sort x_sort_idx = torch.argsort(-x_len) x_unsort_idx = torch.argsort(x_sort_idx).long() From 5560661d040c356be96b2dc40cd94321743dbc31 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 29 Dec 2021 10:19:39 +0800 Subject: [PATCH 060/201] [#41] fix type hints, remove unused dataclass attributes --- sgnlp/models/senticnet_gcn/config.py | 6 ------ sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py | 2 +- sgnlp/models/senticnet_gcn/modules/gcn.py | 4 ++-- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/config.py b/sgnlp/models/senticnet_gcn/config.py index 282d084..d2fc645 100644 --- a/sgnlp/models/senticnet_gcn/config.py +++ b/sgnlp/models/senticnet_gcn/config.py @@ -8,7 +8,6 @@ class SenticNetGCNConfig(PreTrainedConfig): It is used to instantiate a SenticNetGCNModel 
network according to the specific arguments, defining the model architecture.
 
     Args:
-        vocab_size (:obj:`int`, defaults to 17662): Vocab size derived from combined Twitter and SemeVal14/15/16 datasets.
         embed_dim (:obj:`int`, defaults to 300): Embedding dimension size.
         hidden_dim (:obj:`int`, defaults to 300): Size of hidden dimension.
         dropout (:obj:`float`, defaults to 0.3): Droput percentage.
@@ -26,7 +25,6 @@ class SenticNetGCNConfig(PreTrainedConfig):
 
     def __init__(
         self,
-        vocab_size: int = 17662,
         embed_dim: int = 300,
         hidden_dim: int = 300,
         polarities_dim: int = 3,
@@ -36,7 +34,6 @@ def __init__(
         **kwargs
     ):
         super().__init__(**kwargs)
-        self.vocab_size = vocab_size
         self.embed_dim = embed_dim
         self.hidden_dim = hidden_dim
         self.dropout = dropout
@@ -51,7 +48,6 @@ class SenticNetGCNBertConfig(PreTrainedConfig):
     It is used to instantiate a SenticNetBertGCNModel network according to the specific arguments, defining the model architecture.
 
     Args:
-        bert_model (:obj:`str`, defaults to 'bert-base-uncased'): The Bert model type to initalized from transformers package.
         hidden_dim (:obj:`int`, defaults to 768): The embedding dimension size for the Bert model as well as GCN dimension.
         max_seq_len (:obj:`int`, defaults to 85): The max sequence length to pad and truncate.
         dropout (:obj:`float`, defaults to 0.3): Dropout percentage.
@@ -68,7 +64,6 @@ class SenticNetGCNBertConfig(PreTrainedConfig):
 
     def __init__(
         self,
-        bert_model: str = "bert-base-uncased",
         hidden_dim: int = 768,
         max_seq_len: int = 85,
         polarities_dim: int = 3,
@@ -78,7 +73,6 @@ def __init__(
         **kwargs
     ):
         super().__init__(**kwargs)
-        self.bert_model = bert_model
         self.hidden_dim = hidden_dim
         self.max_seq_len = max_seq_len
         self.dropout = dropout
diff --git a/sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py b/sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py
index 5cd69d7..76ce70b 100644
--- a/sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py
+++ b/sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py
@@ -51,7 +51,7 @@ def __init_rnn(self) -> None:
         elif self.rnn_type == "RNN":
             self.rnn = nn.RNN(**input_args)
 
-    def forward(self, x: torch.tensor, x_len: torch.tensor, h0: torch.tensor = None) -> torch.tensor:
+    def forward(self, x: torch.Tensor, x_len: torch.Tensor, h0: torch.Tensor = None) -> torch.Tensor:
         # Sort
         x_sort_idx = torch.argsort(-x_len)
         x_unsort_idx = torch.argsort(x_sort_idx).long()
diff --git a/sgnlp/models/senticnet_gcn/modules/gcn.py b/sgnlp/models/senticnet_gcn/modules/gcn.py
index 63398d6..618156e 100644
--- a/sgnlp/models/senticnet_gcn/modules/gcn.py
+++ b/sgnlp/models/senticnet_gcn/modules/gcn.py
@@ -7,7 +7,7 @@ class GraphConvolution(nn.Module):
     Simple GCN Layer, similar to https://arxiv.org/abs/1609.02907
     """
 
-    def __init__(self, in_features: torch.tensor, out_features: torch.tensor, bias=True) -> None:
+    def __init__(self, in_features: int, out_features: int, bias=True) -> None:
         super(GraphConvolution, self).__init__()
         self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
         if bias:
@@ -15,7 +15,7 @@ def __init__(self, in_features: torch.tensor, out_features: torch.tensor, bias=T
         else:
             self.register_parameter("bias", None)
 
-    def forward(self, text, adj):
+    def forward(self, text: torch.Tensor, adj: torch.Tensor):
         text = text.to(torch.float32)
         hidden = torch.matmul(text, self.weight)
         denom = torch.sum(adj, dim=2, keepdim=True) + 1
From 136a6a8dccf62e7209583d39f61a020907e2505d Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Wed, 29 Dec 2021 11:01:11 +0800
Subject: [PATCH 061/201] [#41] fix 
class name and flesh out class definitions --- sgnlp/models/senticnet_gcn/modeling.py | 81 ++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 4 deletions(-) diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/senticnet_gcn/modeling.py index 820ddcb..cf4819e 100644 --- a/sgnlp/models/senticnet_gcn/modeling.py +++ b/sgnlp/models/senticnet_gcn/modeling.py @@ -39,7 +39,11 @@ class SenticNetGCNModelOutput(ModelOutput): class SenticNetGCNPreTrainedModel(PreTrainedModel): """ - An abstract class to handle weights initialization and a simple interface for download and loading pretrained models. + The SenticNetGCN Pre-Trained Model used as base class for derived SenticNetGCN Model. + + This model is the abstract super class for the SenticNetGCN Model which defines the config + class types and weights initalization method. This class should not be used or instantiated directly, + see SenticNetGCNModel class for usage. """ config_class = SenticNetGCNConfig @@ -50,6 +54,20 @@ def _init_weights(self, module: nn.Module) -> None: class SenticNetGCNModel(SenticNetGCNPreTrainedModel): + """ + The SenticNetGCN Model for aspect based sentiment analysis. + + This method inherits from :obj:`SenticNetGCNPreTrainedModel` for weights initalization and utility functions + from transformer :obj:`PreTrainedModel` class. + + Args: + config (:obj:`~SenticNetGCNConfig`): + Model configuration class with all parameters required for the model. + Initializing with a config file does not load + the weights associated with the model, only the configuration. + Use the :obj:`.from_pretrained` method to load the model weights. + """ + def __init__(self, config: SenticNetGCNConfig) -> None: super().__init__(config) self.text_lstm = DynamicLSTM( @@ -147,7 +165,11 @@ class SenticNetGCNBertModelOutput(ModelOutput): class SenticNetGCNBertPreTrainedModel(PreTrainedModel): """ - An abstract class to handle weights initialization and a simple interface for download and loading pretrained models. + The SenticNetGCNBert Pre-Trained Model used as base class for derived SenticNetGCNBert Model. + + This model is the abstract super class for the SenticNetGCNBert Model which defines the config + class types and weights initalization method. This class should not be used or instantiated directly, + see SenticNetGCNBertModel class for usage. """ config_class = SenticNetGCNBertConfig @@ -157,7 +179,21 @@ def _init_weights(self, module: nn.Module) -> None: pass -class SenticNetGCNBertPModel(SenticNetGCNBertPreTrainedModel): +class SenticNetGCNBertModel(SenticNetGCNBertPreTrainedModel): + """ + The SenticNetGCNBert Model for aspect based sentiment analysis. + + This method inherits from :obj:`SenticNetGCNBertPreTrainedModel` for weights initalization and utility functions + from transformer :obj:`PreTrainedModel` class. + + Args: + config (:obj:`~SenticNetGCNBertConfig`): + Model configuration class with all parameters required for the model. + Initializing with a config file does not load + the weights associated with the model, only the configuration. + Use the :obj:`.from_pretrained` method to load the model weights. + """ + def __init__(self, config: SenticNetGCNBertConfig) -> None: super().__init__() self.gc1 = GraphConvolution(config.hidden_dim, config.hidden_dim) @@ -227,6 +263,14 @@ def forward(self, inputs, labels: torch.Tensor): class SenticNetGCNEmbeddingPreTrainedModel(PreTrainedModel): + """ + The SenticNetGCN Embedding Pre-Trained Model used as base class for derived SenticNetGCN Embedding Model. 
+ + This model is the abstract super class for the SenticNetGCN Embedding Model which defines the config + class types and weights initalization method. This class should not be used or instantiated directly, + see SenticNetGCNEmbeddingModel class for usage. + """ + config_class = SenticNetGCNEmbeddingConfig base_model_prefix = "senticnetgcnembedding" @@ -235,12 +279,29 @@ def _init_weights(self, module: nn.Module) -> None: class SenticNetGCNEmbeddingModel(SenticNetGCNEmbeddingPreTrainedModel): + """ + The SenticNetGCN Embedding Model used to generate embeddings for model inputs. + By default, the embeddings are generated from the glove.840B.300d embeddings. + + This class inherits from :obj:`SenticNetGCNEmbeddingPreTrainedModel` for weights initalization and utility functions + from transformers :obj:`PreTrainedModel` class. + + This class can also be constructed via the SenticNetGCNEmbeddingModel.build_embedding_matrix class method. + + Args: + config (:obj:`~SenticNetGCNEmbeddingConfig`): + Model configuration class with all parameters required for the model. + Initializing with a config file does not load + the weights associated with the model, only the configuration. + Use the :obj:`.from_pretrained` method to load the model weights. + """ + def __init__(self, config: SenticNetGCNEmbeddingConfig): super().__init__() self.vocab_size = config.vocab_size self.embed = nn.Embedding(config.vocab_size, config.embed_dim) - def load_pretrained_embedding(self, pretrained_embedding_path: Union[str, pathlib.Path]): + def load_pretrained_embedding(self, pretrained_embedding_path: Union[str, pathlib.Path]) -> None: with open(pretrained_embedding_path, "rb") as emb_f: embedding_matrix = pickle.load(emb_f) embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float) @@ -253,6 +314,18 @@ def build_embedding_matrix( vocab: dict[str, int], embed_dim: int = 300, ): + """ + This class method is a helper method to construct the embedding model from a file containing word vectors (i.e. GloVe) + and a vocab dictionary. + + Args: + word_vec_file_path (str): file path to the word vectors + vocab (dict[str, int]): vocab dictionary consisting of words as key and index as values + embed_dim (int, optional): the embedding dimension. Defaults to 300. 
+
+        Returns:
+            SenticNetGCNEmbeddingModel: return an instance of SenticNetGCNEmbeddingModel
+        """
         embedding_matrix = build_embedding_matrix(
             word_vec_file_path=word_vec_file_path, vocab=vocab, embed_dim=embed_dim
         )
From 7b9add3fd386e3604f414043128f257fde1235de Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 3 Jan 2022 15:58:21 +0800
Subject: [PATCH 062/201] [#41] remove unused imports, add download files
 helper methods
---
 sgnlp/models/senticnet_gcn/utils.py | 44 +++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/sgnlp/models/senticnet_gcn/utils.py b/sgnlp/models/senticnet_gcn/utils.py
index 2f7d698..1687a4e 100644
--- a/sgnlp/models/senticnet_gcn/utils.py
+++ b/sgnlp/models/senticnet_gcn/utils.py
@@ -1,10 +1,11 @@
 import argparse
 import json
-from logging import error
-import math
+import logging
 import pickle
 import random
 import pathlib
+import requests
+import urllib.parse
 from typing import Dict, List, Union
 
 import numpy as np
@@ -50,6 +51,45 @@ def set_random_seed(seed: int = 776) -> None:
     torch.backends.cudnn.benchmark = False
 
 
+def download_tokenizer_files(
+    base_url: str,
+    save_folder: str,
+    files: list[str] = ["special_tokens_map.json", "tokenizer_config.json", "vocab.pkl"],
+) -> None:
+    """
+    Helper method to download files from online storage.
+
+    Args:
+        base_url (str): Url string to storage folder.
+        save_folder (str): Local folder to save downloaded files. Folder will be created if it does not exist.
+    """
+    file_paths = [urllib.parse.urljoin(base_url, file_name) for file_name in files]
+    for file_path in file_paths:
+        download_url_file(file_path, save_folder)
+
+
+def download_url_file(url: str, save_folder: str) -> None:
+    """
+    Helper method to download and save url file.
+
+    Args:
+        url (str): Url of file to download.
+        save_folder (str): Folder to save downloaded file. Will be created if it does not exist.
+    """
+    save_folder_path = pathlib.Path(save_folder)
+    save_folder_path.mkdir(exist_ok=True)
+    fn_start_pos = url.rfind("/") + 1
+    file_name = url[fn_start_pos:]
+    save_file_path = save_folder_path.joinpath(file_name)
+    req = requests.get(url)
+    if req.status_code == requests.codes.ok:
+        with open(save_file_path, "wb") as f:
+            for data in req:
+                f.write(data)
+    else:
+        logging.error(f"Failed to request file from {url}.")
+
+
 def load_word_vec(word_vec_file_path: str, vocab: Dict[str, int], embed_dim: int = 300) -> Dict[str, np.asarray]:
     """
     Helper method to load word vectors from file (e.g. GloVe) for each word in vocab. 
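
The two helpers added in the patch above are meant to compose: download_tokenizer_files resolves each filename in files against base_url with urllib.parse.urljoin, then hands each resulting URL to download_url_file, which streams the response into save_folder. A minimal usage sketch of the intended call pattern; the storage URL and folder name below are illustrative placeholders, not real endpoints:

    from sgnlp.models.senticnet_gcn.utils import download_tokenizer_files

    # Hypothetical storage location for illustration only; substitute the actual
    # bucket hosting special_tokens_map.json, tokenizer_config.json and vocab.pkl.
    download_tokenizer_files(
        base_url="https://storage.example.com/senticnet_gcn/tokenizer/",
        save_folder="senticnet_gcn_tokenizer",
    )

Note that urljoin only appends the filename when base_url carries a trailing slash; without one, the final path segment of base_url is replaced instead.
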
From ed9962eb75506d0529eb0b2fdb1f67f7e3570372 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 3 Jan 2022 16:12:15 +0800 Subject: [PATCH 063/201] [#41] final rename to match original repo --- .../{senticnet_gcn => sentic_gcn}/__init__.py | 0 .../{senticnet_gcn => sentic_gcn}/config.py | 32 ++--- .../config/senticnet_gcn_config.json | 0 .../data_class.py | 8 +- .../{senticnet_gcn => sentic_gcn}/modeling.py | 112 +++++++++--------- .../modules/__init__.py | 0 .../modules/dynamic_rnn.py | 0 .../modules/gcn.py | 0 .../preprocess.py | 22 ++-- .../preprocess_dependency.py | 7 +- .../tokenization.py | 4 +- .../{senticnet_gcn => sentic_gcn}/train.py | 34 +++--- .../{senticnet_gcn => sentic_gcn}/utils.py | 0 13 files changed, 109 insertions(+), 110 deletions(-) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/__init__.py (100%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/config.py (72%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/config/senticnet_gcn_config.json (100%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/data_class.py (96%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/modeling.py (77%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/modules/__init__.py (100%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/modules/dynamic_rnn.py (100%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/modules/gcn.py (100%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/preprocess.py (67%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/preprocess_dependency.py (94%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/tokenization.py (97%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/train.py (87%) rename sgnlp/models/{senticnet_gcn => sentic_gcn}/utils.py (100%) diff --git a/sgnlp/models/senticnet_gcn/__init__.py b/sgnlp/models/sentic_gcn/__init__.py similarity index 100% rename from sgnlp/models/senticnet_gcn/__init__.py rename to sgnlp/models/sentic_gcn/__init__.py diff --git a/sgnlp/models/senticnet_gcn/config.py b/sgnlp/models/sentic_gcn/config.py similarity index 72% rename from sgnlp/models/senticnet_gcn/config.py rename to sgnlp/models/sentic_gcn/config.py index d2fc645..0d4bdb5 100644 --- a/sgnlp/models/senticnet_gcn/config.py +++ b/sgnlp/models/sentic_gcn/config.py @@ -1,11 +1,11 @@ from transformers import PreTrainedConfig, BertConfig -class SenticNetGCNConfig(PreTrainedConfig): +class SenticGCNConfig(PreTrainedConfig): """ This is the configuration class to store the configuration of a - :class:`~sgnlp.models.senticnet_gcn.modeling.SenticNetGCNModel`. - It is used to instantiate a SenticNetGCNModel network according to the specific arguments, defining the model architecture. + :class:`~sgnlp.models.sentic_gcn.modeling.SenticGCNModel`. + It is used to instantiate a SenticGCNModel network according to the specific arguments, defining the model architecture. Args: embed_dim (:obj:`int`, defaults to 300): Embedding dimension size. @@ -17,10 +17,10 @@ class SenticNetGCNConfig(PreTrainedConfig): Example: - from sgnlp.models.senticnet_gcn import SenticNetGCNConfig + from sgnlp.models.sentic_gcn import SenticGCNConfig # Initialize with default values - config = SenticNetGCNConfig() + config = SenticGCNConfig() """ def __init__( @@ -42,10 +42,10 @@ def __init__( self.loss_function = loss_function -class SenticNetGCNBertConfig(PreTrainedConfig): +class SenticGCNBertConfig(PreTrainedConfig): """ - This is the configuration class to store the configuration of a :class:`~sgnlp.models.senticnet_gcn.modeling.SenticNetBertGCNModel`. 
- It is used to instantiate a SenticNetBertGCNModel network according to the specific arguments, defining the model architecture. + This is the configuration class to store the configuration of a :class:`~sgnlp.models.sentic_gcn.modeling.SenticBertGCNModel`. + It is used to instantiate a SenticBertGCNModel network according to the specific arguments, defining the model architecture. Args: hidden_dim (:obj:`int`, defaults to 768): The embedding dimension size for the Bert model as well as GCN dimension. @@ -56,10 +56,10 @@ class SenticNetGCNBertConfig(PreTrainedConfig): loss_function (:obj:`str`, defaults to 'cross_entropy'): Loss function for training/eval. Example: - from sgnlp.models.senticnet_gcn import SenticNetGCNBertConfig + from sgnlp.models.sentic_gcn import SenticGCNBertConfig # Initialize with default values - config = SenticNetGCNBertConfig() + config = SenticGCNBertConfig() """ def __init__( @@ -81,10 +81,10 @@ def __init__( self.loss_function = loss_function -class SenticNetGCNEmbeddingConfig(PreTrainedConfig): +class SenticGCNEmbeddingConfig(PreTrainedConfig): """ - This is the configuration class to store the configuration of a :class:`~SenticNetGCNEmbeddingModel`. - It is used to instantiate a SenticNetGCN Embedding model according to the specified arguments, defining the model architecture. + This is the configuration class to store the configuration of a :class:`~SenticGCNEmbeddingModel`. + It is used to instantiate a SenticGCN Embedding model according to the specified arguments, defining the model architecture. Args: PreTrainedConfig (:obj:`PretrainedConfig`): transformer :obj:`PreTrainedConfig` base class @@ -96,10 +96,10 @@ def __init__(self, vocab_size: int = 17662, embed_dim: int = 300, **kwargs) -> N self.embed_dim = embed_dim -class SenticNetGCNBertEmbeddingConfig(BertConfig): +class SenticGCNBertEmbeddingConfig(BertConfig): """ - This is the configuration class to store the configuration of a :class:`~SenticNetGCNBertEmbeddingModel`. - It is used to instantiate a SenticNetGCN Bert Embedding model according to the specified arguments, defining the model architecture. + This is the configuration class to store the configuration of a :class:`~SenticGCNBertEmbeddingModel`. + It is used to instantiate a SenticGCN Bert Embedding model according to the specified arguments, defining the model architecture. 
Args: BertConfig (:obj:`BertConfig`): transformer :obj:`BertConfig` base class diff --git a/sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json similarity index 100% rename from sgnlp/models/senticnet_gcn/config/senticnet_gcn_config.json rename to sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json diff --git a/sgnlp/models/senticnet_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py similarity index 96% rename from sgnlp/models/senticnet_gcn/data_class.py rename to sgnlp/models/sentic_gcn/data_class.py index 0d6b349..6ffd55b 100644 --- a/sgnlp/models/senticnet_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -3,10 +3,10 @@ @dataclass -class SenticNetGCNTrainArgs: - model: str = field(default="senticnetgcn", metadata={"help": "Option to choose which model to train."}) +class SenticGCNTrainArgs: + model: str = field(default="senticgcn", metadata={"help": "Option to choose which model to train."}) tokenizer: str = field( - default="senticnetgcn", + default="senticgcn", metadata={"help": "Option to choose which tokenizer to use for training preprocessing."}, ) senticnet_word_file_path: str = field( @@ -97,7 +97,7 @@ class SenticNetGCNTrainArgs: ) def __post_init__(self): - assert self.model in ["senticnetgcn", "senticnetgcnbert"] + assert self.model in ["senticgcn", "senticgcnbert"] assert self.initializer in [ "xavier_uniform", "xavier_uniform", diff --git a/sgnlp/models/senticnet_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py similarity index 77% rename from sgnlp/models/senticnet_gcn/modeling.py rename to sgnlp/models/sentic_gcn/modeling.py index cf4819e..43084e7 100644 --- a/sgnlp/models/senticnet_gcn/modeling.py +++ b/sgnlp/models/sentic_gcn/modeling.py @@ -13,22 +13,22 @@ from .modules.dynamic_rnn import DynamicLSTM from .modules.gcn import GraphConvolution from .config import ( - SenticNetGCNConfig, - SenticNetGCNBertConfig, - SenticNetGCNEmbeddingConfig, - SenticNetGCNBertEmbeddingConfig, + SenticGCNConfig, + SenticGCNBertConfig, + SenticGCNEmbeddingConfig, + SenticGCNBertEmbeddingConfig, ) from .utils import build_embedding_matrix @dataclass -class SenticNetGCNModelOutput(ModelOutput): +class SenticGCNModelOutput(ModelOutput): """ - Base class for outputs of SenticNetGCNModel. + Base class for outputs of SenticGCNModel. Args: loss (:obj:`torch.Tensor` of shape `(1,)`, `optional`, return when :obj:`labels` is provided): - classification loss, typically cross entropy. Loss function used is dependent on what is specified in SenticNetGCNConfig. + classification loss, typically cross entropy. Loss function used is dependent on what is specified in SenticGCNConfig. logits (:obj:`torch.Tensor` of shape :obj:`(batch_size, num_classes)`): raw logits for each class. num_classes = 3 by default. """ @@ -37,38 +37,38 @@ class SenticNetGCNModelOutput(ModelOutput): logits: torch.Tensor = None -class SenticNetGCNPreTrainedModel(PreTrainedModel): +class SenticGCNPreTrainedModel(PreTrainedModel): """ - The SenticNetGCN Pre-Trained Model used as base class for derived SenticNetGCN Model. + The SenticGCN Pre-Trained Model used as base class for derived SenticGCN Model. - This model is the abstract super class for the SenticNetGCN Model which defines the config + This model is the abstract super class for the SenticGCN Model which defines the config class types and weights initalization method. This class should not be used or instantiated directly, - see SenticNetGCNModel class for usage. 
+ see SenticGCNModel class for usage. """ - config_class = SenticNetGCNConfig - base_model_prefix = "senticnetgcn" + config_class = SenticGCNConfig + base_model_prefix = "senticgcn" def _init_weights(self, module: nn.Module) -> None: pass -class SenticNetGCNModel(SenticNetGCNPreTrainedModel): +class SenticGCNModel(SenticGCNPreTrainedModel): """ - The SenticNetGCN Model for aspect based sentiment analysis. + The SenticGCN Model for aspect based sentiment analysis. - This method inherits from :obj:`SenticNetGCNPreTrainedModel` for weights initalization and utility functions + This method inherits from :obj:`SenticGCNPreTrainedModel` for weights initalization and utility functions from transformer :obj:`PreTrainedModel` class. Args: - config (:obj:`~SenticNetGCNConfig`): + config (:obj:`~SenticGCNConfig`): Model configuration class with all parameters required for the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Use the :obj:`.from_pretrained` method to load the model weights. """ - def __init__(self, config: SenticNetGCNConfig) -> None: + def __init__(self, config: SenticGCNConfig) -> None: super().__init__(config) self.text_lstm = DynamicLSTM( config.embed_dim, @@ -120,7 +120,7 @@ def mask(self, x, aspect_double_idx): def forward( self, inputs: dict[str, torch.Tensor], labels: Optional[torch.Tensor] = None - ) -> SenticNetGCNModelOutput: + ) -> SenticGCNModelOutput: text_indices, aspect_indices, left_indices, adj = inputs text_len = torch.sum(text_indices != 0, dim=-1) aspect_len = torch.sum(aspect_indices != 0, dim=-1) @@ -143,18 +143,18 @@ def forward( logits = self.fc(x) loss = self.loss_function(logits, labels) if labels is not None else None - return SenticNetGCNModelOutput(loss=loss, logits=logits) + return SenticGCNModelOutput(loss=loss, logits=logits) @dataclass -class SenticNetGCNBertModelOutput(ModelOutput): +class SenticGCNBertModelOutput(ModelOutput): """ - Base class for outputs of SenticNetGCNBertModel. + Base class for outputs of SenticGCNBertModel. Args: loss (:obj:`torch.Tensor` of shape `(1,)`, `optional`, return when :obj:`labels` is provided): classification loss, typically cross entropy. - Loss function used is dependent on what is specified in SenticNetGCNBertConfig. + Loss function used is dependent on what is specified in SenticGCNBertConfig. logits (:obj:`torch.Tensor` of shape :obj:`(batch_size, num_classes)`): raw logits for each class. num_classes = 3 by default. """ @@ -163,38 +163,38 @@ class SenticNetGCNBertModelOutput(ModelOutput): logits: torch.Tensor = None -class SenticNetGCNBertPreTrainedModel(PreTrainedModel): +class SenticGCNBertPreTrainedModel(PreTrainedModel): """ - The SenticNetGCNBert Pre-Trained Model used as base class for derived SenticNetGCNBert Model. + The SenticGCNBert Pre-Trained Model used as base class for derived SenticGCNBert Model. - This model is the abstract super class for the SenticNetGCNBert Model which defines the config + This model is the abstract super class for the SenticGCNBert Model which defines the config class types and weights initalization method. This class should not be used or instantiated directly, - see SenticNetGCNBertModel class for usage. + see SenticGCNBertModel class for usage. 
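The forward pass above recovers true sequence lengths from zero-padded index tensors with `torch.sum(indices != 0, dim=-1)` before masking. A self-contained sketch of that idiom on toy tensors::

    import torch

    # Two zero-padded sequences of token indices (index 0 is padding)
    text_indices = torch.tensor([[12, 45, 7, 0, 0],
                                 [3, 9, 0, 0, 0]])
    text_len = torch.sum(text_indices != 0, dim=-1)
    # text_len -> tensor([3, 2])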
""" - config_class = SenticNetGCNBertConfig - base_model_prefix = "senticnetgcnbert" + config_class = SenticGCNBertConfig + base_model_prefix = "senticgcnbert" def _init_weights(self, module: nn.Module) -> None: pass -class SenticNetGCNBertModel(SenticNetGCNBertPreTrainedModel): +class SenticGCNBertModel(SenticGCNBertPreTrainedModel): """ - The SenticNetGCNBert Model for aspect based sentiment analysis. + The SenticGCNBert Model for aspect based sentiment analysis. - This method inherits from :obj:`SenticNetGCNBertPreTrainedModel` for weights initalization and utility functions + This method inherits from :obj:`SenticGCNBertPreTrainedModel` for weights initalization and utility functions from transformer :obj:`PreTrainedModel` class. Args: - config (:obj:`~SenticNetGCNBertConfig`): + config (:obj:`~SenticGCNBertConfig`): Model configuration class with all parameters required for the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Use the :obj:`.from_pretrained` method to load the model weights. """ - def __init__(self, config: SenticNetGCNBertConfig) -> None: + def __init__(self, config: SenticGCNBertConfig) -> None: super().__init__() self.gc1 = GraphConvolution(config.hidden_dim, config.hidden_dim) self.gc2 = GraphConvolution(config.hidden_dim, config.hidden_dim) @@ -259,44 +259,44 @@ def forward(self, inputs, labels: torch.Tensor): logits = self.fc(x) loss = self.loss_function(logits, labels) if labels is not None else None - return SenticNetGCNBertModelOutput(loss=loss, logits=logits) + return SenticGCNBertModelOutput(loss=loss, logits=logits) -class SenticNetGCNEmbeddingPreTrainedModel(PreTrainedModel): +class SenticGCNEmbeddingPreTrainedModel(PreTrainedModel): """ - The SenticNetGCN Embedding Pre-Trained Model used as base class for derived SenticNetGCN Embedding Model. + The SenticGCN Embedding Pre-Trained Model used as base class for derived SenticGCN Embedding Model. - This model is the abstract super class for the SenticNetGCN Embedding Model which defines the config + This model is the abstract super class for the SenticGCN Embedding Model which defines the config class types and weights initalization method. This class should not be used or instantiated directly, - see SenticNetGCNEmbeddingModel class for usage. + see SenticGCNEmbeddingModel class for usage. """ - config_class = SenticNetGCNEmbeddingConfig - base_model_prefix = "senticnetgcnembedding" + config_class = SenticGCNEmbeddingConfig + base_model_prefix = "senticgcnembedding" def _init_weights(self, module: nn.Module) -> None: pass -class SenticNetGCNEmbeddingModel(SenticNetGCNEmbeddingPreTrainedModel): +class SenticGCNEmbeddingModel(SenticGCNEmbeddingPreTrainedModel): """ - The SenticNetGCN Embedding Model used to generate embeddings for model inputs. + The SenticGCN Embedding Model used to generate embeddings for model inputs. By default, the embeddings are generated from the glove.840B.300d embeddings. - This class inherits from :obj:`SenticNetGCNEmbeddingPreTrainedModel` for weights initalization and utility functions + This class inherits from :obj:`SenticGCNEmbeddingPreTrainedModel` for weights initalization and utility functions from transformers :obj:`PreTrainedModel` class. - This class can also be constructed via the SenticNetGCNEmbeddingModel.build_embedding_matrix class method. + This class can also be constructed via the SenticGCNEmbeddingModel.build_embedding_matrix class method. 
    Args:
-        config (:obj:`~SenticNetGCNEmbeddingConfig`):
+        config (:obj:`~SenticGCNEmbeddingConfig`):
             Model configuration class with all parameters required for the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
             Use the :obj:`.from_pretrained` method to load the model weights.
     """

-    def __init__(self, config: SenticNetGCNEmbeddingConfig):
+    def __init__(self, config: SenticGCNEmbeddingConfig):
         super().__init__()
         self.vocab_size = config.vocab_size
         self.embed = nn.Embedding(config.vocab_size, config.embed_dim)
@@ -324,32 +324,32 @@ def build_embedding_matrix(
             embed_dim (int, optional): the embedding dimension. Defaults to 300.

         Returns:
-            SenticNetGCNEmbeddingModel: return an instance of SenticNetGCNEmbeddingModel
+            SenticGCNEmbeddingModel: return an instance of SenticGCNEmbeddingModel
         """
         embedding_matrix = build_embedding_matrix(
             word_vec_file_path=word_vec_file_path, vocab=vocab, embed_dim=embed_dim
         )
         embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float)
-        config = SenticNetGCNEmbeddingConfig(vocab_size=vocab, embed_dim=embed_dim)
-        senticnetgcn_embed = cls(config)
-        senticnetgcn_embed.embed.weight.data.copy_(embedding_tensor)
-        return senticnetgcn_embed
+        config = SenticGCNEmbeddingConfig(vocab_size=len(vocab), embed_dim=embed_dim)
+        senticgcn_embed = cls(config)
+        senticgcn_embed.embed.weight.data.copy_(embedding_tensor)
+        return senticgcn_embed


-class SenticNetGCNBertEmbeddingModel(BertModel):
+class SenticGCNBertEmbeddingModel(BertModel):
     """
-    The SenticNetGCN Bert Embedding Model used to generate embeddings for model inputs.
+    The SenticGCN Bert Embedding Model used to generate embeddings for model inputs.

     This class inherits from :obj:`BertModel` for weights initialization and utility functions
     from transformers :obj:`PreTrainedModel` class.

     Args:
-        config (:obj:`~SenticNetGCNBertEmbeddingConfig`):
+        config (:obj:`~SenticGCNBertEmbeddingConfig`):
             Model configuration class with all parameters required for the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
             Use the :obj:`.from_pretrained` method to load the model weights.
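A hedged sketch of building the embedding model through the class method above; the GloVe file path and the tiny vocab are placeholders, not artifacts from this patch::

    from sgnlp.models.sentic_gcn.modeling import SenticGCNEmbeddingModel

    vocab = {"<pad>": 0, "food": 1, "service": 2}  # word -> index mapping
    embed_model = SenticGCNEmbeddingModel.build_embedding_matrix(
        word_vec_file_path="glove/glove.840B.300d.txt",  # placeholder path
        vocab=vocab,
        embed_dim=300,
    )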
""" - def __init__(self, config: SenticNetGCNBertEmbeddingConfig): + def __init__(self, config: SenticGCNBertEmbeddingConfig): super().__init__(config) diff --git a/sgnlp/models/senticnet_gcn/modules/__init__.py b/sgnlp/models/sentic_gcn/modules/__init__.py similarity index 100% rename from sgnlp/models/senticnet_gcn/modules/__init__.py rename to sgnlp/models/sentic_gcn/modules/__init__.py diff --git a/sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py b/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py similarity index 100% rename from sgnlp/models/senticnet_gcn/modules/dynamic_rnn.py rename to sgnlp/models/sentic_gcn/modules/dynamic_rnn.py diff --git a/sgnlp/models/senticnet_gcn/modules/gcn.py b/sgnlp/models/sentic_gcn/modules/gcn.py similarity index 100% rename from sgnlp/models/senticnet_gcn/modules/gcn.py rename to sgnlp/models/sentic_gcn/modules/gcn.py diff --git a/sgnlp/models/senticnet_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py similarity index 67% rename from sgnlp/models/senticnet_gcn/preprocess.py rename to sgnlp/models/sentic_gcn/preprocess.py index 51a7571..e478dfc 100644 --- a/sgnlp/models/senticnet_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -5,12 +5,12 @@ from transformers.tokenization_utils_base import BatchEncoding from transformers.utils.dummy_pt_objects import PreTrainedModel -from config import SenticNetGCNEmbeddingConfig, SenticNetGCNBertEmbeddingConfig -from modeling import SenticNetGCNEmbeddingModel, SenticNetGCNBertEmbeddingModel -from tokenization import SenticNetGCNTokenizer, SenticNetGCNBertTokenizer +from config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig +from modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel +from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer -class SenticNetGCNPreprocessor: +class SenticGCNPreprocessor: def __init__( self, tokenizer: PreTrainedTokenizer = None, @@ -23,13 +23,13 @@ def __init__( if tokenizer is not None: self.tokenizer = tokenizer else: - self.tokenizer = SenticNetGCNTokenizer.from_pretrained(tokenizer_name) + self.tokenizer = SenticGCNTokenizer.from_pretrained(tokenizer_name) if embedding_model is not None: self.embedding_model = embedding_model else: - embedding_config = SenticNetGCNEmbeddingConfig.from_pretrained(embedding_model_name) - self.embedding_model = SenticNetGCNEmbeddingModel.from_pretrained( + embedding_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model_name) + self.embedding_model = SenticGCNEmbeddingModel.from_pretrained( embedding_model_name, config=embedding_config ).to(device) @@ -38,7 +38,7 @@ def __call__(self, data_batch: List[str]) -> BatchEncoding: return tokens -class SenticNetGCNBertPreprocessor: +class SenticGCNBertPreprocessor: def __init__( self, tokenizer: PreTrainedTokenizer = None, @@ -51,13 +51,13 @@ def __init__( if tokenizer is not None: self.tokenizer = tokenizer else: - self.tokenizer = SenticNetGCNBertTokenizer.from_pretrained(tokenizer_name) + self.tokenizer = SenticGCNBertTokenizer.from_pretrained(tokenizer_name) if embedding_model is not None: self.embedding_model = embedding_model else: - embedding_config = SenticNetGCNBertEmbeddingConfig.from_pretrained(embedding_model_name) - self.embedding_model = SenticNetGCNBertEmbeddingModel.from_pretrained( + embedding_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model_name) + self.embedding_model = SenticGCNBertEmbeddingModel.from_pretrained( embedding_model_name, config=embedding_config ).to(device) diff --git 
a/sgnlp/models/senticnet_gcn/preprocess_dependency.py b/sgnlp/models/sentic_gcn/preprocess_dependency.py similarity index 94% rename from sgnlp/models/senticnet_gcn/preprocess_dependency.py rename to sgnlp/models/sentic_gcn/preprocess_dependency.py index 687d731..465711e 100644 --- a/sgnlp/models/senticnet_gcn/preprocess_dependency.py +++ b/sgnlp/models/sentic_gcn/preprocess_dependency.py @@ -4,14 +4,14 @@ import pickle from utils import parse_args_and_load_config -from data_class import SenticNetGCNTrainArgs +from data_class import SenticGCNTrainArgs class DependencyProcessor: - def __init__(self, config: SenticNetGCNTrainArgs): + def __init__(self, config: SenticGCNTrainArgs): self.config = config self.nlp = spacy.load(config.spacy_pipeline) - self.senticnet = self._load_senticnet(config.senticnet_word_file_path) + self.sentic = self._load_sentic(config.sentic_word_file_path) self.dataset_keys = ["raw"] def _load_senticnet(self, senticnet_file_path: str): @@ -81,4 +81,3 @@ def process(self): for key, func in dependency_keys_map.items(): if not dataset[key]: self._process_file(dataset["raw"], dataset[key], func) - diff --git a/sgnlp/models/senticnet_gcn/tokenization.py b/sgnlp/models/sentic_gcn/tokenization.py similarity index 97% rename from sgnlp/models/senticnet_gcn/tokenization.py rename to sgnlp/models/sentic_gcn/tokenization.py index 86cf3c2..5db79b3 100644 --- a/sgnlp/models/senticnet_gcn/tokenization.py +++ b/sgnlp/models/sentic_gcn/tokenization.py @@ -8,7 +8,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.pkl"} -class SenticNetGCNTokenizer(PreTrainedTokenizer): +class SenticGCNTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES def __init__( @@ -100,7 +100,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (str(vocab_file_path),) -class SenticNetGCNBertTokenizer(BertTokenizer): +class SenticGCNBertTokenizer(BertTokenizer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/sgnlp/models/senticnet_gcn/train.py b/sgnlp/models/sentic_gcn/train.py similarity index 87% rename from sgnlp/models/senticnet_gcn/train.py rename to sgnlp/models/sentic_gcn/train.py index 88989ea..a38e57d 100644 --- a/sgnlp/models/senticnet_gcn/train.py +++ b/sgnlp/models/sentic_gcn/train.py @@ -8,20 +8,18 @@ import torch import torch.nn as nn import torch.optim as optim -from transformers.tokenization_utils import PreTrainedTokenizer -from data_class import SenticNetGCNTrainArgs -from modeling import SenticNetBertGCNPreTrainedModel -from preprocess_dependency import DependencyProcessor -from tokenization import SenticNetGCNTokenizer, SenticNetBertGCNTokenizer +from data_class import SenticGCNTrainArgs +from modeling import SenticGCNBertPreTrainedModel +from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer from utils import parse_args_and_load_config, set_random_seed, ABSADatasetReader logging.basicConfig(level=logging.DEBUG) -class SenticNetGCNBaseTrainer: - def __init__(self, config: SenticNetGCNTrainArgs): +class SenticGCNBaseTrainer: + def __init__(self, config: SenticGCNTrainArgs): self.config = config self.global_max_acc = 0.0 self.global_max_f1 = 0.0 @@ -31,7 +29,7 @@ def __init__(self, config: SenticNetGCNTrainArgs): else torch.device(self.config.device) ) tokenizer = self._create_tokenizer() - # self.dataloader + # self.dataloader if config.save_state_dict: self.save_state_dict_folder = pathlib.Path(self.config.saved_state_dict_folder_path) self.save_state_dict_folder.mkdir(exist_ok=True) @@ -57,9 
+55,11 @@ def _create_optimizer(self):
         return optimizers[self.config.optimizer]

     def _create_tokenizer(self):
-        self.tokenizer = SenticNetBertGCNTokenizer.from_pretrained(self.config.tokenizer) \
-            if self.config.model == 'senticnetgcn' \
-            else SenticNetBertGCNTokenizer.from_pretrained(self.config.tokenizer)
+        self.tokenizer = (
+            SenticGCNTokenizer.from_pretrained(self.config.tokenizer)
+            if self.config.model == "senticgcn"
+            else SenticGCNBertTokenizer.from_pretrained(self.config.tokenizer)
+        )

     def _reset_params(self):
         raise NotImplementedError("Please call from derived class only.")
@@ -184,13 +184,13 @@ def train(self):
         )


-class SenticNetBertGCNTrainer(SenticNetGCNBaseTrainer):
-    def __init__(self, config: SenticNetGCNTrainArgs):
+class SenticBertGCNTrainer(SenticGCNBaseTrainer):
+    def __init__(self, config: SenticGCNTrainArgs):
         self.config = config

     def _reset_params(self):
         for child in self.model.children():
-            if type(child) != SenticNetBertGCNPreTrainedModel:
+            if type(child) != SenticGCNBertPreTrainedModel:
                 for param in child.parameters():
                     if param.requires_grad:
                         if len(param.shape) > 1:
@@ -200,8 +200,8 @@ def _reset_params(self):
                             nn.init.uniform_(param, a=-stdv, b=stdv)


-class SenticNetGCNTrainer(SenticNetGCNBaseTrainer):
-    def __init__(self, config: SenticNetGCNTrainArgs):
+class SenticGCNTrainer(SenticGCNBaseTrainer):
+    def __init__(self, config: SenticGCNTrainArgs):
         self.config = config

     def _reset_params(self):
@@ -218,5 +218,5 @@ def _reset_params(self):
     cfg = parse_args_and_load_config()
     if cfg.seed is not None:
         set_random_seed(cfg.seed)
-    trainer = SenticNetGCNTrainer(cfg) if cfg.model == "senticgcn" else SenticNetBertGCNTrainer(cfg)
+    trainer = SenticGCNTrainer(cfg) if cfg.model == "senticgcn" else SenticBertGCNTrainer(cfg)
     trainer.train()
diff --git a/sgnlp/models/senticnet_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py
similarity index 100%
rename from sgnlp/models/senticnet_gcn/utils.py
rename to sgnlp/models/sentic_gcn/utils.py

From 0ef2d392ac4a4b3cbf726d6593194d97d186aad8 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 3 Jan 2022 17:07:54 +0800
Subject: [PATCH 064/201] [#41] add padding option for bert tokenizer to pad up to max length to match original implementation

---
 sgnlp/models/sentic_gcn/tokenization.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sgnlp/models/sentic_gcn/tokenization.py b/sgnlp/models/sentic_gcn/tokenization.py
index 5db79b3..49731f9 100644
--- a/sgnlp/models/sentic_gcn/tokenization.py
+++ b/sgnlp/models/sentic_gcn/tokenization.py
@@ -105,5 +105,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

     def __call__(self, text, max_length: int = 85, **kwargs):
-        encoding = super().__call__(text, add_special_tokens=False, truncation=True, max_length=max_length, **kwargs)
+        encoding = super().__call__(
+            text, add_special_tokens=False, padding="max_length", truncation=True, max_length=max_length, **kwargs
+        )
         return encoding

From 168fe8343a53d4b4cbef66a05b1794f3f54aef9c Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 3 Jan 2022 18:39:51 +0800
Subject: [PATCH 065/201] [#41] add docstring and add input arguments for method call

---
 sgnlp/models/sentic_gcn/tokenization.py | 47 +++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/tokenization.py b/sgnlp/models/sentic_gcn/tokenization.py
index 49731f9..34ea9f9 100644
--- a/sgnlp/models/sentic_gcn/tokenization.py
+++ b/sgnlp/models/sentic_gcn/tokenization.py
@@ -9,6 +9,19 @@ class
SenticGCNTokenizer(PreTrainedTokenizer):
+    """
+    The SenticGCN tokenizer class used to generate tokens for the embedding model.
+
+    Args:
+        text (:obj:`str`):
+            input text string to tokenize
+
+    Example::
+            tokenizer = SenticGCNTokenizer.from_pretrained("senticgcn")
+            inputs = tokenizer('Hello World!')
+            inputs['input_ids']
+    """
+
     vocab_files_names = VOCAB_FILES_NAMES

     def __init__(
@@ -101,11 +114,41 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =


 class SenticGCNBertTokenizer(BertTokenizer):
+    """
+    The SenticGCN Bert tokenizer class used to generate tokens for the embedding model, derived from the BertTokenizer class.
+
+    Args:
+        text (:obj:`str`):
+            input text string to tokenize
+
+    Example::
+            tokenizer = SenticGCNBertTokenizer.from_pretrained('bert-base-uncased')
+            inputs = tokenizer('Hello World!')
+            inputs['input_ids']
+    """
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def __call__(self, text, max_length: int = 85, **kwargs):
+    def __call__(
+        self,
+        text,
+        max_length: int = 85,
+        add_special_tokens: bool = False,
+        padding: bool = True,
+        truncation: bool = True,
+        return_token_type_ids: bool = False,
+        return_attention_mask: bool = False,
+        **kwargs,
+    ):
         encoding = super().__call__(
-            text, add_special_tokens=False, padding="max_length", truncation=True, max_length=max_length, **kwargs
+            text,
+            max_length=max_length,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            **kwargs,
         )
         return encoding

From 28e7dd5c450e48f859ac93e788fb290ea0dcd943 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 3 Jan 2022 20:54:02 +0800
Subject: [PATCH 066/201] [#41] fix wrong class name for config

---
 sgnlp/models/sentic_gcn/utils.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py
index 1687a4e..f11f00e 100644
--- a/sgnlp/models/sentic_gcn/utils.py
+++ b/sgnlp/models/sentic_gcn/utils.py
@@ -14,16 +14,16 @@
 from transformers import PreTrainedTokenizer
 from transformers.tokenization_utils_base import BatchEncoding

-from data_class import SenticNetGCNTrainArgs
+from data_class import SenticGCNTrainArgs


 def parse_args_and_load_config(
     config_path: str = "config/senticnet_gcn_config.json",
-) -> SenticNetGCNTrainArgs:
+) -> SenticGCNTrainArgs:
     """Get config from config file using argparser

     Returns:
-        SenticNetGCNTrainArgs: SenticNetGCNTrainArgs instance populated from config
+        SenticGCNTrainArgs: SenticGCNTrainArgs instance populated from config
     """
     parser = argparse.ArgumentParser(description="SenticASGCN Training")
     parser.add_argument("--config", type=str, default=config_path)
@@ -33,7 +33,7 @@
     with open(cfg_path, "r") as cfg_file:
         cfg = json.load(cfg_file)

-    sentic_asgcn_args = SenticNetGCNTrainArgs(**cfg)
+    sentic_asgcn_args = SenticGCNTrainArgs(**cfg)
     return sentic_asgcn_args


@@ -65,7 +65,7 @@
     """
     file_paths = [urllib.parse.urljoin(base_url, file_name) for file_name in files]
     for file_path in file_paths:
-        pass
+        download_url_file(file_path, save_folder)


 def download_url_file(url: str, save_folder: str) -> None:
@@ -165,10 +165,27 @@ def __len__(self):
         return len(self.data)


+def generate_senticgcn_dataset(cfg: SenticGCNTrainArgs) -> dict[str, torch.Tensor]:
+    # TODO: add senticgcn dataset prep
+    pass
+
+
+def
generate_senticgcn_bert_dataset(cfg: SenticGCNTrainArgs) -> dict[str, torch.Tensor]:
+    # TODO: add senticgcn bert dataset prep
+    pass
+
+
+def generate_train_val_dataset(cfg: SenticGCNTrainArgs) -> dict[str, torch.Tensor]:
+    if cfg.model == "senticgcn":
+        return generate_senticgcn_dataset(cfg)
+    elif cfg.model == "senticgcnbert":
+        return generate_senticgcn_bert_dataset(cfg)
+
+
 class ABSADatasetReader:
     def __init__(
         self,
-        config: SenticNetGCNTrainArgs,
+        config: SenticGCNTrainArgs,
         tokenizer: PreTrainedTokenizer,
     ):
         self.cfg = config

From 9ba36f1c758024787a2d03cac94936af0bac47e6 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Tue, 4 Jan 2022 10:24:11 +0800
Subject: [PATCH 067/201] [#41] add save senticnet options in config

---
 .../config/senticnet_gcn_config.json |  2 ++
 sgnlp/models/sentic_gcn/data_class.py | 21 ++++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json
index be8fb54..d61d681 100644
--- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json
+++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json
@@ -16,6 +16,8 @@
     "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt",
     "save_embedding_matrix": true,
     "saved_embedding_matrix_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/embedding/embeddings.pickle",
+    "save_preprocessed_senticnet": true,
+    "saved_preprocessed_senticnet_file_path": "senticnet/senticnet.pickle",
     "initializer": "xavier_uniform",
     "optimizer": "adam",
     "loss_function": "cross_entropy",
diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py
index 6ffd55b..9fecb7f 100644
--- a/sgnlp/models/sentic_gcn/data_class.py
+++ b/sgnlp/models/sentic_gcn/data_class.py
@@ -60,14 +60,15 @@ class SenticGCNTrainArgs:
         default=True,
         metadata={
             "help": """Flag to indicate if embedding matrix should be saved.
-                Flag is ignored if 'saved_embedding_matrix_file_path' is populated and valid."""
+                If 'saved_embedding_matrix_file_path' is populated and valid, it will be overwritten if flag is set to True.
+                """
         },
     )
     saved_embedding_matrix_file_path: str = field(
         default="embedding/embeddings.pickle",
         metadata={
-            "help": """Full path of saved embedding matrix, if file exists,
-                embeddings will be generated from file instead of generated from word vector and vocab."""
+            "help": """Full path of saved embedding matrix, if file exists and 'save_embedding_matrix' flag is set to False.
+                Embeddings will be loaded from this file instead of generated from the word vector file and vocab."""
         },
     )
     save_state_dict: bool = field(
@@ -76,6 +77,20 @@ class SenticGCNTrainArgs:
     saved_state_dict_folder_path: str = field(
         default="/state_dict", metadata={"help": "Folder to save model state_dict."}
     )
+    save_preprocessed_senticnet: bool = field(
+        default=True,
+        metadata={
+            "help": """Flag to indicate if senticnet dictionary should be saved during preprocess step.
+                If 'saved_preprocessed_senticnet_file_path' is populated and valid, it will be overwritten if flag is set to True."""
+        },
+    )
+    saved_preprocessed_senticnet_file_path: str = field(
+        default="senticnet/senticnet.pickle",
+        metadata={
+            "help": """File path to saved preprocessed senticnet, if file exists and 'save_preprocessed_senticnet' flag is set to False.
+ SenticNet will be loaded from file instead of generated from raw senticnet files.""" + }, + ) initializer: str = field(default="xavier_uniform", metadata={"help": "Type of initalizer to use."}) optimizer: str = field(default="adam", metadata={"help": "Type of optimizer to use."}) loss_function: str = field(default="cross_entropy", metadata={"help": "Loss function for training/eval."}) From 8fd683ae8e6e09bbd132d82618da3e36b86638b7 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 4 Jan 2022 10:47:23 +0800 Subject: [PATCH 068/201] [#41] drop preprocess dependency script and refactor into util function --- .../sentic_gcn/preprocess_dependency.py | 83 ------------ sgnlp/models/sentic_gcn/utils.py | 124 +++++++++++------- 2 files changed, 78 insertions(+), 129 deletions(-) delete mode 100644 sgnlp/models/sentic_gcn/preprocess_dependency.py diff --git a/sgnlp/models/sentic_gcn/preprocess_dependency.py b/sgnlp/models/sentic_gcn/preprocess_dependency.py deleted file mode 100644 index 465711e..0000000 --- a/sgnlp/models/sentic_gcn/preprocess_dependency.py +++ /dev/null @@ -1,83 +0,0 @@ -import numpy as np -import spacy -import pathlib -import pickle - -from utils import parse_args_and_load_config -from data_class import SenticGCNTrainArgs - - -class DependencyProcessor: - def __init__(self, config: SenticGCNTrainArgs): - self.config = config - self.nlp = spacy.load(config.spacy_pipeline) - self.sentic = self._load_sentic(config.sentic_word_file_path) - self.dataset_keys = ["raw"] - - def _load_senticnet(self, senticnet_file_path: str): - senticNet = {} - with open(senticnet_file_path, "r") as f: - for line in f: - line = line.strip() - if not line(): - continue - word, sentic = line.split("\t") - sentic[word] = sentic - return senticNet - - def _generate_sentic_dependency_adj_matrix(self, text: str, aspect: str) -> np.ndarray: - doc = self.nlp(text) - seq_len = len(text.split()) - matrix = np.zeros((seq_len, seq_len)).astype("float32") - for token in doc: - sentic = float(self.senticnet[str(token)]) + 1.0 if str(token) in self.senticnet else 0 - if str(token) in aspect: - sentic += 1.0 - if token.i < seq_len: - matrix[token.i][token.i] = 1.0 * sentic - for child in token.children: - if str(child) in aspect: - sentic += 1.0 - if child.i < seq_len: - matrix[token.i][child.i] = 1.0 * sentic - matrix[child.i][token.i] = 1.0 * sentic - return matrix - - def _check_saved_file(self, file_path: str) -> bool: - pl_file_path = pathlib.Path(file_path) - return pl_file_path.exists() - - def _load_save_file(self, file_path: str) -> dict[int, str]: - with open(file_path, "rb") as f: - data = pickle.load(f) - return data - - def _process_file(self, raw_file_path: str, file_path: str, process_function: function): - try: - with open(raw_file_path, "r", encoding="utf-8", newline="\n", errors="ignore") as f: - lines = f.readlines() - except: - raise Exception("Error opening raw dataset file!") - - graph = {} - for i in range(0, len(lines), 3): - text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] - aspect = lines[i + 1].lower().strip() - adj_matrix = process_function(text_left + " " + aspect + " " + text_right, aspect, self.senticnet) - graph[i] = adj_matrix - try: - if self.config.save_preprocessed_dependency: - with open(file_path, "wb") as f: - pickle.dump(graph, f) - except: - raise Exception("Error writing graph to file") - # return graph - - def process(self): - dependency_keys_map = { - "dependency_sencticnet_graph": self._generate_sentic_dependency_adj_matrix, - } - for 
dataset in [self.config.dataset_train, self.config.dataset_test]: - for key, func in dependency_keys_map.items(): - if not dataset[key]: - self._process_file(dataset["raw"], dataset[key], func) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index f11f00e..ff295ac 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -1,4 +1,5 @@ import argparse +from collections import namedtuple import json import logging import pickle @@ -150,62 +151,93 @@ def build_embedding_matrix( return embedding_matrix -class ABSADataset(object): +def load_and_process_senticnet(config: SenticGCNTrainArgs) -> dict[str, float]: """ - Data class to hold dataset for training. - """ - - def __init__(self, data): - self.data = data - - def __getitem__(self, index): - return self.data[index] - - def __len__(self): - return len(self.data) - - -def generate_senticgcn_dataset(cfg: SenticGCNTrainArgs) -> dict[str, torch.Tensor]: - # TODO: add senticgcn dataset prep - pass - - -def generate_senticgcn_bert_dataset(cfg: SenticGCNTrainArgs) -> dict[str, torch.Tensor]: - # TODO: add senticgcn bert dataset prep - pass - - -def generate_train_val_dataset(cfg: SenticGCNTrainArgs) -> dict[str, torch.Tensor]: - if cfg.model == "senticgcn": - return generate_senticgcn_dataset(cfg) - elif cfg.model == "senticgcnbert": - return generate_senticgcn_bert_dataset(cfg) + Helper method to load and process senticnet. Default is SenticNet 5.0. + If a saved preprocess senticnet file is available, and save flag is set to false, it will be loaded from file instead. + Source: + https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/senticnet-5.0 + Args: + config (SenticGCNTrainArgs): SenticGCN training config -class ABSADatasetReader: + Returns: + dict[str, float]: return dictionary with concept word as keys and intensity as values. 
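The helper introduced here reduces each senticnet row to a word-to-intensity entry. A toy illustration of the tab-separated format it expects (the sample rows are invented)::

    lines = ["food\t...\t0.94", "awful_day\t...\t-0.82"]  # word<TAB>...<TAB>intensity

    sentic_dict = {}
    for line in lines:
        items = line.strip().split("\t")
        if "_" in items[0]:
            continue  # multi-word concepts are skipped
        sentic_dict[items[0]] = items[-1]
    # sentic_dict -> {'food': '0.94'}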
+    """
+    saved_senticnet_file_path = pathlib.Path(config.saved_preprocessed_senticnet_file_path)
+    if saved_senticnet_file_path.exists() and not config.save_preprocessed_senticnet:
+        with open(saved_senticnet_file_path, "rb") as f:
+            sentic_dict = pickle.load(f)
+    else:
+        senticnet_file_path = pathlib.Path(config.senticnet_word_file_path)
+        sentic_dict = {}
+        with open(senticnet_file_path, "r") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                items = line.split("\t")
+                if "_" in items[0]:
+                    continue  # skip words with '_'
+                sentic_dict[items[0]] = items[-1]
+    return sentic_dict
+
+
+class SenticGCNDatasetGenerator(Dataset):
+    def __init__(
+        self,
+        dataset_type: str,
+        config: SenticGCNTrainArgs,
+        tokenizer: PreTrainedTokenizer,
+    ):
+        self.config = config
+        self.tokenizer = tokenizer
+
+    def __getitem__(self, index: int):
+        return self.data[index]
+
+    def __len__(self) -> int:
+        return len(self.data)
+
+    def _read_raw_dataset(self, dataset_type: str) -> list[namedtuple]:
+        """
+        Private helper method to read raw dataset files based on requested type (e.g. Train or Test).
+
+        Args:
+            dataset_type (str): Type of dataset files to read. Train or Test.
+
+        Returns:
+            list[namedtuple]: list of namedtuples consisting of the full text, the aspect and polarity.
+        """
+        file_path = self.config.dataset_train["raw"] if dataset_type == "train" else self.config.dataset_test["raw"]
+        RawDataSet = namedtuple("RawDataSet", ["text", "aspect", "polarity"])
+        with open(file_path, "r", encoding="utf-8", newline="\n", errors="ignore") as f:
+            lines = f.readlines()
+        output = []
+        for i in range(0, len(lines), 3):
+            output.append(
+                RawDataSet(lines[i].lower().strip(), lines[i + 1].lower().strip(), lines[i + 2].lower().strip())
+            )
+        return output
+
+    def _read_dependency_senticnet_graph(self, dataset_type: str) -> dict[str, np.ndarray]:
+        """
+        Private helper method to read senticnet graph dataset based on requested type (i.e. Train or Test).
+
+        Args:
+            dataset_type (str): Type of dataset files to read. Train or Test.
+
+        Returns:
+            dict[str, np.ndarray]: dictionary with
+        """
+        file_path = (
+            self.config.dataset_train["dependency_sencticnet_graph"]
+            if dataset_type == "train"
+            else self.config.dataset_test["dependency_sencticnet_graph"]
+        )
+        with open(file_path, "rb") as f:
+            graph = pickle.load(f)
+        return graph

     @staticmethod
     def __read_data__(datasets: Dict[str, str], tokenizer: PreTrainedTokenizer):

From dd8cf61e69ea8a4bbdd01a8a7741066054174c92 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Tue, 4 Jan 2022 10:51:40 +0800
Subject: [PATCH 069/201] [#41] add save option for preprocess senticnet

---
 sgnlp/models/sentic_gcn/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py
index ff295ac..aabace5 100644
--- a/sgnlp/models/sentic_gcn/utils.py
+++ b/sgnlp/models/sentic_gcn/utils.py
@@ -180,6 +180,9 @@ def load_and_process_senticnet(config: SenticGCNTrainArgs) -> dict[str, float]:
         if "_" in items[0]:
             continue  # skip words with '_'
         sentic_dict[items[0]] = items[-1]
+    if config.save_preprocessed_senticnet:
+        with open(saved_senticnet_file_path, "wb") as f:
+            pickle.dump(sentic_dict, f)
     return sentic_dict

From 4bad0fbe15bd9c6721c1f3e7733f55b1534a6457 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Tue, 4 Jan 2022 11:37:31 +0800
Subject: [PATCH 070/201] [#41] refactor load_and_process_senticnet function to remove dependency SenticGCNTrainArgs, add functions to generate dependency adj matrix

---
 sgnlp/models/sentic_gcn/utils.py | 156 ++++++++++++++--------
 1 file changed, 89 insertions(+), 67 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py
index aabace5..c760dea 100644
--- a/sgnlp/models/sentic_gcn/utils.py
+++ b/sgnlp/models/sentic_gcn/utils.py
@@ -151,7 +151,11 @@ def build_embedding_matrix(
     return embedding_matrix


-def load_and_process_senticnet(config: SenticGCNTrainArgs) -> dict[str, float]:
+def load_and_process_senticnet(
+    senticnet_file_path: str = None,
+    save_preprocessed_senticnet: bool = False,
+    saved_preprocessed_senticnet_file_path: str = "senticnet.pkl",
+) -> dict[str, float]:
     """
     Helper method to load and process senticnet. Default is SenticNet 5.0.
     If a saved preprocess senticnet file is available, and save flag is set to false, it will be loaded from file instead.
     Source:
     https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/senticnet-5.0

     Args:
-        config (SenticGCNTrainArgs): SenticGCN training config
+        senticnet_file_path (str): File path to senticnet 5.0 file.
+        save_preprocessed_senticnet (bool): Flag to indicate if processed senticnet should be saved.
+        saved_preprocessed_senticnet_file_path: (str): File path to saved preprocessed senticnet file.

     Returns:
         dict[str, float]: return dictionary with concept word as keys and intensity as values.
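After this refactor the helper no longer needs the full training config. A hedged call sketch, assuming the helper lives at `sgnlp.models.sentic_gcn.utils`; both file paths are placeholders::

    from sgnlp.models.sentic_gcn.utils import load_and_process_senticnet

    senticnet = load_and_process_senticnet(
        senticnet_file_path="senticNet/senticnet_word.txt",
        save_preprocessed_senticnet=True,
        saved_preprocessed_senticnet_file_path="senticnet/senticnet.pickle",
    )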
""" - saved_senticnet_file_path = pathlib.Path(config.saved_preprocessed_senticnet_file_path) - if saved_senticnet_file_path.exists() and not config.save_preprocessed_senticnet: + saved_senticnet_file_path = pathlib.Path(saved_preprocessed_senticnet_file_path) + if saved_senticnet_file_path.exists() and not save_preprocessed_senticnet: with open(saved_senticnet_file_path, "r") as f: sentic_dict = pickle.load(f) else: - senticnet_file_path = pathlib.Path(config.senticnet_word_file_path) + senticnet_file_path = pathlib.Path(senticnet_file_path) sentic_dict = {} with open(senticnet_file_path, "r") as f: for line in f: @@ -180,28 +186,63 @@ def load_and_process_senticnet(config: SenticGCNTrainArgs) -> dict[str, float]: if "_" in items[0]: continue # skip words with '_' sentic_dict[items[0]] = items[-1] - if config.save_preprocessed_senticnet: + if save_preprocessed_senticnet: + saved_senticnet_file_path.parent.mkdir(exist_ok=True) with open(saved_senticnet_file_path, "wb") as f: pickle.dump(sentic_dict, f) return sentic_dict -class SenticGCNDatasetGenerator(Dataset): - def __init__( - self, - dataset_type: str, - config: SenticGCNTrainArgs, - tokenizer: PreTrainedTokenizer, - ): - self.config = config - self.tokenizer = tokenizer +def generate_dependency_adj_matrix(text: str, aspect: str, senticnet: dict[str, float], spacy_pipeline) -> np.ndarray: + """ + Helper method to generate senticnet depdency adj matrix. + + Args: + text (str): input text to process + aspect (str): aspect from input text + senticnet (dict[str, float]): dictionary of preprocessed senticnet. See load_and_process_senticnet() + spacy_pipeline : Spacy pretrained pipeline (e.g. 'en_core_web_sm') - def __getitem__(self, index: int): + Returns: + np.ndarray: return ndarry representing adj matrix. + """ + document = spacy_pipeline(text) + seq_len = len(text.split()) + matrix = np.zeros((seq_len, seq_len)).astype("float32") + for token in document: + sentic = float(senticnet[str(token)]) + 1.0 if str(token) in senticnet else 0 + if str(token) in aspect: + sentic += 1.0 + if token.i < seq_len: + matrix[token.i][token.i] = 1.0 * sentic + for child in token.children: + if str(child) in aspect: + sentic += 1.0 + if child.i < seq_len: + matrix[token.i][child.i] = 1.0 * sentic + matrix[child.i][token.i] = 1.0 * sentic + return matrix + + +class SenticGCNDataset(Dataset): + """ + Data class for SenticGCN dataset. + """ + + def __init__(self, data: list[dict[str, torch.Tensor]]) -> None: + self.data = data + + def __getitem__(self, index: int) -> dict[str, torch.Tensor]: return self.data[index] - def __len__(self) -> int: + def __len__(self): return len(self.data) + +class SenticGCNDatasetGenerator: + def __init__(self, config: SenticGCNTrainArgs): + self.config = config + def _read_raw_dataset(self, dataset_type: str) -> list[namedtuple]: """ Private helper method to read raw dataset files based on requested type (e.g. Train or Test). @@ -223,53 +264,34 @@ def _read_raw_dataset(self, dataset_type: str) -> list[namedtuple]: ) return output - def _read_dependency_senticnet_graph(self, dataset_type: str) -> dict[str, np.ndarray]: - """ - Private helpder method to read senticnet graph dataset based on requested type (i.e. Train or Test). - - Args: - dataset_type (str): Type of dataset files to read. Train or Test. 
- - Returns: - dict[str, np.ndarray]: dictionary with - """ - file_path = ( - self.config.dataset_train["dependency_sencticnet_graph"] - if dataset_type == "train" - else self.config.dataset_test["dependency_sencticnet_graph"] - ) - with open(file_path, "rb") as f: - graph = pickle.load(f) - return graph - - @staticmethod - def __read_data__(datasets: Dict[str, str], tokenizer: PreTrainedTokenizer): - # Read raw data, graph data and tree data - with open(datasets["raw"], "r", encoding="utf-8", newline="\n", errors="ignore") as fin: - lines = fin.readlines() - with open(datasets["graph"], "rb") as fin_graph: - idx2graph = pickle.load(fin_graph) - - # Prep all data - all_data = [] - for i in range(0, len(lines), 3): - text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] - aspect = lines[i + 1].lower().strip() - polarity = lines[i + 2].lower().strip() - text_indices = tokenizer(f"{text_left} {aspect} {text_right}") - context_indices = tokenizer(f"{text_left} {text_right}") - aspect_indices = tokenizer(aspect) - left_indices = tokenizer(text_left) - polarity = int(polarity) + 1 - dependency_graph = idx2graph[i] - - data = { - "text_indices": text_indices, - "context_indices": context_indices, - "aspect_indices": aspect_indices, - "left_indices": left_indices, - "polarity": polarity, - "dependency_graph": dependency_graph, - } - all_data.append(data) - return all_data + # @staticmethod + # def __read_data__(datasets: Dict[str, str], tokenizer: PreTrainedTokenizer): + # # Read raw data, graph data and tree data + # with open(datasets["raw"], "r", encoding="utf-8", newline="\n", errors="ignore") as fin: + # lines = fin.readlines() + # with open(datasets["graph"], "rb") as fin_graph: + # idx2graph = pickle.load(fin_graph) + + # # Prep all data + # all_data = [] + # for i in range(0, len(lines), 3): + # text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] + # aspect = lines[i + 1].lower().strip() + # polarity = lines[i + 2].lower().strip() + # text_indices = tokenizer(f"{text_left} {aspect} {text_right}") + # context_indices = tokenizer(f"{text_left} {text_right}") + # aspect_indices = tokenizer(aspect) + # left_indices = tokenizer(text_left) + # polarity = int(polarity) + 1 + # dependency_graph = idx2graph[i] + + # data = { + # "text_indices": text_indices, + # "context_indices": context_indices, + # "aspect_indices": aspect_indices, + # "left_indices": left_indices, + # "polarity": polarity, + # "dependency_graph": dependency_graph, + # } + # all_data.append(data) + # return all_data From eda69226a63b8f9f9d46c1ccb6119c6602bea599 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 4 Jan 2022 11:59:22 +0800 Subject: [PATCH 071/201] [#41] remove convoluted dataset file path structure --- .../config/senticnet_gcn_config.json | 10 +---- sgnlp/models/sentic_gcn/data_class.py | 42 +++---------------- sgnlp/models/sentic_gcn/utils.py | 2 +- 3 files changed, 9 insertions(+), 45 deletions(-) diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json index d61d681..9fb9169 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json @@ -4,14 +4,8 @@ "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", "spacy_pipeline": "en_core_web_sm", "save_preprocessed_dependency": true, - "dataset_train": { - "raw": 
"/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", - "dependency_sencticnet_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw.dsenticgraph" - }, - "dataset_test": { - "raw": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", - "dependency_sencticnet_graph": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw.dsenticgraph" - }, + "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", + "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", "valset_ratio": 0, "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", "save_embedding_matrix": true, diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index 9fecb7f..4a49f78 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -15,34 +15,13 @@ class SenticGCNTrainArgs: spacy_pipeline: str = field( default="en_core_web_sm", metadata={"help": "Type of spacy pipeline to load for processor."} ) - save_preprocessed_dependency: bool = field( - default=True, - metadata={ - "help": """Flag to indicate if dependency preprocess should run, - if pickle files already present, it will be overwritten.""" - }, + dataset_train: str = field( + default="train.raw", + metadata={"help": "File path to train dataset."}, ) - dataset_keys: List[str] = field( - default_factory=lambda: ["raw", "dependency_sencticnet_graph"], - metadata={"help": "Default dataset keys."}, - ) - dataset_train: Dict[str, str] = field( - default=dict, - metadata={ - "help": """Dictionary containing 3 file paths to the raw dataset file, - dependency_graph, sentic_graph and the dependency_senticnet_graph files for the train datasets. - Raw file path is mandatory, the graph files are optional. If graph files are not present, - it will be generated during preprocessing step.""" - }, - ) - dataset_test: Dict[str, str] = field( - default=dict, - metadata={ - "help": """Dictionary containing 3 file paths to the raw dataset file, - dependency_graph, sentic_graph and the dependency_senticnet_graph files for the test datasets. - Raw file path is mandatory, the graph files are optional. If graph files are not present, - it will be generated during preprocessing step.""" - }, + dataset_test: str = field( + default="test.raw", + metadata={"help": "File path to test dataset."}, ) valset_ratio: float = field( default=0.0, @@ -128,15 +107,6 @@ def __post_init__(self): "sgd", ], "Invalid optimizer" assert self.device in ["cuda", "cpu"], "Invalid device type." - assert "raw" in self.dataset_train.keys(), "File path to raw dataset is required!" - assert "raw" in self.dataset_test.keys(), "File path to raw dataset is required!" - # populate keys if not presents - train_diff_keys = set(self.dataset_keys).difference(set(self.dataset_train.keys())) - for key in train_diff_keys: - self.dataset_train[key] = "" - test_diff_keys = set(self.dataset_keys).difference(set(self.dataset_test.keys())) - for key in test_diff_keys: - self.dataset_test[key] = "" assert self.repeats > 1, "Repeats value must be at least 1." assert self.patience > 1, "Patience value must be at least 1." 
assert 0 >= self.valset_ratio < 1, "Valset_ratio must be greater or equals to 0 and less than 1."
diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py
index c760dea..ca5647b 100644
--- a/sgnlp/models/sentic_gcn/utils.py
+++ b/sgnlp/models/sentic_gcn/utils.py
@@ -253,7 +253,7 @@ def _read_raw_dataset(self, dataset_type: str) -> list[namedtuple]:
         Returns:
             list[namedtuple]: list of namedtuples consisting of the full text, the aspect and polarity.
         """
-        file_path = self.config.dataset_train["raw"] if dataset_type == "train" else self.config.dataset_test["raw"]
+        file_path = self.config.dataset_train if dataset_type == "train" else self.config.dataset_test
         RawDataSet = namedtuple("RawDataSet", ["text", "aspect", "polarity"])
         with open(file_path, "r", encoding="utf-8", newline="\n", errors="ignore") as f:
             lines = f.readlines()

From b296d2406c1aadd790b0533e9602679a3828359f Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Tue, 4 Jan 2022 16:34:01 +0800
Subject: [PATCH 072/201] [#41] completed dataset generator class

---
 .../config/senticnet_gcn_config.json |   3 +-
 sgnlp/models/sentic_gcn/data_class.py |   2 +
 sgnlp/models/sentic_gcn/tokenization.py |   2 +
 sgnlp/models/sentic_gcn/utils.py | 234 ++++++++++++++----
 4 files changed, 189 insertions(+), 52 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json
index 9fb9169..fc7c887 100644
--- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json
+++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json
@@ -28,5 +28,6 @@
     "seed": 776,
     "device": "cuda",
     "repeats": 10,
-    "patience": 5
+    "patience": 5,
+    "max_len": 85
 }
\ No newline at end of file
diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py
index 4a49f78..d2ca8e0 100644
--- a/sgnlp/models/sentic_gcn/data_class.py
+++ b/sgnlp/models/sentic_gcn/data_class.py
@@ -89,6 +89,7 @@ class SenticGCNTrainArgs:
     patience: int = field(
         default=5, metadata={"help": "Number of train epoch without improvements prior to early stopping."}
     )
+    max_len: int = field(default=85, metadata={"help": "Max length to pad for bert tokenizer."})

     def __post_init__(self):
         assert self.model in ["senticgcn", "senticgcnbert"]
@@ -110,3 +111,4 @@ def __post_init__(self):
         assert self.repeats > 1, "Repeats value must be at least 1."
         assert self.patience > 1, "Patience value must be at least 1."
         assert 0 >= self.valset_ratio < 1, "Valset_ratio must be greater or equals to 0 and less than 1."
+        assert self.max_len > 0, "Max_len must be greater than 0."
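The `__post_init__` checks above validate the dataclass as soon as it is constructed. A small sketch, assuming the module path `sgnlp.models.sentic_gcn.data_class`; field values are illustrative::

    from sgnlp.models.sentic_gcn.data_class import SenticGCNTrainArgs

    args = SenticGCNTrainArgs(model="senticgcnbert", max_len=85)  # passes validation
    try:
        SenticGCNTrainArgs(max_len=0)  # trips the max_len assertion
    except AssertionError as err:
        print(err)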
diff --git a/sgnlp/models/sentic_gcn/tokenization.py b/sgnlp/models/sentic_gcn/tokenization.py
index 34ea9f9..2645cf2 100644
--- a/sgnlp/models/sentic_gcn/tokenization.py
+++ b/sgnlp/models/sentic_gcn/tokenization.py
@@ -139,6 +139,7 @@ def __call__(
         truncation: bool = True,
         return_token_type_ids: bool = False,
         return_attention_mask: bool = False,
+        return_tensors: str = None,
         **kwargs,
     ):
         encoding = super().__call__(
@@ -149,6 +150,7 @@ def __call__(
             truncation=truncation,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
+            return_tensors=return_tensors,
             **kwargs,
         )
         return encoding
diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py
index ca5647b..b948634 100644
--- a/sgnlp/models/sentic_gcn/utils.py
+++ b/sgnlp/models/sentic_gcn/utils.py
@@ -1,5 +1,4 @@
 import argparse
-from collections import namedtuple
 import json
 import logging
 import pickle
@@ -7,13 +6,13 @@
 import pathlib
 import requests
 import urllib
-from typing import Dict, List, Union
+from typing import Dict, Tuple

 import numpy as np
+import spacy
 import torch
 from torch.utils.data import random_split, Dataset
 from transformers import PreTrainedTokenizer

 from data_class import SenticGCNTrainArgs

@@ -91,6 +90,38 @@ def download_url_file(url: str, save_folder: str) -> None:
         logging.error(f"Fail to request files from {url}.")


+def pad_and_truncate(
+    sequence: list[float],
+    max_len: int,
+    dtype: str = "int64",
+    padding: str = "post",
+    truncating: str = "post",
+    value: int = 0,
+):
+    """
+    Helper method for padding and truncating text and aspect segment.
+
+    Args:
+        sequence (list[float]): input sequence of indices
+        max_len (int): maximum len to pad
+        dtype (str, optional): data type to cast indices. Defaults to "int64".
+        padding (str, optional): type of padding, 'pre' or 'post'. Defaults to "post".
+        truncating (str, optional): type of truncating, 'pre' or 'post'. Defaults to "post".
+        value (int, optional): value used for padding. Defaults to 0.
+
+    Returns:
+        np.ndarray: the input sequence padded and truncated to `max_len`, cast to `dtype`.
+    """
+    seq_arr = (np.ones(max_len) * value).astype(dtype)
+    trunc = sequence[-max_len:] if truncating == "pre" else sequence[:max_len]
+    trunc = np.asarray(trunc, dtype=dtype)
+    if padding == "post":
+        seq_arr[: len(trunc)] = trunc
+    else:
+        seq_arr[-len(trunc) :] = trunc
+    return seq_arr
+
+
 def load_word_vec(word_vec_file_path: str, vocab: Dict[str, int], embed_dim: int = 300) -> Dict[str, np.asarray]:
     """
     Helper method to load word vectors from file (e.g. GloVe) for each word in vocab.
@@ -155,7 +186,7 @@ def load_and_process_senticnet(
     senticnet_file_path: str = None,
     save_preprocessed_senticnet: bool = False,
     saved_preprocessed_senticnet_file_path: str = "senticnet.pkl",
-) -> dict[str, float]:
+) -> Dict[str, float]:
     """
     Helper method to load and process senticnet. Default is SenticNet 5.0.
     If a saved preprocess senticnet file is available, and save flag is set to false, it will be loaded from file instead.
     Source:
     https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/senticnet-5.0

     Args:
         senticnet_file_path (str): File path to senticnet 5.0 file.
         save_preprocessed_senticnet (bool): Flag to indicate if processed senticnet should be saved.
         saved_preprocessed_senticnet_file_path: (str): File path to saved preprocessed senticnet file.

     Returns:
-        dict[str, float]: return dictionary with concept word as keys and intensity as values.
+        Dict[str, float]: return dictionary with concept word as keys and intensity as values.
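A worked example of the padding helper above under its 'post' defaults, assuming the same utils module path::

    from sgnlp.models.sentic_gcn.utils import pad_and_truncate

    pad_and_truncate([3, 1, 4], max_len=5)
    # -> array([3, 1, 4, 0, 0])   padding appended after the sequence

    pad_and_truncate([3, 1, 4, 1, 5, 9], max_len=5)
    # -> array([3, 1, 4, 1, 5])   tail truncated with the 'post' default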
""" saved_senticnet_file_path = pathlib.Path(saved_preprocessed_senticnet_file_path) if saved_senticnet_file_path.exists() and not save_preprocessed_senticnet: @@ -193,14 +224,14 @@ def load_and_process_senticnet( return sentic_dict -def generate_dependency_adj_matrix(text: str, aspect: str, senticnet: dict[str, float], spacy_pipeline) -> np.ndarray: +def generate_dependency_adj_matrix(text: str, aspect: str, senticnet: Dict[str, float], spacy_pipeline) -> np.ndarray: """ Helper method to generate senticnet depdency adj matrix. Args: text (str): input text to process aspect (str): aspect from input text - senticnet (dict[str, float]): dictionary of preprocessed senticnet. See load_and_process_senticnet() + senticnet (Dict[str, float]): dictionary of preprocessed senticnet. See load_and_process_senticnet() spacy_pipeline : Spacy pretrained pipeline (e.g. 'en_core_web_sm') Returns: @@ -229,10 +260,10 @@ class SenticGCNDataset(Dataset): Data class for SenticGCN dataset. """ - def __init__(self, data: list[dict[str, torch.Tensor]]) -> None: + def __init__(self, data: list[Dict[str, torch.Tensor]]) -> None: self.data = data - def __getitem__(self, index: int) -> dict[str, torch.Tensor]: + def __getitem__(self, index: int) -> Dict[str, torch.Tensor]: return self.data[index] def __len__(self): @@ -240,10 +271,26 @@ def __len__(self): class SenticGCNDatasetGenerator: - def __init__(self, config: SenticGCNTrainArgs): - self.config = config + """ + Main dataset generator class to preprocess raw dataset file. + """ - def _read_raw_dataset(self, dataset_type: str) -> list[namedtuple]: + def __init__(self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer): + self.config = config + self.senticnet = load_and_process_senticnet( + config.senticnet_word_file_path, + config.save_preprocessed_senticnet, + config.saved_preprocessed_senticnet_file_path, + ) + self.spacy_pipeline = spacy.load(config.spacy_pipeline) + self.tokenizer = tokenizer + self.device = ( + torch.device("cuda" if torch.cuda.is_available() else "cpu") + if config.device is None + else torch.device(config.device) + ) + + def _read_raw_dataset(self, dataset_type: str) -> list[str]: """ Private helper method to read raw dataset files based on requested type (e.g. Train or Test). @@ -251,47 +298,132 @@ def _read_raw_dataset(self, dataset_type: str) -> list[namedtuple]: dataset_type (str): Type of dataset files to read. Train or Test. Returns: - list[namedtuple]: list of namedtuples consisting of the full text, the aspect and polarity. + list[str]: list of str consisting of the full text, aspect and polarity index. """ file_path = self.config.dataset_train if dataset_type == "train" else self.config.dataset_test - RawDataSet = namedtuple("RawDataSet", ["text", "aspect", "polarity"]) with open(file_path, "r", encoding="utf-8", newline="\n", errors="ignore") as f: lines = f.readlines() - output = [] - for i in range(0, len(lines), 3): - output.append( - RawDataSet(lines[i].lower().strip(), lines[i + 1].lower().strip(), lines[i + 2].lower().strip()) + return lines + + def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, torch.Tensor]: + """ + Data preprocess method to generate all indices required for SenticGCN model training. + + Args: + raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. + + Returns: + Dict[str, torch.Tensor]: return a dictionary of dataset sub-type and their tensors. 
+ """ + all_data = [] + for i in range(0, len(raw_data), 3): + # Process full text, aspect and polarity index + text_left, _, text_right = [s.lower().strip() for s in raw_data[i].partition("$T$")] + aspect = raw_data[i + 1].lower().strip() + full_text = f"{text_left} {aspect} {text_right}" + polarity = raw_data[i + 2].strip() + + # Process indices + text_indices = self.tokenizer(full_text) + aspect_indices = self.tokenizer(aspect) + left_indices = self.tokenizer(text_left) + polarity = int(polarity) + 1 + graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) + + all_data.append( + { + "text_indices": text_indices.to(self.device), + "aspect_indices": aspect_indices.to(self.device), + "left_indices": left_indices.to(self.device), + "polarity": polarity.to(self.device), + "sdat_graph": graph.to(self.device), + } ) - return output - - # @staticmethod - # def __read_data__(datasets: Dict[str, str], tokenizer: PreTrainedTokenizer): - # # Read raw data, graph data and tree data - # with open(datasets["raw"], "r", encoding="utf-8", newline="\n", errors="ignore") as fin: - # lines = fin.readlines() - # with open(datasets["graph"], "rb") as fin_graph: - # idx2graph = pickle.load(fin_graph) - - # # Prep all data - # all_data = [] - # for i in range(0, len(lines), 3): - # text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] - # aspect = lines[i + 1].lower().strip() - # polarity = lines[i + 2].lower().strip() - # text_indices = tokenizer(f"{text_left} {aspect} {text_right}") - # context_indices = tokenizer(f"{text_left} {text_right}") - # aspect_indices = tokenizer(aspect) - # left_indices = tokenizer(text_left) - # polarity = int(polarity) + 1 - # dependency_graph = idx2graph[i] - - # data = { - # "text_indices": text_indices, - # "context_indices": context_indices, - # "aspect_indices": aspect_indices, - # "left_indices": left_indices, - # "polarity": polarity, - # "dependency_graph": dependency_graph, - # } - # all_data.append(data) - # return all_data + return all_data + + def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, torch.Tensor]: + """ + Data preprocess method to generate all indices required for SenticGCNBert model training. + + Args: + raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. + + Returns: + Dict[str, torch.Tensor]: return a dictionary of dataset sub-type and their tensors. 
+ """ + all_data = [] + max_len = self.config.max_len + for i in range(0, len(raw_data), 3): + # Process full text, aspect and polarity index + text_left, _, text_right = [s.lower().strip() for s in raw_data[i].partition("$T$")] + aspect = raw_data[i + 1].lower().strip() + polarity = raw_data[i + 2].strip() + full_text = f"{text_left} {aspect} {text_right}" + full_text_with_bert_tokens = f"[CLS] {full_text} [SEP] {aspect} [SEP]" + + # Process indices + text_indices = self.tokenizer(full_text, return_tensors="pt") + aspect_indices = self.tokenizer(aspect, return_tensors="pt") + left_indices = self.tokenizer(text_left, return_tensors="pt") + polarity = int(polarity) + 1 + polarity = torch.tensor(polarity) + + # Process bert related indices + text_bert_indices = self.tokenizer(full_text_with_bert_tokens) + text_len = np.sum(text_indices["input_ids"] != 0) + aspect_len = np.sum(aspect_indices["input_ids"] != 0) + + # array of [0] for texts including [CLS] and [SEP] and [1] for aspect and ending [SEP] + concat_segment_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1) + concat_segment_indices = pad_and_truncate(concat_segment_indices, max_len) + concat_segment_indices = torch.tensor(concat_segment_indices) + + # Process graph + graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) + sdat_graph = np.pad( + graph, + ( + (0, max_len - graph.shape[0]), + (0, max_len - graph.shape[0]), + ), + "constant", + ) + sdat_graph = torch.tensor(sdat_graph) + + all_data.append( + { + "text_indices": text_indices.to(self.device), + "aspect_indices": aspect_indices.to(self.device), + "left_indices": left_indices.to(self.device), + "text_bert_indices": text_bert_indices.to(self.device), + "bert_segment_indices": concat_segment_indices.to(self.device), + "polarity": polarity.to(self.device), + "sdat_graph": sdat_graph.to(self.device), + } + ) + return all_data + + def generate_datasets(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + """ + Main wrapper method to generate datasets for both SenticGCN and SenticGCNBert based on config. + + Returns: + Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: return dictionaries for train/val/test data. 
+ """ + # Read raw data from dataset files + raw_train_data = self._read_raw_dataset(self.config.dataset_train) + raw_test_data = self._read_raw_dataset(self.config.dataset_test) + # Generate dataset dictionary + if self.config.model == "senticgcn": + train_data = self._generate_senticgcn_dataset(raw_train_data) + test_data = self._generate_senticgcn_dataset(raw_test_data) + else: + train_data = self._generate_senticgcnbert_dataset(raw_train_data) + test_data = self._generate_senticgcnbert_dataset(raw_test_data) + # Train/Val/Test split + if self.config.valset_ratio > 0: + valset_len = int(len(train_data) * self.config.valset_ratio) + train_data, val_data = random_split(train_data, (len(train_data) - valset_len, valset_len)) + else: + val_data = test_data + return train_data, val_data, test_data From 5f9cc288b69cc385ce3f8357cc6af0b6b52de52e Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 4 Jan 2022 16:38:38 +0800 Subject: [PATCH 073/201] [#41] cast dataset generator output to SenticGCNDataset instances --- sgnlp/models/sentic_gcn/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index b948634..825e5eb 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -403,16 +403,18 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, torc ) return all_data - def generate_datasets(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + def generate_datasets(self) -> Tuple[SenticGCNDataset, SenticGCNDataset, SenticGCNDataset]: """ Main wrapper method to generate datasets for both SenticGCN and SenticGCNBert based on config. Returns: - Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: return dictionaries for train/val/test data. + Tuple[SenticGCNDataset, SenticGCNDataset, SenticGCNDataset]: + return SenticGCNDataset instances for train/val/test data. 
""" # Read raw data from dataset files raw_train_data = self._read_raw_dataset(self.config.dataset_train) raw_test_data = self._read_raw_dataset(self.config.dataset_test) + # Generate dataset dictionary if self.config.model == "senticgcn": train_data = self._generate_senticgcn_dataset(raw_train_data) @@ -420,10 +422,11 @@ def generate_datasets(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Te else: train_data = self._generate_senticgcnbert_dataset(raw_train_data) test_data = self._generate_senticgcnbert_dataset(raw_test_data) + # Train/Val/Test split if self.config.valset_ratio > 0: valset_len = int(len(train_data) * self.config.valset_ratio) train_data, val_data = random_split(train_data, (len(train_data) - valset_len, valset_len)) else: val_data = test_data - return train_data, val_data, test_data + return SenticGCNDataset(train_data), SenticGCNDataset(val_data), SenticGCNDataset(test_data) From 9224bcd9c7785a91aae9e1fad63f4324bdad2af3 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 4 Jan 2022 16:53:25 +0800 Subject: [PATCH 074/201] [#41] clean up unused imports --- sgnlp/models/sentic_gcn/data_class.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index d2ca8e0..add008a 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List @dataclass From 9829b235b88261b43a851ea6fa660b6f91ef0ad0 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 4 Jan 2022 17:47:53 +0800 Subject: [PATCH 075/201] [#41] fix naming error in config --- sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json | 7 +++---- sgnlp/models/sentic_gcn/data_class.py | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json index fc7c887..3bd7131 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json @@ -1,9 +1,8 @@ { - "model": "senticnetgcn", - "tokenizer": "senticnetgcn", + "model": "senticgcn", + "tokenizer": "senticgcn", "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", "spacy_pipeline": "en_core_web_sm", - "save_preprocessed_dependency": true, "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", "valset_ratio": 0, @@ -24,7 +23,7 @@ "hidden_dim": 300, "polarities_dim": 3, "dropout": 0.3, - "save": true, + "save_results": true, "seed": 776, "device": "cuda", "repeats": 10, diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index add008a..2094273 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -81,7 +81,7 @@ class SenticGCNTrainArgs: hidden_dim: int = field(default=300, metadata={"help": "Number of neurons for hidden layer."}) dropout: float = field(default=0.3, metadata={"help": "Default value for dropout percentages."}) polarities_dim: int = field(default=3, metadata={"help": "Default dimension for polarities."}) - save: bool = field(default=True, metadata={"help": "Flag to indicate if results should be saved."}) + save_results: bool = 
field(default=True, metadata={"help": "Flag to indicate if results should be saved."}) seed: int = field(default=776, metadata={"help": "Default random seed for training."}) device: str = field(default="cuda", metadata={"help": "Type of compute device to use for training."}) repeats: int = field(default=10, metadata={"help": "Number of times to repeat train loop."}) @@ -91,7 +91,7 @@ class SenticGCNTrainArgs: max_len: int = field(default=85, metadata={"help": "Max length to pad for bert tokenizer."}) def __post_init__(self): - assert self.model in ["senticgcn", "senticgcnbert"] + assert self.model in ["senticgcn", "senticgcnbert"], "Invalid model type!" assert self.initializer in [ "xavier_uniform", "xavier_uniform", @@ -110,4 +110,4 @@ def __post_init__(self): assert self.repeats > 1, "Repeats value must be at least 1." assert self.patience > 1, "Patience value must be at least 1." assert 0 >= self.valset_ratio < 1, "Valset_ratio must be greater or equals to 0 and less than 1." - assert 0 >= self.max_len < 1, "Max_len must be greater or equals to 0 and less than 1." + assert self.max_len > 0, "Max_len must be greater than 0." From 0c9065687283d07ea1b6209d8bc5d7ea99ae9eb2 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 5 Jan 2022 09:20:22 +0800 Subject: [PATCH 076/201] [#41] fix casting data to tensors --- sgnlp/models/sentic_gcn/utils.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 825e5eb..e155e47 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -6,13 +6,14 @@ import pathlib import requests import urllib -from typing import Dict, Tuple +from typing import Dict, Tuple, Union import numpy as np import spacy import torch from torch.utils.data import random_split, Dataset from transformers import PreTrainedTokenizer +from transformers.tokenization_utils_base import BatchEncoding from data_class import SenticGCNTrainArgs @@ -305,7 +306,7 @@ def _read_raw_dataset(self, dataset_type: str) -> list[str]: lines = f.readlines() return lines - def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, torch.Tensor]: + def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, Union[BatchEncoding, torch.Tensor]]: """ Data preprocess method to generate all indices required for SenticGCN model training. @@ -313,7 +314,7 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, torch.Te raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. Returns: - Dict[str, torch.Tensor]: return a dictionary of dataset sub-type and their tensors. + Dict[str, Union[BatchEncoding, torch.Tensor]]: return a dictionary of dataset sub-type and their tensors. 
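One subtlety in the `__post_init__` guards of patch 075 above: Python chains comparisons, so `assert 0 >= self.valset_ratio < 1` evaluates as `(0 >= valset_ratio) and (valset_ratio < 1)`, which only admits values at or below zero and would reject a legitimate ratio such as 0.1. The inclusive lower bound the message describes presumably wants to read:

    assert 0 <= self.valset_ratio < 1, "Valset_ratio must be greater or equals to 0 and less than 1."

The `repeats` and `patience` checks carry the same mismatch: `> 1` rejects the value 1 that their messages call acceptable, so `>= 1` would match the wording.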
""" all_data = [] for i in range(0, len(raw_data), 3): @@ -324,11 +325,13 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, torch.Te polarity = raw_data[i + 2].strip() # Process indices - text_indices = self.tokenizer(full_text) - aspect_indices = self.tokenizer(aspect) - left_indices = self.tokenizer(text_left) + text_indices = self.tokenizer(full_text, return_tensors="pt") + aspect_indices = self.tokenizer(aspect, return_tensors="pt") + left_indices = self.tokenizer(text_left, return_tensors="pt") polarity = int(polarity) + 1 + polarity = torch.tensor(polarity) graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) + graph = torch.tensor(polarity) all_data.append( { @@ -376,7 +379,8 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, torc # array of [0] for texts including [CLS] and [SEP] and [1] for aspect and ending [SEP] concat_segment_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1) concat_segment_indices = pad_and_truncate(concat_segment_indices, max_len) - concat_segment_indices = torch.tensor(concat_segment_indices) + concat_segment_indices = BatchEncoding({"input_ids": concat_segment_indices}) + concat_segment_indices = concat_segment_indices.convert_to_tensors("pt") # Process graph graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) @@ -388,7 +392,8 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, torc ), "constant", ) - sdat_graph = torch.tensor(sdat_graph) + sdat_graph = BatchEncoding({"graph": sdat_graph}) + sdat_graph = sdat_graph.convert_to_tensors("pt") all_data.append( { @@ -422,7 +427,6 @@ def generate_datasets(self) -> Tuple[SenticGCNDataset, SenticGCNDataset, SenticG else: train_data = self._generate_senticgcnbert_dataset(raw_train_data) test_data = self._generate_senticgcnbert_dataset(raw_test_data) - # Train/Val/Test split if self.config.valset_ratio > 0: valset_len = int(len(train_data) * self.config.valset_ratio) From 9a8250ad68e847eeda7a659e950c5716fd61c902 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 5 Jan 2022 09:28:01 +0800 Subject: [PATCH 077/201] [#41] fix wrong variable used --- sgnlp/models/sentic_gcn/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index e155e47..f7ec43b 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -331,7 +331,7 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, Union[Ba polarity = int(polarity) + 1 polarity = torch.tensor(polarity) graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) - graph = torch.tensor(polarity) + graph = torch.tensor(graph) all_data.append( { From 6f71d249c94fe89d27cc5e129f9c9e368f200665 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 5 Jan 2022 09:51:46 +0800 Subject: [PATCH 078/201] [#41] standardise all dataset type to use batchencoding --- sgnlp/models/sentic_gcn/utils.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index f7ec43b..094a249 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -6,7 +6,7 @@ import pathlib import requests import urllib -from typing import Dict, Tuple, Union +from typing import Dict, Tuple import numpy as np import spacy @@ -306,7 +306,7 @@ def 
_read_raw_dataset(self, dataset_type: str) -> list[str]: lines = f.readlines() return lines - def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, Union[BatchEncoding, torch.Tensor]]: + def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, BatchEncoding]: """ Data preprocess method to generate all indices required for SenticGCN model training. @@ -314,7 +314,7 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, Union[Ba raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. Returns: - Dict[str, Union[BatchEncoding, torch.Tensor]]: return a dictionary of dataset sub-type and their tensors. + Dict[str, BatchEncoding]]: return a dictionary of dataset sub-type and their tensors. """ all_data = [] for i in range(0, len(raw_data), 3): @@ -329,9 +329,11 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, Union[Ba aspect_indices = self.tokenizer(aspect, return_tensors="pt") left_indices = self.tokenizer(text_left, return_tensors="pt") polarity = int(polarity) + 1 - polarity = torch.tensor(polarity) + polarity = BatchEncoding({"input_ids": polarity}) + polarity.convert_to_tensors("pt") graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) - graph = torch.tensor(graph) + graph = BatchEncoding({"input_ids": graph}) + graph.convert_to_tensors("pt") all_data.append( { @@ -344,7 +346,7 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, Union[Ba ) return all_data - def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, torch.Tensor]: + def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, BatchEncoding]: """ Data preprocess method to generate all indices required for SenticGCNBert model training. @@ -352,7 +354,7 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, torc raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. Returns: - Dict[str, torch.Tensor]: return a dictionary of dataset sub-type and their tensors. + Dict[str, BatchEncoding]: return a dictionary of dataset sub-type and their tensors. 
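`convert_to_tensors("pt")` converts the wrapped values in place and also returns the `BatchEncoding`, so both call styles above are equivalent. The wrapping pattern, sketched with toy values:

    import numpy as np
    from transformers.tokenization_utils_base import BatchEncoding

    polarity = BatchEncoding({"input_ids": 2})    # scalar class index
    polarity = polarity.convert_to_tensors("pt")  # "input_ids" becomes a 0-d torch tensor

    graph = BatchEncoding({"input_ids": np.eye(3)})
    graph.convert_to_tensors("pt")                # in-place conversion works just as well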
""" all_data = [] max_len = self.config.max_len @@ -369,7 +371,8 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, torc aspect_indices = self.tokenizer(aspect, return_tensors="pt") left_indices = self.tokenizer(text_left, return_tensors="pt") polarity = int(polarity) + 1 - polarity = torch.tensor(polarity) + polarity = BatchEncoding({"input_ids": polarity}) + polarty = polarity.convert_to_tensors("pt") # Process bert related indices text_bert_indices = self.tokenizer(full_text_with_bert_tokens) @@ -392,7 +395,7 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, torc ), "constant", ) - sdat_graph = BatchEncoding({"graph": sdat_graph}) + sdat_graph = BatchEncoding({"input_ids": sdat_graph}) sdat_graph = sdat_graph.convert_to_tensors("pt") all_data.append( From 6f19690b40dc4d8e7146ee8102b92fe66d9eddb8 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 5 Jan 2022 10:35:14 +0800 Subject: [PATCH 079/201] [#41] cast tensors to numpy for summation method call --- sgnlp/models/sentic_gcn/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 094a249..da062e6 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -372,12 +372,12 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, Batc left_indices = self.tokenizer(text_left, return_tensors="pt") polarity = int(polarity) + 1 polarity = BatchEncoding({"input_ids": polarity}) - polarty = polarity.convert_to_tensors("pt") + polarity = polarity.convert_to_tensors("pt") # Process bert related indices - text_bert_indices = self.tokenizer(full_text_with_bert_tokens) - text_len = np.sum(text_indices["input_ids"] != 0) - aspect_len = np.sum(aspect_indices["input_ids"] != 0) + text_bert_indices = self.tokenizer(full_text_with_bert_tokens, return_tensors="pt") + text_len = np.sum(text_indices["input_ids"].numpy() != 0) + aspect_len = np.sum(aspect_indices["input_ids"].numpy() != 0) # array of [0] for texts including [CLS] and [SEP] and [1] for aspect and ending [SEP] concat_segment_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1) From b8249d46fbc7a0a0f958416aea5c0ee6bcf182aa Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 5 Jan 2022 11:38:53 +0800 Subject: [PATCH 080/201] [#41] fix config import --- sgnlp/models/sentic_gcn/config.py | 10 +++++----- sgnlp/models/sentic_gcn/modeling.py | 12 +++++------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/sgnlp/models/sentic_gcn/config.py b/sgnlp/models/sentic_gcn/config.py index 0d4bdb5..c79f0e1 100644 --- a/sgnlp/models/sentic_gcn/config.py +++ b/sgnlp/models/sentic_gcn/config.py @@ -1,7 +1,7 @@ -from transformers import PreTrainedConfig, BertConfig +from transformers import PretrainedConfig, BertConfig -class SenticGCNConfig(PreTrainedConfig): +class SenticGCNConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~sgnlp.models.sentic_gcn.modeling.SenticGCNModel`. @@ -42,7 +42,7 @@ def __init__( self.loss_function = loss_function -class SenticGCNBertConfig(PreTrainedConfig): +class SenticGCNBertConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~sgnlp.models.sentic_gcn.modeling.SenticBertGCNModel`. It is used to instantiate a SenticBertGCNModel network according to the specific arguments, defining the model architecture. 
@@ -81,13 +81,13 @@ def __init__( self.loss_function = loss_function -class SenticGCNEmbeddingConfig(PreTrainedConfig): +class SenticGCNEmbeddingConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~SenticGCNEmbeddingModel`. It is used to instantiate a SenticGCN Embedding model according to the specified arguments, defining the model architecture. Args: - PreTrainedConfig (:obj:`PretrainedConfig`): transformer :obj:`PreTrainedConfig` base class + PretrainedConfig (:obj:`PretrainedConfig`): transformer :obj:`PretrainedConfig` base class """ def __init__(self, vocab_size: int = 17662, embed_dim: int = 300, **kwargs) -> None: diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py index 43084e7..9be570e 100644 --- a/sgnlp/models/sentic_gcn/modeling.py +++ b/sgnlp/models/sentic_gcn/modeling.py @@ -10,15 +10,15 @@ from transformers import PreTrainedModel, BertModel from transformers.file_utils import ModelOutput -from .modules.dynamic_rnn import DynamicLSTM -from .modules.gcn import GraphConvolution -from .config import ( +from modules.dynamic_rnn import DynamicLSTM +from modules.gcn import GraphConvolution +from config import ( SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig, ) -from .utils import build_embedding_matrix +from utils import build_embedding_matrix @dataclass @@ -118,9 +118,7 @@ def mask(self, x, aspect_double_idx): mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(self.device) return mask * x - def forward( - self, inputs: dict[str, torch.Tensor], labels: Optional[torch.Tensor] = None - ) -> SenticGCNModelOutput: + def forward(self, inputs: dict[str, torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNModelOutput: text_indices, aspect_indices, left_indices, adj = inputs text_len = torch.sum(text_indices != 0, dim=-1) aspect_len = torch.sum(aspect_indices != 0, dim=-1) From 997377d55b302cfb68e967a40241ba59e1ea0a94 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 5 Jan 2022 15:47:46 +0800 Subject: [PATCH 081/201] [#41] add SenticGCNEmbedding model as part of dataset generation step for SenticGCNModel --- sgnlp/models/sentic_gcn/config.py | 6 ++-- sgnlp/models/sentic_gcn/modeling.py | 50 +++++++++++++++-------------- sgnlp/models/sentic_gcn/utils.py | 18 +++++++++-- 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/sgnlp/models/sentic_gcn/config.py b/sgnlp/models/sentic_gcn/config.py index c79f0e1..cee9948 100644 --- a/sgnlp/models/sentic_gcn/config.py +++ b/sgnlp/models/sentic_gcn/config.py @@ -32,7 +32,7 @@ def __init__( device: str = "cuda", loss_function: str = "cross_entropy", **kwargs - ): + ) -> None: super().__init__(**kwargs) self.embed_dim = embed_dim self.hidden_dim = hidden_dim @@ -71,7 +71,7 @@ def __init__( device: str = "cuda", loss_function: str = "cross_entropy", **kwargs - ): + ) -> None: super().__init__(**kwargs) self.hidden_dim = hidden_dim self.max_seq_len = max_seq_len @@ -105,5 +105,5 @@ class SenticGCNBertEmbeddingConfig(BertConfig): BertConfig (:obj:`BertConfig`): transformer :obj:`BertConfig` base class """ - def __init__(self, **kwargs): + def __init__(self, **kwargs) -> None: super().__init__(**kwargs) diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py index 9be570e..14bf6e9 100644 --- a/sgnlp/models/sentic_gcn/modeling.py +++ b/sgnlp/models/sentic_gcn/modeling.py @@ -1,8 +1,5 @@ -import pathlib -import pickle from dataclasses import dataclass -from 
typing import Optional, Union - +from typing import Optional import torch import torch.nn as nn @@ -81,7 +78,7 @@ def __init__(self, config: SenticGCNConfig) -> None: self.gc2 = GraphConvolution(2 * config.hidden_dim, 2 * config.hidden_dim) self.fc = nn.Linear(2 * config.hidden_dim, config.polarities_dim) self.text_embed_dropout = nn.Dropout(config.dropout) - self.device = config.device + self.torch_device = torch.device(config.device) if config.loss_function == "cross_entropy": self.loss_function = nn.CrossEntropyLoss() @@ -101,7 +98,7 @@ def position_weight(self, x, aspect_double_idx, text_len, aspect_len): weight[i].append(1 - (j - aspect_double_idx[i, 1] / context_len)) for j in range(text_len[i], seq_len): weight[i].append(0) - weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2).to(self.device) + weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2).to(self.torch_device) return weight * x def mask(self, x, aspect_double_idx): @@ -115,18 +112,16 @@ def mask(self, x, aspect_double_idx): mask[i].append(1) for j in range(aspect_double_idx[i, 1] + 1, seq_len): mask[i].append(0) - mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(self.device) + mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(self.torch_device) return mask * x def forward(self, inputs: dict[str, torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNModelOutput: - text_indices, aspect_indices, left_indices, adj = inputs + text_indices, aspect_indices, left_indices, text_embeddings, adj = inputs text_len = torch.sum(text_indices != 0, dim=-1) aspect_len = torch.sum(aspect_indices != 0, dim=-1) left_len = torch.sum(left_indices != 0, dim=-1) aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1) - # TODO: How to replace embedding layer here? 
- text = self.embedding(text_indices) - text = self.text_embed_dropout(text_indices) + text = self.text_embed_dropout(text_embeddings) text_out, (_, _) = self.text_lstm(text, text_len) x = F.relu( self.gc1( @@ -199,7 +194,7 @@ def __init__(self, config: SenticGCNBertConfig) -> None: self.gc3 = GraphConvolution(config.hidden_dim, config.hidden_dim) self.fc = nn.Linear(config.hidden_dim, config.polarities_dim) self.text_embed_dropout = nn.Dropout(config.dropout) - self.device = config.device + self.torch_device = torch.device(config.device) self.max_seq_len = config.max_seq_len self.loss_function = config.loss_function @@ -219,7 +214,7 @@ def position_weight(self, x, aspect_double_idx, text_len, aspect_len): weight[i].append(1 - (j - aspect_double_idx[i, 1]) / context_len) for j in range(text_len[i], seq_len): weight[i].append(0) - weight = torch.tensor(weight).unsqueeze(2).to(self.device) + weight = torch.tensor(weight).unsqueeze(2).to(self.torch_device) return weight * x def mask(self, x, aspect_double_idx): @@ -233,11 +228,12 @@ def mask(self, x, aspect_double_idx): mask[i].append(1) for j in range(min(aspect_double_idx[i, 1] + 1, self.max_seq_len), seq_len): mask[i].append(0) - mask = torch.tensor(mask).unsqueeze(2).float().to(self.device) + mask = torch.tensor(mask).unsqueeze(2).float().to(self.torch_device) return mask * x def forward(self, inputs, labels: torch.Tensor): text_bert_indices, text_indices, aspect_indices, bert_segments_ids, left_indices, adj = inputs + # text_indices, text_ text_len = torch.sum(text_indices != 0, dim=-1) aspect_len = torch.sum(aspect_indices != 0, dim=-1) left_len = torch.sum(left_indices != 0, dim=-1) @@ -294,16 +290,22 @@ class SenticGCNEmbeddingModel(SenticGCNEmbeddingPreTrainedModel): Use the :obj:`.from_pretrained` method to load the model weights. """ - def __init__(self, config: SenticGCNEmbeddingConfig): - super().__init__() + def __init__(self, config: SenticGCNEmbeddingConfig) -> None: + super().__init__(config) self.vocab_size = config.vocab_size self.embed = nn.Embedding(config.vocab_size, config.embed_dim) - def load_pretrained_embedding(self, pretrained_embedding_path: Union[str, pathlib.Path]) -> None: - with open(pretrained_embedding_path, "rb") as emb_f: - embedding_matrix = pickle.load(emb_f) - embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float) - self.embed.weight.data.copy_(embedding_tensor) + def forward(self, token_ids: torch.Tensor) -> torch.Tensor: + """ + Encode input token ids using word embedding. + + Args: + token_ids (torch.Tensor): Tensor of token ids with shape [batch_size, num_words] + + Returns: + torch.Tensor: return Tensor of embeddings with shape (batch_size, num_words, embed_dim) + """ + return self.embed(token_ids) @classmethod def build_embedding_matrix( @@ -328,8 +330,8 @@ def build_embedding_matrix( word_vec_file_path=word_vec_file_path, vocab=vocab, embed_dim=embed_dim ) embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float) - config = SenticGCNEmbeddingConfig(vocab_size=vocab, embed_dim=embed_dim) - senticgcn_embed = cls(config) + sentic_embed_config = SenticGCNEmbeddingConfig(vocab_size=len(vocab), embed_dim=embed_dim) + senticgcn_embed = cls(sentic_embed_config) senticgcn_embed.embed.weight.data.copy_(embedding_tensor) return senticgcn_embed @@ -349,5 +351,5 @@ class SenticGCNBertEmbeddingModel(BertModel): Use the :obj:`.from_pretrained` method to load the model weights. 
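With the new `forward` method, the embedding model can be exercised directly; a minimal sketch with a hypothetical four-word vocab, assuming the classes above are importable:

    import torch
    from config import SenticGCNEmbeddingConfig
    from modeling import SenticGCNEmbeddingModel

    embed_config = SenticGCNEmbeddingConfig(vocab_size=4, embed_dim=300)
    embed_model = SenticGCNEmbeddingModel(embed_config)
    token_ids = torch.tensor([[1, 2, 3]])  # (batch_size=1, num_words=3)
    embeddings = embed_model(token_ids)
    assert embeddings.shape == (1, 3, 300)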
""" - def __init__(self, config: SenticGCNBertEmbeddingConfig): + def __init__(self, config: SenticGCNBertEmbeddingConfig) -> None: super().__init__(config) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index da062e6..2bee92c 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -12,7 +12,7 @@ import spacy import torch from torch.utils.data import random_split, Dataset -from transformers import PreTrainedTokenizer +from transformers import PreTrainedTokenizer, PreTrainedModel from transformers.tokenization_utils_base import BatchEncoding from data_class import SenticGCNTrainArgs @@ -276,7 +276,9 @@ class SenticGCNDatasetGenerator: Main dataset generator class to preprocess raw dataset file. """ - def __init__(self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer): + def __init__( + self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer, embedding_model: PreTrainedModel + ) -> None: self.config = config self.senticnet = load_and_process_senticnet( config.senticnet_word_file_path, @@ -285,6 +287,7 @@ def __init__(self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer): ) self.spacy_pipeline = spacy.load(config.spacy_pipeline) self.tokenizer = tokenizer + self.embedding_model = embedding_model self.device = ( torch.device("cuda" if torch.cuda.is_available() else "cpu") if config.device is None @@ -335,11 +338,16 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, BatchEnc graph = BatchEncoding({"input_ids": graph}) graph.convert_to_tensors("pt") + # Process embeddings + text_embeddings = self.embedding_model(text_indices["input_ids"]) + text_embeddings = BatchEncoding({"input_ids": text_embeddings}) + all_data.append( { "text_indices": text_indices.to(self.device), "aspect_indices": aspect_indices.to(self.device), "left_indices": left_indices.to(self.device), + "text_embeddings": text_embeddings.to(self.device), "polarity": polarity.to(self.device), "sdat_graph": graph.to(self.device), } @@ -375,7 +383,9 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, Batc polarity = polarity.convert_to_tensors("pt") # Process bert related indices - text_bert_indices = self.tokenizer(full_text_with_bert_tokens, return_tensors="pt") + text_bert_indices = self.tokenizer( + full_text_with_bert_tokens, return_tensors="pt", add_special_tokens=True, return_token_type_ids=True + ) text_len = np.sum(text_indices["input_ids"].numpy() != 0) aspect_len = np.sum(aspect_indices["input_ids"].numpy() != 0) @@ -385,6 +395,8 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, Batc concat_segment_indices = BatchEncoding({"input_ids": concat_segment_indices}) concat_segment_indices = concat_segment_indices.convert_to_tensors("pt") + # Process embeddings + # Process graph graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) sdat_graph = np.pad( From fecc7625e2d573c14fbe747a7a3e53a72a42a568 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 5 Jan 2022 18:47:32 +0800 Subject: [PATCH 082/201] [#41] revert changes to dataset generator to exactly match implmentation from original code --- sgnlp/models/sentic_gcn/utils.py | 118 ++++++++++++++++++++++++------- 1 file changed, 91 insertions(+), 27 deletions(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 2bee92c..1a4edac 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -6,6 +6,7 @@ 
import pathlib import requests import urllib +import math from typing import Dict, Tuple import numpy as np @@ -276,9 +277,7 @@ class SenticGCNDatasetGenerator: Main dataset generator class to preprocess raw dataset file. """ - def __init__( - self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer, embedding_model: PreTrainedModel - ) -> None: + def __init__(self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer) -> None: self.config = config self.senticnet = load_and_process_senticnet( config.senticnet_word_file_path, @@ -287,12 +286,6 @@ def __init__( ) self.spacy_pipeline = spacy.load(config.spacy_pipeline) self.tokenizer = tokenizer - self.embedding_model = embedding_model - self.device = ( - torch.device("cuda" if torch.cuda.is_available() else "cpu") - if config.device is None - else torch.device(config.device) - ) def _read_raw_dataset(self, dataset_type: str) -> list[str]: """ @@ -320,6 +313,7 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, BatchEnc Dict[str, BatchEncoding]]: return a dictionary of dataset sub-type and their tensors. """ all_data = [] + max_len = self.config.max_len for i in range(0, len(raw_data), 3): # Process full text, aspect and polarity index text_left, _, text_right = [s.lower().strip() for s in raw_data[i].partition("$T$")] @@ -328,28 +322,33 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, BatchEnc polarity = raw_data[i + 2].strip() # Process indices - text_indices = self.tokenizer(full_text, return_tensors="pt") - aspect_indices = self.tokenizer(aspect, return_tensors="pt") - left_indices = self.tokenizer(text_left, return_tensors="pt") + text_indices = self.tokenizer( + full_text, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + aspect_indices = self.tokenizer( + aspect, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + left_indices = self.tokenizer( + text_left, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) polarity = int(polarity) + 1 - polarity = BatchEncoding({"input_ids": polarity}) - polarity.convert_to_tensors("pt") graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) - graph = BatchEncoding({"input_ids": graph}) - graph.convert_to_tensors("pt") - - # Process embeddings - text_embeddings = self.embedding_model(text_indices["input_ids"]) - text_embeddings = BatchEncoding({"input_ids": text_embeddings}) - all_data.append( { - "text_indices": text_indices.to(self.device), - "aspect_indices": aspect_indices.to(self.device), - "left_indices": left_indices.to(self.device), - "text_embeddings": text_embeddings.to(self.device), - "polarity": polarity.to(self.device), - "sdat_graph": graph.to(self.device), + "text_indices": text_indices["input_ids"], + "aspect_indices": aspect_indices["input_ids"], + "left_indices": left_indices["input_ids"], + "polarity": polarity, + "sdat_graph": graph, } ) return all_data @@ -449,3 +448,68 @@ def generate_datasets(self) -> Tuple[SenticGCNDataset, SenticGCNDataset, SenticG else: val_data = test_data return SenticGCNDataset(train_data), SenticGCNDataset(val_data), SenticGCNDataset(test_data) + + +class BucketIterator: + def __init__( + self, + data: list[dict[str, BatchEncoding]], + batch_size: int, + sort_key: str = "text_indices", + shuffle=True, + sort=True, + ): + self.shuffle = shuffle + self.sort = sort + self.batches = self.sort_and_pad(data, batch_size) + self.batch_len = 
len(self.batches) + + def sort_and_pad(self, data: list[dict[str, BatchEncoding]], batch_size: int): + num_batch = int(math.ceil(len(data) / batch_size)) + if self.sort: + sorted_data = sorted(data, key=lambda x: len(x[self.sort_key])) + else: + sorted_data = data + batches = [] + for i in range(num_batch): + batches.append(self.pad_data(sorted_data[i * batch_size : (i + 1) * batch_size])) + return batches + + def pad_data(self, batch_data): + batch_text_indices = [] + batch_aspect_indices = [] + batch_left_indices = [] + batch_text_embeddings = [] + batch_polarity = [] + batch_sdat_graph = [] + max_len = max([len(t[self.sortkey]) for t in batch_data]) + for item in batch_data: + (text_indices, aspect_indices, left_indices, text_embeddings, polarity, sdat_graph,) = ( + item["text_indices"], + item["aspect_indices"], + item["left_indices"], + item["text_embeddings"], + item["polarity"], + item["sdat_graph"], + ) + # Calculate padding length + text_padding = [0] * (max_len - len(text_indices["input_ids"])) + aspect_padding = [0] * (max_len - len(aspect_indices["input_ids"])) + left_padding = [0] * (max_len - len(left_indices["input_ids"])) + text_embed_padding = [0] * (max_len - len(text_embeddings["input_ids"])) + + # Convert to tensor + text_indices["input_ids"] = torch.concat((text_indices, torch.tensor(text_padding))) + context_padding + + batch_text_indices.append(text_indices + text_padding) + batch_context_indices.append(context_indices + context_padding) + batch_aspect_indices.append(aspect_indices + aspect_padding) + batch_left_indices.append(left_indices + left_padding) + batch_text_embeddings.append(text_embeddings + text_embed_padding) + batch_polarity.append(polarity) + batch_sdat_graph.append( + np.pad(sdat_graph, ((0, max_len - len(text_indices)), (0, max_len - len(text_indices))), "constant") + ) + + return From 26fbc87a1e1b2755856d42dfd5db891994eb1067 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 5 Jan 2022 18:58:16 +0800 Subject: [PATCH 083/201] [#41] clean up and re-add bucketiterator --- sgnlp/models/sentic_gcn/utils.py | 35 +++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 1a4edac..a981372 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -302,7 +302,7 @@ def _read_raw_dataset(self, dataset_type: str) -> list[str]: lines = f.readlines() return lines - def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, BatchEncoding]: + def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, list]: """ Data preprocess method to generate all indices required for SenticGCN model training. @@ -310,10 +310,9 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, BatchEnc raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. Returns: - Dict[str, BatchEncoding]]: return a dictionary of dataset sub-type and their tensors. + Dict[str, list]]: return a dictionary of dataset sub-type and their list of values. """ all_data = [] - max_len = self.config.max_len for i in range(0, len(raw_data), 3): # Process full text, aspect and polarity index text_left, _, text_right = [s.lower().strip() for s in raw_data[i].partition("$T$")] @@ -451,6 +450,10 @@ def generate_datasets(self) -> Tuple[SenticGCNDataset, SenticGCNDataset, SenticG class BucketIterator: + """ + Iterator class for use with non-bert version of SenticGCN. 
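Sorting by sequence length before batching is what makes this a bucket iterator: each batch is padded only to its own longest member instead of a global maximum. The effect on a toy length-sorted dataset:

    import math

    data = [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]]  # already sorted by length
    batch_size = 2
    num_batch = math.ceil(len(data) / batch_size)
    batches = [data[i * batch_size : (i + 1) * batch_size] for i in range(num_batch)]
    # batch 0 pads to length 2, batch 1 to length 4: far less wasted padding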
+ """ + def __init__( self, data: list[dict[str, BatchEncoding]], @@ -461,6 +464,7 @@ def __init__( ): self.shuffle = shuffle self.sort = sort + self.sort_key = sort_key self.batches = self.sort_and_pad(data, batch_size) self.batch_len = len(self.batches) @@ -479,16 +483,14 @@ def pad_data(self, batch_data): batch_text_indices = [] batch_aspect_indices = [] batch_left_indices = [] - batch_text_embeddings = [] batch_polarity = [] batch_sdat_graph = [] max_len = max([len(t[self.sortkey]) for t in batch_data]) for item in batch_data: - (text_indices, aspect_indices, left_indices, text_embeddings, polarity, sdat_graph,) = ( + (text_indices, aspect_indices, left_indices, polarity, sdat_graph,) = ( item["text_indices"], item["aspect_indices"], item["left_indices"], - item["text_embeddings"], item["polarity"], item["sdat_graph"], ) @@ -496,20 +498,25 @@ def pad_data(self, batch_data): text_padding = [0] * (max_len - len(text_indices["input_ids"])) aspect_padding = [0] * (max_len - len(aspect_indices["input_ids"])) left_padding = [0] * (max_len - len(left_indices["input_ids"])) - text_embed_padding = [0] * (max_len - len(text_embeddings["input_ids"])) - - # Convert to tensor - text_indices["input_ids"] = torch.concat((text_indices, torch.tensor(text_padding))) - context_padding batch_text_indices.append(text_indices + text_padding) - batch_context_indices.append(context_indices + context_padding) batch_aspect_indices.append(aspect_indices + aspect_padding) batch_left_indices.append(left_indices + left_padding) - batch_text_embeddings.append(text_embeddings + text_embed_padding) batch_polarity.append(polarity) batch_sdat_graph.append( np.pad(sdat_graph, ((0, max_len - len(text_indices)), (0, max_len - len(text_indices))), "constant") ) - return + return { + "text_indices": torch.tensor(batch_text_indices), + "aspect_indices": torch.tensor(batch_aspect_indices), + "left_indices": torch.tensor(batch_left_indices), + "polarity": torch.tensor(batch_polarity), + "sdat_graph": torch.tensor(batch_sdat_graph), + } + + def __iter__(self): + if self.shuffle: + random.shuffle(self.batches) + for idx in range(self.batch_len): + yield self.batches[idx] From 74f9602e8e48acb2568df26f7ce30047b41c0baa Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 6 Jan 2022 09:02:03 +0800 Subject: [PATCH 084/201] [#41] fix wrong index access --- sgnlp/models/sentic_gcn/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index a981372..4c88f32 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -485,7 +485,7 @@ def pad_data(self, batch_data): batch_left_indices = [] batch_polarity = [] batch_sdat_graph = [] - max_len = max([len(t[self.sortkey]) for t in batch_data]) + max_len = max([len(t[self.sort_key]) for t in batch_data]) for item in batch_data: (text_indices, aspect_indices, left_indices, polarity, sdat_graph,) = ( item["text_indices"], @@ -495,9 +495,9 @@ def pad_data(self, batch_data): item["sdat_graph"], ) # Calculate padding length - text_padding = [0] * (max_len - len(text_indices["input_ids"])) - aspect_padding = [0] * (max_len - len(aspect_indices["input_ids"])) - left_padding = [0] * (max_len - len(left_indices["input_ids"])) + text_padding = [0] * (max_len - len(text_indices)) + aspect_padding = [0] * (max_len - len(aspect_indices)) + left_padding = [0] * (max_len - len(left_indices)) batch_text_indices.append(text_indices + text_padding) 
batch_aspect_indices.append(aspect_indices + aspect_padding) From b57e37def75b81e74dfbb4e256cea9114af897c4 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 6 Jan 2022 09:19:37 +0800 Subject: [PATCH 085/201] [#41] clean up bucketiterator class --- sgnlp/models/sentic_gcn/utils.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 4c88f32..1019788 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -465,10 +465,20 @@ def __init__( self.shuffle = shuffle self.sort = sort self.sort_key = sort_key - self.batches = self.sort_and_pad(data, batch_size) + self.batches = self._sort_and_pad(data, batch_size) self.batch_len = len(self.batches) - def sort_and_pad(self, data: list[dict[str, BatchEncoding]], batch_size: int): + def _sort_and_pad(self, data: list[dict[str, list]], batch_size: int) -> list[dict[str, list[torch.Tensor]]]: + """ + Private method to sort and pad input dataset. + + Args: + data (list[dict[str, list]]): input dataset + batch_size (int): batch size to split dataset + + Returns: + list[dict[str, list[torch.Tensor]]]: return list of dictionary of dataset batches + """ num_batch = int(math.ceil(len(data) / batch_size)) if self.sort: sorted_data = sorted(data, key=lambda x: len(x[self.sort_key])) @@ -476,10 +486,19 @@ def sort_and_pad(self, data: list[dict[str, BatchEncoding]], batch_size: int): sorted_data = data batches = [] for i in range(num_batch): - batches.append(self.pad_data(sorted_data[i * batch_size : (i + 1) * batch_size])) + batches.append(self._pad_data(sorted_data[i * batch_size : (i + 1) * batch_size])) return batches - def pad_data(self, batch_data): + def _pad_data(self, batch_data: dict[str, list]) -> dict[str, list[torch.Tensor]]: + """ + Private method to each sub dataset to max length for their specific batch + + Args: + batch_data (dict[str, list]): dictionary of sub dataset and their list of values + + Returns: + dict[str, list[torch.Tensor]]: return a dictionary of list of tensor values + """ batch_text_indices = [] batch_aspect_indices = [] batch_left_indices = [] From 50ba40abd43f2179b9b6df5dcc71cb5d1dfd5f62 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 6 Jan 2022 09:28:02 +0800 Subject: [PATCH 086/201] [#41] add data_cols to attribute to identify dataset subtypes --- sgnlp/models/sentic_gcn/data_class.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index 2094273..b553c43 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -111,3 +111,17 @@ def __post_init__(self): assert self.patience > 1, "Patience value must be at least 1." assert 0 >= self.valset_ratio < 1, "Valset_ratio must be greater or equals to 0 and less than 1." assert self.max_len > 0, "Max_len must be greater than 0." 
+ + # Assign sub dataset columns name + self.data_cols = ( + ["text_indices", "aspect_indices", "left_indices", "text_embeddings", "sdat_graph"] + if self.model == "senticgcn" + else [ + "text_indices", + "aspect_indices", + "left_indices", + "text_bert_indices", + "text_embeddings", + "sdat_graph", + ] + ) From 5a1af11d62510a006c518c53e83306b25ce9ed76 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 6 Jan 2022 10:53:47 +0800 Subject: [PATCH 087/201] [#41] standarised config options for model, tokenizer and embedding models --- .../config/senticnet_gcn_config.json | 26 +++- sgnlp/models/sentic_gcn/data_class.py | 122 +++++++++++++----- 2 files changed, 112 insertions(+), 36 deletions(-) diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json index 3bd7131..1d7a83f 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json @@ -1,16 +1,28 @@ { - "model": "senticgcn", - "tokenizer": "senticgcn", "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", + "save_preprocessed_senticnet": true, + "saved_preprocessed_senticnet_file_path": "senticnet/senticnet.pickle", "spacy_pipeline": "en_core_web_sm", + "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", + "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", "valset_ratio": 0, - "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", - "save_embedding_matrix": true, - "saved_embedding_matrix_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/embedding/embeddings.pickle", - "save_preprocessed_senticnet": true, - "saved_preprocessed_senticnet_file_path": "senticnet/senticnet.pickle", + + "model": "senticgcn", + "save_best_model": true, + "save_model_path": "senticgcn", + + "tokenizer": "senticgcn", + "train_tokenizer": false, + "save_tokenizer": false, + "save_tokenizer_path": "senticgcn_tokenizer", + + "embedding_model": "senticgcn_embed_model", + "build_embedding_model": false, + "save_embedding_model": false, + "save_embedding_model_path": "senticgcn_embed_model", + "initializer": "xavier_uniform", "optimizer": "adam", "loss_function": "cross_entropy", diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index b553c43..defc974 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -1,19 +1,41 @@ from dataclasses import dataclass, field +from torch.cuda.memory import memory_stats_as_nested_dict + @dataclass class SenticGCNTrainArgs: - model: str = field(default="senticgcn", metadata={"help": "Option to choose which model to train."}) - tokenizer: str = field( - default="senticgcn", - metadata={"help": "Option to choose which tokenizer to use for training preprocessing."}, - ) + """ + Data class for training config for both SenticGCNModel and SenticGCNBertModel + """ + + # External resources (e.g. 
Senticnet file, GloVe word vectors, etc) senticnet_word_file_path: str = field( default="./senticNet/senticnet_word.txt", metadata={"help": "SenticNet word file path."} ) + save_preprocessed_senticnet: str = field( + default=True, + metadata={ + "help": """Flag to indicate if senticnet dictionary should be saved during preprocess step. + If 'saved_preprocessed_senticnet_file_path' is populated and valid, it will be overwritten if flag is set to True.""" + }, + ) + saved_preprocessed_senticnet_file_path: str = field( + default="senticnet/senticnet.pickle", + metadata={ + "help": """File path to saved preprocessed senticnet, if file exists and 'save_preprocessed_senticnet' flag is set to False. + SenticNet will be loaded from file instead of generated from raw senticnet files.""" + }, + ) spacy_pipeline: str = field( default="en_core_web_sm", metadata={"help": "Type of spacy pipeline to load for processor."} ) + word_vec_file_path: str = field( + default="glove/glove.840B.300d.txt", + metadata={"help": "File path to word vector."}, + ) + + # Dataset specific config dataset_train: str = field( default="train.raw", metadata={"help": "File path to train dataset."}, @@ -30,45 +52,85 @@ class SenticGCNTrainArgs: If value is set to 0, test dataset is set as validation dataset as well.""" }, ) - word_vec_file_path: str = field( - default="glove/glove.840B.300d.txt", - metadata={"help": "File path to word vector."}, - ) - save_embedding_matrix: bool = field( + + # Model specific config + model: str = field(default="senticgcn", metadata={"help": "Option to choose which model to train."}) + save_best_model: bool = field( default=True, metadata={ - "help": """Flag to indicate if embedding matrix should be saved. - If 'saved_embedding_matrix_file_path' is populated and valid, it will be overwritten if flag is set to True. - """ + "help": """Flag to indicate if best model should be saved during training. + Applies to both bert and non-bert SenticGCN models.""" }, ) - saved_embedding_matrix_file_path: str = field( - default="embedding/embeddings.pickle", + save_model_path: str = field( + default="senticgcn", metadata={ - "help": """Full path of saved embedding matrix, if file exists and 'save_embedding_matrix' flag is set to False. - Embeddings will be generated from file instead of generated from word vector and vocab.""" + "help": """Folder path to save trained model using the save_pretrained method. + Applies to both bert and non-bert SenticGCN models.""" }, ) - save_state_dict: bool = field( - default=True, metadata={"help": "Flag to indicate if best model state_dict should be saved."} + + # Tokenizer specific config + tokenizer: str = field( + default="senticgcn", + metadata={ + "help": """Option to choose which tokenizer to use for training preprocessing. + Value will be used to create tokenizer via the from_pretrained method.""" + }, ) - saved_state_dict_folder_path: str = field( - default="/state_dict", metadata={"help": "Folder to save model state_dict."} + train_tokenizer: bool = field( + default=False, + metadata={ + "help": """Flag to indicate if tokenizer should be trained on input dataset. + Only applies to non-bert SenticGCN tokenizer.""" + }, ) - save_preprocessed_senticnet: str = field( - default=True, + save_tokenizer: bool = field( + default=False, metadata={ - "help": """Flag to indicate if senticnet dictionary should be saved during preprocess step. 
- If 'saved_preprocessed_senticnet_file_path' is populated and valid, it will be overwritten if flag is set to True.""" + "help": """Flag to indicate if tokenizer should be saved using the save_pretrained method. + Only applies to non-bert SenticGCN tokenizer.""" }, ) - saved_preprocessed_senticnet_file_path: str = field( - default="senticnet/senticnet.pickle", + save_tokenizer_path: str = field( + default="senticgcn_tokenizer", metadata={ - "help": """File path to saved preprocessed senticnet, if file exists and 'save_preprocessed_senticnet' flag is set to False. - SenticNet will be loaded from file instead of generated from raw senticnet files.""" + "help": """Folder path to save pretrained tokenizer using the save_pretrained method. + Only applies to non-bert SenticGCN tokenizer.""" }, ) + + # Embedding specific config + embedding_model: str = field( + default="senticgcn", + metadata={ + "help": """Option to choose which embeding model to use for training preprocessing. + Value will be used to create embedding model via the from_pretrained method.""" + }, + ) + build_embedding_model: bool = field( + default=False, + metadata={ + "help": """Flag to indicate if embedding model should be built from input word vectors. + Only applies to non-bert SenticGCN embedding models. + Word vectors to train on is indicated in 'word_vec_file_path' config.""" + }, + ) + save_embedding_model: bool = field( + default=False, + metadata={ + "help": """Flag to indicate if embedding model should be saved using the save_pretrained method. + Only applies to non-bert SenticGCN embedding model.""" + }, + ) + save_embedding_model_path: str = field( + default=False, + metadata={ + "help": """Folder path to save pretrained embedding model using the save_pretrained method. + Only applies to non-bert SenticGCN embeddding model.""" + }, + ) + initializer: str = field(default="xavier_uniform", metadata={"help": "Type of initalizer to use."}) optimizer: str = field(default="adam", metadata={"help": "Type of optimizer to use."}) loss_function: str = field(default="cross_entropy", metadata={"help": "Loss function for training/eval."}) @@ -91,7 +153,9 @@ class SenticGCNTrainArgs: max_len: int = field(default=85, metadata={"help": "Max length to pad for bert tokenizer."}) def __post_init__(self): + # Model assert self.model in ["senticgcn", "senticgcnbert"], "Invalid model type!" + assert self.initializer in [ "xavier_uniform", "xavier_uniform", From 14e0a8e0322e327b79c3ac0b9e6d680bb7c2f15e Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 6 Jan 2022 11:42:43 +0800 Subject: [PATCH 088/201] [#41] provide more clarification on embedding model field --- sgnlp/models/sentic_gcn/data_class.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index defc974..9fe058d 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -105,7 +105,11 @@ class SenticGCNTrainArgs: default="senticgcn", metadata={ "help": """Option to choose which embeding model to use for training preprocessing. - Value will be used to create embedding model via the from_pretrained method.""" + For non-bert model, value should point to a pretraine model folder. + 'config.json' and 'pytorch_model.bin' will be used to create the config and embedding model + via the from_pretrained method. + Ignore if 'build_embedding_model' flag is set, only affects non-bert SenticGCN embedding model. 
From dcd3bca51e72a626055e197666801a70d44a0ae0 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Thu, 6 Jan 2022 14:25:50 +0800
Subject: [PATCH 089/201] [#41] completed create tokenizer, embeddings and dataloaders pipeline

---
 sgnlp/models/sentic_gcn/data_class.py   |   8 +-
 sgnlp/models/sentic_gcn/modeling.py     |   2 +-
 sgnlp/models/sentic_gcn/tokenization.py |   2 +-
 sgnlp/models/sentic_gcn/train.py        | 421 +++++++++++++++++-------
 4 files changed, 311 insertions(+), 122 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py
index 9fe058d..0bdcdea 100644
--- a/sgnlp/models/sentic_gcn/data_class.py
+++ b/sgnlp/models/sentic_gcn/data_class.py
@@ -72,7 +72,7 @@ class SenticGCNTrainArgs:

     # Tokenizer specific config
     tokenizer: str = field(
-        default="senticgcn",
+        default="senticgcn_tokenizer",
         metadata={
             "help": """Option to choose which tokenizer to use for training preprocessing.
             Value will be used to create tokenizer via the from_pretrained method."""
         },
     )
@@ -81,7 +81,7 @@ class SenticGCNTrainArgs:
     train_tokenizer: bool = field(
         default=False,
         metadata={
-            "help": """Flag to indicate if tokenizer should be trained on input dataset.
+            "help": """Flag to indicate if tokenizer should be trained on the train and test input datasets.
             Only applies to non-bert SenticGCN tokenizer."""
         },
     )
@@ -102,7 +102,7 @@ class SenticGCNTrainArgs:

     # Embedding specific config
     embedding_model: str = field(
-        default="senticgcn",
+        default="senticgcn_embed_model",
         metadata={
             "help": """Option to choose which embedding model to use for training preprocessing.
             For non-bert model, value should point to a pretrained model folder.
@@ -128,7 +128,7 @@ class SenticGCNTrainArgs:
     save_embedding_model_path: str = field(
-        default=False,
+        default="senticgcn_embed_model",
         metadata={
             "help": """Folder path to save pretrained embedding model using the save_pretrained method.
             Only applies to non-bert SenticGCN embedding model."""
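For the build path controlled by the embedding flags above, the renamed classmethod in the modeling.py hunk below is the entry point. A short usage sketch (vocab is the token-to-index dict from the tokenizer; argument order follows the call site in the trainer later in this series):

    embed_model = SenticGCNEmbeddingModel.build_embedding_model("glove/glove.840B.300d.txt", vocab, 300)
    embed_model.save_pretrained("senticgcn_embed_model")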
diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py
index 14bf6e9..3603ccb 100644
--- a/sgnlp/models/sentic_gcn/modeling.py
+++ b/sgnlp/models/sentic_gcn/modeling.py
@@ -308,7 +308,7 @@ def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
         return self.embed(token_ids)

     @classmethod
-    def build_embedding_matrix(
+    def build_embedding_model(
         cls,
         word_vec_file_path: str,
         vocab: dict[str, int],
diff --git a/sgnlp/models/sentic_gcn/tokenization.py b/sgnlp/models/sentic_gcn/tokenization.py
index 2645cf2..0bf0c0d 100644
--- a/sgnlp/models/sentic_gcn/tokenization.py
+++ b/sgnlp/models/sentic_gcn/tokenization.py
@@ -83,7 +83,7 @@ def __read_text_file(file_names: List[str]) -> str:
         return text

     def create_vocab(self, train_files: List[str]) -> Dict[str, int]:
-        text = self.__read_text_file(train_files)
+        text = SenticGCNTokenizer.__read_text_file(train_files)
         if self.do_lower_case:
             text = text.lower()
         vocab = {}
diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py
index a38e57d..6b86e90 100644
--- a/sgnlp/models/sentic_gcn/train.py
+++ b/sgnlp/models/sentic_gcn/train.py
@@ -1,24 +1,29 @@
-import datetime
 import logging
 import math
 import pathlib
+from typing import Tuple, Union

-import numpy as np
 from sklearn.metrics import f1_score
 import torch
 import torch.nn as nn
 import torch.optim as optim
+from torch.utils.data.dataloader import DataLoader

+from config import SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig
 from data_class import SenticGCNTrainArgs
-from modeling import SenticGCNBertPreTrainedModel
+from modeling import SenticGCNBertPreTrainedModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel
 from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer
-from utils import parse_args_and_load_config, set_random_seed, ABSADatasetReader
+from utils import parse_args_and_load_config, set_random_seed, SenticGCNDatasetGenerator, BucketIterator

 logging.basicConfig(level=logging.DEBUG)


 class SenticGCNBaseTrainer:
+    """
+    Base Trainer class used for training SenticGCNModel and SenticGCNBertModel
+    """
+
     def __init__(self, config: SenticGCNTrainArgs):
         self.config = config
         self.global_max_acc = 0.0
@@ -28,13 +33,14 @@ def __init__(self, config: SenticGCNTrainArgs):
             if not self.config.device
             else torch.device(self.config.device)
         )
-        tokenizer = self._create_tokenizer()
-        # self.dataloader
         if config.save_state_dict:
             self.save_state_dict_folder = pathlib.Path(self.config.saved_state_dict_folder_path)
             self.save_state_dict_folder.mkdir(exist_ok=True)

     def _create_initializers(self):
+        """
+        Private helper method to instantiate initializer.
+        """
         initializers = {
             "xavier_uniform_": nn.init.xavier_uniform_,
             "xavier_normal_": nn.init.xavier_normal,
             "orthogonal": nn.init.orthogonal_,
         }
         return initializers[self.config.initializer]

     def _create_optimizer(self):
+        """
+        Private helper method to instantiate optimizer.
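+
+        Example (illustrative; assumes config.optimizer is "adam")::
+
+            _params = filter(lambda p: p.requires_grad, self.model.parameters())
+            optimizer = self._create_optimizer()(_params, lr=self.config.learning_rate, weight_decay=self.config.l2reg)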
+ """ optimizers = { "adadelta": optim.Adadelta, "adagrad": optim.Adagrad, @@ -54,24 +63,37 @@ def _create_optimizer(self): } return optimizers[self.config.optimizer] - def _create_tokenizer(self): - self.tokenizer = ( - SenticGCNBertTokenizer.from_pretrained(self.config.tokenizer) - if self.config.model == "senticgcn" - else SenticGCNBertTokenizer.from_pretrained(self.config.tokenizer) - ) + def _reset_params(self) -> None: + raise NotImplementedError("Please call from derived class only.") - def _reset_params(self): + def _generate_data_loaders( + self, + ) -> Union[Tuple[DataLoader, DataLoader, DataLoader], Tuple[BucketIterator, BucketIterator, BucketIterator]]: + raise NotImplementedError("Please call from derived class only.") + + def _create_tokenizer(self) -> Union[SenticGCNTokenizer, SenticGCNBertTokenizer]: raise NotImplementedError("Please call from derived class only.") - def _evaluate_acc_f1(self): + def _create_embedding_model(self) -> Union[SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]: + raise NotImplementedError("Please call from derived class only.") + + def _evaluate_acc_f1(self, dataloader: DataLoader) -> Tuple[float, float]: + """ + Private helper method to evaluate accuracy and f1 score. + + Args: + dataloader (DataLoader): input val and test dataloader + + Returns: + Tuple[float, float]: return acc and f1 score + """ self.model.eval() n_correct, n_total = 0, 0 t_targets_all, t_outputs_all = None, None with torch.no_grad(): - for _, t_batch in enumerate(self.dataset_test): - t_inputs = [t_batch[col].to(self.device) for col in t_batch.keys()] - t_targets = t_batch["polarity"].to(self.device) + for _, t_batch in enumerate(dataloader): + t_inputs = [t_batch[col] for col in t_batch.keys() if col != "polarity"] + t_targets = t_batch["polarity"] t_outputs = self.model(t_inputs) n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item() @@ -87,108 +109,150 @@ def _evaluate_acc_f1(self): f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro") return test_acc, f1 - def _save_state_dict(self, epoch: int) -> pathlib.Path: - curr_dt = datetime.datetime.now() - curr_dt_str = curr_dt.strftime("%Y-%m-%d_%H%M%S") - filename = f"{self.config.model}_epoch_{epoch}_{curr_dt_str}.pkl" - full_path = self.save_state_dict_folder.joinpath(filename) - try: - torch.save(self.model.state_dict(), full_path) - except: - raise Exception("Error saving model state dict!") - return full_path - - def _train_epoch(self, criterion: function, optimizer: function) -> pathlib.Path: - max_val_acc, max_val_f1 = 0, 0 - max_val_epoch = 0 - global_step = 0 - path = None - - for epoch in range(self.config.epochs): - n_correct, n_total, loss_total = 0, 0, 0 - self.model.train() - for _, batch in enumerate(self.dataloader_train): - global_step += 1 - optimizer.zero_grad() - - inputs = [batch[col].to(self.device) for col in batch.keys()] - targets = batch["polarity"].to(self.device) - outputs = self.model(inputs) - loss = criterion(outputs, targets) - loss.backward() - optimizer.step() - - n_correct += (torch.argmax(outputs, -1) == targets).sum().item() - n_total += len(outputs) - loss_total += loss.item() * len(outputs) - - if global_step % self.config.log_step == 0: - train_acc = n_correct / n_total - train_loss = loss_total / n_total - logging.info(f"Train Acc: {train_acc:.4f}, Train Loss: {train_loss:.4f}") - - val_acc, val_f1 = self._evaluate_acc_f1() - logging.info( - f""" - Epoch: {epoch} - Test Acc: {val_acc:.4f} - Test Loss: {val_f1:.4f} 
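        # A self-contained illustration of the accuracy / macro-F1 computation in
        # _evaluate_acc_f1 above, using dummy logits and labels:
        #     logits = torch.tensor([[0.1, 0.8, 0.1], [0.7, 0.2, 0.1], [0.2, 0.3, 0.5]])
        #     targets = torch.tensor([1, 0, 2])
        #     preds = torch.argmax(logits, -1)                       # tensor([1, 0, 2])
        #     acc = (preds == targets).sum().item() / len(targets)   # 1.0 for this toy batch
        #     f1 = f1_score(targets, preds, labels=[0, 1, 2], average="macro")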
- """ - ) - if val_f1 > max_val_f1: - max_val_f1 = val_f1 - - if val_acc > max_val_acc: - max_val_acc = val_acc - max_val_epoch = epoch - if self.config.save_state_dict: - path = self._save_state_dict(epoch) - logging.info( - f""" - Best model saved. Acc: {max_val_acc:.4f}, F1: {max_val_f1}, Epoch: {max_val_epoch} - """ - ) - - if epoch - max_val_epoch >= self.config.patience: - logging.info(f"Early stopping") - break - return path + # def _save_state_dict(self, epoch: int) -> pathlib.Path: + # curr_dt = datetime.datetime.now() + # curr_dt_str = curr_dt.strftime("%Y-%m-%d_%H%M%S") + # filename = f"{self.config.model}_epoch_{epoch}_{curr_dt_str}.pkl" + # full_path = self.save_state_dict_folder.joinpath(filename) + # try: + # torch.save(self.model.state_dict(), full_path) + # except: + # raise Exception("Error saving model state dict!") + # return full_path + + def _train_epoch( + self, criterion: function, optimizer: function, train_dataloader: DataLoader, val_dataloader: DataLoader + ) -> pathlib.Path: + # max_val_acc, max_val_f1 = 0, 0 + # max_val_epoch = 0 + # global_step = 0 + # path = None + + # for epoch in range(self.config.epochs): + # n_correct, n_total, loss_total = 0, 0, 0 + # self.model.train() + # for _, batch in enumerate(train_dataloader): + # global_step += 1 + # optimizer.zero_grad() + + # inputs = [batch[col]["input_ids"] for col in batch.keys() if col != "polarity"] + # targets = batch["polarity"]["input_ids"] + # outputs = self.model(inputs) + # loss = criterion(outputs, targets) + # loss.backward() + # optimizer.step() + + # n_correct += (torch.argmax(outputs, -1) == targets).sum().item() + # n_total += len(outputs) + # loss_total += loss.item() * len(outputs) + + # if global_step % self.config.log_step == 0: + # train_acc = n_correct / n_total + # train_loss = loss_total / n_total + # logging.info(f"Train Acc: {train_acc:.4f}, Train Loss: {train_loss:.4f}") + + # val_acc, val_f1 = self._evaluate_acc_f1(val_dataloader) + # logging.info( + # f""" + # Epoch: {epoch} + # Test Acc: {val_acc:.4f} + # Test Loss: {val_f1:.4f} + # """ + # ) + # if val_f1 > max_val_f1: + # max_val_f1 = val_f1 + + # if val_acc > max_val_acc: + # max_val_acc = val_acc + # max_val_epoch = epoch + # if self.config.save_state_dict: + # path = self._save_state_dict(epoch) + # logging.info( + # f""" + # Best model saved. 
Acc: {max_val_acc:.4f}, F1: {max_val_f1}, Epoch: {max_val_epoch} + # """ + # ) + + # if epoch - max_val_epoch >= self.config.patience: + # logging.info(f"Early stopping") + # break + # return path + pass def train(self): - criterion = nn.CrossEntropyLoss() - _params = filter(lambda p: p.requires_grad, self.model.parameters()) - optimizer = self._create_optimizer()(_params, lr=self.config.learning_rate, weight_decay=self.config.l2reg) - - test_accs, test_f1s = [], [] - for i in range(self.config.repeats): - logging.info(f"Start overall train loop : {i + 1}") - - self._reset_params() - test_acc, test_f1 = self._train_epoch(criterion, optimizer) - test_accs.append(test_acc) - test_f1s.append(test_f1) - - logging.info(f"Test_acc: {test_acc}, Test_f1: {test_f1}") - test_accs_avg = np.sum(test_accs) / self.config.repeats - test_f1s_avg = np.sum(test_f1s) / self.config.repeats - max_accs = np.max(test_accs) - max_f1s = np.max(test_f1s) - - logging.info( - f""" - Test acc average: {test_accs_avg} - Test f1 average: {test_f1s_avg} - Test acc max: {max_accs} - Test f1 max: {max_f1s} - """ - ) - + # criterion = nn.CrossEntropyLoss() + # _params = filter(lambda p: p.requires_grad, self.model.parameters()) + # optimizer = self._create_optimizer()(_params, lr=self.config.learning_rate, weight_decay=self.config.l2reg) + + # train_dataloader, val_dataloader, test_dataloader = self._generate_data_loaders() + + # test_accs, test_f1s = [], [] + # for i in range(self.config.repeats): + # logging.info(f"Start overall train loop : {i + 1}") + + # self._reset_params() + # test_acc, test_f1 = self._train_epoch(criterion, optimizer, train_dataloader, val_dataloader) + # test_accs.append(test_acc) + # test_f1s.append(test_f1) + + # logging.info(f"Test_acc: {test_acc}, Test_f1: {test_f1}") + # test_accs_avg = np.sum(test_accs) / self.config.repeats + # test_f1s_avg = np.sum(test_f1s) / self.config.repeats + # max_accs = np.max(test_accs) + # max_f1s = np.max(test_f1s) + + # logging.info( + # f""" + # Test acc average: {test_accs_avg} + # Test f1 average: {test_f1s_avg} + # Test acc max: {max_accs} + # Test f1 max: {max_f1s} + # """ + # ) + pass + + +class SenticGCNBertTrainer(SenticGCNBaseTrainer): + """ + Trainer class derived from SenticGCNBaseTrainer. Used for training SenticGCNBertModel. + + Args: + config (SenticGCNTrainArgs): Training config for SenticGCNBertModel + """ -class SenticBertGCNTrainer(SenticGCNBaseTrainer): def __init__(self, config: SenticGCNTrainArgs): + super().__init__(config) self.config = config + tokenizer = self._create_tokenizer() + self.embed = self._create_embedding_model() + data_gen = SenticGCNDatasetGenerator(config, tokenizer) + self.train_data, self.val_data, self.test_data = data_gen.generate_datasets() + del data_gen + + def _create_tokenizer(self) -> SenticGCNBertTokenizer: + """ + Private method to construct tokenizer via the from_pretrained method. + + Returns: + SenticGCNBertTokenizer: return a SenticGCNBertTokenizer instance. + """ + return SenticGCNBertTokenizer.from_pretrained(self.config.tokenizer) + + def _create_embedding_model(self) -> SenticGCNBertEmbeddingModel: + """ + Private helper method to create the bert based embedding models. 
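+
+        Example (illustrative shape flow, assuming the default "bert-base-uncased" weights)::
+
+            tokens = tokenizer("the food was great", padding="max_length", max_length=85, return_tensors="pt")
+            embeddings = self.embed(tokens["input_ids"])   # approximately (1, 85, 768)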
+ + Returns: + SenticGCNBertEmbeddingModel: return instance of pretrained SenticGCNBertEmbeddingModel + """ + config = SenticGCNBertEmbeddingConfig.from_pretrained(self.config.embedding_model) + return SenticGCNBertEmbeddingModel.from_pretrained(self.config.embedding_model, config=config) def _reset_params(self): + """ + Private helper method to reset model parameters. + To be used during repeats train loop. + """ for child in self.model.children(): if type(child) != SenticGCNBertPreTrainedModel: for param in child.parameters(): @@ -199,12 +263,83 @@ def _reset_params(self): stdv = 1.0 / math.sqrt(param.shape[0]) nn.init.uniform_(param, a=-stdv, b=stdv) + def _generate_data_loaders(self) -> Tuple[DataLoader, DataLoader, DataLoader]: + """ + Private helper method to generate train, val and test dataloaders. + + Returns: + Tuple[DataLoader, DataLoader, DataLoader]: return train, val and test dataloaders. + """ + train_dataloader = DataLoader(self.train_data, batch_size=self.config.batch_size, shuffle=True) + val_dataloader = DataLoader(self.val_data, batch_size=self.config.batch_size, shuffle=False) + test_dataloader = DataLoader(self.test_data, batch_size=self.config.batch_size, shuffle=False) + return train_dataloader, val_dataloader, test_dataloader + class SenticGCNTrainer(SenticGCNBaseTrainer): - def __init__(self, config: SenticGCNTrainArgs): + """ + Trainer class derived from SenticGCNBaseTrainer. Used for training SenticGCNModel. + + Args: + config (SenticGCNTrainArgs): Training config for SenticGCNModel + """ + + def __init__(self, config: SenticGCNTrainArgs) -> None: + super().__init__(config) self.config = config + tokenizer = self._create_tokenizer() + self.embed = self._create_embedding_model(tokenizer.vocab) + data_gen = SenticGCNDatasetGenerator(config, tokenizer) + self.train_data, self.val_data, self.test_data = data_gen.generate_datasets() + del data_gen - def _reset_params(self): + def _create_tokenizer(self) -> SenticGCNTokenizer: + """ + Private method to construct tokenizer either via the from_pretrained method or + constructing the tokenizer using input dataset files. + + Returns: + SenticGCNTokenizer: return a SenticGCNTokenizer instance. + """ + if not self.config.train_tokenizer: + return SenticGCNTokenizer.from_pretrained(self.config.tokenizer) + else: + tokenizer = SenticGCNTokenizer( + train_files=[self.config.dataset_train, self.config.dataset_test], train_vocab=True + ) + if self.config.save_tokenizer: + tokenizer.save_pretrained(self.config.save_tokenizer_path) + return tokenizer + + def _create_embedding_model(self, vocab: dict[str, int]) -> SenticGCNEmbeddingModel: + """ + Private method to construct embedding model either via the from_pretrained method or + building the embedding model from word vector files. (e.g. GloVe word vectors) + + Args: + vocab (dict[str, int]): dictionary of vocab from tokenizer + + Returns: + SenticGCNEmbeddingModel: return a SenticGCNEmbeddingModel instance. 
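+
+        Example (illustrative vocab format expected from the tokenizer)::
+
+            vocab = {"<pad>": 0, "<unk>": 1, "the": 2, "food": 3}
+            embed_model = self._create_embedding_model(vocab)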
+ """ + if not self.config.build_embedding_model: + config_path = pathlib.Path(self.config.embedding_model).joinpath("config.json") + embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_path) + embed_path = pathlib.Path(self.config.embedding_model).joinpath("pytorch_model.bin") + return SenticGCNEmbeddingModel.from_pretrained(embed_path, config=embed_config) + else: + embedding_model = SenticGCNEmbeddingModel.build_embedding_model( + self.config.word_vec_file_path, vocab, self.config.embed_dim + ) + if self.config.save_embedding_model: + embedding_model.save_pretrained(self.config.save_embedding_model_path) + return embedding_model + + def _reset_params(self) -> None: + """ + Private helper method to reset model parameters. + To be used during repeats train loop. + """ for param in self.modelparameters(): if param.requires_grad: if len(param.shape) > 1: @@ -213,10 +348,64 @@ def _reset_params(self): stdv = 1.0 / math.sqrt(param.shape[0]) nn.init.uniform_(param, a=-stdv, b=stdv) + def _generate_data_loaders(self) -> Tuple[BucketIterator, BucketIterator, BucketIterator]: + """ + Private helper method to generate train, val and test dataloaders. + + Returns: + Tuple[BucketIterator, BucketIterator, BucketIterator]: return train, val and test bucketiterators. + """ + train_dataloader = BucketIterator(self.train_data, batch_size=self.config.batch_size, shuffle=True) + val_dataloader = BucketIterator(self.val_data, batch_size=self.config.batch_size, shuffle=False) + test_dataloader = BucketIterator(self.test_data, batch_size=self.config.batch_size, shuffle=False) + return train_dataloader, val_dataloader, test_dataloader + if __name__ == "__main__": - cfg = parse_args_and_load_config() + # cfg = parse_args_and_load_config() + args = { + "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", + "save_preprocessed_senticnet": True, + "saved_preprocessed_senticnet_file_path": "senticnet/senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", + "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", + "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", + "valset_ratio": 0, + "model": "senticgcn", + "save_best_model": True, + "save_model_path": "senticgcn", + "tokenizer": "senticgcn", + "train_tokenizer": False, + "save_tokenizer": False, + "save_tokenizer_path": "senticgcn_tokenizer", + "embedding_model": "senticgcn_embed_model", + "build_embedding_model": False, + "save_embedding_model": False, + "save_embedding_model_path": "senticgcn_embed_model", + "initializer": "xavier_uniform", + "optimizer": "adam", + "loss_function": "cross_entropy", + "learning_rate": 0.001, + "l2reg": 0.00001, + "epochs": 100, + "batch_size": 32, + "log_step": 5, + "embed_dim": 300, + "hidden_dim": 300, + "polarities_dim": 3, + "dropout": 0.3, + "save_results": True, + "seed": 776, + "device": "cuda", + "repeats": 10, + "patience": 5, + "max_len": 85, + } + from data_class import SenticGCNTrainArgs + + cfg = SenticGCNTrainArgs(**args) if cfg.seed is not None: set_random_seed(cfg.seed) - trainer = SenticGCNTrainer(cfg) if cfg.model == "senticgcn" else SenticBertGCNTrainer(cfg) + trainer = SenticGCNTrainer(cfg) if cfg.model == "senticgcn" else SenticGCNBertTrainer(cfg) trainer.train() From 
54c587116cc4783ff6669f1792302e405d8b6263 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Thu, 6 Jan 2022 17:15:59 +0800
Subject: [PATCH 090/201] [#41] first draft complete end to end trainer construction

---
 sgnlp/models/sentic_gcn/data_class.py |   6 +-
 sgnlp/models/sentic_gcn/train.py      | 380 ++++++++++++++++++--------
 2 files changed, 272 insertions(+), 114 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py
index 0bdcdea..e3794e5 100644
--- a/sgnlp/models/sentic_gcn/data_class.py
+++ b/sgnlp/models/sentic_gcn/data_class.py
@@ -161,9 +161,9 @@ def __post_init__(self):
         assert self.model in ["senticgcn", "senticgcnbert"], "Invalid model type!"

         assert self.initializer in [
-            "xavier_uniform",
-            "xavier_uniform",
-            "orthogonal",
+            "xavier_uniform_",
+            "xavier_normal_",
+            "orthogonal_",
         ], "Invalid initializer type!"
         assert self.optimizer in [
             "adadelta",
diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py
index 6b86e90..d3d5a4b 100644
--- a/sgnlp/models/sentic_gcn/train.py
+++ b/sgnlp/models/sentic_gcn/train.py
@@ -1,17 +1,26 @@
 import logging
 import math
 import pathlib
+import pickle
+import shutil
+import tempfile
 from typing import Tuple, Union

-from sklearn.metrics import f1_score
 import torch
 import torch.nn as nn
 import torch.optim as optim
+from sklearn.metrics import f1_score
 from torch.utils.data.dataloader import DataLoader

-from config import SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig
+from config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig
 from data_class import SenticGCNTrainArgs
-from modeling import SenticGCNBertPreTrainedModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel
+from modeling import (
+    SenticGCNBertPreTrainedModel,
+    SenticGCNModel,
+    SenticGCNBertModel,
+    SenticGCNEmbeddingModel,
+    SenticGCNBertEmbeddingModel,
+)
 from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer
 from utils import parse_args_and_load_config, set_random_seed, SenticGCNDatasetGenerator, BucketIterator

@@ -28,14 +37,15 @@ def __init__(self, config: SenticGCNTrainArgs):
         self.config = config
         self.global_max_acc = 0.0
         self.global_max_f1 = 0.0
+        self.global_best_model_tmpdir = None
         self.device = (
             torch.device("cuda" if torch.cuda.is_available() else "cpu")
             if not self.config.device
             else torch.device(self.config.device)
         )
-        if config.save_state_dict:
-            self.save_state_dict_folder = pathlib.Path(self.config.saved_state_dict_folder_path)
-            self.save_state_dict_folder.mkdir(exist_ok=True)
+        self.initializer = self._create_initializers()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            self.temp_dir = pathlib.Path(tmpdir)

@@ -43,12 +53,12 @@ def _create_initializers(self):
         """
         initializers = {
             "xavier_uniform_": nn.init.xavier_uniform_,
-            "xavier_normal_": nn.init.xavier_normal,
+            "xavier_normal_": nn.init.xavier_normal_,
             "orthogonal": nn.init.orthogonal_,
         }
         return initializers[self.config.initializer]

-    def _create_optimizer(self):
+    def _create_optimizer(self, params, lr, weight_decay):
         """
         Private helper method to instantiate optimizer.
""" @@ -61,7 +71,7 @@ def _create_optimizer(self): "rmsprop": optim.RMSprop, "sgd": optim.SGD, } - return optimizers[self.config.optimizer] + return optimizers[self.config.optimizer](params, lr=lr, weight_decay=weight_decay) def _reset_params(self) -> None: raise NotImplementedError("Please call from derived class only.") @@ -77,6 +87,25 @@ def _create_tokenizer(self) -> Union[SenticGCNTokenizer, SenticGCNBertTokenizer] def _create_embedding_model(self) -> Union[SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]: raise NotImplementedError("Please call from derived class only.") + def _save_model(self) -> None: + """ + Private helper method to save the pretrained model. + """ + if self.config.save_best_model: + self.model.save_pretrained(self.config.save_model_path) + + def _clean_temp_dir(self, result_records: dict[str, dict[str, float]]) -> None: + """ + Helper method to clean up temp dir and model weights from repeat train loops. + + Args: + result_records (dict[str, dict[str, float]]): dictionary of result_records after training. + """ + for key, val in result_records.items(): + if key == "test": + continue + shutil.rmtree(val["tmp_dir"], ignore_errors=True) + def _evaluate_acc_f1(self, dataloader: DataLoader) -> Tuple[float, float]: """ Private helper method to evaluate accuracy and f1 score. @@ -92,10 +121,14 @@ def _evaluate_acc_f1(self, dataloader: DataLoader) -> Tuple[float, float]: t_targets_all, t_outputs_all = None, None with torch.no_grad(): for _, t_batch in enumerate(dataloader): - t_inputs = [t_batch[col] for col in t_batch.keys() if col != "polarity"] + # Prepare input data and targets + t_inputs = [t_batch[col] for col in self.config.data_cols] t_targets = t_batch["polarity"] + + # Inference t_outputs = self.model(t_inputs) + # Calculate loss n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item() n_total += len(t_outputs) @@ -109,97 +142,123 @@ def _evaluate_acc_f1(self, dataloader: DataLoader) -> Tuple[float, float]: f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro") return test_acc, f1 - # def _save_state_dict(self, epoch: int) -> pathlib.Path: - # curr_dt = datetime.datetime.now() - # curr_dt_str = curr_dt.strftime("%Y-%m-%d_%H%M%S") - # filename = f"{self.config.model}_epoch_{epoch}_{curr_dt_str}.pkl" - # full_path = self.save_state_dict_folder.joinpath(filename) - # try: - # torch.save(self.model.state_dict(), full_path) - # except: - # raise Exception("Error saving model state dict!") - # return full_path - - def _train_epoch( - self, criterion: function, optimizer: function, train_dataloader: DataLoader, val_dataloader: DataLoader + def _train_loop( + self, + criterion, + optimizer, + train_dataloader: DataLoader, + val_dataloader: DataLoader, + tmpdir: pathlib.Path, ) -> pathlib.Path: - # max_val_acc, max_val_f1 = 0, 0 - # max_val_epoch = 0 - # global_step = 0 - # path = None - - # for epoch in range(self.config.epochs): - # n_correct, n_total, loss_total = 0, 0, 0 - # self.model.train() - # for _, batch in enumerate(train_dataloader): - # global_step += 1 - # optimizer.zero_grad() - - # inputs = [batch[col]["input_ids"] for col in batch.keys() if col != "polarity"] - # targets = batch["polarity"]["input_ids"] - # outputs = self.model(inputs) - # loss = criterion(outputs, targets) - # loss.backward() - # optimizer.step() - - # n_correct += (torch.argmax(outputs, -1) == targets).sum().item() - # n_total += len(outputs) - # loss_total += loss.item() * len(outputs) - - # if global_step % 
self.config.log_step == 0: - # train_acc = n_correct / n_total - # train_loss = loss_total / n_total - # logging.info(f"Train Acc: {train_acc:.4f}, Train Loss: {train_loss:.4f}") - - # val_acc, val_f1 = self._evaluate_acc_f1(val_dataloader) - # logging.info( - # f""" - # Epoch: {epoch} - # Test Acc: {val_acc:.4f} - # Test Loss: {val_f1:.4f} - # """ - # ) - # if val_f1 > max_val_f1: - # max_val_f1 = val_f1 - - # if val_acc > max_val_acc: - # max_val_acc = val_acc - # max_val_epoch = epoch - # if self.config.save_state_dict: - # path = self._save_state_dict(epoch) - # logging.info( - # f""" - # Best model saved. Acc: {max_val_acc:.4f}, F1: {max_val_f1}, Epoch: {max_val_epoch} - # """ - # ) - - # if epoch - max_val_epoch >= self.config.patience: - # logging.info(f"Early stopping") - # break - # return path - pass + max_val_acc, max_val_f1 = 0, 0 + max_val_epoch = 0 + global_step = 0 + path = None + + for epoch in range(self.config.epochs): + logging.info(f"Training epoch: {epoch + 1}") + n_correct, n_total, loss_total = 0, 0, 0 + self.model.train() + for _, batch in enumerate(train_dataloader): + global_step += 1 + optimizer.zero_grad() + + # Prepare input data and targets + inputs = [batch[col] for col in self.config.data_cols] + targets = batch["polarity"] + + # Inference + outputs = self.model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + # Calculate loss + n_correct += (torch.argmax(outputs, -1) == targets).sum().item() + n_total += len(outputs) + loss_total += loss.item() * len(outputs) + + # Report batch loop step results + if global_step % self.config.log_step == 0: + train_acc = n_correct / n_total + train_loss = loss_total / n_total + logging.info(f"Train Acc: {train_acc:.4f}, Train Loss: {train_loss:.4f}") + + # Run eval for validation dataloader + val_acc, val_f1 = self._evaluate_acc_f1(val_dataloader) + logging.info( + f""" + Epoch: {epoch} + Test Acc: {val_acc:.4f} + Test Loss: {val_f1:.4f} + """ + ) - def train(self): - # criterion = nn.CrossEntropyLoss() - # _params = filter(lambda p: p.requires_grad, self.model.parameters()) - # optimizer = self._create_optimizer()(_params, lr=self.config.learning_rate, weight_decay=self.config.l2reg) + # Report new max F1 + if val_f1 > max_val_f1: + logging.info(f"New max F1: {val_f1:.4f} @ epoch {epoch}") + max_val_f1 = val_f1 + + # Report new max acc and save if required + if val_acc > max_val_acc: + logging.info(f"New max Accuracy: {val_acc:.4f} @ epoch {epoch}") + max_val_acc = val_acc + max_val_epoch = epoch + self.model.save_pretrained(tmpdir) + logging.info( + f""" + Best model saved. 
Acc: {max_val_acc:.4f}, F1: {max_val_f1}, Epoch: {max_val_epoch} + """ + ) + + # Early stopping + if epoch - max_val_epoch >= self.config.patience: + logging.info(f"Early stopping") + break + return max_val_acc, max_val_f1, max_val_epoch + + def _train( + self, train_dataloader: Union[DataLoader, BucketIterator], val_dataloader: Union[DataLoader, BucketIterator] + ) -> dict[str, dict[str, Union[int, float]]]: + criterion = nn.CrossEntropyLoss() + _params = filter(lambda p: p.requires_grad, self.model.parameters()) + optimizer = self._create_optimizer(_params, lr=self.config.learning_rate, weight_decay=self.config.l2reg) + + repeat_result = {} + for i in range(self.config.repeats): + logging.info(f"Start repeat train loop : {i + 1}") + repeat_tmpdir = self.temp_dir.joinpath(f"repeat{i + 1}") + + self._reset_params() + max_val_acc, max_val_f1, max_val_epoch = self._train_loop( + criterion, optimizer, train_dataloader, val_dataloader, repeat_tmpdir + ) - # train_dataloader, val_dataloader, test_dataloader = self._generate_data_loaders() + # Record repeat runs + repeat_result[f"Repeat_{i + 1}"] = { + "max_val_acc": max_val_acc, + "max_val_f1": max_val_f1, + "max_val_epoch": max_val_epoch, + "tmp_dir": repeat_tmpdir, + } - # test_accs, test_f1s = [], [] - # for i in range(self.config.repeats): - # logging.info(f"Start overall train loop : {i + 1}") + # Overwrite global stats + if max_val_acc > self.global_max_acc: + self.global_max_acc = max_val_acc + self.global_best_model_tmpdir = repeat_tmpdir + if max_val_f1 > self.global_max_f1: + self.global_max_f1 - # self._reset_params() - # test_acc, test_f1 = self._train_epoch(criterion, optimizer, train_dataloader, val_dataloader) - # test_accs.append(test_acc) - # test_f1s.append(test_f1) + return repeat_result - # logging.info(f"Test_acc: {test_acc}, Test_f1: {test_f1}") - # test_accs_avg = np.sum(test_accs) / self.config.repeats - # test_f1s_avg = np.sum(test_f1s) / self.config.repeats - # max_accs = np.max(test_accs) - # max_f1s = np.max(test_f1s) + # Save results for all repeat runs + # if self.config.save_results: + # pickle.dump(repeat_record, "results.pkl") + + # Evaluate test set + # config_path = self.global_best_model_tmpdir.joinpath('config.json') + # model_config = + # max_test_acc, max_test_f1 = self._evaluate_acc_f1(test_dataloader) # logging.info( # f""" @@ -209,7 +268,6 @@ def train(self): # Test f1 max: {max_f1s} # """ # ) - pass class SenticGCNBertTrainer(SenticGCNBaseTrainer): @@ -223,8 +281,13 @@ class SenticGCNBertTrainer(SenticGCNBaseTrainer): def __init__(self, config: SenticGCNTrainArgs): super().__init__(config) self.config = config + # Create tokenizer tokenizer = self._create_tokenizer() + # Create self.embed = self._create_embedding_model() + self.embed.to(self.device) + self.model = self._save_model() + self.model.to(self.device) data_gen = SenticGCNDatasetGenerator(config, tokenizer) self.train_data, self.val_data, self.test_data = data_gen.generate_datasets() del data_gen @@ -248,6 +311,23 @@ def _create_embedding_model(self) -> SenticGCNBertEmbeddingModel: config = SenticGCNBertEmbeddingConfig.from_pretrained(self.config.embedding_model) return SenticGCNBertEmbeddingModel.from_pretrained(self.config.embedding_model, config=config) + def _create_model(self) -> SenticGCNBertModel: + """ + Private helper method to create the SenticGCNBertModel instance. 
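+
+        Example (illustrative; mirrors the config fields used in the body below)::
+
+            model_config = SenticGCNBertConfig(hidden_dim=768, max_seq_len=85, polarities_dim=3, dropout=0.3)
+            model = SenticGCNBertModel(model_config)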
+ + Returns: + SenticGCNBertModel: return a SenticGCNBertModel based on SenticGCNBertConfig + """ + model_config = SenticGCNBertConfig( + hidden_dim=self.config.hidden_dim, + max_seq_len=self.config.max_len, + polarities_dim=self.config.polarities_dim, + dropout=self.config.dropout, + device=self.config.device, + loss_function=self.config.loss_function, + ) + return SenticGCNModel(model_config) + def _reset_params(self): """ Private helper method to reset model parameters. @@ -258,7 +338,7 @@ def _reset_params(self): for param in child.parameters(): if param.requires_grad: if len(param.shape) > 1: - self._create_initializers(param) + self.initializer(param) else: stdv = 1.0 / math.sqrt(param.shape[0]) nn.init.uniform_(param, a=-stdv, b=stdv) @@ -275,6 +355,33 @@ def _generate_data_loaders(self) -> Tuple[DataLoader, DataLoader, DataLoader]: test_dataloader = DataLoader(self.test_data, batch_size=self.config.batch_size, shuffle=False) return train_dataloader, val_dataloader, test_dataloader + def train(self): + # Generate data_loaders + train_dataloader, val_dataloader, test_dataloader = self._generate_data_loaders() + + # Run main train + repeat_result = self._train(train_dataloader, val_dataloader) + + # Recreate best model from all repeat loops + config_path = self.global_best_model_tmpdir.joinpath("config.json") + model_config = SenticGCNBertConfig.from_pretrained(config_path) + model_path = self.global_best_model_tmpdir.joinpath("pytorch_model.bin") + self.model = SenticGCNBertConfig.from_pretrained(model_path, config=model_config) + + # Evaluate test set + test_acc, test_f1 = self._evaluate_acc_f1(test_dataloader) + logging.info(f"Best Model - Test Acc: {test_acc:.4f} - Test F1: {test_f1:.4f}") + + repeat_result["test"] = {"max_val_acc": test_acc, "max_val_f1": test_f1} + + if self.config.save_results: + pickle.dump(repeat_result, "results.pkl") + + self._save_model() + self._clean_temp_dir(repeat_result) + + logging.info("Training Completed!") + class SenticGCNTrainer(SenticGCNBaseTrainer): """ @@ -287,8 +394,15 @@ class SenticGCNTrainer(SenticGCNBaseTrainer): def __init__(self, config: SenticGCNTrainArgs) -> None: super().__init__(config) self.config = config + # Create tokenizer tokenizer = self._create_tokenizer() + # Create embedding model self.embed = self._create_embedding_model(tokenizer.vocab) + self.embed.to(self.device) + # Create model + self.model = self._create_model() + self.model.to(self.device) + # Create dataset data_gen = SenticGCNDatasetGenerator(config, tokenizer) self.train_data, self.val_data, self.test_data = data_gen.generate_datasets() del data_gen @@ -335,6 +449,23 @@ def _create_embedding_model(self, vocab: dict[str, int]) -> SenticGCNEmbeddingMo embedding_model.save_pretrained(self.config.save_embedding_model_path) return embedding_model + def _create_model(self) -> SenticGCNModel: + """ + Private helper method to create the SenticGCNModel instance. + + Returns: + SenticGCNModel: return a SenticGCNModel based on SenticGCNConfig + """ + model_config = SenticGCNConfig( + embed_dim=self.config.embed_dim, + hidden_dim=self.config.hidden_dim, + polarities_dim=self.config.polarities_dim, + dropout=self.config.dropout, + device=self.config.device, + loss_function=self.config.loss_function, + ) + return SenticGCNModel(model_config) + def _reset_params(self) -> None: """ Private helper method to reset model parameters. 
@@ -343,7 +474,7 @@ def _reset_params(self) -> None: for param in self.modelparameters(): if param.requires_grad: if len(param.shape) > 1: - self._create_initializers(param) + self.initializer(param) else: stdv = 1.0 / math.sqrt(param.shape[0]) nn.init.uniform_(param, a=-stdv, b=stdv) @@ -360,30 +491,57 @@ def _generate_data_loaders(self) -> Tuple[BucketIterator, BucketIterator, Bucket test_dataloader = BucketIterator(self.test_data, batch_size=self.config.batch_size, shuffle=False) return train_dataloader, val_dataloader, test_dataloader + def train(self): + # Generate data_loaders + train_dataloader, val_dataloader, test_dataloader = self._generate_data_loaders() + + # Run main train + repeat_result = self._train(train_dataloader, val_dataloader) + + # Recreate best model from all repeat loops + config_path = self.global_best_model_tmpdir.joinpath("config.json") + model_config = SenticGCNConfig.from_pretrained(config_path) + model_path = self.global_best_model_tmpdir.joinpath("pytorch_model.bin") + self.model = SenticGCNConfig.from_pretrained(model_path, config=model_config) + + # Evaluate test set + test_acc, test_f1 = self._evaluate_acc_f1(test_dataloader) + logging.info(f"Best Model - Test Acc: {test_acc:.4f} - Test F1: {test_f1:.4f}") + + repeat_result["test"] = {"max_val_acc": test_acc, "max_val_f1": test_f1} + + if self.config.save_results: + pickle.dump(repeat_result, "results.pkl") + + self._save_model() + self._clean_temp_dir(repeat_result) + + logging.info("Training Completed!") + if __name__ == "__main__": # cfg = parse_args_and_load_config() args = { - "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", + "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/senticNet/senticnet_word.txt", "save_preprocessed_senticnet": True, - "saved_preprocessed_senticnet_file_path": "senticnet/senticnet.pickle", + "saved_preprocessed_senticnet_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/senticnet/senticnet.pickle", "spacy_pipeline": "en_core_web_sm", - "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", - "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", - "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", + "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/glove/glove.840B.300d.txt", + "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/datasets/semeval14/restaurant_train.raw", + "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/datasets/semeval14/restaurant_test.raw", "valset_ratio": 0, "model": "senticgcn", "save_best_model": True, - "save_model_path": "senticgcn", - "tokenizer": "senticgcn", + "save_model_path": "senticgcn_model", + "tokenizer": "senticgcn_tokenizer_temp", "train_tokenizer": False, "save_tokenizer": False, - "save_tokenizer_path": "senticgcn_tokenizer", - "embedding_model": "senticgcn_embed_model", + "save_tokenizer_path": "senticgcn_tokenizer_temp", + "embedding_model": "senticgcn_embed_model_temp", "build_embedding_model": False, "save_embedding_model": False, - "save_embedding_model_path": "senticgcn_embed_model", - "initializer": "xavier_uniform", + "save_embedding_model_path": "senticgcn_embed_model_temp", + "initializer": 
"xavier_uniform_", "optimizer": "adam", "loss_function": "cross_entropy", "learning_rate": 0.001, @@ -397,8 +555,8 @@ def _generate_data_loaders(self) -> Tuple[BucketIterator, BucketIterator, Bucket "dropout": 0.3, "save_results": True, "seed": 776, - "device": "cuda", - "repeats": 10, + "device": "cpu", + "repeats": 2, "patience": 5, "max_len": 85, } From 3bc476c252356c38cdc81706d69d3d3699e73407 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 6 Jan 2022 17:48:16 +0800 Subject: [PATCH 091/201] [#41] first completed training for SenticGCN --- sgnlp/models/sentic_gcn/train.py | 54 +++++++++++++------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py index d3d5a4b..4cd2cd8 100644 --- a/sgnlp/models/sentic_gcn/train.py +++ b/sgnlp/models/sentic_gcn/train.py @@ -121,6 +121,9 @@ def _evaluate_acc_f1(self, dataloader: DataLoader) -> Tuple[float, float]: t_targets_all, t_outputs_all = None, None with torch.no_grad(): for _, t_batch in enumerate(dataloader): + # Generate embedings + t_batch["text_embeddings"] = self.embed(t_batch["text_indices"]) + # Prepare input data and targets t_inputs = [t_batch[col] for col in self.config.data_cols] t_targets = t_batch["polarity"] @@ -129,15 +132,15 @@ def _evaluate_acc_f1(self, dataloader: DataLoader) -> Tuple[float, float]: t_outputs = self.model(t_inputs) # Calculate loss - n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item() - n_total += len(t_outputs) + n_correct += (torch.argmax(t_outputs.logits, -1) == t_targets).sum().item() + n_total += len(t_outputs.logits) if t_targets_all is None: t_targets_all = t_targets - t_outputs_all = t_outputs + t_outputs_all = t_outputs.logits else: t_targets_all = torch.cat((t_targets_all, t_targets), dim=0) - t_outputs_all = torch.cat((t_outputs_all, t_outputs), dim=0) + t_outputs_all = torch.cat((t_outputs_all, t_outputs.logits), dim=0) test_acc = n_correct / n_total f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro") return test_acc, f1 @@ -163,20 +166,23 @@ def _train_loop( global_step += 1 optimizer.zero_grad() + # Generate embeddings + batch["text_embeddings"] = self.embed(batch["text_indices"]) + # Prepare input data and targets - inputs = [batch[col] for col in self.config.data_cols] + inputs = [batch[col].to(self.device) for col in self.config.data_cols] targets = batch["polarity"] # Inference outputs = self.model(inputs) - loss = criterion(outputs, targets) + loss = criterion(outputs.logits, targets) loss.backward() optimizer.step() # Calculate loss - n_correct += (torch.argmax(outputs, -1) == targets).sum().item() - n_total += len(outputs) - loss_total += loss.item() * len(outputs) + n_correct += (torch.argmax(outputs.logits, -1) == targets).sum().item() + n_total += len(outputs.logits) + loss_total += loss.item() * len(outputs.logits) # Report batch loop step results if global_step % self.config.log_step == 0: @@ -251,24 +257,6 @@ def _train( return repeat_result - # Save results for all repeat runs - # if self.config.save_results: - # pickle.dump(repeat_record, "results.pkl") - - # Evaluate test set - # config_path = self.global_best_model_tmpdir.joinpath('config.json') - # model_config = - # max_test_acc, max_test_f1 = self._evaluate_acc_f1(test_dataloader) - - # logging.info( - # f""" - # Test acc average: {test_accs_avg} - # Test f1 average: {test_f1s_avg} - # Test acc max: {max_accs} - # Test f1 max: {max_f1s} - # """ - # ) - class 
SenticGCNBertTrainer(SenticGCNBaseTrainer): """ @@ -366,7 +354,7 @@ def train(self): config_path = self.global_best_model_tmpdir.joinpath("config.json") model_config = SenticGCNBertConfig.from_pretrained(config_path) model_path = self.global_best_model_tmpdir.joinpath("pytorch_model.bin") - self.model = SenticGCNBertConfig.from_pretrained(model_path, config=model_config) + self.model = SenticGCNBertModel.from_pretrained(model_path, config=model_config) # Evaluate test set test_acc, test_f1 = self._evaluate_acc_f1(test_dataloader) @@ -471,7 +459,7 @@ def _reset_params(self) -> None: Private helper method to reset model parameters. To be used during repeats train loop. """ - for param in self.modelparameters(): + for param in self.model.parameters(): if param.requires_grad: if len(param.shape) > 1: self.initializer(param) @@ -497,12 +485,13 @@ def train(self): # Run main train repeat_result = self._train(train_dataloader, val_dataloader) + logging.info(f"Best Train Acc: {self.global_max_acc} - Best Train F1: {self.global_max_f1}") # Recreate best model from all repeat loops config_path = self.global_best_model_tmpdir.joinpath("config.json") model_config = SenticGCNConfig.from_pretrained(config_path) model_path = self.global_best_model_tmpdir.joinpath("pytorch_model.bin") - self.model = SenticGCNConfig.from_pretrained(model_path, config=model_config) + self.model = SenticGCNModel.from_pretrained(model_path, config=model_config) # Evaluate test set test_acc, test_f1 = self._evaluate_acc_f1(test_dataloader) @@ -511,7 +500,8 @@ def train(self): repeat_result["test"] = {"max_val_acc": test_acc, "max_val_f1": test_f1} if self.config.save_results: - pickle.dump(repeat_result, "results.pkl") + with open("results.pkl", "wb") as f: + pickle.dump(repeat_result, f) self._save_model() self._clean_temp_dir(repeat_result) @@ -546,7 +536,7 @@ def train(self): "loss_function": "cross_entropy", "learning_rate": 0.001, "l2reg": 0.00001, - "epochs": 100, + "epochs": 2, "batch_size": 32, "log_step": 5, "embed_dim": 300, From 9bc6b4da513716c01e62662cae5fdd5a326807e0 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 6 Jan 2022 20:33:45 +0800 Subject: [PATCH 092/201] [#41] re-add embed_dim to config as requires by LSTM layer --- sgnlp/models/sentic_gcn/config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sgnlp/models/sentic_gcn/config.py b/sgnlp/models/sentic_gcn/config.py index cee9948..467b867 100644 --- a/sgnlp/models/sentic_gcn/config.py +++ b/sgnlp/models/sentic_gcn/config.py @@ -48,6 +48,7 @@ class SenticGCNBertConfig(PretrainedConfig): It is used to instantiate a SenticBertGCNModel network according to the specific arguments, defining the model architecture. Args: + embed_dim (:obj:`int`, defaults to 300): The input dimension for the LSTM layer hidden_dim (:obj:`int`, defaults to 768): The embedding dimension size for the Bert model as well as GCN dimension. max_seq_len (:obj:`int`, defaults to 85): The max sequence length to pad and truncate. dropout (:obj:`float`, defaults to 0.3): Dropout percentage. 
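# A hedged sketch of why embed_dim returns to this config: per the commit message,
# an LSTM layer consumes the word embeddings, so its input_size must equal the
# embedding width (values follow the documented defaults):
#
#     import torch.nn as nn
#     lstm = nn.LSTM(input_size=300, hidden_size=768, batch_first=True)  # embed_dim -> hidden_dim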
@@ -64,6 +65,7 @@ class SenticGCNBertConfig(PretrainedConfig): def __init__( self, + embed_dim: int = 300, hidden_dim: int = 768, max_seq_len: int = 85, polarities_dim: int = 3, @@ -73,6 +75,7 @@ def __init__( **kwargs ) -> None: super().__init__(**kwargs) + self.embed_dim = embed_dim self.hidden_dim = hidden_dim self.max_seq_len = max_seq_len self.dropout = dropout From 0f51144e9e6cbe7653fcb7d385031ee9d5c64e6b Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 6 Jan 2022 21:56:52 +0800 Subject: [PATCH 093/201] [#41] revert streamline to bert dataset generator --- sgnlp/models/sentic_gcn/train.py | 8 +++---- sgnlp/models/sentic_gcn/utils.py | 40 +++++++++++++++----------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py index 4cd2cd8..860cace 100644 --- a/sgnlp/models/sentic_gcn/train.py +++ b/sgnlp/models/sentic_gcn/train.py @@ -274,7 +274,7 @@ def __init__(self, config: SenticGCNTrainArgs): # Create self.embed = self._create_embedding_model() self.embed.to(self.device) - self.model = self._save_model() + self.model = self._create_model() self.model.to(self.device) data_gen = SenticGCNDatasetGenerator(config, tokenizer) self.train_data, self.val_data, self.test_data = data_gen.generate_datasets() @@ -520,14 +520,14 @@ def train(self): "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/datasets/semeval14/restaurant_train.raw", "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/datasets/semeval14/restaurant_test.raw", "valset_ratio": 0, - "model": "senticgcn", + "model": "senticgcnbert", "save_best_model": True, "save_model_path": "senticgcn_model", - "tokenizer": "senticgcn_tokenizer_temp", + "tokenizer": "bert-base-uncased", "train_tokenizer": False, "save_tokenizer": False, "save_tokenizer_path": "senticgcn_tokenizer_temp", - "embedding_model": "senticgcn_embed_model_temp", + "embedding_model": "bert-base-uncased", "build_embedding_model": False, "save_embedding_model": False, "save_embedding_model_path": "senticgcn_embed_model_temp", diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 1019788..c9a2adc 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -373,27 +373,27 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, Batc full_text_with_bert_tokens = f"[CLS] {full_text} [SEP] {aspect} [SEP]" # Process indices - text_indices = self.tokenizer(full_text, return_tensors="pt") - aspect_indices = self.tokenizer(aspect, return_tensors="pt") - left_indices = self.tokenizer(text_left, return_tensors="pt") + text_indices = self.tokenizer( + full_text, return_tensors=None, return_attention_mask=False, return_token_type_ids=False + ) + aspect_indices = self.tokenizer( + aspect, return_tensors=None, return_attention_mask=False, return_token_type_ids=False + ) + left_indices = self.tokenizer( + text_left, return_tensors=None, return_attention_mask=False, return_token_type_ids=False + ) polarity = int(polarity) + 1 - polarity = BatchEncoding({"input_ids": polarity}) - polarity = polarity.convert_to_tensors("pt") # Process bert related indices text_bert_indices = self.tokenizer( - full_text_with_bert_tokens, return_tensors="pt", add_special_tokens=True, return_token_type_ids=True + full_text_with_bert_tokens, return_tensors=None, add_special_tokens=False, return_token_type_ids=False ) - text_len = np.sum(text_indices["input_ids"].numpy() != 0) - aspect_len 
= np.sum(aspect_indices["input_ids"].numpy() != 0) + text_len = np.sum(text_indices["input_ids"] != 0) + aspect_len = np.sum(aspect_indices["input_ids"] != 0) # array of [0] for texts including [CLS] and [SEP] and [1] for aspect and ending [SEP] concat_segment_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1) concat_segment_indices = pad_and_truncate(concat_segment_indices, max_len) - concat_segment_indices = BatchEncoding({"input_ids": concat_segment_indices}) - concat_segment_indices = concat_segment_indices.convert_to_tensors("pt") - - # Process embeddings # Process graph graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) @@ -405,18 +405,16 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, Batc ), "constant", ) - sdat_graph = BatchEncoding({"input_ids": sdat_graph}) - sdat_graph = sdat_graph.convert_to_tensors("pt") all_data.append( { - "text_indices": text_indices.to(self.device), - "aspect_indices": aspect_indices.to(self.device), - "left_indices": left_indices.to(self.device), - "text_bert_indices": text_bert_indices.to(self.device), - "bert_segment_indices": concat_segment_indices.to(self.device), - "polarity": polarity.to(self.device), - "sdat_graph": sdat_graph.to(self.device), + "text_indices": text_indices["input_ids"], + "aspect_indices": aspect_indices["input_ids"], + "left_indices": left_indices["input_ids"], + "text_bert_indices": text_bert_indices["input_ids"], + "bert_segment_indices": concat_segment_indices, + "polarity": polarity, + "sdat_graph": sdat_graph, } ) return all_data From 4f1746218379decf8e0c27cf50c9d50f43ae304a Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 6 Jan 2022 22:00:18 +0800 Subject: [PATCH 094/201] [#41] fix type hints --- sgnlp/models/sentic_gcn/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index c9a2adc..267c8e2 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -352,7 +352,7 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, list]: ) return all_data - def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, BatchEncoding]: + def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, list]: """ Data preprocess method to generate all indices required for SenticGCNBert model training. @@ -360,7 +360,7 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, Batc raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. Returns: - Dict[str, BatchEncoding]: return a dictionary of dataset sub-type and their tensors. + Dict[str, list]: return a dictionary of dataset sub-type and their values. 
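+
+        Example (illustrative layout of the bert segment indices for text_len=4, aspect_len=2, max_len=12)::
+
+            segments = [0] * (4 + 2) + [1] * (2 + 1)   # [CLS] + text + [SEP] -> 0s, aspect + [SEP] -> 1s
+            # padded to max_len: [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0]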
""" all_data = [] max_len = self.config.max_len From bbd0e8fe7f17a23a01b47f9c079ba7c7ce2d1b39 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 7 Jan 2022 11:31:57 +0800 Subject: [PATCH 095/201] [#41] complete adaptation for bert based SenticGCN --- .../config/senticnet_gcn_bert_config.json | 38 +++++++++ sgnlp/models/sentic_gcn/data_class.py | 1 - sgnlp/models/sentic_gcn/modeling.py | 27 +++---- sgnlp/models/sentic_gcn/tokenization.py | 8 ++ sgnlp/models/sentic_gcn/train.py | 78 ++++++------------- sgnlp/models/sentic_gcn/utils.py | 57 +++++++++++--- 6 files changed, 130 insertions(+), 79 deletions(-) create mode 100644 sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json new file mode 100644 index 0000000..94e8d1d --- /dev/null +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json @@ -0,0 +1,38 @@ +{ + "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", + "save_preprocessed_senticnet": true, + "saved_preprocessed_senticnet_file_path": "senticnet/senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", + + "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", + "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", + "valset_ratio": 0, + + "model": "senticgcnbert", + "save_best_model": true, + "save_model_path": "senticgcnbert", + + "tokenizer": "bert-base-uncased", + + "embedding_model": "bert-base-uncased", + + "initializer": "xavier_uniform", + "optimizer": "adam", + "loss_function": "cross_entropy", + "learning_rate": 0.001, + "l2reg": 0.00001, + "epochs": 100, + "batch_size": 32, + "log_step": 5, + "embed_dim": 300, + "hidden_dim": 768, + "polarities_dim": 3, + "dropout": 0.3, + "save_results": true, + "seed": 776, + "device": "cuda", + "repeats": 10, + "patience": 5, + "max_len": 85 +} \ No newline at end of file diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index e3794e5..93ab12e 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -188,7 +188,6 @@ def __post_init__(self): "text_indices", "aspect_indices", "left_indices", - "text_bert_indices", "text_embeddings", "sdat_graph", ] diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py index 3603ccb..1b21e22 100644 --- a/sgnlp/models/sentic_gcn/modeling.py +++ b/sgnlp/models/sentic_gcn/modeling.py @@ -82,7 +82,9 @@ def __init__(self, config: SenticGCNConfig) -> None: if config.loss_function == "cross_entropy": self.loss_function = nn.CrossEntropyLoss() - def position_weight(self, x, aspect_double_idx, text_len, aspect_len): + def position_weight( + self, x: torch.Tensor, aspect_double_idx: torch.Tensor, text_len: torch.Tensor, aspect_len: torch.Tensor + ) -> torch.Tensor: batch_size, seq_len = x.shape[0], x.shape[1] aspect_double_idx = aspect_double_idx.cpu().numpy() text_len = text_len.cpu().numpy() @@ -101,7 +103,7 @@ def position_weight(self, x, aspect_double_idx, text_len, aspect_len): weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2).to(self.torch_device) return weight * x - def mask(self, x, aspect_double_idx): 
+    def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor:
         batch_size, seq_len = x.shape[0], x.shape[1]
         aspect_double_idx = aspect_double_idx.cpu().numpy()
         mask = [[] for i in range(batch_size)]
@@ -115,7 +117,7 @@ def mask(self, x, aspect_double_idx):
         mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(self.torch_device)
         return mask * x
 
-    def forward(self, inputs: dict[str, torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNModelOutput:
+    def forward(self, inputs: list[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNModelOutput:
         text_indices, aspect_indices, left_indices, text_embeddings, adj = inputs
         text_len = torch.sum(text_indices != 0, dim=-1)
         aspect_len = torch.sum(aspect_indices != 0, dim=-1)
@@ -188,7 +190,7 @@ class SenticGCNBertModel(SenticGCNBertPreTrainedModel):
     """
 
     def __init__(self, config: SenticGCNBertConfig) -> None:
-        super().__init__()
+        super().__init__(config)
         self.gc1 = GraphConvolution(config.hidden_dim, config.hidden_dim)
         self.gc2 = GraphConvolution(config.hidden_dim, config.hidden_dim)
         self.gc3 = GraphConvolution(config.hidden_dim, config.hidden_dim)
@@ -198,7 +200,9 @@ def __init__(self, config: SenticGCNBertConfig) -> None:
         self.max_seq_len = config.max_seq_len
         self.loss_function = config.loss_function
 
-    def position_weight(self, x, aspect_double_idx, text_len, aspect_len):
+    def position_weight(
+        self, x: torch.Tensor, aspect_double_idx: torch.Tensor, text_len: torch.Tensor, aspect_len: torch.Tensor
+    ) -> torch.Tensor:
         batch_size, seq_len = x.shape[0], x.shape[1]
         aspect_double_idx = aspect_double_idx.cpu().numpy()
         text_len = text_len.cpu().numpy()
@@ -217,7 +221,7 @@ def position_weight(self, x, aspect_double_idx, text_len, aspect_len):
         weight = torch.tensor(weight).unsqueeze(2).to(self.torch_device)
         return weight * x
 
-    def mask(self, x, aspect_double_idx):
+    def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor:
         batch_size, seq_len = x.shape[0], x.shape[1]
         aspect_double_idx = aspect_double_idx.cpu().numpy()
         mask = [[] for i in range(batch_size)]
@@ -231,18 +235,15 @@ def mask(self, x, aspect_double_idx):
         mask = torch.tensor(mask).unsqueeze(2).float().to(self.torch_device)
         return mask * x
 
-    def forward(self, inputs, labels: torch.Tensor):
-        text_bert_indices, text_indices, aspect_indices, bert_segments_ids, left_indices, adj = inputs
+    def forward(self, inputs: list[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNBertModelOutput:
+        text_indices, aspect_indices, left_indices, text_embeddings, adj = inputs
         text_len = torch.sum(text_indices != 0, dim=-1)
         aspect_len = torch.sum(aspect_indices != 0, dim=-1)
         left_len = torch.sum(left_indices != 0, dim=-1)
         aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1)
-        # TODO: How to embed in the preprocessor?
- encoder_layer, _ = self.bert( - text_bert_indices, token_type_ids=bert_segments_ids, output_all_encoded_layers=False - ) - text_out = encoder_layer + + text_out = text_embeddings x = F.relu(self.gc1(self.position_weight(text_out, aspect_double_idx, text_len, aspect_len), adj)) x = F.relu(self.gc2(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj)) x = F.relu(self.gc3(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj)) diff --git a/sgnlp/models/sentic_gcn/tokenization.py b/sgnlp/models/sentic_gcn/tokenization.py index 0bf0c0d..c69bf66 100644 --- a/sgnlp/models/sentic_gcn/tokenization.py +++ b/sgnlp/models/sentic_gcn/tokenization.py @@ -2,6 +2,8 @@ import pickle from typing import Dict, List, Optional, Tuple +import torch + from transformers import PreTrainedTokenizer, BertTokenizer @@ -153,4 +155,10 @@ def __call__( return_tensors=return_tensors, **kwargs, ) + # Workaround for padding empty input text + for key in encoding.keys(): + if len(encoding[key]) == 0 and padding == "max_length": + encoding[key] = [0] * max_length + if return_tensors == "pt": + encoding[key] = torch.tensor(encoding[key]) return encoding diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py index 860cace..8eb72ae 100644 --- a/sgnlp/models/sentic_gcn/train.py +++ b/sgnlp/models/sentic_gcn/train.py @@ -87,6 +87,9 @@ def _create_tokenizer(self) -> Union[SenticGCNTokenizer, SenticGCNBertTokenizer] def _create_embedding_model(self) -> Union[SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]: raise NotImplementedError("Please call from derived class only.") + def _generate_embeddings(self, batch: list[torch.Tensor]) -> torch.Tensor: + raise NotImplementedError("Please call from derived class only") + def _save_model(self) -> None: """ Private helper method to save the pretrained model. 
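
The _generate_embeddings stub added in the hunk above is the hook of a template-method pattern: the base trainer owns the shared train/eval loops, while each derived trainer decides how a raw batch becomes embeddings. A minimal standalone sketch of the idea (class names and the dict-style batch are illustrative, not the sgnlp API; it uses abc instead of a bare NotImplementedError):

    from abc import ABC, abstractmethod

    import torch
    import torch.nn as nn


    class BaseTrainer(ABC):
        @abstractmethod
        def _generate_embeddings(self, batch: dict) -> torch.Tensor:
            """Derived classes decide how raw indices become embeddings."""

        def evaluate_step(self, batch: dict) -> torch.Tensor:
            # The shared loop only ever calls the hook, never a concrete embedder.
            return self._generate_embeddings(batch)


    class GloveLikeTrainer(BaseTrainer):
        def __init__(self, embed: nn.Embedding) -> None:
            self.embed = embed

        def _generate_embeddings(self, batch: dict) -> torch.Tensor:
            return self.embed(batch["text_indices"])


    trainer = GloveLikeTrainer(nn.Embedding(num_embeddings=100, embedding_dim=8))
    out = trainer.evaluate_step({"text_indices": torch.tensor([[1, 2, 3]])})
    print(out.shape)  # torch.Size([1, 3, 8])
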
@@ -122,11 +125,11 @@ def _evaluate_acc_f1(self, dataloader: DataLoader) -> Tuple[float, float]:
         with torch.no_grad():
             for _, t_batch in enumerate(dataloader):
                 # Generate embeddings
-                t_batch["text_embeddings"] = self.embed(t_batch["text_indices"])
+                t_batch["text_embeddings"] = self._generate_embeddings(t_batch)
 
                 # Prepare input data and targets
-                t_inputs = [t_batch[col] for col in self.config.data_cols]
-                t_targets = t_batch["polarity"]
+                t_inputs = [t_batch[col].to(self.device) for col in self.config.data_cols]
+                t_targets = t_batch["polarity"].to(self.device)
 
                 # Inference
                 t_outputs = self.model(t_inputs)
@@ -156,10 +159,9 @@ def _train_loop(
         max_val_acc, max_val_f1 = 0, 0
         max_val_epoch = 0
         global_step = 0
-        path = None
 
         for epoch in range(self.config.epochs):
-            logging.info(f"Training epoch: {epoch + 1}")
+            logging.info(f"Training epoch: {epoch}")
             n_correct, n_total, loss_total = 0, 0, 0
             self.model.train()
             for _, batch in enumerate(train_dataloader):
@@ -167,11 +169,11 @@
                 optimizer.zero_grad()
 
                 # Generate embeddings
-                batch["text_embeddings"] = self.embed(batch["text_indices"])
+                batch["text_embeddings"] = self._generate_embeddings(batch)
 
                 # Prepare input data and targets
                 inputs = [batch[col].to(self.device) for col in self.config.data_cols]
-                targets = batch["polarity"]
+                targets = batch["polarity"].to(self.device)
 
                 # Inference
                 outputs = self.model(inputs)
@@ -271,14 +273,16 @@ def __init__(self, config: SenticGCNTrainArgs):
         self.config = config
         # Create tokenizer
         tokenizer = self._create_tokenizer()
-        # Create
+        # Create embedding model
         self.embed = self._create_embedding_model()
         self.embed.to(self.device)
+        # Create model
         self.model = self._create_model()
         self.model.to(self.device)
+        # Create dataset
         data_gen = SenticGCNDatasetGenerator(config, tokenizer)
         self.train_data, self.val_data, self.test_data = data_gen.generate_datasets()
-        del data_gen
+        del data_gen  # delete unused dataset generator to free memory
 
     def _create_tokenizer(self) -> SenticGCNBertTokenizer:
         """
@@ -314,7 +318,7 @@ def _create_model(self) -> SenticGCNBertModel:
             device=self.config.device,
             loss_function=self.config.loss_function,
         )
-        return SenticGCNModel(model_config)
+        return SenticGCNBertModel(model_config)
 
     def _reset_params(self):
         """
@@ -343,6 +347,11 @@ def _generate_data_loaders(self) -> Tuple[DataLoader, DataLoader, DataLoader]:
         test_dataloader = DataLoader(self.test_data, batch_size=self.config.batch_size, shuffle=False)
         return train_dataloader, val_dataloader, test_dataloader
 
+    def _generate_embeddings(self, batch: list[torch.Tensor]) -> torch.Tensor:
+        return self.embed(batch["text_bert_indices"], token_type_ids=batch["bert_segment_indices"])[
+            "last_hidden_state"
+        ]
+
     def train(self):
         # Generate data_loaders
         train_dataloader, val_dataloader, test_dataloader = self._generate_data_loaders()
@@ -393,7 +402,7 @@ def __init__(self, config: SenticGCNTrainArgs) -> None:
         # Create dataset
         data_gen = SenticGCNDatasetGenerator(config, tokenizer)
         self.train_data, self.val_data, self.test_data = data_gen.generate_datasets()
-        del data_gen
+        del data_gen  # delete unused dataset generator to free memory
 
     def _create_tokenizer(self) -> SenticGCNTokenizer:
         """
@@ -479,6 +488,9 @@ def _generate_data_loaders(self) -> Tuple[BucketIterator, BucketIterator, Bucket
         test_dataloader = BucketIterator(self.test_data, batch_size=self.config.batch_size, shuffle=False)
         return train_dataloader, val_dataloader, test_dataloader
 
+    def _generate_embeddings(self, batch: list[torch.Tensor]) -> torch.Tensor:
+        return 
self.embed(batch["text_indices"]) + def train(self): # Generate data_loaders train_dataloader, val_dataloader, test_dataloader = self._generate_data_loaders() @@ -510,49 +522,7 @@ def train(self): if __name__ == "__main__": - # cfg = parse_args_and_load_config() - args = { - "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/senticNet/senticnet_word.txt", - "save_preprocessed_senticnet": True, - "saved_preprocessed_senticnet_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/senticnet/senticnet.pickle", - "spacy_pipeline": "en_core_web_sm", - "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/glove/glove.840B.300d.txt", - "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/datasets/semeval14/restaurant_train.raw", - "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_gcn/datasets/semeval14/restaurant_test.raw", - "valset_ratio": 0, - "model": "senticgcnbert", - "save_best_model": True, - "save_model_path": "senticgcn_model", - "tokenizer": "bert-base-uncased", - "train_tokenizer": False, - "save_tokenizer": False, - "save_tokenizer_path": "senticgcn_tokenizer_temp", - "embedding_model": "bert-base-uncased", - "build_embedding_model": False, - "save_embedding_model": False, - "save_embedding_model_path": "senticgcn_embed_model_temp", - "initializer": "xavier_uniform_", - "optimizer": "adam", - "loss_function": "cross_entropy", - "learning_rate": 0.001, - "l2reg": 0.00001, - "epochs": 2, - "batch_size": 32, - "log_step": 5, - "embed_dim": 300, - "hidden_dim": 300, - "polarities_dim": 3, - "dropout": 0.3, - "save_results": True, - "seed": 776, - "device": "cpu", - "repeats": 2, - "patience": 5, - "max_len": 85, - } - from data_class import SenticGCNTrainArgs - - cfg = SenticGCNTrainArgs(**args) + cfg = parse_args_and_load_config() if cfg.seed is not None: set_random_seed(cfg.seed) trainer = SenticGCNTrainer(cfg) if cfg.model == "senticgcn" else SenticGCNBertTrainer(cfg) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 267c8e2..8945917 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -374,19 +374,47 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, list # Process indices text_indices = self.tokenizer( - full_text, return_tensors=None, return_attention_mask=False, return_token_type_ids=False + full_text, + max_length=max_len, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, ) aspect_indices = self.tokenizer( - aspect, return_tensors=None, return_attention_mask=False, return_token_type_ids=False + aspect, + max_length=max_len, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, ) left_indices = self.tokenizer( - text_left, return_tensors=None, return_attention_mask=False, return_token_type_ids=False + text_left, + max_length=max_len, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, ) polarity = int(polarity) + 1 # Process bert related indices text_bert_indices = self.tokenizer( - full_text_with_bert_tokens, return_tensors=None, add_special_tokens=False, return_token_type_ids=False + full_text_with_bert_tokens, + 
max_length=max_len, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, ) text_len = np.sum(text_indices["input_ids"] != 0) aspect_len = np.sum(aspect_indices["input_ids"] != 0) @@ -406,15 +434,22 @@ def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, list "constant", ) + assert len(text_indices["input_ids"]) == max_len + assert len(aspect_indices["input_ids"]) == max_len + assert len(left_indices["input_ids"]) == max_len + assert len(text_bert_indices["input_ids"]) == max_len + assert len(concat_segment_indices) == max_len + assert len(sdat_graph) == max_len + all_data.append( { - "text_indices": text_indices["input_ids"], - "aspect_indices": aspect_indices["input_ids"], - "left_indices": left_indices["input_ids"], - "text_bert_indices": text_bert_indices["input_ids"], - "bert_segment_indices": concat_segment_indices, - "polarity": polarity, - "sdat_graph": sdat_graph, + "text_indices": torch.tensor(text_indices["input_ids"]), + "aspect_indices": torch.tensor(aspect_indices["input_ids"]), + "left_indices": torch.tensor(left_indices["input_ids"]), + "text_bert_indices": torch.tensor(text_bert_indices["input_ids"]), + "bert_segment_indices": torch.tensor(concat_segment_indices), + "polarity": torch.tensor(polarity), + "sdat_graph": torch.tensor(sdat_graph), } ) return all_data From 60157647e5399e7552c341991eb21b4d475cefd7 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 7 Jan 2022 11:58:37 +0800 Subject: [PATCH 096/201] [#41] move save results method to base class --- .../config/senticnet_gcn_bert_config.json | 4 +++- .../config/senticnet_gcn_config.json | 4 +++- sgnlp/models/sentic_gcn/data_class.py | 4 ++++ sgnlp/models/sentic_gcn/train.py | 20 ++++++++++++------- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json index 94e8d1d..b353f9c 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json @@ -17,6 +17,9 @@ "embedding_model": "bert-base-uncased", + "save_results": true, + "save_results_folder": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/results/", + "initializer": "xavier_uniform", "optimizer": "adam", "loss_function": "cross_entropy", @@ -29,7 +32,6 @@ "hidden_dim": 768, "polarities_dim": 3, "dropout": 0.3, - "save_results": true, "seed": 776, "device": "cuda", "repeats": 10, diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json index 1d7a83f..20cc369 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json @@ -23,6 +23,9 @@ "save_embedding_model": false, "save_embedding_model_path": "senticgcn_embed_model", + "save_results": true, + "save_results_folder": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/results/", + "initializer": "xavier_uniform", "optimizer": "adam", "loss_function": "cross_entropy", @@ -35,7 +38,6 @@ "hidden_dim": 300, "polarities_dim": 3, "dropout": 0.3, - "save_results": true, "seed": 776, "device": "cuda", "repeats": 10, diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index 93ab12e..5e5aaac 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ 
b/sgnlp/models/sentic_gcn/data_class.py
@@ -135,6 +135,10 @@ class SenticGCNTrainArgs:
         },
     )
 
+    # Training results
+    save_results: bool = field(default=True, metadata={"help": "Flag to indicate if results should be saved."})
+    save_results_folder: str = field(default="results", metadata={"help": "Folder location to save results pickle."})
+
     initializer: str = field(default="xavier_uniform", metadata={"help": "Type of initializer to use."})
     optimizer: str = field(default="adam", metadata={"help": "Type of optimizer to use."})
     loss_function: str = field(default="cross_entropy", metadata={"help": "Loss function for training/eval."})
diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py
index 8eb72ae..394650b 100644
--- a/sgnlp/models/sentic_gcn/train.py
+++ b/sgnlp/models/sentic_gcn/train.py
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import math
 import pathlib
@@ -97,6 +98,16 @@ def _save_model(self) -> None:
         if self.config.save_best_model:
             self.model.save_pretrained(self.config.save_model_path)
 
+    def _save_results(self, repeat_results: dict[str, dict]) -> None:
+        if self.config.save_results:
+            save_root_folder = pathlib.Path(self.config.save_results_folder)
+            save_root_folder.mkdir(exist_ok=True)
+            save_result_file = save_root_folder.joinpath(
+                f"{self.config.model}_{datetime.datetime.now().strftime('%d-%m-%y_%H-%M-%S')}_results.pkl"
+            )
+            with open(save_result_file, "wb") as f:
+                pickle.dump(repeat_results, f)
+
     def _clean_temp_dir(self, result_records: dict[str, dict[str, float]]) -> None:
         """
         Helper method to clean up temp dir and model weights from repeat train loops.
@@ -371,9 +382,7 @@ def train(self):
 
         repeat_result["test"] = {"max_val_acc": test_acc, "max_val_f1": test_f1}
 
-        if self.config.save_results:
-            pickle.dump(repeat_result, "results.pkl")
-
+        self._save_results()
         self._save_model()
         self._clean_temp_dir(repeat_result)
 
@@ -511,10 +520,7 @@ def train(self):
 
         repeat_result["test"] = {"max_val_acc": test_acc, "max_val_f1": test_f1}
 
-        if self.config.save_results:
-            with open("results.pkl", "wb") as f:
-                pickle.dump(repeat_result, f)
-
+        self._save_results()
        self._save_model()
         self._clean_temp_dir(repeat_result)
 
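
A note on the helper introduced above: pickle.dump writes to an open file object, not to a file name, which is why the removed pickle.dump(repeat_result, "results.pkl") call could never have worked. The pattern boils down to the following standalone sketch (the function name and sample metrics are illustrative, not part of the sgnlp API):

    import datetime
    import pathlib
    import pickle


    def save_results(results: dict, folder: str = "results", model_name: str = "senticgcn") -> pathlib.Path:
        """Persist a results dictionary to a timestamped pickle inside `folder`."""
        root = pathlib.Path(folder)
        root.mkdir(exist_ok=True)  # create the results folder on first use
        out_file = root.joinpath(
            f"{model_name}_{datetime.datetime.now().strftime('%d-%m-%y_%H-%M-%S')}_results.pkl"
        )
        with open(out_file, "wb") as f:
            pickle.dump(results, f)  # dump takes a file object opened in binary mode
        return out_file


    print(save_results({"repeat_1": {"max_val_acc": 0.81, "max_val_f1": 0.72}}))
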
From d6d3bca1b171737098fe58fe849b45a6f68154bc Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Fri, 7 Jan 2022 13:56:05 +0800
Subject: [PATCH 097/201] [#41] fix save_results missing input arguments

---
 sgnlp/models/sentic_gcn/train.py | 55 ++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py
index 394650b..a76af97 100644
--- a/sgnlp/models/sentic_gcn/train.py
+++ b/sgnlp/models/sentic_gcn/train.py
@@ -99,6 +99,12 @@ def _save_model(self) -> None:
         self.model.save_pretrained(self.config.save_model_path)
 
     def _save_results(self, repeat_results: dict[str, dict]) -> None:
+        """
+        Private helper method to save the results dictionary at the end of the training.
+
+        Args:
+            repeat_results (dict[str, dict]): dictionary containing the training results
+        """
         if self.config.save_results:
             save_root_folder = pathlib.Path(self.config.save_results_folder)
             save_root_folder.mkdir(exist_ok=True)
@@ -167,6 +173,9 @@ def _train_loop(
         val_dataloader: DataLoader,
         tmpdir: pathlib.Path,
     ) -> pathlib.Path:
+        """
+        Method to execute a single train repeat.
+        """
         max_val_acc, max_val_f1 = 0, 0
         max_val_epoch = 0
         global_step = 0
@@ -239,6 +248,16 @@ def _train_loop(
     def _train(
         self, train_dataloader: Union[DataLoader, BucketIterator], val_dataloader: Union[DataLoader, BucketIterator]
     ) -> dict[str, dict[str, Union[int, float]]]:
+        """
+        Method to execute a repeat train loop. Repeat amount is dependent on config.
+
+        Args:
+            train_dataloader (Union[DataLoader, BucketIterator]): dataloader for train dataset
+            val_dataloader (Union[DataLoader, BucketIterator]): dataloader for validation dataset
+
+        Returns:
+            dict[str, dict[str, Union[int, float]]]: return a dictionary containing the train results.
+        """
         criterion = nn.CrossEntropyLoss()
         _params = filter(lambda p: p.requires_grad, self.model.parameters())
         optimizer = self._create_optimizer(_params, lr=self.config.learning_rate, weight_decay=self.config.l2reg)
@@ -279,7 +298,7 @@ class SenticGCNBertTrainer(SenticGCNBaseTrainer):
         config (SenticGCNTrainArgs): Training config for SenticGCNBertModel
     """
 
-    def __init__(self, config: SenticGCNTrainArgs):
+    def __init__(self, config: SenticGCNTrainArgs) -> None:
         super().__init__(config)
         self.config = config
         # Create tokenizer
@@ -331,7 +350,7 @@ def _create_model(self) -> SenticGCNBertModel:
         )
         return SenticGCNBertModel(model_config)
 
-    def _reset_params(self):
+    def _reset_params(self) -> None:
         """
         Private helper method to reset model parameters.
         To be used during repeats train loop.
@@ -359,11 +378,23 @@ def _generate_data_loaders(self) -> Tuple[DataLoader, DataLoader, DataLoader]:
         return train_dataloader, val_dataloader, test_dataloader
 
     def _generate_embeddings(self, batch: list[torch.Tensor]) -> torch.Tensor:
+        """
+        Private helper method to generate embeddings.
+ + Args: + batch (list[torch.Tensor]): a batch of sub dataset + + Returns: + torch.Tensor: return embedding tensor + """ return self.embed(batch["text_indices"]) - def train(self): + def train(self) -> None: + """ + Main train method + """ # Generate data_loaders train_dataloader, val_dataloader, test_dataloader = self._generate_data_loaders() @@ -520,7 +563,7 @@ def train(self): repeat_result["test"] = {"max_val_acc": test_acc, "max_val_f1": test_f1} - self._save_results() + self._save_results(repeat_result) self._save_model() self._clean_temp_dir(repeat_result) From 9ed72717c65f7d66312549b799457e5e56187ca3 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 7 Jan 2022 14:35:17 +0800 Subject: [PATCH 098/201] [#41] convert dataset input to take in list of files --- .../config/senticnet_gcn_bert_config.json | 12 +++---- .../config/senticnet_gcn_config.json | 34 +++++++++---------- sgnlp/models/sentic_gcn/data_class.py | 12 +++---- sgnlp/models/sentic_gcn/utils.py | 11 +++--- 4 files changed, 36 insertions(+), 33 deletions(-) diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json index b353f9c..52e5d60 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json @@ -1,12 +1,12 @@ { - "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", + "senticnet_word_file_path": "./senticNet/senticnet_word.txt", "save_preprocessed_senticnet": true, "saved_preprocessed_senticnet_file_path": "senticnet/senticnet.pickle", "spacy_pipeline": "en_core_web_sm", - "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", + "word_vec_file_path": "./glove/glove.840B.300d.txt", - "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", - "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", + "dataset_train": ["./datasets/semeval14/restaurant_train.raw"], + "dataset_test": ["./datasets/semeval14/restaurant_test.raw"], "valset_ratio": 0, "model": "senticgcnbert", @@ -18,9 +18,9 @@ "embedding_model": "bert-base-uncased", "save_results": true, - "save_results_folder": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/results/", + "save_results_folder": "./results/", - "initializer": "xavier_uniform", + "initializer": "xavier_uniform_", "optimizer": "adam", "loss_function": "cross_entropy", "learning_rate": 0.001, diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json index 20cc369..c2e4169 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json @@ -1,37 +1,37 @@ { - "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", + "senticnet_word_file_path": "./senticnet-5.0/senticnet5.txt", "save_preprocessed_senticnet": true, - "saved_preprocessed_senticnet_file_path": "senticnet/senticnet.pickle", + "saved_preprocessed_senticnet_file_path": "senticnet-5.0/senticnet.pickle", "spacy_pipeline": "en_core_web_sm", - "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", + "word_vec_file_path": 
"./glove/glove.840B.300d.txt", - "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", - "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", + "dataset_train": ["./datasets/semeval14/restaurant_train.raw", "./datasets/semeval15/restaurant_train.raw"], + "dataset_test": ["./datasets/semeval14/restaurant_test.raw", "./datasets/semeval15/restaurant_test.raw"], "valset_ratio": 0, "model": "senticgcn", "save_best_model": true, - "save_model_path": "senticgcn", + "save_model_path": "senticgcn_temp", "tokenizer": "senticgcn", - "train_tokenizer": false, - "save_tokenizer": false, - "save_tokenizer_path": "senticgcn_tokenizer", + "train_tokenizer": true, + "save_tokenizer": true, + "save_tokenizer_path": "senticgcn_tokenizer_temp", "embedding_model": "senticgcn_embed_model", - "build_embedding_model": false, - "save_embedding_model": false, - "save_embedding_model_path": "senticgcn_embed_model", + "build_embedding_model": true, + "save_embedding_model": true, + "save_embedding_model_path": "senticgcn_embed_model_temp", "save_results": true, - "save_results_folder": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/results/", + "save_results_folder": "./results/", - "initializer": "xavier_uniform", + "initializer": "xavier_uniform_", "optimizer": "adam", "loss_function": "cross_entropy", "learning_rate": 0.001, "l2reg": 0.00001, - "epochs": 100, + "epochs": 2, "batch_size": 32, "log_step": 5, "embed_dim": 300, @@ -39,8 +39,8 @@ "polarities_dim": 3, "dropout": 0.3, "seed": 776, - "device": "cuda", - "repeats": 10, + "device": "cpu", + "repeats": 2, "patience": 5, "max_len": 85 } \ No newline at end of file diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index 5e5aaac..fae37ff 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -36,13 +36,13 @@ class SenticGCNTrainArgs: ) # Dataset specific config - dataset_train: str = field( - default="train.raw", - metadata={"help": "File path to train dataset."}, + dataset_train: list = field( + default_factory=list, + metadata={"help": "List of file path to train dataset(s)."}, ) - dataset_test: str = field( - default="test.raw", - metadata={"help": "File path to test dataset."}, + dataset_test: list = field( + default_factory=list, + metadata={"help": "List of file path to test dataset(s)."}, ) valset_ratio: float = field( default=0.0, diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 8945917..c5d81af 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -297,10 +297,13 @@ def _read_raw_dataset(self, dataset_type: str) -> list[str]: Returns: list[str]: list of str consisting of the full text, aspect and polarity index. 
""" - file_path = self.config.dataset_train if dataset_type == "train" else self.config.dataset_test - with open(file_path, "r", encoding="utf-8", newline="\n", errors="ignore") as f: - lines = f.readlines() - return lines + files_path = self.config.dataset_train if dataset_type == "train" else self.config.dataset_test + all_lines = [] + for dataset_file in files_path: + with open(dataset_file, "r", encoding="utf-8", newline="\n", errors="ignore") as f: + lines = f.readlines() + all_lines = all_lines + lines + return all_lines def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, list]: """ From 26d5486c12852c7639e001d572700435b06f5708 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 7 Jan 2022 15:26:58 +0800 Subject: [PATCH 099/201] [#41] clean up config and set to defaults --- .../config/senticnet_gcn_bert_config.json | 2 +- .../config/senticnet_gcn_config.json | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json index 52e5d60..7373998 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json @@ -26,7 +26,7 @@ "learning_rate": 0.001, "l2reg": 0.00001, "epochs": 100, - "batch_size": 32, + "batch_size": 16, "log_step": 5, "embed_dim": 300, "hidden_dim": 768, diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json index c2e4169..4c8393d 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json @@ -1,27 +1,27 @@ { "senticnet_word_file_path": "./senticnet-5.0/senticnet5.txt", "save_preprocessed_senticnet": true, - "saved_preprocessed_senticnet_file_path": "senticnet-5.0/senticnet.pickle", + "saved_preprocessed_senticnet_file_path": "./senticnet-5.0/senticnet.pickle", "spacy_pipeline": "en_core_web_sm", "word_vec_file_path": "./glove/glove.840B.300d.txt", - "dataset_train": ["./datasets/semeval14/restaurant_train.raw", "./datasets/semeval15/restaurant_train.raw"], - "dataset_test": ["./datasets/semeval14/restaurant_test.raw", "./datasets/semeval15/restaurant_test.raw"], + "dataset_train": ["./datasets/semeval14/restaurant_train.raw"], + "dataset_test": ["./datasets/semeval14/restaurant_test.raw"], "valset_ratio": 0, "model": "senticgcn", "save_best_model": true, - "save_model_path": "senticgcn_temp", + "save_model_path": "./models/senticgcn_semeval14_rest/", "tokenizer": "senticgcn", "train_tokenizer": true, "save_tokenizer": true, - "save_tokenizer_path": "senticgcn_tokenizer_temp", + "save_tokenizer_path": "./tokenizers/senticgcn_tok_semeval14_rest/", "embedding_model": "senticgcn_embed_model", "build_embedding_model": true, "save_embedding_model": true, - "save_embedding_model_path": "senticgcn_embed_model_temp", + "save_embedding_model_path": "./embed_models/senticgcn_embed_semeval14_rest/", "save_results": true, "save_results_folder": "./results/", @@ -31,16 +31,16 @@ "loss_function": "cross_entropy", "learning_rate": 0.001, "l2reg": 0.00001, - "epochs": 2, - "batch_size": 32, + "epochs": 100, + "batch_size": 16, "log_step": 5, "embed_dim": 300, "hidden_dim": 300, "polarities_dim": 3, "dropout": 0.3, "seed": 776, - "device": "cpu", - "repeats": 2, + "device": "cuda", + "repeats": 10, "patience": 5, "max_len": 85 } \ No newline at end of file From 
From cd5f4652e21c73a69309270353b6b9615551b122 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Fri, 7 Jan 2022 15:48:42 +0800
Subject: [PATCH 100/201] [#41] merge train and test list for vocab training

---
 sgnlp/models/sentic_gcn/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py
index a76af97..c1a2425 100644
--- a/sgnlp/models/sentic_gcn/train.py
+++ b/sgnlp/models/sentic_gcn/train.py
@@ -456,7 +456,7 @@ def _create_tokenizer(self) -> SenticGCNTokenizer:
             return SenticGCNTokenizer.from_pretrained(self.config.tokenizer)
         else:
             tokenizer = SenticGCNTokenizer(
-                train_files=[self.config.dataset_train, self.config.dataset_test], train_vocab=True
+                train_files=[*self.config.dataset_train, *self.config.dataset_test], train_vocab=True
             )
             if self.config.save_tokenizer:
                 tokenizer.save_pretrained(self.config.save_tokenizer_path)

From 8e66021ced2ab6fe76a8d368e580a039de7c9e64 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Fri, 7 Jan 2022 16:22:09 +0800
Subject: [PATCH 101/201] [#41] add missing to device call

---
 sgnlp/models/sentic_gcn/train.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py
index c1a2425..ca44dc7 100644
--- a/sgnlp/models/sentic_gcn/train.py
+++ b/sgnlp/models/sentic_gcn/train.py
@@ -7,6 +7,7 @@ import tempfile
 from typing import Tuple, Union
 
+import spacy
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -387,9 +388,10 @@ def _generate_embeddings(self, batch: list[torch.Tensor]) -> torch.Tensor:
         Returns:
             torch.Tensor: return embedding tensor
         """
-        return self.embed(batch["text_bert_indices"], token_type_ids=batch["bert_segment_indices"])[
-            "last_hidden_state"
-        ]
+        text_bert_indices = batch["text_bert_indices"].to(self.device)
+        bert_segment_indices = batch["bert_segment_indices"].to(self.device)
+
+        return self.embed(text_bert_indices, token_type_ids=bert_segment_indices)["last_hidden_state"]
 
     def train(self) -> None:
         """
@@ -538,7 +540,8 @@ def _generate_embeddings(self, batch: list[torch.Tensor]) -> torch.Tensor:
         Returns:
             torch.Tensor: return embedding tensor
         """
-        return self.embed(batch["text_indices"])
+        text_indices = batch["text_indices"].to(self.device)
+        return self.embed(text_indices)
 
     def train(self) -> None:
         """
@@ -556,6 +559,7 @@ def train(self) -> None:
         model_config = SenticGCNConfig.from_pretrained(config_path)
         model_path = self.global_best_model_tmpdir.joinpath("pytorch_model.bin")
         self.model = SenticGCNModel.from_pretrained(model_path, config=model_config)
+        self.model.to(self.device)
 
         # Evaluate test set
         test_acc, test_f1 = self._evaluate_acc_f1(test_dataloader)

From 45d1a74d39789ed63856920ed576e81fdd460629 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Fri, 7 Jan 2022 16:34:21 +0800
Subject: [PATCH 102/201] [#41] send x_len variable to cpu

---
 sgnlp/models/sentic_gcn/modules/dynamic_rnn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py b/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py
index 76ce70b..3a08ea3 100644
--- a/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py
+++ b/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py
@@ -59,7 +59,7 @@ def forward(self, x: torch.Tensor, x_len: torch.Tensor, h0: torch.Tensor = None)
         x = x[x_sort_idx.long()]
 
         # Pack
-        x_emb_p = torch.nn.utils.rnn.pack_padded_sequence(x, x_len, batch_first=self.batch_first)
+        x_emb_p = torch.nn.utils.rnn.pack_padded_sequence(x, x_len.cpu(), batch_first=self.batch_first)
 
         if self.rnn_type == "LSTM":
             out_pack, (ht, ct) = self.rnn(x_emb_p, None) if h0 is None else self.rnn(x_emb_p, (h0, h0))
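
The x_len.cpu() fix above is needed because recent PyTorch releases require the lengths argument of torch.nn.utils.rnn.pack_padded_sequence to be a 1D int64 tensor on the CPU, even when the sequences themselves stay on the GPU. A minimal sketch of the call pattern (tensor sizes are illustrative):

    import torch
    import torch.nn as nn

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    rnn = nn.LSTM(input_size=8, hidden_size=16, batch_first=True).to(device)

    # A padded batch of 3 sequences with true lengths 5, 3 and 2.
    x = torch.randn(3, 5, 8, device=device)
    x_len = torch.tensor([5, 3, 2], device=device)

    # Lengths must be moved to CPU before packing, even though x stays on `device`.
    packed = nn.utils.rnn.pack_padded_sequence(x, x_len.cpu(), batch_first=True)
    out_packed, (ht, ct) = rnn(packed, None)
    out, out_len = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
    print(out.shape)  # torch.Size([3, 5, 16])
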
From b660bdbe23dd5c0a53fb818ca104f1a683737143 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Fri, 7 Jan 2022 18:34:43 +0800
Subject: [PATCH 103/201] [#41] add missing to device for SenticGCNBertModel

---
 sgnlp/models/sentic_gcn/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py
index ca44dc7..3006460 100644
--- a/sgnlp/models/sentic_gcn/train.py
+++ b/sgnlp/models/sentic_gcn/train.py
@@ -7,7 +7,6 @@ import tempfile
 from typing import Tuple, Union
 
-import spacy
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -408,6 +407,7 @@ def train(self) -> None:
         model_config = SenticGCNBertConfig.from_pretrained(config_path)
         model_path = self.global_best_model_tmpdir.joinpath("pytorch_model.bin")
         self.model = SenticGCNBertModel.from_pretrained(model_path, config=model_config)
+        self.model.to(self.device)
 
         # Evaluate test set
         test_acc, test_f1 = self._evaluate_acc_f1(test_dataloader)
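
Patches 101 and 103 fix the same class of error: every tensor a module touches must live on the same device as the module's parameters, and a model reloaded with from_pretrained starts out on the CPU. A compact sketch of the convention (module and sizes are illustrative):

    import torch
    import torch.nn as nn

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = nn.Linear(8, 3)
    model.to(device)  # parameters move in place; required after (re)loading weights

    batch = {"text_indices": torch.randn(4, 8)}   # e.g. produced by a CPU-side DataLoader
    inputs = batch["text_indices"].to(device)     # inputs must follow the model's device

    logits = model(inputs)
    print(logits.device)  # matches `device`
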
From b98e38cee4cbbb49e485e23f96418a612c40399a Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Fri, 7 Jan 2022 21:26:16 +0800
Subject: [PATCH 104/201] [#41] add polyaxon files

---
 polyaxon/sentic_gcn/conda.yml                 | 17 +++++++++++
 polyaxon/sentic_gcn/experiment.df             | 23 +++++++++++++++
 polyaxon/sentic_gcn/notebook.df               | 29 +++++++++++++++++++
 polyaxon/sentic_gcn/notebook.yml              | 15 ++++++++++
 polyaxon/sentic_gcn/sentic_gcn_bert_train.yml | 22 ++++++++++++++
 polyaxon/sentic_gcn/sentic_gcn_train.yml      | 22 ++++++++++++++
 6 files changed, 128 insertions(+)
 create mode 100644 polyaxon/sentic_gcn/conda.yml
 create mode 100644 polyaxon/sentic_gcn/experiment.df
 create mode 100644 polyaxon/sentic_gcn/notebook.df
 create mode 100644 polyaxon/sentic_gcn/notebook.yml
 create mode 100644 polyaxon/sentic_gcn/sentic_gcn_bert_train.yml
 create mode 100644 polyaxon/sentic_gcn/sentic_gcn_train.yml

diff --git a/polyaxon/sentic_gcn/conda.yml b/polyaxon/sentic_gcn/conda.yml
new file mode 100644
index 0000000..4df317c
--- /dev/null
+++ b/polyaxon/sentic_gcn/conda.yml
@@ -0,0 +1,17 @@
+name: polyaxon
+channels:
+  - defaults
+  - conda-forge
+dependencies:
+  - python=3.9.7
+  - pip
+  - pip:
+    - spacy
+    - numpy
+    - torch
+    - scikit-learn
+    - transformers
+    - sentencepiece
+    - tokenizers
+
+# Feel free to change the version of any package
\ No newline at end of file
diff --git a/polyaxon/sentic_gcn/experiment.df b/polyaxon/sentic_gcn/experiment.df
new file mode 100644
index 0000000..7d7ffeb
--- /dev/null
+++ b/polyaxon/sentic_gcn/experiment.df
@@ -0,0 +1,23 @@
+# change base image as required
+FROM registry.aisingapore.net/polyaxon/cuda10:latest
+
+ARG USER="polyaxon"
+ARG WORK_DIR="/home/$USER"
+
+RUN rm /bin/sh && ln -s /bin/bash /bin/sh && \
+    apt update && apt install -y jq ca-certificates
+
+WORKDIR $WORK_DIR
+USER $USER
+
+COPY build/conda.yml .
+RUN conda env update -f conda.yml -n base && \
+    rm conda.yml
+
+WORKDIR /code
+
+RUN python -m spacy download en_core_web_sm
+
+COPY --chown=$USER:$USER build .
+
+ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:$LD_LIBRARY_PATH
diff --git a/polyaxon/sentic_gcn/notebook.df b/polyaxon/sentic_gcn/notebook.df
new file mode 100644
index 0000000..111db91
--- /dev/null
+++ b/polyaxon/sentic_gcn/notebook.df
@@ -0,0 +1,29 @@
+FROM registry.aisingapore.net/aiap/polyaxon/pytorch-tf2-cpu:latest
+
+ARG WORK_DIR="/code"
+
+RUN pip install jupyterlab==0.33.12
+
+WORKDIR $WORK_DIR
+
+RUN mkdir -p $WORK_DIR && chown -R 2222:2222 $WORK_DIR
+
+ARG ORG_JUPYTER="/opt/conda/bin/jupyter"
+ARG MOD_JUPYTER="/opt/conda/bin/jupyter.real"
+
+RUN mv $ORG_JUPYTER $MOD_JUPYTER && \
+    echo "#!/bin/bash" > $ORG_JUPYTER && \
+    echo "/code/link_workspace.sh &" >> $ORG_JUPYTER && \
+    echo "export SHELL=/bin/bash" >> $ORG_JUPYTER && \
+    echo "$MOD_JUPYTER \"\$@\"" >> $ORG_JUPYTER && \
+    chmod +x $ORG_JUPYTER
+
+COPY build/conda.yml /code
+COPY build/scripts/link_workspace.sh /code
+
+RUN apt-get update && apt-get -y install vim jq
+
+RUN conda env update -n polyaxon --file conda.yml
+RUN rm /code/conda.yml
+
+ENV LANG "C.UTF-8"
diff --git a/polyaxon/sentic_gcn/notebook.yml b/polyaxon/sentic_gcn/notebook.yml
new file mode 100644
index 0000000..615690b
--- /dev/null
+++ b/polyaxon/sentic_gcn/notebook.yml
@@ -0,0 +1,15 @@
+---
+version: 1
+
+kind: notebook
+
+build:
+  dockerfile: polyaxon/docker/notebook.df
+  context: .
+
+environment:
+  persistence:
+    data: ["data"]
+
+logging:
+  level: DEBUG
\ No newline at end of file
diff --git a/polyaxon/sentic_gcn/sentic_gcn_bert_train.yml b/polyaxon/sentic_gcn/sentic_gcn_bert_train.yml
new file mode 100644
index 0000000..2c3c3e4
--- /dev/null
+++ b/polyaxon/sentic_gcn/sentic_gcn_bert_train.yml
@@ -0,0 +1,22 @@
+---
+version: 1
+
+kind: experiment
+
+build:
+  dockerfile: polyaxon/docker/experiment.df
+  context: .
+
+environment:
+  resources:
+    gpu:
+      requests: 1
+      limits: 1
+  persistence:
+    data: ["data"]
+
+logging:
+  level: DEBUG
+
+run:
+  cmd: python train.py --config config/senticnet_gcn_bert_config.json
diff --git a/polyaxon/sentic_gcn/sentic_gcn_train.yml b/polyaxon/sentic_gcn/sentic_gcn_train.yml
new file mode 100644
index 0000000..fcc0869
--- /dev/null
+++ b/polyaxon/sentic_gcn/sentic_gcn_train.yml
@@ -0,0 +1,22 @@
+---
+version: 1
+
+kind: experiment
+
+build:
+  dockerfile: polyaxon/docker/experiment.df
+  context: .
+ +environment: + resources: + gpu: + requests: 1 + limits: 1 + persistence: + data: ["data"] + +logging: + level: DEBUG + +run: + cmd: python train.py --config config/senticnet_gcn_config.json From 013f6291df8fb30f580ee6d8b4edcd4f10df9fb0 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 7 Jan 2022 22:09:27 +0800 Subject: [PATCH 105/201] [#41] clean up typehints and imports --- sgnlp/models/sentic_gcn/__init__.py | 6 ++++ sgnlp/models/sentic_gcn/data_class.py | 2 -- sgnlp/models/sentic_gcn/modeling.py | 10 +++---- sgnlp/models/sentic_gcn/train.py | 28 +++++++++--------- sgnlp/models/sentic_gcn/utils.py | 42 +++++++++++++-------------- 5 files changed, 46 insertions(+), 42 deletions(-) diff --git a/sgnlp/models/sentic_gcn/__init__.py b/sgnlp/models/sentic_gcn/__init__.py index e69de29..25c08a8 100644 --- a/sgnlp/models/sentic_gcn/__init__.py +++ b/sgnlp/models/sentic_gcn/__init__.py @@ -0,0 +1,6 @@ +from config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig +from modeling import SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel +from preprocess import SenticGCNPreprocessor, SenticGCNBertPreprocessor +from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer +from train import SenticGCNTrainer, SenticGCNBertTrainer +from utils import BucketIterator, parse_args_and_load_config, download_tokenizer_files diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index fae37ff..7dc9c26 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -1,7 +1,5 @@ from dataclasses import dataclass, field -from torch.cuda.memory import memory_stats_as_nested_dict - @dataclass class SenticGCNTrainArgs: diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py index 1b21e22..878df06 100644 --- a/sgnlp/models/sentic_gcn/modeling.py +++ b/sgnlp/models/sentic_gcn/modeling.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional +from typing import Dict, List, Optional import torch import torch.nn as nn @@ -117,7 +117,7 @@ def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(self.torch_device) return mask * x - def forward(self, inputs: list[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNModelOutput: + def forward(self, inputs: List[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNModelOutput: text_indices, aspect_indices, left_indices, text_embeddings, adj = inputs text_len = torch.sum(text_indices != 0, dim=-1) aspect_len = torch.sum(aspect_indices != 0, dim=-1) @@ -235,7 +235,7 @@ def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor mask = torch.tensor(mask).unsqueeze(2).float().to(self.torch_device) return mask * x - def forward(self, inputs: list[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNBertModelOutput: + def forward(self, inputs: List[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNBertModelOutput: text_indices, aspect_indices, left_indices, text_embeddings, adj = inputs # text_indices, text_ text_len = torch.sum(text_indices != 0, dim=-1) @@ -312,7 +312,7 @@ def forward(self, token_ids: torch.Tensor) -> torch.Tensor: def build_embedding_model( cls, word_vec_file_path: str, - vocab: dict[str, int], + vocab: Dict[str, int], embed_dim: int = 300, ): """ @@ -321,7 +321,7 @@ def 
build_embedding_model( Args: word_vec_file_path (str): file path to the word vectors - vocab (dict[str, int]): vocab dictionary consisting of words as key and index as values + vocab (Dict[str, int]): vocab dictionary consisting of words as key and index as values embed_dim (int, optional): the embedding dimension. Defaults to 300. Returns: diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py index 3006460..c8c3235 100644 --- a/sgnlp/models/sentic_gcn/train.py +++ b/sgnlp/models/sentic_gcn/train.py @@ -5,7 +5,7 @@ import pickle import shutil import tempfile -from typing import Tuple, Union +from typing import Dict, List, Tuple, Union import torch import torch.nn as nn @@ -88,7 +88,7 @@ def _create_tokenizer(self) -> Union[SenticGCNTokenizer, SenticGCNBertTokenizer] def _create_embedding_model(self) -> Union[SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]: raise NotImplementedError("Please call from derived class only.") - def _generate_embeddings(self, batch: list[torch.Tensor]) -> torch.Tensor: + def _generate_embeddings(self, batch: List[torch.Tensor]) -> torch.Tensor: raise NotImplementedError("Please call from derived class only") def _save_model(self) -> None: @@ -98,12 +98,12 @@ def _save_model(self) -> None: if self.config.save_best_model: self.model.save_pretrained(self.config.save_model_path) - def _save_results(self, repeat_results: dict[str, dict]) -> None: + def _save_results(self, repeat_results: Dict[str, Dict]) -> None: """ Private helper metho to save the results dictionary at the end of the training. Args: - repeat_results (dict[str, dict]): dictionary containing the training results + repeat_results (Dict[str, Dict]): dictionary containing the training results """ if self.config.save_results: save_root_folder = pathlib.Path(self.config.save_results_folder) @@ -114,12 +114,12 @@ def _save_results(self, repeat_results: dict[str, dict]) -> None: with open(save_result_file, "wb") as f: pickle.dump(repeat_results, f) - def _clean_temp_dir(self, result_records: dict[str, dict[str, float]]) -> None: + def _clean_temp_dir(self, result_records: Dict[str, Dict[str, float]]) -> None: """ Helper method to clean up temp dir and model weights from repeat train loops. Args: - result_records (dict[str, dict[str, float]]): dictionary of result_records after training. + result_records (Dict[str, Dict[str, float]]): dictionary of result_records after training. """ for key, val in result_records.items(): if key == "test": @@ -247,7 +247,7 @@ def _train_loop( def _train( self, train_dataloader: Union[DataLoader, BucketIterator], val_dataloader: Union[DataLoader, BucketIterator] - ) -> dict[str, dict[str, Union[int, float]]]: + ) -> Dict[str, Dict[str, Union[int, float]]]: """ Method to execute a repeat train loop. Repeat amount is dependent on config. @@ -256,7 +256,7 @@ def _train( val_dataloader (Union[DataLoader, BucketIterator]): dataloader for test dataset Returns: - dict[str, dict[str, Union[int, float]]]: return a dictionary containing the train results. + Dict[str, Dict[str, Union[int, float]]]: return a dictionary containing the train results. 
""" criterion = nn.CrossEntropyLoss() _params = filter(lambda p: p.requires_grad, self.model.parameters()) @@ -377,12 +377,12 @@ def _generate_data_loaders(self) -> Tuple[DataLoader, DataLoader, DataLoader]: test_dataloader = DataLoader(self.test_data, batch_size=self.config.batch_size, shuffle=False) return train_dataloader, val_dataloader, test_dataloader - def _generate_embeddings(self, batch: list[torch.Tensor]) -> torch.Tensor: + def _generate_embeddings(self, batch: List[torch.Tensor]) -> torch.Tensor: """ Private helper method to generate embeddings. Args: - batch (list[torch.Tensor]): a batch of sub dataset + batch (List[torch.Tensor]): a batch of sub dataset Returns: torch.Tensor: return embedding tensor @@ -464,13 +464,13 @@ def _create_tokenizer(self) -> SenticGCNTokenizer: tokenizer.save_pretrained(self.config.save_tokenizer_path) return tokenizer - def _create_embedding_model(self, vocab: dict[str, int]) -> SenticGCNEmbeddingModel: + def _create_embedding_model(self, vocab: Dict[str, int]) -> SenticGCNEmbeddingModel: """ Private method to construct embedding model either via the from_pretrained method or building the embedding model from word vector files. (e.g. GloVe word vectors) Args: - vocab (dict[str, int]): dictionary of vocab from tokenizer + vocab (Dict[str, int]): dictionary of vocab from tokenizer Returns: SenticGCNEmbeddingModel: return a SenticGCNEmbeddingModel instance. @@ -530,12 +530,12 @@ def _generate_data_loaders(self) -> Tuple[BucketIterator, BucketIterator, Bucket test_dataloader = BucketIterator(self.test_data, batch_size=self.config.batch_size, shuffle=False) return train_dataloader, val_dataloader, test_dataloader - def _generate_embeddings(self, batch: list[torch.Tensor]) -> torch.Tensor: + def _generate_embeddings(self, batch: List[torch.Tensor]) -> torch.Tensor: """ Private helper method to generate embeddings. Args: - batch (list[torch.Tensor]): a batch of sub dataset + batch (List[torch.Tensor]): a batch of sub dataset Returns: torch.Tensor: return embedding tensor diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index c5d81af..5d5085b 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -7,13 +7,13 @@ import requests import urllib import math -from typing import Dict, Tuple +from typing import Dict, List, Tuple import numpy as np import spacy import torch from torch.utils.data import random_split, Dataset -from transformers import PreTrainedTokenizer, PreTrainedModel +from transformers import PreTrainedTokenizer from transformers.tokenization_utils_base import BatchEncoding from data_class import SenticGCNTrainArgs @@ -56,7 +56,7 @@ def set_random_seed(seed: int = 776) -> None: def download_tokenizer_files( base_url: str, save_folder: str, - files: list[str] = ["special_tokens_map.json", "tokenizer_config.json", "vocab.pkl"], + files: List[str] = ["special_tokens_map.json", "tokenizer_config.json", "vocab.pkl"], ) -> None: """ Helper method to download files from online storage. @@ -93,7 +93,7 @@ def download_url_file(url: str, save_folder: str) -> None: def pad_and_truncate( - sequence: list[float], + sequence: List[float], max_len: int, dtype: str = "int64", padding: str = "post", @@ -104,7 +104,7 @@ def pad_and_truncate( Helper method for padding and truncating text and aspect segment. Args: - sequence (list[float]): input sequence of indices + sequence (List[float]): input sequence of indices max_len (int): maximum len to pad dtype (str, optional): data type to cast indices. 
Defaults to "int64". padding (str, optional): type of padding, 'pre' or 'post'. Defaults to "post". @@ -262,7 +262,7 @@ class SenticGCNDataset(Dataset): Data class for SenticGCN dataset. """ - def __init__(self, data: list[Dict[str, torch.Tensor]]) -> None: + def __init__(self, data: List[Dict[str, torch.Tensor]]) -> None: self.data = data def __getitem__(self, index: int) -> Dict[str, torch.Tensor]: @@ -287,7 +287,7 @@ def __init__(self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer) - self.spacy_pipeline = spacy.load(config.spacy_pipeline) self.tokenizer = tokenizer - def _read_raw_dataset(self, dataset_type: str) -> list[str]: + def _read_raw_dataset(self, dataset_type: str) -> List[str]: """ Private helper method to read raw dataset files based on requested type (e.g. Train or Test). @@ -295,7 +295,7 @@ def _read_raw_dataset(self, dataset_type: str) -> list[str]: dataset_type (str): Type of dataset files to read. Train or Test. Returns: - list[str]: list of str consisting of the full text, aspect and polarity index. + List[str]: list of str consisting of the full text, aspect and polarity index. """ files_path = self.config.dataset_train if dataset_type == "train" else self.config.dataset_test all_lines = [] @@ -305,15 +305,15 @@ def _read_raw_dataset(self, dataset_type: str) -> list[str]: all_lines = all_lines + lines return all_lines - def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, list]: + def _generate_senticgcn_dataset(self, raw_data: List[str]) -> Dict[str, List]: """ Data preprocess method to generate all indices required for SenticGCN model training. Args: - raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. + raw_data (List[str]): list of text, aspect word and polarity read from raw dataset file. Returns: - Dict[str, list]]: return a dictionary of dataset sub-type and their list of values. + Dict[str, List]]: return a dictionary of dataset sub-type and their list of values. """ all_data = [] for i in range(0, len(raw_data), 3): @@ -355,15 +355,15 @@ def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, list]: ) return all_data - def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, list]: + def _generate_senticgcnbert_dataset(self, raw_data: List[str]) -> Dict[str, List]: """ Data preprocess method to generate all indices required for SenticGCNBert model training. Args: - raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. + raw_data (List[str]): List of text, aspect word and polarity read from raw dataset file. Returns: - Dict[str, list]: return a dictionary of dataset sub-type and their values. + Dict[str, List]: return a dictionary of dataset sub-type and their values. """ all_data = [] max_len = self.config.max_len @@ -492,7 +492,7 @@ class BucketIterator: def __init__( self, - data: list[dict[str, BatchEncoding]], + data: List[Dict[str, BatchEncoding]], batch_size: int, sort_key: str = "text_indices", shuffle=True, @@ -504,16 +504,16 @@ def __init__( self.batches = self._sort_and_pad(data, batch_size) self.batch_len = len(self.batches) - def _sort_and_pad(self, data: list[dict[str, list]], batch_size: int) -> list[dict[str, list[torch.Tensor]]]: + def _sort_and_pad(self, data: List[Dict[str, List]], batch_size: int) -> List[Dict[str, List[torch.Tensor]]]: """ Private method to sort and pad input dataset. 
         Args:
-            data (list[dict[str, list]]): input dataset
+            data (List[Dict[str, List]]): input dataset
             batch_size (int): batch size to split dataset
 
         Returns:
-            list[dict[str, list[torch.Tensor]]]: return list of dictionary of dataset batches
+            List[Dict[str, List[torch.Tensor]]]: return list of dictionary of dataset batches
         """
         num_batch = int(math.ceil(len(data) / batch_size))
         if self.sort:
@@ -525,15 +525,15 @@ def _sort_and_pad(self, data: list[dict[str, list]], batch_size: int) -> list[di
             batches.append(self._pad_data(sorted_data[i * batch_size : (i + 1) * batch_size]))
         return batches
 
-    def _pad_data(self, batch_data: dict[str, list]) -> dict[str, list[torch.Tensor]]:
+    def _pad_data(self, batch_data: Dict[str, List]) -> Dict[str, List[torch.Tensor]]:
         """
         Private method to pad each sub dataset to max length for their specific batch
 
         Args:
-            batch_data (dict[str, list]): dictionary of sub dataset and their list of values
+            batch_data (Dict[str, List]): dictionary of sub dataset and their list of values
 
         Returns:
-            dict[str, list[torch.Tensor]]: return a dictionary of list of tensor values
+            Dict[str, List[torch.Tensor]]: return a dictionary of list of tensor values
         """
         batch_text_indices = []
         batch_aspect_indices = []

From 8f5431acbb444523ef122831c245e9951ae2316d Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Fri, 7 Jan 2022 22:10:24 +0800
Subject: [PATCH 106/201] [#41] complete both preprocessor init methods

---
 sgnlp/models/sentic_gcn/preprocess.py | 80 +++++++++++++++++++--------
 1 file changed, 57 insertions(+), 23 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py
index e478dfc..052513e 100644
--- a/sgnlp/models/sentic_gcn/preprocess.py
+++ b/sgnlp/models/sentic_gcn/preprocess.py
@@ -1,4 +1,5 @@
-from typing import List
+import pathlib
+from typing import Dict, List
 
 import torch
 from transformers import PreTrainedTokenizer
@@ -17,25 +18,42 @@ def __init__(
         embedding_model: PreTrainedModel = None,
         tokenizer_name: str = None,
         embedding_model_name: str = None,
-        device: torch.device = torch.device("cpu"),
+        device: str = "cpu",
     ):
-        self.device = device
+        # Set device
+        self.device = (
+            torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else torch.device(device)
+        )
+        # Init Tokenizer
         if tokenizer is not None:
             self.tokenizer = tokenizer
         else:
             self.tokenizer = SenticGCNTokenizer.from_pretrained(tokenizer_name)
-
+        # Init Embedding model
         if embedding_model is not None:
             self.embedding_model = embedding_model
+            self.embedding_model.to(self.device)
         else:
-            embedding_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model_name)
-            self.embedding_model = SenticGCNEmbeddingModel.from_pretrained(
-                embedding_model_name, config=embedding_config
-            ).to(device)
+            embed_model_name = pathlib.Path(embedding_model_name)
+            if embed_model_name.is_dir():
+                config_path = embed_model_name.joinpath("config.json")
+                model_path = embed_model_name.joinpath("pytorch_model.bin")
+                if config_path.exists() and model_path.exists():
+                    embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_path)
+                    embed_model = SenticGCNEmbeddingModel.from_pretrained(model_path, config=embed_config)
+                else:
+                    raise ValueError(
+                        f"""Error creating embedding model! 
config.json and pytorch_model.bin + not found in directory {embedding_model_name}.""" + ) + else: + embed_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model_name) + embed_model = SenticGCNEmbeddingModel.from_pretrained(embedding_model_name, config=embed_config) + self.embedding_model = embed_model + self.embedding_model.to(self.device) - def __call__(self, data_batch: List[str]) -> BatchEncoding: - tokens = self.tokenizer(data_batch, padding=True, return_tensors="pt") - return tokens + def __call__(self, data_batch: List[Dict[str, List[str]]]) -> BatchEncoding: + pass # TODO class SenticGCNBertPreprocessor: @@ -43,24 +61,40 @@ def __init__( self, tokenizer: PreTrainedTokenizer = None, embedding_model: PreTrainedModel = None, - tokenizer_name: str = None, - embedding_model_name: str = None, - device: torch.device = torch.device("cpu"), + tokenizer_name: str = "bert-base-uncased", + embedding_model_name: str = "bert-base-uncased", + device: str = "cpu", ): - self.device = device + # Set device + self.device = ( + torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else torch.device(device) + ) + # Init Tokenizer if tokenizer is not None: self.tokenizer = tokenizer else: self.tokenizer = SenticGCNBertTokenizer.from_pretrained(tokenizer_name) - + # Init Embedding model if embedding_model is not None: self.embedding_model = embedding_model else: - embedding_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model_name) - self.embedding_model = SenticGCNBertEmbeddingModel.from_pretrained( - embedding_model_name, config=embedding_config - ).to(device) + embed_model_name = pathlib.Path(embedding_model_name) + if embed_model_name.is_dir(): + config_path = embed_model_name.joinpath("config.json") + model_path = embed_model_name.joinpath("pytorch_model.bin") + if config_path.exists() and model_path.exists(): + embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_path) + embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_path, config=embed_config) + else: + raise ValueError( + f"""Error creating embedding model! 
config.json and pytorch_model.bin + not found in directory {embedding_model_name}.""" + ) + else: + embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model_name) + embed_model = SenticGCNBertEmbeddingModel.from_pretrained(embedding_model_name, config=embed_config) + self.embedding_model = embed_model + self.embedding_model.to(self.device) - def __call__(self, data_batch: List[str]) -> BatchEncoding: - tokens = self.tokenizer(data_batch, padding=True, return_tensors="pt") - return tokens + def __call__(self, data_batch: List[Dict[str, List[str]]]) -> BatchEncoding: + pass # TODO From 769fe8e47e09a0f9ed3cfd85b9a16ba907de5e5b Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Sat, 8 Jan 2022 08:24:52 +0800 Subject: [PATCH 107/201] [#41] match default batch size with original code --- sgnlp/models/sentic_gcn/data_class.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index 7dc9c26..703ea70 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -143,7 +143,7 @@ class SenticGCNTrainArgs: learning_rate: float = field(default=0.001, metadata={"help": "Default learning rate for training."}) l2reg: float = field(default=0.00001, metadata={"help": "Default l2reg value."}) epochs: int = field(default=100, metadata={"help": "Number of epochs to train."}) - batch_size: int = field(default=32, metadata={"help": "Training batch size."}) + batch_size: int = field(default=16, metadata={"help": "Training batch size."}) log_step: int = field(default=5, metadata={"help": "Number of train steps to log results."}) embed_dim: int = field(default=300, metadata={"help": "Size of embedding."}) hidden_dim: int = field(default=300, metadata={"help": "Number of neurons for hidden layer."}) From f59444acaa5cef933c0d83e35225adf496b3149d Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Sat, 8 Jan 2022 09:22:27 +0800 Subject: [PATCH 108/201] [#41] add support to load embedding model from cloud storage --- sgnlp/models/sentic_gcn/preprocess.py | 66 +++++++++++++++++---------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index 052513e..bb173cd 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -1,4 +1,5 @@ import pathlib +import urllib.parse from typing import Dict, List import torch @@ -16,8 +17,10 @@ def __init__( self, tokenizer: PreTrainedTokenizer = None, embedding_model: PreTrainedModel = None, - tokenizer_name: str = None, - embedding_model_name: str = None, + tokenizer_name: str = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/", + embedding_model_name: str = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/", + config_filename: str = "config.json", + model_filename: str = "pytorch_model.bin", device: str = "cpu", ): # Set device @@ -31,24 +34,28 @@ def __init__( self.tokenizer = SenticGCNTokenizer.from_pretrained(tokenizer_name) # Init Embedding model if embedding_model is not None: + # Load from external instance self.embedding_model = embedding_model self.embedding_model.to(self.device) else: - embed_model_name = pathlib.Path(embedding_model_name) - if embed_model_name.is_dir(): - config_path = embed_model_name.joinpath("config.json") - model_path = embed_model_name.joinpath("pytorch_model.bin") - if config_path.exists() and model_path.exists(): + if 
embedding_model_name.startswith("https://") or embedding_model_name.startswith("http://"): + # Load from cloud + config_url = urllib.parse.urljoin(embedding_model_name, config_filename) + model_url = urllib.parse.urljoin(embedding_model_name, model_filename) + embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_url) + embed_model = SenticGCNEmbeddingModel.from_pretrained(model_url, config=embed_config) + else: + # Load from local folder + embed_model_name = pathlib.Path(embedding_model_name) + if embed_model_name.is_dir(): + config_path = embed_model_name.joinpath("config.json") + model_path = embed_model_name.joinpath("pytorch_model.bin") embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_path) embed_model = SenticGCNEmbeddingModel.from_pretrained(model_path, config=embed_config) else: - raise ValueError( - f"""Error creating embedding model! config.json and pytorch_model.bin - not found in directory {embedding_model_name}.""" - ) - else: - embed_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model_name) - embed_model = SenticGCNEmbeddingModel.from_pretrained(embedding_model_name, config=embed_config) + # Load from HuggingFace model repository + embed_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model_name) + embed_model = SenticGCNEmbeddingModel.from_pretrained(embedding_model_name, config=embed_config) self.embedding_model = embed_model self.embedding_model.to(self.device) @@ -63,6 +70,8 @@ def __init__( embedding_model: PreTrainedModel = None, tokenizer_name: str = "bert-base-uncased", embedding_model_name: str = "bert-base-uncased", + config_filename: str = "config.json", + model_filename: str = "pytorch_model.bin", device: str = "cpu", ): # Set device @@ -76,23 +85,30 @@ def __init__( self.tokenizer = SenticGCNBertTokenizer.from_pretrained(tokenizer_name) # Init Embedding model if embedding_model is not None: + # Load from external instance self.embedding_model = embedding_model + self.embedding_model.to(self.device) else: - embed_model_name = pathlib.Path(embedding_model_name) - if embed_model_name.is_dir(): - config_path = embed_model_name.joinpath("config.json") - model_path = embed_model_name.joinpath("pytorch_model.bin") - if config_path.exists() and model_path.exists(): + if embedding_model_name.startswith("https://") or embedding_model_name.startswith("http://"): + # Load from cloud + config_url = urllib.parse.urljoin(embedding_model_name, config_filename) + model_url = urllib.parse.urljoin(embedding_model_name, model_filename) + embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_url) + embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_url, config=embed_config) + else: + # Load from local folder + embed_model_name = pathlib.Path(embedding_model_name) + if embed_model_name.is_dir(): + config_path = embed_model_name.joinpath("config.json") + model_path = embed_model_name.joinpath("pytorch_model.bin") embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_path) embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_path, config=embed_config) else: - raise ValueError( - f"""Error creating embedding model! 
config.json and pytorch_model.bin - not found in directory {embedding_model_name}.""" + # Load from HuggingFace model repository + embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model_name) + embed_model = SenticGCNBertEmbeddingModel.from_pretrained( + embedding_model_name, config=embed_config ) - else: - embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model_name) - embed_model = SenticGCNBertEmbeddingModel.from_pretrained(embedding_model_name, config=embed_config) self.embedding_model = embed_model self.embedding_model.to(self.device) From 286a68679fc345ea10a858ce1b2d3263318e69ed Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Sat, 8 Jan 2022 09:48:15 +0800 Subject: [PATCH 109/201] [#41] refactor to streamline preprocess to remove unused input args --- sgnlp/models/sentic_gcn/preprocess.py | 104 ++++++++++++++++---------- sgnlp/models/sentic_gcn/utils.py | 13 ++-- 2 files changed, 72 insertions(+), 45 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index bb173cd..ec5388a 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -1,24 +1,28 @@ import pathlib +import shutil +import tempfile import urllib.parse -from typing import Dict, List +from typing import Dict, List, Union import torch -from transformers import PreTrainedTokenizer +from transformers import PreTrainedTokenizer, PreTrainedModel from transformers.tokenization_utils_base import BatchEncoding -from transformers.utils.dummy_pt_objects import PreTrainedModel from config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig from modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer +from utils import download_tokenizer_files class SenticGCNPreprocessor: def __init__( self, - tokenizer: PreTrainedTokenizer = None, - embedding_model: PreTrainedModel = None, - tokenizer_name: str = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/", - embedding_model_name: str = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/", + tokenizer: Union[ + str, PreTrainedTokenizer + ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/", + embedding_model: Union[ + str, PreTrainedModel + ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/", config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", device: str = "cpu", @@ -27,26 +31,39 @@ def __init__( self.device = ( torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else torch.device(device) ) + # Init Tokenizer - if tokenizer is not None: - self.tokenizer = tokenizer + if isinstance(tokenizer, PreTrainedTokenizer): + # Load from external instance + tokenizer_ = tokenizer else: - self.tokenizer = SenticGCNTokenizer.from_pretrained(tokenizer_name) + if tokenizer.startswith("https://") or tokenizer.startswith("http://"): + # Load from cloud + # Download tokenizer files to temp dir + with tempfile.TemporaryDirectory() as tmpdir: + temp_dir = pathlib.Path(tmpdir) + download_tokenizer_files(tokenizer, temp_dir) + tokenizer_ = SenticGCNTokenizer.from_pretrained(temp_dir) + shutil.rmtree(temp_dir, ignore_errors=True) + else: + # Load from local directory or from HuggingFace model repository + tokenizer_ = SenticGCNTokenizer.from_pretrained(tokenizer) + self.tokenizer = tokenizer_ + # Init Embedding model - if 
embedding_model is not None: + if isinstance(embedding_model, PreTrainedModel): # Load from external instance - self.embedding_model = embedding_model - self.embedding_model.to(self.device) + embed_model = embedding_model else: - if embedding_model_name.startswith("https://") or embedding_model_name.startswith("http://"): + if embedding_model.startswith("https://") or embedding_model.startswith("http://"): # Load from cloud - config_url = urllib.parse.urljoin(embedding_model_name, config_filename) - model_url = urllib.parse.urljoin(embedding_model_name, model_filename) + config_url = urllib.parse.urljoin(embedding_model, config_filename) + model_url = urllib.parse.urljoin(embedding_model, model_filename) embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_url) embed_model = SenticGCNEmbeddingModel.from_pretrained(model_url, config=embed_config) else: # Load from local folder - embed_model_name = pathlib.Path(embedding_model_name) + embed_model_name = pathlib.Path(embedding_model) if embed_model_name.is_dir(): config_path = embed_model_name.joinpath("config.json") model_path = embed_model_name.joinpath("pytorch_model.bin") @@ -54,10 +71,10 @@ def __init__( embed_model = SenticGCNEmbeddingModel.from_pretrained(model_path, config=embed_config) else: # Load from HuggingFace model repository - embed_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model_name) - embed_model = SenticGCNEmbeddingModel.from_pretrained(embedding_model_name, config=embed_config) - self.embedding_model = embed_model - self.embedding_model.to(self.device) + embed_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model) + embed_model = SenticGCNEmbeddingModel.from_pretrained(embedding_model, config=embed_config) + self.embedding_model = embed_model + self.embedding_model.to(self.device) def __call__(self, data_batch: List[Dict[str, List[str]]]) -> BatchEncoding: pass # TODO @@ -66,10 +83,8 @@ def __call__(self, data_batch: List[Dict[str, List[str]]]) -> BatchEncoding: class SenticGCNBertPreprocessor: def __init__( self, - tokenizer: PreTrainedTokenizer = None, - embedding_model: PreTrainedModel = None, - tokenizer_name: str = "bert-base-uncased", - embedding_model_name: str = "bert-base-uncased", + tokenizer: Union[str, PreTrainedTokenizer] = "bert-base-uncased", + embedding_model: Union[str, PreTrainedModel] = "bert-base-uncased", config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", device: str = "cpu", @@ -78,26 +93,39 @@ def __init__( self.device = ( torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else torch.device(device) ) + # Init Tokenizer - if tokenizer is not None: - self.tokenizer = tokenizer + if isinstance(tokenizer, PreTrainedTokenizer): + # Load from external instance + tokenizer_ = tokenizer else: - self.tokenizer = SenticGCNBertTokenizer.from_pretrained(tokenizer_name) + if tokenizer.startswith("https://") or tokenizer.startswith("http://"): + # Load from cloud + # Download tokenizer files to temp dir + with tempfile.TemporaryDirectory() as tmpdir: + temp_dir = pathlib.Path(tmpdir) + download_tokenizer_files(tokenizer, temp_dir) + tokenizer_ = SenticGCNBertTokenizer.from_pretrained(temp_dir) + shutil.rmtree(temp_dir, ignore_errors=True) + else: + # Load from local directory or from HuggingFace model repository + tokenizer_ = SenticGCNBertTokenizer.from_pretrained(tokenizer) + self.tokenizer = tokenizer_ + # Init Embedding model - if embedding_model is not None: + if isinstance(embedding_model, PreTrainedModel): # Load 
from external instance
-            self.embedding_model = embedding_model
+            embed_model = embedding_model
         else:
-            if embedding_model_name.startswith("https://") or embedding_model_name.startswith("http://"):
+            if embedding_model.startswith("https://") or embedding_model.startswith("http://"):
                 # Load from cloud
-                config_url = urllib.parse.urljoin(embedding_model_name, config_filename)
-                model_url = urllib.parse.urljoin(embedding_model_name, model_filename)
+                config_url = urllib.parse.urljoin(embedding_model, config_filename)
+                model_url = urllib.parse.urljoin(embedding_model, model_filename)
                 embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_url)
                 embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_url, config=embed_config)
             else:
                 # Load from local folder
-                embed_model_name = pathlib.Path(embedding_model_name)
+                embed_model_name = pathlib.Path(embedding_model)
                 if embed_model_name.is_dir():
                     config_path = embed_model_name.joinpath("config.json")
                     model_path = embed_model_name.joinpath("pytorch_model.bin")
@@ -105,10 +133,8 @@
                     embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_path)
                     embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_path, config=embed_config)
                 else:
                     # Load from HuggingFace model repository
-                    embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model_name)
-                    embed_model = SenticGCNBertEmbeddingModel.from_pretrained(
-                        embedding_model_name, config=embed_config
-                    )
+                    embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model)
+                    embed_model = SenticGCNBertEmbeddingModel.from_pretrained(embedding_model, config=embed_config)
             self.embedding_model = embed_model
             self.embedding_model.to(self.device)

diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py
index 5d5085b..7541e12 100644
--- a/sgnlp/models/sentic_gcn/utils.py
+++ b/sgnlp/models/sentic_gcn/utils.py
@@ -7,7 +7,7 @@
 import requests
 import urllib
 import math
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union

 import numpy as np
 import spacy
@@ -55,7 +55,7 @@ def set_random_seed(seed: int = 776) -> None:

 def download_tokenizer_files(
     base_url: str,
-    save_folder: str,
+    save_folder: Union[str, pathlib.Path],
     files: List[str] = ["special_tokens_map.json", "tokenizer_config.json", "vocab.pkl"],
 ) -> None:
     """
@@ -63,22 +63,23 @@

     Args:
         base_url (str): Url string to storage folder.
-        save_folder (str): Local folder to save downloaded files. Folder will be created if it does not exists.
+        save_folder (Union[str, pathlib.Path]):
+            Local folder to save downloaded files. Folder will be created if it does not exist.
     """
     file_paths = [urllib.parse.urljoin(base_url, file_name) for file_name in files]
     for file_path in file_paths:
         download_url_file(file_path, save_folder)


-def download_url_file(url: str, save_folder: str) -> None:
+def download_url_file(url: str, save_folder: Union[str, pathlib.Path]) -> None:
     """
     Helper method to download and save url file.

     Args:
         url (str): Url of file to download.
-        save_folder (str): Folder to save downloaded file. Will be created if it does not exists.
+        save_folder (Union[str, pathlib.Path]): Folder to save downloaded file. Will be created if it does not exist.
     """
-    save_folder_path = pathlib.Path(save_folder)
+    save_folder_path = pathlib.Path(save_folder) if not isinstance(save_folder, pathlib.Path) else save_folder
     save_folder_path.mkdir(exist_ok=True)
     fn_start_pos = url.rfind("/") + 1
     file_name = url[fn_start_pos:]
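The download helpers above follow the usual streaming-download pattern. A minimal sketch of the idea (a hedged illustration under that assumption, not the repository's exact implementation):

import pathlib
import requests

def fetch_file(url: str, save_folder) -> pathlib.Path:
    folder = pathlib.Path(save_folder)
    folder.mkdir(parents=True, exist_ok=True)
    target = folder / url.rsplit("/", 1)[-1]  # same idea as the url.rfind("/") + 1 slicing above
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(target, "wb") as handle:
            # Stream in chunks so large model files never sit fully in memory.
            for chunk in response.iter_content(chunk_size=8192):
                handle.write(chunk)
    return target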
""" - save_folder_path = pathlib.Path(save_folder) + save_folder_path = pathlib.Path(save_folder) if not isinstance(save_folder, pathlib.Path) else save_folder save_folder_path.mkdir(exist_ok=True) fn_start_pos = url.rfind("/") + 1 file_name = url[fn_start_pos:] From b308d8e822fbff5798064d8a35107aeb72d636cc Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Sat, 8 Jan 2022 09:50:08 +0800 Subject: [PATCH 110/201] [#41] remove hardcoded variables --- sgnlp/models/sentic_gcn/preprocess.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index ec5388a..651231d 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -65,8 +65,8 @@ def __init__( # Load from local folder embed_model_name = pathlib.Path(embedding_model) if embed_model_name.is_dir(): - config_path = embed_model_name.joinpath("config.json") - model_path = embed_model_name.joinpath("pytorch_model.bin") + config_path = embed_model_name.joinpath(config_filename) + model_path = embed_model_name.joinpath(model_filename) embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_path) embed_model = SenticGCNEmbeddingModel.from_pretrained(model_path, config=embed_config) else: @@ -127,8 +127,8 @@ def __init__( # Load from local folder embed_model_name = pathlib.Path(embedding_model) if embed_model_name.is_dir(): - config_path = embed_model_name.joinpath("config.json") - model_path = embed_model_name.joinpath("pytorch_model.bin") + config_path = embed_model_name.joinpath(config_filename) + model_path = embed_model_name.joinpath(model_filename) embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_path) embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_path, config=embed_config) else: From d58189468d016657656f4e36f3b6fc898a7aac59 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Sat, 8 Jan 2022 10:03:56 +0800 Subject: [PATCH 111/201] [#41] add logging and exception handling --- sgnlp/models/sentic_gcn/preprocess.py | 192 ++++++++++++++++---------- 1 file changed, 116 insertions(+), 76 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index 651231d..71baa05 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -1,3 +1,4 @@ +import logging import pathlib import shutil import tempfile @@ -14,6 +15,9 @@ from utils import download_tokenizer_files +logging.basicConfig(level=logging.DEBUG) + + class SenticGCNPreprocessor: def __init__( self, @@ -32,49 +36,67 @@ def __init__( torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else torch.device(device) ) - # Init Tokenizer - if isinstance(tokenizer, PreTrainedTokenizer): - # Load from external instance - tokenizer_ = tokenizer - else: - if tokenizer.startswith("https://") or tokenizer.startswith("http://"): - # Load from cloud - # Download tokenizer files to temp dir - with tempfile.TemporaryDirectory() as tmpdir: - temp_dir = pathlib.Path(tmpdir) - download_tokenizer_files(tokenizer, temp_dir) - tokenizer_ = SenticGCNTokenizer.from_pretrained(temp_dir) - shutil.rmtree(temp_dir, ignore_errors=True) + try: + # Init Tokenizer + if isinstance(tokenizer, PreTrainedTokenizer): + # Load from external instance + tokenizer_ = tokenizer else: - # Load from local directory or from HuggingFace model repository - tokenizer_ = SenticGCNTokenizer.from_pretrained(tokenizer) - self.tokenizer = tokenizer_ + if 
tokenizer.startswith("https://") or tokenizer.startswith("http://"): + # Load from cloud + # Download tokenizer files to temp dir + with tempfile.TemporaryDirectory() as tmpdir: + temp_dir = pathlib.Path(tmpdir) + download_tokenizer_files(tokenizer, temp_dir) + tokenizer_ = SenticGCNTokenizer.from_pretrained(temp_dir) + shutil.rmtree(temp_dir, ignore_errors=True) + else: + # Load from local directory or from HuggingFace model repository + tokenizer_ = SenticGCNTokenizer.from_pretrained(tokenizer) + self.tokenizer = tokenizer_ + except Exception as e: + logging.error(e) + raise Exception( + """ + Error initializing tokenizer! Please ensure that input tokenizer is either a PreTrainedTokenizer instance, + an url to cloud storage folder, local folder or HuggingFace model name. + """ + ) - # Init Embedding model - if isinstance(embedding_model, PreTrainedModel): - # Load from external instance - embed_model = embedding_model - else: - if embedding_model.startswith("https://") or embedding_model.startswith("http://"): - # Load from cloud - config_url = urllib.parse.urljoin(embedding_model, config_filename) - model_url = urllib.parse.urljoin(embedding_model, model_filename) - embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_url) - embed_model = SenticGCNEmbeddingModel.from_pretrained(model_url, config=embed_config) + try: + # Init Embedding model + if isinstance(embedding_model, PreTrainedModel): + # Load from external instance + embed_model = embedding_model else: - # Load from local folder - embed_model_name = pathlib.Path(embedding_model) - if embed_model_name.is_dir(): - config_path = embed_model_name.joinpath(config_filename) - model_path = embed_model_name.joinpath(model_filename) - embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_path) - embed_model = SenticGCNEmbeddingModel.from_pretrained(model_path, config=embed_config) + if embedding_model.startswith("https://") or embedding_model.startswith("http://"): + # Load from cloud + config_url = urllib.parse.urljoin(embedding_model, config_filename) + model_url = urllib.parse.urljoin(embedding_model, model_filename) + embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_url) + embed_model = SenticGCNEmbeddingModel.from_pretrained(model_url, config=embed_config) else: - # Load from HuggingFace model repository - embed_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model) - embed_model = SenticGCNEmbeddingModel.from_pretrained(embedding_model, config=embed_config) - self.embedding_model = embed_model - self.embedding_model.to(self.device) + # Load from local folder + embed_model_name = pathlib.Path(embedding_model) + if embed_model_name.is_dir(): + config_path = embed_model_name.joinpath(config_filename) + model_path = embed_model_name.joinpath(model_filename) + embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_path) + embed_model = SenticGCNEmbeddingModel.from_pretrained(model_path, config=embed_config) + else: + # Load from HuggingFace model repository + embed_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model) + embed_model = SenticGCNEmbeddingModel.from_pretrained(embedding_model, config=embed_config) + self.embedding_model = embed_model + self.embedding_model.to(self.device) + except Exception as e: + logging.error(e) + raise Exception( + """ + Error initializing embedding model! Please ensure that input tokenizer is either a PreTrainedModel instance, + an url to cloud storage folder, local folder or HuggingFace model name. 
+                """
+            )

     def __call__(self, data_batch: List[Dict[str, List[str]]]) -> BatchEncoding:
         pass  # TODO
@@ -94,49 +116,67 @@ def __init__(
             torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else torch.device(device)
         )

-        # Init Tokenizer
-        if isinstance(tokenizer, PreTrainedTokenizer):
-            # Load from external instance
-            tokenizer_ = tokenizer
-        else:
-            if tokenizer.startswith("https://") or tokenizer.startswith("http://"):
-                # Load from cloud
-                # Download tokenizer files to temp dir
-                with tempfile.TemporaryDirectory() as tmpdir:
-                    temp_dir = pathlib.Path(tmpdir)
-                    download_tokenizer_files(tokenizer, temp_dir)
-                    tokenizer_ = SenticGCNBertTokenizer.from_pretrained(temp_dir)
-                    shutil.rmtree(temp_dir, ignore_errors=True)
+        try:
+            # Init Tokenizer
+            if isinstance(tokenizer, PreTrainedTokenizer):
+                # Load from external instance
+                tokenizer_ = tokenizer
             else:
-                # Load from local directory or from HuggingFace model repository
-                tokenizer_ = SenticGCNBertTokenizer.from_pretrained(tokenizer)
-        self.tokenizer = tokenizer_
+                if tokenizer.startswith("https://") or tokenizer.startswith("http://"):
+                    # Load from cloud
+                    # Download tokenizer files to temp dir
+                    with tempfile.TemporaryDirectory() as tmpdir:
+                        temp_dir = pathlib.Path(tmpdir)
+                        download_tokenizer_files(tokenizer, temp_dir)
+                        tokenizer_ = SenticGCNBertTokenizer.from_pretrained(temp_dir)
+                        shutil.rmtree(temp_dir, ignore_errors=True)
+                else:
+                    # Load from local directory or from HuggingFace model repository
+                    tokenizer_ = SenticGCNBertTokenizer.from_pretrained(tokenizer)
+            self.tokenizer = tokenizer_
+        except Exception as e:
+            logging.error(e)
+            raise Exception(
+                """
+                Error initializing tokenizer! Please ensure that input tokenizer is either a PreTrainedTokenizer instance,
+                a URL to cloud storage folder, local folder or HuggingFace model name.
+                """
+            )

-        # Init Embedding model
-        if isinstance(embedding_model, PreTrainedModel):
-            # Load from external instance
-            embed_model = embedding_model
-        else:
-            if embedding_model.startswith("https://") or embedding_model.startswith("http://"):
-                # Load from cloud
-                config_url = urllib.parse.urljoin(embedding_model, config_filename)
-                model_url = urllib.parse.urljoin(embedding_model, model_filename)
-                embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_url)
-                embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_url, config=embed_config)
+        try:
+            # Init Embedding model
+            if isinstance(embedding_model, PreTrainedModel):
+                # Load from external instance
+                embed_model = embedding_model
             else:
-                # Load from local folder
-                embed_model_name = pathlib.Path(embedding_model)
-                if embed_model_name.is_dir():
-                    config_path = embed_model_name.joinpath(config_filename)
-                    model_path = embed_model_name.joinpath(model_filename)
-                    embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_path)
-                    embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_path, config=embed_config)
+                if embedding_model.startswith("https://") or embedding_model.startswith("http://"):
+                    # Load from cloud
+                    config_url = urllib.parse.urljoin(embedding_model, config_filename)
+                    model_url = urllib.parse.urljoin(embedding_model, model_filename)
+                    embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_url)
+                    embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_url, config=embed_config)
                 else:
-                    # Load from HuggingFace model repository
-                    embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model)
-                    embed_model = SenticGCNBertEmbeddingModel.from_pretrained(embedding_model, config=embed_config)
-        self.embedding_model = embed_model
-        self.embedding_model.to(self.device)
+                    # Load from local folder
+                    embed_model_name = pathlib.Path(embedding_model)
+                    if embed_model_name.is_dir():
+                        config_path = embed_model_name.joinpath(config_filename)
+                        model_path = embed_model_name.joinpath(model_filename)
+                        embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_path)
+                        embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_path, config=embed_config)
+                    else:
+                        # Load from HuggingFace model repository
+                        embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model)
+                        embed_model = SenticGCNBertEmbeddingModel.from_pretrained(embedding_model, config=embed_config)
+            self.embedding_model = embed_model
+            self.embedding_model.to(self.device)
+        except Exception as e:
+            logging.error(e)
+            raise Exception(
+                """
+                Error initializing embedding model! Please ensure that input embedding model is either a PreTrainedModel instance,
+                a URL to cloud storage folder, local folder or HuggingFace model name.
+                """
+            )

     def __call__(self, data_batch: List[Dict[str, List[str]]]) -> BatchEncoding:
         pass  # TODO

From 3931946176b89c54dde348a5c76e1b09edcce731 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Sat, 8 Jan 2022 11:08:44 +0800
Subject: [PATCH 112/201] [#41] introduce base processor class for code reuse
---
 sgnlp/models/sentic_gcn/preprocess.py | 141 +++++++++++---------------
 1 file changed, 58 insertions(+), 83 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py
index 71baa05..e740b51 100644
--- a/sgnlp/models/sentic_gcn/preprocess.py
+++ b/sgnlp/models/sentic_gcn/preprocess.py
@@ -5,8 +5,9 @@
 import urllib.parse
 from typing import Dict, List, Union

+import spacy
 import torch
-from transformers import PreTrainedTokenizer, PreTrainedModel
+from transformers import PreTrainedTokenizer, PretrainedConfig, PreTrainedModel
 from transformers.tokenization_utils_base import BatchEncoding

 from config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig
@@ -18,23 +19,24 @@
 logging.basicConfig(level=logging.DEBUG)


-class SenticGCNPreprocessor:
+class SenticGCNBasePreprocessor:
     def __init__(
         self,
-        tokenizer: Union[
-            str, PreTrainedTokenizer
-        ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/",
-        embedding_model: Union[
-            str, PreTrainedModel
-        ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/",
+        tokenizer: Union[str, PreTrainedTokenizer],
+        embedding_model: Union[str, PreTrainedModel],
+        tokenizer_class: PreTrainedTokenizer,
+        embedding_config_class: PretrainedConfig,
+        embedding_model_class: PreTrainedModel,
         config_filename: str = "config.json",
         model_filename: str = "pytorch_model.bin",
+        spacy_pipeline: str = "en_core_web_sm",
         device: str = "cpu",
-    ):
+    ) -> None:
         # Set device
         self.device = (
             torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else torch.device(device)
         )
+        self.spacy_pipeline = spacy.load(spacy_pipeline)

         try:
             # Init Tokenizer
@@ -48,11 +50,11 @@
                 with tempfile.TemporaryDirectory() as tmpdir:
                     temp_dir = pathlib.Path(tmpdir)
                     download_tokenizer_files(tokenizer, temp_dir)
-                    tokenizer_ = SenticGCNTokenizer.from_pretrained(temp_dir)
+                    tokenizer_ = tokenizer_class.from_pretrained(temp_dir)
                     shutil.rmtree(temp_dir, ignore_errors=True)
                 else:
                     # Load from local directory or from HuggingFace model repository
-                    tokenizer_ = SenticGCNTokenizer.from_pretrained(tokenizer)
+                    tokenizer_ = tokenizer_class.from_pretrained(tokenizer)
             self.tokenizer = tokenizer_
         except Exception as e:
             logging.error(e)
@@ -73,20 +75,20 @@
                 # Load from cloud
                 config_url = urllib.parse.urljoin(embedding_model, config_filename)
                 model_url = urllib.parse.urljoin(embedding_model, model_filename)
-                embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_url)
-                embed_model = SenticGCNEmbeddingModel.from_pretrained(model_url, config=embed_config)
+                embed_config = embedding_config_class.from_pretrained(config_url)
+                embed_model = embedding_model_class.from_pretrained(model_url, config=embed_config)
             else:
                 # Load from local folder
                 embed_model_name = pathlib.Path(embedding_model)
                 if embed_model_name.is_dir():
                     config_path = embed_model_name.joinpath(config_filename)
                     model_path = embed_model_name.joinpath(model_filename)
-                    embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_path)
+                    embed_config = embedding_config_class.from_pretrained(config_path)
+
embed_model = embedding_model_class.from_pretrained(model_path, config=embed_config) else: # Load from HuggingFace model repository - embed_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model) - embed_model = SenticGCNEmbeddingModel.from_pretrained(embedding_model, config=embed_config) + embed_config = embedding_config_class.from_pretrained(embedding_model) + embed_model = embedding_model_class.from_pretrained(embedding_model, config=embed_config) self.embedding_model = embed_model self.embedding_model.to(self.device) except Exception as e: @@ -98,85 +100,58 @@ def __init__( """ ) + +class SenticGCNPreprocessor(SenticGCNBasePreprocessor): + def __init__( + self, + tokenizer: Union[ + str, PreTrainedTokenizer + ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/", + embedding_model: Union[ + str, PreTrainedModel + ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/", + config_filename: str = "config.json", + model_filename: str = "pytorch_model.bin", + spacy_pipeline: str = "en_core_web_sm", + device: str = "cpu", + ): + super().__init__( + tokenizer=tokenizer, + embedding_model=embedding_model, + tokenizer_class=SenticGCNTokenizer, + embedding_config_class=SenticGCNEmbeddingConfig, + embedding_model_class=SenticGCNEmbeddingModel, + config_filename=config_filename, + model_filename=model_filename, + spacy_pipeline=spacy_pipeline, + device=device, + ) + def __call__(self, data_batch: List[Dict[str, List[str]]]) -> BatchEncoding: pass # TODO -class SenticGCNBertPreprocessor: +class SenticGCNBertPreprocessor(SenticGCNBasePreprocessor): def __init__( self, tokenizer: Union[str, PreTrainedTokenizer] = "bert-base-uncased", embedding_model: Union[str, PreTrainedModel] = "bert-base-uncased", config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", + spacy_pipeline: str = "en_core_web_sm", device: str = "cpu", ): - # Set device - self.device = ( - torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else torch.device(device) + super().__init__( + tokenizer=tokenizer, + embedding_model=embedding_model, + tokenizer_class=SenticGCNBertTokenizer, + embedding_config_class=SenticGCNBertEmbeddingConfig, + embedding_model_class=SenticGCNBertEmbeddingModel, + config_filename=config_filename, + model_filename=model_filename, + spacy_pipeline=spacy_pipeline, + device=device, ) - try: - # Init Tokenizer - if isinstance(tokenizer, PreTrainedTokenizer): - # Load from external instance - tokenizer_ = tokenizer - else: - if tokenizer.startswith("https://") or tokenizer.startswith("http://"): - # Load from cloud - # Download tokenizer files to temp dir - with tempfile.TemporaryDirectory() as tmpdir: - temp_dir = pathlib.Path(tmpdir) - download_tokenizer_files(tokenizer, temp_dir) - tokenizer_ = SenticGCNBertTokenizer.from_pretrained(temp_dir) - shutil.rmtree(temp_dir, ignore_errors=True) - else: - # Load from local directory or from HuggingFace model repository - tokenizer_ = SenticGCNBertTokenizer.from_pretrained(tokenizer) - self.tokenizer = tokenizer_ - except Exception as e: - logging.error(e) - raise Exception( - """ - Error initializing tokenizer! Please ensure that input tokenizer is either a PreTrainedTokenizer instance, - an url to cloud storage folder, local folder or HuggingFace model name. 
-                """
-            )
-
-        try:
-            # Init Embedding model
-            if isinstance(embedding_model, PreTrainedModel):
-                # Load from external instance
-                embed_model = embedding_model
-            else:
-                if embedding_model.startswith("https://") or embedding_model.startswith("http://"):
-                    # Load from cloud
-                    config_url = urllib.parse.urljoin(embedding_model, config_filename)
-                    model_url = urllib.parse.urljoin(embedding_model, model_filename)
-                    embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_url)
-                    embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_url, config=embed_config)
-                else:
-                    # Load from local folder
-                    embed_model_name = pathlib.Path(embedding_model)
-                    if embed_model_name.is_dir():
-                        config_path = embed_model_name.joinpath(config_filename)
-                        model_path = embed_model_name.joinpath(model_filename)
-                        embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(config_path)
-                        embed_model = SenticGCNBertEmbeddingModel.from_pretrained(model_path, config=embed_config)
-                    else:
-                        # Load from HuggingFace model repository
-                        embed_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model)
-                        embed_model = SenticGCNBertEmbeddingModel.from_pretrained(embedding_model, config=embed_config)
-            self.embedding_model = embed_model
-            self.embedding_model.to(self.device)
-        except Exception as e:
-            logging.error(e)
-            raise Exception(
-                """
-                Error initializing embedding model! Please ensure that input tokenizer is either a PreTrainedModel instance,
-                an url to cloud storage folder, local folder or HuggingFace model name.
-                """
-            )

     def __call__(self, data_batch: List[Dict[str, List[str]]]) -> BatchEncoding:
         pass  # TODO

From a1a69fb3108c2bb8e835b6c4b5dc96c9b350951d Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Sat, 8 Jan 2022 17:23:30 +0800
Subject: [PATCH 113/201] [#41] draft implementation for processing batch of
 input sentence with multiple occurrences of an aspect
---
 sgnlp/models/sentic_gcn/preprocess.py | 28 +++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py
index e740b51..e4dd079 100644
--- a/sgnlp/models/sentic_gcn/preprocess.py
+++ b/sgnlp/models/sentic_gcn/preprocess.py
@@ -3,6 +3,7 @@
 import shutil
 import tempfile
 import urllib.parse
+from collections import namedtuple
 from typing import Dict, List, Union

 import spacy
@@ -19,6 +20,10 @@
 logging.basicConfig(level=logging.DEBUG)


+SenticGCNData = namedtuple("SenticGCNData", ["full_text", "aspect", "left_text"])
+SenticGCNBertData = namedtuple("SenticGCNBertData", ["full_text", "aspect", "left_text", "full_text_with_bert_tokens"])
+
+
 class SenticGCNBasePreprocessor:
@@ -153,5 +158,28 @@
             device=device,
         )

+    def _process_indices(self, data_batch: List[SenticGCNBertData]):
+        pass
+
+    def _process_inputs(self, data_batch: List[Dict[str, List[str]]]) -> List[SenticGCNBertData]:
+        processed_inputs = []
+        for batch in data_batch:
+            full_text = batch["sentence"].lower().strip()
+            for aspect in batch["aspect"]:
+                aspect = aspect.lower().strip()
+                aspect_idxs = [index for index in range(len(full_text)) if full_text.startswith(aspect, index)]
+                for aspect_index in aspect_idxs:
+                    left_text = full_text[:aspect_index].strip()
+                    full_text_with_bert_tokens = f"[CLS] {full_text} [SEP] {aspect} [SEP]"
+                    processed_inputs.append(
+                        SenticGCNBertData(
+                            full_text=full_text,
+                            aspect=aspect,
+                            left_text=left_text,
+                            full_text_with_bert_tokens=full_text_with_bert_tokens,
+                        )
+                    )
+        return processed_inputs
+
     def __call__(self,
data_batch: List[Dict[str, List[str]]]) -> BatchEncoding: pass # TODO From 3a7476bcdce2695e5044ae76ed87f7ac5a9fd13e Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Sat, 8 Jan 2022 20:45:33 +0800 Subject: [PATCH 114/201] [#41] first complete implementation for SenticGCNBertPreprocessor --- sgnlp/models/sentic_gcn/preprocess.py | 112 ++++++++++++++++++++++++-- sgnlp/models/sentic_gcn/utils.py | 7 -- 2 files changed, 107 insertions(+), 12 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index e4dd079..49e48e2 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -6,6 +6,7 @@ from collections import namedtuple from typing import Dict, List, Union +import numpy as np import spacy import torch from transformers import PreTrainedTokenizer, PretrainedConfig, PreTrainedModel @@ -14,7 +15,12 @@ from config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig from modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer -from utils import download_tokenizer_files +from utils import ( + load_and_process_senticnet, + download_tokenizer_files, + pad_and_truncate, + generate_dependency_adj_matrix, +) logging.basicConfig(level=logging.DEBUG) @@ -35,6 +41,7 @@ def __init__( config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", spacy_pipeline: str = "en_core_web_sm", + senticnet: str = "senticnet.pickle", device: str = "cpu", ) -> None: # Set device @@ -43,6 +50,20 @@ def __init__( ) self.spacy_pipeline = spacy.load(spacy_pipeline) + if senticnet.endswith(".pkl") or senticnet.endswith(".pickle"): + self.senticnet = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=senticnet) + elif senticnet.endswith(".txt"): + self.senticnet = load_and_process_senticnet(senticnet_file_path=senticnet) + else: + raise ValueError( + f""" + Invalid SenticNet file! + For processed SenticNet dictionary, please provide pickle file location + (i.e. file with .pkl or .pickle extension). + For raw SenticNet-5.0 file, please provide text file path (i.e. 
file with .txt extension) + """ + ) + try: # Init Tokenizer if isinstance(tokenizer, PreTrainedTokenizer): @@ -144,6 +165,8 @@ def __init__( config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", spacy_pipeline: str = "en_core_web_sm", + senticnet: str = "senticnet.pkl", + max_len: int = 85, device: str = "cpu", ): super().__init__( @@ -155,11 +178,89 @@ def __init__( config_filename=config_filename, model_filename=model_filename, spacy_pipeline=spacy_pipeline, + senticnet=senticnet, device=device, ) + self.max_len = max_len - def _process_indices(self, data_batch: List[SenticGCNBertData]): - pass + def _process_indices(self, data_batch: List[SenticGCNBertData]) -> List[torch.Tensor]: + all_text_indices = [] + all_aspect_indices = [] + all_left_indices = [] + all_text_bert_indices = [] + all_bert_segment_indices = [] + all_sdat_graph = [] + for data in data_batch: + text_indices = self.tokenizer( + data.full_text, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + aspect_indices = self.tokenizer( + data.aspect, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + left_indices = self.tokenizer( + data.left_text, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + text_bert_indices = self.tokenizer( + data.full_text_with_bert_tokens, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + text_len = np.sum(text_indices["input_ids"] != 0) + aspect_len = np.sum(aspect_indices["input_ids"] != 0) + concat_segment_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1) + concat_segment_indices = pad_and_truncate(concat_segment_indices, self.max_len) + + graph = generate_dependency_adj_matrix(data.full_text, data.aspect, self.senticnet, self.spacy_pipeline) + sdat_graph = np.pad( + graph, + ( + (0, self.max_len - graph.shape[0]), + (0, self.max_len - graph.shape[0]), + ), + "constant", + ) + + all_text_indices.append(text_indices["input_ids"]) + all_aspect_indices.append(aspect_indices["input_ids"]) + all_left_indices.append(left_indices["input_ids"]) + all_text_bert_indices.append(text_bert_indices["input_ids"]) + all_bert_segment_indices.append(concat_segment_indices) + all_sdat_graph.append(sdat_graph) + + all_text_bert_indices = torch.tensor(all_text_bert_indices).to(self.device) + all_bert_segment_indices = torch.tensor(np.array(all_bert_segment_indices)).to(self.device) + text_embeddings = self.embedding_model(all_text_bert_indices, token_type_ids=all_bert_segment_indices)[ + "last_hidden_state" + ] + return [ + torch.tensor(all_text_indices), + torch.tensor(all_aspect_indices), + torch.tensor(all_left_indices), + text_embeddings, + torch.tensor(all_sdat_graph), + ] def _process_inputs(self, data_batch: List[Dict[str, List[str]]]) -> List[SenticGCNBertData]: processed_inputs = [] @@ -181,5 +282,6 @@ def _process_inputs(self, data_batch: List[Dict[str, List[str]]]) -> List[Sentic ) return processed_inputs - def __call__(self, data_batch: List[Dict[str, List[str]]]) -> BatchEncoding: - pass # TODO + def __call__(self, data_batch: List[Dict[str, List[str]]]) -> List[torch.Tensor]: + processed_inputs = self._process_inputs(data_batch) + 
return self._process_indices(processed_inputs) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 7541e12..b36879c 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -438,13 +438,6 @@ def _generate_senticgcnbert_dataset(self, raw_data: List[str]) -> Dict[str, List "constant", ) - assert len(text_indices["input_ids"]) == max_len - assert len(aspect_indices["input_ids"]) == max_len - assert len(left_indices["input_ids"]) == max_len - assert len(text_bert_indices["input_ids"]) == max_len - assert len(concat_segment_indices) == max_len - assert len(sdat_graph) == max_len - all_data.append( { "text_indices": torch.tensor(text_indices["input_ids"]), From ceb18a77050ae2dcdda0d69de0fe6d58678033f5 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Sat, 8 Jan 2022 20:55:27 +0800 Subject: [PATCH 115/201] [#41] bugfix for open file mode for senticnet loader --- sgnlp/models/sentic_gcn/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index b36879c..d8756c6 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -206,7 +206,7 @@ def load_and_process_senticnet( """ saved_senticnet_file_path = pathlib.Path(saved_preprocessed_senticnet_file_path) if saved_senticnet_file_path.exists() and not save_preprocessed_senticnet: - with open(saved_senticnet_file_path, "r") as f: + with open(saved_senticnet_file_path, "rb") as f: sentic_dict = pickle.load(f) else: senticnet_file_path = pathlib.Path(senticnet_file_path) From 5ae22a61bac88924c53cf766eddfa823346128cb Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Sat, 8 Jan 2022 22:13:20 +0800 Subject: [PATCH 116/201] [#41] complete preprocessor for both SenticGCN and SenticGCNBert --- sgnlp/models/sentic_gcn/data_class.py | 12 +- sgnlp/models/sentic_gcn/preprocess.py | 181 ++++++++++++++++++++++++-- 2 files changed, 171 insertions(+), 22 deletions(-) diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index 703ea70..a417676 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -183,14 +183,4 @@ def __post_init__(self): assert self.max_len > 0, "Max_len must be greater than 0." # Assign sub dataset columns name - self.data_cols = ( - ["text_indices", "aspect_indices", "left_indices", "text_embeddings", "sdat_graph"] - if self.model == "senticgcn" - else [ - "text_indices", - "aspect_indices", - "left_indices", - "text_embeddings", - "sdat_graph", - ] - ) + self.data_cols = ["text_indices", "aspect_indices", "left_indices", "text_embeddings", "sdat_graph"] diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index 49e48e2..16199ff 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -10,7 +10,6 @@ import spacy import torch from transformers import PreTrainedTokenizer, PretrainedConfig, PreTrainedModel -from transformers.tokenization_utils_base import BatchEncoding from config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig from modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel @@ -31,6 +30,11 @@ class SenticGCNBasePreprocessor: + """ + Base preprocessor class provides initialization for spacy, senticnet, tokenizer and embedding model. + Class is only meant to be inherited by derived preprocessor. 
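+    """
+

The shape of this refactor in miniature: shared initialization moves into one base class that receives the concrete tokenizer, config and model classes as constructor arguments, and each derived preprocessor only supplies its own classes and defaults. A hedged sketch of the pattern (simplified names for illustration, not the repository code):

from sgnlp.models.sentic_gcn.tokenization import SenticGCNBertTokenizer

class BasePreprocessorSketch:
    def __init__(self, tokenizer_cls, name):
        # The injected class decides which concrete tokenizer gets loaded.
        self.tokenizer = tokenizer_cls.from_pretrained(name)

class BertPreprocessorSketch(BasePreprocessorSketch):
    def __init__(self, name="bert-base-uncased"):
        super().__init__(SenticGCNBertTokenizer, name)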
     def __init__(
         self,
         tokenizer: Union[str, PreTrainedTokenizer],
         embedding_model: Union[str, PreTrainedModel],
         tokenizer_class: PreTrainedTokenizer,
         embedding_config_class: PretrainedConfig,
         embedding_model_class: PreTrainedModel,
         config_filename: str = "config.json",
         model_filename: str = "pytorch_model.bin",
         spacy_pipeline: str = "en_core_web_sm",
+        senticnet: str = "senticnet.pickle",
         device: str = "cpu",
     ) -> None:
@@ -50,6 +54,7 @@
         )
         self.spacy_pipeline = spacy.load(spacy_pipeline)

+        # Load senticnet
         if senticnet.endswith(".pkl") or senticnet.endswith(".pickle"):
             self.senticnet = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=senticnet)
         elif senticnet.endswith(".txt"):
@@ -128,6 +133,11 @@


 class SenticGCNPreprocessor(SenticGCNBasePreprocessor):
+    """
+    Class for preprocessing sentence(s) and its aspect(s) to a batch of tensors for the SenticGCNModel
+    to predict on.
+    """
+
     def __init__(
         self,
         tokenizer: Union[
             str, PreTrainedTokenizer
         ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/",
         embedding_model: Union[
             str, PreTrainedModel
         ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/",
         config_filename: str = "config.json",
         model_filename: str = "pytorch_model.bin",
         spacy_pipeline: str = "en_core_web_sm",
         device: str = "cpu",
-    ):
+    ) -> None:
         super().__init__(
             tokenizer=tokenizer,
             embedding_model=embedding_model,
             tokenizer_class=SenticGCNTokenizer,
             embedding_config_class=SenticGCNEmbeddingConfig,
             embedding_model_class=SenticGCNEmbeddingModel,
             config_filename=config_filename,
             model_filename=model_filename,
             spacy_pipeline=spacy_pipeline,
             device=device,
         )

+    def _process_indices(self, data_batch: List[SenticGCNData]) -> List[torch.Tensor]:
+        """
+        Private helper method to generate all indices and embeddings from list of input data
+        required for model input.
+
+        Args:
+            data_batch (List[SenticGCNData]): list of processed inputs as SenticGCNData
+
+        Returns:
+            List[torch.Tensor]: return a list of tensors for model input
+        """
+        all_text_indices = []
+        all_aspect_indices = []
+        all_left_indices = []
+        all_sdat_graph = []
+        max_len = max([len(data.full_text) for data in data_batch])
+        for data in data_batch:
+            text_indices = self.tokenizer(
+                data.full_text,
+                max_length=max_len,
+                padding="max_length",
+                truncation=True,
+                add_special_tokens=False,
+                return_tensors=None,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            aspect_indices = self.tokenizer(
+                data.aspect,
+                max_length=max_len,
+                padding="max_length",
+                truncation=True,
+                add_special_tokens=False,
+                return_tensors=None,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            left_indices = self.tokenizer(
+                data.left_text,
+                max_length=max_len,
+                padding="max_length",
+                truncation=True,
+                add_special_tokens=False,
+                return_tensors=None,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            graph = generate_dependency_adj_matrix(data.full_text, data.aspect, self.senticnet, self.spacy_pipeline)
+            sdat_graph = np.pad(
+                graph,
+                ((0, max_len - graph.shape[0]), (0, max_len - graph.shape[0])),
+                "constant",
+            )
+
+            all_text_indices.append(text_indices["input_ids"])
+            all_aspect_indices.append(aspect_indices["input_ids"])
+            all_left_indices.append(left_indices["input_ids"])
+            all_sdat_graph.append(sdat_graph)
+
+        all_text_indices = torch.tensor(all_text_indices).to(self.device)
+        text_embeddings = self.embedding_model(all_text_indices)
+
+        return [
+            all_text_indices,
+            torch.tensor(all_aspect_indices).to(self.device),
+            torch.tensor(all_left_indices).to(self.device),
+            text_embeddings,
+            torch.tensor(all_sdat_graph).to(self.device),
+        ]
+
+    def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[SenticGCNData]:
+        """
+        Private helper method to process input data batch.
+        Input entries are repeated for each input aspect.
+        If an input aspect has multiple occurrences in the sentence, each occurrence is processed as an entry.
+
+        Args:
+            data_batch (List[Dict[str, Union[str, List[str]]]]): list of dictionaries with 2 keys, 'sentence' and 'aspect'.
+                'sentence' values are strings and 'aspect' values are lists of accompanying aspects.
+
+        Returns:
+            List[SenticGCNData]: return list of processed inputs as SenticGCNData
+        """
+        processed_inputs = []
+        for batch in data_batch:
+            full_text = batch["sentence"].lower().strip()
+            for aspect in batch["aspect"]:
+                aspect = aspect.lower().strip()
+                aspect_idxs = [index for index in range(len(full_text)) if full_text.startswith(aspect, index)]
+                for aspect_index in aspect_idxs:
+                    left_text = full_text[:aspect_index].strip()
+                    processed_inputs.append(SenticGCNData(full_text=full_text, aspect=aspect, left_text=left_text))
+        return processed_inputs
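A worked example of the occurrence expansion above (str.startswith with a start offset finds every match position; note that this is plain substring matching):

full_text = "the soup was hot but the soup bowl was chipped"
aspect = "soup"
positions = [i for i in range(len(full_text)) if full_text.startswith(aspect, i)]
# positions -> [4, 25]: one entry is produced per occurrence, each with its own left context
left_texts = [full_text[:i].strip() for i in positions]
# -> ["the", "the soup was hot but the"]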
+
+    def __call__(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[torch.Tensor]:
+        """
+        Method to generate list of input tensors from a list of sentences and their accompanying lists of aspects.
+
+        Args:
+            data_batch (List[Dict[str, Union[str, List[str]]]]): list of dictionaries with 2 keys, 'sentence' and 'aspect'.
+                'sentence' values are strings and 'aspect' values are lists of accompanying aspects.
+
+        Returns:
+            List[torch.Tensor]: return a list of ordered tensors for 'text_indices', 'aspect_indices', 'left_indices',
+                'text_embeddings' and 'sdat_graph'.
+        """
+        processed_inputs = self._process_inputs(data_batch)
+        return self._process_indices(processed_inputs)


 class SenticGCNBertPreprocessor(SenticGCNBasePreprocessor):
+    """
+    Class for preprocessing sentence(s) and its aspect(s) to a batch of tensors for the SenticGCNBertModel
+    to predict on.
+    """
+
     def __init__(
         self,
         tokenizer: Union[str, PreTrainedTokenizer] = "bert-base-uncased",
         embedding_model: Union[str, PreTrainedModel] = "bert-base-uncased",
         config_filename: str = "config.json",
         model_filename: str = "pytorch_model.bin",
         spacy_pipeline: str = "en_core_web_sm",
         senticnet: str = "senticnet.pkl",
         max_len: int = 85,
         device: str = "cpu",
-    ):
+    ) -> None:
         super().__init__(
             tokenizer=tokenizer,
             embedding_model=embedding_model,
             tokenizer_class=SenticGCNBertTokenizer,
             embedding_config_class=SenticGCNBertEmbeddingConfig,
             embedding_model_class=SenticGCNBertEmbeddingModel,
             config_filename=config_filename,
             model_filename=model_filename,
             spacy_pipeline=spacy_pipeline,
             senticnet=senticnet,
             device=device,
         )
         self.max_len = max_len
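Before the indices method below, it helps to see the BERT segment layout it builds: for the auxiliary sentence "[CLS] text [SEP] aspect [SEP]", segment id 0 covers [CLS], the text tokens and the first [SEP]; segment id 1 covers the aspect tokens and the final [SEP]; the remainder is zero padding. A small worked example with illustrative lengths:

text_len, aspect_len, max_len = 5, 2, 12
segments = [0] * (text_len + 2) + [1] * (aspect_len + 1)
segments = segments + [0] * (max_len - len(segments))  # pad_and_truncate performs this padding step
# -> [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0]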
     def _process_indices(self, data_batch: List[SenticGCNBertData]) -> List[torch.Tensor]:
+        """
+        Private helper method to generate all indices and embeddings from list of input data
+        required for model input.
+
+        Args:
+            data_batch (List[SenticGCNBertData]): list of processed inputs as SenticGCNBertData
+
+        Returns:
+            List[torch.Tensor]: return a list of tensors for model input
+        """
         all_text_indices = []
         all_aspect_indices = []
         all_left_indices = []
@@ -193,6 +324,7 @@
         for data in data_batch:
             text_indices = self.tokenizer(
                 data.full_text,
+                max_length=self.max_len,
                 padding="max_length",
                 truncation=True,
                 add_special_tokens=False,
                 return_tensors=None,
                 return_attention_mask=False,
                 return_token_type_ids=False,
             )
             aspect_indices = self.tokenizer(
                 data.aspect,
+                max_length=self.max_len,
                 padding="max_length",
                 truncation=True,
                 add_special_tokens=False,
                 return_tensors=None,
                 return_attention_mask=False,
                 return_token_type_ids=False,
             )
             left_indices = self.tokenizer(
                 data.left_text,
+                max_length=self.max_len,
                 padding="max_length",
                 truncation=True,
                 add_special_tokens=False,
                 return_tensors=None,
                 return_attention_mask=False,
                 return_token_type_ids=False,
             )
             text_bert_indices = self.tokenizer(
                 data.full_text_with_bert_tokens,
+                max_length=self.max_len,
                 padding="max_length",
                 truncation=True,
                 add_special_tokens=False,
                 return_tensors=None,
                 return_attention_mask=False,
                 return_token_type_ids=False,
             )
@@ -254,15 +389,28 @@
         text_embeddings = self.embedding_model(all_text_bert_indices, token_type_ids=all_bert_segment_indices)[
             "last_hidden_state"
         ]
+
         return [
-            torch.tensor(all_text_indices),
-            torch.tensor(all_aspect_indices),
-            torch.tensor(all_left_indices),
+            torch.tensor(all_text_indices).to(self.device),
+            torch.tensor(all_aspect_indices).to(self.device),
+            torch.tensor(all_left_indices).to(self.device),
             text_embeddings,
-            torch.tensor(all_sdat_graph),
+            torch.tensor(all_sdat_graph).to(self.device),
         ]

-    def _process_inputs(self, data_batch: List[Dict[str, List[str]]]) -> List[SenticGCNBertData]:
+    def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[SenticGCNBertData]:
+        """
+        Private helper method to process input data batch.
+        Input entries are repeated for each input aspect.
+        If an input aspect has multiple occurrences in the sentence, each occurrence is processed as an entry.
+
+        Args:
+            data_batch (List[Dict[str, Union[str, List[str]]]]): list of dictionaries with 2 keys, 'sentence' and 'aspect'.
+                'sentence' values are strings and 'aspect' values are lists of accompanying aspects.
+
+        Returns:
+            List[SenticGCNBertData]: return list of processed inputs as SenticGCNBertData
+        """
         processed_inputs = []
         for batch in data_batch:
             full_text = batch["sentence"].lower().strip()
             for aspect in batch["aspect"]:
                 aspect = aspect.lower().strip()
                 aspect_idxs = [index for index in range(len(full_text)) if full_text.startswith(aspect, index)]
                 for aspect_index in aspect_idxs:
                     left_text = full_text[:aspect_index].strip()
                     full_text_with_bert_tokens = f"[CLS] {full_text} [SEP] {aspect} [SEP]"
                     processed_inputs.append(
                         SenticGCNBertData(
                             full_text=full_text,
                             aspect=aspect,
                             left_text=left_text,
                             full_text_with_bert_tokens=full_text_with_bert_tokens,
                         )
                     )
         return processed_inputs

-    def __call__(self, data_batch: List[Dict[str, List[str]]]) -> List[torch.Tensor]:
+    def __call__(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[torch.Tensor]:
+        """
+        Method to generate list of input tensors from a list of sentences and their accompanying lists of aspects.
+
+        Args:
+            data_batch (List[Dict[str, Union[str, List[str]]]]): list of dictionaries with 2 keys, 'sentence' and 'aspect'.
+                'sentence' values are strings and 'aspect' values are lists of accompanying aspects.
+
+        Returns:
+            List[torch.Tensor]: return a list of ordered tensors for 'text_indices', 'aspect_indices', 'left_indices',
+                'text_embeddings' and 'sdat_graph'.
+        """
         processed_inputs = self._process_inputs(data_batch)
         return self._process_indices(processed_inputs)
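Taken together, the preprocessor is now callable end to end on raw sentences. A hypothetical usage sketch (artifact defaults as declared above; the output order follows the __call__ docstring):

preprocessor = SenticGCNBertPreprocessor(device="cpu")
batch = [{"sentence": "The soup is hot but the service was slow.", "aspect": ["soup", "service"]}]
tensors = preprocessor(batch)
# tensors -> [text_indices, aspect_indices, left_indices, text_embeddings, sdat_graph]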
+        """
         processed_inputs = self._process_inputs(data_batch)
         return self._process_indices(processed_inputs)

From 087bcde70dcf1f145c2a0c69bd783d0c0b42adbf Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 10 Jan 2022 08:38:36 +0800
Subject: [PATCH 117/201] [#41] removed redundant torch device cast

---
 sgnlp/models/sentic_gcn/config.py   |  6 ------
 sgnlp/models/sentic_gcn/modeling.py | 10 ++++------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/config.py b/sgnlp/models/sentic_gcn/config.py
index 467b867..b37bcb9 100644
--- a/sgnlp/models/sentic_gcn/config.py
+++ b/sgnlp/models/sentic_gcn/config.py
@@ -12,7 +12,6 @@ class SenticGCNConfig(PretrainedConfig):
         hidden_dim (:obj:`int`, defaults to 300): Size of hidden dimension.
         dropout (:obj:`float`, defaults to 0.3): Dropout percentage.
         polarities_dim (:obj:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral).
-        device (:obj:`str`, defaults to 'cuda`): Type of torch device.
         loss_function (:obj:`str`, defaults to 'cross_entropy'): Loss function for training/eval.

     Example:
@@ -29,7 +28,6 @@ def __init__(
         hidden_dim: int = 300,
         polarities_dim: int = 3,
         dropout: float = 0.3,
-        device: str = "cuda",
         loss_function: str = "cross_entropy",
         **kwargs
     ) -> None:
@@ -38,7 +36,6 @@ def __init__(
         self.hidden_dim = hidden_dim
         self.dropout = dropout
         self.polarities_dim = polarities_dim
-        self.device = device
         self.loss_function = loss_function

@@ -53,7 +50,6 @@ class SenticGCNBertConfig(PretrainedConfig):
         max_seq_len (:obj:`int`, defaults to 85): The max sequence length to pad and truncate.
         dropout (:obj:`float`, defaults to 0.3): Dropout percentage.
         polarities_dim (:obj:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral).
-        device (:obj:`str`, defaults to 'cuda'): Type of torch device.
         loss_function (:obj:`str`, defaults to 'cross_entropy'): Loss function for training/eval.
Example: @@ -70,7 +66,6 @@ def __init__( max_seq_len: int = 85, polarities_dim: int = 3, dropout: float = 0.3, - device: str = "cuda", loss_function: str = "cross_entropy", **kwargs ) -> None: @@ -80,7 +75,6 @@ def __init__( self.max_seq_len = max_seq_len self.dropout = dropout self.polarities_dim = polarities_dim - self.device = device self.loss_function = loss_function diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py index 878df06..9ab599b 100644 --- a/sgnlp/models/sentic_gcn/modeling.py +++ b/sgnlp/models/sentic_gcn/modeling.py @@ -78,7 +78,6 @@ def __init__(self, config: SenticGCNConfig) -> None: self.gc2 = GraphConvolution(2 * config.hidden_dim, 2 * config.hidden_dim) self.fc = nn.Linear(2 * config.hidden_dim, config.polarities_dim) self.text_embed_dropout = nn.Dropout(config.dropout) - self.torch_device = torch.device(config.device) if config.loss_function == "cross_entropy": self.loss_function = nn.CrossEntropyLoss() @@ -100,7 +99,7 @@ def position_weight( weight[i].append(1 - (j - aspect_double_idx[i, 1] / context_len)) for j in range(text_len[i], seq_len): weight[i].append(0) - weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2).to(self.torch_device) + weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2) return weight * x def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor: @@ -114,7 +113,7 @@ def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor mask[i].append(1) for j in range(aspect_double_idx[i, 1] + 1, seq_len): mask[i].append(0) - mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(self.torch_device) + mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2) return mask * x def forward(self, inputs: List[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNModelOutput: @@ -196,7 +195,6 @@ def __init__(self, config: SenticGCNBertConfig) -> None: self.gc3 = GraphConvolution(config.hidden_dim, config.hidden_dim) self.fc = nn.Linear(config.hidden_dim, config.polarities_dim) self.text_embed_dropout = nn.Dropout(config.dropout) - self.torch_device = torch.device(config.device) self.max_seq_len = config.max_seq_len self.loss_function = config.loss_function @@ -218,7 +216,7 @@ def position_weight( weight[i].append(1 - (j - aspect_double_idx[i, 1]) / context_len) for j in range(text_len[i], seq_len): weight[i].append(0) - weight = torch.tensor(weight).unsqueeze(2).to(self.torch_device) + weight = torch.tensor(weight).unsqueeze(2) return weight * x def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor: @@ -232,7 +230,7 @@ def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor mask[i].append(1) for j in range(min(aspect_double_idx[i, 1] + 1, self.max_seq_len), seq_len): mask[i].append(0) - mask = torch.tensor(mask).unsqueeze(2).float().to(self.torch_device) + mask = torch.tensor(mask).unsqueeze(2).float() return mask * x def forward(self, inputs: List[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNBertModelOutput: From 3c8b9f9cf1110050ba43e634ef3866f9756e6eed Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 10 Jan 2022 08:48:33 +0800 Subject: [PATCH 118/201] [#41] return preprocess text inputs from preprocessor --- sgnlp/models/sentic_gcn/preprocess.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index 16199ff..391f174 100644 --- 
a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -4,7 +4,7 @@ import tempfile import urllib.parse from collections import namedtuple -from typing import Dict, List, Union +from typing import Dict, List, Tuple, Union import numpy as np import spacy @@ -257,7 +257,9 @@ def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> processed_inputs.append(SenticGCNData(full_text=full_text, aspect=aspect, left_text=left_text)) return processed_inputs - def __call__(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[torch.Tensor]: + def __call__( + self, data_batch: List[Dict[str, Union[str, List[str]]]] + ) -> Tuple[List[SenticGCNData, List[torch.Tensor]]]: """ Method to generate list of input tensors from a list of sentences and their accompanying list of aspect. @@ -266,11 +268,11 @@ def __call__(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[t 'sentence' value are strings and 'aspect' value is a list of accompanying aspect. Returns: - List[torch.Tensor]: return a list of ordered tensors for 'text_indices', 'aspect_indices', 'left_indices', - 'text_embeddings' and 'sdat_graph'. + Tuple[List[SenticGCNData, List[torch.Tensor]]]: return a list of ordered tensors for 'text_indices', + 'aspect_indices', 'left_indices', 'text_embeddings' and 'sdat_graph'. """ processed_inputs = self._process_inputs(data_batch) - return self._process_indices(processed_inputs) + return processed_inputs, self._process_indices(processed_inputs) class SenticGCNBertPreprocessor(SenticGCNBasePreprocessor): @@ -430,7 +432,9 @@ def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> ) return processed_inputs - def __call__(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[torch.Tensor]: + def __call__( + self, data_batch: List[Dict[str, Union[str, List[str]]]] + ) -> Tuple[List[SenticGCNData, List[torch.Tensor]]]: """ Method to generate list of input tensors from a list of sentences and their accompanying list of aspect. @@ -439,8 +443,8 @@ def __call__(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[t 'sentence' value are strings and 'aspect' value is a list of accompanying aspect. Returns: - List[torch.Tensor]: return a list of ordered tensors for 'text_indices', 'aspect_indices', 'left_indices', - 'text_embeddings' and 'sdat_graph'. + Tuple[List[SenticGCNData, List[torch.Tensor]]]: return a list of ordered tensors for 'text_indices', + 'aspect_indices', 'left_indices', 'text_embeddings' and 'sdat_graph'. 
""" processed_inputs = self._process_inputs(data_batch) - return self._process_indices(processed_inputs) + return processed_inputs, self._process_indices(processed_inputs) From 5e6bbde2f07e99d6098a8be727699a25827cd953 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 10 Jan 2022 10:51:00 +0800 Subject: [PATCH 119/201] [#41] return split tokens and their respective aspect token index from preprocessor --- sgnlp/models/sentic_gcn/preprocess.py | 42 ++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index 391f174..7ab9393 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -1,6 +1,7 @@ import logging import pathlib import shutil +import string import tempfile import urllib.parse from collections import namedtuple @@ -25,8 +26,13 @@ logging.basicConfig(level=logging.DEBUG) -SenticGCNData = namedtuple("SenticGCNData", ["full_text", "aspect", "left_text"]) -SenticGCNBertData = namedtuple("SenticGCNBertData", ["full_text", "aspect", "left_text", "full_text_with_bert_tokens"]) +SenticGCNData = namedtuple( + "SenticGCNData", ["full_text", "aspect", "left_text", "full_text_tokens", "aspect_token_index"] +) +SenticGCNBertData = namedtuple( + "SenticGCNBertData", + ["full_text", "aspect", "left_text", "full_text_with_bert_tokens", "full_text_tokens", "aspect_token_index"], +) class SenticGCNBasePreprocessor: @@ -249,17 +255,31 @@ def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> processed_inputs = [] for batch in data_batch: full_text = batch["sentence"].lower().strip() + full_text_tokens = batch["sentence"].split() for aspect in batch["aspect"]: aspect = aspect.lower().strip() + aspect_token_index = [ + idx + for idx, val in enumerate(full_text_tokens) + if val.lower().translate(str.maketrans("", "", string.punctuation)) == aspect + ] aspect_idxs = [index for index in range(len(full_text)) if full_text.startswith(aspect, index)] for aspect_index in aspect_idxs: left_text = full_text[:aspect_index].strip() - processed_inputs.append(SenticGCNData(full_text=full_text, aspect=aspect, left_text=left_text)) + processed_inputs.append( + SenticGCNData( + full_text=full_text, + aspect=aspect, + left_text=left_text, + full_text_tokens=full_text_tokens, + aspect_token_index=aspect_token_index, + ) + ) return processed_inputs def __call__( self, data_batch: List[Dict[str, Union[str, List[str]]]] - ) -> Tuple[List[SenticGCNData, List[torch.Tensor]]]: + ) -> Tuple[List[SenticGCNData], List[torch.Tensor]]: """ Method to generate list of input tensors from a list of sentences and their accompanying list of aspect. @@ -268,7 +288,7 @@ def __call__( 'sentence' value are strings and 'aspect' value is a list of accompanying aspect. Returns: - Tuple[List[SenticGCNData, List[torch.Tensor]]]: return a list of ordered tensors for 'text_indices', + Tuple[List[SenticGCNData], List[torch.Tensor]]: return a list of ordered tensors for 'text_indices', 'aspect_indices', 'left_indices', 'text_embeddings' and 'sdat_graph'. 
""" processed_inputs = self._process_inputs(data_batch) @@ -416,8 +436,14 @@ def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> processed_inputs = [] for batch in data_batch: full_text = batch["sentence"].lower().strip() + full_text_tokens = batch["sentence"].split() for aspect in batch["aspect"]: aspect = aspect.lower().strip() + aspect_token_index = [ + idx + for idx, val in enumerate(full_text_tokens) + if val.lower().translate(str.maketrans("", "", string.punctuation)) == aspect + ] aspect_idxs = [index for index in range(len(full_text)) if full_text.startswith(aspect, index)] for aspect_index in aspect_idxs: left_text = full_text[:aspect_index].strip() @@ -428,13 +454,15 @@ def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> aspect=aspect, left_text=left_text, full_text_with_bert_tokens=full_text_with_bert_tokens, + full_text_tokens=full_text_tokens, + aspect_token_index=aspect_token_index, ) ) return processed_inputs def __call__( self, data_batch: List[Dict[str, Union[str, List[str]]]] - ) -> Tuple[List[SenticGCNData, List[torch.Tensor]]]: + ) -> Tuple[List[SenticGCNData], List[torch.Tensor]]: """ Method to generate list of input tensors from a list of sentences and their accompanying list of aspect. @@ -443,7 +471,7 @@ def __call__( 'sentence' value are strings and 'aspect' value is a list of accompanying aspect. Returns: - Tuple[List[SenticGCNData, List[torch.Tensor]]]: return a list of ordered tensors for 'text_indices', + Tuple[List[SenticGCNData], List[torch.Tensor]]: return a list of ordered tensors for 'text_indices', 'aspect_indices', 'left_indices', 'text_embeddings' and 'sdat_graph'. """ processed_inputs = self._process_inputs(data_batch) From 71af251f84afaa9f84ab4b96ed17c86e0accf1a1 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 10 Jan 2022 13:41:58 +0800 Subject: [PATCH 120/201] [#41] bug fix to return single aspect index per data entry --- sgnlp/models/sentic_gcn/preprocess.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index 7ab9393..3f28f9a 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -258,13 +258,13 @@ def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> full_text_tokens = batch["sentence"].split() for aspect in batch["aspect"]: aspect = aspect.lower().strip() - aspect_token_index = [ + aspect_token_indexes = [ idx for idx, val in enumerate(full_text_tokens) if val.lower().translate(str.maketrans("", "", string.punctuation)) == aspect ] aspect_idxs = [index for index in range(len(full_text)) if full_text.startswith(aspect, index)] - for aspect_index in aspect_idxs: + for aspect_index, aspect_token_index in zip(aspect_idxs, aspect_token_indexes): left_text = full_text[:aspect_index].strip() processed_inputs.append( SenticGCNData( @@ -439,13 +439,13 @@ def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> full_text_tokens = batch["sentence"].split() for aspect in batch["aspect"]: aspect = aspect.lower().strip() - aspect_token_index = [ + aspect_token_indexes = [ idx for idx, val in enumerate(full_text_tokens) if val.lower().translate(str.maketrans("", "", string.punctuation)) == aspect ] aspect_idxs = [index for index in range(len(full_text)) if full_text.startswith(aspect, index)] - for aspect_index in aspect_idxs: + for aspect_index, aspect_token_index in zip(aspect_idxs, 
aspect_token_indexes): left_text = full_text[:aspect_index].strip() full_text_with_bert_tokens = f"[CLS] {full_text} [SEP] {aspect} [SEP]" processed_inputs.append( From cd32b78a5a41747de796cbaf94acd02786c01b13 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 10 Jan 2022 14:19:46 +0800 Subject: [PATCH 121/201] [#41] add missing senticnet input args for non-bert preprocessor --- sgnlp/models/sentic_gcn/preprocess.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index 3f28f9a..db006ea 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -155,6 +155,7 @@ def __init__( config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", spacy_pipeline: str = "en_core_web_sm", + senticnet: str = "senticnet.pkl", device: str = "cpu", ) -> None: super().__init__( @@ -166,6 +167,7 @@ def __init__( config_filename=config_filename, model_filename=model_filename, spacy_pipeline=spacy_pipeline, + senticnet=senticnet, device=device, ) From 966b11b4cdada568fa82e535527652f05c19d661 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 10 Jan 2022 16:15:58 +0800 Subject: [PATCH 122/201] [#41] bug fix for non-bert sentic-gcn preprocessor --- sgnlp/models/sentic_gcn/preprocess.py | 29 ++++++++++++++++----------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index db006ea..df5134e 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -186,7 +186,7 @@ def _process_indices(self, data_batch: List[SenticGCNData]) -> List[torch.Tensor all_aspect_indices = [] all_left_indices = [] all_sdat_graph = [] - max_len = max([len(data["sentence"]) for data in data_batch]) + max_len = max([len(data.full_text) for data in data_batch]) for data in data_batch: text_indices = self.tokenizer( data.full_text, @@ -208,16 +208,21 @@ def _process_indices(self, data_batch: List[SenticGCNData]) -> List[torch.Tensor return_attention_mask=False, return_token_type_ids=False, ) - left_indices = self.tokenizer( - data.left_indices, - max_length=max_len, - padding="max_length", - truncation=True, - add_special_tokens=False, - return_tensors=None, - return_attention_mask=False, - return_token_type_ids=False, - ) + if data.left_text: + left_indices = self.tokenizer( + data.left_text, + max_length=max_len, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + else: + # Workaround for handling empty string. + # This happens when the aspect is also the first word in the full text. 
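+                # e.g. (hypothetical input) for the sentence "soup is tasty" with
+                # aspect "soup", left_text is "", which the tokenizer cannot handle,
+                # so an all-zero padding row of length max_len is substituted instead.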
+ left_indices = {"input_ids": [0] * max_len} graph = generate_dependency_adj_matrix(data.full_text, data.aspect, self.senticnet, self.spacy_pipeline) sdat_graph = np.pad( graph, @@ -238,7 +243,7 @@ def _process_indices(self, data_batch: List[SenticGCNData]) -> List[torch.Tensor torch.tensor(all_aspect_indices).to(self.device), torch.tensor(all_left_indices).to(self.device), text_embeddings, - torch.tensor(sdat_graph).to(self.device), + torch.tensor(all_sdat_graph).to(self.device), ] def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[SenticGCNData]: From 8251dd830bee9a914da49fe0a7f17a437cae2b61 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 10 Jan 2022 17:33:17 +0800 Subject: [PATCH 123/201] [#41] add postprocessor for SenticGCNBertModel --- sgnlp/models/sentic_gcn/postprocess.py | 55 ++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 sgnlp/models/sentic_gcn/postprocess.py diff --git a/sgnlp/models/sentic_gcn/postprocess.py b/sgnlp/models/sentic_gcn/postprocess.py new file mode 100644 index 0000000..d0fb1d2 --- /dev/null +++ b/sgnlp/models/sentic_gcn/postprocess.py @@ -0,0 +1,55 @@ +from typing import Dict, List, Union + +import torch.nn.functional as F + +from preprocess import SenticGCNBertData +from modeling import SenticGCNBertModelOutput + + +class SenticGCNBertPostprocessor: + """ + Class to initialise the Postprocessor for SenticGCNBertModel. + Class to postprocess SenticGCNBertModel output to get a list of input text tokens, + aspect token index and prediction labels. + + Args: + return_full_text (bool): Flag to indicate if the full text should be included in the output. + return_aspects_text (bool): Flag to indicate if the list of aspects text should be included in the output. + """ + + def __init__(self, return_full_text: bool = False, return_aspects_text: bool = False) -> None: + self.return_full_text = return_full_text + self.return_aspects_text = return_aspects_text + + def __call__( + self, processed_inputs: List[SenticGCNBertData], model_outputs: SenticGCNBertModelOutput + ) -> List[Dict[str, Union[List[str], List[int], float]]]: + # Get predictions + probabilities = F.softmax(model_outputs.logits, dim=-1).detach().numpy() + predictions = [probabilities.argmax(axis=-1)[idx] - 1 for idx in range(len(probabilities))] + # Process output + outputs = [] + for processed_input, prediction in zip(processed_inputs, predictions): + exists = False + # Check to see if the full_text_tokens already exists + # If found, append the aspect_token_index, prediction and optionally aspect texts. 
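+            # e.g. (illustrative) two aspects found in one sentence collapse into a single
+            # entry such as {"sentence": [...tokens...], "aspects": [0, 7], "labels": [1, -1]},
+            # where each label is -1 (negative), 0 (neutral) or 1 (positive).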
+ for idx, proc_output in enumerate(outputs): + if proc_output["sentence"] == processed_input.full_text_tokens: + exists = True + outputs[idx]["aspects"].append(processed_input.aspect_token_index) + outputs[idx]["labels"].append(prediction) + if self.return_aspects_text: + outputs[idx]["aspects_text"].append(processed_input.aspect) + break + if exists: + continue + processed_dict = {} + processed_dict["sentence"] = processed_input.full_text_tokens + processed_dict["aspects"] = [processed_input.aspect_token_index] + processed_dict["labels"] = [prediction] + if self.return_full_text: + processed_dict["full_text"] = processed_input.full_text + if self.return_aspects_text: + processed_dict["aspects_text"] = [processed_input.aspect] + outputs.append(processed_dict) + return outputs From c217167bc4afe85764792c753da2c99348aff21e Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 10 Jan 2022 20:26:25 +0800 Subject: [PATCH 124/201] [#41] add postprocessor for SenticGCNModel --- sgnlp/models/sentic_gcn/postprocess.py | 48 ++++++++++++++++++++------ 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/sgnlp/models/sentic_gcn/postprocess.py b/sgnlp/models/sentic_gcn/postprocess.py index d0fb1d2..e8ab708 100644 --- a/sgnlp/models/sentic_gcn/postprocess.py +++ b/sgnlp/models/sentic_gcn/postprocess.py @@ -2,19 +2,13 @@ import torch.nn.functional as F -from preprocess import SenticGCNBertData -from modeling import SenticGCNBertModelOutput +from preprocess import SenticGCNData, SenticGCNBertData +from modeling import SenticGCNModelOutput, SenticGCNBertModelOutput -class SenticGCNBertPostprocessor: +class SenticGCNBasePostprocessor: """ - Class to initialise the Postprocessor for SenticGCNBertModel. - Class to postprocess SenticGCNBertModel output to get a list of input text tokens, - aspect token index and prediction labels. - - Args: - return_full_text (bool): Flag to indicate if the full text should be included in the output. - return_aspects_text (bool): Flag to indicate if the list of aspects text should be included in the output. + Base postprocessor class providing common post processing functions. """ def __init__(self, return_full_text: bool = False, return_aspects_text: bool = False) -> None: @@ -22,7 +16,9 @@ def __init__(self, return_full_text: bool = False, return_aspects_text: bool = F self.return_aspects_text = return_aspects_text def __call__( - self, processed_inputs: List[SenticGCNBertData], model_outputs: SenticGCNBertModelOutput + self, + processed_inputs: List[Union[SenticGCNData, SenticGCNBertData]], + model_outputs: Union[SenticGCNModelOutput, SenticGCNBertModelOutput], ) -> List[Dict[str, Union[List[str], List[int], float]]]: # Get predictions probabilities = F.softmax(model_outputs.logits, dim=-1).detach().numpy() @@ -53,3 +49,33 @@ def __call__( processed_dict["aspects_text"] = [processed_input.aspect] outputs.append(processed_dict) return outputs + + +class SenticGCNPostprocessor(SenticGCNBasePostprocessor): + """ + Class to initialise the Postprocessor for SenticGCNModel. + Class to postprocess SenticGCNModel output to get a list of input text tokens, + aspect token index and prediction labels. + + Args: + return_full_text (bool): Flag to indicate if the full text should be included in the output. + return_aspects_text (bool): Flag to indicate if the list of aspects text should be included in the output. 
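+
+    Example:
+        # illustrative usage; `processed_inputs` and `model_outputs` come from the
+        # matching SenticGCN preprocessor and model respectively
+        postprocessor = SenticGCNPostprocessor(return_full_text=True)
+        post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=model_outputs)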
+ """ + + def __init__(self, return_full_text: bool = False, return_aspects_text: bool = False) -> None: + super().__init__(return_full_text=return_full_text, return_aspects_text=return_aspects_text) + + +class SenticGCNBertPostprocessor(SenticGCNBasePostprocessor): + """ + Class to initialise the Postprocessor for SenticGCNBertModel. + Class to postprocess SenticGCNBertModel output to get a list of input text tokens, + aspect token index and prediction labels. + + Args: + return_full_text (bool): Flag to indicate if the full text should be included in the output. + return_aspects_text (bool): Flag to indicate if the list of aspects text should be included in the output. + """ + + def __init__(self, return_full_text: bool = False, return_aspects_text: bool = False) -> None: + super().__init__(return_full_text=return_full_text, return_aspects_text=return_aspects_text) From 077f488d1903ac15bb22058bd7ff6532e4dad0ea Mon Sep 17 00:00:00 2001 From: Kng Wei Ming Date: Thu, 6 Jan 2022 12:52:26 +0800 Subject: [PATCH 125/201] [#43] add file structure --- demo_api/sentic_gcn/Dockerfile | 10 +++++ demo_api/sentic_gcn/api.py | 32 +++++++++++++ demo_api/sentic_gcn/download_pretrained.py | 12 +++++ .../sentic_gcn/model_card/sentic_gcn.json | 45 +++++++++++++++++++ demo_api/sentic_gcn/usage.py | 8 ++++ 5 files changed, 107 insertions(+) create mode 100644 demo_api/sentic_gcn/Dockerfile create mode 100644 demo_api/sentic_gcn/api.py create mode 100644 demo_api/sentic_gcn/download_pretrained.py create mode 100644 demo_api/sentic_gcn/model_card/sentic_gcn.json create mode 100644 demo_api/sentic_gcn/usage.py diff --git a/demo_api/sentic_gcn/Dockerfile b/demo_api/sentic_gcn/Dockerfile new file mode 100644 index 0000000..a7c9286 --- /dev/null +++ b/demo_api/sentic_gcn/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.8-buster + +COPY . 
/demo_api + +WORKDIR /demo_api/sentic_gcn + +RUN pip install -r requirements.txt +RUN python -m download_pretrained + +CMD PYTHONPATH=../../ gunicorn -c ../gunicorn.conf.py \ No newline at end of file diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py new file mode 100644 index 0000000..7fe880d --- /dev/null +++ b/demo_api/sentic_gcn/api.py @@ -0,0 +1,32 @@ +from flask import request +from flask import request +from transformers import cached_path + +from demo_api.common import create_api +from sgnlp.models.sentic_gcn import SenticGCNModel + +app = create_api(app_name=__name__, model_card_path="model_card/sentic_gcn.json") + +# Download files from azure blob storage +#rel2id_path = cached_path("https://storage.googleapis.com/sgnlp/models/lsr/rel2id.json") + + +# Load model + +config = + +model = + + +app.logger.info("Preprocessing pipeline and model initialization complete.") + +@app.route("/predict", methods=["POST"]) +def predict(): + req_body = request.get_json() + document = req_body["document"] + + # Perform preprocessing from the imported pipeline + + +if __name__ == "__main__": + app.run() \ No newline at end of file diff --git a/demo_api/sentic_gcn/download_pretrained.py b/demo_api/sentic_gcn/download_pretrained.py new file mode 100644 index 0000000..60e11b0 --- /dev/null +++ b/demo_api/sentic_gcn/download_pretrained.py @@ -0,0 +1,12 @@ +# from sgnlp.models.sentic_gcn import () + +config = .from_pretrained( + # google storage .json file +) + +model = .from_pretrained( + # google storage, .bin file + , config=config +) + +# Download tokenizer files \ No newline at end of file diff --git a/demo_api/sentic_gcn/model_card/sentic_gcn.json b/demo_api/sentic_gcn/model_card/sentic_gcn.json new file mode 100644 index 0000000..d3d27dd --- /dev/null +++ b/demo_api/sentic_gcn/model_card/sentic_gcn.json @@ -0,0 +1,45 @@ +{ + "name": "Sentic GCN", + "languages": "English", + "description": "This is a neural network that induces a latent document-level graph and uses a refinement strategy that allows the model to incrementally aggregate relevant information for multi-hop reasoning. This particular model corresponds to the GloVe+LSR model described in the paper.", + "paper": { + "text": "Nan, G., Guo, Z., Sekulić, I., & Lu, W. (2020). Reasoning with Latent Structure Refinement for Document-Level Relation Extraction. Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, July 2020 (pp. 1546-1557).", + "url": "https://aclanthology.org/2020.acl-main.141/" + }, + "trainingDataset": { + "text": "DocRED", + "url": "https://github.com/thunlp/DocRED/tree/master/data" + }, + "evaluationDataset": { + "text": "DocRED", + "url": "https://github.com/thunlp/DocRED/tree/master/data" + }, + "evaluationScores": "0.55 F1 on development set. 0.55 F1 reported by authors in paper on development set.", + "trainingConfig": { + "text": "Not available." 
+ }, + "trainingTime": "~17 hours for 100 epochs on a single V100 GPU.", + "modelWeights": { + "text": "https://storage.googleapis.com/sgnlp/models/lsr/pytorch_model.bin", + "url": "https://storage.googleapis.com/sgnlp/models/lsr/pytorch_model.bin" + }, + "modelConfig": { + "text": "https://storage.googleapis.com/sgnlp/models/lsr/config.json", + "url": "https://storage.googleapis.com/sgnlp/models/lsr/config.json" + }, + "modelInput": "Coreference clusters of entities, relations between clusters of entities, and text.", + "modelOutput": "Scores of all possible relation labels between all possible pairs of entity clusters.", + "modelSize": "~85MB", + "inferenceInfo": "Not available.", + "usageScenarios": "Knowledge graph building.", + "originalCode": { + "text": "https://github.com/nanguoshun/LSR", + "url": "https://github.com/nanguoshun/LSR" + }, + "license": { + "text": "MIT License", + "url": "https://choosealicense.com/licenses/mit" + }, + "contact": "sg-nlp@aisingapore.org", + "additionalInfo": "CAVEATS: The model trained in this paper alone is not sufficient to do extract relations from a document. It requires other models to perform entity recognition and coreference between the entities. For this demo, two other pretrained models from AllenNLP is used: Fine Grained Name Entity Recognition and Coreference SpanBERT." + } \ No newline at end of file diff --git a/demo_api/sentic_gcn/usage.py b/demo_api/sentic_gcn/usage.py new file mode 100644 index 0000000..18fde85 --- /dev/null +++ b/demo_api/sentic_gcn/usage.py @@ -0,0 +1,8 @@ +from sentic_gcn +from transformers import cached_path + +# Download files from azure blob storage + + +# Load model + From 548ab3701bf2b8dc8a0c875e85f7f3192ac6ac69 Mon Sep 17 00:00:00 2001 From: Kng Wei Ming Date: Thu, 6 Jan 2022 18:29:36 +0800 Subject: [PATCH 126/201] [#43] Add requirements.txt --- demo_api/sentic_gcn/api.py | 24 ++++++++-- .../sentic_gcn/model_card/sentic_gcn.json | 44 ++++++++--------- demo_api/sentic_gcn/requirements.txt | 6 +++ demo_api/sentic_gcn/usage.py | 47 +++++++++++++++++-- 4 files changed, 93 insertions(+), 28 deletions(-) create mode 100644 demo_api/sentic_gcn/requirements.txt diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index 7fe880d..135a40f 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -3,7 +3,21 @@ from transformers import cached_path from demo_api.common import create_api -from sgnlp.models.sentic_gcn import SenticGCNModel +from sgnlp.models.sentic_gcn import ( + SenticGCNModel, + SenticGCNModelOutput, + SenticGCNPreTrainedModel, + SenticGCNBertModelOutput, + SenticGCNBertPreTrainedModel, + SenticGCNBertModel, + SenticGCNEmbeddingPreTrainedModel, + SenticGCNEmbeddingModel, + SenticGCNBertEmbeddingModel, + SenticGCNConfig, + SenticGCNPreprocessor, + SenticGCNTokenizer, + SenticGCNBertTokenizer + ) app = create_api(app_name=__name__, model_card_path="model_card/sentic_gcn.json") @@ -13,10 +27,14 @@ # Load model -config = +config = SenticGCNConfig.from_pretrained("") # Input JSON file -model = +model = SenticGCNModel.from_pretrained( + "", # Input model + config=config +) +model.eval() app.logger.info("Preprocessing pipeline and model initialization complete.") diff --git a/demo_api/sentic_gcn/model_card/sentic_gcn.json b/demo_api/sentic_gcn/model_card/sentic_gcn.json index d3d27dd..b15f203 100644 --- a/demo_api/sentic_gcn/model_card/sentic_gcn.json +++ b/demo_api/sentic_gcn/model_card/sentic_gcn.json @@ -1,40 +1,40 @@ { "name": "Sentic GCN", "languages": "English", - 
"description": "This is a neural network that induces a latent document-level graph and uses a refinement strategy that allows the model to incrementally aggregate relevant information for multi-hop reasoning. This particular model corresponds to the GloVe+LSR model described in the paper.", + "description": "", "paper": { - "text": "Nan, G., Guo, Z., Sekulić, I., & Lu, W. (2020). Reasoning with Latent Structure Refinement for Document-Level Relation Extraction. Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, July 2020 (pp. 1546-1557).", - "url": "https://aclanthology.org/2020.acl-main.141/" + "text": "Bin Liang, Hang Su, Lin Gui, Erik Cambria, Ruifeng Xu. Knowledge-Based Systems, 2021: 107643.", + "url": "https://github.com/BinLiang-NLP/Sentic-GCN" }, "trainingDataset": { - "text": "DocRED", - "url": "https://github.com/thunlp/DocRED/tree/master/data" + "text": "", + "url": "" }, "evaluationDataset": { - "text": "DocRED", - "url": "https://github.com/thunlp/DocRED/tree/master/data" + "text": "", + "url": "" }, - "evaluationScores": "0.55 F1 on development set. 0.55 F1 reported by authors in paper on development set.", + "evaluationScores": "", "trainingConfig": { - "text": "Not available." + "text": "" }, - "trainingTime": "~17 hours for 100 epochs on a single V100 GPU.", + "trainingTime": "", "modelWeights": { - "text": "https://storage.googleapis.com/sgnlp/models/lsr/pytorch_model.bin", - "url": "https://storage.googleapis.com/sgnlp/models/lsr/pytorch_model.bin" + "text": "", + "url": "" }, "modelConfig": { - "text": "https://storage.googleapis.com/sgnlp/models/lsr/config.json", - "url": "https://storage.googleapis.com/sgnlp/models/lsr/config.json" - }, - "modelInput": "Coreference clusters of entities, relations between clusters of entities, and text.", - "modelOutput": "Scores of all possible relation labels between all possible pairs of entity clusters.", - "modelSize": "~85MB", - "inferenceInfo": "Not available.", - "usageScenarios": "Knowledge graph building.", + "text": "", + "url": "" + }, + "modelInput": "", + "modelOutput": "", + "modelSize": "", + "inferenceInfo": "", + "usageScenarios": "", "originalCode": { - "text": "https://github.com/nanguoshun/LSR", - "url": "https://github.com/nanguoshun/LSR" + "text": "https://github.com/BinLiang-NLP/Sentic-GCN", + "url": "https://github.com/BinLiang-NLP/Sentic-GCN" }, "license": { "text": "MIT License", diff --git a/demo_api/sentic_gcn/requirements.txt b/demo_api/sentic_gcn/requirements.txt new file mode 100644 index 0000000..074e572 --- /dev/null +++ b/demo_api/sentic_gcn/requirements.txt @@ -0,0 +1,6 @@ +python==3.6 +torch==1.0.0 +spacy==2.0.18 +numpy==1.15.4 +flask +gunicorn \ No newline at end of file diff --git a/demo_api/sentic_gcn/usage.py b/demo_api/sentic_gcn/usage.py index 18fde85..f00902d 100644 --- a/demo_api/sentic_gcn/usage.py +++ b/demo_api/sentic_gcn/usage.py @@ -1,8 +1,49 @@ -from sentic_gcn -from transformers import cached_path +from sgnlp.models.sentic_gcn import ( + SenticGCNModel, + SenticGCNModelOutput, + SenticGCNPreTrainedModel, + SenticGCNBertModelOutput, + SenticGCNBertPreTrainedModel, + SenticGCNBertModel, + SenticGCNEmbeddingPreTrainedModel, + SenticGCNEmbeddingModel, + SenticGCNBertEmbeddingModel, + SenticGCNConfig, + SenticGCNPreprocessor, + SenticGCNTokenizer, + SenticGCNBertTokenizer + ) -# Download files from azure blob storage +""" +Overall steps: +1. tokenize the data +2. 
Get embedding matrix + -> self.embedding_matrix = build_embedding_matrix(tokenizer.word2idx, embed_dim, dataset) +3. Set embedding martrix in the loaded model class +4. Run the model (train / test) +""" # Load model +config = SenticGCNConfig.from_pretrained("") # Input JSON file +embedded_matrix = "" + +tokenizer = SenticGCNTokenizer.from_pretrained("") + +model = SenticGCNModel.from_pretrained( + "", # Input model + config=config +) + +preprocessor = SenticGCNPreprocessor(tokenizer) + +# Model predict + +# Inputs +input_batch = {} # Dictionary + + +tensor_dict = preprocessor(input_batch) +raw_output = model(**tensor_dict) +acc , f1 = "" # Return the output, refer to the model class From 78fbaefcb9b42ed604dceab82838656c74d67118 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Mon, 10 Jan 2022 21:29:38 +0800 Subject: [PATCH 127/201] [#43] Working version of usage.py --- demo_api/sentic_gcn/usage.py | 80 +++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 23 deletions(-) diff --git a/demo_api/sentic_gcn/usage.py b/demo_api/sentic_gcn/usage.py index f00902d..d7d361b 100644 --- a/demo_api/sentic_gcn/usage.py +++ b/demo_api/sentic_gcn/usage.py @@ -1,19 +1,16 @@ +from torch._C import device +from transformers import cached_path +import torch.nn.functional as F + + from sgnlp.models.sentic_gcn import ( - SenticGCNModel, - SenticGCNModelOutput, - SenticGCNPreTrainedModel, - SenticGCNBertModelOutput, - SenticGCNBertPreTrainedModel, - SenticGCNBertModel, - SenticGCNEmbeddingPreTrainedModel, - SenticGCNEmbeddingModel, - SenticGCNBertEmbeddingModel, - SenticGCNConfig, - SenticGCNPreprocessor, - SenticGCNTokenizer, - SenticGCNBertTokenizer + SenticGCNBertModel, + SenticGCNBertPreprocessor, + SenticGCNBertConfig ) +from sgnlp.models.sentic_gcn.postprocess import SenticGCNBertPostprocessor + """ Overall steps: 1. tokenize the data @@ -25,25 +22,62 @@ # Load model -config = SenticGCNConfig.from_pretrained("") # Input JSON file +# path = '/Users/weiming/Dev/sg-nlp/sgnlp/sgnlp/models/sentic_gcn/senticnet5.pickle' +path = '../../sgnlp/models/sentic_gcn/senticnet5.pickle' +preprocessor = SenticGCNBertPreprocessor(senticnet=path, device='cpu') + +config = SenticGCNBertConfig.from_pretrained('https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json') + embedded_matrix = "" -tokenizer = SenticGCNTokenizer.from_pretrained("") +# tokenizer = SenticGCNTokenizer.from_pretrained("senticgcn") +# Other tokenizers +# BertTokenizer: 'bert-base-uncased' -model = SenticGCNModel.from_pretrained( - "", # Input model +model = SenticGCNBertModel.from_pretrained( + 'https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin', config=config ) -preprocessor = SenticGCNPreprocessor(tokenizer) - # Model predict # Inputs -input_batch = {} # Dictionary +inputs = [ + { + "aspect": ["Soup"], + "sentence": "Soup is tasty but soup is a little salty. Salty soup." + }, # 1, -1 + { + "aspect": ["service"], + "sentence": "Everyone that sat in the back outside agreed that it was the worst service we had ever received." + }, # -1 + { + "aspect": ["location", "food"], + "sentence": "it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more ." 
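+        # (note: the inline "# 1, -1" style comments in this list mark the expected
+        #  polarity for each aspect: -1 negative, 0 neutral, 1 positive)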
+ } # 0, 1 +] + +processed_inputs, processed_indices = preprocessor(inputs) +processed_inputs + +outputs = model(processed_indices) +t_probs = F.softmax(outputs.logits) +t_probs = t_probs.detach().numpy() + +infer_label = [t_probs.argmax(axis=-1)[idx] -1 for idx in range(len(t_probs))] + +# print(processed_inputs[0]) +# print(infer_label[0]) + + +# tensor_dict = preprocessor(input_batch) +# print(tensor_dict) +# output = model(**tensor_dict) +# sentiment = "" -tensor_dict = preprocessor(input_batch) -raw_output = model(**tensor_dict) -acc , f1 = "" # Return the output, refer to the model class +# Postprocessing +postprocessor = SenticGCNBertPostprocessor() +post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) +print(post_outputs) \ No newline at end of file From 7c0c25c56d4eb18052aa7a3676a5512a67523de0 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Mon, 10 Jan 2022 21:31:57 +0800 Subject: [PATCH 128/201] [#43] Update download_pretrained.py with filepaths --- demo_api/sentic_gcn/download_pretrained.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/demo_api/sentic_gcn/download_pretrained.py b/demo_api/sentic_gcn/download_pretrained.py index 60e11b0..a86a789 100644 --- a/demo_api/sentic_gcn/download_pretrained.py +++ b/demo_api/sentic_gcn/download_pretrained.py @@ -1,12 +1,20 @@ -# from sgnlp.models.sentic_gcn import () +"""Run this script during build time to download the pretrained models and relevant files first""" -config = .from_pretrained( - # google storage .json file +from sgnlp.models.sentic_gcn import ( + SenticGCNConfig, + SenticGCNBertTokenizer, + SenticGCNBertModel, + SenticGCNBertPreprocessor ) -model = .from_pretrained( - # google storage, .bin file - , config=config +config = SenticGCNConfig.from_pretrained( + "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json" ) -# Download tokenizer files \ No newline at end of file +tokenizer = SenticGCNBertTokenizer.from_pretrained("bert-base-uncased") + + +model = SenticGCNBertModel.from_pretrained( + "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", + config=config +) \ No newline at end of file From 5a9ee65b13e330f32e8cff1ba21b84a93555d9f1 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 11 Jan 2022 06:57:55 +0800 Subject: [PATCH 129/201] [#41] reference device from existing tensor for newly created tensor within model --- sgnlp/models/sentic_gcn/modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py index 9ab599b..491dd83 100644 --- a/sgnlp/models/sentic_gcn/modeling.py +++ b/sgnlp/models/sentic_gcn/modeling.py @@ -216,7 +216,7 @@ def position_weight( weight[i].append(1 - (j - aspect_double_idx[i, 1]) / context_len) for j in range(text_len[i], seq_len): weight[i].append(0) - weight = torch.tensor(weight).unsqueeze(2) + weight = torch.tensor(weight).unsqueeze(2).to(x.device) return weight * x def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor: @@ -230,7 +230,7 @@ def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor mask[i].append(1) for j in range(min(aspect_double_idx[i, 1] + 1, self.max_seq_len), seq_len): mask[i].append(0) - mask = torch.tensor(mask).unsqueeze(2).float() + mask = torch.tensor(mask).unsqueeze(2).float().to(x.device) return mask * x def forward(self, inputs: List[torch.Tensor], labels: Optional[torch.Tensor] = 
None) -> SenticGCNBertModelOutput: From 5f44f6bd7acd86fd451b5aa9ca2c8931593ea6e0 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 11 Jan 2022 08:49:22 +0800 Subject: [PATCH 130/201] [#41] add postprocess and data_class imports --- sgnlp/models/sentic_gcn/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sgnlp/models/sentic_gcn/__init__.py b/sgnlp/models/sentic_gcn/__init__.py index 25c08a8..e42dad4 100644 --- a/sgnlp/models/sentic_gcn/__init__.py +++ b/sgnlp/models/sentic_gcn/__init__.py @@ -1,6 +1,8 @@ from config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig +from data_class import SenticGCNTrainArgs from modeling import SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel from preprocess import SenticGCNPreprocessor, SenticGCNBertPreprocessor +from postprocess import SenticGCNPostprocessor, SenticGCNBertPostprocessor from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer from train import SenticGCNTrainer, SenticGCNBertTrainer from utils import BucketIterator, parse_args_and_load_config, download_tokenizer_files From 8009c1793cf1e2670920af2cc6d1eeeaccd33f93 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 11 Jan 2022 09:17:21 +0800 Subject: [PATCH 131/201] [#41] cast list of np.array to np.array prior to cast to torch tensor --- sgnlp/models/sentic_gcn/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index d8756c6..0655607 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -561,7 +561,7 @@ def _pad_data(self, batch_data: Dict[str, List]) -> Dict[str, List[torch.Tensor] "aspect_indices": torch.tensor(batch_aspect_indices), "left_indices": torch.tensor(batch_left_indices), "polarity": torch.tensor(batch_polarity), - "sdat_graph": torch.tensor(batch_sdat_graph), + "sdat_graph": torch.tensor(np.array(batch_sdat_graph)), } def __iter__(self): From c29840041c1416e02f9154af3f52132bda426189 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 11 Jan 2022 09:27:36 +0800 Subject: [PATCH 132/201] [#41] add missing to device for SenticGCNModel --- sgnlp/models/sentic_gcn/modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py index 491dd83..c9a6751 100644 --- a/sgnlp/models/sentic_gcn/modeling.py +++ b/sgnlp/models/sentic_gcn/modeling.py @@ -99,7 +99,7 @@ def position_weight( weight[i].append(1 - (j - aspect_double_idx[i, 1] / context_len)) for j in range(text_len[i], seq_len): weight[i].append(0) - weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2) + weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2).to(x.device) return weight * x def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor: @@ -113,7 +113,7 @@ def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor mask[i].append(1) for j in range(aspect_double_idx[i, 1] + 1, seq_len): mask[i].append(0) - mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2) + mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(x.device) return mask * x def forward(self, inputs: List[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNModelOutput: From a7c8c1b21d5eae08ff292fb6ae8e1325ff1c5898 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 11 Jan 2022 10:39:40 +0800 Subject: [PATCH 133/201] [#41] add initial test 
cases --- tests/sentic_gcn/test_data/senticnet.txt | 15 ++++++ tests/sentic_gcn/test_data/test_test.raw | 15 ++++++ tests/sentic_gcn/test_data/test_train.raw | 15 ++++++ tests/sentic_gcn/test_sentic_gcn_utils.py | 63 +++++++++++++++++++++++ 4 files changed, 108 insertions(+) create mode 100644 tests/sentic_gcn/test_data/senticnet.txt create mode 100644 tests/sentic_gcn/test_data/test_test.raw create mode 100644 tests/sentic_gcn/test_data/test_train.raw create mode 100644 tests/sentic_gcn/test_sentic_gcn_utils.py diff --git a/tests/sentic_gcn/test_data/senticnet.txt b/tests/sentic_gcn/test_data/senticnet.txt new file mode 100644 index 0000000..bde4092 --- /dev/null +++ b/tests/sentic_gcn/test_data/senticnet.txt @@ -0,0 +1,15 @@ +CONCEPT POLARITY INTENSITY +abandon negative -0.84 +abandoned negative -0.85 +abandoned_person negative -0.79 +abandoned_quarry negative -0.78 +abandonment negative -0.82 +abase negative -0.90 +abasement negative -0.90 +abash negative -0.77 +abashed negative -0.92 +abashment negative -0.76 +abasia negative -0.67 +abate negative -0.86 +abatement negative -0.85 +abattoir negative -0.77 diff --git a/tests/sentic_gcn/test_data/test_test.raw b/tests/sentic_gcn/test_data/test_test.raw new file mode 100644 index 0000000..0bce32e --- /dev/null +++ b/tests/sentic_gcn/test_data/test_test.raw @@ -0,0 +1,15 @@ +$T$ is super fast , around anywhere from 35 seconds to 1 minute . +Boot time +1 +$T$ would not fix the problem unless I bought your plan for $ 150 plus . +tech support +-1 +$T$ was easy . +Set up +1 +Did not enjoy the new $T$ and touchscreen functions . +Windows 8 +-1 +Did not enjoy the new Windows 8 and $T$ . +touchscreen functions +-1 diff --git a/tests/sentic_gcn/test_data/test_train.raw b/tests/sentic_gcn/test_data/test_train.raw new file mode 100644 index 0000000..b81f040 --- /dev/null +++ b/tests/sentic_gcn/test_data/test_train.raw @@ -0,0 +1,15 @@ +I charge it at night and skip taking the $T$ with me because of the good battery life . +cord +0 +I charge it at night and skip taking the cord with me because of the good $T$ . +battery life +1 +The tech guy then said the $T$ does not do 1-to-1 exchange and I have to direct my concern to the `` sales '' team , which is the retail shop which I bought my netbook from . +service center +-1 +The tech guy then said the service center does not do 1-to-1 exchange and I have to direct my concern to the $T$ , which is the retail shop which I bought my netbook from . +`` sales '' team +-1 +The $T$ then said the service center does not do 1-to-1 exchange and I have to direct my concern to the `` sales '' team , which is the retail shop which I bought my netbook from . 
+tech guy +0 diff --git a/tests/sentic_gcn/test_sentic_gcn_utils.py b/tests/sentic_gcn/test_sentic_gcn_utils.py new file mode 100644 index 0000000..224d6dd --- /dev/null +++ b/tests/sentic_gcn/test_sentic_gcn_utils.py @@ -0,0 +1,63 @@ +import pathlib +import pytest +import shutil +import tempfile +import unittest + +import numpy as np + +from sgnlp.models.sentic_gcn.utils import pad_and_truncate, load_and_process_senticnet + + +PARENT_DIR = str(pathlib.Path(__file__).parent) + + +class TestPadandTruncate(unittest.TestCase): + def setUp(self) -> None: + self.test_input = [1.0, 2.0, 3.0, 4.0, 5.0] + self.max_len = 50 + + def test_pad_and_truncate(self): + output = pad_and_truncate(self.test_input, max_len=self.max_len) + self.assertEqual(type(output), np.ndarray) + self.assertEqual(len(output), self.max_len) + + +class TestLoadandProcessSenticNet(unittest.TestCase): + def setUp(self) -> None: + self.test_file = pathlib.Path(PARENT_DIR).joinpath("test_data").joinpath("senticnet.txt") + with tempfile.TemporaryDirectory() as tmp_dir: + self.temp_dir = tmp_dir + self.test_save_file_path = pathlib.Path(self.temp_dir).joinpath("senticnet.pkl") + + def tearDown(self) -> None: + shutil.rmtree(self.test_save_file_path, ignore_errors=True) + + def test_load_and_process_senticnet_from_file(self): + senticnet = load_and_process_senticnet(senticnet_file_path=self.test_file) + self.assertEqual(type(senticnet), dict) + self.assertTrue("CONCEPT" not in senticnet.keys()) + self.assertEqual(len(senticnet), 12) + self.assertTrue("abandoned_person" not in senticnet.keys()) + self.assertTrue("abandoned_quarry" not in senticnet.keys()) + self.assertEqual(senticnet["abase"], "-0.90") + + def test_load_and_process_senticnet_save_file(self): + _ = load_and_process_senticnet( + senticnet_file_path=self.test_file, + save_preprocessed_senticnet=True, + saved_preprocessed_senticnet_file_path=self.test_save_file_path, + ) + self.assertTrue(self.test_save_file_path.exists()) + + def test_load_and_process_senticnet_from_pickle_file(self): + _ = load_and_process_senticnet( + senticnet_file_path=self.test_file, + save_preprocessed_senticnet=True, + saved_preprocessed_senticnet_file_path=self.test_save_file_path, + ) + senticnet = load_and_process_senticnet( + save_preprocessed_senticnet=False, saved_preprocessed_senticnet_file_path=str(self.test_save_file_path) + ) + self.assertEqual(type(senticnet), dict) + self.assertEqual(len(senticnet), 12) From 3748e783b9c0ee5481650bdc581053dca6de6f10 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 11 Jan 2022 11:30:52 +0800 Subject: [PATCH 134/201] [#41] add missing _ to default initalizer string --- sgnlp/models/sentic_gcn/data_class.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index a417676..d836711 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -137,7 +137,7 @@ class SenticGCNTrainArgs: save_results: bool = field(default=True, metadata={"help": "Flag to indicate if results should be saved."}) save_results_folder: str = field(default="results", metadata={"help": "Folder location to save results pickle."}) - initializer: str = field(default="xavier_uniform", metadata={"help": "Type of initalizer to use."}) + initializer: str = field(default="xavier_uniform_", metadata={"help": "Type of initalizer to use."}) optimizer: str = field(default="adam", metadata={"help": "Type of optimizer to use."}) loss_function: str = 
field(default="cross_entropy", metadata={"help": "Loss function for training/eval."}) learning_rate: float = field(default=0.001, metadata={"help": "Default learning rate for training."}) From b0f45265559ec0ebe3cef30ccf3783d1992f6483 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 11 Jan 2022 13:11:26 +0800 Subject: [PATCH 135/201] [#41] add test cases for dataset generator --- tests/sentic_gcn/test_sentic_gcn_utils.py | 96 ++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/tests/sentic_gcn/test_sentic_gcn_utils.py b/tests/sentic_gcn/test_sentic_gcn_utils.py index 224d6dd..3bc56ea 100644 --- a/tests/sentic_gcn/test_sentic_gcn_utils.py +++ b/tests/sentic_gcn/test_sentic_gcn_utils.py @@ -3,10 +3,19 @@ import shutil import tempfile import unittest +import unittest.mock as mock import numpy as np +import spacy -from sgnlp.models.sentic_gcn.utils import pad_and_truncate, load_and_process_senticnet +from sgnlp.models.sentic_gcn.data_class import SenticGCNTrainArgs +from sgnlp.models.sentic_gcn.utils import ( + SenticGCNDataset, + SenticGCNDatasetGenerator, + pad_and_truncate, + load_and_process_senticnet, + generate_dependency_adj_matrix, +) PARENT_DIR = str(pathlib.Path(__file__).parent) @@ -61,3 +70,88 @@ def test_load_and_process_senticnet_from_pickle_file(self): ) self.assertEqual(type(senticnet), dict) self.assertEqual(len(senticnet), 12) + + +class TestGenerateDependencyAdjMatrix(unittest.TestCase): + def setUp(self) -> None: + self.test_file = pathlib.Path(PARENT_DIR).joinpath("test_data").joinpath("senticnet.txt") + self.senticnet = load_and_process_senticnet(self.test_file) + self.spacy_pipeline = spacy.load("en_core_web_sm") + self.test_text = "Soup is tasty but soup is a little salty." + self.test_aspect = "soup" + + def test_generate_dependency_adj_matrix(self): + matrix = generate_dependency_adj_matrix(self.test_text, self.test_aspect, self.senticnet, self.spacy_pipeline) + self.assertTrue(type(matrix), np.ndarray) + self.assertEqual(matrix.shape, (9, 9)) + + +class TestSenticGCNDatasetGenerator(unittest.TestCase): + def setUp(self) -> None: + cfg = { + "senticnet_word_file_path": PARENT_DIR + "/test_data/senticnet.txt", + "spacy_pipeline": "en_core_web_sm", + "dataset_train": [PARENT_DIR + "/test_data/test_train.raw"], + "dataset_test": [PARENT_DIR + "/test_data/test_test.raw"], + "valset_ratio": 0, + "model": "senticgcn", + } + self.cfg = SenticGCNTrainArgs(**cfg) + + def test_read_raw_dataset(self): + with mock.patch("sgnlp.models.sentic_gcn.tokenization.SenticGCNTokenizer") as MockClass: + fake_tokenizer = MockClass() + dataset_gen = SenticGCNDatasetGenerator(self.cfg, fake_tokenizer) + data = dataset_gen._read_raw_dataset("train") + self.assertEqual(len(data), 15) + + def test_generate_senticgcn_dataset(self): + with mock.patch("sgnlp.models.sentic_gcn.tokenization.SenticGCNTokenizer") as MockClass: + fake_tokenizer = MockClass(return_value={"input_ids": [1.0, 2.0, 3.0, 4.0, 5.0]}) + dataset_gen = SenticGCNDatasetGenerator(self.cfg, fake_tokenizer) + dataset = dataset_gen._read_raw_dataset(self.cfg.dataset_train) + data = dataset_gen._generate_senticgcn_dataset(dataset) + self.assertEqual(len(data), 5) + for data_row in data: + keys = data_row.keys() + self.assertTrue("text_indices" in keys) + self.assertTrue("aspect_indices" in keys) + self.assertTrue("left_indices" in keys) + self.assertTrue("polarity" in keys) + self.assertTrue("sdat_graph" in keys) + + def test_generate_senticgcn_bert_dataset(self): + with 
mock.patch("sgnlp.models.sentic_gcn.tokenization.SenticGCNBertTokenizer") as MockClass: + fake_tokenizer = MockClass(return_value={"input_ids": [1.0, 2.0, 3.0, 4.0, 5.0]}) + dataset_gen = SenticGCNDatasetGenerator(self.cfg, fake_tokenizer) + dataset = dataset_gen._read_raw_dataset(self.cfg.dataset_train) + data = dataset_gen._generate_senticgcnbert_dataset(dataset) + self.assertEqual(len(data), 5) + for data_row in data: + keys = data_row.keys() + self.assertTrue("text_indices" in keys) + self.assertTrue("aspect_indices" in keys) + self.assertTrue("left_indices" in keys) + self.assertTrue("text_bert_indices" in keys) + self.assertTrue("bert_segment_indices" in keys) + self.assertTrue("polarity" in keys) + self.assertTrue("sdat_graph" in keys) + + def test_generate_dataset(self): + for model_type in ["senticgcn", "senticgcnbert"]: + self.cfg.model = model_type + class_path = ( + "sgnlp.models.sentic_gcn.tokenization.SenticGCNTokenizer" + if model_type == "senticgcn" + else "sgnlp.models.sentic_gcn.tokenization.SenticGCNBertTokenizer" + ) + with mock.patch(class_path) as MockClass: + fake_tokenizer = MockClass(return_value={"input_ids": [1.0, 2.0, 3.0, 4.0, 5.0]}) + dataset_gen = SenticGCNDatasetGenerator(self.cfg, fake_tokenizer) + train_data, val_data, test_data = dataset_gen.generate_datasets() + self.assertEqual(type(train_data), SenticGCNDataset) + self.assertEqual(type(val_data), SenticGCNDataset) + self.assertEqual(type(test_data), SenticGCNDataset) + self.assertEqual(len(train_data), 5) + self.assertEqual(len(val_data), 5) + self.assertEqual(len(test_data), 5) From 24c025b1c45e56c4602dd774101f38c504f6bf23 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 11 Jan 2022 13:48:39 +0800 Subject: [PATCH 136/201] [#41] add cloud storage option for senticnet files, skip first line when loading senticnet --- sgnlp/models/sentic_gcn/preprocess.py | 14 ++++++++++++-- sgnlp/models/sentic_gcn/utils.py | 6 +++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index df5134e..0b956b2 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -18,6 +18,7 @@ from utils import ( load_and_process_senticnet, download_tokenizer_files, + download_url_file, pad_and_truncate, generate_dependency_adj_matrix, ) @@ -51,7 +52,7 @@ def __init__( config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", spacy_pipeline: str = "en_core_web_sm", - senticnet: str = "senticnet.pickle", + senticnet: str = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", device: str = "cpu", ) -> None: # Set device @@ -61,7 +62,14 @@ def __init__( self.spacy_pipeline = spacy.load(spacy_pipeline) # Load senticnet - if senticnet.endswith(".pkl") or senticnet.endswith(".pickle"): + if senticnet.startswith("https://") or senticnet.startswith("http://"): + with tempfile.TemporaryDirectory() as tmpdir: + temp_dir = pathlib.Path(tmpdir) + download_url_file(senticnet, temp_dir) + saved_path = temp_dir.joinpath("senticnet.pickle") + self.senticnet = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=saved_path) + shutil.rmtree(temp_dir, ignore_errors=True) + elif senticnet.endswith(".pkl") or senticnet.endswith(".pickle"): self.senticnet = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=senticnet) elif senticnet.endswith(".txt"): self.senticnet = load_and_process_senticnet(senticnet_file_path=senticnet) @@ -69,6 +77,8 @@ def 
__init__( raise ValueError( f""" Invalid SenticNet file! + For downloading from cloud storage, please provide url to pickle file location + (i.e. string url starting with https:// or http://). For processed SenticNet dictionary, please provide pickle file location (i.e. file with .pkl or .pickle extension). For raw SenticNet-5.0 file, please provide text file path (i.e. file with .txt extension) diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 0655607..abad904 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -100,7 +100,7 @@ def pad_and_truncate( padding: str = "post", truncating: str = "post", value: int = 0, -): +) -> np.ndarray: """ Helper method for padding and truncating text and aspect segment. @@ -113,7 +113,7 @@ def pad_and_truncate( value (int, optional): value used for padding. Defaults to 0. Returns: - [type]: [description] + np.ndarray: return a ndarray padded to the max_len """ seq_arr = (np.ones(max_len) * value).astype(dtype) trunc = sequence[-max_len:] if truncating == "pre" else sequence[:max_len] @@ -217,7 +217,7 @@ def load_and_process_senticnet( if not line: continue items = line.split("\t") - if "_" in items[0]: + if "_" in items[0] or "CONCEPT" == items[0]: continue # skip words with '_' sentic_dict[items[0]] = items[-1] if save_preprocessed_senticnet: From 836beb1c6ef001a92cffc21a1503d55aad97ee25 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 11 Jan 2022 14:30:15 +0800 Subject: [PATCH 137/201] [#41] add config unit tests --- tests/sentic_gcn/test_sentic_gcn_model.py | 59 +++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/sentic_gcn/test_sentic_gcn_model.py diff --git a/tests/sentic_gcn/test_sentic_gcn_model.py b/tests/sentic_gcn/test_sentic_gcn_model.py new file mode 100644 index 0000000..22278c5 --- /dev/null +++ b/tests/sentic_gcn/test_sentic_gcn_model.py @@ -0,0 +1,59 @@ +import pytest +import unittest + +from transformers import PretrainedConfig, BertConfig + +from sgnlp.models.sentic_gcn import SenticGCNConfig, SenticGCNBertConfig +from sgnlp.models.sentic_gcn.config import SenticGCNBertEmbeddingConfig, SenticGCNEmbeddingConfig + + +class TestSenticGCNConfigTestCase(unittest.TestCase): + def setUp(self) -> None: + self.config = SenticGCNConfig() + + def test_pretrained_config_base_class(self): + self.assertTrue(issubclass(self.config.__class__, PretrainedConfig)) + + def test_default_params(self): + self.assertEqual(self.config.embed_dim, 300) + self.assertEqual(self.config.hidden_dim, 300) + self.assertEqual(self.config.dropout, 0.3) + self.assertEqual(self.config.polarities_dim, 3) + self.assertEqual(self.config.loss_function, "cross_entropy") + + +class TestSenticGCNBertConfigTestCase(unittest.TestCase): + def setUp(self) -> None: + self.config = SenticGCNBertConfig() + + def test_pretrained_config_base_class(self): + self.assertTrue(issubclass(self.config.__class__, PretrainedConfig)) + + def test_default_params(self): + self.assertEqual(self.config.embed_dim, 300) + self.assertEqual(self.config.hidden_dim, 768) + self.assertEqual(self.config.max_seq_len, 85) + self.assertEqual(self.config.polarities_dim, 3) + self.assertEqual(self.config.dropout, 0.3) + self.assertEqual(self.config.loss_function, "cross_entropy") + + +class TestSenticGCNEmbeddingConfigTestCase(unittest.TestCase): + def setUp(self) -> None: + self.config = SenticGCNEmbeddingConfig() + + def test_pretrained_config_base_class(self): + 
self.assertTrue(issubclass(self.config.__class__, PretrainedConfig)) + + def test_default_params(self): + self.assertEqual(self.config.vocab_size, 17662) + self.assertEqual(self.config.embed_dim, 300) + + +class TestSenticGCNBertEmbeddingConfigTestCase(unittest.TestCase): + def setUp(self) -> None: + self.config = SenticGCNBertEmbeddingConfig() + + def test_pretrained_config_base_class(self): + self.assertTrue(issubclass(self.config.__class__, PretrainedConfig)) + self.assertTrue(issubclass(self.config.__class__, BertConfig)) From 202fdc532d491e70e3a048081057e55441114562 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Tue, 11 Jan 2022 16:37:40 +0800 Subject: [PATCH 138/201] [#41] add unit tests for SenticGCN models --- tests/sentic_gcn/test_sentic_gcn_model.py | 126 +++++++++++++++++++++- tests/sentic_gcn/test_sentic_gcn_utils.py | 9 +- 2 files changed, 126 insertions(+), 9 deletions(-) diff --git a/tests/sentic_gcn/test_sentic_gcn_model.py b/tests/sentic_gcn/test_sentic_gcn_model.py index 22278c5..cf467fd 100644 --- a/tests/sentic_gcn/test_sentic_gcn_model.py +++ b/tests/sentic_gcn/test_sentic_gcn_model.py @@ -1,10 +1,25 @@ -import pytest import unittest -from transformers import PretrainedConfig, BertConfig +import torch +from transformers import PretrainedConfig, PreTrainedModel, BertConfig, BertModel -from sgnlp.models.sentic_gcn import SenticGCNConfig, SenticGCNBertConfig -from sgnlp.models.sentic_gcn.config import SenticGCNBertEmbeddingConfig, SenticGCNEmbeddingConfig +from sgnlp.models.sentic_gcn.config import ( + SenticGCNConfig, + SenticGCNBertConfig, + SenticGCNEmbeddingConfig, + SenticGCNBertEmbeddingConfig, +) +from sgnlp.models.sentic_gcn.modeling import ( + SenticGCNModel, + SenticGCNModelOutput, + SenticGCNBertModel, + SenticGCNBertModelOutput, + SenticGCNEmbeddingModel, + SenticGCNBertEmbeddingModel, +) + + +DEVICE = torch.device("cpu") class TestSenticGCNConfigTestCase(unittest.TestCase): @@ -57,3 +72,106 @@ def setUp(self) -> None: def test_pretrained_config_base_class(self): self.assertTrue(issubclass(self.config.__class__, PretrainedConfig)) self.assertTrue(issubclass(self.config.__class__, BertConfig)) + + +# TODO: Investigate shape mismatch +# class TestSenticGCNModel(unittest.TestCase): +# def setUp(self) -> None: +# config = SenticGCNConfig() +# self.model = SenticGCNModel(config=config) + +# def test_pretrained_model_base_class(self): +# self.assertTrue(issubclass(self.model.__class__, PreTrainedModel)) + +# def test_config_class(self): +# self.assertEqual(self.model.config_class, SenticGCNConfig) + +# def test_base_model_prefix(self): +# self.assertEqual(self.model.base_model_prefix, "senticgcn") + +# def test_forward_pass(self): +# input_tensors = [ +# torch.ones( +# [1, 100], +# dtype=torch.float32, +# device=DEVICE, +# ), +# torch.ones([1, 100], dtype=torch.float32, device=DEVICE), +# torch.ones([1, 100], dtype=torch.float32, device=DEVICE), +# torch.ones([1, 100, 300], dtype=torch.float32, device=DEVICE), +# torch.ones([1, 100, 100], dtype=torch.float32, device=DEVICE), +# ] + +# self.model.to(DEVICE) +# self.model.eval() +# result = self.model(input_tensors) + +# self.assertEqual(type(result), SenticGCNModelOutput) +# self.assertEqual(type(result.logits), torch.Tensor) +# self.assertEqual(result.logits.shape, torch.Size([1, 3])) + + +class TestSenticGCNBertModel(unittest.TestCase): + def setUp(self) -> None: + config = SenticGCNBertConfig() + self.model = SenticGCNBertModel(config=config) + + def test_pretrained_model_base_class(self): + 
self.assertTrue(issubclass(self.model.__class__, PreTrainedModel)) + + def test_config_class(self): + self.assertEqual(self.model.config_class, SenticGCNBertConfig) + + def test_base_model_prefix(self): + self.assertEqual(self.model.base_model_prefix, "senticgcnbert") + + def test_forward_pass(self): + input_tensors = [ + torch.ones([1, 85], dtype=torch.float32, device=DEVICE), + torch.ones([1, 85], dtype=torch.float32, device=DEVICE), + torch.ones([1, 85], dtype=torch.float32, device=DEVICE), + torch.ones([1, 85, 768], dtype=torch.float32, device=DEVICE), + torch.ones([1, 85, 85], dtype=torch.float32, device=DEVICE), + ] + + self.model.to(DEVICE) + self.model.eval() + result = self.model(input_tensors) + + self.assertEqual(type(result), SenticGCNBertModelOutput) + self.assertEqual(type(result.logits), torch.Tensor) + self.assertEqual(result.logits.shape, torch.Size([1, 3])) + + +class TestSenticGCNEmbeddingModel(unittest.TestCase): + def setUp(self) -> None: + config = SenticGCNEmbeddingConfig() + self.model = SenticGCNEmbeddingModel(config=config) + + def test_pretrained_model_base_class(self): + self.assertTrue(issubclass(self.model.__class__, PreTrainedModel)) + + def test_config_class(self): + self.assertEqual(self.model.config_class, SenticGCNEmbeddingConfig) + + def test_base_model_prefix(self): + self.assertEqual(self.model.base_model_prefix, "senticgcnembedding") + + def test_forward_pass(self): + input_tensor = torch.ones([1, 100], dtype=torch.long, device=DEVICE) + self.model.to(DEVICE) + self.model.eval() + result = self.model(input_tensor) + + self.assertEqual(type(result), torch.Tensor) + self.assertEqual(result.shape, torch.Size([1, 100, 300])) + + +class TestSenticGCNBertEmbeddingModel(unittest.TestCase): + def setUp(self) -> None: + config = SenticGCNBertEmbeddingConfig() + self.model = SenticGCNBertEmbeddingModel(config=config) + + def test_pretrained_Bert_base_class(self): + self.assertTrue(issubclass(self.model.__class__, BertModel)) + self.assertTrue(issubclass(self.model.__class__, PreTrainedModel)) diff --git a/tests/sentic_gcn/test_sentic_gcn_utils.py b/tests/sentic_gcn/test_sentic_gcn_utils.py index 3bc56ea..66879f0 100644 --- a/tests/sentic_gcn/test_sentic_gcn_utils.py +++ b/tests/sentic_gcn/test_sentic_gcn_utils.py @@ -1,5 +1,4 @@ import pathlib -import pytest import shutil import tempfile import unittest @@ -21,7 +20,7 @@ PARENT_DIR = str(pathlib.Path(__file__).parent) -class TestPadandTruncate(unittest.TestCase): +class TestPadandTruncateTestCase(unittest.TestCase): def setUp(self) -> None: self.test_input = [1.0, 2.0, 3.0, 4.0, 5.0] self.max_len = 50 @@ -32,7 +31,7 @@ def test_pad_and_truncate(self): self.assertEqual(len(output), self.max_len) -class TestLoadandProcessSenticNet(unittest.TestCase): +class TestLoadandProcessSenticNetTestCase(unittest.TestCase): def setUp(self) -> None: self.test_file = pathlib.Path(PARENT_DIR).joinpath("test_data").joinpath("senticnet.txt") with tempfile.TemporaryDirectory() as tmp_dir: @@ -72,7 +71,7 @@ def test_load_and_process_senticnet_from_pickle_file(self): self.assertEqual(len(senticnet), 12) -class TestGenerateDependencyAdjMatrix(unittest.TestCase): +class TestGenerateDependencyAdjMatrixTestCase(unittest.TestCase): def setUp(self) -> None: self.test_file = pathlib.Path(PARENT_DIR).joinpath("test_data").joinpath("senticnet.txt") self.senticnet = load_and_process_senticnet(self.test_file) @@ -86,7 +85,7 @@ def test_generate_dependency_adj_matrix(self): self.assertEqual(matrix.shape, (9, 9)) -class 
TestSenticGCNDatasetGenerator(unittest.TestCase): +class TestSenticGCNDatasetGeneratorTestCase(unittest.TestCase): def setUp(self) -> None: cfg = { "senticnet_word_file_path": PARENT_DIR + "/test_data/senticnet.txt", From dfb7f0807db0a5ba82094a08e922abbbcecd4d05 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Tue, 11 Jan 2022 18:29:37 +0800 Subject: [PATCH 139/201] [#43] Update api.py with senticnet preprocessor from gcloud --- demo_api/sentic_gcn/api.py | 57 +++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index 135a40f..5d8b8bc 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -1,50 +1,57 @@ -from flask import request -from flask import request from transformers import cached_path +import torch.nn.functional as F + +from flask import request + from demo_api.common import create_api from sgnlp.models.sentic_gcn import ( - SenticGCNModel, - SenticGCNModelOutput, - SenticGCNPreTrainedModel, - SenticGCNBertModelOutput, - SenticGCNBertPreTrainedModel, SenticGCNBertModel, - SenticGCNEmbeddingPreTrainedModel, - SenticGCNEmbeddingModel, - SenticGCNBertEmbeddingModel, - SenticGCNConfig, - SenticGCNPreprocessor, - SenticGCNTokenizer, - SenticGCNBertTokenizer + SenticGCNBertConfig, + SenticGCNBertTokenizer, + SenticGCNBertPreprocessor ) -app = create_api(app_name=__name__, model_card_path="model_card/sentic_gcn.json") +from sgnlp.models.sentic_gcn.postprocess import SenticGCNBertPostprocessor + +from flask import request +import os + -# Download files from azure blob storage -#rel2id_path = cached_path("https://storage.googleapis.com/sgnlp/models/lsr/rel2id.json") +app = create_api(app_name=__name__, model_card_path="model_card/sentic_gcn.json") +preprocessor = SenticGCNBertPreprocessor(senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle', device='cpu') # Load model +config = SenticGCNBertConfig.from_pretrained("https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json") # Input JSON file -config = SenticGCNConfig.from_pretrained("") # Input JSON file - -model = SenticGCNModel.from_pretrained( - "", # Input model +model = SenticGCNBertModel.from_pretrained( + "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", config=config ) -model.eval() - app.logger.info("Preprocessing pipeline and model initialization complete.") @app.route("/predict", methods=["POST"]) def predict(): req_body = request.get_json() - document = req_body["document"] + # aspect = req_body["aspect"] + # sentence = req_body["sentence"] # Perform preprocessing from the imported pipeline + processed_inputs, processed_indices = preprocessor(req_body) + outputs = model(processed_indices) + t_probs = F.softmax(outputs.logits) + t_probs = t_probs.detach().numpy() + + infer_label = [t_probs.argmax(axis=-1)[idx] -1 for idx in range(len(t_probs))] + + # Postprocessing + postprocessor = SenticGCNBertPostprocessor() + post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) + return post_outputs if __name__ == "__main__": - app.run() \ No newline at end of file + # app.run() + app.run(host="0.0.0.0", debug=True, port=8000) \ No newline at end of file From 63c221392e27608f12270aba5e1f3ae5fb962095 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Tue, 11 Jan 2022 19:01:32 +0800 Subject: [PATCH 140/201] [#43] Update api.py POST request --- demo_api/sentic_gcn/api.py | 21 ++++++++++++++++----- 1 
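A quick aside on the inference logic in the api.py diff above: the "- 1" in the infer_label list comprehension shifts the argmax class index into the sentiment label space this model uses. A minimal sketch of that mapping, assuming only a three-class head (polarities_dim=3, as asserted in the config tests in this series):

import torch
import torch.nn.functional as F

logits = torch.tensor([[0.2, 0.1, 2.5]])  # one row of the model's output logits
probs = F.softmax(logits, dim=-1)         # normalise into class probabilities
labels = probs.argmax(dim=-1) - 1         # class indices 0/1/2 become sentiments -1/0/1
print(labels.tolist())                    # [1] -> positive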
file changed, 16 insertions(+), 5 deletions(-) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index 5d8b8bc..da378e3 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -1,8 +1,9 @@ +import re from transformers import cached_path import torch.nn.functional as F -from flask import request +from flask import request, jsonify from demo_api.common import create_api from sgnlp.models.sentic_gcn import ( @@ -20,6 +21,7 @@ app = create_api(app_name=__name__, model_card_path="model_card/sentic_gcn.json") +# path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'senticnet5.pickle') preprocessor = SenticGCNBertPreprocessor(senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle', device='cpu') # Load model @@ -35,11 +37,18 @@ @app.route("/predict", methods=["POST"]) def predict(): req_body = request.get_json() - # aspect = req_body["aspect"] - # sentence = req_body["sentence"] + print(req_body) + aspect = req_body["aspect"] + sentence = req_body["sentence"] + + print('aspect: ',aspect) + print('sentence: ',sentence) + + inputs = list() + inputs.append(req_body) # Perform preprocessing from the imported pipeline - processed_inputs, processed_indices = preprocessor(req_body) + processed_inputs, processed_indices = preprocessor(inputs) outputs = model(processed_indices) t_probs = F.softmax(outputs.logits) t_probs = t_probs.detach().numpy() @@ -49,7 +58,9 @@ def predict(): # Postprocessing postprocessor = SenticGCNBertPostprocessor() post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) - return post_outputs + + print('post_outputs: ',post_outputs) + return jsonify(post_outputs) # to fix the output if __name__ == "__main__": From 14531a4c77daca0838542a1e2545f806f001ba13 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Tue, 11 Jan 2022 19:03:38 +0800 Subject: [PATCH 141/201] [#43] tidy up usage.py code --- demo_api/sentic_gcn/usage.py | 37 ++---------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/demo_api/sentic_gcn/usage.py b/demo_api/sentic_gcn/usage.py index d7d361b..bb2fea4 100644 --- a/demo_api/sentic_gcn/usage.py +++ b/demo_api/sentic_gcn/usage.py @@ -1,8 +1,5 @@ -from torch._C import device -from transformers import cached_path import torch.nn.functional as F - from sgnlp.models.sentic_gcn import ( SenticGCNBertModel, SenticGCNBertPreprocessor, @@ -11,37 +8,17 @@ from sgnlp.models.sentic_gcn.postprocess import SenticGCNBertPostprocessor -""" -Overall steps: -1. tokenize the data -2. Get embedding matrix - -> self.embedding_matrix = build_embedding_matrix(tokenizer.word2idx, embed_dim, dataset) -3. Set embedding martrix in the loaded model class -4. 
Run the model (train / test) -""" - - # Load model -# path = '/Users/weiming/Dev/sg-nlp/sgnlp/sgnlp/models/sentic_gcn/senticnet5.pickle' -path = '../../sgnlp/models/sentic_gcn/senticnet5.pickle' -preprocessor = SenticGCNBertPreprocessor(senticnet=path, device='cpu') +# path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'senticnet5.pickle') +preprocessor = SenticGCNBertPreprocessor(senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle', device='cpu') config = SenticGCNBertConfig.from_pretrained('https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json') - -embedded_matrix = "" - -# tokenizer = SenticGCNTokenizer.from_pretrained("senticgcn") -# Other tokenizers -# BertTokenizer: 'bert-base-uncased' - model = SenticGCNBertModel.from_pretrained( 'https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin', config=config ) -# Model predict - # Inputs inputs = [ { @@ -67,16 +44,6 @@ infer_label = [t_probs.argmax(axis=-1)[idx] -1 for idx in range(len(t_probs))] -# print(processed_inputs[0]) -# print(infer_label[0]) - - -# tensor_dict = preprocessor(input_batch) -# print(tensor_dict) -# output = model(**tensor_dict) -# sentiment = "" - - # Postprocessing postprocessor = SenticGCNBertPostprocessor() post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) From 3429ce74e5a4c89fd68bbce8284ae4b5484b9b6e Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Wed, 12 Jan 2022 02:07:41 +0800 Subject: [PATCH 142/201] [#43] Update api.py to a working version with int conversion for output key: labels (commented out) --- demo_api/sentic_gcn/api.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index da378e3..74e8ba1 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -1,4 +1,5 @@ import re +import json from transformers import cached_path import torch.nn.functional as F @@ -18,12 +19,13 @@ from flask import request import os - app = create_api(app_name=__name__, model_card_path="model_card/sentic_gcn.json") # path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'senticnet5.pickle') preprocessor = SenticGCNBertPreprocessor(senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle', device='cpu') +postprocessor = SenticGCNBertPostprocessor() + # Load model config = SenticGCNBertConfig.from_pretrained("https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json") # Input JSON file @@ -37,30 +39,23 @@ @app.route("/predict", methods=["POST"]) def predict(): req_body = request.get_json() - print(req_body) - aspect = req_body["aspect"] - sentence = req_body["sentence"] - - print('aspect: ',aspect) - print('sentence: ',sentence) inputs = list() inputs.append(req_body) + print(inputs) # Perform preprocessing from the imported pipeline processed_inputs, processed_indices = preprocessor(inputs) outputs = model(processed_indices) t_probs = F.softmax(outputs.logits) t_probs = t_probs.detach().numpy() - - infer_label = [t_probs.argmax(axis=-1)[idx] -1 for idx in range(len(t_probs))] # Postprocessing - postprocessor = SenticGCNBertPostprocessor() post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) - print('post_outputs: ',post_outputs) - return jsonify(post_outputs) # to fix the output + # Convert labels into int, currently it is a numpy.int which has issues with json + # 
post_outputs[0]['labels'] = [int(x) for x in post_outputs[0]['labels']] + return post_outputs[0] if __name__ == "__main__": From e93a5492f41cd6e9e3d02925e279be6659fb068e Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Wed, 12 Jan 2022 02:09:05 +0800 Subject: [PATCH 143/201] [#43] Update commented code --- demo_api/sentic_gcn/api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index 74e8ba1..1de8163 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -55,6 +55,9 @@ def predict(): # Convert labels into int, currently it is a numpy.int which has issues with json # post_outputs[0]['labels'] = [int(x) for x in post_outputs[0]['labels']] + + # To update for conversion in postprocessing for multiple outputs + return post_outputs[0] From 153378e8bd87a42df4be4a0d4432527286a25c61 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 12 Jan 2022 10:12:01 +0800 Subject: [PATCH 144/201] [#41] add unit tests for postprocessor, change all imports to relative path --- sgnlp/models/sentic_gcn/__init__.py | 16 +- sgnlp/models/sentic_gcn/modeling.py | 8 +- sgnlp/models/sentic_gcn/postprocess.py | 8 +- sgnlp/models/sentic_gcn/preprocess.py | 8 +- sgnlp/models/sentic_gcn/train.py | 10 +- sgnlp/models/sentic_gcn/utils.py | 2 +- .../sentic_gcn/test_sentic_gcn_postprocess.py | 342 ++++++++++++++++++ 7 files changed, 368 insertions(+), 26 deletions(-) create mode 100644 tests/sentic_gcn/test_sentic_gcn_postprocess.py diff --git a/sgnlp/models/sentic_gcn/__init__.py b/sgnlp/models/sentic_gcn/__init__.py index e42dad4..200d745 100644 --- a/sgnlp/models/sentic_gcn/__init__.py +++ b/sgnlp/models/sentic_gcn/__init__.py @@ -1,8 +1,8 @@ -from config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig -from data_class import SenticGCNTrainArgs -from modeling import SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel -from preprocess import SenticGCNPreprocessor, SenticGCNBertPreprocessor -from postprocess import SenticGCNPostprocessor, SenticGCNBertPostprocessor -from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer -from train import SenticGCNTrainer, SenticGCNBertTrainer -from utils import BucketIterator, parse_args_and_load_config, download_tokenizer_files +from .config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig +from .data_class import SenticGCNTrainArgs +from .modeling import SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel +from .preprocess import SenticGCNPreprocessor, SenticGCNBertPreprocessor +from .postprocess import SenticGCNPostprocessor, SenticGCNBertPostprocessor +from .tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer +from .train import SenticGCNTrainer, SenticGCNBertTrainer +from .utils import BucketIterator, parse_args_and_load_config, download_tokenizer_files diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py index c9a6751..a4c098a 100644 --- a/sgnlp/models/sentic_gcn/modeling.py +++ b/sgnlp/models/sentic_gcn/modeling.py @@ -7,15 +7,15 @@ from transformers import PreTrainedModel, BertModel from transformers.file_utils import ModelOutput -from modules.dynamic_rnn import DynamicLSTM -from modules.gcn import GraphConvolution -from config import ( +from .modules.dynamic_rnn import DynamicLSTM +from .modules.gcn import GraphConvolution +from .config import ( SenticGCNConfig, 
SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig, ) -from utils import build_embedding_matrix +from .utils import build_embedding_matrix @dataclass diff --git a/sgnlp/models/sentic_gcn/postprocess.py b/sgnlp/models/sentic_gcn/postprocess.py index e8ab708..4a35a8b 100644 --- a/sgnlp/models/sentic_gcn/postprocess.py +++ b/sgnlp/models/sentic_gcn/postprocess.py @@ -2,8 +2,8 @@ import torch.nn.functional as F -from preprocess import SenticGCNData, SenticGCNBertData -from modeling import SenticGCNModelOutput, SenticGCNBertModelOutput +from .preprocess import SenticGCNData, SenticGCNBertData +from .modeling import SenticGCNModelOutput, SenticGCNBertModelOutput class SenticGCNBasePostprocessor: @@ -33,7 +33,7 @@ def __call__( if proc_output["sentence"] == processed_input.full_text_tokens: exists = True outputs[idx]["aspects"].append(processed_input.aspect_token_index) - outputs[idx]["labels"].append(prediction) + outputs[idx]["labels"].append(int(prediction)) if self.return_aspects_text: outputs[idx]["aspects_text"].append(processed_input.aspect) break @@ -42,7 +42,7 @@ def __call__( processed_dict = {} processed_dict["sentence"] = processed_input.full_text_tokens processed_dict["aspects"] = [processed_input.aspect_token_index] - processed_dict["labels"] = [prediction] + processed_dict["labels"] = [int(prediction)] if self.return_full_text: processed_dict["full_text"] = processed_input.full_text if self.return_aspects_text: diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index 0b956b2..fc508a4 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -12,10 +12,10 @@ import torch from transformers import PreTrainedTokenizer, PretrainedConfig, PreTrainedModel -from config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig -from modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel -from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer -from utils import ( +from .config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig +from .modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel +from .tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer +from .utils import ( load_and_process_senticnet, download_tokenizer_files, download_url_file, diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py index c8c3235..c3b902e 100644 --- a/sgnlp/models/sentic_gcn/train.py +++ b/sgnlp/models/sentic_gcn/train.py @@ -13,17 +13,17 @@ from sklearn.metrics import f1_score from torch.utils.data.dataloader import DataLoader -from config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig -from data_class import SenticGCNTrainArgs -from modeling import ( +from .config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig +from .data_class import SenticGCNTrainArgs +from .modeling import ( SenticGCNBertPreTrainedModel, SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel, ) -from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer -from utils import parse_args_and_load_config, set_random_seed, SenticGCNDatasetGenerator, BucketIterator +from .tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer +from .utils import parse_args_and_load_config, set_random_seed, SenticGCNDatasetGenerator, BucketIterator logging.basicConfig(level=logging.DEBUG) diff --git 
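The int(prediction) casts added to postprocess.py in this patch are what let the demo API return its output through Flask's JSON layer: the raw predictions are numpy integer scalars ("numpy.int", as the api.py comments put it), which the standard json encoder rejects. A small self-contained sketch of the failure mode and the fix:

import json

import numpy as np

prediction = np.int64(1)  # argmax over logits yields numpy scalars, not Python ints
try:
    json.dumps({"labels": [prediction]})
except TypeError as err:
    print(err)  # numpy's int64 is not JSON serializable
print(json.dumps({"labels": [int(prediction)]}))  # fine once cast to a plain Python int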
a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index abad904..264dc13 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -16,7 +16,7 @@ from transformers import PreTrainedTokenizer from transformers.tokenization_utils_base import BatchEncoding -from data_class import SenticGCNTrainArgs +from .data_class import SenticGCNTrainArgs def parse_args_and_load_config( diff --git a/tests/sentic_gcn/test_sentic_gcn_postprocess.py b/tests/sentic_gcn/test_sentic_gcn_postprocess.py new file mode 100644 index 0000000..fe8c813 --- /dev/null +++ b/tests/sentic_gcn/test_sentic_gcn_postprocess.py @@ -0,0 +1,342 @@ +import unittest + +import torch + +from sgnlp.models.sentic_gcn.modeling import SenticGCNModelOutput, SenticGCNBertModelOutput +from sgnlp.models.sentic_gcn.preprocess import SenticGCNData, SenticGCNBertData +from sgnlp.models.sentic_gcn.postprocess import SenticGCNPostprocessor, SenticGCNBertPostprocessor + + +class TestSenticGCNPostprocessorTestCase(unittest.TestCase): + def setUp(self) -> None: + self.test_processed_inputs = [ + SenticGCNData( + full_text="soup is tasty but soup is a little salty. salty funkysoup.", + aspect="soup", + left_text="", + full_text_tokens=[ + "Soup", + "is", + "tasty", + "but", + "soup", + "is", + "a", + "little", + "salty.", + "Salty", + "funkysoup.", + ], + aspect_token_index=0, + ), + SenticGCNData( + full_text="soup is tasty but soup is a little salty. salty funkysoup.", + aspect="soup", + left_text="soup is tasty but", + full_text_tokens=[ + "Soup", + "is", + "tasty", + "but", + "soup", + "is", + "a", + "little", + "salty.", + "Salty", + "funkysoup.", + ], + aspect_token_index=4, + ), + SenticGCNData( + full_text="everyone that sat in the back outside agreed that it was the worst service we had ever received.", + aspect="service", + left_text="everyone that sat in the back outside agreed that it was the worst", + full_text_tokens=[ + "Everyone", + "that", + "sat", + "in", + "the", + "back", + "outside", + "agreed", + "that", + "it", + "was", + "the", + "worst", + "service", + "we", + "had", + "ever", + "received.", + ], + aspect_token_index=13, + ), + SenticGCNData( + full_text="it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + aspect="location", + left_text="it 's located in a strip mall near the beverly center , not the greatest", + full_text_tokens=[ + "it", + "'s", + "located", + "in", + "a", + "strip", + "mall", + "near", + "the", + "beverly", + "center", + ",", + "not", + "the", + "greatest", + "location", + ",", + "but", + "the", + "food", + "keeps", + "me", + "coming", + "back", + "for", + "more", + ".", + ], + aspect_token_index=15, + ), + SenticGCNData( + full_text="it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + aspect="food", + left_text="it 's located in a strip mall near the beverly center , not the greatest location , but the", + full_text_tokens=[ + "it", + "'s", + "located", + "in", + "a", + "strip", + "mall", + "near", + "the", + "beverly", + "center", + ",", + "not", + "the", + "greatest", + "location", + ",", + "but", + "the", + "food", + "keeps", + "me", + "coming", + "back", + "for", + "more", + ".", + ], + aspect_token_index=19, + ), + ] + self.test_model_outputs = SenticGCNModelOutput( + loss=None, + logits=torch.ones([5, 3], dtype=torch.float32), + ) + + def test_senticgcn_postprocess(self): + post_proc = 
SenticGCNPostprocessor() + post_outputs = post_proc(processed_inputs=self.test_processed_inputs, model_outputs=self.test_model_outputs) + self.assertEqual(len(post_outputs), 3) + for key in ["sentence", "aspects", "labels"]: + for output in post_outputs: + self.assertTrue(key in output.keys()) + self.assertEqual(len(post_outputs[0]["aspects"]), 2) + self.assertEqual(len(post_outputs[1]["aspects"]), 1) + self.assertEqual(len(post_outputs[2]["aspects"]), 2) + self.assertEqual(len(post_outputs[0]["labels"]), 2) + self.assertEqual(len(post_outputs[1]["labels"]), 1) + self.assertEqual(len(post_outputs[2]["labels"]), 2) + + def test_senticgcn_post_process_return_text_and_aspect(self): + post_proc = SenticGCNPostprocessor(return_full_text=True, return_aspects_text=True) + post_outputs = post_proc(processed_inputs=self.test_processed_inputs, model_outputs=self.test_model_outputs) + for key in ["sentence", "aspects", "labels", "full_text", "aspects_text"]: + for output in post_outputs: + self.assertTrue(key in output.keys()) + + +class TestSenticGCNBertPostprocessorTestCase(unittest.TestCase): + def setUp(self) -> None: + self.test_processed_inputs = [ + SenticGCNBertData( + full_text="soup is tasty but soup is a little salty. salty funkysoup.", + aspect="soup", + left_text="", + full_text_with_bert_tokens="[CLS] soup is tasty but soup is a little salty. salty funkysoup. [SEP] soup [SEP]", + full_text_tokens=[ + "Soup", + "is", + "tasty", + "but", + "soup", + "is", + "a", + "little", + "salty.", + "Salty", + "funkysoup.", + ], + aspect_token_index=0, + ), + SenticGCNBertData( + full_text="soup is tasty but soup is a little salty. salty funkysoup.", + aspect="soup", + left_text="soup is tasty but", + full_text_with_bert_tokens="[CLS] soup is tasty but soup is a little salty. salty funkysoup. [SEP] soup [SEP]", + full_text_tokens=[ + "Soup", + "is", + "tasty", + "but", + "soup", + "is", + "a", + "little", + "salty.", + "Salty", + "funkysoup.", + ], + aspect_token_index=4, + ), + SenticGCNBertData( + full_text="everyone that sat in the back outside agreed that it was the worst service we had ever received.", + aspect="service", + left_text="everyone that sat in the back outside agreed that it was the worst", + full_text_with_bert_tokens="[CLS] everyone that sat in the back outside agreed that it was the worst service we had ever received. [SEP] service [SEP]", + full_text_tokens=[ + "Everyone", + "that", + "sat", + "in", + "the", + "back", + "outside", + "agreed", + "that", + "it", + "was", + "the", + "worst", + "service", + "we", + "had", + "ever", + "received.", + ], + aspect_token_index=13, + ), + SenticGCNBertData( + full_text="it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + aspect="location", + left_text="it 's located in a strip mall near the beverly center , not the greatest", + full_text_with_bert_tokens="[CLS] it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more . 
[SEP] location [SEP]", + full_text_tokens=[ + "it", + "'s", + "located", + "in", + "a", + "strip", + "mall", + "near", + "the", + "beverly", + "center", + ",", + "not", + "the", + "greatest", + "location", + ",", + "but", + "the", + "food", + "keeps", + "me", + "coming", + "back", + "for", + "more", + ".", + ], + aspect_token_index=15, + ), + SenticGCNBertData( + full_text="it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + aspect="food", + left_text="it 's located in a strip mall near the beverly center , not the greatest location , but the", + full_text_with_bert_tokens="[CLS] it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more . [SEP] food [SEP]", + full_text_tokens=[ + "it", + "'s", + "located", + "in", + "a", + "strip", + "mall", + "near", + "the", + "beverly", + "center", + ",", + "not", + "the", + "greatest", + "location", + ",", + "but", + "the", + "food", + "keeps", + "me", + "coming", + "back", + "for", + "more", + ".", + ], + aspect_token_index=19, + ), + ] + self.test_model_outputs = SenticGCNBertModelOutput( + loss=None, + logits=torch.ones([5, 3], dtype=torch.float32), + ) + + def test_senticgcnbert_postprocess(self): + post_proc = SenticGCNBertPostprocessor() + post_outputs = post_proc(processed_inputs=self.test_processed_inputs, model_outputs=self.test_model_outputs) + self.assertEqual(len(post_outputs), 3) + for key in ["sentence", "aspects", "labels"]: + for output in post_outputs: + self.assertTrue(key in output.keys()) + self.assertEqual(len(post_outputs[0]["aspects"]), 2) + self.assertEqual(len(post_outputs[1]["aspects"]), 1) + self.assertEqual(len(post_outputs[2]["aspects"]), 2) + self.assertEqual(len(post_outputs[0]["labels"]), 2) + self.assertEqual(len(post_outputs[1]["labels"]), 1) + self.assertEqual(len(post_outputs[2]["labels"]), 2) + + def test_senticgcn_post_process_return_text_and_aspect(self): + post_proc = SenticGCNBertPostprocessor(return_full_text=True, return_aspects_text=True) + post_outputs = post_proc(processed_inputs=self.test_processed_inputs, model_outputs=self.test_model_outputs) + for key in ["sentence", "aspects", "labels", "full_text", "aspects_text"]: + for output in post_outputs: + self.assertTrue(key in output.keys()) From 6035f60d87177731b140027eddef50fc89034bce Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 12 Jan 2022 10:29:51 +0800 Subject: [PATCH 145/201] [#41] update defaults senticnet input arguments --- sgnlp/models/sentic_gcn/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index fc508a4..af462fa 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -165,7 +165,7 @@ def __init__( config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", spacy_pipeline: str = "en_core_web_sm", - senticnet: str = "senticnet.pkl", + senticnet: str = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", device: str = "cpu", ) -> None: super().__init__( @@ -325,7 +325,7 @@ def __init__( config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", spacy_pipeline: str = "en_core_web_sm", - senticnet: str = "senticnet.pkl", + senticnet: str = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", max_len: int = 85, device: str = "cpu", ) -> None: From 
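With the preprocessors above now defaulting to the hosted senticnet pickle, the demo API from the earlier patches can be exercised end to end. A minimal client sketch, assuming the Flask app is running locally on port 8000 (api.py calls app.run(host="0.0.0.0", debug=True, port=8000)) and using the same payload shape as the inputs list in usage.py:

import requests

payload = {
    "aspect": ["service"],
    "sentence": "Everyone that sat in the back outside agreed that it was the worst service we had ever received.",
}
response = requests.post("http://localhost:8000/predict", json=payload)
print(response.json())  # per the postprocessor: "sentence", "aspects" and "labels" keys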
c550d1d45ef0570842602c076eee22f5df5d4f0e Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Wed, 12 Jan 2022 11:35:10 +0800 Subject: [PATCH 146/201] [#43] Tidy up code comments and formatting --- demo_api/sentic_gcn/usage.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/demo_api/sentic_gcn/usage.py b/demo_api/sentic_gcn/usage.py index bb2fea4..7543b64 100644 --- a/demo_api/sentic_gcn/usage.py +++ b/demo_api/sentic_gcn/usage.py @@ -1,5 +1,3 @@ -import torch.nn.functional as F - from sgnlp.models.sentic_gcn import ( SenticGCNBertModel, SenticGCNBertPreprocessor, @@ -8,10 +6,14 @@ from sgnlp.models.sentic_gcn.postprocess import SenticGCNBertPostprocessor -# Load model -# path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'senticnet5.pickle') -preprocessor = SenticGCNBertPreprocessor(senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle', device='cpu') +preprocessor = SenticGCNBertPreprocessor( + senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle', + device='cpu' +) + +postprocessor = SenticGCNBertPostprocessor() +# Load model config = SenticGCNBertConfig.from_pretrained('https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json') model = SenticGCNBertModel.from_pretrained( @@ -36,15 +38,7 @@ ] processed_inputs, processed_indices = preprocessor(inputs) -processed_inputs - outputs = model(processed_indices) -t_probs = F.softmax(outputs.logits) -t_probs = t_probs.detach().numpy() - -infer_label = [t_probs.argmax(axis=-1)[idx] -1 for idx in range(len(t_probs))] # Postprocessing -postprocessor = SenticGCNBertPostprocessor() -post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) -print(post_outputs) \ No newline at end of file +post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) \ No newline at end of file From 0f599191a3dfb935ff877cc2c10f36ca4bd6b550 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Wed, 12 Jan 2022 11:36:46 +0800 Subject: [PATCH 147/201] [#43] Tidy up import statements --- demo_api/sentic_gcn/usage.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/demo_api/sentic_gcn/usage.py b/demo_api/sentic_gcn/usage.py index 7543b64..7284df7 100644 --- a/demo_api/sentic_gcn/usage.py +++ b/demo_api/sentic_gcn/usage.py @@ -1,11 +1,10 @@ from sgnlp.models.sentic_gcn import ( SenticGCNBertModel, SenticGCNBertPreprocessor, - SenticGCNBertConfig + SenticGCNBertConfig, + SenticGCNBertPostprocessor ) -from sgnlp.models.sentic_gcn.postprocess import SenticGCNBertPostprocessor - preprocessor = SenticGCNBertPreprocessor( senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle', device='cpu' From ce349240e7a8b1c679809308d6a0a8f1572ced81 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Wed, 12 Jan 2022 11:52:51 +0800 Subject: [PATCH 148/201] [#43] Tidy up code, comments and imports --- demo_api/sentic_gcn/api.py | 44 +++++++++++--------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index 1de8163..21910e7 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -1,36 +1,29 @@ -import re -import json -from transformers import cached_path - -import torch.nn.functional as F - from flask import request, jsonify from demo_api.common import create_api from sgnlp.models.sentic_gcn import ( SenticGCNBertModel, SenticGCNBertConfig, - 
SenticGCNBertTokenizer, - SenticGCNBertPreprocessor + SenticGCNBertPreprocessor, + SenticGCNBertPostprocessor ) -from sgnlp.models.sentic_gcn.postprocess import SenticGCNBertPostprocessor - from flask import request -import os app = create_api(app_name=__name__, model_card_path="model_card/sentic_gcn.json") -# path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'senticnet5.pickle') -preprocessor = SenticGCNBertPreprocessor(senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle', device='cpu') +preprocessor = SenticGCNBertPreprocessor( + senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle', + device='cpu' +) postprocessor = SenticGCNBertPostprocessor() # Load model -config = SenticGCNBertConfig.from_pretrained("https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json") # Input JSON file +config = SenticGCNBertConfig.from_pretrained('https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json') model = SenticGCNBertModel.from_pretrained( - "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", + 'https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin', config=config ) @@ -40,26 +33,15 @@ def predict(): req_body = request.get_json() - inputs = list() - inputs.append(req_body) - print(inputs) - - # Perform preprocessing from the imported pipeline - processed_inputs, processed_indices = preprocessor(inputs) + # Preprocessing + processed_inputs, processed_indices = preprocessor([req_body]) outputs = model(processed_indices) - t_probs = F.softmax(outputs.logits) - t_probs = t_probs.detach().numpy() # Postprocessing post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) - - # Convert labels into int, currently it is a numpy.int which has issues with json - # post_outputs[0]['labels'] = [int(x) for x in post_outputs[0]['labels']] - - # To update for conversion in postprocessing for multiple outputs - - return post_outputs[0] - + + return jsonify(post_outputs) + if __name__ == "__main__": # app.run() From 5f9f816b5577b1542d0ad6c30c7f69f290bd30ef Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Wed, 12 Jan 2022 11:58:35 +0800 Subject: [PATCH 149/201] [#43] Add preprocessor in download_pretrained.py --- demo_api/sentic_gcn/download_pretrained.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/demo_api/sentic_gcn/download_pretrained.py b/demo_api/sentic_gcn/download_pretrained.py index a86a789..8300acc 100644 --- a/demo_api/sentic_gcn/download_pretrained.py +++ b/demo_api/sentic_gcn/download_pretrained.py @@ -7,13 +7,15 @@ SenticGCNBertPreprocessor ) +# Downloads preprocessor, pretrained config, tokenizer, model +preprocessor = SenticGCNBertPreprocessor( + senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle', + device='cpu' +) +tokenizer = SenticGCNBertTokenizer.from_pretrained("bert-base-uncased") config = SenticGCNConfig.from_pretrained( "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json" ) - -tokenizer = SenticGCNBertTokenizer.from_pretrained("bert-base-uncased") - - model = SenticGCNBertModel.from_pretrained( "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", config=config From 4b65af76c718c9f8a0607f0f1ffe13b550fa65f8 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 12 Jan 2022 13:57:41 +0800 Subject: [PATCH 150/201] [#41] minor bug fix for 
preprocessor, add unit tests for preprocessor --- sgnlp/models/sentic_gcn/preprocess.py | 68 ++++--- .../sentic_gcn/test_sentic_gcn_preprocess.py | 191 ++++++++++++++++++ 2 files changed, 235 insertions(+), 24 deletions(-) create mode 100644 tests/sentic_gcn/test_sentic_gcn_preprocess.py diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index af462fa..a6f29a8 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -52,7 +52,9 @@ def __init__( config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", spacy_pipeline: str = "en_core_web_sm", - senticnet: str = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", + senticnet: Union[ + str, Dict[str, float] + ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", device: str = "cpu", ) -> None: # Set device @@ -61,27 +63,41 @@ def __init__( ) self.spacy_pipeline = spacy.load(spacy_pipeline) - # Load senticnet - if senticnet.startswith("https://") or senticnet.startswith("http://"): - with tempfile.TemporaryDirectory() as tmpdir: - temp_dir = pathlib.Path(tmpdir) - download_url_file(senticnet, temp_dir) - saved_path = temp_dir.joinpath("senticnet.pickle") - self.senticnet = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=saved_path) - shutil.rmtree(temp_dir, ignore_errors=True) - elif senticnet.endswith(".pkl") or senticnet.endswith(".pickle"): - self.senticnet = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=senticnet) - elif senticnet.endswith(".txt"): - self.senticnet = load_and_process_senticnet(senticnet_file_path=senticnet) - else: - raise ValueError( - f""" - Invalid SenticNet file! - For downloading from cloud storage, please provide url to pickle file location - (i.e. string url starting with https:// or http://). - For processed SenticNet dictionary, please provide pickle file location - (i.e. file with .pkl or .pickle extension). - For raw SenticNet-5.0 file, please provide text file path (i.e. file with .txt extension) + try: + # Load senticnet + if isinstance(senticnet, dict): + senticnet_ = senticnet + elif senticnet.startswith("https://") or senticnet.startswith("http://"): + with tempfile.TemporaryDirectory() as tmpdir: + temp_dir = pathlib.Path(tmpdir) + download_url_file(senticnet, temp_dir) + saved_path = temp_dir.joinpath("senticnet.pickle") + senticnet_ = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=saved_path) + shutil.rmtree(temp_dir, ignore_errors=True) + elif senticnet.endswith(".pkl") or senticnet.endswith(".pickle"): + senticnet_ = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=senticnet) + elif senticnet.endswith(".txt"): + senticnet_ = load_and_process_senticnet(senticnet_file_path=senticnet) + else: + raise ValueError( + """ + Error initializing SenticNet! + For downloading from cloud storage, please provide url to pickle file location + (i.e. string url starting with https:// or http://). + For processed SenticNet dictionary, please provide pickle file location + (i.e. file with .pkl or .pickle extension). + For raw SenticNet-5.0 file, please provide text file path (i.e. file with .txt extension). + For externally created SenticNet dictionary, please provide a dictionary with words as key + and sentic score as values. + """ + ) + self.senticnet = senticnet_ + except Exception as e: + logging.error(e) + raise Exception( + """ + Error initializing SenticNet! 
Please ensure that input is either a dictionary, a str path to + a saved pickle file, an url to cloud storage or str path to the raw senticnet file. """ ) @@ -165,7 +181,9 @@ def __init__( config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", spacy_pipeline: str = "en_core_web_sm", - senticnet: str = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", + senticnet: Union[ + str, Dict[str, float] + ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", device: str = "cpu", ) -> None: super().__init__( @@ -325,7 +343,9 @@ def __init__( config_filename: str = "config.json", model_filename: str = "pytorch_model.bin", spacy_pipeline: str = "en_core_web_sm", - senticnet: str = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", + senticnet: Union[ + str, Dict[str, float] + ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", max_len: int = 85, device: str = "cpu", ) -> None: diff --git a/tests/sentic_gcn/test_sentic_gcn_preprocess.py b/tests/sentic_gcn/test_sentic_gcn_preprocess.py new file mode 100644 index 0000000..03c343d --- /dev/null +++ b/tests/sentic_gcn/test_sentic_gcn_preprocess.py @@ -0,0 +1,191 @@ +import pathlib +import pytest +import unittest + +import torch + +from sgnlp.models.sentic_gcn.config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig +from sgnlp.models.sentic_gcn.modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel +from sgnlp.models.sentic_gcn.preprocess import ( + SenticGCNBasePreprocessor, + SenticGCNPreprocessor, + SenticGCNBertPreprocessor, + SenticGCNData, + SenticGCNBertData, +) +from sgnlp.models.sentic_gcn.tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer + + +PARENT_DIR = str(pathlib.Path(__file__).parent) + + +class TestSenticGCNPreprocessorTestCase(unittest.TestCase): + def setUp(self) -> None: + self.test_tokenizer = SenticGCNTokenizer( + train_files=[PARENT_DIR + "/test_data/test_train.raw", PARENT_DIR + "/test_data/test_test.raw"], + train_vocab=True, + ) + test_embed_config = SenticGCNEmbeddingConfig() + self.test_embed_model = SenticGCNEmbeddingModel(config=test_embed_config) + self.test_inputs = [ + {"aspect": ["Soup"], "sentence": "Soup is tasty but soup is a little salty. 
Salty funkysoup."}, # 1, -1 + { + "aspect": ["service"], + "sentence": "Everyone that sat in the back outside agreed that it was the worst service we had ever received.", + }, # -1 + { + "aspect": ["location", "food"], + "sentence": "it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + }, # 0, 1 + ] + self.test_senticnet = {"test": 1.0} + + @pytest.mark.slow + def test_senticgcn_preprocessor(self): + """ + Create preprocessor with all defaults input arguments + """ + pre_proc = SenticGCNPreprocessor() + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) + + for proc_input in processed_inputs: + self.assertTrue(isinstance(proc_input, SenticGCNData)) + for key in ["full_text", "aspect", "left_text", "full_text_tokens", "aspect_token_index"]: + self.assertTrue(hasattr(proc_input, key)) + + for proc_index in processed_indices: + self.assertTrue(isinstance(proc_index, torch.Tensor)) + self.assertEqual(processed_indices[0].shape, torch.Size([5, 128])) + self.assertEqual(processed_indices[1].shape, torch.Size([5, 128])) + self.assertEqual(processed_indices[2].shape, torch.Size([5, 128])) + self.assertEqual(processed_indices[3].shape, torch.Size([5, 128, 300])) + self.assertEqual(processed_indices[4].shape, torch.Size([5, 128, 128])) + + def test_senticgcn_preprocessor_from_external(self): + """ + Create preprocessor with tokenizer, embedding model and senticnet from external instances + """ + pre_proc = SenticGCNPreprocessor( + tokenizer=self.test_tokenizer, embedding_model=self.test_embed_model, senticnet=self.test_senticnet + ) + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) + + def test_senticgcn_preprocessor_from_file(self): + """ + Create preprocessor with senticnet from pickle file + """ + pre_proc = SenticGCNPreprocessor( + tokenizer=self.test_tokenizer, + embedding_model=self.test_embed_model, + senticnet=PARENT_DIR + "/test_data/test_senticnet.pickle", + ) + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) + + +class TestSenticGCNBertPreprocessorTestCase(unittest.TestCase): + def setUp(self) -> None: + self.test_tokenizer = SenticGCNBertTokenizer.from_pretrained("bert-base-uncased") + test_embed_config = SenticGCNBertEmbeddingConfig() + self.test_embed_model = SenticGCNBertEmbeddingModel(config=test_embed_config) + self.test_inputs = [ + {"aspect": ["Soup"], 
"sentence": "Soup is tasty but soup is a little salty. Salty funkysoup."}, # 1, -1 + { + "aspect": ["service"], + "sentence": "Everyone that sat in the back outside agreed that it was the worst service we had ever received.", + }, # -1 + { + "aspect": ["location", "food"], + "sentence": "it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + }, # 0, 1 + ] + self.test_senticnet = {"test": 1.0} + + @pytest.mark.slow + def test_senticgcnbert_preprocessor(self): + """ + Create preprocessor with all defaults input arguments + """ + pre_proc = SenticGCNBertPreprocessor() + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNBertTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNBertEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) + + for proc_input in processed_inputs: + self.assertTrue(isinstance(proc_input, SenticGCNBertData)) + for key in [ + "full_text", + "aspect", + "left_text", + "full_text_with_bert_tokens", + "full_text_tokens", + "aspect_token_index", + ]: + self.assertTrue(hasattr(proc_input, key)) + + for proc_index in processed_indices: + self.assertTrue(isinstance(proc_index, torch.Tensor)) + self.assertEqual(processed_indices[0].shape, torch.Size([5, 85])) + self.assertEqual(processed_indices[1].shape, torch.Size([5, 85])) + self.assertEqual(processed_indices[2].shape, torch.Size([5, 85])) + self.assertEqual(processed_indices[3].shape, torch.Size([5, 85, 768])) + self.assertEqual(processed_indices[4].shape, torch.Size([5, 85, 85])) + + def test_senticgcnbert_preprocessor_from_external(self): + """ + Create preprocessor with tokenizer, embedding model and senticnet from external instances + """ + pre_proc = SenticGCNBertPreprocessor( + tokenizer=self.test_tokenizer, embedding_model=self.test_embed_model, senticnet=self.test_senticnet + ) + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNBertTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNBertEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) + + def test_senticgcnbert_preprocessor_from_file(self): + """ + Create preprocessor with senticnet from pickle file + """ + pre_proc = SenticGCNBertPreprocessor( + tokenizer=self.test_tokenizer, + embedding_model=self.test_embed_model, + senticnet=PARENT_DIR + "/test_data/test_senticnet.pickle", + ) + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNBertTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNBertEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) From 4ae53637a5454de69edd2cf854abfc99a8dd932c Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Wed, 12 Jan 2022 14:23:28 +0800 Subject: [PATCH 151/201] [#43] Update basic information in model_card --- 
.../sentic_gcn/model_card/sentic_gcn.json | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/demo_api/sentic_gcn/model_card/sentic_gcn.json b/demo_api/sentic_gcn/model_card/sentic_gcn.json index b15f203..1e3b323 100644 --- a/demo_api/sentic_gcn/model_card/sentic_gcn.json +++ b/demo_api/sentic_gcn/model_card/sentic_gcn.json @@ -7,12 +7,12 @@ "url": "https://github.com/BinLiang-NLP/Sentic-GCN" }, "trainingDataset": { - "text": "", - "url": "" + "text": "semeval15", + "url": "https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/datasets" }, "evaluationDataset": { - "text": "", - "url": "" + "text": "semeval15", + "url": "https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/datasets" }, "evaluationScores": "", "trainingConfig": { @@ -20,18 +20,18 @@ }, "trainingTime": "", "modelWeights": { - "text": "", - "url": "" + "text": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", + "url": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin" }, "modelConfig": { - "text": "", - "url": "" + "text": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json", + "url": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json" }, - "modelInput": "", - "modelOutput": "", + "modelInput": "Aspect (word), sentence containing the aspect", + "modelOutput": "Sentiment of aspect, -1 (negative), 0 (neutral), 1 (positive)", "modelSize": "", "inferenceInfo": "", - "usageScenarios": "", + "usageScenarios": "Sentiment analysis of aspects in sentences", "originalCode": { "text": "https://github.com/BinLiang-NLP/Sentic-GCN", "url": "https://github.com/BinLiang-NLP/Sentic-GCN" From 3248c2582b2af08ccfcc3a1d681d3ea2e91ba449 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Wed, 12 Jan 2022 14:32:19 +0800 Subject: [PATCH 152/201] [#43] Update requirements.txt --- demo_api/sentic_gcn/requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/demo_api/sentic_gcn/requirements.txt b/demo_api/sentic_gcn/requirements.txt index 074e572..e0cc76d 100644 --- a/demo_api/sentic_gcn/requirements.txt +++ b/demo_api/sentic_gcn/requirements.txt @@ -1,6 +1,6 @@ -python==3.6 -torch==1.0.0 -spacy==2.0.18 -numpy==1.15.4 +torch==1.10.1 +spacy==3.2.1 +numpy==1.22.0 flask -gunicorn \ No newline at end of file +gunicorn +sgnlp==0.2.0 \ No newline at end of file From 62440de325a1b64385214999ed4872b360b62553 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 12 Jan 2022 15:10:04 +0800 Subject: [PATCH 153/201] [#41] add unit tests for tokenizers --- .../test_data/test_senticnet.pickle | Bin 0 -> 31 bytes tests/sentic_gcn/test_data/test_vocab.pkl | Bin 0 -> 46 bytes .../test_sentic_gcn_tokenization.py | 50 ++++++++++++++++++ 3 files changed, 50 insertions(+) create mode 100644 tests/sentic_gcn/test_data/test_senticnet.pickle create mode 100644 tests/sentic_gcn/test_data/test_vocab.pkl create mode 100644 tests/sentic_gcn/test_sentic_gcn_tokenization.py diff --git a/tests/sentic_gcn/test_data/test_senticnet.pickle b/tests/sentic_gcn/test_data/test_senticnet.pickle new file mode 100644 index 0000000000000000000000000000000000000000..7db2ed6ad5e2cc2912d71394c5ef8d309f186acf GIT binary patch literal 31 ecmZo*nJU5n0kuj409R0lc$ literal 0 HcmV?d00001 diff --git a/tests/sentic_gcn/test_data/test_vocab.pkl b/tests/sentic_gcn/test_data/test_vocab.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0da4656d791124d42001c550b18bb5289546eccb GIT
binary patch literal 46 ucmZo*nX1eH0ku;!dYIEvQ>S=y^)RJnPVp82Qu$N7<$$84xl_C~O7#E~{S8k5 literal 0 HcmV?d00001 diff --git a/tests/sentic_gcn/test_sentic_gcn_tokenization.py b/tests/sentic_gcn/test_sentic_gcn_tokenization.py new file mode 100644 index 0000000..1528b1c --- /dev/null +++ b/tests/sentic_gcn/test_sentic_gcn_tokenization.py @@ -0,0 +1,50 @@ +import pathlib +import pytest +import unittest + +from transformers import PreTrainedTokenizer +from transformers.file_utils import to_numpy + +from sgnlp.models.sentic_gcn.tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer + + +PARENT_DIR = str(pathlib.Path(__file__).parent) + + +class TestSenticGCNTokenizerTestCase(unittest.TestCase): + def setUp(self) -> None: + self.test_train_files = [PARENT_DIR + "/test_data/test_train.raw", PARENT_DIR + "/test_data/test_test.raw"] + self.test_vocab_file = PARENT_DIR + "/test_data/test_vocab.pkl" + + def test_senticgcn_tokenizer_from_vocab(self): + tokenizer = SenticGCNTokenizer(vocab_file=self.test_vocab_file) + self.assertTrue(issubclass(tokenizer.__class__, PreTrainedTokenizer)) + + output = tokenizer("fee fi fo fum") + self.assertEqual(output["input_ids"], [10, 20, 30, 40]) + + def test_senticgcn_tokenizer_from_train_files(self): + tokenizer = SenticGCNTokenizer(train_files=self.test_train_files, train_vocab=True) + self.assertTrue(issubclass(tokenizer.__class__, PreTrainedTokenizer)) + + output = tokenizer("night service center") + self.assertEqual(output["input_ids"], [6, 24, 25]) + + +class TestSenticGCNBertTokenizerTestCase(unittest.TestCase): + def setUp(self) -> None: + self.pretrained_tokenizer_name = "bert-base-uncased" + + @pytest.mark.slow + def test_senticgcnbert_tokenizer(self): + tokenizer = SenticGCNBertTokenizer.from_pretrained(self.pretrained_tokenizer_name) + self.assertTrue(issubclass(tokenizer.__class__, PreTrainedTokenizer)) + + output = tokenizer("fee fi fo fum") + self.assertEqual(output["input_ids"], [7408, 10882, 1042, 2080, 11865, 2213]) + + output = tokenizer("fee fi fo fum", max_length=30, padding="max_length") + self.assertEqual(len(output["input_ids"]), 30) + + output = tokenizer("", max_length=10, padding="max_length") + self.assertEqual(len(output["input_ids"]), 10) From 8069ea38b78eba7c3db427de132352c76ee8b961 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 12 Jan 2022 15:10:41 +0800 Subject: [PATCH 154/201] [#41] remove unused imports --- tests/sentic_gcn/test_sentic_gcn_tokenization.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/sentic_gcn/test_sentic_gcn_tokenization.py b/tests/sentic_gcn/test_sentic_gcn_tokenization.py index 1528b1c..ee18773 100644 --- a/tests/sentic_gcn/test_sentic_gcn_tokenization.py +++ b/tests/sentic_gcn/test_sentic_gcn_tokenization.py @@ -3,7 +3,6 @@ import unittest from transformers import PreTrainedTokenizer -from transformers.file_utils import to_numpy from sgnlp.models.sentic_gcn.tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer From 4aaaede6592e8bdcef9e92dec657bc51903670c9 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Wed, 12 Jan 2022 16:07:30 +0800 Subject: [PATCH 155/201] [#43] Update model card information --- .../sentic_gcn/model_card/sentic_gcn.json | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/demo_api/sentic_gcn/model_card/sentic_gcn.json b/demo_api/sentic_gcn/model_card/sentic_gcn.json index 1e3b323..df496eb 100644 --- a/demo_api/sentic_gcn/model_card/sentic_gcn.json +++ b/demo_api/sentic_gcn/model_card/sentic_gcn.json @@ -1,36 +1,36 @@ { "name": 
"Sentic GCN", "languages": "English", - "description": "", + "description": "This is a neural network that utilises LSTM and GCN to detect the sentiment polarities of different aspects in the same sentence. The models used corresponds to the associated models described in the paper.", "paper": { - "text": "Bin Liang, Hang Su, Lin Gui, Erik Cambria, Ruifeng Xu. Knowledge-Based Systems, 2021: 107643.", + "text": "Bin Liang, Hang Su, Lin Gui, Erik Cambria, Ruifeng Xu. (2021). Aspect-based sentiment analysis via affective knowledge enhanced graph convolutional networks, 2021: 107643.", "url": "https://github.com/BinLiang-NLP/Sentic-GCN" }, "trainingDataset": { - "text": "semeval15", + "text": "acl-14-short-data, semeval14, semeval15, semeval16", "url": "https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/datasets" }, "evaluationDataset": { - "text": "semeval15", + "text": "acl-14-short-data, semeval14, semeval15, semeval16", "url": "https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/datasets" }, - "evaluationScores": "", + "evaluationScores": "Sentic-GCN: 94.36% Acc, 94.43% F1 (SemEval14-Laptop), 94.55% Acc, 91.99% F1 (SemEval14-Restaurant), 95.02% Acc, 93.22% F1 (SemEval15-Restaurant), 96.75% Acc, 93.55% F1 (SemEval16-Restaurant). Sentic-GCN Bert: 99.22% Acc, 99.15% F1 (SemEval14-Laptop), 97.39% Acc, 96.53% F1 (SemEval14-Restaurant), 99.17% Acc, 98.78% F1 (SemEval15-Restaurant), 99.37% Acc, 98.79% F1 (SemEval16-Restaurant).", "trainingConfig": { - "text": "" + "text": "Refer to documentation for details." }, - "trainingTime": "", + "trainingTime": "Sentic-GCN: ~10 mins for ~35 epochs (early stopped), Sentic-GCN Bert: ~1 hr for ~40 epochs (early stopped) for SemEval14-Laptop/SemEval14-Restaurant/SemEval15-Restaurant/SemEval16-Restaurant datasets.", "modelWeights": { - "text": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", + "text": "Refer to documentation for details.", "url": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin" }, "modelConfig": { - "text": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json", + "text": "Refer to documentation for details.", "url": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json" }, "modelInput": "Aspect (word), sentence containing the aspect", "modelOutput": "Sentiment of aspect, -1 (negative), 0 (neutral), 1 (postive)", - "modelSize": "", - "inferenceInfo": "", + "modelSize": "Sentic-GCN: ~8.7MB, Sentic-GCN Bert: ~7.1MB", + "inferenceInfo": "< 1 sec on Intel(R) i7 Quad-Core @ 1.7GHz.", "usageScenarios": "Sentiment analysis of aspects in sentences", "originalCode": { "text": "https://github.com/BinLiang-NLP/Sentic-GCN", @@ -40,6 +40,5 @@ "text": "MIT License", "url": "https://choosealicense.com/licenses/mit" }, - "contact": "sg-nlp@aisingapore.org", - "additionalInfo": "CAVEATS: The model trained in this paper alone is not sufficient to do extract relations from a document. It requires other models to perform entity recognition and coreference between the entities. For this demo, two other pretrained models from AllenNLP is used: Fine Grained Name Entity Recognition and Coreference SpanBERT." 
+ "contact": "sg-nlp@aisingapore.org" } \ No newline at end of file From 59b8a3d24436375a5e1620c0c4a7596d5bd73639 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 12 Jan 2022 16:44:54 +0800 Subject: [PATCH 156/201] [#41] add option to download embedding model from cloud for trainer, add trainer unit tests --- sgnlp/models/sentic_gcn/train.py | 20 ++- .../sentic_gcn/test_sentic_gcn_train_eval.py | 167 ++++++++++++++++++ 2 files changed, 183 insertions(+), 4 deletions(-) create mode 100644 tests/sentic_gcn/test_sentic_gcn_train_eval.py diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py index c3b902e..5704f41 100644 --- a/sgnlp/models/sentic_gcn/train.py +++ b/sgnlp/models/sentic_gcn/train.py @@ -5,6 +5,7 @@ import pickle import shutil import tempfile +import urllib from typing import Dict, List, Tuple, Union import torch @@ -476,10 +477,21 @@ def _create_embedding_model(self, vocab: Dict[str, int]) -> SenticGCNEmbeddingMo SenticGCNEmbeddingModel: return a SenticGCNEmbeddingModel instance. """ if not self.config.build_embedding_model: - config_path = pathlib.Path(self.config.embedding_model).joinpath("config.json") - embed_config = SenticGCNEmbeddingConfig.from_pretrained(config_path) - embed_path = pathlib.Path(self.config.embedding_model).joinpath("pytorch_model.bin") - return SenticGCNEmbeddingModel.from_pretrained(embed_path, config=embed_config) + config_filename = "config.json" + model_filename = "pytorch_model.bin" + if self.config.embedding_model.startswith("https://") or self.config.embedding_model.startswith("http://"): + # Load from cloud + config_url = urllib.parse.urljoin(self.config.embedding_model, config_filename) + model_url = urllib.parse.urljoin(self.config.embedding_model, model_filename) + embedding_config = SenticGCNEmbeddingConfig.from_pretrained(config_url) + embedding_model = SenticGCNEmbeddingModel.from_pretrained(model_url, config=embedding_config) + else: + # Load from local folder + config_path = pathlib.Path(self.config.embedding_model).joinpath(config_filename) + embedding_config = SenticGCNEmbeddingConfig.from_pretrained(config_path) + embed_path = pathlib.Path(self.config.embedding_model).joinpath(model_filename) + embedding_model = SenticGCNEmbeddingModel.from_pretrained(embed_path, config=embedding_config) + return embedding_model else: embedding_model = SenticGCNEmbeddingModel.build_embedding_model( self.config.word_vec_file_path, vocab, self.config.embed_dim diff --git a/tests/sentic_gcn/test_sentic_gcn_train_eval.py b/tests/sentic_gcn/test_sentic_gcn_train_eval.py new file mode 100644 index 0000000..a7f39e4 --- /dev/null +++ b/tests/sentic_gcn/test_sentic_gcn_train_eval.py @@ -0,0 +1,167 @@ +import pathlib +import pickle +import pytest +import shutil +import tempfile +import unittest + +from sgnlp.models.sentic_gcn.data_class import SenticGCNTrainArgs +from sgnlp.models.sentic_gcn.train import SenticGCNTrainer, SenticGCNBertTrainer + +PARENT_DIR = str(pathlib.Path(__file__).parent) + + +def find_result_file(path): + for p in pathlib.Path(path).iterdir(): + if p.is_file() and p.suffix == ".pkl": + yield p.resolve() + + +class TestSenticGCNTrainTestCase(unittest.TestCase): + def setUp(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + self.model_save_folder = pathlib.Path(tmpdir) + with tempfile.TemporaryDirectory() as tmpdir: + self.results_save_folder = pathlib.Path(tmpdir) + + cfg = { + "senticnet_word_file_path": "", + "save_preprocessed_senticnet": False, + "saved_preprocessed_senticnet_file_path": 
PARENT_DIR + "/test_data/test_senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "word_vec_file_path": "./glove/glove.840B.300d.txt", + "dataset_train": [PARENT_DIR + "/test_data/test_train.raw"], + "dataset_test": [PARENT_DIR + "/test_data/test_train.raw"], + "valset_ratio": 0, + "model": "senticgcn", + "save_best_model": True, + "save_model_path": str(self.model_save_folder), + "tokenizer": "senticgcn", + "train_tokenizer": True, + "save_tokenizer": False, + "save_tokenizer_path": "", + "embedding_model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/", + "build_embedding_model": False, + "save_embedding_model": False, + "save_embedding_model_path": "./embed_models/senticgcn_embed_semeval14_rest/", + "save_results": True, + "save_results_folder": str(self.results_save_folder), + "initializer": "xavier_uniform_", + "optimizer": "adam", + "loss_function": "cross_entropy", + "learning_rate": 0.001, + "l2reg": 0.00001, + "epochs": 100, + "batch_size": 2, + "log_step": 5, + "embed_dim": 300, + "hidden_dim": 300, + "polarities_dim": 3, + "dropout": 0.3, + "seed": 776, + "device": "cpu", + "repeats": 2, + "patience": 5, + "max_len": 85, + } + self.cfg = SenticGCNTrainArgs(**cfg) + + def tearDown(self) -> None: + shutil.rmtree(self.model_save_folder, ignore_errors=True) + shutil.rmtree(self.results_save_folder, ignore_errors=True) + + @pytest.mark.slow + def test_train(self): + trainer = SenticGCNTrainer(self.cfg) + trainer.train() + + result_file = list(find_result_file(self.results_save_folder))[0] + + with open(result_file, "rb") as f: + results = pickle.load(f) + + self.assertTrue("Repeat_1" in results.keys()) + self.assertTrue("Repeat_2" in results.keys()) + self.assertTrue("test" in results.keys()) + for key, val in results.items(): + self.assertTrue("max_val_acc" in val.keys()) + self.assertTrue("max_val_f1" in val.keys()) + if key != "test": + self.assertTrue("max_val_epoch" in val.keys()) + + config_filepath = self.model_save_folder.joinpath("config.json") + model_filepath = self.model_save_folder.joinpath("pytorch_model.bin") + self.assertTrue(config_filepath.is_file()) + self.assertTrue(model_filepath.is_file()) + + +class TestSenticGCNBertTrainTestCase(unittest.TestCase): + def setUp(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + self.model_save_folder = pathlib.Path(tmpdir) + with tempfile.TemporaryDirectory() as tmpdir: + self.results_save_folder = pathlib.Path(tmpdir) + + cfg = { + "senticnet_word_file_path": "", + "save_preprocessed_senticnet": False, + "saved_preprocessed_senticnet_file_path": PARENT_DIR + "/test_data/test_senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "word_vec_file_path": "./glove/glove.840B.300d.txt", + "dataset_train": [PARENT_DIR + "/test_data/test_train.raw"], + "dataset_test": [PARENT_DIR + "/test_data/test_train.raw"], + "valset_ratio": 0, + "model": "senticgcnbert", + "save_best_model": True, + "save_model_path": str(self.model_save_folder), + "tokenizer": "bert-base-uncased", + "embedding_model": "bert-base-uncased", + "save_results": True, + "save_results_folder": str(self.results_save_folder), + "initializer": "xavier_uniform_", + "optimizer": "adam", + "loss_function": "cross_entropy", + "learning_rate": 0.001, + "l2reg": 0.00001, + "epochs": 100, + "batch_size": 2, + "log_step": 5, + "embed_dim": 300, + "hidden_dim": 768, + "polarities_dim": 3, + "dropout": 0.3, + "seed": 776, + "device": "cpu", + "repeats": 2, + "patience": 5, + "max_len": 85, + } + self.cfg = 
SenticGCNTrainArgs(**cfg) + + def tearDown(self) -> None: + shutil.rmtree(self.model_save_folder, ignore_errors=True) + shutil.rmtree(self.results_save_folder, ignore_errors=True) + + @pytest.mark.slow + def test_train(self): + trainer = SenticGCNBertTrainer(self.cfg) + trainer.train() + + result_file = list(find_result_file(self.results_save_folder))[0] + + with open(result_file, "rb") as f: + results = pickle.load(f) + + self.assertTrue("Repeat_1" in results.keys()) + self.assertTrue("Repeat_2" in results.keys()) + self.assertTrue("test" in results.keys()) + for key, val in results.items(): + self.assertTrue("max_val_acc" in val.keys()) + self.assertTrue("max_val_f1" in val.keys()) + if key != "test": + self.assertTrue("max_val_epoch" in val.keys()) + + config_filepath = self.model_save_folder.joinpath("config.json") + model_filepath = self.model_save_folder.joinpath("pytorch_model.bin") + self.assertTrue(config_filepath.is_file()) + self.assertTrue(model_filepath.is_file()) From 94455f6441c24b83e802c1755180fdbc902d759f Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 12 Jan 2022 17:42:53 +0800 Subject: [PATCH 157/201] [#41] bug fix wrong shape for graph tensor, complete unit test for SenticGCNModel --- sgnlp/models/sentic_gcn/preprocess.py | 2 +- tests/sentic_gcn/test_sentic_gcn_model.py | 80 +++++++++++++---------- 2 files changed, 46 insertions(+), 36 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index a6f29a8..f7165f7 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -254,7 +254,7 @@ def _process_indices(self, data_batch: List[SenticGCNData]) -> List[torch.Tensor graph = generate_dependency_adj_matrix(data.full_text, data.aspect, self.senticnet, self.spacy_pipeline) sdat_graph = np.pad( graph, - ((0, max_len - graph.shape[0]), (0, max_len - graph.shape[0])), + ((0, max_len - len(data.full_text)), (0, max_len - len(data.full_text))), "constant", ) diff --git a/tests/sentic_gcn/test_sentic_gcn_model.py b/tests/sentic_gcn/test_sentic_gcn_model.py index cf467fd..44b0270 100644 --- a/tests/sentic_gcn/test_sentic_gcn_model.py +++ b/tests/sentic_gcn/test_sentic_gcn_model.py @@ -74,41 +74,51 @@ def test_pretrained_config_base_class(self): self.assertTrue(issubclass(self.config.__class__, BertConfig)) -# TODO: Investigate shape mismatch -# class TestSenticGCNModel(unittest.TestCase): -# def setUp(self) -> None: -# config = SenticGCNConfig() -# self.model = SenticGCNModel(config=config) - -# def test_pretrained_model_base_class(self): -# self.assertTrue(issubclass(self.model.__class__, PreTrainedModel)) - -# def test_config_class(self): -# self.assertEqual(self.model.config_class, SenticGCNConfig) - -# def test_base_model_prefix(self): -# self.assertEqual(self.model.base_model_prefix, "senticgcn") - -# def test_forward_pass(self): -# input_tensors = [ -# torch.ones( -# [1, 100], -# dtype=torch.float32, -# device=DEVICE, -# ), -# torch.ones([1, 100], dtype=torch.float32, device=DEVICE), -# torch.ones([1, 100], dtype=torch.float32, device=DEVICE), -# torch.ones([1, 100, 300], dtype=torch.float32, device=DEVICE), -# torch.ones([1, 100, 100], dtype=torch.float32, device=DEVICE), -# ] - -# self.model.to(DEVICE) -# self.model.eval() -# result = self.model(input_tensors) - -# self.assertEqual(type(result), SenticGCNModelOutput) -# self.assertEqual(type(result.logits), torch.Tensor) -# self.assertEqual(result.logits.shape, torch.Size([1, 3])) +class TestSenticGCNModel(unittest.TestCase): + def 
setUp(self) -> None: + config = SenticGCNConfig() + self.model = SenticGCNModel(config=config) + + def test_pretrained_model_base_class(self): + self.assertTrue(issubclass(self.model.__class__, PreTrainedModel)) + + def test_config_class(self): + self.assertEqual(self.model.config_class, SenticGCNConfig) + + def test_base_model_prefix(self): + self.assertEqual(self.model.base_model_prefix, "senticgcn") + + def test_forward_pass(self): + text_indices = torch.zeros( + [1, 10], + dtype=torch.float32, + device=DEVICE, + ) + for i in range(0, 3): + text_indices[0][i] = 1 + + aspect_indices = torch.zeros([1, 10], dtype=torch.float32, device=DEVICE) + aspect_indices[0][0] = 1 + + left_indices = torch.zeros([1, 10], dtype=torch.float32, device=DEVICE) + left_indices[0][0] = 1 + left_indices[0][1] = 1 + + input_tensors = [ + text_indices, + aspect_indices, + left_indices, + torch.zeros([1, 10, 300], dtype=torch.float32, device=DEVICE), + torch.zeros([1, 3, 3], dtype=torch.float32, device=DEVICE), + ] + + self.model.to(DEVICE) + self.model.eval() + result = self.model(input_tensors) + + self.assertEqual(type(result), SenticGCNModelOutput) + self.assertEqual(type(result.logits), torch.Tensor) + self.assertEqual(result.logits.shape, torch.Size([1, 3])) class TestSenticGCNBertModel(unittest.TestCase): From 19ec4380631e7eb47920d1eda18d52381733cb3a Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 12 Jan 2022 22:40:13 +0800 Subject: [PATCH 158/201] [#41] add draft implementation of eval script --- .../config/senticnet_gcn_bert_config.json | 15 +- .../config/senticnet_gcn_config.json | 15 +- sgnlp/models/sentic_gcn/data_class.py | 15 ++ sgnlp/models/sentic_gcn/eval.py | 129 ++++++++++++++++++ 4 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 sgnlp/models/sentic_gcn/eval.py diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json index 7373998..68c778f 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json @@ -36,5 +36,18 @@ "device": "cuda", "repeats": 10, "patience": 5, - "max_len": 85 + "max_len": 85, + + "eval_args": { + "model": "senticgcn", + "tokenizer": "senticgcn", + "embedding_model": "senticgcn", + "config_filename": "config.json", + "model_filename": "pytorch_model.bin", + "test_filename": "./datasets/semeval14/restaurant_test.raw", + "result_folder": "./eval_result/", + "eval_batch_size": 16, + "seed": 776, + "device": "cpu" + } } \ No newline at end of file diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json index 4c8393d..71cc2dd 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json @@ -42,5 +42,18 @@ "device": "cuda", "repeats": 10, "patience": 5, - "max_len": 85 + "max_len": 85, + + "eval_args": { + "model": "senticgcn", + "tokenizer": "senticgcn", + "embedding_model": "senticgcn", + "config_filename": "config.json", + "model_filename": "pytorch_model.bin", + "test_filename": "./datasets/semeval14/restaurant_test.raw", + "result_folder": "./eval_result/", + "eval_batch_size": 16, + "seed": 776, + "device": "cpu" + } } \ No newline at end of file diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index d836711..aa8b8f5 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ 
b/sgnlp/models/sentic_gcn/data_class.py @@ -1,4 +1,5 @@ from dataclasses import dataclass, field +from typing import Any, Dict @dataclass @@ -157,6 +158,20 @@ class SenticGCNTrainArgs: default=5, metadata={"help": "Number of train epoch without improvements prior to early stopping."} ) max_len: int = field(default=85, metadata={"help": "Max length to pad for bert tokenizer."}) + eval_args: Dict[str, Any] = field( + default_factory=lambda: { + "model": "senticgcn", + "tokenizer": "senticgcn", + "embedding_model": "senticgcn", + "config_filename": "config.json", + "model_filename": "pytorch_model.bin", + "test_filename": "", + "result_folder": "", + "eval_batch_size": 16, + "seed": 776, + "device": "cpu", + } + ) def __post_init__(self): # Model diff --git a/sgnlp/models/sentic_gcn/eval.py b/sgnlp/models/sentic_gcn/eval.py new file mode 100644 index 0000000..9a3e724 --- /dev/null +++ b/sgnlp/models/sentic_gcn/eval.py @@ -0,0 +1,129 @@ +import logging +import pathlib +import shutil +import tempfile +import urllib +from typing import Tuple, Union + +import torch +from sklearn.metrics import f1_score +from torch.utils.data import DataLoader + +from .data_class import SenticGCNTrainArgs +from .config import SenticGCNBertConfig, SenticGCNConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig +from .modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel, SenticGCNModel, SenticGCNBertModel +from .tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer +from .utils import BucketIterator, parse_args_and_load_config, download_tokenizer_files, set_random_seed + + +logging.basicConfig(level=logging.DEBUG) + + +class SenticGCNBaseEvaluator: + def __init__(self, config: SenticGCNTrainArgs) -> None: + self.config = config["eval_args"] + self.data_cols = config.data_cols + self.device = ( + torch.device("cuda" if torch.cuda.is_available() else "cpu") + if not self.config["device"] + else torch.device(config[self.config["device"]]) + ) + + def _create_tokenizer( + self, tokenizer_class: Union[SenticGCNTokenizer, SenticGCNBertTokenizer] + ) -> Union[SenticGCNTokenizer, SenticGCNBertTokenizer]: + if self.config["tokenizer"].startswith("https://") or self.config["tokenizer"].startswith("http://"): + with tempfile.TemporaryDirectory() as tmpdir: + temp_dir = pathlib.Path(tmpdir) + download_tokenizer_files(self.config["tokenizer"], temp_dir) + tokenizer_ = tokenizer_class.from_pretrained(temp_dir) + shutil.rmtree(temp_dir, ignore_errors=True) + else: + tokenizer_ = tokenizer_class.from_pretrained(self.config["tokenizer"]) + return tokenizer_ + + def _create_model( + self, + model_name_path_or_folder: str, + embedding_config_class: Union[SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig], + embedding_model_class: Union[SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel], + ) -> Union[SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]: + if model_name_path_or_folder.startswith("https://") or model_name_path_or_folder.startswith("http://"): + config_url = urllib.parse.urljoin(model_name_path_or_folder, self.config["config_filename"]) + model_url = urllib.parse.urljoin(model_name_path_or_folder, self.config["model_filename"]) + embed_config = embedding_config_class.from_pretrained(config_url) + embed_model = embedding_model_class.from_pretrained(model_url, config=embed_config) + else: + # Load from local folder + embed_model_name = pathlib.Path(model_name_path_or_folder) + if embed_model_name.is_dir(): + config_path = embed_model_name.joinpath(self.config["config_filename"]) + 
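# Note: both the cloud and local branches assume the standard save_pretrained layout, + # i.e. a config file (default config.json) stored alongside the weights file + # (default pytorch_model.bin), with both filenames taken from eval_args.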
model_path = embed_model_name.joinpath(self.config["model_filename"]) + embed_config = embedding_config_class.from_pretrained(config_path) + embed_model = embedding_model_class.from_pretrained(model_path, config=embed_config) + else: + # Load from HuggingFace model repository + embed_config = embedding_config_class.from_pretrained(model_name_path_or_folder) + embed_model = embedding_model_class.from_pretrained(model_name_path_or_folder, config=embed_config) + return embed_model + + def _evaluate_acc_f1(self, dataloader: Union[DataLoader, BucketIterator]) -> Tuple[float, float]: + self.model.eval() + n_correct, n_total = 0, 0 + t_targets_all, t_outputs_all = None, None + with torch.no_grad(): + for _, t_batch in enumerate(dataloader): + # Generate embeddings + t_batch["text_embeddings"] = self._generate_embeddings(t_batch) + # Prepare input data and targets + t_inputs = [t_batch[col].to(self.device) for col in self.data_cols] + t_targets = t_batch["polarity"].to(self.device) + # Inference + t_outputs = self.model(t_inputs) + # Count correct predictions for accuracy + n_correct += (torch.argmax(t_outputs.logits, -1) == t_targets).sum().item() + n_total += len(t_outputs.logits) + + if t_targets_all is None: + t_targets_all = t_targets + t_outputs_all = t_outputs.logits + else: + t_targets_all = torch.cat((t_targets_all, t_targets), dim=0) + t_outputs_all = torch.cat((t_outputs_all, t_outputs.logits), dim=0) + test_acc = n_correct / n_total + f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro") + return test_acc, f1 + + +class SenticGCNEvaluator(SenticGCNBaseEvaluator): + def __init__(self, config: SenticGCNTrainArgs) -> None: + super().__init__(config) + self.tokenizer = self._create_tokenizer(SenticGCNTokenizer) + self.embedding_model = self._create_model( + config.eval_args["embedding_model"], SenticGCNEmbeddingConfig, SenticGCNEmbeddingModel + ) + self.model = self._create_model(config.eval_args["model"], SenticGCNConfig, SenticGCNModel) + + def evaluate(self): + pass + + +class SenticGCNBertEvaluator(SenticGCNBaseEvaluator): + def __init__(self, config: SenticGCNTrainArgs) -> None: + super().__init__(config) + self.tokenizer = self._create_tokenizer(SenticGCNBertTokenizer) + self.embedding_model = self._create_model( + config.eval_args["embedding_model"], SenticGCNBertEmbeddingConfig, SenticGCNBertEmbeddingModel + ) + self.model = self._create_model(config.eval_args["model"], SenticGCNBertConfig, SenticGCNBertModel) + + def evaluate(self): + pass + + +if __name__ == "__main__": + cfg = parse_args_and_load_config() + if cfg.eval_args["seed"] is not None: + set_random_seed(cfg.eval_args["seed"]) + evaluator = SenticGCNEvaluator(cfg) if cfg.model == "senticgcn" else SenticGCNBertEvaluator(cfg) + evaluator.evaluate() From 103d0dba22fc340d5c452b94ef598797ba7c132b Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Wed, 12 Jan 2022 22:48:49 +0800 Subject: [PATCH 159/201] [#41] update config defaults for eval --- .../models/sentic_gcn/config/senticnet_gcn_bert_config.json | 6 +++--- sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json index 68c778f..1d1bee7 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json @@ -39,9 +39,9 @@ "max_len": 85, "eval_args": { "model": "senticgcn",
- "tokenizer": "senticgcn", - "embedding_model": "senticgcn", + "model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/", + "tokenizer": "bert-base-uncased", + "embedding_model": "bert-base-uncased", "config_filename": "config.json", "model_filename": "pytorch_model.bin", "test_filename": "./datasets/semeval14/restaurant_test.raw", diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json index 71cc2dd..2be6295 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json @@ -45,9 +45,9 @@ "max_len": 85, "eval_args": { - "model": "senticgcn", - "tokenizer": "senticgcn", - "embedding_model": "senticgcn", + "model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/", + "tokenizer": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/", + "embedding_model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/", "config_filename": "config.json", "model_filename": "pytorch_model.bin", "test_filename": "./datasets/semeval14/restaurant_test.raw", From 87b4799db08a48852d7741bff4722228ec532609 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Thu, 13 Jan 2022 01:40:45 +0800 Subject: [PATCH 160/201] [#43] Update app.run(), remove debug mode --- demo_api/sentic_gcn/api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index 21910e7..e2f284c 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -44,5 +44,4 @@ def predict(): if __name__ == "__main__": - # app.run() - app.run(host="0.0.0.0", debug=True, port=8000) \ No newline at end of file + app.run() \ No newline at end of file From 43be45b8a5b4b02a829e021e8da2753b21f7b566 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Thu, 13 Jan 2022 01:41:40 +0800 Subject: [PATCH 161/201] [#43] Remove extra flask import --- demo_api/sentic_gcn/api.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index e2f284c..e20ab8a 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -8,8 +8,6 @@ SenticGCNBertPostprocessor ) -from flask import request - app = create_api(app_name=__name__, model_card_path="model_card/sentic_gcn.json") preprocessor = SenticGCNBertPreprocessor( From 953875e43e36ca6a5837945044c709abef483428 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 13 Jan 2022 10:37:38 +0800 Subject: [PATCH 162/201] [#41] complete eval script --- sgnlp/models/sentic_gcn/__init__.py | 1 + .../config/senticnet_gcn_bert_config.json | 5 +- .../config/senticnet_gcn_config.json | 5 +- sgnlp/models/sentic_gcn/data_class.py | 5 +- sgnlp/models/sentic_gcn/eval.py | 185 ++++++++++++++++-- sgnlp/models/sentic_gcn/utils.py | 60 +++++- 6 files changed, 231 insertions(+), 30 deletions(-) diff --git a/sgnlp/models/sentic_gcn/__init__.py b/sgnlp/models/sentic_gcn/__init__.py index 200d745..1e7cdfa 100644 --- a/sgnlp/models/sentic_gcn/__init__.py +++ b/sgnlp/models/sentic_gcn/__init__.py @@ -1,5 +1,6 @@ from .config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig from .data_class import SenticGCNTrainArgs +from .eval import SenticGCNEvaluator, SenticGCNBaseEvaluator from .modeling import SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel from .preprocess import SenticGCNPreprocessor, 
SenticGCNBertPreprocessor from .postprocess import SenticGCNPostprocessor, SenticGCNBertPostprocessor diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json index 1d1bee7..4c15123 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json @@ -39,12 +39,15 @@ "max_len": 85, "eval_args": { - "model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/", + "model": "senticgcnbert", + "model_path": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/", "tokenizer": "bert-base-uncased", "embedding_model": "bert-base-uncased", "config_filename": "config.json", "model_filename": "pytorch_model.bin", "test_filename": "./datasets/semeval14/restaurant_test.raw", + "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", "result_folder": "./eval_result/", "eval_batch_size": 16, "seed": 776, diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json index 2be6295..e365be2 100644 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json +++ b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json @@ -45,12 +45,15 @@ "max_len": 85, "eval_args": { - "model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/", + "model": "senticgcn", + "model_path": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/", "tokenizer": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/", "embedding_model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/", "config_filename": "config.json", "model_filename": "pytorch_model.bin", "test_filename": "./datasets/semeval14/restaurant_test.raw", + "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", "result_folder": "./eval_result/", "eval_batch_size": 16, "seed": 776, diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py index aa8b8f5..dbdda62 100644 --- a/sgnlp/models/sentic_gcn/data_class.py +++ b/sgnlp/models/sentic_gcn/data_class.py @@ -161,12 +161,15 @@ class SenticGCNTrainArgs: eval_args: Dict[str, Any] = field( default_factory=lambda: { "model": "senticgcn", + "model_path": "", "tokenizer": "senticgcn", "embedding_model": "senticgcn", "config_filename": "config.json", "model_filename": "pytorch_model.bin", "test_filename": "", - "result_folder": "", + "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "result_folder": "./eval_result/", "eval_batch_size": 16, "seed": 776, "device": "cpu", diff --git a/sgnlp/models/sentic_gcn/eval.py b/sgnlp/models/sentic_gcn/eval.py index 9a3e724..fb14ec1 100644 --- a/sgnlp/models/sentic_gcn/eval.py +++ b/sgnlp/models/sentic_gcn/eval.py @@ -1,9 +1,10 @@ +import datetime import logging import pathlib import shutil import tempfile import urllib -from typing import Tuple, Union +from typing import List, Tuple, Union import torch from sklearn.metrics import f1_score @@ -13,25 +14,46 @@ from .config import SenticGCNBertConfig, SenticGCNConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig from .modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel, SenticGCNModel, SenticGCNBertModel from .tokenization 
import SenticGCNTokenizer, SenticGCNBertTokenizer -from .utils import BucketIterator, parse_args_and_load_config, download_tokenizer_files, set_random_seed +from .utils import ( + SenticGCNDatasetGenerator, + BucketIterator, + parse_args_and_load_config, + download_tokenizer_files, + set_random_seed, +) logging.basicConfig(level=logging.DEBUG) class SenticGCNBaseEvaluator: + """ + Base Evaluator class used for evaluating SenticGCNModel and SenticGCNBertModel + """ + def __init__(self, config: SenticGCNTrainArgs) -> None: - self.config = config["eval_args"] + self.config = config.eval_args self.data_cols = config.data_cols self.device = ( torch.device("cuda" if torch.cuda.is_available() else "cpu") if not self.config["device"] - else torch.device(config[self.config["device"]]) + else torch.device(self.config["device"]) ) def _create_tokenizer( self, tokenizer_class: Union[SenticGCNTokenizer, SenticGCNBertTokenizer] ) -> Union[SenticGCNTokenizer, SenticGCNBertTokenizer]: + """ + Private method to construct tokenizer. + Tokenizer can be created via download from cloud storage, from local storage + or from HuggingFace repository. + + Args: + tokenizer_class (Union[SenticGCNTokenizer, SenticGCNBertTokenizer]): tokenizer class type to create. + + Returns: + Union[SenticGCNTokenizer, SenticGCNBertTokenizer]: return the tokenizer class instance. + """ if self.config["tokenizer"].startswith("https://") or self.config["tokenizer"].startswith("http://"): with tempfile.TemporaryDirectory() as tmpdir: temp_dir = pathlib.Path(tmpdir) @@ -45,29 +67,56 @@ def _create_tokenizer( def _create_model( self, model_name_path_or_folder: str, - embedding_config_class: Union[SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig], - embedding_model_class: Union[SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel], - ) -> Union[SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]: + config_class: Union[ + SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig + ], + model_class: Union[SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel], + ) -> Union[SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]: + """ + Private method to construct models and embedding models. + Model can be created via download from cloud storage via from_pretrained method, from local storage + or from HuggingFace repository. + + Args: + model_name_path_or_folder (str): cloud or local storage path to model files + config_class (Union[SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig]): + config class type + model_class (Union[SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]): + model class type + + Returns: + Union[SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]: + return model instance. 
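+ + Example (illustrative call, using the default cloud model_path from the eval config): + model = self._create_model( + "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/", + SenticGCNConfig, + SenticGCNModel, + )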
+ """ if model_name_path_or_folder.startswith("https://") or model_name_path_or_folder.startswith("http://"): config_url = urllib.parse.urljoin(model_name_path_or_folder, self.config["config_filename"]) model_url = urllib.parse.urljoin(model_name_path_or_folder, self.config["model_filename"]) - embed_config = embedding_config_class.from_pretrained(config_url) - embed_model = embedding_model_class.from_pretrained(model_url, config=embed_config) + config = config_class.from_pretrained(config_url) + model = model_class.from_pretrained(model_url, config=config) else: # Load from local folder embed_model_name = pathlib.Path(model_name_path_or_folder) if embed_model_name.is_dir(): config_path = embed_model_name.joinpath(self.config["config_filename"]) model_path = embed_model_name.joinpath(self.config["model_filename"]) - embed_config = embedding_config_class.from_pretrained(config_path) - embed_model = embedding_model_class.from_pretrained(model_path, config=embed_config) + config = config_class.from_pretrained(config_path) + model = model_class.from_pretrained(model_path, config=config) else: # Load from HuggingFace model repository - embed_config = embedding_config_class.from_pretrained(model_name_path_or_folder) - embed_model = embedding_model_class.from_pretrained(model_name_path_or_folder, config=embed_config) - return embed_model + config = config_class.from_pretrained(model_name_path_or_folder) + model = model_class.from_pretrained(model_name_path_or_folder, config=config) + return model def _evaluate_acc_f1(self, dataloader: Union[DataLoader, BucketIterator]) -> Tuple[float, float]: + """ + Private helper method to evaluate accuracy and f1 score. + + Args: + dataloader (DataLoader): input val and test dataloader + + Returns: + Tuple[float, float]: return acc and f1 score + """ self.model.eval() n_correct, n_total = 0, 0 t_targets_all, t_outputs_all = None, None @@ -94,36 +143,130 @@ def _evaluate_acc_f1(self, dataloader: Union[DataLoader, BucketIterator]) -> Tup f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro") return test_acc, f1 + def _save_results_to_file(self, acc_f1: List[str]) -> None: + """ + Private method to save acc and f1 results to file. + + Args: + acc_f1 (List[str]): list containing acc and f1 results + """ + results = [ + f"Model: {self.config['model']}\n", + f"Batch Size: {self.config['eval_batch_size']}\n", + f"Random Seed: {self.config['seed']}\n", + ] + results = [*results, *acc_f1] + results_folder = pathlib.Path(self.config["result_folder"]) + results_folder.mkdir(exist_ok=True) + results_file = results_folder.joinpath( + f"{self.config['model']}_{datetime.datetime.now().strftime('%d-%m-%y_%H-%M-%S')}_results.txt" + ) + with open(results_file, "a") as f: + f.writelines(results) + class SenticGCNEvaluator(SenticGCNBaseEvaluator): + """ + Evaluator class derived from SenticGCNBaseEvaluator. 
+ + Args: + config (SenticGCNTrainArgs): Config for SenticGCNModel + """ + def __init__(self, config: SenticGCNTrainArgs) -> None: super().__init__(config) self.tokenizer = self._create_tokenizer(SenticGCNTokenizer) self.embedding_model = self._create_model( config.eval_args["embedding_model"], SenticGCNEmbeddingConfig, SenticGCNEmbeddingModel ) - self.model = self._create_model(config.eval_args["model"], SenticGCNConfig, SenticGCNModel) + self.model = self._create_model(config.eval_args["model_path"], SenticGCNConfig, SenticGCNModel) + data_gen = SenticGCNDatasetGenerator(config, self.tokenizer, "test") + self.raw_data = data_gen.generate_test_datasets() + del data_gen - def evaluate(self): - pass + def _generate_embeddings(self, batch: List[torch.Tensor]) -> torch.Tensor: + """ + Private helper method to generate embeddings. + + Args: + batch (List[torch.Tensor]): a batch of test dataset + + Returns: + torch.Tensor: return embedding tensor + """ + text_indices = batch["text_indices"].to(self.device) + return self.embedding_model(text_indices) + + def evaluate(self) -> None: + """ + Main evaluate method. + """ + # Generate dataloaders + test_dataloader = BucketIterator(self.raw_data, batch_size=self.config["eval_batch_size"], shuffle=False) + # Evaluate Acc and F1 + acc, f1 = self._evaluate_acc_f1(test_dataloader) + logging.info(f"Evaluate Results -> Acc: {acc}, F1: {f1}") + # Save results + acc_f1 = [f"Acc: {acc}\n", f"F1: {f1}\n"] + self._save_results_to_file(acc_f1) + + logging.info("Evaluation Complete!") class SenticGCNBertEvaluator(SenticGCNBaseEvaluator): + """ + Evaluator class derived from SenticGCNBaseEvaluator. + + Args: + config (SenticGCNTrainArgs): Config for SenticGCNModel + """ + def __init__(self, config: SenticGCNTrainArgs) -> None: super().__init__(config) self.tokenizer = self._create_tokenizer(SenticGCNBertTokenizer) self.embedding_model = self._create_model( config.eval_args["embedding_model"], SenticGCNBertEmbeddingConfig, SenticGCNBertEmbeddingModel ) - self.model = self._create_model(config.eval_args["model"], SenticGCNBertConfig, SenticGCNBertModel) + self.model = self._create_model(config.eval_args["model_path"], SenticGCNBertConfig, SenticGCNBertModel) + data_gen = SenticGCNDatasetGenerator(config, self.tokenizer, "test") + self.raw_data = data_gen.generate_test_datasets() + del data_gen + + def _generate_embeddings(self, batch: List[torch.Tensor]) -> torch.Tensor: + """ + Private helper method to generate embeddings. + + Args: + batch (List[torch.Tensor]): a batch of test dataset + + Returns: + torch.Tensor: return embedding tensor + """ + text_bert_indices = batch["text_bert_indices"].to(self.device) + bert_segment_indices = batch["bert_segment_indices"].to(self.device) + + return self.embedding_model(text_bert_indices, token_type_ids=bert_segment_indices)["last_hidden_state"] + + def evaluate(self) -> None: + """ + Main evaluate method.
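Runs the test set through a DataLoader, computes accuracy and macro-F1 via _evaluate_acc_f1, then appends the scores to a timestamped results file.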
+ """ + # Generate dataloaders + test_dataloader = DataLoader(self.raw_data, batch_size=self.config["eval_batch_size"], shuffle=False) + # Evaluate Acc and F1 + acc, f1 = self._evaluate_acc_f1(test_dataloader) + logging.info(f"Evaluate Results -> Acc: {acc}, F1: {f1}") + # Save results + acc_f1 = [f"Acc: {acc}\n", f"F1: {f1}\n"] + self._save_results_to_file(acc_f1) - def evaluate(self): - pass + logging.info("Evaluation Complete!") if __name__ == "__main__": cfg = parse_args_and_load_config() if cfg.eval_args["seed"] is not None: set_random_seed(cfg.eval_args["seed"]) - evaluator = SenticGCNEvaluator(cfg) if cfg.model == "senticgcn" else SenticGCNBertEvaluator(cfg) + evaluator = SenticGCNEvaluator(cfg) if cfg.eval_args["model"] == "senticgcn" else SenticGCNBertEvaluator(cfg) + logging.info(f"Evaluating {cfg.eval_args['model']}") evaluator.evaluate() diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py index 264dc13..2ed1dac 100644 --- a/sgnlp/models/sentic_gcn/utils.py +++ b/sgnlp/models/sentic_gcn/utils.py @@ -7,6 +7,8 @@ import requests import urllib import math +import tempfile +import shutil from typing import Dict, List, Tuple, Union import numpy as np @@ -276,18 +278,50 @@ def __len__(self): class SenticGCNDatasetGenerator: """ Main dataset generator class to preprocess raw dataset file. + Set mode to 'train' to generate dataset for training. + Set mode to 'test' to generate dataset for training from eval_args. """ - def __init__(self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer) -> None: + def __init__(self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer, mode: str = "train") -> None: self.config = config - self.senticnet = load_and_process_senticnet( - config.senticnet_word_file_path, - config.save_preprocessed_senticnet, - config.saved_preprocessed_senticnet_file_path, + self.senticnet = self._load_senticnet(mode) + self.spacy_pipeline = spacy.load( + config.spacy_pipeline if mode == "train" else config.eval_args["spacy_pipeline"] ) - self.spacy_pipeline = spacy.load(config.spacy_pipeline) self.tokenizer = tokenizer + def _load_senticnet(self, mode: str) -> Dict[str, float]: + if mode == "train": + senticnet_ = load_and_process_senticnet( + self.config.senticnet_word_file_path, + self.config.save_preprocessed_senticnet, + self.config.saved_preprocessed_senticnet_file_path, + ) + else: + if self.config.eval_args["senticnet"].startswith("https://") or self.config.eval_args[ + "senticnet" + ].startswith("http://"): + with tempfile.TemporaryDirectory() as tmpdir: + temp_dir = pathlib.Path(tmpdir) + download_url_file(self.config.eval_args["senticnet"], temp_dir) + saved_path = temp_dir.joinpath("senticnet.pickle") + senticnet_ = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=saved_path) + shutil.rmtree(temp_dir, ignore_errors=True) + elif self.config.eval_args["senticnet"].endswith(".pkl") or self.config.eval_args["senticnet"].endswith( + ".pickle" + ): + senticnet_ = load_and_process_senticnet( + saved_preprocessed_senticnet_file_path=self.config.eval_args["senticnet"] + ) + else: + raise ValueError( + """ + Error initializing SenticNet! + Please only provide url to pickle file cloud storage location or local file path. + """ + ) + return senticnet_ + def _read_raw_dataset(self, dataset_type: str) -> List[str]: """ Private helper method to read raw dataset files based on requested type (e.g. Train or Test). 
@@ -478,6 +512,20 @@ def generate_datasets(self) -> Tuple[SenticGCNDataset, SenticGCNDataset, SenticG val_data = test_data return SenticGCNDataset(train_data), SenticGCNDataset(val_data), SenticGCNDataset(test_data) + def generate_test_datasets(self) -> SenticGCNDataset: + """ + Main wrapper method to generate test datasets for both SenticGCN and SenticGCNBert based on eval config. + + Returns: + SenticGCNDataset: return SenticGCNDataset instance for test datasets + """ + raw_data = self._read_raw_dataset(self.config.eval_args["test_filename"]) + if self.config.eval_args["model"] == "senticgcn": + test_data = self._generate_senticgcn_dataset(raw_data) + else: + test_data = self._generate_senticgcnbert_dataset(raw_data) + return SenticGCNDataset(test_data) + class BucketIterator: """ From 7fb41ae487526ea59ca6286610f70984cd204509 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 13 Jan 2022 11:16:22 +0800 Subject: [PATCH 163/201] [#41] add unit tests for SenticGCN and SenticGCNBert evaluator --- .../sentic_gcn/test_sentic_gcn_train_eval.py | 97 ++++++++++++++++++- 1 file changed, 93 insertions(+), 4 deletions(-) diff --git a/tests/sentic_gcn/test_sentic_gcn_train_eval.py b/tests/sentic_gcn/test_sentic_gcn_train_eval.py index a7f39e4..31d1c8f 100644 --- a/tests/sentic_gcn/test_sentic_gcn_train_eval.py +++ b/tests/sentic_gcn/test_sentic_gcn_train_eval.py @@ -6,14 +6,15 @@ import unittest from sgnlp.models.sentic_gcn.data_class import SenticGCNTrainArgs +from sgnlp.models.sentic_gcn.eval import SenticGCNEvaluator, SenticGCNBertEvaluator from sgnlp.models.sentic_gcn.train import SenticGCNTrainer, SenticGCNBertTrainer PARENT_DIR = str(pathlib.Path(__file__).parent) -def find_result_file(path): +def find_result_file(path: str, extension: str): for p in pathlib.Path(path).iterdir(): - if p.is_file() and p.suffix == ".pkl": + if p.is_file() and p.suffix == extension: yield p.resolve() @@ -75,7 +76,7 @@ def test_train(self): trainer = SenticGCNTrainer(self.cfg) trainer.train() - result_file = list(find_result_file(self.results_save_folder))[0] + result_file = list(find_result_file(self.results_save_folder, ".pkl"))[0] with open(result_file, "rb") as f: results = pickle.load(f) @@ -147,7 +148,7 @@ def test_train(self): trainer = SenticGCNBertTrainer(self.cfg) trainer.train() - result_file = list(find_result_file(self.results_save_folder))[0] + result_file = list(find_result_file(self.results_save_folder, ".pkl"))[0] with open(result_file, "rb") as f: results = pickle.load(f) @@ -165,3 +166,91 @@ def test_train(self): model_filepath = self.model_save_folder.joinpath("pytorch_model.bin") self.assertTrue(config_filepath.is_file()) self.assertTrue(model_filepath.is_file()) + + +class TestSenticGCNEvaluateTestCase(unittest.TestCase): + def setUp(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + self.results_save_folder = pathlib.Path(tmpdir) + + cfg = { + "eval_args": { + "model": "senticgcn", + "model_path": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/", + "tokenizer": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/", + "embedding_model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/", + "config_filename": "config.json", + "model_filename": "pytorch_model.bin", + "test_filename": PARENT_DIR + "/test_data/test_test.raw", + "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "result_folder": str(self.results_save_folder), + 
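# Batch size, seed and device are pinned so this slow end-to-end evaluation test stays reproducible on CPU.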
"eval_batch_size": 16, + "seed": 776, + "device": "cpu", + } + } + self.cfg = SenticGCNTrainArgs(**cfg) + + def tearDown(self) -> None: + shutil.rmtree(self.results_save_folder, ignore_errors=True) + + @pytest.mark.slow + def test_evaluate(self): + evaluator = SenticGCNEvaluator(self.cfg) + evaluator.evaluate() + + result_file = list(find_result_file(self.results_save_folder, ".txt"))[0] + with open(result_file, "r") as f: + results = f.readlines() + + self.assertEqual(len(results), 5) + self.assertTrue(results[0].startswith("Model:")) + self.assertTrue(results[1].startswith("Batch Size:")) + self.assertTrue(results[2].startswith("Random Seed:")) + self.assertTrue(results[3].startswith("Acc:")) + self.assertTrue(results[4].startswith("F1:")) + + +class TestSenticGCNBertEvaluateTestCase(unittest.TestCase): + def setUp(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + self.results_save_folder = pathlib.Path(tmpdir) + + cfg = { + "eval_args": { + "model": "senticgcnbert", + "model_path": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/", + "tokenizer": "bert-base-uncased", + "embedding_model": "bert-base-uncased", + "config_filename": "config.json", + "model_filename": "pytorch_model.bin", + "test_filename": PARENT_DIR + "/test_data/test_test.raw", + "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "result_folder": str(self.results_save_folder), + "eval_batch_size": 16, + "seed": 776, + "device": "cpu", + } + } + self.cfg = SenticGCNTrainArgs(**cfg) + + def tearDown(self) -> None: + shutil.rmtree(self.results_save_folder, ignore_errors=True) + + @pytest.mark.slow + def test_evaluate(self): + evaluator = SenticGCNBertEvaluator(self.cfg) + evaluator.evaluate() + + result_file = list(find_result_file(self.results_save_folder, ".txt"))[0] + with open(result_file, "r") as f: + results = f.readlines() + + self.assertEqual(len(results), 5) + self.assertTrue(results[0].startswith("Model:")) + self.assertTrue(results[1].startswith("Batch Size:")) + self.assertTrue(results[2].startswith("Random Seed:")) + self.assertTrue(results[3].startswith("Acc:")) + self.assertTrue(results[4].startswith("F1:")) From 4ba78e23c0d5bbfa302a50f7d25915460ee7fff3 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Thu, 13 Jan 2022 11:28:27 +0800 Subject: [PATCH 164/201] [#43] Remove extra files to prevent merge conflicts --- sgnlp/models/sentic_gcn/__init__.py | 0 sgnlp/models/sentic_gcn/config.py | 109 ---- .../config/senticnet_gcn_config.json | 44 -- sgnlp/models/sentic_gcn/data_class.py | 195 ------- sgnlp/models/sentic_gcn/modeling.py | 355 ------------ sgnlp/models/sentic_gcn/modules/__init__.py | 0 .../models/sentic_gcn/modules/dynamic_rnn.py | 88 --- sgnlp/models/sentic_gcn/modules/gcn.py | 23 - sgnlp/models/sentic_gcn/preprocess.py | 66 --- sgnlp/models/sentic_gcn/tokenization.py | 156 ----- sgnlp/models/sentic_gcn/train.py | 222 ------- sgnlp/models/sentic_gcn/utils.py | 541 ------------------ 12 files changed, 1799 deletions(-) delete mode 100644 sgnlp/models/sentic_gcn/__init__.py delete mode 100644 sgnlp/models/sentic_gcn/config.py delete mode 100644 sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json delete mode 100644 sgnlp/models/sentic_gcn/data_class.py delete mode 100644 sgnlp/models/sentic_gcn/modeling.py delete mode 100644 sgnlp/models/sentic_gcn/modules/__init__.py delete mode 100644 sgnlp/models/sentic_gcn/modules/dynamic_rnn.py delete mode 100644 sgnlp/models/sentic_gcn/modules/gcn.py 
delete mode 100644 sgnlp/models/sentic_gcn/preprocess.py delete mode 100644 sgnlp/models/sentic_gcn/tokenization.py delete mode 100644 sgnlp/models/sentic_gcn/train.py delete mode 100644 sgnlp/models/sentic_gcn/utils.py diff --git a/sgnlp/models/sentic_gcn/__init__.py b/sgnlp/models/sentic_gcn/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sgnlp/models/sentic_gcn/config.py b/sgnlp/models/sentic_gcn/config.py deleted file mode 100644 index cee9948..0000000 --- a/sgnlp/models/sentic_gcn/config.py +++ /dev/null @@ -1,109 +0,0 @@ -from transformers import PretrainedConfig, BertConfig - - -class SenticGCNConfig(PretrainedConfig): - """ - This is the configuration class to store the configuration of a - :class:`~sgnlp.models.sentic_gcn.modeling.SenticGCNModel`. - It is used to instantiate a SenticGCNModel network according to the specific arguments, defining the model architecture. - - Args: - embed_dim (:obj:`int`, defaults to 300): Embedding dimension size. - hidden_dim (:obj:`int`, defaults to 300): Size of hidden dimension. - dropout (:obj:`float`, defaults to 0.3): Droput percentage. - polarities_dim (:obj:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral). - device (:obj:`str`, defaults to 'cuda`): Type of torch device. - loss_function (:obj:`str`, defaults to 'cross_entropy'): Loss function for training/eval. - - Example: - - from sgnlp.models.sentic_gcn import SenticGCNConfig - - # Initialize with default values - config = SenticGCNConfig() - """ - - def __init__( - self, - embed_dim: int = 300, - hidden_dim: int = 300, - polarities_dim: int = 3, - dropout: float = 0.3, - device: str = "cuda", - loss_function: str = "cross_entropy", - **kwargs - ) -> None: - super().__init__(**kwargs) - self.embed_dim = embed_dim - self.hidden_dim = hidden_dim - self.dropout = dropout - self.polarities_dim = polarities_dim - self.device = device - self.loss_function = loss_function - - -class SenticGCNBertConfig(PretrainedConfig): - """ - This is the configuration class to store the configuration of a :class:`~sgnlp.models.sentic_gcn.modeling.SenticBertGCNModel`. - It is used to instantiate a SenticBertGCNModel network according to the specific arguments, defining the model architecture. - - Args: - hidden_dim (:obj:`int`, defaults to 768): The embedding dimension size for the Bert model as well as GCN dimension. - max_seq_len (:obj:`int`, defaults to 85): The max sequence length to pad and truncate. - dropout (:obj:`float`, defaults to 0.3): Dropout percentage. - polarities_dim (:ob:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral). - device (:obj:`str`, defaults to 'cuda'): Type of torch device. - loss_function (:obj:`str`, defaults to 'cross_entropy'): Loss function for training/eval. 
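Both configuration classes above subclass :obj:`PretrainedConfig`, so they inherit its full save/load machinery. A minimal round-trip sketch, assuming only that the transformers package is installed and the class is importable as shown in the docstring examples:

from sgnlp.models.sentic_gcn import SenticGCNBertConfig

# Override a few defaults; everything else keeps the values defined in __init__.
config = SenticGCNBertConfig(dropout=0.5, device="cpu")

# save_pretrained/from_pretrained are inherited from PretrainedConfig.
config.save_pretrained("senticgcn_bert_config")  # writes config.json to this folder
reloaded = SenticGCNBertConfig.from_pretrained("senticgcn_bert_config")
assert reloaded.dropout == 0.5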
- Example: - - from sgnlp.models.sentic_gcn import SenticGCNBertConfig - - # Initialize with default values - config = SenticGCNBertConfig() - """ - - def __init__( - self, - hidden_dim: int = 768, - max_seq_len: int = 85, - polarities_dim: int = 3, - dropout: float = 0.3, - device: str = "cuda", - loss_function: str = "cross_entropy", - **kwargs - ) -> None: - super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.max_seq_len = max_seq_len - self.dropout = dropout - self.polarities_dim = polarities_dim - self.device = device - self.loss_function = loss_function - - -class SenticGCNEmbeddingConfig(PretrainedConfig): - """ - This is the configuration class to store the configuration of a :class:`~SenticGCNEmbeddingModel`. - It is used to instantiate a SenticGCN Embedding model according to the specified arguments, defining the model architecture. - - Args: - PretrainedConfig (:obj:`PretrainedConfig`): transformer :obj:`PretrainedConfig` base class - """ - - def __init__(self, vocab_size: int = 17662, embed_dim: int = 300, **kwargs) -> None: - super().__init__(**kwargs) - self.vocab_size = vocab_size - self.embed_dim = embed_dim - - -class SenticGCNBertEmbeddingConfig(BertConfig): - """ - This is the configuration class to store the configuration of a :class:`~SenticGCNBertEmbeddingModel`. - It is used to instantiate a SenticGCN Bert Embedding model according to the specified arguments, defining the model architecture. - - Args: - BertConfig (:obj:`BertConfig`): transformer :obj:`BertConfig` base class - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json deleted file mode 100644 index 1d7a83f..0000000 --- a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "senticnet_word_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/senticNet/senticnet_word.txt", - "save_preprocessed_senticnet": true, - "saved_preprocessed_senticnet_file_path": "senticnet/senticnet.pickle", - "spacy_pipeline": "en_core_web_sm", - "word_vec_file_path": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/glove/glove.840B.300d.txt", - - "dataset_train": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_train.raw", - "dataset_test": "/Users/raymond/work/aimakerspace_sgnlp/sgnlp/models/sentic_asgcn/datasets/semeval14/restaurant_test.raw", - "valset_ratio": 0, - - "model": "senticgcn", - "save_best_model": true, - "save_model_path": "senticgcn", - - "tokenizer": "senticgcn", - "train_tokenizer": false, - "save_tokenizer": false, - "save_tokenizer_path": "senticgcn_tokenizer", - - "embedding_model": "senticgcn_embed_model", - "build_embedding_model": false, - "save_embedding_model": false, - "save_embedding_model_path": "senticgcn_embed_model", - - "initializer": "xavier_uniform", - "optimizer": "adam", - "loss_function": "cross_entropy", - "learning_rate": 0.001, - "l2reg": 0.00001, - "epochs": 100, - "batch_size": 32, - "log_step": 5, - "embed_dim": 300, - "hidden_dim": 300, - "polarities_dim": 3, - "dropout": 0.3, - "save_results": true, - "seed": 776, - "device": "cuda", - "repeats": 10, - "patience": 5, - "max_len": 85 -} \ No newline at end of file diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py deleted file mode 100644 index 9fe058d..0000000 --- a/sgnlp/models/sentic_gcn/data_class.py +++ 
/dev/null
@@ -1,195 +0,0 @@
-from dataclasses import dataclass, field
-
-
-@dataclass
-class SenticGCNTrainArgs:
-    """
-    Data class for training config for both SenticGCNModel and SenticGCNBertModel
-    """
-
-    # External resources (e.g. Senticnet file, GloVe word vectors, etc)
-    senticnet_word_file_path: str = field(
-        default="./senticNet/senticnet_word.txt", metadata={"help": "SenticNet word file path."}
-    )
-    save_preprocessed_senticnet: bool = field(
-        default=True,
-        metadata={
-            "help": """Flag to indicate if senticnet dictionary should be saved during preprocess step.
-            If 'saved_preprocessed_senticnet_file_path' is populated and valid, it will be overwritten if flag is set to True."""
-        },
-    )
-    saved_preprocessed_senticnet_file_path: str = field(
-        default="senticnet/senticnet.pickle",
-        metadata={
-            "help": """File path to saved preprocessed senticnet. If the file exists and the 'save_preprocessed_senticnet' flag is set to False,
-            SenticNet will be loaded from file instead of generated from raw senticnet files."""
-        },
-    )
-    spacy_pipeline: str = field(
-        default="en_core_web_sm", metadata={"help": "Type of spacy pipeline to load for processor."}
-    )
-    word_vec_file_path: str = field(
-        default="glove/glove.840B.300d.txt",
-        metadata={"help": "File path to word vector."},
-    )
-
-    # Dataset specific config
-    dataset_train: str = field(
-        default="train.raw",
-        metadata={"help": "File path to train dataset."},
-    )
-    dataset_test: str = field(
-        default="test.raw",
-        metadata={"help": "File path to test dataset."},
-    )
-    valset_ratio: float = field(
-        default=0.0,
-        metadata={
-            "help": """
-            Ratio of train dataset to be split for validation.
-            If value is set to 0, test dataset is set as validation dataset as well."""
-        },
-    )
-
-    # Model specific config
-    model: str = field(default="senticgcn", metadata={"help": "Option to choose which model to train."})
-    save_best_model: bool = field(
-        default=True,
-        metadata={
-            "help": """Flag to indicate if best model should be saved during training.
-            Applies to both bert and non-bert SenticGCN models."""
-        },
-    )
-    save_model_path: str = field(
-        default="senticgcn",
-        metadata={
-            "help": """Folder path to save trained model using the save_pretrained method.
-            Applies to both bert and non-bert SenticGCN models."""
-        },
-    )
-
-    # Tokenizer specific config
-    tokenizer: str = field(
-        default="senticgcn",
-        metadata={
-            "help": """Option to choose which tokenizer to use for training preprocessing.
-            Value will be used to create tokenizer via the from_pretrained method."""
-        },
-    )
-    train_tokenizer: bool = field(
-        default=False,
-        metadata={
-            "help": """Flag to indicate if tokenizer should be trained on input dataset.
-            Only applies to non-bert SenticGCN tokenizer."""
-        },
-    )
-    save_tokenizer: bool = field(
-        default=False,
-        metadata={
-            "help": """Flag to indicate if tokenizer should be saved using the save_pretrained method.
-            Only applies to non-bert SenticGCN tokenizer."""
-        },
-    )
-    save_tokenizer_path: str = field(
-        default="senticgcn_tokenizer",
-        metadata={
-            "help": """Folder path to save pretrained tokenizer using the save_pretrained method.
-            Only applies to non-bert SenticGCN tokenizer."""
-        },
-    )
-
-    # Embedding specific config
-    embedding_model: str = field(
-        default="senticgcn",
-        metadata={
-            "help": """Option to choose which embedding model to use for training preprocessing.
-            For non-bert model, value should point to a pretrained model folder.
-            'config.json' and 'pytorch_model.bin' will be used to create the config and embedding model
-            via the from_pretrained method.
-            Ignore if 'build_embedding_model' flag is set, only affects non-bert SenticGCN embedding model.
-            For bert model, value should be model name used to download from huggingface model hub."""
-        },
-    )
-    build_embedding_model: bool = field(
-        default=False,
-        metadata={
-            "help": """Flag to indicate if embedding model should be built from input word vectors.
-            Only applies to non-bert SenticGCN embedding models.
-            Word vectors to train on are indicated in 'word_vec_file_path' config."""
-        },
-    )
-    save_embedding_model: bool = field(
-        default=False,
-        metadata={
-            "help": """Flag to indicate if embedding model should be saved using the save_pretrained method.
-            Only applies to non-bert SenticGCN embedding model."""
-        },
-    )
-    save_embedding_model_path: str = field(
-        default="senticgcn_embed_model",
-        metadata={
-            "help": """Folder path to save pretrained embedding model using the save_pretrained method.
-            Only applies to non-bert SenticGCN embedding model."""
-        },
-    )
-
-    initializer: str = field(default="xavier_uniform", metadata={"help": "Type of initializer to use."})
-    optimizer: str = field(default="adam", metadata={"help": "Type of optimizer to use."})
-    loss_function: str = field(default="cross_entropy", metadata={"help": "Loss function for training/eval."})
-    learning_rate: float = field(default=0.001, metadata={"help": "Default learning rate for training."})
-    l2reg: float = field(default=0.00001, metadata={"help": "Default l2reg value."})
-    epochs: int = field(default=100, metadata={"help": "Number of epochs to train."})
-    batch_size: int = field(default=32, metadata={"help": "Training batch size."})
-    log_step: int = field(default=5, metadata={"help": "Number of train steps to log results."})
-    embed_dim: int = field(default=300, metadata={"help": "Size of embedding."})
-    hidden_dim: int = field(default=300, metadata={"help": "Number of neurons for hidden layer."})
-    dropout: float = field(default=0.3, metadata={"help": "Default value for dropout percentages."})
-    polarities_dim: int = field(default=3, metadata={"help": "Default dimension for polarities."})
-    save_results: bool = field(default=True, metadata={"help": "Flag to indicate if results should be saved."})
-    seed: int = field(default=776, metadata={"help": "Default random seed for training."})
-    device: str = field(default="cuda", metadata={"help": "Type of compute device to use for training."})
-    repeats: int = field(default=10, metadata={"help": "Number of times to repeat train loop."})
-    patience: int = field(
-        default=5, metadata={"help": "Number of train epochs without improvement prior to early stopping."}
-    )
-    max_len: int = field(default=85, metadata={"help": "Max length to pad for bert tokenizer."})
-
-    def __post_init__(self):
-        # Model
-        assert self.model in ["senticgcn", "senticgcnbert"], "Invalid model type!"
-
-        assert self.initializer in [
-            "xavier_uniform",
-            "xavier_normal",
-            "orthogonal",
-        ], "Invalid initializer type!"
-        assert self.optimizer in [
-            "adadelta",
-            "adagrad",
-            "adam",
-            "adamax",
-            "asgd",
-            "rmsprop",
-            "sgd",
-        ], "Invalid optimizer"
-        assert self.device in ["cuda", "cpu"], "Invalid device type."
-        assert self.repeats >= 1, "Repeats value must be at least 1."
-        assert self.patience >= 1, "Patience value must be at least 1."
-        assert 0 <= self.valset_ratio < 1, "Valset_ratio must be greater than or equal to 0 and less than 1."
- assert self.max_len > 0, "Max_len must be greater than 0." - - # Assign sub dataset columns name - self.data_cols = ( - ["text_indices", "aspect_indices", "left_indices", "text_embeddings", "sdat_graph"] - if self.model == "senticgcn" - else [ - "text_indices", - "aspect_indices", - "left_indices", - "text_bert_indices", - "text_embeddings", - "sdat_graph", - ] - ) diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py deleted file mode 100644 index 14bf6e9..0000000 --- a/sgnlp/models/sentic_gcn/modeling.py +++ /dev/null @@ -1,355 +0,0 @@ -from dataclasses import dataclass -from typing import Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import PreTrainedModel, BertModel -from transformers.file_utils import ModelOutput - -from modules.dynamic_rnn import DynamicLSTM -from modules.gcn import GraphConvolution -from config import ( - SenticGCNConfig, - SenticGCNBertConfig, - SenticGCNEmbeddingConfig, - SenticGCNBertEmbeddingConfig, -) -from utils import build_embedding_matrix - - -@dataclass -class SenticGCNModelOutput(ModelOutput): - """ - Base class for outputs of SenticGCNModel. - - Args: - loss (:obj:`torch.Tensor` of shape `(1,)`, `optional`, return when :obj:`labels` is provided): - classification loss, typically cross entropy. Loss function used is dependent on what is specified in SenticGCNConfig. - logits (:obj:`torch.Tensor` of shape :obj:`(batch_size, num_classes)`): - raw logits for each class. num_classes = 3 by default. - """ - - loss: Optional[torch.Tensor] = None - logits: torch.Tensor = None - - -class SenticGCNPreTrainedModel(PreTrainedModel): - """ - The SenticGCN Pre-Trained Model used as base class for derived SenticGCN Model. - - This model is the abstract super class for the SenticGCN Model which defines the config - class types and weights initalization method. This class should not be used or instantiated directly, - see SenticGCNModel class for usage. - """ - - config_class = SenticGCNConfig - base_model_prefix = "senticgcn" - - def _init_weights(self, module: nn.Module) -> None: - pass - - -class SenticGCNModel(SenticGCNPreTrainedModel): - """ - The SenticGCN Model for aspect based sentiment analysis. - - This method inherits from :obj:`SenticGCNPreTrainedModel` for weights initalization and utility functions - from transformer :obj:`PreTrainedModel` class. - - Args: - config (:obj:`~SenticGCNConfig`): - Model configuration class with all parameters required for the model. - Initializing with a config file does not load - the weights associated with the model, only the configuration. - Use the :obj:`.from_pretrained` method to load the model weights. 
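A minimal construction sketch for the model described above; the import path mirrors the docstring examples and the from_pretrained location is a placeholder:

import torch
from sgnlp.models.sentic_gcn import SenticGCNConfig, SenticGCNModel

config = SenticGCNConfig(device="cpu")
model = SenticGCNModel(config)  # architecture only; weights are randomly initialized
model.eval()

# To load trained weights instead (placeholder path):
# model = SenticGCNModel.from_pretrained("path/to/senticgcn", config=config)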
- """ - - def __init__(self, config: SenticGCNConfig) -> None: - super().__init__(config) - self.text_lstm = DynamicLSTM( - config.embed_dim, - config.hidden_dim, - num_layers=1, - batch_first=True, - bidirectional=True, - ) - self.gc1 = GraphConvolution(2 * config.hidden_dim, 2 * config.hidden_dim) - self.gc2 = GraphConvolution(2 * config.hidden_dim, 2 * config.hidden_dim) - self.fc = nn.Linear(2 * config.hidden_dim, config.polarities_dim) - self.text_embed_dropout = nn.Dropout(config.dropout) - self.torch_device = torch.device(config.device) - if config.loss_function == "cross_entropy": - self.loss_function = nn.CrossEntropyLoss() - - def position_weight(self, x, aspect_double_idx, text_len, aspect_len): - batch_size, seq_len = x.shape[0], x.shape[1] - aspect_double_idx = aspect_double_idx.cpu().numpy() - text_len = text_len.cpu().numpy() - aspect_len = aspect_len.cpu().numpy() - weight = [[] for i in range(batch_size)] - for i in range(batch_size): - context_len = text_len[i] - aspect_len[i] - for j in range(aspect_double_idx[i, 0]): - weight[i].append(1 - (aspect_double_idx[i, 0] - j) / context_len) - for j in range(aspect_double_idx[i, 0], aspect_double_idx[i, 1] + 1): - weight[i].append(0) - for j in range(aspect_double_idx[i, 1] + 1, text_len[i]): - weight[i].append(1 - (j - aspect_double_idx[i, 1] / context_len)) - for j in range(text_len[i], seq_len): - weight[i].append(0) - weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2).to(self.torch_device) - return weight * x - - def mask(self, x, aspect_double_idx): - batch_size, seq_len = x.shape[0], x.shape[1] - aspect_double_idx = aspect_double_idx.cpu().numpy() - mask = [[] for i in range(batch_size)] - for i in range(batch_size): - for j in range(aspect_double_idx[i, 0]): - mask[i].append(0) - for j in range(aspect_double_idx[i, 0], aspect_double_idx[i, 1] + 1): - mask[i].append(1) - for j in range(aspect_double_idx[i, 1] + 1, seq_len): - mask[i].append(0) - mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(self.torch_device) - return mask * x - - def forward(self, inputs: dict[str, torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNModelOutput: - text_indices, aspect_indices, left_indices, text_embeddings, adj = inputs - text_len = torch.sum(text_indices != 0, dim=-1) - aspect_len = torch.sum(aspect_indices != 0, dim=-1) - left_len = torch.sum(left_indices != 0, dim=-1) - aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1) - text = self.text_embed_dropout(text_embeddings) - text_out, (_, _) = self.text_lstm(text, text_len) - x = F.relu( - self.gc1( - self.position_weight(text_out, aspect_double_idx, text_len, aspect_len), - adj, - ) - ) - x = F.relu(self.gc2(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj)) - alpha_mat = torch.matmul(x, text_out.transpose(1, 2)) - alpha = F.softmax(alpha_mat.sum(1, keepdim=True), dim=2) - x = torch.matmul(alpha, text_out).squeeze(1) # batch_size x 2 * hidden_dim - logits = self.fc(x) - - loss = self.loss_function(logits, labels) if labels is not None else None - return SenticGCNModelOutput(loss=loss, logits=logits) - - -@dataclass -class SenticGCNBertModelOutput(ModelOutput): - """ - Base class for outputs of SenticGCNBertModel. - - Args: - loss (:obj:`torch.Tensor` of shape `(1,)`, `optional`, return when :obj:`labels` is provided): - classification loss, typically cross entropy. - Loss function used is dependent on what is specified in SenticGCNBertConfig. 
-        logits (:obj:`torch.Tensor` of shape :obj:`(batch_size, num_classes)`):
-            raw logits for each class. num_classes = 3 by default.
-    """
-
-    loss: Optional[torch.Tensor] = None
-    logits: torch.Tensor = None
-
-
-class SenticGCNBertPreTrainedModel(PreTrainedModel):
-    """
-    The SenticGCNBert Pre-Trained Model used as base class for derived SenticGCNBert Model.
-
-    This model is the abstract super class for the SenticGCNBert Model which defines the config
-    class types and weights initialization method. This class should not be used or instantiated directly,
-    see SenticGCNBertModel class for usage.
-    """
-
-    config_class = SenticGCNBertConfig
-    base_model_prefix = "senticgcnbert"
-
-    def _init_weights(self, module: nn.Module) -> None:
-        pass
-
-
-class SenticGCNBertModel(SenticGCNBertPreTrainedModel):
-    """
-    The SenticGCNBert Model for aspect based sentiment analysis.
-
-    This class inherits from :obj:`SenticGCNBertPreTrainedModel` for weights initialization and utility functions
-    from transformer :obj:`PreTrainedModel` class.
-
-    Args:
-        config (:obj:`~SenticGCNBertConfig`):
-            Model configuration class with all parameters required for the model.
-            Initializing with a config file does not load
-            the weights associated with the model, only the configuration.
-            Use the :obj:`.from_pretrained` method to load the model weights.
-    """
-
-    def __init__(self, config: SenticGCNBertConfig) -> None:
-        super().__init__(config)
-        self.gc1 = GraphConvolution(config.hidden_dim, config.hidden_dim)
-        self.gc2 = GraphConvolution(config.hidden_dim, config.hidden_dim)
-        self.gc3 = GraphConvolution(config.hidden_dim, config.hidden_dim)
-        self.fc = nn.Linear(config.hidden_dim, config.polarities_dim)
-        self.text_embed_dropout = nn.Dropout(config.dropout)
-        self.torch_device = torch.device(config.device)
-        self.max_seq_len = config.max_seq_len
-        if config.loss_function == "cross_entropy":
-            self.loss_function = nn.CrossEntropyLoss()
-
-    def position_weight(self, x, aspect_double_idx, text_len, aspect_len):
-        batch_size, seq_len = x.shape[0], x.shape[1]
-        aspect_double_idx = aspect_double_idx.cpu().numpy()
-        text_len = text_len.cpu().numpy()
-        aspect_len = aspect_len.cpu().numpy()
-        weight = [[] for i in range(batch_size)]
-        for i in range(batch_size):
-            context_len = text_len[i] - aspect_len[i]
-            for j in range(aspect_double_idx[i, 0]):
-                weight[i].append(1 - (aspect_double_idx[i, 0] - j) / context_len)
-            for j in range(aspect_double_idx[i, 0], min(aspect_double_idx[i, 1] + 1, self.max_seq_len)):
-                weight[i].append(0)
-            for j in range(aspect_double_idx[i, 1] + 1, text_len[i]):
-                weight[i].append(1 - (j - aspect_double_idx[i, 1]) / context_len)
-            for j in range(text_len[i], seq_len):
-                weight[i].append(0)
-        weight = torch.tensor(weight).unsqueeze(2).to(self.torch_device)
-        return weight * x
-
-    def mask(self, x, aspect_double_idx):
-        batch_size, seq_len = x.shape[0], x.shape[1]
-        aspect_double_idx = aspect_double_idx.cpu().numpy()
-        mask = [[] for i in range(batch_size)]
-        for i in range(batch_size):
-            for j in range(aspect_double_idx[i, 0]):
-                mask[i].append(0)
-            for j in range(aspect_double_idx[i, 0], min(aspect_double_idx[i, 1] + 1, self.max_seq_len)):
-                mask[i].append(1)
-            for j in range(min(aspect_double_idx[i, 1] + 1, self.max_seq_len), seq_len):
-                mask[i].append(0)
-        mask = torch.tensor(mask).unsqueeze(2).float().to(self.torch_device)
-        return mask * x
-
-    def forward(self, inputs, labels: Optional[torch.Tensor] = None):
-        text_bert_indices, text_indices, aspect_indices, bert_segments_ids, left_indices, adj = inputs
-        text_len = torch.sum(text_indices
!= 0, dim=-1) - aspect_len = torch.sum(aspect_indices != 0, dim=-1) - left_len = torch.sum(left_indices != 0, dim=-1) - aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1) - # TODO: How to embed in the preprocessor? - encoder_layer, _ = self.bert( - text_bert_indices, token_type_ids=bert_segments_ids, output_all_encoded_layers=False - ) - text_out = encoder_layer - x = F.relu(self.gc1(self.position_weight(text_out, aspect_double_idx, text_len, aspect_len), adj)) - x = F.relu(self.gc2(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj)) - x = F.relu(self.gc3(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj)) - x = self.mask(x, aspect_double_idx) - alpha_mat = torch.matmul(x, text_out.transpose(1, 2)) - alpha = F.softmax(alpha_mat.sum(1, keepdim=True), dim=2) - x = torch.matmul(alpha, text_out).squeeze(1) # batch_size x 2*hidden_dim - logits = self.fc(x) - - loss = self.loss_function(logits, labels) if labels is not None else None - return SenticGCNBertModelOutput(loss=loss, logits=logits) - - -class SenticGCNEmbeddingPreTrainedModel(PreTrainedModel): - """ - The SenticGCN Embedding Pre-Trained Model used as base class for derived SenticGCN Embedding Model. - - This model is the abstract super class for the SenticGCN Embedding Model which defines the config - class types and weights initalization method. This class should not be used or instantiated directly, - see SenticGCNEmbeddingModel class for usage. - """ - - config_class = SenticGCNEmbeddingConfig - base_model_prefix = "senticgcnembedding" - - def _init_weights(self, module: nn.Module) -> None: - pass - - -class SenticGCNEmbeddingModel(SenticGCNEmbeddingPreTrainedModel): - """ - The SenticGCN Embedding Model used to generate embeddings for model inputs. - By default, the embeddings are generated from the glove.840B.300d embeddings. - - This class inherits from :obj:`SenticGCNEmbeddingPreTrainedModel` for weights initalization and utility functions - from transformers :obj:`PreTrainedModel` class. - - This class can also be constructed via the SenticGCNEmbeddingModel.build_embedding_matrix class method. - - Args: - config (:obj:`~SenticGCNEmbeddingConfig`): - Model configuration class with all parameters required for the model. - Initializing with a config file does not load - the weights associated with the model, only the configuration. - Use the :obj:`.from_pretrained` method to load the model weights. - """ - - def __init__(self, config: SenticGCNEmbeddingConfig) -> None: - super().__init__(config) - self.vocab_size = config.vocab_size - self.embed = nn.Embedding(config.vocab_size, config.embed_dim) - - def forward(self, token_ids: torch.Tensor) -> torch.Tensor: - """ - Encode input token ids using word embedding. - - Args: - token_ids (torch.Tensor): Tensor of token ids with shape [batch_size, num_words] - - Returns: - torch.Tensor: return Tensor of embeddings with shape (batch_size, num_words, embed_dim) - """ - return self.embed(token_ids) - - @classmethod - def build_embedding_matrix( - cls, - word_vec_file_path: str, - vocab: dict[str, int], - embed_dim: int = 300, - ): - """ - This class method is a helper method to construct the embedding model from a file containing word vectors (i.e. GloVe) - and a vocab dictionary. - - Args: - word_vec_file_path (str): file path to the word vectors - vocab (dict[str, int]): vocab dictionary consisting of words as key and index as values - embed_dim (int, optional): the embedding dimension. 
Defaults to 300. - - Returns: - SenticGCNEmbeddingModel: return an instance of SenticGCNEmbeddingModel - """ - embedding_matrix = build_embedding_matrix( - word_vec_file_path=word_vec_file_path, vocab=vocab, embed_dim=embed_dim - ) - embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float) - sentic_embed_config = SenticGCNEmbeddingConfig(vocab_size=len(vocab), embed_dim=embed_dim) - senticgcn_embed = cls(sentic_embed_config) - senticgcn_embed.embed.weight.data.copy_(embedding_tensor) - return senticgcn_embed - - -class SenticGCNBertEmbeddingModel(BertModel): - """ - The SenticGCN Bert Embedding Model used to generate embeddings for model inputs. - - This class inherits from :obj:`BertModel` for weights initalization and utility functions - from transformers :obj:`PreTrainedModel` class. - - Args: - config (:obj:`~SenticGCNBertEmbeddingConfig`): - Model configuration class with all parameters required for the model. - Initializing with a config file does not load - the weights associated with the model, only the configuration. - Use the :obj:`.from_pretrained` method to load the model weights. - """ - - def __init__(self, config: SenticGCNBertEmbeddingConfig) -> None: - super().__init__(config) diff --git a/sgnlp/models/sentic_gcn/modules/__init__.py b/sgnlp/models/sentic_gcn/modules/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py b/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py deleted file mode 100644 index 76ce70b..0000000 --- a/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py +++ /dev/null @@ -1,88 +0,0 @@ -import torch -import torch.nn as nn - - -class DynamicLSTM(nn.Module): - """ - A dynamic LSTM class which can hold variable length sequence - """ - - def __init__( - self, - input_size: int, - hidden_size: int, - num_layers: int = 1, - bias: bool = True, - batch_first: bool = True, - dropout: float = 0, - bidirectional: bool = False, - only_use_last_hidden_state: bool = False, - rnn_type: str = "LSTM", - ) -> None: - super(DynamicLSTM, self).__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.num_layers = num_layers - self.bias = bias - self.batch_first = batch_first - self.dropout = dropout - self.bidirectional = bidirectional - self.only_use_last_hidden_state = only_use_last_hidden_state - self.rnn_type = rnn_type - self.__init_rnn() - - def __init_rnn(self) -> None: - """ - Helper method to initalized RNN type - """ - input_args = { - "input_size": self.input_size, - "hidden_size": self.hidden_size, - "num_layers": self.num_layers, - "bias": self.bias, - "batch_first": self.batch_first, - "dropout": self.dropout, - "bidirectional": self.bidirectional, - } - if self.rnn_type == "LSTM": - self.rnn = nn.LSTM(**input_args) - elif self.rnn_type == "GRU": - self.rnn = nn.GRU(**input_args) - elif self.rnn_type == "RNN": - self.rnn = nn.RNN(**input_args) - - def forward(self, x: torch.Tensor, x_len: torch.Tensor, h0: torch.Tensor = None) -> torch.Tensor: - # Sort - x_sort_idx = torch.argsort(-x_len) - x_unsort_idx = torch.argsort(x_sort_idx).long() - x_len = x_len[x_sort_idx] - x = x[x_sort_idx.long()] - - # Pack - x_emb_p = torch.nn.utils.rnn.pack_padded_sequence(x, x_len, batch_first=self.batch_first) - - if self.rnn_type == "LSTM": - out_pack, (ht, ct) = self.rnn(x_emb_p, None) if h0 is None else self.rnn(x_emb_p, (h0, h0)) - else: - out_pack, ht = self.rnn(x_emb_p, None) if h0 is None else self.rnn(x_emb_p, h0) - ct = None - - # Unsort - # (num_layers * num_directions, 
batch, hidden_size) -> (batch, ...) - ht = torch.transpose(ht, 0, 1)[x_unsort_idx] - ht = torch.transpose(ht, 0, 1) - - if self.only_use_last_hidden_state: - return ht - else: - # Unpack: out - out = torch.nn.utils.rnn.pad_packed_sequence(out_pack, batch_first=self.batch_first) # (sequence, lengths) - out = out[0] - out = out[x_unsort_idx] - - # Unsort: out c - if self.rnn_type == "LSTM": - # (num_layers * num_directions, batch, hidden_size) -> (batch, ...) - ct = torch.transpose(ct, 0, 1)[x_unsort_idx] - ct = torch.transpose(ct, 0, 1) - return out, (ht, ct) diff --git a/sgnlp/models/sentic_gcn/modules/gcn.py b/sgnlp/models/sentic_gcn/modules/gcn.py deleted file mode 100644 index 618156e..0000000 --- a/sgnlp/models/sentic_gcn/modules/gcn.py +++ /dev/null @@ -1,23 +0,0 @@ -import torch -import torch.nn as nn - - -class GraphConvolution(nn.Module): - """ - Simple GCN Layer, similar to https://arxiv.org/abs/1609.02907 - """ - - def __init__(self, in_features: torch.Tensor, out_features: torch.Tensor, bias=True) -> None: - super(GraphConvolution, self).__init__() - self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features)) - if bias: - self.bias = nn.Parameter(torch.FloatTensor(out_features)) - else: - self.register_parameter("bias", None) - - def forward(self, text: torch.Tensor, adj: torch.Tensor): - text = text.to(torch.float32) - hidden = torch.matmul(text, self.weight) - denom = torch.sum(adj, dim=2, keepdim=True) + 1 - output = torch.matmul(adj, hidden) / denom - return output + self.bias if self.bias is not None else output diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py deleted file mode 100644 index e478dfc..0000000 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ /dev/null @@ -1,66 +0,0 @@ -from typing import List - -import torch -from transformers import PreTrainedTokenizer -from transformers.tokenization_utils_base import BatchEncoding -from transformers.utils.dummy_pt_objects import PreTrainedModel - -from config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig -from modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel -from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer - - -class SenticGCNPreprocessor: - def __init__( - self, - tokenizer: PreTrainedTokenizer = None, - embedding_model: PreTrainedModel = None, - tokenizer_name: str = None, - embedding_model_name: str = None, - device: torch.device = torch.device("cpu"), - ): - self.device = device - if tokenizer is not None: - self.tokenizer = tokenizer - else: - self.tokenizer = SenticGCNTokenizer.from_pretrained(tokenizer_name) - - if embedding_model is not None: - self.embedding_model = embedding_model - else: - embedding_config = SenticGCNEmbeddingConfig.from_pretrained(embedding_model_name) - self.embedding_model = SenticGCNEmbeddingModel.from_pretrained( - embedding_model_name, config=embedding_config - ).to(device) - - def __call__(self, data_batch: List[str]) -> BatchEncoding: - tokens = self.tokenizer(data_batch, padding=True, return_tensors="pt") - return tokens - - -class SenticGCNBertPreprocessor: - def __init__( - self, - tokenizer: PreTrainedTokenizer = None, - embedding_model: PreTrainedModel = None, - tokenizer_name: str = None, - embedding_model_name: str = None, - device: torch.device = torch.device("cpu"), - ): - self.device = device - if tokenizer is not None: - self.tokenizer = tokenizer - else: - self.tokenizer = SenticGCNBertTokenizer.from_pretrained(tokenizer_name) - - if embedding_model is not None: - 
self.embedding_model = embedding_model
-        else:
-            embedding_config = SenticGCNBertEmbeddingConfig.from_pretrained(embedding_model_name)
-            self.embedding_model = SenticGCNBertEmbeddingModel.from_pretrained(
-                embedding_model_name, config=embedding_config
-            ).to(device)
-
-    def __call__(self, data_batch: List[str]) -> BatchEncoding:
-        tokens = self.tokenizer(data_batch, padding=True, return_tensors="pt")
-        return tokens
diff --git a/sgnlp/models/sentic_gcn/tokenization.py b/sgnlp/models/sentic_gcn/tokenization.py
deleted file mode 100644
index 2645cf2..0000000
--- a/sgnlp/models/sentic_gcn/tokenization.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import pathlib
-import pickle
-from typing import Dict, List, Optional, Tuple
-
-from transformers import PreTrainedTokenizer, BertTokenizer
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.pkl"}
-
-
-class SenticGCNTokenizer(PreTrainedTokenizer):
-    """
-    The SenticGCN tokenizer class used to generate tokens for the embedding model.
-
-    Args:
-        text (:obj:`str`):
-            input text string to tokenize
-
-    Example::
-            tokenizer = SenticGCNTokenizer.from_pretrained("senticgcn")
-            inputs = tokenizer('Hello World!')
-            inputs['input_ids']
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-
-    def __init__(
-        self,
-        vocab_file: str = None,
-        train_files: List[str] = None,
-        train_vocab: bool = False,
-        do_lower_case: bool = True,
-        unk_token: str = "<unk>",
-        pad_token: str = "<pad>",
-        **kwargs,
-    ):
-        super().__init__(
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            **kwargs,
-        )
-        self.do_lower_case = do_lower_case
-        if train_vocab:
-            self.vocab = self.create_vocab(train_files)
-        else:
-            with open(vocab_file, "rb") as fin:
-                self.vocab = pickle.load(fin)
-        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return dict(self.vocab)
-
-    def _convert_token_to_id(self, token: str) -> int:
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index: int) -> str:
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    @staticmethod
-    def __read_text_file(file_names: List[str]) -> str:
-        """
-        Helper method to read contents of a list of text files.
-
-        Args:
-            file_names (List[str]): list of text files to read.
-
-        Returns:
-            str: return a concatenated string of the text files' contents.
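A usage sketch for the tokenizer above, building a vocab from a hypothetical training file in the dataset's three-line format (sentence with a $T$ placeholder, aspect, polarity); the file name and printed output are illustrative only:

from sgnlp.models.sentic_gcn.tokenization import SenticGCNTokenizer

tokenizer = SenticGCNTokenizer(train_files=["restaurant_train.raw"], train_vocab=True)
encoded = tokenizer("The food was great")
tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
print(tokens)  # whitespace tokens, lower-cased: ['the', 'food', 'was', 'great']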
- """ - text = "" - for fname in file_names: - with open(fname, "r", encoding="utf-8", newline="\n", errors="ignore") as fin: - lines = fin.readlines() - for i in range(0, len(lines), 3): - text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] - aspect = lines[i + 1].lower().strip() - text += f"{text_left} {aspect} {text_right} " # Left a space at the end - return text - - def create_vocab(self, train_files: List[str]) -> Dict[str, int]: - text = self.__read_text_file(train_files) - if self.do_lower_case: - text = text.lower() - vocab = {} - vocab[self.pad_token] = 0 - vocab[self.unk_token] = 1 - offset = len(vocab.keys()) - - words = text.split() - for word in words: - if word not in vocab: - vocab[word] = offset - offset += 1 - return vocab - - def _tokenize(self, text, **kwargs): - if self.do_lower_case: - text = text.lower() - words = text.split() - return words - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - save_dir = pathlib.Path(save_directory) - save_dir.mkdir(exist_ok=True) - vocab_file_path = save_dir.joinpath("vocab.pkl") - with open(vocab_file_path, "wb") as fout: - pickle.dump(self.vocab, fout) - return (str(vocab_file_path),) - - -class SenticGCNBertTokenizer(BertTokenizer): - """ - The senticGCN Bert Tokenizer class used to generate tokens for the embedding model, derived from BertTokenizer class. - - Args: - text (:obj:`str`): - input text string to tokenize - - Example:: - tokenizer = SenticGCNBertTokenizer.from_pretrained('bert-base-uncased') - inputs = tokenizer('Hello World!') - inputs['input_ids'] - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __call__( - self, - text, - max_length: int = 85, - add_special_tokens: bool = False, - padding: bool = True, - truncation: bool = True, - return_token_type_ids: bool = False, - return_attention_mask: bool = False, - return_tensors: str = None, - **kwargs, - ): - encoding = super().__call__( - text, - max_length=max_length, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_tensors=return_tensors, - **kwargs, - ) - return encoding diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py deleted file mode 100644 index a38e57d..0000000 --- a/sgnlp/models/sentic_gcn/train.py +++ /dev/null @@ -1,222 +0,0 @@ -import datetime -import logging -import math -import pathlib - -import numpy as np -from sklearn.metrics import f1_score -import torch -import torch.nn as nn -import torch.optim as optim - -from data_class import SenticGCNTrainArgs -from modeling import SenticGCNBertPreTrainedModel -from tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer -from utils import parse_args_and_load_config, set_random_seed, ABSADatasetReader - - -logging.basicConfig(level=logging.DEBUG) - - -class SenticGCNBaseTrainer: - def __init__(self, config: SenticGCNTrainArgs): - self.config = config - self.global_max_acc = 0.0 - self.global_max_f1 = 0.0 - self.device = ( - torch.device("cuda" if torch.cuda.is_available() else "cpu") - if not self.config.device - else torch.device(self.config.device) - ) - tokenizer = self._create_tokenizer() - # self.dataloader - if config.save_state_dict: - self.save_state_dict_folder = pathlib.Path(self.config.saved_state_dict_folder_path) - self.save_state_dict_folder.mkdir(exist_ok=True) - - def _create_initializers(self): - 
initializers = {
-            "xavier_uniform": nn.init.xavier_uniform_,
-            "xavier_normal": nn.init.xavier_normal_,
-            "orthogonal": nn.init.orthogonal_,
-        }
-        return initializers[self.config.initializer]
-
-    def _create_optimizer(self):
-        optimizers = {
-            "adadelta": optim.Adadelta,
-            "adagrad": optim.Adagrad,
-            "adam": optim.Adam,
-            "adamax": optim.Adamax,
-            "asgd": optim.ASGD,
-            "rmsprop": optim.RMSprop,
-            "sgd": optim.SGD,
-        }
-        return optimizers[self.config.optimizer]
-
-    def _create_tokenizer(self):
-        self.tokenizer = (
-            SenticGCNTokenizer.from_pretrained(self.config.tokenizer)
-            if self.config.model == "senticgcn"
-            else SenticGCNBertTokenizer.from_pretrained(self.config.tokenizer)
-        )
-
-    def _reset_params(self):
-        raise NotImplementedError("Please call from derived class only.")
-
-    def _evaluate_acc_f1(self):
-        self.model.eval()
-        n_correct, n_total = 0, 0
-        t_targets_all, t_outputs_all = None, None
-        with torch.no_grad():
-            for _, t_batch in enumerate(self.dataset_test):
-                t_inputs = [t_batch[col].to(self.device) for col in t_batch.keys()]
-                t_targets = t_batch["polarity"].to(self.device)
-                t_outputs = self.model(t_inputs)
-
-                n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item()
-                n_total += len(t_outputs)
-
-                if t_targets_all is None:
-                    t_targets_all = t_targets
-                    t_outputs_all = t_outputs
-                else:
-                    t_targets_all = torch.cat((t_targets_all, t_targets), dim=0)
-                    t_outputs_all = torch.cat((t_outputs_all, t_outputs), dim=0)
-        test_acc = n_correct / n_total
-        f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro")
-        return test_acc, f1
-
-    def _save_state_dict(self, epoch: int) -> pathlib.Path:
-        curr_dt = datetime.datetime.now()
-        curr_dt_str = curr_dt.strftime("%Y-%m-%d_%H%M%S")
-        filename = f"{self.config.model}_epoch_{epoch}_{curr_dt_str}.pkl"
-        full_path = self.save_state_dict_folder.joinpath(filename)
-        try:
-            torch.save(self.model.state_dict(), full_path)
-        except Exception as exc:
-            raise Exception("Error saving model state dict!") from exc
-        return full_path
-
-    def _train_epoch(self, criterion: nn.Module, optimizer: optim.Optimizer) -> tuple[float, float]:
-        max_val_acc, max_val_f1 = 0, 0
-        max_val_epoch = 0
-        global_step = 0
-        path = None
-
-        for epoch in range(self.config.epochs):
-            n_correct, n_total, loss_total = 0, 0, 0
-            self.model.train()
-            for _, batch in enumerate(self.dataloader_train):
-                global_step += 1
-                optimizer.zero_grad()
-
-                inputs = [batch[col].to(self.device) for col in batch.keys()]
-                targets = batch["polarity"].to(self.device)
-                outputs = self.model(inputs)
-                loss = criterion(outputs, targets)
-                loss.backward()
-                optimizer.step()
-
-                n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
-                n_total += len(outputs)
-                loss_total += loss.item() * len(outputs)
-
-                if global_step % self.config.log_step == 0:
-                    train_acc = n_correct / n_total
-                    train_loss = loss_total / n_total
-                    logging.info(f"Train Acc: {train_acc:.4f}, Train Loss: {train_loss:.4f}")
-
-            val_acc, val_f1 = self._evaluate_acc_f1()
-            logging.info(
-                f"""
-                Epoch: {epoch}
-                Test Acc: {val_acc:.4f}
-                Test F1: {val_f1:.4f}
-                """
-            )
-            if val_f1 > max_val_f1:
-                max_val_f1 = val_f1
-
-            if val_acc > max_val_acc:
-                max_val_acc = val_acc
-                max_val_epoch = epoch
-                if self.config.save_state_dict:
-                    path = self._save_state_dict(epoch)
-                logging.info(
-                    f"""
-                    Best model saved.
Acc: {max_val_acc:.4f}, F1: {max_val_f1:.4f}, Epoch: {max_val_epoch}
-                    """
-                )
-
-            if epoch - max_val_epoch >= self.config.patience:
-                logging.info("Early stopping")
-                break
-        return max_val_acc, max_val_f1
-
-    def train(self):
-        criterion = nn.CrossEntropyLoss()
-        _params = filter(lambda p: p.requires_grad, self.model.parameters())
-        optimizer = self._create_optimizer()(_params, lr=self.config.learning_rate, weight_decay=self.config.l2reg)
-
-        test_accs, test_f1s = [], []
-        for i in range(self.config.repeats):
-            logging.info(f"Start overall train loop : {i + 1}")
-
-            self._reset_params()
-            test_acc, test_f1 = self._train_epoch(criterion, optimizer)
-            test_accs.append(test_acc)
-            test_f1s.append(test_f1)
-
-            logging.info(f"Test_acc: {test_acc}, Test_f1: {test_f1}")
-        test_accs_avg = np.sum(test_accs) / self.config.repeats
-        test_f1s_avg = np.sum(test_f1s) / self.config.repeats
-        max_accs = np.max(test_accs)
-        max_f1s = np.max(test_f1s)
-
-        logging.info(
-            f"""
-            Test acc average: {test_accs_avg}
-            Test f1 average: {test_f1s_avg}
-            Test acc max: {max_accs}
-            Test f1 max: {max_f1s}
-            """
-        )
-
-
-class SenticBertGCNTrainer(SenticGCNBaseTrainer):
-    def __init__(self, config: SenticGCNTrainArgs):
-        super().__init__(config)
-
-    def _reset_params(self):
-        for child in self.model.children():
-            if type(child) != SenticGCNBertPreTrainedModel:
-                for param in child.parameters():
-                    if param.requires_grad:
-                        if len(param.shape) > 1:
-                            self._create_initializers()(param)
-                        else:
-                            stdv = 1.0 / math.sqrt(param.shape[0])
-                            nn.init.uniform_(param, a=-stdv, b=stdv)
-
-
-class SenticGCNTrainer(SenticGCNBaseTrainer):
-    def __init__(self, config: SenticGCNTrainArgs):
-        super().__init__(config)
-
-    def _reset_params(self):
-        for param in self.model.parameters():
-            if param.requires_grad:
-                if len(param.shape) > 1:
-                    self._create_initializers()(param)
-                else:
-                    stdv = 1.0 / math.sqrt(param.shape[0])
-                    nn.init.uniform_(param, a=-stdv, b=stdv)
-
-
-if __name__ == "__main__":
-    cfg = parse_args_and_load_config()
-    if cfg.seed is not None:
-        set_random_seed(cfg.seed)
-    trainer = SenticGCNTrainer(cfg) if cfg.model == "senticgcn" else SenticBertGCNTrainer(cfg)
-    trainer.train()
diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py
deleted file mode 100644
index 1019788..0000000
--- a/sgnlp/models/sentic_gcn/utils.py
+++ /dev/null
@@ -1,541 +0,0 @@
-import argparse
-import json
-import logging
-import pickle
-import random
-import pathlib
-import requests
-import urllib
-import math
-from typing import Dict, Tuple
-
-import numpy as np
-import spacy
-import torch
-from torch.utils.data import random_split, Dataset
-from transformers import PreTrainedTokenizer, PreTrainedModel
-from transformers.tokenization_utils_base import BatchEncoding
-
-from data_class import SenticGCNTrainArgs
-
-
-def parse_args_and_load_config(
-    config_path: str = "config/senticnet_gcn_config.json",
-) -> SenticGCNTrainArgs:
-    """Get config from config file using argparser
-
-    Returns:
-        SenticGCNTrainArgs: SenticGCNTrainArgs instance populated from config
-    """
-    parser = argparse.ArgumentParser(description="SenticASGCN Training")
-    parser.add_argument("--config", type=str, default=config_path)
-    args = parser.parse_args()
-
-    cfg_path = pathlib.Path(__file__).parent / args.config
-    with open(cfg_path, "r") as cfg_file:
-        cfg = json.load(cfg_file)
-
-    sentic_asgcn_args = SenticGCNTrainArgs(**cfg)
-    return sentic_asgcn_args
-
-
-def set_random_seed(seed: int = 776) -> None:
-    """Helper method to set random seeds for python, numpy and torch
-
-    Args:
seed (int, optional): seed value to set. Defaults to 776. - """ - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - - -def download_tokenizer_files( - base_url: str, - save_folder: str, - files: list[str] = ["special_tokens_map.json", "tokenizer_config.json", "vocab.pkl"], -) -> None: - """ - Helper method to download files from online storage. - - Args: - base_url (str): Url string to storage folder. - save_folder (str): Local folder to save downloaded files. Folder will be created if it does not exists. - """ - file_paths = [urllib.parse.urljoin(base_url, file_name) for file_name in files] - for file_path in file_paths: - download_url_file(file_path, save_folder) - - -def download_url_file(url: str, save_folder: str) -> None: - """ - Helper method to download and save url file. - - Args: - url (str): Url of file to download. - save_folder (str): Folder to save downloaded file. Will be created if it does not exists. - """ - save_folder_path = pathlib.Path(save_folder) - save_folder_path.mkdir(exist_ok=True) - fn_start_pos = url.rfind("/") + 1 - file_name = url[fn_start_pos:] - save_file_path = save_folder_path.joinpath(file_name) - req = requests.get(url) - if req.status_code == requests.codes.ok: - with open(save_file_path, "wb") as f: - for data in req: - f.write(data) - else: - logging.error(f"Fail to request files from {url}.") - - -def pad_and_truncate( - sequence: list[float], - max_len: int, - dtype: str = "int64", - padding: str = "post", - truncating: str = "post", - value: int = 0, -): - """ - Helper method for padding and truncating text and aspect segment. - - Args: - sequence (list[float]): input sequence of indices - max_len (int): maximum len to pad - dtype (str, optional): data type to cast indices. Defaults to "int64". - padding (str, optional): type of padding, 'pre' or 'post'. Defaults to "post". - truncating (str, optional): type of truncating, 'pre' or 'post'. Defaults to "post". - value (int, optional): value used for padding. Defaults to 0. - - Returns: - [type]: [description] - """ - seq_arr = (np.ones(max_len) * value).astype(dtype) - trunc = sequence[-max_len:] if truncating == "pre" else sequence[:max_len] - trunc = np.asarray(trunc, dtype=dtype) - if padding == "post": - seq_arr[: len(trunc)] = trunc - else: - seq_arr[-len(trunc) :] = trunc - return seq_arr - - -def load_word_vec(word_vec_file_path: str, vocab: Dict[str, int], embed_dim: int = 300) -> Dict[str, np.asarray]: - """ - Helper method to load word vectors from file (e.g. GloVe) for each word in vocab. - - Args: - word_vec_file_path (str): full file path to word vectors. - vocab (Dict[str, int]): dictionary of vocab word as key and word index as values. - embed_dim (int, optional): embedding dimension. Defaults to 300. - - Returns: - Dict[str, np.asarray]: dictionary with words as key and word vectors as values. 
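The behaviour of pad_and_truncate on a few toy sequences, with outputs as comments (a sketch assuming the function is imported from this module's path as shown in the diff):

from sgnlp.models.sentic_gcn.utils import pad_and_truncate

print(pad_and_truncate([3, 7, 9], max_len=5))                 # [3 7 9 0 0]  post-padding (default)
print(pad_and_truncate([3, 7, 9, 4, 2, 8], max_len=5))        # [3 7 9 4 2]  post-truncation keeps the head
print(pad_and_truncate([3, 7, 9], max_len=5, padding="pre"))  # [0 0 3 7 9]  pre-padding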
- """ - with open(word_vec_file_path, "r", encoding="utf-8", newline="\n", errors="ignore") as fin: - word_vec = {} - for line in fin: - tokens = line.rstrip().split() - word, vec = " ".join(tokens[:-embed_dim]), tokens[-embed_dim:] - if word in vocab.keys(): - word_vec[word] = np.asarray(vec, dtype="float32") - return word_vec - - -def build_embedding_matrix( - word_vec_file_path: str, - vocab: Dict[str, int], - embed_dim: int = 300, - save_embed_matrix: bool = False, - save_embed_file_path: str = None, -) -> np.ndarray: - """ - Helper method to generate an embedding matrix. - - Args: - word_vec_file_path (str): full file path to word vectors. - vocab (Dict[str, int]): dictionary of vocab word as key and word index as values. - embed_dim (int, optional): embedding dimension. Defaults to 300. - save_embed_matrix (bool, optional): flag to indicate if . Defaults to False. - save_embed_directory (str, optional): [description]. Defaults to None. - - Returns: - np.array: numpy array of embedding matrix - """ - embedding_matrix = np.zeros((len(vocab), embed_dim)) - embedding_matrix[1, :] = np.random.uniform(-1 / np.sqrt(embed_dim), 1 / np.sqrt(embed_dim), (1, embed_dim)) - word_vec = load_word_vec(word_vec_file_path, vocab, embed_dim) - for word, idx in vocab.items(): - vec = word_vec.get(word) - if vec is not None: - embedding_matrix[idx] = vec - - if save_embed_matrix: - save_file_path = pathlib.Path(save_embed_file_path) - if not save_file_path.exists(): - save_file_path.parent.mkdir(exist_ok=True) - with open(save_file_path, "wb") as fout: - pickle.dump(embedding_matrix, fout) - - return embedding_matrix - - -def load_and_process_senticnet( - senticnet_file_path: str = None, - save_preprocessed_senticnet: bool = False, - saved_preprocessed_senticnet_file_path: str = "senticnet.pkl", -) -> Dict[str, float]: - """ - Helper method to load and process senticnet. Default is SenticNet 5.0. - If a saved preprocess senticnet file is available, and save flag is set to false, it will be loaded from file instead. - Source: - https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/senticnet-5.0 - - Args: - senticnet_file_path (str): File path to senticnet 5.0 file. - save_preprocessed_senticnet (bool): Flag to indicate if processed senticnet should be saved. - saved_preprocessed_senticnet_file_path: (str): File path to saved preprocessed senticnet file. - - Returns: - Dict[str, float]: return dictionary with concept word as keys and intensity as values. - """ - saved_senticnet_file_path = pathlib.Path(saved_preprocessed_senticnet_file_path) - if saved_senticnet_file_path.exists() and not save_preprocessed_senticnet: - with open(saved_senticnet_file_path, "r") as f: - sentic_dict = pickle.load(f) - else: - senticnet_file_path = pathlib.Path(senticnet_file_path) - sentic_dict = {} - with open(senticnet_file_path, "r") as f: - for line in f: - line = line.strip() - if not line: - continue - items = line.split("\t") - if "_" in items[0]: - continue # skip words with '_' - sentic_dict[items[0]] = items[-1] - if save_preprocessed_senticnet: - saved_senticnet_file_path.parent.mkdir(exist_ok=True) - with open(saved_senticnet_file_path, "wb") as f: - pickle.dump(sentic_dict, f) - return sentic_dict - - -def generate_dependency_adj_matrix(text: str, aspect: str, senticnet: Dict[str, float], spacy_pipeline) -> np.ndarray: - """ - Helper method to generate senticnet depdency adj matrix. 
- - Args: - text (str): input text to process - aspect (str): aspect from input text - senticnet (Dict[str, float]): dictionary of preprocessed senticnet. See load_and_process_senticnet() - spacy_pipeline : Spacy pretrained pipeline (e.g. 'en_core_web_sm') - - Returns: - np.ndarray: return ndarry representing adj matrix. - """ - document = spacy_pipeline(text) - seq_len = len(text.split()) - matrix = np.zeros((seq_len, seq_len)).astype("float32") - for token in document: - sentic = float(senticnet[str(token)]) + 1.0 if str(token) in senticnet else 0 - if str(token) in aspect: - sentic += 1.0 - if token.i < seq_len: - matrix[token.i][token.i] = 1.0 * sentic - for child in token.children: - if str(child) in aspect: - sentic += 1.0 - if child.i < seq_len: - matrix[token.i][child.i] = 1.0 * sentic - matrix[child.i][token.i] = 1.0 * sentic - return matrix - - -class SenticGCNDataset(Dataset): - """ - Data class for SenticGCN dataset. - """ - - def __init__(self, data: list[Dict[str, torch.Tensor]]) -> None: - self.data = data - - def __getitem__(self, index: int) -> Dict[str, torch.Tensor]: - return self.data[index] - - def __len__(self): - return len(self.data) - - -class SenticGCNDatasetGenerator: - """ - Main dataset generator class to preprocess raw dataset file. - """ - - def __init__(self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer) -> None: - self.config = config - self.senticnet = load_and_process_senticnet( - config.senticnet_word_file_path, - config.save_preprocessed_senticnet, - config.saved_preprocessed_senticnet_file_path, - ) - self.spacy_pipeline = spacy.load(config.spacy_pipeline) - self.tokenizer = tokenizer - - def _read_raw_dataset(self, dataset_type: str) -> list[str]: - """ - Private helper method to read raw dataset files based on requested type (e.g. Train or Test). - - Args: - dataset_type (str): Type of dataset files to read. Train or Test. - - Returns: - list[str]: list of str consisting of the full text, aspect and polarity index. - """ - file_path = self.config.dataset_train if dataset_type == "train" else self.config.dataset_test - with open(file_path, "r", encoding="utf-8", newline="\n", errors="ignore") as f: - lines = f.readlines() - return lines - - def _generate_senticgcn_dataset(self, raw_data: list[str]) -> Dict[str, list]: - """ - Data preprocess method to generate all indices required for SenticGCN model training. - - Args: - raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. - - Returns: - Dict[str, list]]: return a dictionary of dataset sub-type and their list of values. 
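A usage sketch for the adjacency-matrix helper above; the SenticNet dictionary here is a tiny stand-in for the real one, and the spaCy pipeline name is taken from the config:

import spacy
from sgnlp.models.sentic_gcn.utils import generate_dependency_adj_matrix

nlp = spacy.load("en_core_web_sm")
senticnet = {"great": 0.9}  # stand-in for load_and_process_senticnet(...)
adj = generate_dependency_adj_matrix("the food was great", "food", senticnet, nlp)
print(adj.shape)  # (4, 4): one row/column per whitespace-separated token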
- """ - all_data = [] - for i in range(0, len(raw_data), 3): - # Process full text, aspect and polarity index - text_left, _, text_right = [s.lower().strip() for s in raw_data[i].partition("$T$")] - aspect = raw_data[i + 1].lower().strip() - full_text = f"{text_left} {aspect} {text_right}" - polarity = raw_data[i + 2].strip() - - # Process indices - text_indices = self.tokenizer( - full_text, - return_tensors=None, - return_attention_mask=False, - return_token_type_ids=False, - ) - aspect_indices = self.tokenizer( - aspect, - return_tensors=None, - return_attention_mask=False, - return_token_type_ids=False, - ) - left_indices = self.tokenizer( - text_left, - return_tensors=None, - return_attention_mask=False, - return_token_type_ids=False, - ) - polarity = int(polarity) + 1 - graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) - all_data.append( - { - "text_indices": text_indices["input_ids"], - "aspect_indices": aspect_indices["input_ids"], - "left_indices": left_indices["input_ids"], - "polarity": polarity, - "sdat_graph": graph, - } - ) - return all_data - - def _generate_senticgcnbert_dataset(self, raw_data: list[str]) -> Dict[str, BatchEncoding]: - """ - Data preprocess method to generate all indices required for SenticGCNBert model training. - - Args: - raw_data (list[str]): list of text, aspect word and polarity read from raw dataset file. - - Returns: - Dict[str, BatchEncoding]: return a dictionary of dataset sub-type and their tensors. - """ - all_data = [] - max_len = self.config.max_len - for i in range(0, len(raw_data), 3): - # Process full text, aspect and polarity index - text_left, _, text_right = [s.lower().strip() for s in raw_data[i].partition("$T$")] - aspect = raw_data[i + 1].lower().strip() - polarity = raw_data[i + 2].strip() - full_text = f"{text_left} {aspect} {text_right}" - full_text_with_bert_tokens = f"[CLS] {full_text} [SEP] {aspect} [SEP]" - - # Process indices - text_indices = self.tokenizer(full_text, return_tensors="pt") - aspect_indices = self.tokenizer(aspect, return_tensors="pt") - left_indices = self.tokenizer(text_left, return_tensors="pt") - polarity = int(polarity) + 1 - polarity = BatchEncoding({"input_ids": polarity}) - polarity = polarity.convert_to_tensors("pt") - - # Process bert related indices - text_bert_indices = self.tokenizer( - full_text_with_bert_tokens, return_tensors="pt", add_special_tokens=True, return_token_type_ids=True - ) - text_len = np.sum(text_indices["input_ids"].numpy() != 0) - aspect_len = np.sum(aspect_indices["input_ids"].numpy() != 0) - - # array of [0] for texts including [CLS] and [SEP] and [1] for aspect and ending [SEP] - concat_segment_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1) - concat_segment_indices = pad_and_truncate(concat_segment_indices, max_len) - concat_segment_indices = BatchEncoding({"input_ids": concat_segment_indices}) - concat_segment_indices = concat_segment_indices.convert_to_tensors("pt") - - # Process embeddings - - # Process graph - graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) - sdat_graph = np.pad( - graph, - ( - (0, max_len - graph.shape[0]), - (0, max_len - graph.shape[0]), - ), - "constant", - ) - sdat_graph = BatchEncoding({"input_ids": sdat_graph}) - sdat_graph = sdat_graph.convert_to_tensors("pt") - - all_data.append( - { - "text_indices": text_indices.to(self.device), - "aspect_indices": aspect_indices.to(self.device), - "left_indices": left_indices.to(self.device), - 
"text_bert_indices": text_bert_indices.to(self.device), - "bert_segment_indices": concat_segment_indices.to(self.device), - "polarity": polarity.to(self.device), - "sdat_graph": sdat_graph.to(self.device), - } - ) - return all_data - - def generate_datasets(self) -> Tuple[SenticGCNDataset, SenticGCNDataset, SenticGCNDataset]: - """ - Main wrapper method to generate datasets for both SenticGCN and SenticGCNBert based on config. - - Returns: - Tuple[SenticGCNDataset, SenticGCNDataset, SenticGCNDataset]: - return SenticGCNDataset instances for train/val/test data. - """ - # Read raw data from dataset files - raw_train_data = self._read_raw_dataset(self.config.dataset_train) - raw_test_data = self._read_raw_dataset(self.config.dataset_test) - - # Generate dataset dictionary - if self.config.model == "senticgcn": - train_data = self._generate_senticgcn_dataset(raw_train_data) - test_data = self._generate_senticgcn_dataset(raw_test_data) - else: - train_data = self._generate_senticgcnbert_dataset(raw_train_data) - test_data = self._generate_senticgcnbert_dataset(raw_test_data) - # Train/Val/Test split - if self.config.valset_ratio > 0: - valset_len = int(len(train_data) * self.config.valset_ratio) - train_data, val_data = random_split(train_data, (len(train_data) - valset_len, valset_len)) - else: - val_data = test_data - return SenticGCNDataset(train_data), SenticGCNDataset(val_data), SenticGCNDataset(test_data) - - -class BucketIterator: - """ - Iterator class for use with non-bert version of SenticGCN. - """ - - def __init__( - self, - data: list[dict[str, BatchEncoding]], - batch_size: int, - sort_key: str = "text_indices", - shuffle=True, - sort=True, - ): - self.shuffle = shuffle - self.sort = sort - self.sort_key = sort_key - self.batches = self._sort_and_pad(data, batch_size) - self.batch_len = len(self.batches) - - def _sort_and_pad(self, data: list[dict[str, list]], batch_size: int) -> list[dict[str, list[torch.Tensor]]]: - """ - Private method to sort and pad input dataset. 
- - Args: - data (list[dict[str, list]]): input dataset - batch_size (int): batch size to split dataset - - Returns: - list[dict[str, list[torch.Tensor]]]: return list of dictionary of dataset batches - """ - num_batch = int(math.ceil(len(data) / batch_size)) - if self.sort: - sorted_data = sorted(data, key=lambda x: len(x[self.sort_key])) - else: - sorted_data = data - batches = [] - for i in range(num_batch): - batches.append(self._pad_data(sorted_data[i * batch_size : (i + 1) * batch_size])) - return batches - - def _pad_data(self, batch_data: dict[str, list]) -> dict[str, list[torch.Tensor]]: - """ - Private method to each sub dataset to max length for their specific batch - - Args: - batch_data (dict[str, list]): dictionary of sub dataset and their list of values - - Returns: - dict[str, list[torch.Tensor]]: return a dictionary of list of tensor values - """ - batch_text_indices = [] - batch_aspect_indices = [] - batch_left_indices = [] - batch_polarity = [] - batch_sdat_graph = [] - max_len = max([len(t[self.sort_key]) for t in batch_data]) - for item in batch_data: - (text_indices, aspect_indices, left_indices, polarity, sdat_graph,) = ( - item["text_indices"], - item["aspect_indices"], - item["left_indices"], - item["polarity"], - item["sdat_graph"], - ) - # Calculate padding length - text_padding = [0] * (max_len - len(text_indices)) - aspect_padding = [0] * (max_len - len(aspect_indices)) - left_padding = [0] * (max_len - len(left_indices)) - - batch_text_indices.append(text_indices + text_padding) - batch_aspect_indices.append(aspect_indices + aspect_padding) - batch_left_indices.append(left_indices + left_padding) - batch_polarity.append(polarity) - batch_sdat_graph.append( - np.pad(sdat_graph, ((0, max_len - len(text_indices)), (0, max_len - len(text_indices))), "constant") - ) - - return { - "text_indices": torch.tensor(batch_text_indices), - "aspect_indices": torch.tensor(batch_aspect_indices), - "left_indices": torch.tensor(batch_left_indices), - "polarity": torch.tensor(batch_polarity), - "sdat_graph": torch.tensor(batch_sdat_graph), - } - - def __iter__(self): - if self.shuffle: - random.shuffle(self.batches) - for idx in range(self.batch_len): - yield self.batches[idx] From 3401b9bce1028135dc959d5fff2cd2f308f00bb7 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 13 Jan 2022 14:14:03 +0800 Subject: [PATCH 165/201] [#41] bug fix for sentic_gcn preprocessor --- sgnlp/models/sentic_gcn/preprocess.py | 65 +++++++++++++++------------ 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py index f7165f7..c1d7858 100644 --- a/sgnlp/models/sentic_gcn/preprocess.py +++ b/sgnlp/models/sentic_gcn/preprocess.py @@ -214,53 +214,60 @@ def _process_indices(self, data_batch: List[SenticGCNData]) -> List[torch.Tensor all_aspect_indices = [] all_left_indices = [] all_sdat_graph = [] - max_len = max([len(data.full_text) for data in data_batch]) + all_data = [] + max_len = 0 for data in data_batch: text_indices = self.tokenizer( data.full_text, - max_length=max_len, - padding="max_length", - truncation=True, - add_special_tokens=False, return_tensors=None, return_attention_mask=False, return_token_type_ids=False, ) aspect_indices = self.tokenizer( data.aspect, - max_length=max_len, - padding="max_length", - truncation=True, - add_special_tokens=False, return_tensors=None, return_attention_mask=False, return_token_type_ids=False, ) - if data.left_text: - left_indices = self.tokenizer( - data.left_text, - 
max_length=max_len, - padding="max_length", - truncation=True, - add_special_tokens=False, - return_tensors=None, - return_attention_mask=False, - return_token_type_ids=False, - ) - else: - # Workaround for handling empty string. - # This happens when the aspect is also the first word in the full text. - left_indices = {"input_ids": [0] * max_len} + left_indices = self.tokenizer( + data.left_text, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) graph = generate_dependency_adj_matrix(data.full_text, data.aspect, self.senticnet, self.spacy_pipeline) + all_data.append( + { + "text_indices": text_indices["input_ids"], + "aspect_indices": aspect_indices["input_ids"], + "left_indices": left_indices["input_ids"], + "sdat_graph": graph, + } + ) + if max_len < len(text_indices["input_ids"]): + max_len = len(text_indices["input_ids"]) + + for item in all_data: + (text_indices, aspect_indices, left_indices, sdat_graph,) = ( + item["text_indices"], + item["aspect_indices"], + item["left_indices"], + item["sdat_graph"], + ) + + text_padding = [0] * (max_len - len(text_indices)) + aspect_padding = [0] * (max_len - len(aspect_indices)) + left_padding = [0] * (max_len - len(left_indices)) + sdat_graph = np.pad( - graph, - ((0, max_len - len(data.full_text)), (0, max_len - len(data.full_text))), + sdat_graph, + ((0, max_len - len(text_indices)), (0, max_len - len(text_indices))), "constant", ) - all_text_indices.append(text_indices["input_ids"]) - all_aspect_indices.append(aspect_indices["input_ids"]) - all_left_indices.append(left_indices["input_ids"]) + all_text_indices.append(text_indices + text_padding) + all_aspect_indices.append(aspect_indices + aspect_padding) + all_left_indices.append(left_indices + left_padding) all_sdat_graph.append(sdat_graph) all_text_indices = torch.tensor(all_text_indices).to(self.device) From 8455c9500cb7d9159200ac0668396dbc0aada869 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 13 Jan 2022 14:20:48 +0800 Subject: [PATCH 166/201] [#41] update unit tests for preprocess --- tests/sentic_gcn/test_sentic_gcn_preprocess.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/sentic_gcn/test_sentic_gcn_preprocess.py b/tests/sentic_gcn/test_sentic_gcn_preprocess.py index 03c343d..4e202e9 100644 --- a/tests/sentic_gcn/test_sentic_gcn_preprocess.py +++ b/tests/sentic_gcn/test_sentic_gcn_preprocess.py @@ -62,11 +62,11 @@ def test_senticgcn_preprocessor(self): for proc_index in processed_indices: self.assertTrue(isinstance(proc_index, torch.Tensor)) - self.assertEqual(processed_indices[0].shape, torch.Size([5, 128])) - self.assertEqual(processed_indices[1].shape, torch.Size([5, 128])) - self.assertEqual(processed_indices[2].shape, torch.Size([5, 128])) - self.assertEqual(processed_indices[3].shape, torch.Size([5, 128, 300])) - self.assertEqual(processed_indices[4].shape, torch.Size([5, 128, 128])) + self.assertEqual(processed_indices[0].shape, torch.Size([5, 27])) + self.assertEqual(processed_indices[1].shape, torch.Size([5, 27])) + self.assertEqual(processed_indices[2].shape, torch.Size([5, 27])) + self.assertEqual(processed_indices[3].shape, torch.Size([5, 27, 300])) + self.assertEqual(processed_indices[4].shape, torch.Size([5, 27, 27])) def test_senticgcn_preprocessor_from_external(self): """ From 2196e5b1ff921bad952caaed368b8fb2bc2d5cf3 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Thu, 13 Jan 2022 14:55:03 +0800 Subject: [PATCH 167/201] [#41] bug fix for raw dataset reader --- 
sgnlp/models/sentic_gcn/utils.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py
index 2ed1dac..30a89e2 100644
--- a/sgnlp/models/sentic_gcn/utils.py
+++ b/sgnlp/models/sentic_gcn/utils.py
@@ -322,17 +322,15 @@ def _load_senticnet(self, mode: str) -> Dict[str, float]:
         )
         return senticnet_
 
-    def _read_raw_dataset(self, dataset_type: str) -> List[str]:
+    def _read_raw_dataset(self, files_path: List[str]) -> List[str]:
         """
         Private helper method to read raw dataset files based on requested type (e.g. Train or Test).
 
         Args:
-            dataset_type (str): Type of dataset files to read. Train or Test.
-
+            files_path (List[str]): list of file paths to the dataset files
         Returns:
             List[str]: list of str consisting of the full text, aspect and polarity index.
         """
-        files_path = self.config.dataset_train if dataset_type == "train" else self.config.dataset_test
         all_lines = []
         for dataset_file in files_path:
             with open(dataset_file, "r", encoding="utf-8", newline="\n", errors="ignore") as f:

From 714c0bcfeca1d6b1f1ab0e6b9b661ff8d2d04445 Mon Sep 17 00:00:00 2001
From: Raymond Ng 
Date: Thu, 13 Jan 2022 14:56:20 +0800
Subject: [PATCH 168/201] [#41] update eval unit tests

---
 tests/sentic_gcn/test_sentic_gcn_train_eval.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/sentic_gcn/test_sentic_gcn_train_eval.py b/tests/sentic_gcn/test_sentic_gcn_train_eval.py
index 31d1c8f..8c56884 100644
--- a/tests/sentic_gcn/test_sentic_gcn_train_eval.py
+++ b/tests/sentic_gcn/test_sentic_gcn_train_eval.py
@@ -181,11 +181,11 @@ def setUp(self) -> None:
             "embedding_model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/",
             "config_filename": "config.json",
             "model_filename": "pytorch_model.bin",
-            "test_filename": PARENT_DIR + "/test_data/test_test.raw",
+            "test_filename": [PARENT_DIR + "/test_data/test_test.raw"],
             "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle",
             "spacy_pipeline": "en_core_web_sm",
             "result_folder": str(self.results_save_folder),
-            "eval_batch_size": 16,
+            "eval_batch_size": 2,
             "seed": 776,
             "device": "cpu",
         }
@@ -225,11 +225,11 @@ def setUp(self) -> None:
             "embedding_model": "bert-base-uncased",
             "config_filename": "config.json",
             "model_filename": "pytorch_model.bin",
-            "test_filename": PARENT_DIR + "/test_data/test_test.raw",
+            "test_filename": [PARENT_DIR + "/test_data/test_test.raw"],
             "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle",
             "spacy_pipeline": "en_core_web_sm",
             "result_folder": str(self.results_save_folder),
-            "eval_batch_size": 16,
+            "eval_batch_size": 2,
             "seed": 776,
             "device": "cpu",
         }

From fa79413b5c9ea6df03ecb65736955a3149e797d0 Mon Sep 17 00:00:00 2001
From: Raymond Ng 
Date: Thu, 13 Jan 2022 15:01:48 +0800
Subject: [PATCH 169/201] [#41] update test case for read raw datasets

---
 tests/sentic_gcn/test_sentic_gcn_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/sentic_gcn/test_sentic_gcn_utils.py b/tests/sentic_gcn/test_sentic_gcn_utils.py
index 66879f0..0eaebce 100644
--- a/tests/sentic_gcn/test_sentic_gcn_utils.py
+++ b/tests/sentic_gcn/test_sentic_gcn_utils.py
@@ -101,7 +101,7 @@ def test_read_raw_dataset(self):
         with mock.patch("sgnlp.models.sentic_gcn.tokenization.SenticGCNTokenizer") as MockClass:
             fake_tokenizer = MockClass()
             dataset_gen = SenticGCNDatasetGenerator(self.cfg, fake_tokenizer)
-            data = dataset_gen._read_raw_dataset("train")
+            data = 
dataset_gen._read_raw_dataset(self.cfg.dataset_train) self.assertEqual(len(data), 15) def test_generate_senticgcn_dataset(self): From 4d776066558346443dbfef5e184e15475e307a3e Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Thu, 13 Jan 2022 16:56:35 +0800 Subject: [PATCH 170/201] [#43] Update jsonify output to first item of the list --- demo_api/sentic_gcn/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index e20ab8a..09a60c4 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -38,7 +38,7 @@ def predict(): # Postprocessing post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) - return jsonify(post_outputs) + return jsonify(post_outputs[0]) if __name__ == "__main__": From 46f34649335492a71eb3ca9a5194a223ac57201c Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Thu, 13 Jan 2022 18:14:43 +0800 Subject: [PATCH 171/201] [#43] Update json output from list to dictionary --- demo_api/sentic_gcn/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index 09a60c4..76b0e94 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -42,4 +42,5 @@ def predict(): if __name__ == "__main__": - app.run() \ No newline at end of file + # app.run() + app.run(host="0.0.0.0", debug=True, port=8000) \ No newline at end of file From 1a726243d2af78658539c0bd50eb03138e1ba794 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 14 Jan 2022 10:31:20 +0800 Subject: [PATCH 172/201] [#41] fix wrong imports for __init__ --- sgnlp/models/sentic_gcn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_gcn/__init__.py b/sgnlp/models/sentic_gcn/__init__.py index 1e7cdfa..59b7569 100644 --- a/sgnlp/models/sentic_gcn/__init__.py +++ b/sgnlp/models/sentic_gcn/__init__.py @@ -1,6 +1,6 @@ from .config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig from .data_class import SenticGCNTrainArgs -from .eval import SenticGCNEvaluator, SenticGCNBaseEvaluator +from .eval import SenticGCNEvaluator, SenticGCNBertEvaluator from .modeling import SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel from .preprocess import SenticGCNPreprocessor, SenticGCNBertPreprocessor from .postprocess import SenticGCNPostprocessor, SenticGCNBertPostprocessor From aad5e0443d8312fbc3d5e59b381749ba117654c2 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 14 Jan 2022 10:35:37 +0800 Subject: [PATCH 173/201] [#41] fix broken docstring --- sgnlp/models/sentic_gcn/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgnlp/models/sentic_gcn/config.py b/sgnlp/models/sentic_gcn/config.py index b37bcb9..5d0a439 100644 --- a/sgnlp/models/sentic_gcn/config.py +++ b/sgnlp/models/sentic_gcn/config.py @@ -49,7 +49,7 @@ class SenticGCNBertConfig(PretrainedConfig): hidden_dim (:obj:`int`, defaults to 768): The embedding dimension size for the Bert model as well as GCN dimension. max_seq_len (:obj:`int`, defaults to 85): The max sequence length to pad and truncate. dropout (:obj:`float`, defaults to 0.3): Dropout percentage. - polarities_dim (:ob:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral). + polarities_dim (:obj:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. 
Positive, Negative, Neutral). loss_function (:obj:`str`, defaults to 'cross_entropy'): Loss function for training/eval. Example: From e4a2959f80818019fc1ac20f50c78fbe222e9b5f Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Fri, 14 Jan 2022 10:39:41 +0800 Subject: [PATCH 174/201] [#43] Update requirements.txt --- demo_api/sentic_gcn/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo_api/sentic_gcn/requirements.txt b/demo_api/sentic_gcn/requirements.txt index e0cc76d..e0c022b 100644 --- a/demo_api/sentic_gcn/requirements.txt +++ b/demo_api/sentic_gcn/requirements.txt @@ -3,4 +3,4 @@ spacy==3.2.1 numpy==1.22.0 flask gunicorn -sgnlp==0.2.0 \ No newline at end of file +sgnlp \ No newline at end of file From 723daafee480037312b5f5a9f656ce5ca314904f Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Fri, 14 Jan 2022 10:40:57 +0800 Subject: [PATCH 175/201] [#43] Add requirements_dev.txt --- demo_api/sentic_gcn/requirements_dev.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 demo_api/sentic_gcn/requirements_dev.txt diff --git a/demo_api/sentic_gcn/requirements_dev.txt b/demo_api/sentic_gcn/requirements_dev.txt new file mode 100644 index 0000000..0d08f9b --- /dev/null +++ b/demo_api/sentic_gcn/requirements_dev.txt @@ -0,0 +1,6 @@ +-e . +torch==1.10.1 +spacy==3.2.1 +numpy==1.22.0 +flask +gunicorn \ No newline at end of file From 2cc819e8d6d2245a25b22ec75ce9a085cc824373 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Fri, 14 Jan 2022 10:42:20 +0800 Subject: [PATCH 176/201] [#43] Add dev.Dockerfile --- demo_api/sentic_gcn/dev.Dockerfile | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 demo_api/sentic_gcn/dev.Dockerfile diff --git a/demo_api/sentic_gcn/dev.Dockerfile b/demo_api/sentic_gcn/dev.Dockerfile new file mode 100644 index 0000000..3158cca --- /dev/null +++ b/demo_api/sentic_gcn/dev.Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.8-buster + +COPY ./demo_api /demo_api +COPY ./sgnlp /sgnlp +COPY ./setup.py /setup.py +COPY ./README.md /README.md + +RUN pip install -r /demo_api/sentic_gcn/requirements_dev.txt + +WORKDIR /demo_api/sentic_gcn + +RUN python -m download_pretrained + +CMD PYTHONPATH=../../ gunicorn -c ../gunicorn.conf.py \ No newline at end of file From 5809e99495a0f5c967ea33bab5c405463afc2da4 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 14 Jan 2022 10:45:42 +0800 Subject: [PATCH 177/201] [#41] standardise config files name --- ...senticnet_gcn_bert_config.json => sentic_gcn_bert_config.json} | 0 .../config/{senticnet_gcn_config.json => sentic_gcn_config.json} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename sgnlp/models/sentic_gcn/config/{senticnet_gcn_bert_config.json => sentic_gcn_bert_config.json} (100%) rename sgnlp/models/sentic_gcn/config/{senticnet_gcn_config.json => sentic_gcn_config.json} (100%) diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json b/sgnlp/models/sentic_gcn/config/sentic_gcn_bert_config.json similarity index 100% rename from sgnlp/models/sentic_gcn/config/senticnet_gcn_bert_config.json rename to sgnlp/models/sentic_gcn/config/sentic_gcn_bert_config.json diff --git a/sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json b/sgnlp/models/sentic_gcn/config/sentic_gcn_config.json similarity index 100% rename from sgnlp/models/sentic_gcn/config/senticnet_gcn_config.json rename to sgnlp/models/sentic_gcn/config/sentic_gcn_config.json From f59fc322e05d55b7fef1c1130c94c431c09045b0 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Fri, 14 Jan 2022 11:51:08 +0800 
Subject: [PATCH 178/201] [#43] Update api.py returns --- demo_api/sentic_gcn/api.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index 76b0e94..feff513 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -31,16 +31,17 @@ def predict(): req_body = request.get_json() + print('req_body: ',req_body) + # Preprocessing - processed_inputs, processed_indices = preprocessor([req_body]) + processed_inputs, processed_indices = preprocessor(req_body) outputs = model(processed_indices) # Postprocessing post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) - return jsonify(post_outputs[0]) + return jsonify(post_outputs) if __name__ == "__main__": - # app.run() - app.run(host="0.0.0.0", debug=True, port=8000) \ No newline at end of file + app.run() \ No newline at end of file From 33cce1e727c89fabea5752dd1dae259016a75fbd Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Fri, 14 Jan 2022 11:59:21 +0800 Subject: [PATCH 179/201] [#43] Include sentic_gcn --- jsonnet/dev-demo-api.jsonnet | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/jsonnet/dev-demo-api.jsonnet b/jsonnet/dev-demo-api.jsonnet index cf84943..1c6a662 100644 --- a/jsonnet/dev-demo-api.jsonnet +++ b/jsonnet/dev-demo-api.jsonnet @@ -42,6 +42,11 @@ local api_names = { image_name: "lif-3way-ap", deployment_name: "lif-3way-ap" }, + "sentic_gcn": { + module_name: "sentic_gcn", + image_name: "sentic-gcn", + deployment_name: "sentic-gcn + }, "ufd": { module_name: "ufd", image_name: "ufd", From f332802ec50e9834a663785ff5551fc76cd578e0 Mon Sep 17 00:00:00 2001 From: K-WeiMing Date: Fri, 14 Jan 2022 13:07:10 +0800 Subject: [PATCH 180/201] [#43] Remove extra print statement --- demo_api/sentic_gcn/api.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py index feff513..816f9d0 100644 --- a/demo_api/sentic_gcn/api.py +++ b/demo_api/sentic_gcn/api.py @@ -31,8 +31,6 @@ def predict(): req_body = request.get_json() - print('req_body: ',req_body) - # Preprocessing processed_inputs, processed_indices = preprocessor(req_body) outputs = model(processed_indices) From c27bd148c83c3f637ff595c690946f3b55e95287 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Fri, 14 Jan 2022 13:37:16 +0800 Subject: [PATCH 181/201] [#41] add documentation for senticgcn --- docs/source/model/senticgcn.rst | 401 ++++++++++++++++++++++++++++++++ docs/source/models.rst | 1 + 2 files changed, 402 insertions(+) create mode 100644 docs/source/model/senticgcn.rst diff --git a/docs/source/model/senticgcn.rst b/docs/source/model/senticgcn.rst new file mode 100644 index 0000000..7859765 --- /dev/null +++ b/docs/source/model/senticgcn.rst @@ -0,0 +1,401 @@ +Sentic-GCN: Aspect-Based Sentiment Analysis via Affective Knowledge Enhanced Graph Convolutional Networks +========================================================================================================= + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The Sentic-GCN model was proposed in `Aspect-Based Sentiment Analysis via Affective Knowledge Enhanced +Graph Convolutional Networks `_ by Liang, Bin and Su, Hang and +Gui, Lin and Cambria, Erik and Xu, Ruifeng. + +The abstract from the paper is as follows: + +*Aspect-based sentiment analysis is a fine-grained sentiment analysis task, which needs to detection the +sentiment polarity towards a given aspect. 
Recently, graph neural models over the dependency tree are
+widely applied for aspect-based sentiment analysis. Most existing works, however, they generally focus
+on learning the dependency information from contextual words to aspect words based on the dependency tree
+of the sentence, which lacks the exploitation of contextual affective knowledge with regard to the
+specific aspect. In this paper, we propose a graph convolutional network based on SenticNet to leverage
+the affective dependencies of the sentence according to the specific aspect, called Sentic GCN. To be
+specific, we explore a novel solution to construct the graph neural networks via integrating the affective
+knowledge from SenticNet to enhance the dependency graphs of sentences. Based on it, both the
+dependencies of contextual words and aspect words and the affective information between opinion words and
+the aspect are considered by the novel affective enhanced graph model. Experimental results on multiple
+public benchmark datasets illustrate that our proposed model can beat state-of-the-art methods.*
+
+In keeping with how the models' performance is calculated in the paper, this implementation saves the best
+performing model weights for both the Sentic-GCN model and the Sentic-GCN Bert model.
+
+The default datasets presented in the paper are the SemEval 2014 (Laptop, Restaurant), SemEval 2015
+(Restaurant) and SemEval 2016 (Restaurant) datasets. However, please note that the dataset format has been
+further processed from the original source; please see the dataset link below for the processed datasets.
+
+| Link to the `paper `_
+| Link to the `dataset `_
+| Link to the original `github `_
+
+
+Getting started
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The Sentic-GCN model pretrained on the SemEval 2014/2015/2016 data can be loaded and accessed with the
+following code:
+
+.. code:: python
+
+    from sgnlp.models.sentic_gcn import(
+        SenticGCNConfig,
+        SenticGCNModel,
+        SenticGCNEmbeddingConfig,
+        SenticGCNEmbeddingModel,
+        SenticGCNTokenizer,
+        SenticGCNPreprocessor,
+        SenticGCNPostprocessor,
+        download_tokenizer_files,
+    )
+
+    download_tokenizer_files(
+        "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/",
+        "senticgcn_tokenizer")
+    tokenizer = SenticGCNTokenizer.from_pretrained("senticgcn_tokenizer")
+
+    config = SenticGCNConfig.from_pretrained(
+        "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/config.json"
+    )
+    model = SenticGCNModel.from_pretrained(
+        "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/pytorch_model.bin",
+        config=config
+    )
+
+    embed_config = SenticGCNEmbeddingConfig.from_pretrained(
+        "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/config.json"
+    )
+    embed_model = SenticGCNEmbeddingModel.from_pretrained(
+        "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/pytorch_model.bin",
+        config=embed_config
+    )
+
+    preprocessor = SenticGCNPreprocessor(tokenizer=tokenizer, embedding_model=embed_model)
+    postprocessor = SenticGCNPostprocessor()
+
+    inputs = [
+        {
+            "aspect": ["Soup"],
+            "sentence": "The soup is a little salty."
+ }, + { + "aspect": ["service"], + "sentence": """Everyone that sat in the back outside agreed that it was the worst service we + had ever received.""" + }, + { + "aspect": ["location", "food"], + "sentence": """it 's located in a strip mall near the beverly center , not the greatest + location , but the food keeps me coming back for more .""" + } + ] + + processed_inputs, processed_indices = preprocessor(inputs) + raw_outputs = model(processed_indices) + + post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=raw_outputs) + + print(post_outputs[0]) + # {'sentence': ['The', 'soup', 'is', 'a', 'little', 'salty.'], + # 'aspects': [1], + # 'labels': [-1]} + + print(post_outputs[1]) + # {'sentence': ['Everyone', 'that', 'sat', 'in', 'the', 'back', 'outside', 'agreed', 'that', 'it', + # 'was', 'the', 'worst', 'service', 'we', 'had', 'ever', 'received.'], + # 'aspects': [13], + # 'labels': [-1]} + + print(post_outputs[2]) + # {'sentence': ['it', "'s", 'located', 'in', 'a', 'strip', 'mall', 'near', 'the', 'beverly', + # 'center', ',', 'not', 'the', 'greatest', 'location', ',', 'but', 'the', 'food', + # 'keeps', 'me', 'coming', 'back', 'for', 'more', '.'], + # 'aspects': [15, 19], + # 'labels': [0, 1]} + + +The Sentic-GCN Bert model pretrained on the SemEval 2014/2015/2016 data can be loaded and accessed +with the following code: + +.. code:: python + + from sgnlp.models.sentic_gcn import( + SenticGCNBertConfig, + SenticGCNBertModel, + SenticGCNBertEmbeddingConfig, + SenticGCNBertEmbeddingModel, + SenticGCNBertTokenizer, + SenticGCNBertPreprocessor, + SenticGCNBertPostprocessor + ) + + tokenizer = SenticGCNBertTokenizer.from_pretrained("bert-base-uncased") + + config = SenticGCNBertConfig.from_pretrained( + "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json" + ) + model = SenticGCNBertModel.from_pretrained( + "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", + config=config + ) + + embed_config = SenticGCNBertEmbeddingConfig.from_pretrained("bert-base-uncased") + embed_model = SenticGCNBertEmbeddingModel.from_pretrained("bert-base-uncased", + config=embed_config + ) + + preprocessor = SenticGCNBertPreprocessor(tokenizer=tokenizer, embedding_model=embed_model) + postprocessor = SenticGCNBertPostprocessor() + + inputs = [ + { + "aspect": ["Soup"], + "sentence": "The soup is a little salty." 
+        },
+        {
+            "aspect": ["service"],
+            "sentence": """Everyone that sat in the back outside agreed that it was the worst service we
+            had ever received."""
+        },
+        {
+            "aspect": ["location", "food"],
+            "sentence": """it 's located in a strip mall near the beverly center , not the greatest
+            location , but the food keeps me coming back for more ."""
+        }
+    ]
+
+    processed_inputs, processed_indices = preprocessor(inputs)
+    raw_outputs = model(processed_indices)
+
+    post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=raw_outputs)
+
+    print(post_outputs[0])
+    # {'sentence': ['The', 'soup', 'is', 'a', 'little', 'salty.'],
+    #  'aspects': [1],
+    #  'labels': [-1]}
+
+    print(post_outputs[1])
+    # {'sentence': ['Everyone', 'that', 'sat', 'in', 'the', 'back', 'outside', 'agreed', 'that', 'it',
+    #               'was', 'the', 'worst', 'service', 'we', 'had', 'ever', 'received.'],
+    #  'aspects': [13],
+    #  'labels': [-1]}
+
+    print(post_outputs[2])
+    # {'sentence': ['it', "'s", 'located', 'in', 'a', 'strip', 'mall', 'near', 'the', 'beverly',
+    #               'center', ',', 'not', 'the', 'greatest', 'location', ',', 'but', 'the', 'food',
+    #               'keeps', 'me', 'coming', 'back', 'for', 'more', '.'],
+    #  'aspects': [15, 19],
+    #  'labels': [0, 1]}
+
+
+Input
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each input entry needs to be a dictionary with the following keys:
+
++----------------------+-----------------------------------------------------------------------------------------------+
+| Key                  | Meaning                                                                                       |
++----------------------+-----------------------------------------------------------------------------------------------+
+| aspect               | A list of aspect(s) which must also be found in the sentence.                                |
++----------------------+-----------------------------------------------------------------------------------------------+
+| sentence             | A sentence which also contains all the aspects.                                               |
++----------------------+-----------------------------------------------------------------------------------------------+
+
+The value for aspect must be a list and each aspect must also exist in the sentence. If an aspect has more than one
+occurrence in the sentence, each occurrence will be treated as a separate input instance.
+
+
+Output
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The output returned from :class:`~sgnlp.models.sentic_gcn.postprocess.SenticGCNPostprocessor` and
+:class:`~sgnlp.models.sentic_gcn.postprocess.SenticGCNBertPostprocessor` consists of a list of dictionaries,
+one for each processed input entry. Each entry consists of the following:
+
+1. sentence: The input sentence in tokenized form.
+2. aspects: A list of indices denoting the position of each aspect in the tokenized input sentence.
+3. labels: A list of predictions for the aspects, in order. -1 denotes negative sentiment, 0 denotes neutral sentiment and 1 denotes positive sentiment.
+
+The logits can be accessed from the model output returned by the model.
+
+
+Training
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Dataset Preparation
+-------------------
+Prepare the training and evaluation datasets in the same format as the datasets from the
+author's repo. Please refer to the sample dataset
+`here `__ for reference.
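+
+For illustration, the raw dataset files are parsed in groups of three lines: the sentence with the aspect
+replaced by a ``$T$`` placeholder, followed by the aspect term itself, followed by the polarity label
+(-1, 0 or 1). A hypothetical entry, adapted from the soup example above, would look like this:
+
+.. code::
+
+    The $T$ is a little salty .
+    soup
+    -1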
+
+
+Config Preparation
+------------------
+
+Aspects of the training can be configured via the `sentic_gcn_config.json` and `sentic_gcn_bert_config.json`
+files. An example of the Sentic-GCN config file can be found
+`here `_
+and an example of the Sentic-GCN Bert config file can be found
+`here `_
+
++------------------------------------------+--------------------------------------------------------------------------------------+
+| Configuration key                        | Description                                                                          |
++==========================================+======================================================================================+
+| senticnet_word_file_path                 | File path to the SenticNet 5.0 file.                                                 |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_preprocessed_senticnet              | Flag to indicate if the processed SenticNet dictionary should be pickled.            |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| saved_preprocessed_senticnet_file_path   | Pickle file path for saving processed SenticNet dictionary.                          |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| spacy_pipeline                           | Spacy pre-trained pipeline to load for preprocessing.                                |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| word_vec_file_path                       | File path to word vectors file for generating embeddings. (e.g. GloVe vectors.)      |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| dataset_train                            | List of training dataset file paths.                                                 |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| dataset_test                             | List of testing dataset file paths.                                                  |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| valset_ratio                             | Ratio for train validation split.                                                    |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| model                                    | The model type to train. Either 'senticgcn' or 'senticgcnbert'.                      |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_best_model                          | Flag to indicate if the best model should be saved.                                  |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_model_path                          | Folder path to save best performing model during training.                           |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| tokenizer                                | The tokenizer type to use for dataset preprocessing.                                 |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| train_tokenizer                          | Flag to indicate if tokenizer should be trained using train and test datasets.       |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_tokenizer                           | Flag to indicate if trained tokenizer should be saved.                               |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_tokenizer_path                      | Folder path to save trained tokenizer.                                               |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| embedding_model                          | Embedding model type to use for training.                                            |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| build_embedding_model                    | Flag to indicate if embedding model should be trained on input word vectors.         |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_embedding_model                     | Flag to indicate if trained embedding model should be saved.                         |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_embedding_model_path                | Folder path to save trained embedding model.                                         |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_results                             | Flag to indicate if training results should be saved.                                |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_result_folder                       | Folder path for saving training results.                                             |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| initializer                              | torch.nn.initializer type for initializing model weights.                            |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| optimizer                                | torch.nn.optimizer type for training.                                                |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| loss_function                            | Loss function to use for training.                                                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| learning_rate                            | Learning rate for training.                                                          |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| l2reg                                    | l2reg value to set for training.                                                     |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| epochs                                   | Number of epochs to train.                                                           |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| batch_size                               | Batch size to set for dataloader.                                                    |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| log_step                                 | Print training results for every log_step.                                           |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| embed_dim                                | Size of embedding dimension.                                                         |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| hidden_dim                               | Size of hidden layer for GCN.                                                        |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| polarities_dim                           | Size of output layer.                                                                |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| dropout                                  | Dropout ratio for dropout layer.                                                     |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| seed                                     | Random seed to set prior to training.                                                |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| device                                   | torch.device to set for training.                                                    |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| repeats                                  | Number of times to repeat whole training cycle.                                      |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| patience                                 | Patience value for early stopping.                                                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| max_len                                  | Maximum length for input tensor.                                                     |
++------------------------------------------+--------------------------------------------------------------------------------------+
+
+
+Running Train Code
+------------------
+To start training the Sentic-GCN or Sentic-GCN Bert model, execute the following code:
+
+.. code:: python
+
+    from sgnlp.models.sentic_gcn.train import SenticGCNTrainer, SenticGCNBertTrainer
+    from sgnlp.models.sentic_gcn.utils import parse_args_and_load_config, set_random_seed
+
+    cfg = parse_args_and_load_config()
+    if cfg.seed is not None:
+        set_random_seed(cfg.seed)
+    trainer = SenticGCNTrainer(cfg) if cfg.model == "senticgcn" else SenticGCNBertTrainer(cfg)
+    trainer.train()
+
+
+Evaluating
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Dataset Preparation
+-------------------
+
+Refer to the training section above for a dataset example.
+
+
+Config Preparation
+------------------
+
+Aspects of the evaluation can be configured via the `sentic_gcn_config.json` and `sentic_gcn_bert_config.json`
+files. An example of the Sentic-GCN config file can be found
+`here `_
+and an example of the Sentic-GCN Bert config file can be found
+`here `_
+
++------------------------------------------+--------------------------------------------------------------------------------------+
+| Configuration key                        | Description                                                                          |
++==========================================+======================================================================================+
+| eval_args/model                          | The model type to evaluate. Either 'senticgcn' or 'senticgcnbert'.                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| eval_args/model                          | Path to model folder, could be cloud storage, local folder or HuggingFace model hub. |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| tokenizer                                | The tokenizer type to use for dataset preprocessing.                                 |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| embedding_model                          | The embedding model type to use for dataset preprocessing.                           |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| config_filename                          | Config file name to load from model folder and embedding model folder.              |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| model_filename                           | Model file name to load from model folder and embedding model folder.               |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| test_filename                            | File path to test dataset.                                                           |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| senticnet                                | File path to pickled processed senticnet.                                            |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| spacy_pipeline                           | Spacy pre-trained pipeline to load for dataset preprocessing.                        |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| result_folder                            | Folder to save evaluation results.                                                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| eval_batch_size                          | Batch size for evaluator dataloader.                                                 |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| seed                                     | Random seed to set for evaluation.                                                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| device                                   | torch.device to set for tensors.                                                     |
++------------------------------------------+--------------------------------------------------------------------------------------+
+
+
+Running the Evaluation Code
+---------------------------
+To start evaluating the Sentic-GCN or Sentic-GCN Bert model, execute the following code:
+
+.. 
code:: python + + from sgnlp.models.sentic_gcn.eval import SenticGCNEvaluator, SenticGCNBertEvaluator + from sgnlp.models.sentic_gcn.utils import parse_args_and_load_config, set_random_seed + + cfg = parse_args_and_load_config() + if cfg.seed is not None: + set_random_seed(cfg.seed) + evaluator = SenticGCNEvaluator(cfg) if cfg.model == "senticgcn" else SenticGCNBertEvaluator(cfg) + evaluator.evaluate() diff --git a/docs/source/models.rst b/docs/source/models.rst index 77771a1..c78be12 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -7,3 +7,4 @@ Models model/ufd model/emotion_entailment model/span_extraction + model/senticgcn \ No newline at end of file From e932971ee56f688c9c81aa7d2c27405aa6348c00 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 17 Jan 2022 13:20:59 +0800 Subject: [PATCH 182/201] [#41] re-add deleted files during merge --- sgnlp/models/sentic_gcn/modules/__init__.py | 0 sgnlp/models/sentic_gcn/modules/gcn.py | 23 +++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 sgnlp/models/sentic_gcn/modules/__init__.py create mode 100644 sgnlp/models/sentic_gcn/modules/gcn.py diff --git a/sgnlp/models/sentic_gcn/modules/__init__.py b/sgnlp/models/sentic_gcn/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sgnlp/models/sentic_gcn/modules/gcn.py b/sgnlp/models/sentic_gcn/modules/gcn.py new file mode 100644 index 0000000..618156e --- /dev/null +++ b/sgnlp/models/sentic_gcn/modules/gcn.py @@ -0,0 +1,23 @@ +import torch +import torch.nn as nn + + +class GraphConvolution(nn.Module): + """ + Simple GCN Layer, similar to https://arxiv.org/abs/1609.02907 + """ + + def __init__(self, in_features: torch.Tensor, out_features: torch.Tensor, bias=True) -> None: + super(GraphConvolution, self).__init__() + self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features)) + if bias: + self.bias = nn.Parameter(torch.FloatTensor(out_features)) + else: + self.register_parameter("bias", None) + + def forward(self, text: torch.Tensor, adj: torch.Tensor): + text = text.to(torch.float32) + hidden = torch.matmul(text, self.weight) + denom = torch.sum(adj, dim=2, keepdim=True) + 1 + output = torch.matmul(adj, hidden) / denom + return output + self.bias if self.bias is not None else output From 64d053d80dfb16bb6cfa24a3893fd76bb9e7f074 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 17 Jan 2022 13:46:31 +0800 Subject: [PATCH 183/201] [#41] fix broken string and add sentic_gcn to demo_api jsonnet --- jsonnet/demo-api.jsonnet | 5 +++++ jsonnet/dev-demo-api.jsonnet | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/jsonnet/demo-api.jsonnet b/jsonnet/demo-api.jsonnet index 7ab3059..93c0589 100644 --- a/jsonnet/demo-api.jsonnet +++ b/jsonnet/demo-api.jsonnet @@ -126,6 +126,11 @@ local api_names = { module_name: "ufd", image_name: "ufd", deployment_name: "ufd" + }, + "sentic_gcn": { + module_name: "sentic_gcn", + image_name: "sentic-gcn", + deployment_name: "sentic-gcn" } }; diff --git a/jsonnet/dev-demo-api.jsonnet b/jsonnet/dev-demo-api.jsonnet index 1c6a662..f15f521 100644 --- a/jsonnet/dev-demo-api.jsonnet +++ b/jsonnet/dev-demo-api.jsonnet @@ -45,7 +45,7 @@ local api_names = { "sentic_gcn": { module_name: "sentic_gcn", image_name: "sentic-gcn", - deployment_name: "sentic-gcn + deployment_name: "sentic-gcn" }, "ufd": { module_name: "ufd", From 475098c6a792bd37e187f0f7f3f9d132ac2be8e2 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 17 Jan 2022 14:17:27 +0800 Subject: [PATCH 184/201] [#41] added tags to 
indicate on-prem for CI/CD --- .gitlab-ci.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ac230d9..157d794 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -45,6 +45,8 @@ run_slow_unit_tests: generate_demo_api_yaml: stage: pre + tags: + - on-prem image: alpine:latest script: - apk add -U jsonnet @@ -55,6 +57,8 @@ generate_demo_api_yaml: generate_dev_demo_api_yaml: stage: pre + tags: + - on-prem image: alpine:latest script: - apk add -U jsonnet @@ -66,6 +70,8 @@ generate_dev_demo_api_yaml: build_and_push_demo_api: stage: build + tags: + - on-prem allow_failure: true needs: - generate_demo_api_yaml @@ -76,6 +82,8 @@ build_and_push_demo_api: build_and_push_dev_demo_api: stage: build + tags: + - on-prem allow_failure: true needs: - generate_dev_demo_api_yaml From 905ab954cf97425565f2e86949584f39b5e16480 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 17 Jan 2022 14:24:37 +0800 Subject: [PATCH 185/201] [#41] remove tags from build stage --- .gitlab-ci.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 157d794..92a7acd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -70,8 +70,6 @@ generate_dev_demo_api_yaml: build_and_push_demo_api: stage: build - tags: - - on-prem allow_failure: true needs: - generate_demo_api_yaml @@ -82,8 +80,6 @@ build_and_push_demo_api: build_and_push_dev_demo_api: stage: build - tags: - - on-prem allow_failure: true needs: - generate_dev_demo_api_yaml From dfb49f0ce8a5288c86569438a018fec26312871d Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 17 Jan 2022 14:45:58 +0800 Subject: [PATCH 186/201] [#41] try adding on-prem to jsonnet --- jsonnet/demo-api.jsonnet | 5 +++++ jsonnet/dev-demo-api.jsonnet | 2 ++ 2 files changed, 7 insertions(+) diff --git a/jsonnet/demo-api.jsonnet b/jsonnet/demo-api.jsonnet index 93c0589..dfff45d 100644 --- a/jsonnet/demo-api.jsonnet +++ b/jsonnet/demo-api.jsonnet @@ -1,6 +1,7 @@ local build_and_push_staging(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "build_and_push_staging", + tags: "on-prem", when: "manual", script: [ "echo 'Logging in to AISG Docker Registry...'", @@ -15,6 +16,7 @@ local build_and_push_staging(module_name, image_name) = { local build_and_push_docs_staging() = { image: "python:3.8.11-slim", stage: "build_and_push_staging", + tags: "on-prem", when: "manual", script: [ "echo 'Building Sphinx docs'", @@ -42,6 +44,7 @@ local build_and_push_docs_staging() = { local retag_and_push_production(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "retag_and_push_production", + tags: "on-prem", only: { refs: ["main"] }, @@ -62,6 +65,7 @@ local retag_and_push_production(module_name, image_name) = { local restart_kubernetes_staging(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_staging", + tags: "on-prem", when: "manual", needs: ["%s_build_and_push_staging" % module_name], script: [ @@ -74,6 +78,7 @@ local restart_kubernetes_staging(module_name, deployment_name) = { local restart_kubernetes_production(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_production", + tags: "on-prem", only: { refs: ["main"] }, diff --git a/jsonnet/dev-demo-api.jsonnet b/jsonnet/dev-demo-api.jsonnet index f15f521..ebe5482 100644 --- a/jsonnet/dev-demo-api.jsonnet +++ 
b/jsonnet/dev-demo-api.jsonnet @@ -1,6 +1,7 @@ local build_and_push_staging(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "build_and_push_staging", + tags: "on-prem", when: "manual", script: [ "echo 'Logging in to AISG Docker Registry...'", @@ -16,6 +17,7 @@ local build_and_push_staging(module_name, image_name) = { local restart_kubernetes_staging(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_staging", + tags: "on-prem", when: "manual", needs: ["%s_build_and_push_staging" % module_name], script: [ From 7f0d127d1e22bc35750878dc30babc6f0349f078 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 17 Jan 2022 14:56:35 +0800 Subject: [PATCH 187/201] Revert "[#41] try adding on-prem to jsonnet" This reverts commit dfb49f0ce8a5288c86569438a018fec26312871d. --- jsonnet/demo-api.jsonnet | 5 ----- jsonnet/dev-demo-api.jsonnet | 2 -- 2 files changed, 7 deletions(-) diff --git a/jsonnet/demo-api.jsonnet b/jsonnet/demo-api.jsonnet index dfff45d..93c0589 100644 --- a/jsonnet/demo-api.jsonnet +++ b/jsonnet/demo-api.jsonnet @@ -1,7 +1,6 @@ local build_and_push_staging(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "build_and_push_staging", - tags: "on-prem", when: "manual", script: [ "echo 'Logging in to AISG Docker Registry...'", @@ -16,7 +15,6 @@ local build_and_push_staging(module_name, image_name) = { local build_and_push_docs_staging() = { image: "python:3.8.11-slim", stage: "build_and_push_staging", - tags: "on-prem", when: "manual", script: [ "echo 'Building Sphinx docs'", @@ -44,7 +42,6 @@ local build_and_push_docs_staging() = { local retag_and_push_production(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "retag_and_push_production", - tags: "on-prem", only: { refs: ["main"] }, @@ -65,7 +62,6 @@ local retag_and_push_production(module_name, image_name) = { local restart_kubernetes_staging(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_staging", - tags: "on-prem", when: "manual", needs: ["%s_build_and_push_staging" % module_name], script: [ @@ -78,7 +74,6 @@ local restart_kubernetes_staging(module_name, deployment_name) = { local restart_kubernetes_production(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_production", - tags: "on-prem", only: { refs: ["main"] }, diff --git a/jsonnet/dev-demo-api.jsonnet b/jsonnet/dev-demo-api.jsonnet index ebe5482..f15f521 100644 --- a/jsonnet/dev-demo-api.jsonnet +++ b/jsonnet/dev-demo-api.jsonnet @@ -1,7 +1,6 @@ local build_and_push_staging(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "build_and_push_staging", - tags: "on-prem", when: "manual", script: [ "echo 'Logging in to AISG Docker Registry...'", @@ -17,7 +16,6 @@ local build_and_push_staging(module_name, image_name) = { local restart_kubernetes_staging(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_staging", - tags: "on-prem", when: "manual", needs: ["%s_build_and_push_staging" % module_name], script: [ From c67b04840ae0ab592a85cbd0e2307bc1d2666b1a Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 17 Jan 2022 15:06:28 +0800 Subject: [PATCH 188/201] [#41] 
try adding tags with proper syntax in jsonnet --- jsonnet/demo-api.jsonnet | 15 +++++++++++++++ jsonnet/dev-demo-api.jsonnet | 6 ++++++ 2 files changed, 21 insertions(+) diff --git a/jsonnet/demo-api.jsonnet b/jsonnet/demo-api.jsonnet index 93c0589..e610592 100644 --- a/jsonnet/demo-api.jsonnet +++ b/jsonnet/demo-api.jsonnet @@ -1,6 +1,9 @@ local build_and_push_staging(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "build_and_push_staging", + tags: [ + "on-prem", + ], when: "manual", script: [ "echo 'Logging in to AISG Docker Registry...'", @@ -15,6 +18,9 @@ local build_and_push_staging(module_name, image_name) = { local build_and_push_docs_staging() = { image: "python:3.8.11-slim", stage: "build_and_push_staging", + tags: [ + "on-prem", + ], when: "manual", script: [ "echo 'Building Sphinx docs'", @@ -42,6 +48,9 @@ local build_and_push_docs_staging() = { local retag_and_push_production(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "retag_and_push_production", + tags: [ + "on-prem", + ], only: { refs: ["main"] }, @@ -62,6 +71,9 @@ local retag_and_push_production(module_name, image_name) = { local restart_kubernetes_staging(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_staging", + tags: [ + "on-prem", + ], when: "manual", needs: ["%s_build_and_push_staging" % module_name], script: [ @@ -74,6 +86,9 @@ local restart_kubernetes_staging(module_name, deployment_name) = { local restart_kubernetes_production(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_production", + tags: [ + "on-prem", + ], only: { refs: ["main"] }, diff --git a/jsonnet/dev-demo-api.jsonnet b/jsonnet/dev-demo-api.jsonnet index f15f521..558bd20 100644 --- a/jsonnet/dev-demo-api.jsonnet +++ b/jsonnet/dev-demo-api.jsonnet @@ -1,6 +1,9 @@ local build_and_push_staging(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "build_and_push_staging", + tags: [ + "on-prem", + ], when: "manual", script: [ "echo 'Logging in to AISG Docker Registry...'", @@ -16,6 +19,9 @@ local build_and_push_staging(module_name, image_name) = { local restart_kubernetes_staging(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_staging", + tags: [ + "on-prem", + ], when: "manual", needs: ["%s_build_and_push_staging" % module_name], script: [ From 34af1d556546941557b935e425b04f6346222b8d Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 17 Jan 2022 15:23:42 +0800 Subject: [PATCH 189/201] [#41] try remove on-prem tag for pre stage --- .gitlab-ci.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 92a7acd..ac230d9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -45,8 +45,6 @@ run_slow_unit_tests: generate_demo_api_yaml: stage: pre - tags: - - on-prem image: alpine:latest script: - apk add -U jsonnet @@ -57,8 +55,6 @@ generate_demo_api_yaml: generate_dev_demo_api_yaml: stage: pre - tags: - - on-prem image: alpine:latest script: - apk add -U jsonnet From a842cbc49e1747714cc65c7849d8e5a1d80b73f2 Mon Sep 17 00:00:00 2001 From: Raymond Ng Date: Mon, 17 Jan 2022 15:26:12 +0800 Subject: [PATCH 190/201] Revert "[#41] try remove on-prem tag for pre stage" This reverts commit 34af1d556546941557b935e425b04f6346222b8d. 
---
 .gitlab-ci.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ac230d9..92a7acd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -45,6 +45,8 @@ run_slow_unit_tests:

 generate_demo_api_yaml:
   stage: pre
+  tags:
+    - on-prem
   image: alpine:latest
   script:
     - apk add -U jsonnet
@@ -55,6 +57,8 @@ generate_demo_api_yaml:

 generate_dev_demo_api_yaml:
   stage: pre
+  tags:
+    - on-prem
   image: alpine:latest
   script:
     - apk add -U jsonnet

From a0b8eaf923df49849be295b4aa3547fb0c858d44 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 15:53:50 +0800
Subject: [PATCH 191/201] [#41] try fix CI/CD pipeline with proper tags

---
 .gitlab-ci.yml               | 4 ++--
 jsonnet/demo-api.jsonnet     | 5 +++++
 jsonnet/dev-demo-api.jsonnet | 2 ++
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 92a7acd..ed78ce3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -46,7 +46,7 @@ run_slow_unit_tests:
 generate_demo_api_yaml:
   stage: pre
   tags:
-    - on-prem
+    - dind
   image: alpine:latest
   script:
     - apk add -U jsonnet
@@ -58,7 +58,7 @@ generate_demo_api_yaml:
 generate_dev_demo_api_yaml:
   stage: pre
   tags:
-    - on-prem
+    - dind
   image: alpine:latest
   script:
     - apk add -U jsonnet
diff --git a/jsonnet/demo-api.jsonnet b/jsonnet/demo-api.jsonnet
index e610592..90a1185 100644
--- a/jsonnet/demo-api.jsonnet
+++ b/jsonnet/demo-api.jsonnet
@@ -3,6 +3,7 @@ local build_and_push_staging(module_name, image_name) = {
   stage: "build_and_push_staging",
   tags: [
     "on-prem",
+    "dind",
   ],
   when: "manual",
   script: [
@@ -20,6 +21,7 @@ local build_and_push_docs_staging() = {
   stage: "build_and_push_staging",
   tags: [
     "on-prem",
+    "dind",
   ],
   when: "manual",
   script: [
@@ -50,6 +52,7 @@ local retag_and_push_production(module_name, image_name) = {
   stage: "retag_and_push_production",
   tags: [
     "on-prem",
+    "dind",
   ],
   only: {
     refs: ["main"]
@@ -73,6 +76,7 @@ local restart_kubernetes_staging(module_name, deployment_name) = {
   stage: "restart_kubernetes_staging",
   tags: [
     "on-prem",
+    "dind",
   ],
   when: "manual",
   needs: ["%s_build_and_push_staging" % module_name],
@@ -88,6 +92,7 @@ local restart_kubernetes_production(module_name, deployment_name) = {
   stage: "restart_kubernetes_production",
   tags: [
     "on-prem",
+    "dind",
   ],
   only: {
     refs: ["main"]
diff --git a/jsonnet/dev-demo-api.jsonnet b/jsonnet/dev-demo-api.jsonnet
index 558bd20..3239732 100644
--- a/jsonnet/dev-demo-api.jsonnet
+++ b/jsonnet/dev-demo-api.jsonnet
@@ -3,6 +3,7 @@ local build_and_push_staging(module_name, image_name) = {
   stage: "build_and_push_staging",
   tags: [
     "on-prem",
+    "dind",
   ],
   when: "manual",
   script: [
@@ -21,6 +22,7 @@ local restart_kubernetes_staging(module_name, deployment_name) = {
   stage: "restart_kubernetes_staging",
   tags: [
     "on-prem",
+    "dind",
   ],
   when: "manual",
   needs: ["%s_build_and_push_staging" % module_name],

From 578f423aa462ef97c38861a092b00697e9f39a50 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 15:56:27 +0800
Subject: [PATCH 192/201] [#41] try fix pipeline adding on-prem tags

---
 .gitlab-ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ed78ce3..95c0354 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -46,6 +46,7 @@ run_slow_unit_tests:
 generate_demo_api_yaml:
   stage: pre
   tags:
+    - on-prem
     - dind
   image: alpine:latest
   script:
@@ -58,6 +59,7 @@ generate_demo_api_yaml:
 generate_dev_demo_api_yaml:
   stage: pre
   tags:
+    - on-prem
     - dind
   image: alpine:latest
   script:
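Note: the tag churn in the last few commits reflects how GitLab assigns jobs to runners: a job is picked up only by a runner registered with every tag the job lists, so stacking `on-prem` and `dind` shrinks the pool of eligible runners. A small illustration of that matching rule (the runner names are hypothetical; this sketches documented behaviour and is not repository code):

    job_tags = {"on-prem", "dind"}
    runners = {
        "runner-a": {"on-prem"},          # on-prem only: not eligible
        "runner-b": {"dind"},             # dind only: not eligible
        "runner-c": {"on-prem", "dind"},  # carries both tags: eligible
    }

    # A runner qualifies only if its tag set is a superset of the job's tags.
    eligible = [name for name, tags in runners.items() if job_tags <= tags]
    print(eligible)  # ['runner-c']; with no such runner the job stays pending

The next commit removes `on-prem` again and keeps only `dind`.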
From 2f75e621bd12583f5928ff60afd246c5e66cebe7 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 16:09:26 +0800
Subject: [PATCH 193/201] [#41] remove on-prem tags

---
 .gitlab-ci.yml               | 2 --
 jsonnet/demo-api.jsonnet     | 5 -----
 jsonnet/dev-demo-api.jsonnet | 2 --
 3 files changed, 9 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 95c0354..ed78ce3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -46,7 +46,6 @@ run_slow_unit_tests:
 generate_demo_api_yaml:
   stage: pre
   tags:
-    - on-prem
     - dind
   image: alpine:latest
   script:
@@ -59,7 +58,6 @@ generate_demo_api_yaml:
 generate_dev_demo_api_yaml:
   stage: pre
   tags:
-    - on-prem
     - dind
   image: alpine:latest
   script:
diff --git a/jsonnet/demo-api.jsonnet b/jsonnet/demo-api.jsonnet
index 90a1185..d31f99c 100644
--- a/jsonnet/demo-api.jsonnet
+++ b/jsonnet/demo-api.jsonnet
@@ -2,7 +2,6 @@ local build_and_push_staging(module_name, image_name) = {
   image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest",
   stage: "build_and_push_staging",
   tags: [
-    "on-prem",
     "dind",
   ],
   when: "manual",
@@ -20,7 +19,6 @@ local build_and_push_docs_staging() = {
   image: "python:3.8.11-slim",
   stage: "build_and_push_staging",
   tags: [
-    "on-prem",
     "dind",
   ],
   when: "manual",
@@ -51,7 +49,6 @@ local retag_and_push_production(module_name, image_name) = {
   image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest",
   stage: "retag_and_push_production",
   tags: [
-    "on-prem",
     "dind",
   ],
   only: {
@@ -75,7 +72,6 @@ local restart_kubernetes_staging(module_name, deployment_name) = {
   image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest",
   stage: "restart_kubernetes_staging",
   tags: [
-    "on-prem",
     "dind",
   ],
   when: "manual",
@@ -91,7 +87,6 @@ local restart_kubernetes_production(module_name, deployment_name) = {
   image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest",
   stage: "restart_kubernetes_production",
   tags: [
-    "on-prem",
     "dind",
   ],
   only: {
diff --git a/jsonnet/dev-demo-api.jsonnet b/jsonnet/dev-demo-api.jsonnet
index 3239732..fe3e757 100644
--- a/jsonnet/dev-demo-api.jsonnet
+++ b/jsonnet/dev-demo-api.jsonnet
@@ -2,7 +2,6 @@ local build_and_push_staging(module_name, image_name) = {
   image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest",
   stage: "build_and_push_staging",
   tags: [
-    "on-prem",
     "dind",
   ],
   when: "manual",
@@ -21,7 +20,6 @@ local restart_kubernetes_staging(module_name, deployment_name) = {
   image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest",
   stage: "restart_kubernetes_staging",
   tags: [
-    "on-prem",
     "dind",
   ],
   when: "manual",

From b2d3f768bcba9d4b1bd7d5f069373cd68dc8fa3d Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 16:16:30 +0800
Subject: [PATCH 194/201] [#41] try adding dind tags for unit tests stage

---
 .gitlab-ci.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ed78ce3..870c72d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -16,6 +16,8 @@ cache:

 run_non_slow_unit_tests:
   stage: unit_test_non_slow
+  tags:
+    - dind
   before_script:
     - python -m venv .venv
     - source .venv/bin/activate
@@ -29,6 +31,8 @@ run_non_slow_unit_tests:

 run_slow_unit_tests:
   stage: unit_test_slow
+  tags:
+    - dind
   when: manual
   before_script:
     - python -m venv .venv

From 8088c2219633f121db470e75b4d21bb172c6b2cc Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 16:42:48 +0800
Subject: [PATCH 195/201] [#41] try adding on-prem tag

---
 jsonnet/dev-demo-api.jsonnet | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/jsonnet/dev-demo-api.jsonnet b/jsonnet/dev-demo-api.jsonnet
index fe3e757..3239732 100644
--- a/jsonnet/dev-demo-api.jsonnet
+++ b/jsonnet/dev-demo-api.jsonnet
@@ -2,6 +2,7 @@ local build_and_push_staging(module_name, image_name) = {
   image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest",
   stage: "build_and_push_staging",
   tags: [
+    "on-prem",
     "dind",
   ],
   when: "manual",
@@ -20,6 +21,7 @@ local restart_kubernetes_staging(module_name, deployment_name) = {
   image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest",
   stage: "restart_kubernetes_staging",
   tags: [
+    "on-prem",
     "dind",
   ],
   when: "manual",

From 254384a611f054c47a896080b39eccbad6e24509 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 17:38:43 +0800
Subject: [PATCH 196/201] [#41] add on-prem tag to prod config

---
 jsonnet/demo-api.jsonnet | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/jsonnet/demo-api.jsonnet b/jsonnet/demo-api.jsonnet
index d31f99c..90a1185 100644
--- a/jsonnet/demo-api.jsonnet
+++ b/jsonnet/demo-api.jsonnet
@@ -2,6 +2,7 @@ local build_and_push_staging(module_name, image_name) = {
   image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest",
   stage: "build_and_push_staging",
   tags: [
+    "on-prem",
     "dind",
   ],
   when: "manual",
@@ -19,6 +20,7 @@ local build_and_push_docs_staging() = {
   image: "python:3.8.11-slim",
   stage: "build_and_push_staging",
   tags: [
+    "on-prem",
     "dind",
   ],
   when: "manual",
@@ -49,6 +51,7 @@ local retag_and_push_production(module_name, image_name) = {
   image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest",
   stage: "retag_and_push_production",
   tags: [
+    "on-prem",
     "dind",
   ],
   only: {
@@ -72,6 +75,7 @@ local restart_kubernetes_staging(module_name, deployment_name) = {
   image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest",
   stage: "restart_kubernetes_staging",
   tags: [
+    "on-prem",
     "dind",
   ],
   when: "manual",
@@ -87,6 +91,7 @@ local restart_kubernetes_production(module_name, deployment_name) = {
   image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest",
   stage: "restart_kubernetes_production",
   tags: [
+    "on-prem",
     "dind",
   ],
   only: {
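Note: this commit settles the tag layout. The generated staging and production jobs carry both `on-prem` and `dind`, while the pre-stage jobs in .gitlab-ci.yml keep only `dind`. A hypothetical sanity check on the rendered configuration; it assumes the `jsonnet` CLI installed in the pre stage and that the template evaluates to a top-level object of job definitions, neither of which is shown verbatim in this series:

    import json
    import subprocess

    # The jsonnet CLI emits JSON, which is also valid YAML for GitLab CI.
    rendered = subprocess.run(
        ["jsonnet", "jsonnet/demo-api.jsonnet"],
        capture_output=True, text=True, check=True,
    ).stdout

    # Every generated job that declares tags should carry both runner tags.
    for job_name, job in json.loads(rendered).items():
        if isinstance(job, dict) and "tags" in job:
            assert job["tags"] == ["on-prem", "dind"], job_name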
From 84d58bbfa00cd4593e499dc559ed51b1f819cbfd Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 18:15:11 +0800
Subject: [PATCH 197/201] [#41] remove list from output

---
 demo_api/sentic_gcn/api.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py
index 816f9d0..1318484 100644
--- a/demo_api/sentic_gcn/api.py
+++ b/demo_api/sentic_gcn/api.py
@@ -5,41 +5,42 @@
     SenticGCNBertModel,
     SenticGCNBertConfig,
     SenticGCNBertPreprocessor,
-    SenticGCNBertPostprocessor
-    )
+    SenticGCNBertPostprocessor,
+)

 app = create_api(app_name=__name__, model_card_path="model_card/sentic_gcn.json")

 preprocessor = SenticGCNBertPreprocessor(
-    senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle',
-    device='cpu'
+    senticnet="https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", device="cpu"
 )

 postprocessor = SenticGCNBertPostprocessor()

 # Load model
-config = SenticGCNBertConfig.from_pretrained('https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json')
+config = SenticGCNBertConfig.from_pretrained(
+    "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json"
+)
 model = SenticGCNBertModel.from_pretrained(
-    'https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin',
-    config=config
+    "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", config=config
 )

 app.logger.info("Preprocessing pipeline and model initialization complete.")

+
 @app.route("/predict", methods=["POST"])
 def predict():
     req_body = request.get_json()
-
+
     # Preprocessing
     processed_inputs, processed_indices = preprocessor(req_body)
     outputs = model(processed_indices)
-
+
     # Postprocessing
     post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs)

-    return jsonify(post_outputs)
+    return jsonify(post_outputs[0])


 if __name__ == "__main__":
-    app.run()
\ No newline at end of file
+    app.run()

From 038e660077cfb97bd92c16369c06e4f9fe029372 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 18:26:27 +0800
Subject: [PATCH 198/201] [#41] update text for model weights and config

---
 demo_api/sentic_gcn/model_card/sentic_gcn.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/demo_api/sentic_gcn/model_card/sentic_gcn.json b/demo_api/sentic_gcn/model_card/sentic_gcn.json
index df496eb..6bfe52d 100644
--- a/demo_api/sentic_gcn/model_card/sentic_gcn.json
+++ b/demo_api/sentic_gcn/model_card/sentic_gcn.json
@@ -20,11 +20,11 @@
     },
     "trainingTime": "Sentic-GCN: ~10 mins for ~35 epochs (early stopped), Sentic-GCN Bert: ~1 hr for ~40 epochs (early stopped) for SemEval14-Laptop/SemEval14-Restaurant/SemEval15-Restaurant/SemEval16-Restaurant datasets.",
     "modelWeights": {
-        "text": "Refer to documentation for details.",
+        "text": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin",
         "url": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin"
     },
     "modelConfig": {
-        "text": "Refer to documentation for details.",
+        "text": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json",
         "url": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json"
     },
     "modelInput": "Aspect (word), sentence containing the aspect",

From d1a366f7012508f88011874f02cfd7d9a4b3a569 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 19:19:56 +0800
Subject: [PATCH 199/201] [#41] standardise aspects key from input batch

---
 demo_api/sentic_gcn/api.py            |  2 +-
 docs/source/model/senticgcn.rst       | 12 ++++++------
 sgnlp/models/sentic_gcn/preprocess.py |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py
index 1318484..48511cc 100644
--- a/demo_api/sentic_gcn/api.py
+++ b/demo_api/sentic_gcn/api.py
@@ -33,7 +33,7 @@ def predict():
     req_body = request.get_json()

     # Preprocessing
-    processed_inputs, processed_indices = preprocessor(req_body)
+    processed_inputs, processed_indices = preprocessor([req_body])
     outputs = model(processed_indices)

     # Postprocessing
diff --git a/docs/source/model/senticgcn.rst b/docs/source/model/senticgcn.rst
index 7859765..799a0e6 100644
--- a/docs/source/model/senticgcn.rst
+++ b/docs/source/model/senticgcn.rst
@@ -78,16 +78,16 @@ following code:
     inputs = [
         {
-            "aspect": ["Soup"],
+            "aspects": ["Soup"],
             "sentence": "The soup is a little salty."
         },
         {
-            "aspect": ["service"],
+            "aspects": ["service"],
             "sentence": """Everyone that sat in the back outside agreed
                 that it was the worst service we had ever received."""
         },
         {
-            "aspect": ["location", "food"],
+            "aspects": ["location", "food"],
             "sentence": """it 's located in a strip mall near the beverly center ,
                 not the greatest location , but the food keeps me coming back
                 for more ."""
         }
@@ -152,16 +152,16 @@ with the following code:
     inputs = [
         {
-            "aspect": ["Soup"],
+            "aspects": ["Soup"],
             "sentence": "The soup is a little salty."
         },
         {
-            "aspect": ["service"],
+            "aspects": ["service"],
             "sentence": """Everyone that sat in the back outside agreed
                 that it was the worst service we had ever received."""
         },
         {
-            "aspect": ["location", "food"],
+            "aspects": ["location", "food"],
             "sentence": """it 's located in a strip mall near the beverly center ,
                 not the greatest location , but the food keeps me coming back
                 for more ."""
         }
diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py
index c1d7858..678bddc 100644
--- a/sgnlp/models/sentic_gcn/preprocess.py
+++ b/sgnlp/models/sentic_gcn/preprocess.py
@@ -298,7 +298,7 @@ def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) ->
         for batch in data_batch:
             full_text = batch["sentence"].lower().strip()
             full_text_tokens = batch["sentence"].split()
-            for aspect in batch["aspect"]:
+            for aspect in batch["aspects"]:
                 aspect = aspect.lower().strip()
                 aspect_token_indexes = [
                     idx
@@ -481,7 +481,7 @@ def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) ->
         for batch in data_batch:
             full_text = batch["sentence"].lower().strip()
             full_text_tokens = batch["sentence"].split()
-            for aspect in batch["aspect"]:
+            for aspect in batch["aspects"]:
                 aspect = aspect.lower().strip()
                 aspect_token_indexes = [
                     idx
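Note: taken together, the API changes above mean the /predict endpoint accepts a single JSON object keyed by the plural "aspects", wraps it in a list for the preprocessor, and returns the first postprocessed entry. A hypothetical smoke test against a locally running instance (the host, port, and exact response schema are assumptions, not part of these patches):

    import requests

    # One request object in the standardised shape: plural "aspects" key
    # holding a list of aspect terms found in the sentence.
    payload = {
        "aspects": ["service"],
        "sentence": "Everyone that sat in the back outside agreed that it was the worst service we had ever received.",
    }

    resp = requests.post("http://localhost:5000/predict", json=payload)
    resp.raise_for_status()
    print(resp.json())  # a single result object for the single input sent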
From d82e7791639da7d79ed4bc6036dfc1328df07029 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 19:32:38 +0800
Subject: [PATCH 200/201] [#41] fix unit tests for aspect changes

---
 tests/sentic_gcn/test_sentic_gcn_preprocess.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/sentic_gcn/test_sentic_gcn_preprocess.py b/tests/sentic_gcn/test_sentic_gcn_preprocess.py
index 4e202e9..bee1b10 100644
--- a/tests/sentic_gcn/test_sentic_gcn_preprocess.py
+++ b/tests/sentic_gcn/test_sentic_gcn_preprocess.py
@@ -28,13 +28,13 @@ def setUp(self) -> None:
         test_embed_config = SenticGCNEmbeddingConfig()
         self.test_embed_model = SenticGCNEmbeddingModel(config=test_embed_config)
         self.test_inputs = [
-            {"aspect": ["Soup"], "sentence": "Soup is tasty but soup is a little salty. Salty funkysoup."},  # 1, -1
+            {"aspects": ["Soup"], "sentence": "Soup is tasty but soup is a little salty. Salty funkysoup."},  # 1, -1
             {
-                "aspect": ["service"],
+                "aspects": ["service"],
                 "sentence": "Everyone that sat in the back outside agreed that it was the worst service we had ever received.",
             },  # -1
             {
-                "aspect": ["location", "food"],
+                "aspects": ["location", "food"],
                 "sentence": "it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .",
             },  # 0, 1
         ]
@@ -109,13 +109,13 @@ def setUp(self) -> None:
         test_embed_config = SenticGCNBertEmbeddingConfig()
         self.test_embed_model = SenticGCNBertEmbeddingModel(config=test_embed_config)
         self.test_inputs = [
-            {"aspect": ["Soup"], "sentence": "Soup is tasty but soup is a little salty. Salty funkysoup."},  # 1, -1
+            {"aspects": ["Soup"], "sentence": "Soup is tasty but soup is a little salty. Salty funkysoup."},  # 1, -1
             {
-                "aspect": ["service"],
+                "aspects": ["service"],
                 "sentence": "Everyone that sat in the back outside agreed that it was the worst service we had ever received.",
             },  # -1
             {
-                "aspect": ["location", "food"],
+                "aspects": ["location", "food"],
                 "sentence": "it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .",
             },  # 0, 1
         ]

From b123236c19f6d6e6b817329c960646dd54eb0fd2 Mon Sep 17 00:00:00 2001
From: Raymond Ng
Date: Mon, 17 Jan 2022 21:26:56 +0800
Subject: [PATCH 201/201] [#41] update usage with latest examples

---
 demo_api/sentic_gcn/usage.py | 38 ++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/demo_api/sentic_gcn/usage.py b/demo_api/sentic_gcn/usage.py
index 7284df7..8710d4a 100644
--- a/demo_api/sentic_gcn/usage.py
+++ b/demo_api/sentic_gcn/usage.py
@@ -1,43 +1,43 @@
 from sgnlp.models.sentic_gcn import (
-    SenticGCNBertModel,
-    SenticGCNBertPreprocessor,
+    SenticGCNBertModel,
+    SenticGCNBertPreprocessor,
     SenticGCNBertConfig,
-    SenticGCNBertPostprocessor
-    )
+    SenticGCNBertPostprocessor,
+)

 preprocessor = SenticGCNBertPreprocessor(
-    senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle',
-    device='cpu'
+    senticnet="https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", device="cpu"
 )

 postprocessor = SenticGCNBertPostprocessor()

 # Load model
-config = SenticGCNBertConfig.from_pretrained('https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json')
+config = SenticGCNBertConfig.from_pretrained(
+    "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json"
+)
 model = SenticGCNBertModel.from_pretrained(
-    'https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin',
-    config=config
+    "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", config=config
 )

 # Inputs
 inputs = [
     {
-        "aspect": ["Soup"],
-        "sentence": "Soup is tasty but soup is a little salty. Salty soup."
-    },  # 1, -1
+        "aspects": ["service", "decor"],
+        "sentence": "Everything is always cooked to perfection , the service is excellent, the decor cool and understated.",
+    },
     {
-        "aspect": ["service"],
-        "sentence": "Everyone that sat in the back outside agreed that it was the worst service we had ever received."
-    },  # -1
+        "aspects": ["food", "portions"],
+        "sentence": "The food was lousy - too sweet or too salty and the portions tiny.",
+    },
     {
-        "aspect": ["location", "food"],
-        "sentence": "it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more ."
-    }  # 0, 1
+        "aspects": ["service"],
+        "sentence": "To sum it up : service varies from good to mediorce , depending on which waiter you get ; generally it is just average ok .",
+    },
 ]

 processed_inputs, processed_indices = preprocessor(inputs)
 outputs = model(processed_indices)

 # Postprocessing
-post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs)
\ No newline at end of file
+post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs)
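Note: the per-aspect comments in the test inputs above (# 1, -1 and so on) indicate one polarity label per aspect, with -1/0/1 plausibly reading as negative/neutral/positive. A short follow-on sketch for inspecting the usage example's results; the exact schema of post_outputs is not spelled out in these patches, so the loop simply pretty-prints whatever each entry contains:

    import json

    # One postprocessed entry is expected per input sentence, pairing each
    # aspect with its predicted polarity.
    for entry in post_outputs:
        print(json.dumps(entry, indent=2, default=str))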