diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ac230d9..870c72d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -16,6 +16,8 @@ cache: run_non_slow_unit_tests: stage: unit_test_non_slow + tags: + - dind before_script: - python -m venv .venv - source .venv/bin/activate @@ -29,6 +31,8 @@ run_non_slow_unit_tests: run_slow_unit_tests: stage: unit_test_slow + tags: + - dind when: manual before_script: - python -m venv .venv @@ -45,6 +49,8 @@ run_slow_unit_tests: generate_demo_api_yaml: stage: pre + tags: + - dind image: alpine:latest script: - apk add -U jsonnet @@ -55,6 +61,8 @@ generate_demo_api_yaml: generate_dev_demo_api_yaml: stage: pre + tags: + - dind image: alpine:latest script: - apk add -U jsonnet diff --git a/demo_api/sentic_gcn/Dockerfile b/demo_api/sentic_gcn/Dockerfile new file mode 100644 index 0000000..a7c9286 --- /dev/null +++ b/demo_api/sentic_gcn/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.8-buster + +COPY . /demo_api + +WORKDIR /demo_api/sentic_gcn + +RUN pip install -r requirements.txt +RUN python -m download_pretrained + +CMD PYTHONPATH=../../ gunicorn -c ../gunicorn.conf.py \ No newline at end of file diff --git a/demo_api/sentic_gcn/api.py b/demo_api/sentic_gcn/api.py new file mode 100644 index 0000000..48511cc --- /dev/null +++ b/demo_api/sentic_gcn/api.py @@ -0,0 +1,46 @@ +from flask import request, jsonify + +from demo_api.common import create_api +from sgnlp.models.sentic_gcn import ( + SenticGCNBertModel, + SenticGCNBertConfig, + SenticGCNBertPreprocessor, + SenticGCNBertPostprocessor, +) + +app = create_api(app_name=__name__, model_card_path="model_card/sentic_gcn.json") + +preprocessor = SenticGCNBertPreprocessor( + senticnet="https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", device="cpu" +) + +postprocessor = SenticGCNBertPostprocessor() + +# Load model +config = SenticGCNBertConfig.from_pretrained( + "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json" +) + +model = SenticGCNBertModel.from_pretrained( + "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", config=config +) + +app.logger.info("Preprocessing pipeline and model initialization complete.") + + +@app.route("/predict", methods=["POST"]) +def predict(): + req_body = request.get_json() + + # Preprocessing + processed_inputs, processed_indices = preprocessor([req_body]) + outputs = model(processed_indices) + + # Postprocessing + post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs) + + return jsonify(post_outputs[0]) + + +if __name__ == "__main__": + app.run() diff --git a/demo_api/sentic_gcn/dev.Dockerfile b/demo_api/sentic_gcn/dev.Dockerfile new file mode 100644 index 0000000..3158cca --- /dev/null +++ b/demo_api/sentic_gcn/dev.Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.8-buster + +COPY ./demo_api /demo_api +COPY ./sgnlp /sgnlp +COPY ./setup.py /setup.py +COPY ./README.md /README.md + +RUN pip install -r /demo_api/sentic_gcn/requirements_dev.txt + +WORKDIR /demo_api/sentic_gcn + +RUN python -m download_pretrained + +CMD PYTHONPATH=../../ gunicorn -c ../gunicorn.conf.py \ No newline at end of file diff --git a/demo_api/sentic_gcn/download_pretrained.py b/demo_api/sentic_gcn/download_pretrained.py new file mode 100644 index 0000000..8300acc --- /dev/null +++ b/demo_api/sentic_gcn/download_pretrained.py @@ -0,0 +1,22 @@ +"""Run this script during build time to download the pretrained models and relevant files first""" + +from sgnlp.models.sentic_gcn import ( + SenticGCNConfig, 
+    SenticGCNBertConfig, SenticGCNBertTokenizer,
+    SenticGCNBertModel,
+    SenticGCNBertPreprocessor
+)
+
+# Downloads preprocessor, pretrained config, tokenizer, model
+preprocessor = SenticGCNBertPreprocessor(
+    senticnet='https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle',
+    device='cpu'
+)
+tokenizer = SenticGCNBertTokenizer.from_pretrained("bert-base-uncased")
+config = SenticGCNBertConfig.from_pretrained(
+    "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json"
+)
+model = SenticGCNBertModel.from_pretrained(
+    "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin",
+    config=config
+)
\ No newline at end of file
diff --git a/demo_api/sentic_gcn/model_card/sentic_gcn.json b/demo_api/sentic_gcn/model_card/sentic_gcn.json
new file mode 100644
index 0000000..6bfe52d
--- /dev/null
+++ b/demo_api/sentic_gcn/model_card/sentic_gcn.json
@@ -0,0 +1,44 @@
+{
+    "name": "Sentic GCN",
+    "languages": "English",
+    "description": "This is a neural network that utilises LSTM and GCN to detect the sentiment polarities of different aspects in the same sentence. The models used correspond to the associated models described in the paper.",
+    "paper": {
+        "text": "Bin Liang, Hang Su, Lin Gui, Erik Cambria, Ruifeng Xu. (2021). Aspect-based sentiment analysis via affective knowledge enhanced graph convolutional networks. Knowledge-Based Systems, 2021: 107643.",
+        "url": "https://github.com/BinLiang-NLP/Sentic-GCN"
+    },
+    "trainingDataset": {
+        "text": "acl-14-short-data, semeval14, semeval15, semeval16",
+        "url": "https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/datasets"
+    },
+    "evaluationDataset": {
+        "text": "acl-14-short-data, semeval14, semeval15, semeval16",
+        "url": "https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/datasets"
+    },
+    "evaluationScores": "Sentic-GCN: 94.36% Acc, 94.43% F1 (SemEval14-Laptop), 94.55% Acc, 91.99% F1 (SemEval14-Restaurant), 95.02% Acc, 93.22% F1 (SemEval15-Restaurant), 96.75% Acc, 93.55% F1 (SemEval16-Restaurant). Sentic-GCN Bert: 99.22% Acc, 99.15% F1 (SemEval14-Laptop), 97.39% Acc, 96.53% F1 (SemEval14-Restaurant), 99.17% Acc, 98.78% F1 (SemEval15-Restaurant), 99.37% Acc, 98.79% F1 (SemEval16-Restaurant).",
+    "trainingConfig": {
+        "text": "Refer to documentation for details."
+    },
+    "trainingTime": "Sentic-GCN: ~10 mins for ~35 epochs (early stopped), Sentic-GCN Bert: ~1 hr for ~40 epochs (early stopped) for SemEval14-Laptop/SemEval14-Restaurant/SemEval15-Restaurant/SemEval16-Restaurant datasets.",
+    "modelWeights": {
+        "text": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin",
+        "url": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin"
+    },
+    "modelConfig": {
+        "text": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json",
+        "url": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json"
+    },
+    "modelInput": "Aspect (word), sentence containing the aspect",
+    "modelOutput": "Sentiment of aspect, -1 (negative), 0 (neutral), 1 (positive)",
+    "modelSize": "Sentic-GCN: ~8.7MB, Sentic-GCN Bert: ~7.1MB",
+    "inferenceInfo": "< 1 sec on Intel(R) i7 Quad-Core @ 1.7GHz.",
+    "usageScenarios": "Sentiment analysis of aspects in sentences",
+    "originalCode": {
+        "text": "https://github.com/BinLiang-NLP/Sentic-GCN",
+        "url": "https://github.com/BinLiang-NLP/Sentic-GCN"
+    },
+    "license": {
+        "text": "MIT License",
+        "url": "https://choosealicense.com/licenses/mit"
+    },
+    "contact": "sg-nlp@aisingapore.org"
+}
\ No newline at end of file
diff --git a/demo_api/sentic_gcn/requirements.txt b/demo_api/sentic_gcn/requirements.txt
new file mode 100644
index 0000000..e0c022b
--- /dev/null
+++ b/demo_api/sentic_gcn/requirements.txt
@@ -0,0 +1,6 @@
+torch==1.10.1
+spacy==3.2.1
+numpy==1.22.0
+flask
+gunicorn
+sgnlp
\ No newline at end of file
diff --git a/demo_api/sentic_gcn/requirements_dev.txt b/demo_api/sentic_gcn/requirements_dev.txt
new file mode 100644
index 0000000..0d08f9b
--- /dev/null
+++ b/demo_api/sentic_gcn/requirements_dev.txt
@@ -0,0 +1,6 @@
+-e .
+torch==1.10.1
+spacy==3.2.1
+numpy==1.22.0
+flask
+gunicorn
\ No newline at end of file
diff --git a/demo_api/sentic_gcn/usage.py b/demo_api/sentic_gcn/usage.py
new file mode 100644
index 0000000..8710d4a
--- /dev/null
+++ b/demo_api/sentic_gcn/usage.py
@@ -0,0 +1,43 @@
+from sgnlp.models.sentic_gcn import (
+    SenticGCNBertModel,
+    SenticGCNBertPreprocessor,
+    SenticGCNBertConfig,
+    SenticGCNBertPostprocessor,
+)
+
+preprocessor = SenticGCNBertPreprocessor(
+    senticnet="https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", device="cpu"
+)
+
+postprocessor = SenticGCNBertPostprocessor()
+
+# Load model
+config = SenticGCNBertConfig.from_pretrained(
+    "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json"
+)
+
+model = SenticGCNBertModel.from_pretrained(
+    "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", config=config
+)
+
+# Inputs
+inputs = [
+    {
+        "aspects": ["service", "decor"],
+        "sentence": "Everything is always cooked to perfection , the service is excellent, the decor cool and understated.",
+    },
+    {
+        "aspects": ["food", "portions"],
+        "sentence": "The food was lousy - too sweet or too salty and the portions tiny.",
+    },
+    {
+        "aspects": ["service"],
+        "sentence": "To sum it up : service varies from good to mediorce , depending on which waiter you get ; generally it is just average ok .",
+    },
+]
+
+processed_inputs, processed_indices = preprocessor(inputs)
+outputs = model(processed_indices)
+
+# Postprocessing
+post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=outputs)
diff --git a/docs/source/model/senticgcn.rst b/docs/source/model/senticgcn.rst
new file mode 100644
index 0000000..799a0e6
--- /dev/null
+++ b/docs/source/model/senticgcn.rst
@@ -0,0 +1,401 @@
+Sentic-GCN: Aspect-Based Sentiment Analysis via Affective Knowledge Enhanced Graph Convolutional Networks
+=========================================================================================================
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The Sentic-GCN model was proposed in `Aspect-Based Sentiment Analysis via Affective Knowledge Enhanced
+Graph Convolutional Networks <https://doi.org/10.1016/j.knosys.2021.107643>`_ by Liang, Bin and Su, Hang and
+Gui, Lin and Cambria, Erik and Xu, Ruifeng.
+
+The abstract from the paper is as follows:
+
+*Aspect-based sentiment analysis is a fine-grained sentiment analysis task, which needs to detect the
+sentiment polarity towards a given aspect. Recently, graph neural models over the dependency tree are
+widely applied for aspect-based sentiment analysis. Most existing works, however, generally focus
+on learning the dependency information from contextual words to aspect words based on the dependency tree
+of the sentence, which lacks the exploitation of contextual affective knowledge with regard to the
+specific aspect. In this paper, we propose a graph convolutional network based on SenticNet to leverage
+the affective dependencies of the sentence according to the specific aspect, called Sentic GCN. To be
+specific, we explore a novel solution to construct the graph neural networks via integrating the affective
+knowledge from SenticNet to enhance the dependency graphs of sentences. Based on it, both the
+dependencies of contextual words and aspect words and the affective information between opinion words and
+the aspect are considered by the novel affective enhanced graph model. Experimental results on multiple
+public benchmark datasets illustrate that our proposed model can beat state-of-the-art methods.*
+
+In keeping with how model performance is calculated in the paper, this implementation saves the best
+performing model weights for both the Sentic-GCN model and the Sentic-GCN Bert model.
+
+The default datasets presented in the paper are SemEval 2014 (Laptop, Restaurant), SemEval 2015
+(Restaurant) and SemEval 2016 (Restaurant). However, please note that the dataset format has been further
+processed from the original source; please see the dataset link below for the processed datasets.
+
+| Link to the `paper <https://doi.org/10.1016/j.knosys.2021.107643>`_
+| Link to the `dataset <https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/datasets>`_
+| Link to the original `github <https://github.com/BinLiang-NLP/Sentic-GCN>`_
+
+
+Getting started
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The Sentic-GCN model pretrained on the SemEval 2014/2015/2016 data can be loaded and accessed with the
+following code:
+
+.. code:: python
+
+    from sgnlp.models.sentic_gcn import(
+        SenticGCNConfig,
+        SenticGCNModel,
+        SenticGCNEmbeddingConfig,
+        SenticGCNEmbeddingModel,
+        SenticGCNTokenizer,
+        SenticGCNPreprocessor,
+        SenticGCNPostprocessor,
+        download_tokenizer_files,
+    )
+
+    download_tokenizer_files(
+        "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/",
+        "senticgcn_tokenizer")
+    tokenizer = SenticGCNTokenizer.from_pretrained("senticgcn_tokenizer")
+
+    config = SenticGCNConfig.from_pretrained(
+        "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/config.json"
+    )
+    model = SenticGCNModel.from_pretrained(
+        "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/pytorch_model.bin",
+        config=config
+    )
+
+    embed_config = SenticGCNEmbeddingConfig.from_pretrained(
+        "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/config.json"
+    )
+    embed_model = SenticGCNEmbeddingModel.from_pretrained(
+        "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/pytorch_model.bin",
+        config=embed_config
+    )
+
+    preprocessor = SenticGCNPreprocessor(tokenizer=tokenizer, embedding_model=embed_model)
+    postprocessor = SenticGCNPostprocessor()
+
+    inputs = [
+        {
+            "aspects": ["Soup"],
+            "sentence": "The soup is a little salty."
+ }, + { + "aspects": ["service"], + "sentence": """Everyone that sat in the back outside agreed that it was the worst service we + had ever received.""" + }, + { + "aspects": ["location", "food"], + "sentence": """it 's located in a strip mall near the beverly center , not the greatest + location , but the food keeps me coming back for more .""" + } + ] + + processed_inputs, processed_indices = preprocessor(inputs) + raw_outputs = model(processed_indices) + + post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=raw_outputs) + + print(post_outputs[0]) + # {'sentence': ['The', 'soup', 'is', 'a', 'little', 'salty.'], + # 'aspects': [1], + # 'labels': [-1]} + + print(post_outputs[1]) + # {'sentence': ['Everyone', 'that', 'sat', 'in', 'the', 'back', 'outside', 'agreed', 'that', 'it', + # 'was', 'the', 'worst', 'service', 'we', 'had', 'ever', 'received.'], + # 'aspects': [13], + # 'labels': [-1]} + + print(post_outputs[2]) + # {'sentence': ['it', "'s", 'located', 'in', 'a', 'strip', 'mall', 'near', 'the', 'beverly', + # 'center', ',', 'not', 'the', 'greatest', 'location', ',', 'but', 'the', 'food', + # 'keeps', 'me', 'coming', 'back', 'for', 'more', '.'], + # 'aspects': [15, 19], + # 'labels': [0, 1]} + + +The Sentic-GCN Bert model pretrained on the SemEval 2014/2015/2016 data can be loaded and accessed +with the following code: + +.. code:: python + + from sgnlp.models.sentic_gcn import( + SenticGCNBertConfig, + SenticGCNBertModel, + SenticGCNBertEmbeddingConfig, + SenticGCNBertEmbeddingModel, + SenticGCNBertTokenizer, + SenticGCNBertPreprocessor, + SenticGCNBertPostprocessor + ) + + tokenizer = SenticGCNBertTokenizer.from_pretrained("bert-base-uncased") + + config = SenticGCNBertConfig.from_pretrained( + "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/config.json" + ) + model = SenticGCNBertModel.from_pretrained( + "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/pytorch_model.bin", + config=config + ) + + embed_config = SenticGCNBertEmbeddingConfig.from_pretrained("bert-base-uncased") + embed_model = SenticGCNBertEmbeddingModel.from_pretrained("bert-base-uncased", + config=embed_config + ) + + preprocessor = SenticGCNBertPreprocessor(tokenizer=tokenizer, embedding_model=embed_model) + postprocessor = SenticGCNBertPostprocessor() + + inputs = [ + { + "aspects": ["Soup"], + "sentence": "The soup is a little salty." 
+        },
+        {
+            "aspects": ["service"],
+            "sentence": """Everyone that sat in the back outside agreed that it was the worst service we
+                had ever received."""
+        },
+        {
+            "aspects": ["location", "food"],
+            "sentence": """it 's located in a strip mall near the beverly center , not the greatest
+                location , but the food keeps me coming back for more ."""
+        }
+    ]
+
+    processed_inputs, processed_indices = preprocessor(inputs)
+    raw_outputs = model(processed_indices)
+
+    post_outputs = postprocessor(processed_inputs=processed_inputs, model_outputs=raw_outputs)
+
+    print(post_outputs[0])
+    # {'sentence': ['The', 'soup', 'is', 'a', 'little', 'salty.'],
+    #  'aspects': [1],
+    #  'labels': [-1]}
+
+    print(post_outputs[1])
+    # {'sentence': ['Everyone', 'that', 'sat', 'in', 'the', 'back', 'outside', 'agreed', 'that', 'it',
+    #               'was', 'the', 'worst', 'service', 'we', 'had', 'ever', 'received.'],
+    #  'aspects': [13],
+    #  'labels': [-1]}
+
+    print(post_outputs[2])
+    # {'sentence': ['it', "'s", 'located', 'in', 'a', 'strip', 'mall', 'near', 'the', 'beverly',
+    #               'center', ',', 'not', 'the', 'greatest', 'location', ',', 'but', 'the', 'food',
+    #               'keeps', 'me', 'coming', 'back', 'for', 'more', '.'],
+    #  'aspects': [15, 19],
+    #  'labels': [0, 1]}
+
+
+Input
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The input data needs to be a dictionary with the following keys:
+
++----------------------+-----------------------------------------------------------------------------------------------+
+| Key                  | Meaning                                                                                       |
++----------------------+-----------------------------------------------------------------------------------------------+
+| aspects              | A list of aspect(s) which must also be found in the sentence.                                 |
++----------------------+-----------------------------------------------------------------------------------------------+
+| sentence             | A sentence which also contains all the aspects.                                               |
++----------------------+-----------------------------------------------------------------------------------------------+
+
+The value for aspects must be a list and each aspect must also exist in the sentence. If an aspect has more than one
+occurrence in the sentence, each occurrence will be treated as a separate input instance.
+
+
+Output
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The output returned from :class:`~sgnlp.models.sentic_gcn.postprocess.SenticGCNPostprocessor` and
+:class:`~sgnlp.models.sentic_gcn.postprocess.SenticGCNBertPostprocessor` consists of a list of dictionaries,
+one for each processed input entry. Each entry consists of the following:
+
+1. sentence: The input sentence in tokenized form.
+2. aspects: A list of indices denoting each aspect's position in the tokenized input sentence.
+3. labels: A list of predictions for each aspect, in order: -1 denotes negative sentiment, 0 denotes neutral sentiment and 1 denotes positive sentiment.
+
+The logits can be accessed from the raw output returned by the model.
+
+
+Training
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Dataset Preparation
+-------------------
+Prepare the training and evaluation datasets in the same format as the datasets from the
+author's repo. Please refer to the sample dataset
+`here <https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/datasets>`__ for reference.
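+
+For reference, each sample in these raw dataset files typically spans three lines: the sentence with the
+aspect term replaced by a ``$T$`` placeholder, the aspect term itself, and the polarity label
+(-1 for negative, 0 for neutral, 1 for positive). The sample below is illustrative rather than taken
+verbatim from the dataset files:
+
+.. code::
+
+    $T$ is always fresh and the staff are friendly .
+    sushi
+    1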
+
+
+Config Preparation
+------------------
+
+Aspects of the training can be configured via the `sentic_gcn_config.json` and `sentic_gcn_bert_config.json`
+files. An example of the Sentic-GCN config file can be found
+`here `_
+and an example of the Sentic-GCN Bert config file can be found
+`here `_.
+
++------------------------------------------+--------------------------------------------------------------------------------------+
+| Configuration key                        | Description                                                                          |
++==========================================+======================================================================================+
+| senticnet_word_file_path                 | File path to the SenticNet 5.0 file.                                                 |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_preprocessed_senticnet              | Flag to indicate if the processed SenticNet dictionary should be pickled.            |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| saved_preprocessed_senticnet_file_path   | Pickle file path for saving processed SenticNet dictionary.                          |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| spacy_pipeline                           | Spacy pre-trained pipeline to load for preprocessing.                                |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| word_vec_file_path                       | File path to word vectors file for generating embeddings (e.g. GloVe vectors).       |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| dataset_train                            | List of training dataset file paths.                                                 |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| dataset_test                             | List of testing dataset file paths.                                                  |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| valset_ratio                             | Ratio for the train/validation split.                                                |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| model                                    | The model type to train. Either 'senticgcn' or 'senticgcnbert'.                      |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_best_model                          | Flag to indicate if the best model should be saved.                                  |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_model_path                          | Folder path to save the best performing model during training.                       |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| tokenizer                                | The tokenizer type to use for dataset preprocessing.                                 |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| train_tokenizer                          | Flag to indicate if tokenizer should be trained using train and test datasets.       |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_tokenizer                           | Flag to indicate if trained tokenizer should be saved.                               |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_tokenizer_path                      | Folder path to save trained tokenizer.                                               |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| embedding_model                          | Embedding model type to use for training.                                            |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| build_embedding_model                    | Flag to indicate if embedding model should be trained on input word vectors.         |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_embedding_model                     | Flag to indicate if trained embedding model should be saved.                         |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_embedding_model_path                | Folder path to save trained embedding model.                                         |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_results                             | Flag to indicate if training results should be saved.                                |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| save_results_folder                      | Folder path for saving training results.                                             |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| initializer                              | torch.nn.init initializer function for initializing model weights.                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| optimizer                                | torch.optim optimizer type for training.                                             |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| loss_function                            | Loss function to use for training.                                                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| learning_rate                            | Learning rate for training.                                                          |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| l2reg                                    | L2 regularization value to set for training.                                         |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| epochs                                   | Number of epochs to train.                                                           |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| batch_size                               | Batch size to set for dataloader.                                                    |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| log_step                                 | Print training results every log_step steps.                                         |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| embed_dim                                | Size of embedding dimension.                                                         |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| hidden_dim                               | Size of hidden layer for GCN.                                                        |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| polarities_dim                           | Size of output layer.                                                                |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| dropout                                  | Dropout ratio for dropout layer.                                                     |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| seed                                     | Random seed to set prior to training.                                                |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| device                                   | torch.device to set for training.                                                    |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| repeats                                  | Number of times to repeat the whole training cycle.                                  |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| patience                                 | Patience value for early stopping.                                                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| max_len                                  | Maximum length for input tensor.                                                     |
++------------------------------------------+--------------------------------------------------------------------------------------+
+
+
+Running Train Code
+------------------
+To start training the Sentic-GCN or Sentic-GCN Bert model, execute the following code:
+
+.. code:: python
+
+    from sgnlp.models.sentic_gcn.train import SenticGCNTrainer, SenticGCNBertTrainer
+    from sgnlp.models.sentic_gcn.utils import parse_args_and_load_config, set_random_seed
+
+    cfg = parse_args_and_load_config()
+    if cfg.seed is not None:
+        set_random_seed(cfg.seed)
+    trainer = SenticGCNTrainer(cfg) if cfg.model == "senticgcn" else SenticGCNBertTrainer(cfg)
+    trainer.train()
+
+
+Evaluating
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Dataset Preparation
+-------------------
+
+Refer to the training section above for a dataset example.
+
+
+Config Preparation
+------------------
+
+Aspects of the evaluation can be configured via the `sentic_gcn_config.json` and `sentic_gcn_bert_config.json`
+files. An example of the Sentic-GCN config file can be found
+`here `_
+and an example of the Sentic-GCN Bert config file can be found
+`here `_.
+
++------------------------------------------+--------------------------------------------------------------------------------------+
+| Configuration key                        | Description                                                                          |
++==========================================+======================================================================================+
+| eval_args/model                          | The model type to evaluate. Either 'senticgcn' or 'senticgcnbert'.                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| eval_args/model_path                     | Path to model folder; can be cloud storage, a local folder or the HuggingFace hub.   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| tokenizer                                | The tokenizer type to use for dataset preprocessing.                                 |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| embedding_model                          | The embedding model type to use for dataset preprocessing.                           |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| config_filename                          | Config file name to load from model folder and embedding model folder.              |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| model_filename                           | Model file name to load from model folder and embedding model folder.               |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| test_filename                            | File path to test dataset.                                                           |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| senticnet                                | File path to pickled processed senticnet.                                            |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| spacy_pipeline                           | Spacy pre-trained pipeline to load for dataset preprocessing.                        |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| result_folder                            | Folder to save evaluation results.                                                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| eval_batch_size                          | Batch size for evaluator dataloader.                                                 |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| seed                                     | Random seed to set for evaluation.                                                   |
++------------------------------------------+--------------------------------------------------------------------------------------+
+| device                                   | torch.device to set for tensors.                                                     |
++------------------------------------------+--------------------------------------------------------------------------------------+
+
+
+Running the Evaluation Code
+---------------------------
+To start evaluating the Sentic-GCN or Sentic-GCN Bert model, execute the following code:
+
+..
code:: python + + from sgnlp.models.sentic_gcn.eval import SenticGCNEvaluator, SenticGCNBertEvaluator + from sgnlp.models.sentic_gcn.utils import parse_args_and_load_config, set_random_seed + + cfg = parse_args_and_load_config() + if cfg.seed is not None: + set_random_seed(cfg.seed) + evaluator = SenticGCNEvaluator(cfg) if cfg.model == "senticgcn" else SenticGCNBertEvaluator(cfg) + evaluator.evaluate() diff --git a/docs/source/models.rst b/docs/source/models.rst index 77771a1..c78be12 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -7,3 +7,4 @@ Models model/ufd model/emotion_entailment model/span_extraction + model/senticgcn \ No newline at end of file diff --git a/jsonnet/demo-api.jsonnet b/jsonnet/demo-api.jsonnet index 7ab3059..90a1185 100644 --- a/jsonnet/demo-api.jsonnet +++ b/jsonnet/demo-api.jsonnet @@ -1,6 +1,10 @@ local build_and_push_staging(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "build_and_push_staging", + tags: [ + "on-prem", + "dind", + ], when: "manual", script: [ "echo 'Logging in to AISG Docker Registry...'", @@ -15,6 +19,10 @@ local build_and_push_staging(module_name, image_name) = { local build_and_push_docs_staging() = { image: "python:3.8.11-slim", stage: "build_and_push_staging", + tags: [ + "on-prem", + "dind", + ], when: "manual", script: [ "echo 'Building Sphinx docs'", @@ -42,6 +50,10 @@ local build_and_push_docs_staging() = { local retag_and_push_production(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "retag_and_push_production", + tags: [ + "on-prem", + "dind", + ], only: { refs: ["main"] }, @@ -62,6 +74,10 @@ local retag_and_push_production(module_name, image_name) = { local restart_kubernetes_staging(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_staging", + tags: [ + "on-prem", + "dind", + ], when: "manual", needs: ["%s_build_and_push_staging" % module_name], script: [ @@ -74,6 +90,10 @@ local restart_kubernetes_staging(module_name, deployment_name) = { local restart_kubernetes_production(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_production", + tags: [ + "on-prem", + "dind", + ], only: { refs: ["main"] }, @@ -126,6 +146,11 @@ local api_names = { module_name: "ufd", image_name: "ufd", deployment_name: "ufd" + }, + "sentic_gcn": { + module_name: "sentic_gcn", + image_name: "sentic-gcn", + deployment_name: "sentic-gcn" } }; diff --git a/jsonnet/dev-demo-api.jsonnet b/jsonnet/dev-demo-api.jsonnet index cf84943..3239732 100644 --- a/jsonnet/dev-demo-api.jsonnet +++ b/jsonnet/dev-demo-api.jsonnet @@ -1,6 +1,10 @@ local build_and_push_staging(module_name, image_name) = { image: "registry.aisingapore.net/sg-nlp/sg-nlp-runner:latest", stage: "build_and_push_staging", + tags: [ + "on-prem", + "dind", + ], when: "manual", script: [ "echo 'Logging in to AISG Docker Registry...'", @@ -16,6 +20,10 @@ local build_and_push_staging(module_name, image_name) = { local restart_kubernetes_staging(module_name, deployment_name) = { image: "registry.aisingapore.net/sea-core-nlp/seacorenlp-runner:latest", stage: "restart_kubernetes_staging", + tags: [ + "on-prem", + "dind", + ], when: "manual", needs: ["%s_build_and_push_staging" % module_name], script: [ @@ -42,6 +50,11 @@ local api_names = { image_name: "lif-3way-ap", deployment_name: "lif-3way-ap" }, + "sentic_gcn": { + 
module_name: "sentic_gcn",
+    image_name: "sentic-gcn",
+    deployment_name: "sentic-gcn"
+  },
   "ufd": {
     module_name: "ufd",
     image_name: "ufd",
diff --git a/polyaxon/sentic_gcn/conda.yml b/polyaxon/sentic_gcn/conda.yml
new file mode 100644
index 0000000..4df317c
--- /dev/null
+++ b/polyaxon/sentic_gcn/conda.yml
@@ -0,0 +1,17 @@
+name: polyaxon
+channels:
+  - defaults
+  - conda-forge
+dependencies:
+  - python=3.9.7
+  - pip
+  - pip:
+    - spacy
+    - numpy
+    - torch
+    - scikit-learn
+    - transformers
+    - sentencepiece
+    - tokenizers
+
+# Feel free to change the version of any package
\ No newline at end of file
diff --git a/polyaxon/sentic_gcn/experiment.df b/polyaxon/sentic_gcn/experiment.df
new file mode 100644
index 0000000..7d7ffeb
--- /dev/null
+++ b/polyaxon/sentic_gcn/experiment.df
@@ -0,0 +1,23 @@
+# change base image as required
+FROM registry.aisingapore.net/polyaxon/cuda10:latest
+
+ARG USER="polyaxon"
+ARG WORK_DIR="/home/$USER"
+
+RUN rm /bin/sh && ln -s /bin/bash /bin/sh && \
+    apt update && apt install -y jq ca-certificates
+
+WORKDIR $WORK_DIR
+USER $USER
+
+COPY build/conda.yml .
+RUN conda env update -f conda.yml -n base && \
+    rm conda.yml
+
+WORKDIR /code
+
+RUN python -m spacy download en_core_web_sm
+
+COPY --chown=$USER:$USER build .
+
+ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:$LD_LIBRARY_PATH
diff --git a/polyaxon/sentic_gcn/notebook.df b/polyaxon/sentic_gcn/notebook.df
new file mode 100644
index 0000000..111db91
--- /dev/null
+++ b/polyaxon/sentic_gcn/notebook.df
@@ -0,0 +1,29 @@
+FROM registry.aisingapore.net/aiap/polyaxon/pytorch-tf2-cpu:latest
+
+ARG WORK_DIR="/code"
+
+RUN pip install jupyterlab==0.33.12
+
+WORKDIR $WORK_DIR
+
+RUN mkdir -p $WORK_DIR && chown -R 2222:2222 $WORK_DIR
+
+ARG ORG_JUPYTER="/opt/conda/bin/jupyter"
+ARG MOD_JUPYTER="/opt/conda/bin/jupyter.real"
+
+RUN mv $ORG_JUPYTER $MOD_JUPYTER && \
+    echo "#!/bin/bash" > $ORG_JUPYTER && \
+    echo "/code/link_workspace.sh &" >> $ORG_JUPYTER && \
+    echo "export SHELL=/bin/bash" >> $ORG_JUPYTER && \
+    echo "$MOD_JUPYTER \"\$@\"" >> $ORG_JUPYTER && \
+    chmod +x $ORG_JUPYTER
+
+COPY build/conda.yml /code
+COPY build/scripts/link_workspace.sh /code
+
+RUN apt-get update && apt-get -y install vim jq
+
+RUN conda env update -n polyaxon --file conda.yml
+RUN rm /code/conda.yml
+
+ENV LANG "C.UTF-8"
diff --git a/polyaxon/sentic_gcn/notebook.yml b/polyaxon/sentic_gcn/notebook.yml
new file mode 100644
index 0000000..615690b
--- /dev/null
+++ b/polyaxon/sentic_gcn/notebook.yml
@@ -0,0 +1,15 @@
+---
+version: 1
+
+kind: notebook
+
+build:
+  dockerfile: polyaxon/docker/notebook.df
+  context: .
+
+environment:
+  persistence:
+    data: ["data"]
+
+logging:
+  level: DEBUG
\ No newline at end of file
diff --git a/polyaxon/sentic_gcn/sentic_gcn_bert_train.yml b/polyaxon/sentic_gcn/sentic_gcn_bert_train.yml
new file mode 100644
index 0000000..2c3c3e4
--- /dev/null
+++ b/polyaxon/sentic_gcn/sentic_gcn_bert_train.yml
@@ -0,0 +1,22 @@
+---
+version: 1
+
+kind: experiment
+
+build:
+  dockerfile: polyaxon/docker/experiment.df
+  context: .
+
+environment:
+  resources:
+    gpu:
+      requests: 1
+      limits: 1
+  persistence:
+    data: ["data"]
+
+logging:
+  level: DEBUG
+
+run:
+  cmd: python train.py --config config/senticnet_gcn_bert_config.json
diff --git a/polyaxon/sentic_gcn/sentic_gcn_train.yml b/polyaxon/sentic_gcn/sentic_gcn_train.yml
new file mode 100644
index 0000000..fcc0869
--- /dev/null
+++ b/polyaxon/sentic_gcn/sentic_gcn_train.yml
@@ -0,0 +1,22 @@
+---
+version: 1
+
+kind: experiment
+
+build:
+  dockerfile: polyaxon/docker/experiment.df
+  context: .
+
+environment:
+  resources:
+    gpu:
+      requests: 1
+      limits: 1
+  persistence:
+    data: ["data"]
+
+logging:
+  level: DEBUG
+
+run:
+  cmd: python train.py --config config/senticnet_gcn_config.json
diff --git a/sgnlp/models/sentic_gcn/__init__.py b/sgnlp/models/sentic_gcn/__init__.py
new file mode 100644
index 0000000..59b7569
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/__init__.py
@@ -0,0 +1,9 @@
+from .config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig
+from .data_class import SenticGCNTrainArgs
+from .eval import SenticGCNEvaluator, SenticGCNBertEvaluator
+from .modeling import SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel
+from .preprocess import SenticGCNPreprocessor, SenticGCNBertPreprocessor
+from .postprocess import SenticGCNPostprocessor, SenticGCNBertPostprocessor
+from .tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer
+from .train import SenticGCNTrainer, SenticGCNBertTrainer
+from .utils import BucketIterator, parse_args_and_load_config, download_tokenizer_files
diff --git a/sgnlp/models/sentic_gcn/config.py b/sgnlp/models/sentic_gcn/config.py
new file mode 100644
index 0000000..5d0a439
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/config.py
@@ -0,0 +1,106 @@
+from transformers import PretrainedConfig, BertConfig
+
+
+class SenticGCNConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a
+    :class:`~sgnlp.models.sentic_gcn.modeling.SenticGCNModel`.
+    It is used to instantiate a SenticGCNModel network according to the specified arguments, defining the model architecture.
+
+    Args:
+        embed_dim (:obj:`int`, defaults to 300): Embedding dimension size.
+        hidden_dim (:obj:`int`, defaults to 300): Size of hidden dimension.
+        dropout (:obj:`float`, defaults to 0.3): Dropout percentage.
+        polarities_dim (:obj:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral).
+        loss_function (:obj:`str`, defaults to 'cross_entropy'): Loss function for training/eval.
+
+    Example:
+
+        from sgnlp.models.sentic_gcn import SenticGCNConfig
+
+        # Initialize with default values
+        config = SenticGCNConfig()
+    """
+
+    def __init__(
+        self,
+        embed_dim: int = 300,
+        hidden_dim: int = 300,
+        polarities_dim: int = 3,
+        dropout: float = 0.3,
+        loss_function: str = "cross_entropy",
+        **kwargs
+    ) -> None:
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+        self.hidden_dim = hidden_dim
+        self.dropout = dropout
+        self.polarities_dim = polarities_dim
+        self.loss_function = loss_function
+
+
+class SenticGCNBertConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a :class:`~sgnlp.models.sentic_gcn.modeling.SenticGCNBertModel`.
+    It is used to instantiate a SenticGCNBertModel network according to the specified arguments, defining the model architecture.
+ + Args: + embed_dim (:obj:`int`, defaults to 300): The input dimension for the LSTM layer + hidden_dim (:obj:`int`, defaults to 768): The embedding dimension size for the Bert model as well as GCN dimension. + max_seq_len (:obj:`int`, defaults to 85): The max sequence length to pad and truncate. + dropout (:obj:`float`, defaults to 0.3): Dropout percentage. + polarities_dim (:obj:`int`, defaults to 3): Size of output dimension representing available polarities (e.g. Positive, Negative, Neutral). + loss_function (:obj:`str`, defaults to 'cross_entropy'): Loss function for training/eval. + Example: + + from sgnlp.models.sentic_gcn import SenticGCNBertConfig + + # Initialize with default values + config = SenticGCNBertConfig() + """ + + def __init__( + self, + embed_dim: int = 300, + hidden_dim: int = 768, + max_seq_len: int = 85, + polarities_dim: int = 3, + dropout: float = 0.3, + loss_function: str = "cross_entropy", + **kwargs + ) -> None: + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.hidden_dim = hidden_dim + self.max_seq_len = max_seq_len + self.dropout = dropout + self.polarities_dim = polarities_dim + self.loss_function = loss_function + + +class SenticGCNEmbeddingConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a :class:`~SenticGCNEmbeddingModel`. + It is used to instantiate a SenticGCN Embedding model according to the specified arguments, defining the model architecture. + + Args: + PretrainedConfig (:obj:`PretrainedConfig`): transformer :obj:`PretrainedConfig` base class + """ + + def __init__(self, vocab_size: int = 17662, embed_dim: int = 300, **kwargs) -> None: + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.embed_dim = embed_dim + + +class SenticGCNBertEmbeddingConfig(BertConfig): + """ + This is the configuration class to store the configuration of a :class:`~SenticGCNBertEmbeddingModel`. + It is used to instantiate a SenticGCN Bert Embedding model according to the specified arguments, defining the model architecture. 
+ + Args: + BertConfig (:obj:`BertConfig`): transformer :obj:`BertConfig` base class + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) diff --git a/sgnlp/models/sentic_gcn/config/sentic_gcn_bert_config.json b/sgnlp/models/sentic_gcn/config/sentic_gcn_bert_config.json new file mode 100644 index 0000000..4c15123 --- /dev/null +++ b/sgnlp/models/sentic_gcn/config/sentic_gcn_bert_config.json @@ -0,0 +1,56 @@ +{ + "senticnet_word_file_path": "./senticNet/senticnet_word.txt", + "save_preprocessed_senticnet": true, + "saved_preprocessed_senticnet_file_path": "senticnet/senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "word_vec_file_path": "./glove/glove.840B.300d.txt", + + "dataset_train": ["./datasets/semeval14/restaurant_train.raw"], + "dataset_test": ["./datasets/semeval14/restaurant_test.raw"], + "valset_ratio": 0, + + "model": "senticgcnbert", + "save_best_model": true, + "save_model_path": "senticgcnbert", + + "tokenizer": "bert-base-uncased", + + "embedding_model": "bert-base-uncased", + + "save_results": true, + "save_results_folder": "./results/", + + "initializer": "xavier_uniform_", + "optimizer": "adam", + "loss_function": "cross_entropy", + "learning_rate": 0.001, + "l2reg": 0.00001, + "epochs": 100, + "batch_size": 16, + "log_step": 5, + "embed_dim": 300, + "hidden_dim": 768, + "polarities_dim": 3, + "dropout": 0.3, + "seed": 776, + "device": "cuda", + "repeats": 10, + "patience": 5, + "max_len": 85, + + "eval_args": { + "model": "senticgcnbert", + "model_path": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/", + "tokenizer": "bert-base-uncased", + "embedding_model": "bert-base-uncased", + "config_filename": "config.json", + "model_filename": "pytorch_model.bin", + "test_filename": "./datasets/semeval14/restaurant_test.raw", + "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "result_folder": "./eval_result/", + "eval_batch_size": 16, + "seed": 776, + "device": "cpu" + } +} \ No newline at end of file diff --git a/sgnlp/models/sentic_gcn/config/sentic_gcn_config.json b/sgnlp/models/sentic_gcn/config/sentic_gcn_config.json new file mode 100644 index 0000000..e365be2 --- /dev/null +++ b/sgnlp/models/sentic_gcn/config/sentic_gcn_config.json @@ -0,0 +1,62 @@ +{ + "senticnet_word_file_path": "./senticnet-5.0/senticnet5.txt", + "save_preprocessed_senticnet": true, + "saved_preprocessed_senticnet_file_path": "./senticnet-5.0/senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "word_vec_file_path": "./glove/glove.840B.300d.txt", + + "dataset_train": ["./datasets/semeval14/restaurant_train.raw"], + "dataset_test": ["./datasets/semeval14/restaurant_test.raw"], + "valset_ratio": 0, + + "model": "senticgcn", + "save_best_model": true, + "save_model_path": "./models/senticgcn_semeval14_rest/", + + "tokenizer": "senticgcn", + "train_tokenizer": true, + "save_tokenizer": true, + "save_tokenizer_path": "./tokenizers/senticgcn_tok_semeval14_rest/", + + "embedding_model": "senticgcn_embed_model", + "build_embedding_model": true, + "save_embedding_model": true, + "save_embedding_model_path": "./embed_models/senticgcn_embed_semeval14_rest/", + + "save_results": true, + "save_results_folder": "./results/", + + "initializer": "xavier_uniform_", + "optimizer": "adam", + "loss_function": "cross_entropy", + "learning_rate": 0.001, + "l2reg": 0.00001, + "epochs": 100, + "batch_size": 16, + "log_step": 5, + "embed_dim": 300, + "hidden_dim": 300, + "polarities_dim": 
3,
+    "dropout": 0.3,
+    "seed": 776,
+    "device": "cuda",
+    "repeats": 10,
+    "patience": 5,
+    "max_len": 85,
+
+    "eval_args": {
+        "model": "senticgcn",
+        "model_path": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/",
+        "tokenizer": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/",
+        "embedding_model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/",
+        "config_filename": "config.json",
+        "model_filename": "pytorch_model.bin",
+        "test_filename": "./datasets/semeval14/restaurant_test.raw",
+        "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle",
+        "spacy_pipeline": "en_core_web_sm",
+        "result_folder": "./eval_result/",
+        "eval_batch_size": 16,
+        "seed": 776,
+        "device": "cpu"
+    }
+}
\ No newline at end of file
diff --git a/sgnlp/models/sentic_gcn/data_class.py b/sgnlp/models/sentic_gcn/data_class.py
new file mode 100644
index 0000000..dbdda62
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/data_class.py
@@ -0,0 +1,204 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict
+
+
+@dataclass
+class SenticGCNTrainArgs:
+    """
+    Data class for training config for both SenticGCNModel and SenticGCNBertModel
+    """
+
+    # External resources (e.g. Senticnet file, GloVe word vectors, etc)
+    senticnet_word_file_path: str = field(
+        default="./senticNet/senticnet_word.txt", metadata={"help": "SenticNet word file path."}
+    )
+    save_preprocessed_senticnet: bool = field(
+        default=True,
+        metadata={
+            "help": """Flag to indicate if senticnet dictionary should be saved during preprocess step.
+            If 'saved_preprocessed_senticnet_file_path' is populated and valid, it will be overwritten if flag is set to True."""
+        },
+    )
+    saved_preprocessed_senticnet_file_path: str = field(
+        default="senticnet/senticnet.pickle",
+        metadata={
+            "help": """File path to the saved preprocessed senticnet. If the file exists and the 'save_preprocessed_senticnet'
+            flag is set to False, SenticNet will be loaded from this file instead of generated from the raw senticnet file."""
+        },
+    )
+    spacy_pipeline: str = field(
+        default="en_core_web_sm", metadata={"help": "Type of spacy pipeline to load for processor."}
+    )
+    word_vec_file_path: str = field(
+        default="glove/glove.840B.300d.txt",
+        metadata={"help": "File path to word vector."},
+    )
+
+    # Dataset specific config
+    dataset_train: list = field(
+        default_factory=list,
+        metadata={"help": "List of file path to train dataset(s)."},
+    )
+    dataset_test: list = field(
+        default_factory=list,
+        metadata={"help": "List of file path to test dataset(s)."},
+    )
+    valset_ratio: float = field(
+        default=0.0,
+        metadata={
+            "help": """
+            Ratio of train dataset to be split for validation.
+            If value is set to 0, test dataset is set as validation dataset as well."""
+        },
+    )
+
+    # Model specific config
+    model: str = field(default="senticgcn", metadata={"help": "Option to choose which model to train."})
+    save_best_model: bool = field(
+        default=True,
+        metadata={
+            "help": """Flag to indicate if best model should be saved during training.
+            Applies to both bert and non-bert SenticGCN models."""
+        },
+    )
+    save_model_path: str = field(
+        default="senticgcn",
+        metadata={
+            "help": """Folder path to save trained model using the save_pretrained method.
+            Applies to both bert and non-bert SenticGCN models."""
+        },
+    )
+
+    # Tokenizer specific config
+    tokenizer: str = field(
+        default="senticgcn_tokenizer",
+        metadata={
+            "help": """Option to choose which tokenizer to use for training preprocessing.
+            Value will be used to create tokenizer via the from_pretrained method."""
+        },
+    )
+    train_tokenizer: bool = field(
+        default=False,
+        metadata={
+            "help": """Flag to indicate if tokenizer should be trained on train and test input dataset.
+            Only applies to non-bert SenticGCN tokenizer."""
+        },
+    )
+    save_tokenizer: bool = field(
+        default=False,
+        metadata={
+            "help": """Flag to indicate if tokenizer should be saved using the save_pretrained method.
+            Only applies to non-bert SenticGCN tokenizer."""
+        },
+    )
+    save_tokenizer_path: str = field(
+        default="senticgcn_tokenizer",
+        metadata={
+            "help": """Folder path to save pretrained tokenizer using the save_pretrained method.
+            Only applies to non-bert SenticGCN tokenizer."""
+        },
+    )
+
+    # Embedding specific config
+    embedding_model: str = field(
+        default="senticgcn_embed_model",
+        metadata={
+            "help": """Option to choose which embedding model to use for training preprocessing.
+            For non-bert model, value should point to a pretrained model folder.
+            'config.json' and 'pytorch_model.bin' will be used to create the config and embedding model
+            via the from_pretrained method.
+            Ignored if 'build_embedding_model' flag is set; only affects non-bert SenticGCN embedding model.
+            For bert model, value should be the model name used to download from the huggingface model hub."""
+        },
+    )
+    build_embedding_model: bool = field(
+        default=False,
+        metadata={
+            "help": """Flag to indicate if embedding model should be built from input word vectors.
+            Only applies to non-bert SenticGCN embedding models.
+            Word vectors to train on are indicated in 'word_vec_file_path' config."""
+        },
+    )
+    save_embedding_model: bool = field(
+        default=False,
+        metadata={
+            "help": """Flag to indicate if embedding model should be saved using the save_pretrained method.
+            Only applies to non-bert SenticGCN embedding model."""
+        },
+    )
+    save_embedding_model_path: str = field(
+        default="senticgcn_embed_model",
+        metadata={
+            "help": """Folder path to save pretrained embedding model using the save_pretrained method.
+            Only applies to non-bert SenticGCN embedding model."""
+        },
+    )
+
+    # Training results
+    save_results: bool = field(default=True, metadata={"help": "Flag to indicate if results should be saved."})
+    save_results_folder: str = field(default="results", metadata={"help": "Folder location to save results pickle."})
+
+    initializer: str = field(default="xavier_uniform_", metadata={"help": "Type of initializer to use."})
+    optimizer: str = field(default="adam", metadata={"help": "Type of optimizer to use."})
+    loss_function: str = field(default="cross_entropy", metadata={"help": "Loss function for training/eval."})
+    learning_rate: float = field(default=0.001, metadata={"help": "Default learning rate for training."})
+    l2reg: float = field(default=0.00001, metadata={"help": "Default l2reg value."})
+    epochs: int = field(default=100, metadata={"help": "Number of epochs to train."})
+    batch_size: int = field(default=16, metadata={"help": "Training batch size."})
+    log_step: int = field(default=5, metadata={"help": "Number of train steps to log results."})
+    embed_dim: int = field(default=300, metadata={"help": "Size of embedding."})
+    hidden_dim: int = field(default=300, metadata={"help": "Number of neurons for hidden layer."})
+    dropout: float = field(default=0.3, metadata={"help": "Default value for dropout percentages."})
+    polarities_dim: int = field(default=3, metadata={"help": "Default dimension for polarities."})
+    seed: int = field(default=776, metadata={"help": "Default random seed for training."})
+    device: str = field(default="cuda", metadata={"help": "Type of compute device to use for training."})
+    repeats: int = field(default=10, metadata={"help": "Number of times to repeat train loop."})
+    patience: int = field(
+        default=5, metadata={"help": "Number of train epochs without improvements prior to early stopping."}
+    )
+    max_len: int = field(default=85, metadata={"help": "Max length to pad for bert tokenizer."})
+    eval_args: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "model": "senticgcn",
+            "model_path": "",
+            "tokenizer": "senticgcn",
+            "embedding_model": "senticgcn",
+            "config_filename": "config.json",
+            "model_filename": "pytorch_model.bin",
+            "test_filename": "",
+            "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle",
+            "spacy_pipeline": "en_core_web_sm",
+            "result_folder": "./eval_result/",
+            "eval_batch_size": 16,
+            "seed": 776,
+            "device": "cpu",
+        }
+    )
+
+    def __post_init__(self):
+        # Model
+        assert self.model in ["senticgcn", "senticgcnbert"], "Invalid model type!"
+
+        assert self.initializer in [
+            "xavier_uniform_",
+            "xavier_normal_",
+            "orthogonal_",
+        ], "Invalid initializer type!"
+        assert self.optimizer in [
+            "adadelta",
+            "adagrad",
+            "adam",
+            "adamax",
+            "asgd",
+            "rmsprop",
+            "sgd",
+        ], "Invalid optimizer type!"
+        assert self.device in ["cuda", "cpu"], "Invalid device type."
+        assert self.repeats >= 1, "Repeats value must be at least 1."
+        assert self.patience >= 1, "Patience value must be at least 1."
+        assert 0 <= self.valset_ratio < 1, "Valset_ratio must be greater or equals to 0 and less than 1."
+        assert self.max_len > 0, "Max_len must be greater than 0."
+
+        # Assign sub-dataset column names
+        self.data_cols = ["text_indices", "aspect_indices", "left_indices", "text_embeddings", "sdat_graph"]
diff --git a/sgnlp/models/sentic_gcn/eval.py b/sgnlp/models/sentic_gcn/eval.py
new file mode 100644
index 0000000..fb14ec1
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/eval.py
@@ -0,0 +1,272 @@
+import datetime
+import logging
+import pathlib
+import shutil
+import tempfile
+import urllib
+from typing import List, Tuple, Union
+
+import torch
+from sklearn.metrics import f1_score
+from torch.utils.data import DataLoader
+
+from .data_class import SenticGCNTrainArgs
+from .config import SenticGCNBertConfig, SenticGCNConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig
+from .modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel, SenticGCNModel, SenticGCNBertModel
+from .tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer
+from .utils import (
+    SenticGCNDatasetGenerator,
+    BucketIterator,
+    parse_args_and_load_config,
+    download_tokenizer_files,
+    set_random_seed,
+)
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+class SenticGCNBaseEvaluator:
+    """
+    Base Evaluator class used for evaluating SenticGCNModel and SenticGCNBertModel
+    """
+
+    def __init__(self, config: SenticGCNTrainArgs) -> None:
+        self.config = config.eval_args
+        self.data_cols = config.data_cols
+        self.device = (
+            torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            if not self.config["device"]
+            else torch.device(self.config["device"])
+        )
+
+    def _create_tokenizer(
+        self, tokenizer_class: Union[SenticGCNTokenizer, SenticGCNBertTokenizer]
+    ) -> Union[SenticGCNTokenizer, SenticGCNBertTokenizer]:
+        """
+        Private method to construct tokenizer.
+        Tokenizer can be created via download from cloud storage, from local storage
+        or from HuggingFace repository.
+
+        Args:
+            tokenizer_class (Union[SenticGCNTokenizer, SenticGCNBertTokenizer]): tokenizer class type to create.
+
+        Returns:
+            Union[SenticGCNTokenizer, SenticGCNBertTokenizer]: return the tokenizer class instance.
+        """
+        if self.config["tokenizer"].startswith("https://") or self.config["tokenizer"].startswith("http://"):
+            with tempfile.TemporaryDirectory() as tmpdir:
+                temp_dir = pathlib.Path(tmpdir)
+                download_tokenizer_files(self.config["tokenizer"], temp_dir)
+                tokenizer_ = tokenizer_class.from_pretrained(temp_dir)
+                shutil.rmtree(temp_dir, ignore_errors=True)
+        else:
+            tokenizer_ = tokenizer_class.from_pretrained(self.config["tokenizer"])
+        return tokenizer_
+
+    def _create_model(
+        self,
+        model_name_path_or_folder: str,
+        config_class: Union[
+            SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig
+        ],
+        model_class: Union[SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel],
+    ) -> Union[SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]:
+        """
+        Private method to construct models and embedding models.
+        Models can be created by downloading from cloud storage via the from_pretrained method, from local storage,
+        or from the HuggingFace repository.
+
+        Args:
+            model_name_path_or_folder (str): cloud or local storage path to model files
+            config_class (Union[SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig]):
+                config class type
+            model_class (Union[SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]):
+                model class type
+
+        Returns:
+            Union[SenticGCNModel, SenticGCNBertModel, SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]:
+                return model instance.
+        """
+        if model_name_path_or_folder.startswith("https://") or model_name_path_or_folder.startswith("http://"):
+            config_url = urllib.parse.urljoin(model_name_path_or_folder, self.config["config_filename"])
+            model_url = urllib.parse.urljoin(model_name_path_or_folder, self.config["model_filename"])
+            config = config_class.from_pretrained(config_url)
+            model = model_class.from_pretrained(model_url, config=config)
+        else:
+            # Load from local folder
+            model_folder = pathlib.Path(model_name_path_or_folder)
+            if model_folder.is_dir():
+                config_path = model_folder.joinpath(self.config["config_filename"])
+                model_path = model_folder.joinpath(self.config["model_filename"])
+                config = config_class.from_pretrained(config_path)
+                model = model_class.from_pretrained(model_path, config=config)
+            else:
+                # Load from HuggingFace model repository
+                config = config_class.from_pretrained(model_name_path_or_folder)
+                model = model_class.from_pretrained(model_name_path_or_folder, config=config)
+        return model
+
+    def _evaluate_acc_f1(self, dataloader: Union[DataLoader, BucketIterator]) -> Tuple[float, float]:
+        """
+        Private helper method to evaluate accuracy and f1 score.
+
+        Args:
+            dataloader (DataLoader): input validation or test dataloader
+
+        Returns:
+            Tuple[float, float]: return acc and f1 score
+        """
+        self.model.eval()
+        n_correct, n_total = 0, 0
+        t_targets_all, t_outputs_all = None, None
+        with torch.no_grad():
+            for _, t_batch in enumerate(dataloader):
+                # Generate embeddings
+                t_batch["text_embeddings"] = self._generate_embeddings(t_batch)
+                # Prepare input data and targets
+                t_inputs = [t_batch[col].to(self.device) for col in self.data_cols]
+                t_targets = t_batch["polarity"].to(self.device)
+                # Inference
+                t_outputs = self.model(t_inputs)
+                # Accumulate correct prediction counts
+                n_correct += (torch.argmax(t_outputs.logits, -1) == t_targets).sum().item()
+                n_total += len(t_outputs.logits)
+
+                if t_targets_all is None:
+                    t_targets_all = t_targets
+                    t_outputs_all = t_outputs.logits
+                else:
+                    t_targets_all = torch.cat((t_targets_all, t_targets), dim=0)
+                    t_outputs_all = torch.cat((t_outputs_all, t_outputs.logits), dim=0)
+        test_acc = n_correct / n_total
+        f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro")
+        return test_acc, f1
+
+    def _save_results_to_file(self, acc_f1: List[str]) -> None:
+        """
+        Private method to save acc and f1 results to file.
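+
+        The output file is written into the configured 'result_folder' and named like
+        ``senticgcn_26-01-22_14-05-00_results.txt`` (illustrative timestamp).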
+
+        Args:
+            acc_f1 (List[str]): list containing acc and f1 results
+        """
+        results = [
+            f"Model: {self.config['model']}\n",
+            f"Batch Size: {self.config['eval_batch_size']}\n",
+            f"Random Seed: {self.config['seed']}\n",
+        ]
+        results = [*results, *acc_f1]
+        results_folder = pathlib.Path(self.config["result_folder"])
+        results_folder.mkdir(exist_ok=True)
+        results_file = results_folder.joinpath(
+            f"{self.config['model']}_{datetime.datetime.now().strftime('%d-%m-%y_%H-%M-%S')}_results.txt"
+        )
+        with open(results_file, "a") as f:
+            f.writelines(results)
+
+
+class SenticGCNEvaluator(SenticGCNBaseEvaluator):
+    """
+    Evaluator class for SenticGCNModel, derived from SenticGCNBaseEvaluator.
+
+    Args:
+        config (SenticGCNTrainArgs): Config for SenticGCNModel
+    """
+
+    def __init__(self, config: SenticGCNTrainArgs) -> None:
+        super().__init__(config)
+        self.tokenizer = self._create_tokenizer(SenticGCNTokenizer)
+        self.embedding_model = self._create_model(
+            config.eval_args["embedding_model"], SenticGCNEmbeddingConfig, SenticGCNEmbeddingModel
+        )
+        self.model = self._create_model(config.eval_args["model_path"], SenticGCNConfig, SenticGCNModel)
+        data_gen = SenticGCNDatasetGenerator(config, self.tokenizer, "test")
+        self.raw_data = data_gen.generate_test_datasets()
+        del data_gen
+
+    def _generate_embeddings(self, batch: List[torch.Tensor]) -> torch.Tensor:
+        """
+        Private helper method to generate embeddings.
+
+        Args:
+            batch (List[torch.Tensor]): a batch of test dataset
+
+        Returns:
+            torch.Tensor: return embedding tensor
+        """
+        text_indices = batch["text_indices"].to(self.device)
+        return self.embedding_model(text_indices)
+
+    def evaluate(self) -> None:
+        """
+        Main evaluate method.
+        """
+        # Generate dataloaders
+        test_dataloader = BucketIterator(self.raw_data, batch_size=self.config["eval_batch_size"], shuffle=False)
+        # Evaluate Acc and F1
+        acc, f1 = self._evaluate_acc_f1(test_dataloader)
+        logging.info(f"Evaluate Results -> Acc: {acc}, F1: {f1}")
+        # Save results
+        acc_f1 = [f"Acc: {acc}\n", f"F1: {f1}\n"]
+        self._save_results_to_file(acc_f1)
+
+        logging.info("Evaluation Complete!")
+
+
+class SenticGCNBertEvaluator(SenticGCNBaseEvaluator):
+    """
+    Evaluator class for SenticGCNBertModel, derived from SenticGCNBaseEvaluator.
+
+    Args:
+        config (SenticGCNTrainArgs): Config for SenticGCNBertModel
+    """
+
+    def __init__(self, config: SenticGCNTrainArgs) -> None:
+        super().__init__(config)
+        self.tokenizer = self._create_tokenizer(SenticGCNBertTokenizer)
+        self.embedding_model = self._create_model(
+            config.eval_args["embedding_model"], SenticGCNBertEmbeddingConfig, SenticGCNBertEmbeddingModel
+        )
+        self.model = self._create_model(config.eval_args["model_path"], SenticGCNBertConfig, SenticGCNBertModel)
+        data_gen = SenticGCNDatasetGenerator(config, self.tokenizer, "test")
+        self.raw_data = data_gen.generate_test_datasets()
+        del data_gen
+
+    def _generate_embeddings(self, batch: List[torch.Tensor]) -> torch.Tensor:
+        """
+        Private helper method to generate embeddings.
+
+        Args:
+            batch (List[torch.Tensor]): a batch of test dataset
+
+        Returns:
+            torch.Tensor: return embedding tensor
+        """
+        text_bert_indices = batch["text_bert_indices"].to(self.device)
+        bert_segment_indices = batch["bert_segment_indices"].to(self.device)
+
+        return self.embedding_model(text_bert_indices, token_type_ids=bert_segment_indices)["last_hidden_state"]
+
+    def evaluate(self) -> None:
+        """
+        Main evaluate method.
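+
+        A typical invocation mirrors this module's ``__main__`` block (sketch, assuming
+        a populated SenticGCNTrainArgs config)::
+
+            cfg = parse_args_and_load_config()
+            evaluator = SenticGCNBertEvaluator(cfg)
+            evaluator.evaluate()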
+        """
+        # Generate dataloaders
+        test_dataloader = DataLoader(self.raw_data, batch_size=self.config["eval_batch_size"], shuffle=False)
+        # Evaluate Acc and F1
+        acc, f1 = self._evaluate_acc_f1(test_dataloader)
+        logging.info(f"Evaluate Results -> Acc: {acc}, F1: {f1}")
+        # Save results
+        acc_f1 = [f"Acc: {acc}\n", f"F1: {f1}\n"]
+        self._save_results_to_file(acc_f1)
+
+        logging.info("Evaluation Complete!")
+
+
+if __name__ == "__main__":
+    cfg = parse_args_and_load_config()
+    if cfg.eval_args["seed"] is not None:
+        set_random_seed(cfg.eval_args["seed"])
+    evaluator = SenticGCNEvaluator(cfg) if cfg.eval_args["model"] == "senticgcn" else SenticGCNBertEvaluator(cfg)
+    logging.info(f"Evaluating {cfg.eval_args['model']}")
+    evaluator.evaluate()
diff --git a/sgnlp/models/sentic_gcn/modeling.py b/sgnlp/models/sentic_gcn/modeling.py
new file mode 100644
index 0000000..a4c098a
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/modeling.py
@@ -0,0 +1,354 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, BertModel
+from transformers.file_utils import ModelOutput
+
+from .modules.dynamic_rnn import DynamicLSTM
+from .modules.gcn import GraphConvolution
+from .config import (
+    SenticGCNConfig,
+    SenticGCNBertConfig,
+    SenticGCNEmbeddingConfig,
+    SenticGCNBertEmbeddingConfig,
+)
+from .utils import build_embedding_matrix
+
+
+@dataclass
+class SenticGCNModelOutput(ModelOutput):
+    """
+    Base class for outputs of SenticGCNModel.
+
+    Args:
+        loss (:obj:`torch.Tensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
+            classification loss, typically cross entropy.
+            Loss function used is dependent on what is specified in SenticGCNConfig.
+        logits (:obj:`torch.Tensor` of shape :obj:`(batch_size, num_classes)`):
+            raw logits for each class. num_classes = 3 by default.
+    """
+
+    loss: Optional[torch.Tensor] = None
+    logits: torch.Tensor = None
+
+
+class SenticGCNPreTrainedModel(PreTrainedModel):
+    """
+    The SenticGCN Pre-Trained Model used as base class for derived SenticGCN Model.
+
+    This model is the abstract super class for the SenticGCN Model which defines the config
+    class types and weights initialization method. This class should not be used or instantiated directly,
+    see SenticGCNModel class for usage.
+    """
+
+    config_class = SenticGCNConfig
+    base_model_prefix = "senticgcn"
+
+    def _init_weights(self, module: nn.Module) -> None:
+        pass
+
+
+class SenticGCNModel(SenticGCNPreTrainedModel):
+    """
+    The SenticGCN Model for aspect based sentiment analysis.
+
+    This class inherits from :obj:`SenticGCNPreTrainedModel` for weights initialization and utility functions
+    from the transformers :obj:`PreTrainedModel` class.
+
+    Args:
+        config (:obj:`~SenticGCNConfig`):
+            Model configuration class with all parameters required for the model.
+            Initializing with a config file does not load
+            the weights associated with the model, only the configuration.
+            Use the :obj:`.from_pretrained` method to load the model weights.
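+
+    A minimal inference sketch (the storage URL is illustrative, and the input tensors
+    are assumed to come from SenticGCNPreprocessor)::
+
+        config = SenticGCNConfig.from_pretrained(
+            "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/config.json"
+        )
+        model = SenticGCNModel.from_pretrained(
+            "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/pytorch_model.bin", config=config
+        )
+        outputs = model(processed_indices)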
+    """
+
+    def __init__(self, config: SenticGCNConfig) -> None:
+        super().__init__(config)
+        self.text_lstm = DynamicLSTM(
+            config.embed_dim,
+            config.hidden_dim,
+            num_layers=1,
+            batch_first=True,
+            bidirectional=True,
+        )
+        self.gc1 = GraphConvolution(2 * config.hidden_dim, 2 * config.hidden_dim)
+        self.gc2 = GraphConvolution(2 * config.hidden_dim, 2 * config.hidden_dim)
+        self.fc = nn.Linear(2 * config.hidden_dim, config.polarities_dim)
+        self.text_embed_dropout = nn.Dropout(config.dropout)
+        if config.loss_function == "cross_entropy":
+            self.loss_function = nn.CrossEntropyLoss()
+
+    def position_weight(
+        self, x: torch.Tensor, aspect_double_idx: torch.Tensor, text_len: torch.Tensor, aspect_len: torch.Tensor
+    ) -> torch.Tensor:
+        batch_size, seq_len = x.shape[0], x.shape[1]
+        aspect_double_idx = aspect_double_idx.cpu().numpy()
+        text_len = text_len.cpu().numpy()
+        aspect_len = aspect_len.cpu().numpy()
+        weight = [[] for _ in range(batch_size)]
+        for i in range(batch_size):
+            context_len = text_len[i] - aspect_len[i]
+            for j in range(aspect_double_idx[i, 0]):
+                weight[i].append(1 - (aspect_double_idx[i, 0] - j) / context_len)
+            for j in range(aspect_double_idx[i, 0], aspect_double_idx[i, 1] + 1):
+                weight[i].append(0)
+            for j in range(aspect_double_idx[i, 1] + 1, text_len[i]):
+                weight[i].append(1 - (j - aspect_double_idx[i, 1]) / context_len)
+            for j in range(text_len[i], seq_len):
+                weight[i].append(0)
+        weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2).to(x.device)
+        return weight * x
+
+    def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_len = x.shape[0], x.shape[1]
+        aspect_double_idx = aspect_double_idx.cpu().numpy()
+        mask = [[] for _ in range(batch_size)]
+        for i in range(batch_size):
+            for j in range(aspect_double_idx[i, 0]):
+                mask[i].append(0)
+            for j in range(aspect_double_idx[i, 0], aspect_double_idx[i, 1] + 1):
+                mask[i].append(1)
+            for j in range(aspect_double_idx[i, 1] + 1, seq_len):
+                mask[i].append(0)
+        mask = torch.tensor(mask, dtype=torch.float).unsqueeze(2).to(x.device)
+        return mask * x
+
+    def forward(self, inputs: List[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNModelOutput:
+        text_indices, aspect_indices, left_indices, text_embeddings, adj = inputs
+        text_len = torch.sum(text_indices != 0, dim=-1)
+        aspect_len = torch.sum(aspect_indices != 0, dim=-1)
+        left_len = torch.sum(left_indices != 0, dim=-1)
+        aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1)
+        text = self.text_embed_dropout(text_embeddings)
+        text_out, (_, _) = self.text_lstm(text, text_len)
+        x = F.relu(
+            self.gc1(
+                self.position_weight(text_out, aspect_double_idx, text_len, aspect_len),
+                adj,
+            )
+        )
+        x = F.relu(self.gc2(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj))
+        alpha_mat = torch.matmul(x, text_out.transpose(1, 2))
+        alpha = F.softmax(alpha_mat.sum(1, keepdim=True), dim=2)
+        x = torch.matmul(alpha, text_out).squeeze(1)  # batch_size x 2 * hidden_dim
+        logits = self.fc(x)
+
+        loss = self.loss_function(logits, labels) if labels is not None else None
+        return SenticGCNModelOutput(loss=loss, logits=logits)
+
+
+@dataclass
+class SenticGCNBertModelOutput(ModelOutput):
+    """
+    Base class for outputs of SenticGCNBertModel.
+
+    Args:
+        loss (:obj:`torch.Tensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
+            classification loss, typically cross entropy.
+            Loss function used is dependent on what is specified in SenticGCNBertConfig.
+        logits (:obj:`torch.Tensor` of shape :obj:`(batch_size, num_classes)`):
+            raw logits for each class. num_classes = 3 by default.
+    """
+
+    loss: Optional[torch.Tensor] = None
+    logits: torch.Tensor = None
+
+
+class SenticGCNBertPreTrainedModel(PreTrainedModel):
+    """
+    The SenticGCNBert Pre-Trained Model used as base class for derived SenticGCNBert Model.
+
+    This model is the abstract super class for the SenticGCNBert Model which defines the config
+    class types and weights initialization method. This class should not be used or instantiated directly,
+    see SenticGCNBertModel class for usage.
+    """
+
+    config_class = SenticGCNBertConfig
+    base_model_prefix = "senticgcnbert"
+
+    def _init_weights(self, module: nn.Module) -> None:
+        pass
+
+
+class SenticGCNBertModel(SenticGCNBertPreTrainedModel):
+    """
+    The SenticGCNBert Model for aspect based sentiment analysis.
+
+    This class inherits from :obj:`SenticGCNBertPreTrainedModel` for weights initialization and utility functions
+    from the transformers :obj:`PreTrainedModel` class.
+
+    Args:
+        config (:obj:`~SenticGCNBertConfig`):
+            Model configuration class with all parameters required for the model.
+            Initializing with a config file does not load
+            the weights associated with the model, only the configuration.
+            Use the :obj:`.from_pretrained` method to load the model weights.
+    """
+
+    def __init__(self, config: SenticGCNBertConfig) -> None:
+        super().__init__(config)
+        self.gc1 = GraphConvolution(config.hidden_dim, config.hidden_dim)
+        self.gc2 = GraphConvolution(config.hidden_dim, config.hidden_dim)
+        self.gc3 = GraphConvolution(config.hidden_dim, config.hidden_dim)
+        self.fc = nn.Linear(config.hidden_dim, config.polarities_dim)
+        self.text_embed_dropout = nn.Dropout(config.dropout)
+        self.max_seq_len = config.max_seq_len
+        # Instantiate the loss from its config name (mirrors SenticGCNModel)
+        if config.loss_function == "cross_entropy":
+            self.loss_function = nn.CrossEntropyLoss()
+
+    def position_weight(
+        self, x: torch.Tensor, aspect_double_idx: torch.Tensor, text_len: torch.Tensor, aspect_len: torch.Tensor
+    ) -> torch.Tensor:
+        batch_size, seq_len = x.shape[0], x.shape[1]
+        aspect_double_idx = aspect_double_idx.cpu().numpy()
+        text_len = text_len.cpu().numpy()
+        aspect_len = aspect_len.cpu().numpy()
+        weight = [[] for _ in range(batch_size)]
+        for i in range(batch_size):
+            context_len = text_len[i] - aspect_len[i]
+            for j in range(aspect_double_idx[i, 0]):
+                weight[i].append(1 - (aspect_double_idx[i, 0] - j) / context_len)
+            for j in range(aspect_double_idx[i, 0], min(aspect_double_idx[i, 1] + 1, self.max_seq_len)):
+                weight[i].append(0)
+            for j in range(aspect_double_idx[i, 1] + 1, text_len[i]):
+                weight[i].append(1 - (j - aspect_double_idx[i, 1]) / context_len)
+            for j in range(text_len[i], seq_len):
+                weight[i].append(0)
+        weight = torch.tensor(weight, dtype=torch.float).unsqueeze(2).to(x.device)
+        return weight * x
+
+    def mask(self, x: torch.Tensor, aspect_double_idx: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_len = x.shape[0], x.shape[1]
+        aspect_double_idx = aspect_double_idx.cpu().numpy()
+        mask = [[] for _ in range(batch_size)]
+        for i in range(batch_size):
+            for j in range(aspect_double_idx[i, 0]):
+                mask[i].append(0)
+            for j in range(aspect_double_idx[i, 0], min(aspect_double_idx[i, 1] + 1, self.max_seq_len)):
+                mask[i].append(1)
+            for j in range(min(aspect_double_idx[i, 1] + 1, self.max_seq_len), seq_len):
+                mask[i].append(0)
+        mask = torch.tensor(mask).unsqueeze(2).float().to(x.device)
+        return mask * x
+
+    def forward(self, inputs: List[torch.Tensor], labels: Optional[torch.Tensor] = None) -> SenticGCNBertModelOutput:
+        text_indices, aspect_indices, left_indices, text_embeddings, adj = inputs
+        text_len = torch.sum(text_indices != 0, dim=-1)
+        aspect_len = torch.sum(aspect_indices != 0, dim=-1)
+        left_len = torch.sum(left_indices != 0, dim=-1)
+        aspect_double_idx = torch.cat([left_len.unsqueeze(1), (left_len + aspect_len - 1).unsqueeze(1)], dim=1)
+
+        text_out = text_embeddings
+        x = F.relu(self.gc1(self.position_weight(text_out, aspect_double_idx, text_len, aspect_len), adj))
+        x = F.relu(self.gc2(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj))
+        x = F.relu(self.gc3(self.position_weight(x, aspect_double_idx, text_len, aspect_len), adj))
+        x = self.mask(x, aspect_double_idx)
+        alpha_mat = torch.matmul(x, text_out.transpose(1, 2))
+        alpha = F.softmax(alpha_mat.sum(1, keepdim=True), dim=2)
+        x = torch.matmul(alpha, text_out).squeeze(1)  # batch_size x hidden_dim
+        logits = self.fc(x)
+
+        loss = self.loss_function(logits, labels) if labels is not None else None
+        return SenticGCNBertModelOutput(loss=loss, logits=logits)
+
+
+class SenticGCNEmbeddingPreTrainedModel(PreTrainedModel):
+    """
+    The SenticGCN Embedding Pre-Trained Model used as base class for derived SenticGCN Embedding Model.
+
+    This model is the abstract super class for the SenticGCN Embedding Model which defines the config
+    class types and weights initialization method. This class should not be used or instantiated directly,
+    see SenticGCNEmbeddingModel class for usage.
+    """
+
+    config_class = SenticGCNEmbeddingConfig
+    base_model_prefix = "senticgcnembedding"
+
+    def _init_weights(self, module: nn.Module) -> None:
+        pass
+
+
+class SenticGCNEmbeddingModel(SenticGCNEmbeddingPreTrainedModel):
+    """
+    The SenticGCN Embedding Model used to generate embeddings for model inputs.
+    By default, the embeddings are generated from the glove.840B.300d embeddings.
+
+    This class inherits from :obj:`SenticGCNEmbeddingPreTrainedModel` for weights initialization and utility functions
+    from the transformers :obj:`PreTrainedModel` class.
+
+    This class can also be constructed via the SenticGCNEmbeddingModel.build_embedding_model class method.
+
+    Args:
+        config (:obj:`~SenticGCNEmbeddingConfig`):
+            Model configuration class with all parameters required for the model.
+            Initializing with a config file does not load
+            the weights associated with the model, only the configuration.
+            Use the :obj:`.from_pretrained` method to load the model weights.
+    """
+
+    def __init__(self, config: SenticGCNEmbeddingConfig) -> None:
+        super().__init__(config)
+        self.vocab_size = config.vocab_size
+        self.embed = nn.Embedding(config.vocab_size, config.embed_dim)
+
+    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
+        """
+        Encode input token ids using word embedding.
+
+        Args:
+            token_ids (torch.Tensor): Tensor of token ids with shape [batch_size, num_words]
+
+        Returns:
+            torch.Tensor: return Tensor of embeddings with shape (batch_size, num_words, embed_dim)
+        """
+        return self.embed(token_ids)
+
+    @classmethod
+    def build_embedding_model(
+        cls,
+        word_vec_file_path: str,
+        vocab: Dict[str, int],
+        embed_dim: int = 300,
+    ):
+        """
+        This class method is a helper method to construct the embedding model from a file
+        containing word vectors (e.g. GloVe) and a vocab dictionary.
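+
+        A small sketch (the GloVe file path and vocab are illustrative)::
+
+            vocab = {"<pad>": 0, "<unk>": 1, "soup": 2}
+            embed_model = SenticGCNEmbeddingModel.build_embedding_model(
+                "glove.840B.300d.txt", vocab, embed_dim=300
+            )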
+
+        Args:
+            word_vec_file_path (str): file path to the word vectors
+            vocab (Dict[str, int]): vocab dictionary consisting of words as key and index as values
+            embed_dim (int, optional): the embedding dimension. Defaults to 300.
+
+        Returns:
+            SenticGCNEmbeddingModel: return an instance of SenticGCNEmbeddingModel
+        """
+        embedding_matrix = build_embedding_matrix(
+            word_vec_file_path=word_vec_file_path, vocab=vocab, embed_dim=embed_dim
+        )
+        embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float)
+        sentic_embed_config = SenticGCNEmbeddingConfig(vocab_size=len(vocab), embed_dim=embed_dim)
+        senticgcn_embed = cls(sentic_embed_config)
+        senticgcn_embed.embed.weight.data.copy_(embedding_tensor)
+        return senticgcn_embed
+
+
+class SenticGCNBertEmbeddingModel(BertModel):
+    """
+    The SenticGCN Bert Embedding Model used to generate embeddings for model inputs.
+
+    This class inherits from :obj:`BertModel` for weights initialization and utility functions
+    from the transformers :obj:`PreTrainedModel` class.
+
+    Args:
+        config (:obj:`~SenticGCNBertEmbeddingConfig`):
+            Model configuration class with all parameters required for the model.
+            Initializing with a config file does not load
+            the weights associated with the model, only the configuration.
+            Use the :obj:`.from_pretrained` method to load the model weights.
+    """
+
+    def __init__(self, config: SenticGCNBertEmbeddingConfig) -> None:
+        super().__init__(config)
diff --git a/sgnlp/models/sentic_gcn/modules/__init__.py b/sgnlp/models/sentic_gcn/modules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py b/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py
new file mode 100644
index 0000000..3a08ea3
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/modules/dynamic_rnn.py
@@ -0,0 +1,88 @@
+import torch
+import torch.nn as nn
+
+
+class DynamicLSTM(nn.Module):
+    """
+    A dynamic LSTM class which can hold variable length sequences
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        num_layers: int = 1,
+        bias: bool = True,
+        batch_first: bool = True,
+        dropout: float = 0,
+        bidirectional: bool = False,
+        only_use_last_hidden_state: bool = False,
+        rnn_type: str = "LSTM",
+    ) -> None:
+        super(DynamicLSTM, self).__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.bias = bias
+        self.batch_first = batch_first
+        self.dropout = dropout
+        self.bidirectional = bidirectional
+        self.only_use_last_hidden_state = only_use_last_hidden_state
+        self.rnn_type = rnn_type
+        self.__init_rnn()
+
+    def __init_rnn(self) -> None:
+        """
+        Helper method to initialize the RNN type
+        """
+        input_args = {
+            "input_size": self.input_size,
+            "hidden_size": self.hidden_size,
+            "num_layers": self.num_layers,
+            "bias": self.bias,
+            "batch_first": self.batch_first,
+            "dropout": self.dropout,
+            "bidirectional": self.bidirectional,
+        }
+        if self.rnn_type == "LSTM":
+            self.rnn = nn.LSTM(**input_args)
+        elif self.rnn_type == "GRU":
+            self.rnn = nn.GRU(**input_args)
+        elif self.rnn_type == "RNN":
+            self.rnn = nn.RNN(**input_args)
+
+    def forward(self, x: torch.Tensor, x_len: torch.Tensor, h0: torch.Tensor = None):
+        # Sort
+        x_sort_idx = torch.argsort(-x_len)
+        x_unsort_idx = torch.argsort(x_sort_idx).long()
+        x_len = x_len[x_sort_idx]
+        x = x[x_sort_idx.long()]
+
+        # Pack
+        x_emb_p = torch.nn.utils.rnn.pack_padded_sequence(x, x_len.cpu(), batch_first=self.batch_first)
+
+        if self.rnn_type == "LSTM":
+            out_pack, (ht, ct) = self.rnn(x_emb_p, None) if h0 is None else self.rnn(x_emb_p, (h0, h0))
+        else:
+            out_pack, ht = self.rnn(x_emb_p, None) if h0 is None else self.rnn(x_emb_p, h0)
+            ct = None
+
+        # Unsort
+        # (num_layers * num_directions, batch, hidden_size) -> (batch, ...)
+        ht = torch.transpose(ht, 0, 1)[x_unsort_idx]
+        ht = torch.transpose(ht, 0, 1)
+
+        if self.only_use_last_hidden_state:
+            return ht
+        else:
+            # Unpack: out
+            out = torch.nn.utils.rnn.pad_packed_sequence(out_pack, batch_first=self.batch_first)  # (sequence, lengths)
+            out = out[0]
+            out = out[x_unsort_idx]
+
+            # Unsort: out c
+            if self.rnn_type == "LSTM":
+                # (num_layers * num_directions, batch, hidden_size) -> (batch, ...)
+                ct = torch.transpose(ct, 0, 1)[x_unsort_idx]
+                ct = torch.transpose(ct, 0, 1)
+            return out, (ht, ct)
diff --git a/sgnlp/models/sentic_gcn/modules/gcn.py b/sgnlp/models/sentic_gcn/modules/gcn.py
new file mode 100644
index 0000000..618156e
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/modules/gcn.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn
+
+
+class GraphConvolution(nn.Module):
+    """
+    Simple GCN Layer, similar to https://arxiv.org/abs/1609.02907
+    """
+
+    def __init__(self, in_features: int, out_features: int, bias=True) -> None:
+        super(GraphConvolution, self).__init__()
+        # Parameters are left uninitialized here; weights are either loaded from a
+        # pretrained checkpoint or reset externally (see the trainer's _reset_params).
+        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
+        if bias:
+            self.bias = nn.Parameter(torch.FloatTensor(out_features))
+        else:
+            self.register_parameter("bias", None)
+
+    def forward(self, text: torch.Tensor, adj: torch.Tensor):
+        text = text.to(torch.float32)
+        hidden = torch.matmul(text, self.weight)
+        # Normalize by node degree (+1 to avoid division by zero)
+        denom = torch.sum(adj, dim=2, keepdim=True) + 1
+        output = torch.matmul(adj, hidden) / denom
+        return (output + self.bias) if self.bias is not None else output
diff --git a/sgnlp/models/sentic_gcn/postprocess.py b/sgnlp/models/sentic_gcn/postprocess.py
new file mode 100644
index 0000000..4a35a8b
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/postprocess.py
@@ -0,0 +1,81 @@
+from typing import Dict, List, Union
+
+import torch.nn.functional as F
+
+from .preprocess import SenticGCNData, SenticGCNBertData
+from .modeling import SenticGCNModelOutput, SenticGCNBertModelOutput
+
+
+class SenticGCNBasePostprocessor:
+    """
+    Base postprocessor class providing common post processing functions.
+    """
+
+    def __init__(self, return_full_text: bool = False, return_aspects_text: bool = False) -> None:
+        self.return_full_text = return_full_text
+        self.return_aspects_text = return_aspects_text
+
+    def __call__(
+        self,
+        processed_inputs: List[Union[SenticGCNData, SenticGCNBertData]],
+        model_outputs: Union[SenticGCNModelOutput, SenticGCNBertModelOutput],
+    ) -> List[Dict[str, Union[List[str], List[int], float]]]:
+        # Get predictions, shifting argmax indices {0, 1, 2} to polarity labels {-1, 0, 1}
+        probabilities = F.softmax(model_outputs.logits, dim=-1).detach().numpy()
+        predictions = [probabilities.argmax(axis=-1)[idx] - 1 for idx in range(len(probabilities))]
+        # Process output
+        outputs = []
+        for processed_input, prediction in zip(processed_inputs, predictions):
+            exists = False
+            # Check to see if the full_text_tokens already exists.
+            # If found, append the aspect_token_index, prediction and optionally aspect texts.
+            for idx, proc_output in enumerate(outputs):
+                if proc_output["sentence"] == processed_input.full_text_tokens:
+                    exists = True
+                    outputs[idx]["aspects"].append(processed_input.aspect_token_index)
+                    outputs[idx]["labels"].append(int(prediction))
+                    if self.return_aspects_text:
+                        outputs[idx]["aspects_text"].append(processed_input.aspect)
+                    break
+            if exists:
+                continue
+            processed_dict = {}
+            processed_dict["sentence"] = processed_input.full_text_tokens
+            processed_dict["aspects"] = [processed_input.aspect_token_index]
+            processed_dict["labels"] = [int(prediction)]
+            if self.return_full_text:
+                processed_dict["full_text"] = processed_input.full_text
+            if self.return_aspects_text:
+                processed_dict["aspects_text"] = [processed_input.aspect]
+            outputs.append(processed_dict)
+        return outputs
+
+
+class SenticGCNPostprocessor(SenticGCNBasePostprocessor):
+    """
+    Postprocessor for SenticGCNModel. Postprocesses SenticGCNModel output to get a list of
+    input text tokens, aspect token indices and prediction labels.
+
+    Args:
+        return_full_text (bool): Flag to indicate if the full text should be included in the output.
+        return_aspects_text (bool): Flag to indicate if the list of aspects text should be included in the output.
+    """
+
+    def __init__(self, return_full_text: bool = False, return_aspects_text: bool = False) -> None:
+        super().__init__(return_full_text=return_full_text, return_aspects_text=return_aspects_text)
+
+
+class SenticGCNBertPostprocessor(SenticGCNBasePostprocessor):
+    """
+    Postprocessor for SenticGCNBertModel. Postprocesses SenticGCNBertModel output to get a list of
+    input text tokens, aspect token indices and prediction labels.
+
+    Args:
+        return_full_text (bool): Flag to indicate if the full text should be included in the output.
+        return_aspects_text (bool): Flag to indicate if the list of aspects text should be included in the output.
+    """
+
+    def __init__(self, return_full_text: bool = False, return_aspects_text: bool = False) -> None:
+        super().__init__(return_full_text=return_full_text, return_aspects_text=return_aspects_text)
diff --git a/sgnlp/models/sentic_gcn/preprocess.py b/sgnlp/models/sentic_gcn/preprocess.py
new file mode 100644
index 0000000..678bddc
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/preprocess.py
@@ -0,0 +1,522 @@
+import logging
+import pathlib
+import shutil
+import string
+import tempfile
+import urllib.parse
+from collections import namedtuple
+from typing import Dict, List, Tuple, Union
+
+import numpy as np
+import spacy
+import torch
+from transformers import PreTrainedTokenizer, PretrainedConfig, PreTrainedModel
+
+from .config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig
+from .modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel
+from .tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer
+from .utils import (
+    load_and_process_senticnet,
+    download_tokenizer_files,
+    download_url_file,
+    pad_and_truncate,
+    generate_dependency_adj_matrix,
+)
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+SenticGCNData = namedtuple(
+    "SenticGCNData", ["full_text", "aspect", "left_text", "full_text_tokens", "aspect_token_index"]
+)
+SenticGCNBertData = namedtuple(
+    "SenticGCNBertData",
+    ["full_text", "aspect", "left_text", "full_text_with_bert_tokens", "full_text_tokens", "aspect_token_index"],
+)
+
+
+class SenticGCNBasePreprocessor:
+    """
+    Base preprocessor class providing initialization for spacy, senticnet, tokenizer and embedding model.
+    Class is only meant to be inherited by derived preprocessors.
+    """
+
+    def __init__(
+        self,
+        tokenizer: Union[str, PreTrainedTokenizer],
+        embedding_model: Union[str, PreTrainedModel],
+        tokenizer_class: PreTrainedTokenizer,
+        embedding_config_class: PretrainedConfig,
+        embedding_model_class: PreTrainedModel,
+        config_filename: str = "config.json",
+        model_filename: str = "pytorch_model.bin",
+        spacy_pipeline: str = "en_core_web_sm",
+        senticnet: Union[
+            str, Dict[str, float]
+        ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle",
+        device: str = "cpu",
+    ) -> None:
+        # Set device
+        self.device = (
+            torch.device("cuda" if torch.cuda.is_available() else "cpu") if not device else torch.device(device)
+        )
+        self.spacy_pipeline = spacy.load(spacy_pipeline)
+
+        try:
+            # Load senticnet
+            if isinstance(senticnet, dict):
+                senticnet_ = senticnet
+            elif senticnet.startswith("https://") or senticnet.startswith("http://"):
+                with tempfile.TemporaryDirectory() as tmpdir:
+                    temp_dir = pathlib.Path(tmpdir)
+                    download_url_file(senticnet, temp_dir)
+                    saved_path = temp_dir.joinpath("senticnet.pickle")
+                    senticnet_ = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=saved_path)
+                shutil.rmtree(temp_dir, ignore_errors=True)
+            elif senticnet.endswith(".pkl") or senticnet.endswith(".pickle"):
+                senticnet_ = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=senticnet)
+            elif senticnet.endswith(".txt"):
+                senticnet_ = load_and_process_senticnet(senticnet_file_path=senticnet)
+            else:
+                raise ValueError(
+                    """
+                    Error initializing SenticNet!
+                    For downloading from cloud storage, please provide the URL to the pickle file location
+                    (i.e. a string URL starting with https:// or http://).
+                    For a processed SenticNet dictionary, please provide the pickle file location
+                    (i.e. a file with a .pkl or .pickle extension).
+                    For the raw SenticNet-5.0 file, please provide the text file path (i.e. a file with a .txt extension).
+                    For an externally created SenticNet dictionary, please provide a dictionary with words as keys
+                    and sentic scores as values.
+                    """
+                )
+            self.senticnet = senticnet_
+        except Exception as e:
+            logging.error(e)
+            raise Exception(
+                """
+                Error initializing SenticNet! Please ensure that the input is either a dictionary, a str path to
+                a saved pickle file, a URL to cloud storage or a str path to the raw SenticNet file.
+                """
+            )
+
+        try:
+            # Init Tokenizer
+            if isinstance(tokenizer, PreTrainedTokenizer):
+                # Load from external instance
+                tokenizer_ = tokenizer
+            else:
+                if tokenizer.startswith("https://") or tokenizer.startswith("http://"):
+                    # Load from cloud
+                    # Download tokenizer files to temp dir
+                    with tempfile.TemporaryDirectory() as tmpdir:
+                        temp_dir = pathlib.Path(tmpdir)
+                        download_tokenizer_files(tokenizer, temp_dir)
+                        tokenizer_ = tokenizer_class.from_pretrained(temp_dir)
+                    shutil.rmtree(temp_dir, ignore_errors=True)
+                else:
+                    # Load from local directory or from HuggingFace model repository
+                    tokenizer_ = tokenizer_class.from_pretrained(tokenizer)
+            self.tokenizer = tokenizer_
+        except Exception as e:
+            logging.error(e)
+            raise Exception(
+                """
+                Error initializing tokenizer! Please ensure that the input tokenizer is either a PreTrainedTokenizer
+                instance, a URL to a cloud storage folder, a local folder or a HuggingFace model name.
+                """
+            )
+
+        try:
+            # Init Embedding model
+            if isinstance(embedding_model, PreTrainedModel):
+                # Load from external instance
+                embed_model = embedding_model
+            else:
+                if embedding_model.startswith("https://") or embedding_model.startswith("http://"):
+                    # Load from cloud
+                    config_url = urllib.parse.urljoin(embedding_model, config_filename)
+                    model_url = urllib.parse.urljoin(embedding_model, model_filename)
+                    embed_config = embedding_config_class.from_pretrained(config_url)
+                    embed_model = embedding_model_class.from_pretrained(model_url, config=embed_config)
+                else:
+                    # Load from local folder
+                    embed_model_name = pathlib.Path(embedding_model)
+                    if embed_model_name.is_dir():
+                        config_path = embed_model_name.joinpath(config_filename)
+                        model_path = embed_model_name.joinpath(model_filename)
+                        embed_config = embedding_config_class.from_pretrained(config_path)
+                        embed_model = embedding_model_class.from_pretrained(model_path, config=embed_config)
+                    else:
+                        # Load from HuggingFace model repository
+                        embed_config = embedding_config_class.from_pretrained(embedding_model)
+                        embed_model = embedding_model_class.from_pretrained(embedding_model, config=embed_config)
+            self.embedding_model = embed_model
+            self.embedding_model.to(self.device)
+        except Exception as e:
+            logging.error(e)
+            raise Exception(
+                """
+                Error initializing embedding model! Please ensure that the input embedding model is either a
+                PreTrainedModel instance, a URL to a cloud storage folder, a local folder or a HuggingFace model name.
+                """
+            )
+
+
+class SenticGCNPreprocessor(SenticGCNBasePreprocessor):
+    """
+    Class for preprocessing sentence(s) and their aspect(s) into a batch of tensors for the SenticGCNModel
+    to predict on.
+    """
+
+    def __init__(
+        self,
+        tokenizer: Union[
+            str, PreTrainedTokenizer
+        ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/",
+        embedding_model: Union[
+            str, PreTrainedModel
+        ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/",
+        config_filename: str = "config.json",
+        model_filename: str = "pytorch_model.bin",
+        spacy_pipeline: str = "en_core_web_sm",
+        senticnet: Union[
+            str, Dict[str, float]
+        ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle",
+        device: str = "cpu",
+    ) -> None:
+        super().__init__(
+            tokenizer=tokenizer,
+            embedding_model=embedding_model,
+            tokenizer_class=SenticGCNTokenizer,
+            embedding_config_class=SenticGCNEmbeddingConfig,
+            embedding_model_class=SenticGCNEmbeddingModel,
+            config_filename=config_filename,
+            model_filename=model_filename,
+            spacy_pipeline=spacy_pipeline,
+            senticnet=senticnet,
+            device=device,
+        )
+
+    def _process_indices(self, data_batch: List[SenticGCNData]) -> List[torch.Tensor]:
+        """
+        Private helper method to generate all indices and embeddings from list of input data
+        required for model input.
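+        All index sequences are right-padded with zeros to the length of the longest
+        tokenized sentence in the batch, and each dependency graph is zero-padded to the
+        matching square shape before everything is stacked into batch tensors.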
+
+        Args:
+            data_batch (List[SenticGCNData]): list of processed inputs as SenticGCNData
+
+        Returns:
+            List[torch.Tensor]: return a list of tensors for model input
+        """
+        all_text_indices = []
+        all_aspect_indices = []
+        all_left_indices = []
+        all_sdat_graph = []
+        all_data = []
+        max_len = 0
+        for data in data_batch:
+            text_indices = self.tokenizer(
+                data.full_text,
+                return_tensors=None,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            aspect_indices = self.tokenizer(
+                data.aspect,
+                return_tensors=None,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            left_indices = self.tokenizer(
+                data.left_text,
+                return_tensors=None,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            graph = generate_dependency_adj_matrix(data.full_text, data.aspect, self.senticnet, self.spacy_pipeline)
+            all_data.append(
+                {
+                    "text_indices": text_indices["input_ids"],
+                    "aspect_indices": aspect_indices["input_ids"],
+                    "left_indices": left_indices["input_ids"],
+                    "sdat_graph": graph,
+                }
+            )
+            if max_len < len(text_indices["input_ids"]):
+                max_len = len(text_indices["input_ids"])
+
+        for item in all_data:
+            text_indices, aspect_indices, left_indices, sdat_graph = (
+                item["text_indices"],
+                item["aspect_indices"],
+                item["left_indices"],
+                item["sdat_graph"],
+            )
+
+            text_padding = [0] * (max_len - len(text_indices))
+            aspect_padding = [0] * (max_len - len(aspect_indices))
+            left_padding = [0] * (max_len - len(left_indices))
+
+            sdat_graph = np.pad(
+                sdat_graph,
+                ((0, max_len - len(text_indices)), (0, max_len - len(text_indices))),
+                "constant",
+            )
+
+            all_text_indices.append(text_indices + text_padding)
+            all_aspect_indices.append(aspect_indices + aspect_padding)
+            all_left_indices.append(left_indices + left_padding)
+            all_sdat_graph.append(sdat_graph)
+
+        all_text_indices = torch.tensor(all_text_indices).to(self.device)
+        text_embeddings = self.embedding_model(all_text_indices)
+
+        return [
+            all_text_indices,
+            torch.tensor(all_aspect_indices).to(self.device),
+            torch.tensor(all_left_indices).to(self.device),
+            text_embeddings,
+            torch.tensor(all_sdat_graph).to(self.device),
+        ]
+
+    def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[SenticGCNData]:
+        """
+        Private helper method to process the input data batch.
+        Input entries are repeated for each input aspect.
+        If an input aspect occurs multiple times in the sentence, each occurrence is processed as an entry.
+
+        Args:
+            data_batch (List[Dict[str, Union[str, List[str]]]]): list of dictionaries with 2 keys, 'sentence' and 'aspects'.
+                The 'sentence' value is a string and the 'aspects' value is a list of aspect strings.
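+
+        Example of an illustrative input batch::
+
+            [{"sentence": "The soup is tasty but the service was bad.", "aspects": ["soup", "service"]}]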
+
+        Returns:
+            List[SenticGCNData]: return list of processed inputs as SenticGCNData
+        """
+        processed_inputs = []
+        for batch in data_batch:
+            full_text = batch["sentence"].lower().strip()
+            full_text_tokens = batch["sentence"].split()
+            for aspect in batch["aspects"]:
+                aspect = aspect.lower().strip()
+                aspect_token_indexes = [
+                    idx
+                    for idx, val in enumerate(full_text_tokens)
+                    if val.lower().translate(str.maketrans("", "", string.punctuation)) == aspect
+                ]
+                aspect_idxs = [index for index in range(len(full_text)) if full_text.startswith(aspect, index)]
+                for aspect_index, aspect_token_index in zip(aspect_idxs, aspect_token_indexes):
+                    left_text = full_text[:aspect_index].strip()
+                    processed_inputs.append(
+                        SenticGCNData(
+                            full_text=full_text,
+                            aspect=aspect,
+                            left_text=left_text,
+                            full_text_tokens=full_text_tokens,
+                            aspect_token_index=aspect_token_index,
+                        )
+                    )
+        return processed_inputs
+
+    def __call__(
+        self, data_batch: List[Dict[str, Union[str, List[str]]]]
+    ) -> Tuple[List[SenticGCNData], List[torch.Tensor]]:
+        """
+        Method to generate a list of input tensors from a list of sentences and their accompanying list of aspects.
+
+        Args:
+            data_batch (List[Dict[str, Union[str, List[str]]]]): list of dictionaries with 2 keys, 'sentence' and 'aspects'.
+                The 'sentence' value is a string and the 'aspects' value is a list of aspect strings.
+
+        Returns:
+            Tuple[List[SenticGCNData], List[torch.Tensor]]: return a list of ordered tensors for 'text_indices',
+                'aspect_indices', 'left_indices', 'text_embeddings' and 'sdat_graph'.
+        """
+        processed_inputs = self._process_inputs(data_batch)
+        return processed_inputs, self._process_indices(processed_inputs)
+
+
+class SenticGCNBertPreprocessor(SenticGCNBasePreprocessor):
+    """
+    Class for preprocessing sentence(s) and their aspect(s) into a batch of tensors for the SenticGCNBertModel
+    to predict on.
+    """
+
+    def __init__(
+        self,
+        tokenizer: Union[str, PreTrainedTokenizer] = "bert-base-uncased",
+        embedding_model: Union[str, PreTrainedModel] = "bert-base-uncased",
+        config_filename: str = "config.json",
+        model_filename: str = "pytorch_model.bin",
+        spacy_pipeline: str = "en_core_web_sm",
+        senticnet: Union[
+            str, Dict[str, float]
+        ] = "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle",
+        max_len: int = 85,
+        device: str = "cpu",
+    ) -> None:
+        super().__init__(
+            tokenizer=tokenizer,
+            embedding_model=embedding_model,
+            tokenizer_class=SenticGCNBertTokenizer,
+            embedding_config_class=SenticGCNBertEmbeddingConfig,
+            embedding_model_class=SenticGCNBertEmbeddingModel,
+            config_filename=config_filename,
+            model_filename=model_filename,
+            spacy_pipeline=spacy_pipeline,
+            senticnet=senticnet,
+            device=device,
+        )
+        self.max_len = max_len
+
+    def _process_indices(self, data_batch: List[SenticGCNBertData]) -> List[torch.Tensor]:
+        """
+        Private helper method to generate all indices and embeddings from list of input data
+        required for model input.
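+        All sequences are padded/truncated to ``max_len``, and the BERT input is the
+        concatenation built by ``_process_inputs``, e.g. (illustrative)
+        ``"[CLS] the soup is tasty [SEP] soup [SEP]"``.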
+
+        Args:
+            data_batch (List[SenticGCNBertData]): list of processed inputs as SenticGCNBertData
+
+        Returns:
+            List[torch.Tensor]: return a list of tensors for model input
+        """
+        all_text_indices = []
+        all_aspect_indices = []
+        all_left_indices = []
+        all_text_bert_indices = []
+        all_bert_segment_indices = []
+        all_sdat_graph = []
+        for data in data_batch:
+            text_indices = self.tokenizer(
+                data.full_text,
+                max_length=self.max_len,
+                padding="max_length",
+                truncation=True,
+                add_special_tokens=False,
+                return_tensors=None,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            aspect_indices = self.tokenizer(
+                data.aspect,
+                max_length=self.max_len,
+                padding="max_length",
+                truncation=True,
+                add_special_tokens=False,
+                return_tensors=None,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            left_indices = self.tokenizer(
+                data.left_text,
+                max_length=self.max_len,
+                padding="max_length",
+                truncation=True,
+                add_special_tokens=False,
+                return_tensors=None,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            text_bert_indices = self.tokenizer(
+                data.full_text_with_bert_tokens,
+                max_length=self.max_len,
+                padding="max_length",
+                truncation=True,
+                add_special_tokens=False,
+                return_tensors=None,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            # input_ids are plain Python lists here, so cast to arrays before comparing against 0
+            text_len = np.sum(np.array(text_indices["input_ids"]) != 0)
+            aspect_len = np.sum(np.array(aspect_indices["input_ids"]) != 0)
+            concat_segment_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1)
+            concat_segment_indices = pad_and_truncate(concat_segment_indices, self.max_len)
+
+            graph = generate_dependency_adj_matrix(data.full_text, data.aspect, self.senticnet, self.spacy_pipeline)
+            sdat_graph = np.pad(
+                graph,
+                (
+                    (0, self.max_len - graph.shape[0]),
+                    (0, self.max_len - graph.shape[0]),
+                ),
+                "constant",
+            )
+
+            all_text_indices.append(text_indices["input_ids"])
+            all_aspect_indices.append(aspect_indices["input_ids"])
+            all_left_indices.append(left_indices["input_ids"])
+            all_text_bert_indices.append(text_bert_indices["input_ids"])
+            all_bert_segment_indices.append(concat_segment_indices)
+            all_sdat_graph.append(sdat_graph)
+
+        all_text_bert_indices = torch.tensor(all_text_bert_indices).to(self.device)
+        all_bert_segment_indices = torch.tensor(np.array(all_bert_segment_indices)).to(self.device)
+        text_embeddings = self.embedding_model(all_text_bert_indices, token_type_ids=all_bert_segment_indices)[
+            "last_hidden_state"
+        ]
+
+        return [
+            torch.tensor(all_text_indices).to(self.device),
+            torch.tensor(all_aspect_indices).to(self.device),
+            torch.tensor(all_left_indices).to(self.device),
+            text_embeddings,
+            torch.tensor(all_sdat_graph).to(self.device),
+        ]
+
+    def _process_inputs(self, data_batch: List[Dict[str, Union[str, List[str]]]]) -> List[SenticGCNBertData]:
+        """
+        Private helper method to process the input data batch.
+        Input entries are repeated for each input aspect.
+        If an input aspect occurs multiple times in the sentence, each occurrence is processed as an entry.
+
+        Args:
+            data_batch (List[Dict[str, Union[str, List[str]]]]): list of dictionaries with 2 keys, 'sentence' and 'aspects'.
+                The 'sentence' value is a string and the 'aspects' value is a list of aspect strings.
+
+        Returns:
+            List[SenticGCNBertData]: return list of processed inputs as SenticGCNBertData
+        """
+        processed_inputs = []
+        for batch in data_batch:
+            full_text = batch["sentence"].lower().strip()
+            full_text_tokens = batch["sentence"].split()
+            for aspect in batch["aspects"]:
+                aspect = aspect.lower().strip()
+                aspect_token_indexes = [
+                    idx
+                    for idx, val in enumerate(full_text_tokens)
+                    if val.lower().translate(str.maketrans("", "", string.punctuation)) == aspect
+                ]
+                aspect_idxs = [index for index in range(len(full_text)) if full_text.startswith(aspect, index)]
+                for aspect_index, aspect_token_index in zip(aspect_idxs, aspect_token_indexes):
+                    left_text = full_text[:aspect_index].strip()
+                    full_text_with_bert_tokens = f"[CLS] {full_text} [SEP] {aspect} [SEP]"
+                    processed_inputs.append(
+                        SenticGCNBertData(
+                            full_text=full_text,
+                            aspect=aspect,
+                            left_text=left_text,
+                            full_text_with_bert_tokens=full_text_with_bert_tokens,
+                            full_text_tokens=full_text_tokens,
+                            aspect_token_index=aspect_token_index,
+                        )
+                    )
+        return processed_inputs
+
+    def __call__(
+        self, data_batch: List[Dict[str, Union[str, List[str]]]]
+    ) -> Tuple[List[SenticGCNBertData], List[torch.Tensor]]:
+        """
+        Method to generate a list of input tensors from a list of sentences and their accompanying list of aspects.
+
+        Args:
+            data_batch (List[Dict[str, Union[str, List[str]]]]): list of dictionaries with 2 keys, 'sentence' and 'aspects'.
+                The 'sentence' value is a string and the 'aspects' value is a list of aspect strings.
+
+        Returns:
+            Tuple[List[SenticGCNBertData], List[torch.Tensor]]: return a list of ordered tensors for 'text_indices',
+                'aspect_indices', 'left_indices', 'text_embeddings' and 'sdat_graph'.
+        """
+        processed_inputs = self._process_inputs(data_batch)
+        return processed_inputs, self._process_indices(processed_inputs)
diff --git a/sgnlp/models/sentic_gcn/tokenization.py b/sgnlp/models/sentic_gcn/tokenization.py
new file mode 100644
index 0000000..c69bf66
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/tokenization.py
@@ -0,0 +1,164 @@
+import pathlib
+import pickle
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+
+from transformers import PreTrainedTokenizer, BertTokenizer
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.pkl"}
+
+
+class SenticGCNTokenizer(PreTrainedTokenizer):
+    """
+    The SenticGCN tokenizer class used to generate tokens for the embedding model.
+
+    Args:
+        text (:obj:`str`):
+            input text string to tokenize
+
+    Example::
+        tokenizer = SenticGCNTokenizer.from_pretrained("senticgcn")
+        inputs = tokenizer('Hello World!')
+        inputs['input_ids']
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file: str = None,
+        train_files: List[str] = None,
+        train_vocab: bool = False,
+        do_lower_case: bool = True,
+        unk_token: str = "<unk>",
+        pad_token: str = "<pad>",
+        **kwargs,
+    ):
+        super().__init__(
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+        self.do_lower_case = do_lower_case
+        if train_vocab:
+            self.vocab = self.create_vocab(train_files)
+        else:
+            with open(vocab_file, "rb") as fin:
+                self.vocab = pickle.load(fin)
+        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def get_vocab(self):
+        return dict(self.vocab)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index: int) -> str:
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    @staticmethod
+    def __read_text_file(file_names: List[str]) -> str:
+        """
+        Helper method to read contents of a list of text files.
+
+        Args:
+            file_names (List[str]): list of text files to read.
+
+        Returns:
+            str: return a concatenated string of text files contents.
+        """
+        text = ""
+        for fname in file_names:
+            with open(fname, "r", encoding="utf-8", newline="\n", errors="ignore") as fin:
+                lines = fin.readlines()
+                for i in range(0, len(lines), 3):
+                    text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
+                    aspect = lines[i + 1].lower().strip()
+                    text += f"{text_left} {aspect} {text_right} "  # Leave a space at the end
+        return text
+
+    def create_vocab(self, train_files: List[str]) -> Dict[str, int]:
+        text = SenticGCNTokenizer.__read_text_file(train_files)
+        if self.do_lower_case:
+            text = text.lower()
+        vocab = {}
+        vocab[self.pad_token] = 0
+        vocab[self.unk_token] = 1
+        offset = len(vocab.keys())
+
+        words = text.split()
+        for word in words:
+            if word not in vocab:
+                vocab[word] = offset
+                offset += 1
+        return vocab
+
+    def _tokenize(self, text, **kwargs):
+        if self.do_lower_case:
+            text = text.lower()
+        words = text.split()
+        return words
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        save_dir = pathlib.Path(save_directory)
+        save_dir.mkdir(exist_ok=True)
+        vocab_file_path = save_dir.joinpath("vocab.pkl")
+        with open(vocab_file_path, "wb") as fout:
+            pickle.dump(self.vocab, fout)
+        return (str(vocab_file_path),)
+
+
+class SenticGCNBertTokenizer(BertTokenizer):
+    """
+    The SenticGCNBert Tokenizer class used to generate tokens for the embedding model, derived from the BertTokenizer class.
+
+    Args:
+        text (:obj:`str`):
+            input text string to tokenize
+
+    Example::
+        tokenizer = SenticGCNBertTokenizer.from_pretrained('bert-base-uncased')
+        inputs = tokenizer('Hello World!')
+        inputs['input_ids']
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def __call__(
+        self,
+        text,
+        max_length: int = 85,
+        add_special_tokens: bool = False,
+        padding: Union[bool, str] = True,
+        truncation: bool = True,
+        return_token_type_ids: bool = False,
+        return_attention_mask: bool = False,
+        return_tensors: str = None,
+        **kwargs,
+    ):
+        encoding = super().__call__(
+            text,
+            max_length=max_length,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+        # Workaround for padding empty input text
+        for key in encoding.keys():
+            if len(encoding[key]) == 0 and padding == "max_length":
+                encoding[key] = [0] * max_length
+            if return_tensors == "pt":
+                encoding[key] = torch.tensor(encoding[key])
+        return encoding
diff --git a/sgnlp/models/sentic_gcn/train.py b/sgnlp/models/sentic_gcn/train.py
new file mode 100644
index 0000000..5704f41
--- /dev/null
+++ b/sgnlp/models/sentic_gcn/train.py
@@ -0,0 +1,594 @@
+import datetime
+import logging
+import math
+import pathlib
+import pickle
+import shutil
+import tempfile
+import urllib
+from typing import Dict, List, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from sklearn.metrics import f1_score
+from torch.utils.data.dataloader import DataLoader
+
+from .config import SenticGCNConfig, SenticGCNBertConfig, SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig
+from .data_class import SenticGCNTrainArgs
+from .modeling import (
+    SenticGCNBertPreTrainedModel,
+    SenticGCNModel,
+    SenticGCNBertModel,
+    SenticGCNEmbeddingModel,
+    SenticGCNBertEmbeddingModel,
+)
+from .tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer
+from .utils import parse_args_and_load_config, set_random_seed, SenticGCNDatasetGenerator, BucketIterator
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+class SenticGCNBaseTrainer:
+    """
+    Base Trainer class used for training SenticGCNModel and SenticGCNBertModel
+    """
+
+    def __init__(self, config: SenticGCNTrainArgs):
+        self.config = config
+        self.global_max_acc = 0.0
+        self.global_max_f1 = 0.0
+        self.global_best_model_tmpdir = None
+        self.device = (
+            torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            if not self.config.device
+            else torch.device(self.config.device)
+        )
+        self.initializer = self._create_initializers()
+        # Persistent scratch directory for per-repeat model checkpoints;
+        # the per-repeat subfolders are removed in _clean_temp_dir after training.
+        self.temp_dir = pathlib.Path(tempfile.mkdtemp())
+
+    def _create_initializers(self):
+        """
+        Private helper method to instantiate initializer.
+        """
+        initializers = {
+            "xavier_uniform_": nn.init.xavier_uniform_,
+            "xavier_normal_": nn.init.xavier_normal_,
+            "orthogonal_": nn.init.orthogonal_,
+        }
+        return initializers[self.config.initializer]
+
+    def _create_optimizer(self, params, lr, weight_decay):
+        """
+        Private helper method to instantiate optimizer.
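+
+        For example (illustrative values)::
+
+            optimizer = self._create_optimizer(params, lr=0.001, weight_decay=0.00001)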
+        """
+        optimizers = {
+            "adadelta": optim.Adadelta,
+            "adagrad": optim.Adagrad,
+            "adam": optim.Adam,
+            "adamax": optim.Adamax,
+            "asgd": optim.ASGD,
+            "rmsprop": optim.RMSprop,
+            "sgd": optim.SGD,
+        }
+        return optimizers[self.config.optimizer](params, lr=lr, weight_decay=weight_decay)
+
+    def _reset_params(self) -> None:
+        raise NotImplementedError("Please call from derived class only.")
+
+    def _generate_data_loaders(
+        self,
+    ) -> Union[Tuple[DataLoader, DataLoader, DataLoader], Tuple[BucketIterator, BucketIterator, BucketIterator]]:
+        raise NotImplementedError("Please call from derived class only.")
+
+    def _create_tokenizer(self) -> Union[SenticGCNTokenizer, SenticGCNBertTokenizer]:
+        raise NotImplementedError("Please call from derived class only.")
+
+    def _create_embedding_model(self) -> Union[SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel]:
+        raise NotImplementedError("Please call from derived class only.")
+
+    def _generate_embeddings(self, batch: List[torch.Tensor]) -> torch.Tensor:
+        raise NotImplementedError("Please call from derived class only.")
+
+    def _save_model(self) -> None:
+        """
+        Private helper method to save the pretrained model.
+        """
+        if self.config.save_best_model:
+            self.model.save_pretrained(self.config.save_model_path)
+
+    def _save_results(self, repeat_results: Dict[str, Dict]) -> None:
+        """
+        Private helper method to save the results dictionary at the end of the training.
+
+        Args:
+            repeat_results (Dict[str, Dict]): dictionary containing the training results
+        """
+        if self.config.save_results:
+            save_root_folder = pathlib.Path(self.config.save_results_folder)
+            save_root_folder.mkdir(exist_ok=True)
+            save_result_file = save_root_folder.joinpath(
+                f"{self.config.model}_{datetime.datetime.now().strftime('%d-%m-%y_%H-%M-%S')}_results.pkl"
+            )
+            with open(save_result_file, "wb") as f:
+                pickle.dump(repeat_results, f)
+
+    def _clean_temp_dir(self, result_records: Dict[str, Dict[str, float]]) -> None:
+        """
+        Helper method to clean up temp dir and model weights from repeat train loops.
+
+        Args:
+            result_records (Dict[str, Dict[str, float]]): dictionary of result_records after training.
+        """
+        for key, val in result_records.items():
+            if key == "test":
+                continue
+            shutil.rmtree(val["tmp_dir"], ignore_errors=True)
+
+    def _evaluate_acc_f1(self, dataloader: DataLoader) -> Tuple[float, float]:
+        """
+        Private helper method to evaluate accuracy and f1 score.
+
+        Args:
+            dataloader (DataLoader): input validation or test dataloader
+
+        Returns:
+            Tuple[float, float]: return acc and f1 score
+        """
+        self.model.eval()
+        n_correct, n_total = 0, 0
+        t_targets_all, t_outputs_all = None, None
+        with torch.no_grad():
+            for _, t_batch in enumerate(dataloader):
+                # Generate embeddings
+                t_batch["text_embeddings"] = self._generate_embeddings(t_batch)
+
+                # Prepare input data and targets
+                t_inputs = [t_batch[col].to(self.device) for col in self.config.data_cols]
+                t_targets = t_batch["polarity"].to(self.device)
+
+                # Inference
+                t_outputs = self.model(t_inputs)
+
+                # Accumulate correct prediction counts
+                n_correct += (torch.argmax(t_outputs.logits, -1) == t_targets).sum().item()
+                n_total += len(t_outputs.logits)
+
+                if t_targets_all is None:
+                    t_targets_all = t_targets
+                    t_outputs_all = t_outputs.logits
+                else:
+                    t_targets_all = torch.cat((t_targets_all, t_targets), dim=0)
+                    t_outputs_all = torch.cat((t_outputs_all, t_outputs.logits), dim=0)
+        test_acc = n_correct / n_total
+        f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average="macro")
+        return test_acc, f1
+
+    def _train_loop(
+        self,
+        criterion,
+        optimizer,
+        train_dataloader: DataLoader,
+        val_dataloader: DataLoader,
+        tmpdir: pathlib.Path,
+    ) -> Tuple[float, float, int]:
+        """
+        Method to execute a single train repeat
+        """
+        max_val_acc, max_val_f1 = 0, 0
+        max_val_epoch = 0
+        global_step = 0
+
+        for epoch in range(self.config.epochs):
+            logging.info(f"Training epoch: {epoch}")
+            n_correct, n_total, loss_total = 0, 0, 0
+            self.model.train()
+            for _, batch in enumerate(train_dataloader):
+                global_step += 1
+                optimizer.zero_grad()
+
+                # Generate embeddings
+                batch["text_embeddings"] = self._generate_embeddings(batch)
+
+                # Prepare input data and targets
+                inputs = [batch[col].to(self.device) for col in self.config.data_cols]
+                targets = batch["polarity"].to(self.device)
+
+                # Inference
+                outputs = self.model(inputs)
+                loss = criterion(outputs.logits, targets)
+                loss.backward()
+                optimizer.step()
+
+                # Accumulate train accuracy and loss
+                n_correct += (torch.argmax(outputs.logits, -1) == targets).sum().item()
+                n_total += len(outputs.logits)
+                loss_total += loss.item() * len(outputs.logits)
+
+                # Report batch loop step results
+                if global_step % self.config.log_step == 0:
+                    train_acc = n_correct / n_total
+                    train_loss = loss_total / n_total
+                    logging.info(f"Train Acc: {train_acc:.4f}, Train Loss: {train_loss:.4f}")
+
+            # Run eval for validation dataloader
+            val_acc, val_f1 = self._evaluate_acc_f1(val_dataloader)
+            logging.info(
+                f"""
+                Epoch: {epoch}
+                Val Acc: {val_acc:.4f}
+                Val F1: {val_f1:.4f}
+                """
+            )
+
+            # Report new max F1
+            if val_f1 > max_val_f1:
+                logging.info(f"New max F1: {val_f1:.4f} @ epoch {epoch}")
+                max_val_f1 = val_f1
+
+            # Report new max acc and save if required
+            if val_acc > max_val_acc:
+                logging.info(f"New max Accuracy: {val_acc:.4f} @ epoch {epoch}")
+                max_val_acc = val_acc
+                max_val_epoch = epoch
+                self.model.save_pretrained(tmpdir)
+                logging.info(
+                    f"""
+                    Best model saved. Acc: {max_val_acc:.4f}, F1: {max_val_f1:.4f}, Epoch: {max_val_epoch}
+                    """
+                )
+
+            # Early stopping
+            if epoch - max_val_epoch >= self.config.patience:
+                logging.info("Early stopping")
+                break
+        return max_val_acc, max_val_f1, max_val_epoch
+
+    def _train(
+        self, train_dataloader: Union[DataLoader, BucketIterator], val_dataloader: Union[DataLoader, BucketIterator]
+    ) -> Dict[str, Dict[str, Union[int, float]]]:
+        """
+        Method to execute a repeat train loop. The repeat count is taken from the config.
+
+        Args:
+            train_dataloader (Union[DataLoader, BucketIterator]): dataloader for train dataset
+            val_dataloader (Union[DataLoader, BucketIterator]): dataloader for validation dataset
+
+        Returns:
+            Dict[str, Dict[str, Union[int, float]]]: return a dictionary containing the train results.
+        """
+        criterion = nn.CrossEntropyLoss()
+        _params = filter(lambda p: p.requires_grad, self.model.parameters())
+        optimizer = self._create_optimizer(_params, lr=self.config.learning_rate, weight_decay=self.config.l2reg)
+
+        repeat_result = {}
+        for i in range(self.config.repeats):
+            logging.info(f"Start repeat train loop: {i + 1}")
+            repeat_tmpdir = self.temp_dir.joinpath(f"repeat{i + 1}")
+
+            self._reset_params()
+            max_val_acc, max_val_f1, max_val_epoch = self._train_loop(
+                criterion, optimizer, train_dataloader, val_dataloader, repeat_tmpdir
+            )
+
+            # Record repeat runs
+            repeat_result[f"Repeat_{i + 1}"] = {
+                "max_val_acc": max_val_acc,
+                "max_val_f1": max_val_f1,
+                "max_val_epoch": max_val_epoch,
+                "tmp_dir": repeat_tmpdir,
+            }
+
+            # Overwrite global stats
+            if max_val_acc > self.global_max_acc:
+                self.global_max_acc = max_val_acc
+                self.global_best_model_tmpdir = repeat_tmpdir
+            if max_val_f1 > self.global_max_f1:
+                self.global_max_f1 = max_val_f1
+
+        return repeat_result
+
+
+class SenticGCNBertTrainer(SenticGCNBaseTrainer):
+    """
+    Trainer class derived from SenticGCNBaseTrainer. Used for training SenticGCNBertModel.
+
+    Args:
+        config (SenticGCNTrainArgs): Training config for SenticGCNBertModel
+    """
+
+    def __init__(self, config: SenticGCNTrainArgs) -> None:
+        super().__init__(config)
+        self.config = config
+        # Create tokenizer
+        tokenizer = self._create_tokenizer()
+        # Create embedding model
+        self.embed = self._create_embedding_model()
+        self.embed.to(self.device)
+        # Create model
+        self.model = self._create_model()
+        self.model.to(self.device)
+        # Create dataset
+        data_gen = SenticGCNDatasetGenerator(config, tokenizer)
+        self.train_data, self.val_data, self.test_data = data_gen.generate_datasets()
+        del data_gen  # delete unused dataset generator to free memory
+
+    def _create_tokenizer(self) -> SenticGCNBertTokenizer:
+        """
+        Private method to construct tokenizer via the from_pretrained method.
+
+        Returns:
+            SenticGCNBertTokenizer: return a SenticGCNBertTokenizer instance.
+        """
+        return SenticGCNBertTokenizer.from_pretrained(self.config.tokenizer)
+
+    def _create_embedding_model(self) -> SenticGCNBertEmbeddingModel:
+        """
+        Private helper method to create the bert based embedding models.
+
+        Returns:
+            SenticGCNBertEmbeddingModel: return instance of pretrained SenticGCNBertEmbeddingModel
+        """
+        config = SenticGCNBertEmbeddingConfig.from_pretrained(self.config.embedding_model)
+        return SenticGCNBertEmbeddingModel.from_pretrained(self.config.embedding_model, config=config)
+
+    def _create_model(self) -> SenticGCNBertModel:
+        """
+        Private helper method to create the SenticGCNBertModel instance.
+
+        Returns:
+            SenticGCNBertModel: return a SenticGCNBertModel based on SenticGCNBertConfig
+        """
+        model_config = SenticGCNBertConfig(
+            hidden_dim=self.config.hidden_dim,
+            max_seq_len=self.config.max_len,
+            polarities_dim=self.config.polarities_dim,
+            dropout=self.config.dropout,
+            device=self.config.device,
+            loss_function=self.config.loss_function,
+        )
+        return SenticGCNBertModel(model_config)
+
+    def _reset_params(self) -> None:
+        """
+        Private helper method to reset model parameters.
+        To be used during the repeat train loops.
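+
+        Note:
+            Only parameters outside the pretrained BERT submodule are re-initialised:
+            multi-dimensional weights via ``self.initializer`` (e.g. xavier_uniform_),
+            and 1-D parameters from ``U(-stdv, stdv)`` with ``stdv = 1 / sqrt(n)``.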
+ """ + for child in self.model.children(): + if type(child) != SenticGCNBertPreTrainedModel: + for param in child.parameters(): + if param.requires_grad: + if len(param.shape) > 1: + self.initializer(param) + else: + stdv = 1.0 / math.sqrt(param.shape[0]) + nn.init.uniform_(param, a=-stdv, b=stdv) + + def _generate_data_loaders(self) -> Tuple[DataLoader, DataLoader, DataLoader]: + """ + Private helper method to generate train, val and test dataloaders. + + Returns: + Tuple[DataLoader, DataLoader, DataLoader]: return train, val and test dataloaders. + """ + train_dataloader = DataLoader(self.train_data, batch_size=self.config.batch_size, shuffle=True) + val_dataloader = DataLoader(self.val_data, batch_size=self.config.batch_size, shuffle=False) + test_dataloader = DataLoader(self.test_data, batch_size=self.config.batch_size, shuffle=False) + return train_dataloader, val_dataloader, test_dataloader + + def _generate_embeddings(self, batch: List[torch.Tensor]) -> torch.Tensor: + """ + Private helper method to generate embeddings. + + Args: + batch (List[torch.Tensor]): a batch of sub dataset + + Returns: + torch.Tensor: return embedding tensor + """ + text_bert_indices = batch["text_bert_indices"].to(self.device) + bert_segment_indices = batch["bert_segment_indices"].to(self.device) + + return self.embed(text_bert_indices, token_type_ids=bert_segment_indices)["last_hidden_state"] + + def train(self) -> None: + """ + Main train method + """ + # Generate data_loaders + train_dataloader, val_dataloader, test_dataloader = self._generate_data_loaders() + + # Run main train + repeat_result = self._train(train_dataloader, val_dataloader) + + # Recreate best model from all repeat loops + config_path = self.global_best_model_tmpdir.joinpath("config.json") + model_config = SenticGCNBertConfig.from_pretrained(config_path) + model_path = self.global_best_model_tmpdir.joinpath("pytorch_model.bin") + self.model = SenticGCNBertModel.from_pretrained(model_path, config=model_config) + self.model.to(self.device) + + # Evaluate test set + test_acc, test_f1 = self._evaluate_acc_f1(test_dataloader) + logging.info(f"Best Model - Test Acc: {test_acc:.4f} - Test F1: {test_f1:.4f}") + + repeat_result["test"] = {"max_val_acc": test_acc, "max_val_f1": test_f1} + + self._save_results(repeat_result) + self._save_model() + self._clean_temp_dir(repeat_result) + + logging.info("Training Completed!") + + +class SenticGCNTrainer(SenticGCNBaseTrainer): + """ + Trainer class derived from SenticGCNBaseTrainer. Used for training SenticGCNModel. + + Args: + config (SenticGCNTrainArgs): Training config for SenticGCNModel + """ + + def __init__(self, config: SenticGCNTrainArgs) -> None: + super().__init__(config) + self.config = config + # Create tokenizer + tokenizer = self._create_tokenizer() + # Create embedding model + self.embed = self._create_embedding_model(tokenizer.vocab) + self.embed.to(self.device) + # Create model + self.model = self._create_model() + self.model.to(self.device) + # Create dataset + data_gen = SenticGCNDatasetGenerator(config, tokenizer) + self.train_data, self.val_data, self.test_data = data_gen.generate_datasets() + del data_gen # delete unused dataset generator to free memory + + def _create_tokenizer(self) -> SenticGCNTokenizer: + """ + Private method to construct tokenizer either via the from_pretrained method or + constructing the tokenizer using input dataset files. + + Returns: + SenticGCNTokenizer: return a SenticGCNTokenizer instance. 
+ """ + if not self.config.train_tokenizer: + return SenticGCNTokenizer.from_pretrained(self.config.tokenizer) + else: + tokenizer = SenticGCNTokenizer( + train_files=[*self.config.dataset_train, *self.config.dataset_test], train_vocab=True + ) + if self.config.save_tokenizer: + tokenizer.save_pretrained(self.config.save_tokenizer_path) + return tokenizer + + def _create_embedding_model(self, vocab: Dict[str, int]) -> SenticGCNEmbeddingModel: + """ + Private method to construct embedding model either via the from_pretrained method or + building the embedding model from word vector files. (e.g. GloVe word vectors) + + Args: + vocab (Dict[str, int]): dictionary of vocab from tokenizer + + Returns: + SenticGCNEmbeddingModel: return a SenticGCNEmbeddingModel instance. + """ + if not self.config.build_embedding_model: + config_filename = "config.json" + model_filename = "pytorch_model.bin" + if self.config.embedding_model.startswith("https://") or self.config.embedding_model.startswith("http://"): + # Load from cloud + config_url = urllib.parse.urljoin(self.config.embedding_model, config_filename) + model_url = urllib.parse.urljoin(self.config.embedding_model, model_filename) + embedding_config = SenticGCNEmbeddingConfig.from_pretrained(config_url) + embedding_model = SenticGCNEmbeddingModel.from_pretrained(model_url, config=embedding_config) + else: + # Load from local folder + config_path = pathlib.Path(self.config.embedding_model).joinpath(config_filename) + embedding_config = SenticGCNEmbeddingConfig.from_pretrained(config_path) + embed_path = pathlib.Path(self.config.embedding_model).joinpath(model_filename) + embedding_model = SenticGCNEmbeddingModel.from_pretrained(embed_path, config=embedding_config) + return embedding_model + else: + embedding_model = SenticGCNEmbeddingModel.build_embedding_model( + self.config.word_vec_file_path, vocab, self.config.embed_dim + ) + if self.config.save_embedding_model: + embedding_model.save_pretrained(self.config.save_embedding_model_path) + return embedding_model + + def _create_model(self) -> SenticGCNModel: + """ + Private helper method to create the SenticGCNModel instance. + + Returns: + SenticGCNModel: return a SenticGCNModel based on SenticGCNConfig + """ + model_config = SenticGCNConfig( + embed_dim=self.config.embed_dim, + hidden_dim=self.config.hidden_dim, + polarities_dim=self.config.polarities_dim, + dropout=self.config.dropout, + device=self.config.device, + loss_function=self.config.loss_function, + ) + return SenticGCNModel(model_config) + + def _reset_params(self) -> None: + """ + Private helper method to reset model parameters. + To be used during repeats train loop. + """ + for param in self.model.parameters(): + if param.requires_grad: + if len(param.shape) > 1: + self.initializer(param) + else: + stdv = 1.0 / math.sqrt(param.shape[0]) + nn.init.uniform_(param, a=-stdv, b=stdv) + + def _generate_data_loaders(self) -> Tuple[BucketIterator, BucketIterator, BucketIterator]: + """ + Private helper method to generate train, val and test dataloaders. + + Returns: + Tuple[BucketIterator, BucketIterator, BucketIterator]: return train, val and test bucketiterators. 
+ """ + train_dataloader = BucketIterator(self.train_data, batch_size=self.config.batch_size, shuffle=True) + val_dataloader = BucketIterator(self.val_data, batch_size=self.config.batch_size, shuffle=False) + test_dataloader = BucketIterator(self.test_data, batch_size=self.config.batch_size, shuffle=False) + return train_dataloader, val_dataloader, test_dataloader + + def _generate_embeddings(self, batch: List[torch.Tensor]) -> torch.Tensor: + """ + Private helper method to generate embeddings. + + Args: + batch (List[torch.Tensor]): a batch of sub dataset + + Returns: + torch.Tensor: return embedding tensor + """ + text_indices = batch["text_indices"].to(self.device) + return self.embed(text_indices) + + def train(self) -> None: + """ + Main train method + """ + # Generate data_loaders + train_dataloader, val_dataloader, test_dataloader = self._generate_data_loaders() + + # Run main train + repeat_result = self._train(train_dataloader, val_dataloader) + logging.info(f"Best Train Acc: {self.global_max_acc} - Best Train F1: {self.global_max_f1}") + + # Recreate best model from all repeat loops + config_path = self.global_best_model_tmpdir.joinpath("config.json") + model_config = SenticGCNConfig.from_pretrained(config_path) + model_path = self.global_best_model_tmpdir.joinpath("pytorch_model.bin") + self.model = SenticGCNModel.from_pretrained(model_path, config=model_config) + self.model.to(self.device) + + # Evaluate test set + test_acc, test_f1 = self._evaluate_acc_f1(test_dataloader) + logging.info(f"Best Model - Test Acc: {test_acc:.4f} - Test F1: {test_f1:.4f}") + + repeat_result["test"] = {"max_val_acc": test_acc, "max_val_f1": test_f1} + + self._save_results(repeat_result) + self._save_model() + self._clean_temp_dir(repeat_result) + + logging.info("Training Completed!") + + +if __name__ == "__main__": + cfg = parse_args_and_load_config() + if cfg.seed is not None: + set_random_seed(cfg.seed) + trainer = SenticGCNTrainer(cfg) if cfg.model == "senticgcn" else SenticGCNBertTrainer(cfg) + trainer.train() diff --git a/sgnlp/models/sentic_gcn/utils.py b/sgnlp/models/sentic_gcn/utils.py new file mode 100644 index 0000000..30a89e2 --- /dev/null +++ b/sgnlp/models/sentic_gcn/utils.py @@ -0,0 +1,617 @@ +import argparse +import json +import logging +import pickle +import random +import pathlib +import requests +import urllib +import math +import tempfile +import shutil +from typing import Dict, List, Tuple, Union + +import numpy as np +import spacy +import torch +from torch.utils.data import random_split, Dataset +from transformers import PreTrainedTokenizer +from transformers.tokenization_utils_base import BatchEncoding + +from .data_class import SenticGCNTrainArgs + + +def parse_args_and_load_config( + config_path: str = "config/senticnet_gcn_config.json", +) -> SenticGCNTrainArgs: + """Get config from config file using argparser + + Returns: + SenticGCNTrainArgs: SenticGCNTrainArgs instance populated from config + """ + parser = argparse.ArgumentParser(description="SenticASGCN Training") + parser.add_argument("--config", type=str, default=config_path) + args = parser.parse_args() + + cfg_path = pathlib.Path(__file__).parent / args.config + with open(cfg_path, "r") as cfg_file: + cfg = json.load(cfg_file) + + sentic_asgcn_args = SenticGCNTrainArgs(**cfg) + return sentic_asgcn_args + + +def set_random_seed(seed: int = 776) -> None: + """Helper method to set random seeds for python, numpy and torch + + Args: + seed (int, optional): seed value to set. Defaults to 776. 
+ """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def download_tokenizer_files( + base_url: str, + save_folder: Union[str, pathlib.Path], + files: List[str] = ["special_tokens_map.json", "tokenizer_config.json", "vocab.pkl"], +) -> None: + """ + Helper method to download files from online storage. + + Args: + base_url (str): Url string to storage folder. + save_folder (Union[str, pathlib.Path]): + Local folder to save downloaded files. Folder will be created if it does not exists. + """ + file_paths = [urllib.parse.urljoin(base_url, file_name) for file_name in files] + for file_path in file_paths: + download_url_file(file_path, save_folder) + + +def download_url_file(url: str, save_folder: Union[str, pathlib.Path]) -> None: + """ + Helper method to download and save url file. + + Args: + url (str): Url of file to download. + save_folder (Union[str, pathlib.Path]): Folder to save downloaded file. Will be created if it does not exists. + """ + save_folder_path = pathlib.Path(save_folder) if not isinstance(save_folder, pathlib.Path) else save_folder + save_folder_path.mkdir(exist_ok=True) + fn_start_pos = url.rfind("/") + 1 + file_name = url[fn_start_pos:] + save_file_path = save_folder_path.joinpath(file_name) + req = requests.get(url) + if req.status_code == requests.codes.ok: + with open(save_file_path, "wb") as f: + for data in req: + f.write(data) + else: + logging.error(f"Fail to request files from {url}.") + + +def pad_and_truncate( + sequence: List[float], + max_len: int, + dtype: str = "int64", + padding: str = "post", + truncating: str = "post", + value: int = 0, +) -> np.ndarray: + """ + Helper method for padding and truncating text and aspect segment. + + Args: + sequence (List[float]): input sequence of indices + max_len (int): maximum len to pad + dtype (str, optional): data type to cast indices. Defaults to "int64". + padding (str, optional): type of padding, 'pre' or 'post'. Defaults to "post". + truncating (str, optional): type of truncating, 'pre' or 'post'. Defaults to "post". + value (int, optional): value used for padding. Defaults to 0. + + Returns: + np.ndarray: return a ndarray padded to the max_len + """ + seq_arr = (np.ones(max_len) * value).astype(dtype) + trunc = sequence[-max_len:] if truncating == "pre" else sequence[:max_len] + trunc = np.asarray(trunc, dtype=dtype) + if padding == "post": + seq_arr[: len(trunc)] = trunc + else: + seq_arr[-len(trunc) :] = trunc + return seq_arr + + +def load_word_vec(word_vec_file_path: str, vocab: Dict[str, int], embed_dim: int = 300) -> Dict[str, np.asarray]: + """ + Helper method to load word vectors from file (e.g. GloVe) for each word in vocab. + + Args: + word_vec_file_path (str): full file path to word vectors. + vocab (Dict[str, int]): dictionary of vocab word as key and word index as values. + embed_dim (int, optional): embedding dimension. Defaults to 300. + + Returns: + Dict[str, np.asarray]: dictionary with words as key and word vectors as values. 
+ """ + with open(word_vec_file_path, "r", encoding="utf-8", newline="\n", errors="ignore") as fin: + word_vec = {} + for line in fin: + tokens = line.rstrip().split() + word, vec = " ".join(tokens[:-embed_dim]), tokens[-embed_dim:] + if word in vocab.keys(): + word_vec[word] = np.asarray(vec, dtype="float32") + return word_vec + + +def build_embedding_matrix( + word_vec_file_path: str, + vocab: Dict[str, int], + embed_dim: int = 300, + save_embed_matrix: bool = False, + save_embed_file_path: str = None, +) -> np.ndarray: + """ + Helper method to generate an embedding matrix. + + Args: + word_vec_file_path (str): full file path to word vectors. + vocab (Dict[str, int]): dictionary of vocab word as key and word index as values. + embed_dim (int, optional): embedding dimension. Defaults to 300. + save_embed_matrix (bool, optional): flag to indicate if . Defaults to False. + save_embed_directory (str, optional): [description]. Defaults to None. + + Returns: + np.array: numpy array of embedding matrix + """ + embedding_matrix = np.zeros((len(vocab), embed_dim)) + embedding_matrix[1, :] = np.random.uniform(-1 / np.sqrt(embed_dim), 1 / np.sqrt(embed_dim), (1, embed_dim)) + word_vec = load_word_vec(word_vec_file_path, vocab, embed_dim) + for word, idx in vocab.items(): + vec = word_vec.get(word) + if vec is not None: + embedding_matrix[idx] = vec + + if save_embed_matrix: + save_file_path = pathlib.Path(save_embed_file_path) + if not save_file_path.exists(): + save_file_path.parent.mkdir(exist_ok=True) + with open(save_file_path, "wb") as fout: + pickle.dump(embedding_matrix, fout) + + return embedding_matrix + + +def load_and_process_senticnet( + senticnet_file_path: str = None, + save_preprocessed_senticnet: bool = False, + saved_preprocessed_senticnet_file_path: str = "senticnet.pkl", +) -> Dict[str, float]: + """ + Helper method to load and process senticnet. Default is SenticNet 5.0. + If a saved preprocess senticnet file is available, and save flag is set to false, it will be loaded from file instead. + Source: + https://github.com/BinLiang-NLP/Sentic-GCN/tree/main/senticnet-5.0 + + Args: + senticnet_file_path (str): File path to senticnet 5.0 file. + save_preprocessed_senticnet (bool): Flag to indicate if processed senticnet should be saved. + saved_preprocessed_senticnet_file_path: (str): File path to saved preprocessed senticnet file. + + Returns: + Dict[str, float]: return dictionary with concept word as keys and intensity as values. + """ + saved_senticnet_file_path = pathlib.Path(saved_preprocessed_senticnet_file_path) + if saved_senticnet_file_path.exists() and not save_preprocessed_senticnet: + with open(saved_senticnet_file_path, "rb") as f: + sentic_dict = pickle.load(f) + else: + senticnet_file_path = pathlib.Path(senticnet_file_path) + sentic_dict = {} + with open(senticnet_file_path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + items = line.split("\t") + if "_" in items[0] or "CONCEPT" == items[0]: + continue # skip words with '_' + sentic_dict[items[0]] = items[-1] + if save_preprocessed_senticnet: + saved_senticnet_file_path.parent.mkdir(exist_ok=True) + with open(saved_senticnet_file_path, "wb") as f: + pickle.dump(sentic_dict, f) + return sentic_dict + + +def generate_dependency_adj_matrix(text: str, aspect: str, senticnet: Dict[str, float], spacy_pipeline) -> np.ndarray: + """ + Helper method to generate senticnet depdency adj matrix. 
+
+    Args:
+        text (str): input text to process
+        aspect (str): aspect from input text
+        senticnet (Dict[str, float]): dictionary of preprocessed senticnet. See load_and_process_senticnet()
+        spacy_pipeline : Spacy pretrained pipeline (e.g. 'en_core_web_sm')
+
+    Returns:
+        np.ndarray: return ndarray representing the adjacency matrix.
+    """
+    document = spacy_pipeline(text)
+    seq_len = len(text.split())
+    matrix = np.zeros((seq_len, seq_len)).astype("float32")
+    for token in document:
+        sentic = float(senticnet[str(token)]) + 1.0 if str(token) in senticnet else 0
+        if str(token) in aspect:
+            sentic += 1.0
+        if token.i < seq_len:
+            matrix[token.i][token.i] = 1.0 * sentic
+            for child in token.children:
+                if str(child) in aspect:
+                    sentic += 1.0
+                if child.i < seq_len:
+                    matrix[token.i][child.i] = 1.0 * sentic
+                    matrix[child.i][token.i] = 1.0 * sentic
+    return matrix
+
+
+class SenticGCNDataset(Dataset):
+    """
+    Data class for SenticGCN dataset.
+    """
+
+    def __init__(self, data: List[Dict[str, torch.Tensor]]) -> None:
+        self.data = data
+
+    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
+        return self.data[index]
+
+    def __len__(self):
+        return len(self.data)
+
+
+class SenticGCNDatasetGenerator:
+    """
+    Main dataset generator class to preprocess raw dataset file.
+    Set mode to 'train' to generate dataset for training.
+    Set mode to 'test' to generate dataset for evaluation from eval_args.
+    """
+
+    def __init__(self, config: SenticGCNTrainArgs, tokenizer: PreTrainedTokenizer, mode: str = "train") -> None:
+        self.config = config
+        self.senticnet = self._load_senticnet(mode)
+        self.spacy_pipeline = spacy.load(
+            config.spacy_pipeline if mode == "train" else config.eval_args["spacy_pipeline"]
+        )
+        self.tokenizer = tokenizer
+
+    def _load_senticnet(self, mode: str) -> Dict[str, float]:
+        if mode == "train":
+            senticnet_ = load_and_process_senticnet(
+                self.config.senticnet_word_file_path,
+                self.config.save_preprocessed_senticnet,
+                self.config.saved_preprocessed_senticnet_file_path,
+            )
+        else:
+            if self.config.eval_args["senticnet"].startswith("https://") or self.config.eval_args[
+                "senticnet"
+            ].startswith("http://"):
+                with tempfile.TemporaryDirectory() as tmpdir:
+                    temp_dir = pathlib.Path(tmpdir)
+                    download_url_file(self.config.eval_args["senticnet"], temp_dir)
+                    saved_path = temp_dir.joinpath("senticnet.pickle")
+                    senticnet_ = load_and_process_senticnet(saved_preprocessed_senticnet_file_path=saved_path)
+                    shutil.rmtree(temp_dir, ignore_errors=True)
+            elif self.config.eval_args["senticnet"].endswith(".pkl") or self.config.eval_args["senticnet"].endswith(
+                ".pickle"
+            ):
+                senticnet_ = load_and_process_senticnet(
+                    saved_preprocessed_senticnet_file_path=self.config.eval_args["senticnet"]
+                )
+            else:
+                raise ValueError(
+                    """
+                    Error initializing SenticNet!
+                    Please only provide url to pickle file cloud storage location or local file path.
+                    """
+                )
+        return senticnet_
+
+    def _read_raw_dataset(self, files_path: List[str]) -> List[str]:
+        """
+        Private helper method to read raw dataset files based on requested type (e.g. Train or Test).
+
+        Args:
+            files_path (List[str]): list of file paths to the raw dataset files
+
+        Returns:
+            List[str]: list of str consisting of the full text, aspect and polarity index.
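+
+        Example:
+            Each sample spans three consecutive lines: the sentence with the aspect
+            masked as ``$T$``, the aspect term, then the polarity (-1, 0 or 1)::
+
+                $T$ is super fast , around anywhere from 35 seconds to 1 minute .
+                Boot time
+                1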
+ """ + all_lines = [] + for dataset_file in files_path: + with open(dataset_file, "r", encoding="utf-8", newline="\n", errors="ignore") as f: + lines = f.readlines() + all_lines = all_lines + lines + return all_lines + + def _generate_senticgcn_dataset(self, raw_data: List[str]) -> Dict[str, List]: + """ + Data preprocess method to generate all indices required for SenticGCN model training. + + Args: + raw_data (List[str]): list of text, aspect word and polarity read from raw dataset file. + + Returns: + Dict[str, List]]: return a dictionary of dataset sub-type and their list of values. + """ + all_data = [] + for i in range(0, len(raw_data), 3): + # Process full text, aspect and polarity index + text_left, _, text_right = [s.lower().strip() for s in raw_data[i].partition("$T$")] + aspect = raw_data[i + 1].lower().strip() + full_text = f"{text_left} {aspect} {text_right}" + polarity = raw_data[i + 2].strip() + + # Process indices + text_indices = self.tokenizer( + full_text, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + aspect_indices = self.tokenizer( + aspect, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + left_indices = self.tokenizer( + text_left, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + polarity = int(polarity) + 1 + graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) + all_data.append( + { + "text_indices": text_indices["input_ids"], + "aspect_indices": aspect_indices["input_ids"], + "left_indices": left_indices["input_ids"], + "polarity": polarity, + "sdat_graph": graph, + } + ) + return all_data + + def _generate_senticgcnbert_dataset(self, raw_data: List[str]) -> Dict[str, List]: + """ + Data preprocess method to generate all indices required for SenticGCNBert model training. + + Args: + raw_data (List[str]): List of text, aspect word and polarity read from raw dataset file. + + Returns: + Dict[str, List]: return a dictionary of dataset sub-type and their values. 
+ """ + all_data = [] + max_len = self.config.max_len + for i in range(0, len(raw_data), 3): + # Process full text, aspect and polarity index + text_left, _, text_right = [s.lower().strip() for s in raw_data[i].partition("$T$")] + aspect = raw_data[i + 1].lower().strip() + polarity = raw_data[i + 2].strip() + full_text = f"{text_left} {aspect} {text_right}" + full_text_with_bert_tokens = f"[CLS] {full_text} [SEP] {aspect} [SEP]" + + # Process indices + text_indices = self.tokenizer( + full_text, + max_length=max_len, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + aspect_indices = self.tokenizer( + aspect, + max_length=max_len, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + left_indices = self.tokenizer( + text_left, + max_length=max_len, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + polarity = int(polarity) + 1 + + # Process bert related indices + text_bert_indices = self.tokenizer( + full_text_with_bert_tokens, + max_length=max_len, + padding="max_length", + truncation=True, + add_special_tokens=False, + return_tensors=None, + return_attention_mask=False, + return_token_type_ids=False, + ) + text_len = np.sum(text_indices["input_ids"] != 0) + aspect_len = np.sum(aspect_indices["input_ids"] != 0) + + # array of [0] for texts including [CLS] and [SEP] and [1] for aspect and ending [SEP] + concat_segment_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1) + concat_segment_indices = pad_and_truncate(concat_segment_indices, max_len) + + # Process graph + graph = generate_dependency_adj_matrix(full_text, aspect, self.senticnet, self.spacy_pipeline) + sdat_graph = np.pad( + graph, + ( + (0, max_len - graph.shape[0]), + (0, max_len - graph.shape[0]), + ), + "constant", + ) + + all_data.append( + { + "text_indices": torch.tensor(text_indices["input_ids"]), + "aspect_indices": torch.tensor(aspect_indices["input_ids"]), + "left_indices": torch.tensor(left_indices["input_ids"]), + "text_bert_indices": torch.tensor(text_bert_indices["input_ids"]), + "bert_segment_indices": torch.tensor(concat_segment_indices), + "polarity": torch.tensor(polarity), + "sdat_graph": torch.tensor(sdat_graph), + } + ) + return all_data + + def generate_datasets(self) -> Tuple[SenticGCNDataset, SenticGCNDataset, SenticGCNDataset]: + """ + Main wrapper method to generate datasets for both SenticGCN and SenticGCNBert based on config. + + Returns: + Tuple[SenticGCNDataset, SenticGCNDataset, SenticGCNDataset]: + return SenticGCNDataset instances for train/val/test data. 
+ """ + # Read raw data from dataset files + raw_train_data = self._read_raw_dataset(self.config.dataset_train) + raw_test_data = self._read_raw_dataset(self.config.dataset_test) + + # Generate dataset dictionary + if self.config.model == "senticgcn": + train_data = self._generate_senticgcn_dataset(raw_train_data) + test_data = self._generate_senticgcn_dataset(raw_test_data) + else: + train_data = self._generate_senticgcnbert_dataset(raw_train_data) + test_data = self._generate_senticgcnbert_dataset(raw_test_data) + # Train/Val/Test split + if self.config.valset_ratio > 0: + valset_len = int(len(train_data) * self.config.valset_ratio) + train_data, val_data = random_split(train_data, (len(train_data) - valset_len, valset_len)) + else: + val_data = test_data + return SenticGCNDataset(train_data), SenticGCNDataset(val_data), SenticGCNDataset(test_data) + + def generate_test_datasets(self) -> SenticGCNDataset: + """ + Main wrapper method to generate test datasets for both SenticGCN and SenticGCNBert based on eval config. + + Returns: + SenticGCNDataset: return SenticGCNDataset instance for test datasets + """ + raw_data = self._read_raw_dataset(self.config.eval_args["test_filename"]) + if self.config.eval_args["model"] == "senticgcn": + test_data = self._generate_senticgcn_dataset(raw_data) + else: + test_data = self._generate_senticgcnbert_dataset(raw_data) + return SenticGCNDataset(test_data) + + +class BucketIterator: + """ + Iterator class for use with non-bert version of SenticGCN. + """ + + def __init__( + self, + data: List[Dict[str, BatchEncoding]], + batch_size: int, + sort_key: str = "text_indices", + shuffle=True, + sort=True, + ): + self.shuffle = shuffle + self.sort = sort + self.sort_key = sort_key + self.batches = self._sort_and_pad(data, batch_size) + self.batch_len = len(self.batches) + + def _sort_and_pad(self, data: List[Dict[str, List]], batch_size: int) -> List[Dict[str, List[torch.Tensor]]]: + """ + Private method to sort and pad input dataset. 
+
+        Args:
+            data (List[Dict[str, List]]): input dataset
+            batch_size (int): batch size to split dataset
+
+        Returns:
+            List[Dict[str, List[torch.Tensor]]]: return list of dictionary of dataset batches
+        """
+        num_batch = int(math.ceil(len(data) / batch_size))
+        if self.sort:
+            sorted_data = sorted(data, key=lambda x: len(x[self.sort_key]))
+        else:
+            sorted_data = data
+        batches = []
+        for i in range(num_batch):
+            batches.append(self._pad_data(sorted_data[i * batch_size : (i + 1) * batch_size]))
+        return batches
+
+    def _pad_data(self, batch_data: Dict[str, List]) -> Dict[str, List[torch.Tensor]]:
+        """
+        Private method to pad each sub dataset to the max length of its specific batch.
+
+        Args:
+            batch_data (Dict[str, List]): dictionary of sub dataset and their list of values
+
+        Returns:
+            Dict[str, List[torch.Tensor]]: return a dictionary of list of tensor values
+        """
+        batch_text_indices = []
+        batch_aspect_indices = []
+        batch_left_indices = []
+        batch_polarity = []
+        batch_sdat_graph = []
+        max_len = max([len(t[self.sort_key]) for t in batch_data])
+        for item in batch_data:
+            (text_indices, aspect_indices, left_indices, polarity, sdat_graph,) = (
+                item["text_indices"],
+                item["aspect_indices"],
+                item["left_indices"],
+                item["polarity"],
+                item["sdat_graph"],
+            )
+            # Calculate padding length
+            text_padding = [0] * (max_len - len(text_indices))
+            aspect_padding = [0] * (max_len - len(aspect_indices))
+            left_padding = [0] * (max_len - len(left_indices))
+
+            batch_text_indices.append(text_indices + text_padding)
+            batch_aspect_indices.append(aspect_indices + aspect_padding)
+            batch_left_indices.append(left_indices + left_padding)
+            batch_polarity.append(polarity)
+            batch_sdat_graph.append(
+                np.pad(sdat_graph, ((0, max_len - len(text_indices)), (0, max_len - len(text_indices))), "constant")
+            )
+
+        return {
+            "text_indices": torch.tensor(batch_text_indices),
+            "aspect_indices": torch.tensor(batch_aspect_indices),
+            "left_indices": torch.tensor(batch_left_indices),
+            "polarity": torch.tensor(batch_polarity),
+            "sdat_graph": torch.tensor(np.array(batch_sdat_graph)),
+        }
+
+    def __iter__(self):
+        if self.shuffle:
+            random.shuffle(self.batches)
+        for idx in range(self.batch_len):
+            yield self.batches[idx]
diff --git a/tests/sentic_gcn/test_data/senticnet.txt b/tests/sentic_gcn/test_data/senticnet.txt
new file mode 100644
index 0000000..bde4092
--- /dev/null
+++ b/tests/sentic_gcn/test_data/senticnet.txt
@@ -0,0 +1,15 @@
+CONCEPT POLARITY INTENSITY
+abandon negative -0.84
+abandoned negative -0.85
+abandoned_person negative -0.79
+abandoned_quarry negative -0.78
+abandonment negative -0.82
+abase negative -0.90
+abasement negative -0.90
+abash negative -0.77
+abashed negative -0.92
+abashment negative -0.76
+abasia negative -0.67
+abate negative -0.86
+abatement negative -0.85
+abattoir negative -0.77
diff --git a/tests/sentic_gcn/test_data/test_senticnet.pickle b/tests/sentic_gcn/test_data/test_senticnet.pickle
new file mode 100644
index 0000000..7db2ed6
Binary files /dev/null and b/tests/sentic_gcn/test_data/test_senticnet.pickle differ
diff --git a/tests/sentic_gcn/test_data/test_test.raw b/tests/sentic_gcn/test_data/test_test.raw
new file mode 100644
index 0000000..0bce32e
--- /dev/null
+++ b/tests/sentic_gcn/test_data/test_test.raw
@@ -0,0 +1,15 @@
+$T$ is super fast , around anywhere from 35 seconds to 1 minute .
+Boot time
+1
+$T$ would not fix the problem unless I bought your plan for $ 150 plus .
+tech support
+-1
+$T$ was easy .
+Set up +1 +Did not enjoy the new $T$ and touchscreen functions . +Windows 8 +-1 +Did not enjoy the new Windows 8 and $T$ . +touchscreen functions +-1 diff --git a/tests/sentic_gcn/test_data/test_train.raw b/tests/sentic_gcn/test_data/test_train.raw new file mode 100644 index 0000000..b81f040 --- /dev/null +++ b/tests/sentic_gcn/test_data/test_train.raw @@ -0,0 +1,15 @@ +I charge it at night and skip taking the $T$ with me because of the good battery life . +cord +0 +I charge it at night and skip taking the cord with me because of the good $T$ . +battery life +1 +The tech guy then said the $T$ does not do 1-to-1 exchange and I have to direct my concern to the `` sales '' team , which is the retail shop which I bought my netbook from . +service center +-1 +The tech guy then said the service center does not do 1-to-1 exchange and I have to direct my concern to the $T$ , which is the retail shop which I bought my netbook from . +`` sales '' team +-1 +The $T$ then said the service center does not do 1-to-1 exchange and I have to direct my concern to the `` sales '' team , which is the retail shop which I bought my netbook from . +tech guy +0 diff --git a/tests/sentic_gcn/test_data/test_vocab.pkl b/tests/sentic_gcn/test_data/test_vocab.pkl new file mode 100644 index 0000000..0da4656 Binary files /dev/null and b/tests/sentic_gcn/test_data/test_vocab.pkl differ diff --git a/tests/sentic_gcn/test_sentic_gcn_model.py b/tests/sentic_gcn/test_sentic_gcn_model.py new file mode 100644 index 0000000..44b0270 --- /dev/null +++ b/tests/sentic_gcn/test_sentic_gcn_model.py @@ -0,0 +1,187 @@ +import unittest + +import torch +from transformers import PretrainedConfig, PreTrainedModel, BertConfig, BertModel + +from sgnlp.models.sentic_gcn.config import ( + SenticGCNConfig, + SenticGCNBertConfig, + SenticGCNEmbeddingConfig, + SenticGCNBertEmbeddingConfig, +) +from sgnlp.models.sentic_gcn.modeling import ( + SenticGCNModel, + SenticGCNModelOutput, + SenticGCNBertModel, + SenticGCNBertModelOutput, + SenticGCNEmbeddingModel, + SenticGCNBertEmbeddingModel, +) + + +DEVICE = torch.device("cpu") + + +class TestSenticGCNConfigTestCase(unittest.TestCase): + def setUp(self) -> None: + self.config = SenticGCNConfig() + + def test_pretrained_config_base_class(self): + self.assertTrue(issubclass(self.config.__class__, PretrainedConfig)) + + def test_default_params(self): + self.assertEqual(self.config.embed_dim, 300) + self.assertEqual(self.config.hidden_dim, 300) + self.assertEqual(self.config.dropout, 0.3) + self.assertEqual(self.config.polarities_dim, 3) + self.assertEqual(self.config.loss_function, "cross_entropy") + + +class TestSenticGCNBertConfigTestCase(unittest.TestCase): + def setUp(self) -> None: + self.config = SenticGCNBertConfig() + + def test_pretrained_config_base_class(self): + self.assertTrue(issubclass(self.config.__class__, PretrainedConfig)) + + def test_default_params(self): + self.assertEqual(self.config.embed_dim, 300) + self.assertEqual(self.config.hidden_dim, 768) + self.assertEqual(self.config.max_seq_len, 85) + self.assertEqual(self.config.polarities_dim, 3) + self.assertEqual(self.config.dropout, 0.3) + self.assertEqual(self.config.loss_function, "cross_entropy") + + +class TestSenticGCNEmbeddingConfigTestCase(unittest.TestCase): + def setUp(self) -> None: + self.config = SenticGCNEmbeddingConfig() + + def test_pretrained_config_base_class(self): + self.assertTrue(issubclass(self.config.__class__, PretrainedConfig)) + + def test_default_params(self): + self.assertEqual(self.config.vocab_size, 
17662) + self.assertEqual(self.config.embed_dim, 300) + + +class TestSenticGCNBertEmbeddingConfigTestCase(unittest.TestCase): + def setUp(self) -> None: + self.config = SenticGCNBertEmbeddingConfig() + + def test_pretrained_config_base_class(self): + self.assertTrue(issubclass(self.config.__class__, PretrainedConfig)) + self.assertTrue(issubclass(self.config.__class__, BertConfig)) + + +class TestSenticGCNModel(unittest.TestCase): + def setUp(self) -> None: + config = SenticGCNConfig() + self.model = SenticGCNModel(config=config) + + def test_pretrained_model_base_class(self): + self.assertTrue(issubclass(self.model.__class__, PreTrainedModel)) + + def test_config_class(self): + self.assertEqual(self.model.config_class, SenticGCNConfig) + + def test_base_model_prefix(self): + self.assertEqual(self.model.base_model_prefix, "senticgcn") + + def test_forward_pass(self): + text_indices = torch.zeros( + [1, 10], + dtype=torch.float32, + device=DEVICE, + ) + for i in range(0, 3): + text_indices[0][i] = 1 + + aspect_indices = torch.zeros([1, 10], dtype=torch.float32, device=DEVICE) + aspect_indices[0][0] = 1 + + left_indices = torch.zeros([1, 10], dtype=torch.float32, device=DEVICE) + left_indices[0][0] = 1 + left_indices[0][1] = 1 + + input_tensors = [ + text_indices, + aspect_indices, + left_indices, + torch.zeros([1, 10, 300], dtype=torch.float32, device=DEVICE), + torch.zeros([1, 3, 3], dtype=torch.float32, device=DEVICE), + ] + + self.model.to(DEVICE) + self.model.eval() + result = self.model(input_tensors) + + self.assertEqual(type(result), SenticGCNModelOutput) + self.assertEqual(type(result.logits), torch.Tensor) + self.assertEqual(result.logits.shape, torch.Size([1, 3])) + + +class TestSenticGCNBertModel(unittest.TestCase): + def setUp(self) -> None: + config = SenticGCNBertConfig() + self.model = SenticGCNBertModel(config=config) + + def test_pretrained_model_base_class(self): + self.assertTrue(issubclass(self.model.__class__, PreTrainedModel)) + + def test_config_class(self): + self.assertEqual(self.model.config_class, SenticGCNBertConfig) + + def test_base_model_prefix(self): + self.assertEqual(self.model.base_model_prefix, "senticgcnbert") + + def test_forward_pass(self): + input_tensors = [ + torch.ones([1, 85], dtype=torch.float32, device=DEVICE), + torch.ones([1, 85], dtype=torch.float32, device=DEVICE), + torch.ones([1, 85], dtype=torch.float32, device=DEVICE), + torch.ones([1, 85, 768], dtype=torch.float32, device=DEVICE), + torch.ones([1, 85, 85], dtype=torch.float32, device=DEVICE), + ] + + self.model.to(DEVICE) + self.model.eval() + result = self.model(input_tensors) + + self.assertEqual(type(result), SenticGCNBertModelOutput) + self.assertEqual(type(result.logits), torch.Tensor) + self.assertEqual(result.logits.shape, torch.Size([1, 3])) + + +class TestSenticGCNEmbeddingModel(unittest.TestCase): + def setUp(self) -> None: + config = SenticGCNEmbeddingConfig() + self.model = SenticGCNEmbeddingModel(config=config) + + def test_pretrained_model_base_class(self): + self.assertTrue(issubclass(self.model.__class__, PreTrainedModel)) + + def test_config_class(self): + self.assertEqual(self.model.config_class, SenticGCNEmbeddingConfig) + + def test_base_model_prefix(self): + self.assertEqual(self.model.base_model_prefix, "senticgcnembedding") + + def test_forward_pass(self): + input_tensor = torch.ones([1, 100], dtype=torch.long, device=DEVICE) + self.model.to(DEVICE) + self.model.eval() + result = self.model(input_tensor) + + self.assertEqual(type(result), torch.Tensor) + 
self.assertEqual(result.shape, torch.Size([1, 100, 300])) + + +class TestSenticGCNBertEmbeddingModel(unittest.TestCase): + def setUp(self) -> None: + config = SenticGCNBertEmbeddingConfig() + self.model = SenticGCNBertEmbeddingModel(config=config) + + def test_pretrained_Bert_base_class(self): + self.assertTrue(issubclass(self.model.__class__, BertModel)) + self.assertTrue(issubclass(self.model.__class__, PreTrainedModel)) diff --git a/tests/sentic_gcn/test_sentic_gcn_postprocess.py b/tests/sentic_gcn/test_sentic_gcn_postprocess.py new file mode 100644 index 0000000..fe8c813 --- /dev/null +++ b/tests/sentic_gcn/test_sentic_gcn_postprocess.py @@ -0,0 +1,342 @@ +import unittest + +import torch + +from sgnlp.models.sentic_gcn.modeling import SenticGCNModelOutput, SenticGCNBertModelOutput +from sgnlp.models.sentic_gcn.preprocess import SenticGCNData, SenticGCNBertData +from sgnlp.models.sentic_gcn.postprocess import SenticGCNPostprocessor, SenticGCNBertPostprocessor + + +class TestSenticGCNPostprocessorTestCase(unittest.TestCase): + def setUp(self) -> None: + self.test_processed_inputs = [ + SenticGCNData( + full_text="soup is tasty but soup is a little salty. salty funkysoup.", + aspect="soup", + left_text="", + full_text_tokens=[ + "Soup", + "is", + "tasty", + "but", + "soup", + "is", + "a", + "little", + "salty.", + "Salty", + "funkysoup.", + ], + aspect_token_index=0, + ), + SenticGCNData( + full_text="soup is tasty but soup is a little salty. salty funkysoup.", + aspect="soup", + left_text="soup is tasty but", + full_text_tokens=[ + "Soup", + "is", + "tasty", + "but", + "soup", + "is", + "a", + "little", + "salty.", + "Salty", + "funkysoup.", + ], + aspect_token_index=4, + ), + SenticGCNData( + full_text="everyone that sat in the back outside agreed that it was the worst service we had ever received.", + aspect="service", + left_text="everyone that sat in the back outside agreed that it was the worst", + full_text_tokens=[ + "Everyone", + "that", + "sat", + "in", + "the", + "back", + "outside", + "agreed", + "that", + "it", + "was", + "the", + "worst", + "service", + "we", + "had", + "ever", + "received.", + ], + aspect_token_index=13, + ), + SenticGCNData( + full_text="it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + aspect="location", + left_text="it 's located in a strip mall near the beverly center , not the greatest", + full_text_tokens=[ + "it", + "'s", + "located", + "in", + "a", + "strip", + "mall", + "near", + "the", + "beverly", + "center", + ",", + "not", + "the", + "greatest", + "location", + ",", + "but", + "the", + "food", + "keeps", + "me", + "coming", + "back", + "for", + "more", + ".", + ], + aspect_token_index=15, + ), + SenticGCNData( + full_text="it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + aspect="food", + left_text="it 's located in a strip mall near the beverly center , not the greatest location , but the", + full_text_tokens=[ + "it", + "'s", + "located", + "in", + "a", + "strip", + "mall", + "near", + "the", + "beverly", + "center", + ",", + "not", + "the", + "greatest", + "location", + ",", + "but", + "the", + "food", + "keeps", + "me", + "coming", + "back", + "for", + "more", + ".", + ], + aspect_token_index=19, + ), + ] + self.test_model_outputs = SenticGCNModelOutput( + loss=None, + logits=torch.ones([5, 3], dtype=torch.float32), + ) + + def test_senticgcn_postprocess(self): + post_proc = 
SenticGCNPostprocessor() + post_outputs = post_proc(processed_inputs=self.test_processed_inputs, model_outputs=self.test_model_outputs) + self.assertEqual(len(post_outputs), 3) + for key in ["sentence", "aspects", "labels"]: + for output in post_outputs: + self.assertTrue(key in output.keys()) + self.assertEqual(len(post_outputs[0]["aspects"]), 2) + self.assertEqual(len(post_outputs[1]["aspects"]), 1) + self.assertEqual(len(post_outputs[2]["aspects"]), 2) + self.assertEqual(len(post_outputs[0]["labels"]), 2) + self.assertEqual(len(post_outputs[1]["labels"]), 1) + self.assertEqual(len(post_outputs[2]["labels"]), 2) + + def test_senticgcn_post_process_return_text_and_aspect(self): + post_proc = SenticGCNPostprocessor(return_full_text=True, return_aspects_text=True) + post_outputs = post_proc(processed_inputs=self.test_processed_inputs, model_outputs=self.test_model_outputs) + for key in ["sentence", "aspects", "labels", "full_text", "aspects_text"]: + for output in post_outputs: + self.assertTrue(key in output.keys()) + + +class TestSenticGCNBertPostprocessorTestCase(unittest.TestCase): + def setUp(self) -> None: + self.test_processed_inputs = [ + SenticGCNBertData( + full_text="soup is tasty but soup is a little salty. salty funkysoup.", + aspect="soup", + left_text="", + full_text_with_bert_tokens="[CLS] soup is tasty but soup is a little salty. salty funkysoup. [SEP] soup [SEP]", + full_text_tokens=[ + "Soup", + "is", + "tasty", + "but", + "soup", + "is", + "a", + "little", + "salty.", + "Salty", + "funkysoup.", + ], + aspect_token_index=0, + ), + SenticGCNBertData( + full_text="soup is tasty but soup is a little salty. salty funkysoup.", + aspect="soup", + left_text="soup is tasty but", + full_text_with_bert_tokens="[CLS] soup is tasty but soup is a little salty. salty funkysoup. [SEP] soup [SEP]", + full_text_tokens=[ + "Soup", + "is", + "tasty", + "but", + "soup", + "is", + "a", + "little", + "salty.", + "Salty", + "funkysoup.", + ], + aspect_token_index=4, + ), + SenticGCNBertData( + full_text="everyone that sat in the back outside agreed that it was the worst service we had ever received.", + aspect="service", + left_text="everyone that sat in the back outside agreed that it was the worst", + full_text_with_bert_tokens="[CLS] everyone that sat in the back outside agreed that it was the worst service we had ever received. [SEP] service [SEP]", + full_text_tokens=[ + "Everyone", + "that", + "sat", + "in", + "the", + "back", + "outside", + "agreed", + "that", + "it", + "was", + "the", + "worst", + "service", + "we", + "had", + "ever", + "received.", + ], + aspect_token_index=13, + ), + SenticGCNBertData( + full_text="it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + aspect="location", + left_text="it 's located in a strip mall near the beverly center , not the greatest", + full_text_with_bert_tokens="[CLS] it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more . 
[SEP] location [SEP]", + full_text_tokens=[ + "it", + "'s", + "located", + "in", + "a", + "strip", + "mall", + "near", + "the", + "beverly", + "center", + ",", + "not", + "the", + "greatest", + "location", + ",", + "but", + "the", + "food", + "keeps", + "me", + "coming", + "back", + "for", + "more", + ".", + ], + aspect_token_index=15, + ), + SenticGCNBertData( + full_text="it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + aspect="food", + left_text="it 's located in a strip mall near the beverly center , not the greatest location , but the", + full_text_with_bert_tokens="[CLS] it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more . [SEP] food [SEP]", + full_text_tokens=[ + "it", + "'s", + "located", + "in", + "a", + "strip", + "mall", + "near", + "the", + "beverly", + "center", + ",", + "not", + "the", + "greatest", + "location", + ",", + "but", + "the", + "food", + "keeps", + "me", + "coming", + "back", + "for", + "more", + ".", + ], + aspect_token_index=19, + ), + ] + self.test_model_outputs = SenticGCNBertModelOutput( + loss=None, + logits=torch.ones([5, 3], dtype=torch.float32), + ) + + def test_senticgcnbert_postprocess(self): + post_proc = SenticGCNBertPostprocessor() + post_outputs = post_proc(processed_inputs=self.test_processed_inputs, model_outputs=self.test_model_outputs) + self.assertEqual(len(post_outputs), 3) + for key in ["sentence", "aspects", "labels"]: + for output in post_outputs: + self.assertTrue(key in output.keys()) + self.assertEqual(len(post_outputs[0]["aspects"]), 2) + self.assertEqual(len(post_outputs[1]["aspects"]), 1) + self.assertEqual(len(post_outputs[2]["aspects"]), 2) + self.assertEqual(len(post_outputs[0]["labels"]), 2) + self.assertEqual(len(post_outputs[1]["labels"]), 1) + self.assertEqual(len(post_outputs[2]["labels"]), 2) + + def test_senticgcn_post_process_return_text_and_aspect(self): + post_proc = SenticGCNBertPostprocessor(return_full_text=True, return_aspects_text=True) + post_outputs = post_proc(processed_inputs=self.test_processed_inputs, model_outputs=self.test_model_outputs) + for key in ["sentence", "aspects", "labels", "full_text", "aspects_text"]: + for output in post_outputs: + self.assertTrue(key in output.keys()) diff --git a/tests/sentic_gcn/test_sentic_gcn_preprocess.py b/tests/sentic_gcn/test_sentic_gcn_preprocess.py new file mode 100644 index 0000000..bee1b10 --- /dev/null +++ b/tests/sentic_gcn/test_sentic_gcn_preprocess.py @@ -0,0 +1,191 @@ +import pathlib +import pytest +import unittest + +import torch + +from sgnlp.models.sentic_gcn.config import SenticGCNEmbeddingConfig, SenticGCNBertEmbeddingConfig +from sgnlp.models.sentic_gcn.modeling import SenticGCNEmbeddingModel, SenticGCNBertEmbeddingModel +from sgnlp.models.sentic_gcn.preprocess import ( + SenticGCNBasePreprocessor, + SenticGCNPreprocessor, + SenticGCNBertPreprocessor, + SenticGCNData, + SenticGCNBertData, +) +from sgnlp.models.sentic_gcn.tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer + + +PARENT_DIR = str(pathlib.Path(__file__).parent) + + +class TestSenticGCNPreprocessorTestCase(unittest.TestCase): + def setUp(self) -> None: + self.test_tokenizer = SenticGCNTokenizer( + train_files=[PARENT_DIR + "/test_data/test_train.raw", PARENT_DIR + "/test_data/test_test.raw"], + train_vocab=True, + ) + test_embed_config = SenticGCNEmbeddingConfig() + self.test_embed_model = 
SenticGCNEmbeddingModel(config=test_embed_config) + self.test_inputs = [ + {"aspects": ["Soup"], "sentence": "Soup is tasty but soup is a little salty. Salty funkysoup."}, # 1, -1 + { + "aspects": ["service"], + "sentence": "Everyone that sat in the back outside agreed that it was the worst service we had ever received.", + }, # -1 + { + "aspects": ["location", "food"], + "sentence": "it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + }, # 0, 1 + ] + self.test_senticnet = {"test": 1.0} + + @pytest.mark.slow + def test_senticgcn_preprocessor(self): + """ + Create preprocessor with all defaults input arguments + """ + pre_proc = SenticGCNPreprocessor() + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) + + for proc_input in processed_inputs: + self.assertTrue(isinstance(proc_input, SenticGCNData)) + for key in ["full_text", "aspect", "left_text", "full_text_tokens", "aspect_token_index"]: + self.assertTrue(hasattr(proc_input, key)) + + for proc_index in processed_indices: + self.assertTrue(isinstance(proc_index, torch.Tensor)) + self.assertEqual(processed_indices[0].shape, torch.Size([5, 27])) + self.assertEqual(processed_indices[1].shape, torch.Size([5, 27])) + self.assertEqual(processed_indices[2].shape, torch.Size([5, 27])) + self.assertEqual(processed_indices[3].shape, torch.Size([5, 27, 300])) + self.assertEqual(processed_indices[4].shape, torch.Size([5, 27, 27])) + + def test_senticgcn_preprocessor_from_external(self): + """ + Create preprocessor with tokenizer, embedding model and senticnet from external instances + """ + pre_proc = SenticGCNPreprocessor( + tokenizer=self.test_tokenizer, embedding_model=self.test_embed_model, senticnet=self.test_senticnet + ) + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) + + def test_senticgcn_preprocessor_from_file(self): + """ + Create preprocessor with senticnet from pickle file + """ + pre_proc = SenticGCNPreprocessor( + tokenizer=self.test_tokenizer, + embedding_model=self.test_embed_model, + senticnet=PARENT_DIR + "/test_data/test_senticnet.pickle", + ) + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) + + +class TestSenticGCNBertPreprocessorTestCase(unittest.TestCase): + def setUp(self) -> None: + self.test_tokenizer = SenticGCNBertTokenizer.from_pretrained("bert-base-uncased") + test_embed_config = 
SenticGCNBertEmbeddingConfig() + self.test_embed_model = SenticGCNBertEmbeddingModel(config=test_embed_config) + self.test_inputs = [ + {"aspects": ["Soup"], "sentence": "Soup is tasty but soup is a little salty. Salty funkysoup."}, # 1, -1 + { + "aspects": ["service"], + "sentence": "Everyone that sat in the back outside agreed that it was the worst service we had ever received.", + }, # -1 + { + "aspects": ["location", "food"], + "sentence": "it 's located in a strip mall near the beverly center , not the greatest location , but the food keeps me coming back for more .", + }, # 0, 1 + ] + self.test_senticnet = {"test": 1.0} + + @pytest.mark.slow + def test_senticgcnbert_preprocessor(self): + """ + Create preprocessor with all defaults input arguments + """ + pre_proc = SenticGCNBertPreprocessor() + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNBertTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNBertEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) + + for proc_input in processed_inputs: + self.assertTrue(isinstance(proc_input, SenticGCNBertData)) + for key in [ + "full_text", + "aspect", + "left_text", + "full_text_with_bert_tokens", + "full_text_tokens", + "aspect_token_index", + ]: + self.assertTrue(hasattr(proc_input, key)) + + for proc_index in processed_indices: + self.assertTrue(isinstance(proc_index, torch.Tensor)) + self.assertEqual(processed_indices[0].shape, torch.Size([5, 85])) + self.assertEqual(processed_indices[1].shape, torch.Size([5, 85])) + self.assertEqual(processed_indices[2].shape, torch.Size([5, 85])) + self.assertEqual(processed_indices[3].shape, torch.Size([5, 85, 768])) + self.assertEqual(processed_indices[4].shape, torch.Size([5, 85, 85])) + + def test_senticgcnbert_preprocessor_from_external(self): + """ + Create preprocessor with tokenizer, embedding model and senticnet from external instances + """ + pre_proc = SenticGCNBertPreprocessor( + tokenizer=self.test_tokenizer, embedding_model=self.test_embed_model, senticnet=self.test_senticnet + ) + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNBertTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNBertEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) + + def test_senticgcnbert_preprocessor_from_file(self): + """ + Create preprocessor with senticnet from pickle file + """ + pre_proc = SenticGCNBertPreprocessor( + tokenizer=self.test_tokenizer, + embedding_model=self.test_embed_model, + senticnet=PARENT_DIR + "/test_data/test_senticnet.pickle", + ) + self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor)) + self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNBertTokenizer) + self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNBertEmbeddingModel) + self.assertTrue(isinstance(pre_proc.senticnet, dict)) + + processed_inputs, processed_indices = pre_proc(self.test_inputs) + self.assertEqual(len(processed_inputs), 5) + self.assertEqual(len(processed_indices), 5) diff --git 
+
+    def test_senticgcnbert_preprocessor_from_external(self):
+        """
+        Create preprocessor with tokenizer, embedding model and senticnet from external instances
+        """
+        pre_proc = SenticGCNBertPreprocessor(
+            tokenizer=self.test_tokenizer, embedding_model=self.test_embed_model, senticnet=self.test_senticnet
+        )
+        self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor))
+        self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNBertTokenizer)
+        self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNBertEmbeddingModel)
+        self.assertTrue(isinstance(pre_proc.senticnet, dict))
+
+        processed_inputs, processed_indices = pre_proc(self.test_inputs)
+        self.assertEqual(len(processed_inputs), 5)
+        self.assertEqual(len(processed_indices), 5)
+
+    def test_senticgcnbert_preprocessor_from_file(self):
+        """
+        Create preprocessor with senticnet from pickle file
+        """
+        pre_proc = SenticGCNBertPreprocessor(
+            tokenizer=self.test_tokenizer,
+            embedding_model=self.test_embed_model,
+            senticnet=PARENT_DIR + "/test_data/test_senticnet.pickle",
+        )
+        self.assertTrue(issubclass(pre_proc.__class__, SenticGCNBasePreprocessor))
+        self.assertEqual(pre_proc.tokenizer.__class__, SenticGCNBertTokenizer)
+        self.assertEqual(pre_proc.embedding_model.__class__, SenticGCNBertEmbeddingModel)
+        self.assertTrue(isinstance(pre_proc.senticnet, dict))
+
+        processed_inputs, processed_indices = pre_proc(self.test_inputs)
+        self.assertEqual(len(processed_inputs), 5)
+        self.assertEqual(len(processed_indices), 5)
diff --git a/tests/sentic_gcn/test_sentic_gcn_tokenization.py b/tests/sentic_gcn/test_sentic_gcn_tokenization.py
new file mode 100644
index 0000000..ee18773
--- /dev/null
+++ b/tests/sentic_gcn/test_sentic_gcn_tokenization.py
@@ -0,0 +1,49 @@
+import pathlib
+import pytest
+import unittest
+
+from transformers import PreTrainedTokenizer
+
+from sgnlp.models.sentic_gcn.tokenization import SenticGCNTokenizer, SenticGCNBertTokenizer
+
+
+PARENT_DIR = str(pathlib.Path(__file__).parent)
+
+
+class TestSenticGCNTokenizerTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        self.test_train_files = [PARENT_DIR + "/test_data/test_train.raw", PARENT_DIR + "/test_data/test_test.raw"]
+        self.test_vocab_file = PARENT_DIR + "/test_data/test_vocab.pkl"
+
+    def test_senticgcn_tokenizer_from_vocab(self):
+        tokenizer = SenticGCNTokenizer(vocab_file=self.test_vocab_file)
+        self.assertTrue(issubclass(tokenizer.__class__, PreTrainedTokenizer))
+
+        output = tokenizer("fee fi fo fum")
+        self.assertEqual(output["input_ids"], [10, 20, 30, 40])
+
+    def test_senticgcn_tokenizer_from_train_files(self):
+        tokenizer = SenticGCNTokenizer(train_files=self.test_train_files, train_vocab=True)
+        self.assertTrue(issubclass(tokenizer.__class__, PreTrainedTokenizer))
+
+        output = tokenizer("night service center")
+        self.assertEqual(output["input_ids"], [6, 24, 25])
+
+
+class TestSenticGCNBertTokenizerTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        self.pretrained_tokenizer_name = "bert-base-uncased"
+
+    @pytest.mark.slow
+    def test_senticgcnbert_tokenizer(self):
+        tokenizer = SenticGCNBertTokenizer.from_pretrained(self.pretrained_tokenizer_name)
+        self.assertTrue(issubclass(tokenizer.__class__, PreTrainedTokenizer))
+
+        output = tokenizer("fee fi fo fum")
+        self.assertEqual(output["input_ids"], [7408, 10882, 1042, 2080, 11865, 2213])
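+        # Four words become six ids because BERT's WordPiece tokenizer splits
+        # out-of-vocabulary words into subword pieces; by the look of the ids,
+        # no [CLS]/[SEP] special tokens are added by this call. The padding
+        # calls below appear to follow standard HuggingFace semantics:
+        # padding="max_length" yields exactly max_length ids, even for "".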
"model": "senticgcn", + "save_best_model": True, + "save_model_path": str(self.model_save_folder), + "tokenizer": "senticgcn", + "train_tokenizer": True, + "save_tokenizer": False, + "save_tokenizer_path": "", + "embedding_model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/", + "build_embedding_model": False, + "save_embedding_model": False, + "save_embedding_model_path": "./embed_models/senticgcn_embed_semeval14_rest/", + "save_results": True, + "save_results_folder": str(self.results_save_folder), + "initializer": "xavier_uniform_", + "optimizer": "adam", + "loss_function": "cross_entropy", + "learning_rate": 0.001, + "l2reg": 0.00001, + "epochs": 100, + "batch_size": 2, + "log_step": 5, + "embed_dim": 300, + "hidden_dim": 300, + "polarities_dim": 3, + "dropout": 0.3, + "seed": 776, + "device": "cpu", + "repeats": 2, + "patience": 5, + "max_len": 85, + } + self.cfg = SenticGCNTrainArgs(**cfg) + + def tearDown(self) -> None: + shutil.rmtree(self.model_save_folder, ignore_errors=True) + shutil.rmtree(self.results_save_folder, ignore_errors=True) + + @pytest.mark.slow + def test_train(self): + trainer = SenticGCNTrainer(self.cfg) + trainer.train() + + result_file = list(find_result_file(self.results_save_folder, ".pkl"))[0] + + with open(result_file, "rb") as f: + results = pickle.load(f) + + self.assertTrue("Repeat_1" in results.keys()) + self.assertTrue("Repeat_2" in results.keys()) + self.assertTrue("test" in results.keys()) + for key, val in results.items(): + self.assertTrue("max_val_acc" in val.keys()) + self.assertTrue("max_val_f1" in val.keys()) + if key != "test": + self.assertTrue("max_val_epoch" in val.keys()) + + config_filepath = self.model_save_folder.joinpath("config.json") + model_filepath = self.model_save_folder.joinpath("pytorch_model.bin") + self.assertTrue(config_filepath.is_file()) + self.assertTrue(model_filepath.is_file()) + + +class TestSenticGCNBertTrainTestCase(unittest.TestCase): + def setUp(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + self.model_save_folder = pathlib.Path(tmpdir) + with tempfile.TemporaryDirectory() as tmpdir: + self.results_save_folder = pathlib.Path(tmpdir) + + cfg = { + "senticnet_word_file_path": "", + "save_preprocessed_senticnet": False, + "saved_preprocessed_senticnet_file_path": PARENT_DIR + "/test_data/test_senticnet.pickle", + "spacy_pipeline": "en_core_web_sm", + "word_vec_file_path": "./glove/glove.840B.300d.txt", + "dataset_train": [PARENT_DIR + "/test_data/test_train.raw"], + "dataset_test": [PARENT_DIR + "/test_data/test_train.raw"], + "valset_ratio": 0, + "model": "senticgcnbert", + "save_best_model": True, + "save_model_path": str(self.model_save_folder), + "tokenizer": "bert-base-uncased", + "embedding_model": "bert-base-uncased", + "save_results": True, + "save_results_folder": str(self.results_save_folder), + "initializer": "xavier_uniform_", + "optimizer": "adam", + "loss_function": "cross_entropy", + "learning_rate": 0.001, + "l2reg": 0.00001, + "epochs": 100, + "batch_size": 2, + "log_step": 5, + "embed_dim": 300, + "hidden_dim": 768, + "polarities_dim": 3, + "dropout": 0.3, + "seed": 776, + "device": "cpu", + "repeats": 2, + "patience": 5, + "max_len": 85, + } + self.cfg = SenticGCNTrainArgs(**cfg) + + def tearDown(self) -> None: + shutil.rmtree(self.model_save_folder, ignore_errors=True) + shutil.rmtree(self.results_save_folder, ignore_errors=True) + + @pytest.mark.slow + def test_train(self): + trainer = SenticGCNBertTrainer(self.cfg) + trainer.train() + + 
+
+
+class TestSenticGCNTrainTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        # mkdtemp is used (rather than a TemporaryDirectory context manager,
+        # which would delete the directory as soon as setUp returns) so the
+        # folders survive until tearDown removes them.
+        self.model_save_folder = pathlib.Path(tempfile.mkdtemp())
+        self.results_save_folder = pathlib.Path(tempfile.mkdtemp())
+
+        cfg = {
+            "senticnet_word_file_path": "",
+            "save_preprocessed_senticnet": False,
+            "saved_preprocessed_senticnet_file_path": PARENT_DIR + "/test_data/test_senticnet.pickle",
+            "spacy_pipeline": "en_core_web_sm",
+            "word_vec_file_path": "./glove/glove.840B.300d.txt",
+            "dataset_train": [PARENT_DIR + "/test_data/test_train.raw"],
+            "dataset_test": [PARENT_DIR + "/test_data/test_train.raw"],
+            "valset_ratio": 0,
+            "model": "senticgcn",
+            "save_best_model": True,
+            "save_model_path": str(self.model_save_folder),
+            "tokenizer": "senticgcn",
+            "train_tokenizer": True,
+            "save_tokenizer": False,
+            "save_tokenizer_path": "",
+            "embedding_model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/",
+            "build_embedding_model": False,
+            "save_embedding_model": False,
+            "save_embedding_model_path": "./embed_models/senticgcn_embed_semeval14_rest/",
+            "save_results": True,
+            "save_results_folder": str(self.results_save_folder),
+            "initializer": "xavier_uniform_",
+            "optimizer": "adam",
+            "loss_function": "cross_entropy",
+            "learning_rate": 0.001,
+            "l2reg": 0.00001,
+            "epochs": 100,
+            "batch_size": 2,
+            "log_step": 5,
+            "embed_dim": 300,
+            "hidden_dim": 300,
+            "polarities_dim": 3,
+            "dropout": 0.3,
+            "seed": 776,
+            "device": "cpu",
+            "repeats": 2,
+            "patience": 5,
+            "max_len": 85,
+        }
+        self.cfg = SenticGCNTrainArgs(**cfg)
+
+    def tearDown(self) -> None:
+        shutil.rmtree(self.model_save_folder, ignore_errors=True)
+        shutil.rmtree(self.results_save_folder, ignore_errors=True)
+
+    @pytest.mark.slow
+    def test_train(self):
+        trainer = SenticGCNTrainer(self.cfg)
+        trainer.train()
+
+        result_file = list(find_result_file(self.results_save_folder, ".pkl"))[0]
+
+        with open(result_file, "rb") as f:
+            results = pickle.load(f)
+
+        self.assertTrue("Repeat_1" in results.keys())
+        self.assertTrue("Repeat_2" in results.keys())
+        self.assertTrue("test" in results.keys())
+        for key, val in results.items():
+            self.assertTrue("max_val_acc" in val.keys())
+            self.assertTrue("max_val_f1" in val.keys())
+            if key != "test":
+                self.assertTrue("max_val_epoch" in val.keys())
+
+        config_filepath = self.model_save_folder.joinpath("config.json")
+        model_filepath = self.model_save_folder.joinpath("pytorch_model.bin")
+        self.assertTrue(config_filepath.is_file())
+        self.assertTrue(model_filepath.is_file())
+
+
+class TestSenticGCNBertTrainTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        # Same as above: mkdtemp keeps the folders alive until tearDown.
+        self.model_save_folder = pathlib.Path(tempfile.mkdtemp())
+        self.results_save_folder = pathlib.Path(tempfile.mkdtemp())
+
+        cfg = {
+            "senticnet_word_file_path": "",
+            "save_preprocessed_senticnet": False,
+            "saved_preprocessed_senticnet_file_path": PARENT_DIR + "/test_data/test_senticnet.pickle",
+            "spacy_pipeline": "en_core_web_sm",
+            "word_vec_file_path": "./glove/glove.840B.300d.txt",
+            "dataset_train": [PARENT_DIR + "/test_data/test_train.raw"],
+            "dataset_test": [PARENT_DIR + "/test_data/test_train.raw"],
+            "valset_ratio": 0,
+            "model": "senticgcnbert",
+            "save_best_model": True,
+            "save_model_path": str(self.model_save_folder),
+            "tokenizer": "bert-base-uncased",
+            "embedding_model": "bert-base-uncased",
+            "save_results": True,
+            "save_results_folder": str(self.results_save_folder),
+            "initializer": "xavier_uniform_",
+            "optimizer": "adam",
+            "loss_function": "cross_entropy",
+            "learning_rate": 0.001,
+            "l2reg": 0.00001,
+            "epochs": 100,
+            "batch_size": 2,
+            "log_step": 5,
+            "embed_dim": 300,
+            "hidden_dim": 768,
+            "polarities_dim": 3,
+            "dropout": 0.3,
+            "seed": 776,
+            "device": "cpu",
+            "repeats": 2,
+            "patience": 5,
+            "max_len": 85,
+        }
+        self.cfg = SenticGCNTrainArgs(**cfg)
+
+    def tearDown(self) -> None:
+        shutil.rmtree(self.model_save_folder, ignore_errors=True)
+        shutil.rmtree(self.results_save_folder, ignore_errors=True)
+
+    @pytest.mark.slow
+    def test_train(self):
+        trainer = SenticGCNBertTrainer(self.cfg)
+        trainer.train()
+
+        result_file = list(find_result_file(self.results_save_folder, ".pkl"))[0]
+
+        with open(result_file, "rb") as f:
+            results = pickle.load(f)
+
+        self.assertTrue("Repeat_1" in results.keys())
+        self.assertTrue("Repeat_2" in results.keys())
+        self.assertTrue("test" in results.keys())
+        for key, val in results.items():
+            self.assertTrue("max_val_acc" in val.keys())
+            self.assertTrue("max_val_f1" in val.keys())
+            if key != "test":
+                self.assertTrue("max_val_epoch" in val.keys())
+
+        config_filepath = self.model_save_folder.joinpath("config.json")
+        model_filepath = self.model_save_folder.joinpath("pytorch_model.bin")
+        self.assertTrue(config_filepath.is_file())
+        self.assertTrue(model_filepath.is_file())
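+
+# The result pickle written by both trainers appears to hold one entry per
+# training repeat plus a final "test" entry; "repeats": 2 in the configs above
+# is what makes Repeat_1 and Repeat_2 the expected keys.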
+
+
+class TestSenticGCNEvaluateTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        # mkdtemp keeps the folder alive until tearDown removes it.
+        self.results_save_folder = pathlib.Path(tempfile.mkdtemp())
+
+        cfg = {
+            "eval_args": {
+                "model": "senticgcn",
+                "model_path": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn/",
+                "tokenizer": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_tokenizer/",
+                "embedding_model": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_embedding_model/",
+                "config_filename": "config.json",
+                "model_filename": "pytorch_model.bin",
+                "test_filename": [PARENT_DIR + "/test_data/test_test.raw"],
+                "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle",
+                "spacy_pipeline": "en_core_web_sm",
+                "result_folder": str(self.results_save_folder),
+                "eval_batch_size": 2,
+                "seed": 776,
+                "device": "cpu",
+            }
+        }
+        self.cfg = SenticGCNTrainArgs(**cfg)
+
+    def tearDown(self) -> None:
+        shutil.rmtree(self.results_save_folder, ignore_errors=True)
+
+    @pytest.mark.slow
+    def test_evaluate(self):
+        evaluator = SenticGCNEvaluator(self.cfg)
+        evaluator.evaluate()
+
+        result_file = list(find_result_file(self.results_save_folder, ".txt"))[0]
+        with open(result_file, "r") as f:
+            results = f.readlines()
+
+        self.assertEqual(len(results), 5)
+        self.assertTrue(results[0].startswith("Model:"))
+        self.assertTrue(results[1].startswith("Batch Size:"))
+        self.assertTrue(results[2].startswith("Random Seed:"))
+        self.assertTrue(results[3].startswith("Acc:"))
+        self.assertTrue(results[4].startswith("F1:"))
+
+
+class TestSenticGCNBertEvaluateTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        # mkdtemp keeps the folder alive until tearDown removes it.
+        self.results_save_folder = pathlib.Path(tempfile.mkdtemp())
+
+        cfg = {
+            "eval_args": {
+                "model": "senticgcnbert",
+                "model_path": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticgcn_bert/",
+                "tokenizer": "bert-base-uncased",
+                "embedding_model": "bert-base-uncased",
+                "config_filename": "config.json",
+                "model_filename": "pytorch_model.bin",
+                "test_filename": [PARENT_DIR + "/test_data/test_test.raw"],
+                "senticnet": "https://storage.googleapis.com/sgnlp/models/sentic_gcn/senticnet.pickle",
+                "spacy_pipeline": "en_core_web_sm",
+                "result_folder": str(self.results_save_folder),
+                "eval_batch_size": 2,
+                "seed": 776,
+                "device": "cpu",
+            }
+        }
+        self.cfg = SenticGCNTrainArgs(**cfg)
+
+    def tearDown(self) -> None:
+        shutil.rmtree(self.results_save_folder, ignore_errors=True)
+
+    @pytest.mark.slow
+    def test_evaluate(self):
+        evaluator = SenticGCNBertEvaluator(self.cfg)
+        evaluator.evaluate()
+
+        result_file = list(find_result_file(self.results_save_folder, ".txt"))[0]
+        with open(result_file, "r") as f:
+            results = f.readlines()
+
+        self.assertEqual(len(results), 5)
+        self.assertTrue(results[0].startswith("Model:"))
+        self.assertTrue(results[1].startswith("Batch Size:"))
+        self.assertTrue(results[2].startswith("Random Seed:"))
+        self.assertTrue(results[3].startswith("Acc:"))
+        self.assertTrue(results[4].startswith("F1:"))
diff --git a/tests/sentic_gcn/test_sentic_gcn_utils.py b/tests/sentic_gcn/test_sentic_gcn_utils.py
new file mode 100644
index 0000000..0eaebce
--- /dev/null
+++ b/tests/sentic_gcn/test_sentic_gcn_utils.py
@@ -0,0 +1,156 @@
+import pathlib
+import shutil
+import tempfile
+import unittest
+import unittest.mock as mock
+
+import numpy as np
+import spacy
+
+from sgnlp.models.sentic_gcn.data_class import SenticGCNTrainArgs
+from sgnlp.models.sentic_gcn.utils import (
+    SenticGCNDataset,
+    SenticGCNDatasetGenerator,
+    pad_and_truncate,
+    load_and_process_senticnet,
+    generate_dependency_adj_matrix,
+)
+
+
+PARENT_DIR = str(pathlib.Path(__file__).parent)
+
+
+class TestPadandTruncateTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        self.test_input = [1.0, 2.0, 3.0, 4.0, 5.0]
+        self.max_len = 50
+
+    def test_pad_and_truncate(self):
+        output = pad_and_truncate(self.test_input, max_len=self.max_len)
+        self.assertEqual(type(output), np.ndarray)
+        self.assertEqual(len(output), self.max_len)
+
+
+class TestLoadandProcessSenticNetTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        self.test_file = pathlib.Path(PARENT_DIR).joinpath("test_data").joinpath("senticnet.txt")
+        # mkdtemp keeps the folder alive until tearDown removes it.
+        self.temp_dir = tempfile.mkdtemp()
+        self.test_save_file_path = pathlib.Path(self.temp_dir).joinpath("senticnet.pkl")
+
+    def tearDown(self) -> None:
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_load_and_process_senticnet_from_file(self):
+        senticnet = load_and_process_senticnet(senticnet_file_path=self.test_file)
+        self.assertEqual(type(senticnet), dict)
+        self.assertTrue("CONCEPT" not in senticnet.keys())
+        self.assertEqual(len(senticnet), 12)
+        self.assertTrue("abandoned_person" not in senticnet.keys())
+        self.assertTrue("abandoned_quarry" not in senticnet.keys())
+        self.assertEqual(senticnet["abase"], "-0.90")
+
+    def test_load_and_process_senticnet_save_file(self):
+        _ = load_and_process_senticnet(
+            senticnet_file_path=self.test_file,
+            save_preprocessed_senticnet=True,
+            saved_preprocessed_senticnet_file_path=self.test_save_file_path,
+        )
+        self.assertTrue(self.test_save_file_path.exists())
+
+    def test_load_and_process_senticnet_from_pickle_file(self):
+        _ = load_and_process_senticnet(
+            senticnet_file_path=self.test_file,
+            save_preprocessed_senticnet=True,
+            saved_preprocessed_senticnet_file_path=self.test_save_file_path,
+        )
+        senticnet = load_and_process_senticnet(
+            save_preprocessed_senticnet=False, saved_preprocessed_senticnet_file_path=str(self.test_save_file_path)
+        )
+        self.assertEqual(type(senticnet), dict)
+        self.assertEqual(len(senticnet), 12)
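+
+# The round-trip test above relies on load_and_process_senticnet falling back
+# to the saved pickle when no senticnet_file_path is supplied; multi-word
+# concepts such as "abandoned_person" are evidently filtered out during
+# preprocessing, hence the absence assertions.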
+
+
+class TestGenerateDependencyAdjMatrixTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        self.test_file = pathlib.Path(PARENT_DIR).joinpath("test_data").joinpath("senticnet.txt")
+        self.senticnet = load_and_process_senticnet(self.test_file)
+        self.spacy_pipeline = spacy.load("en_core_web_sm")
+        self.test_text = "Soup is tasty but soup is a little salty."
+        self.test_aspect = "soup"
+
+    def test_generate_dependency_adj_matrix(self):
+        matrix = generate_dependency_adj_matrix(self.test_text, self.test_aspect, self.senticnet, self.spacy_pipeline)
+        # assertEqual rather than assertTrue: assertTrue(type(matrix), np.ndarray)
+        # would always pass, because the second argument is only the failure message.
+        self.assertEqual(type(matrix), np.ndarray)
+        self.assertEqual(matrix.shape, (9, 9))
+
+
+class TestSenticGCNDatasetGeneratorTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        cfg = {
+            "senticnet_word_file_path": PARENT_DIR + "/test_data/senticnet.txt",
+            "spacy_pipeline": "en_core_web_sm",
+            "dataset_train": [PARENT_DIR + "/test_data/test_train.raw"],
+            "dataset_test": [PARENT_DIR + "/test_data/test_test.raw"],
+            "valset_ratio": 0,
+            "model": "senticgcn",
+        }
+        self.cfg = SenticGCNTrainArgs(**cfg)
+
+    def test_read_raw_dataset(self):
+        with mock.patch("sgnlp.models.sentic_gcn.tokenization.SenticGCNTokenizer") as MockClass:
+            fake_tokenizer = MockClass()
+            dataset_gen = SenticGCNDatasetGenerator(self.cfg, fake_tokenizer)
+            data = dataset_gen._read_raw_dataset(self.cfg.dataset_train)
+            self.assertEqual(len(data), 15)
+
+    def test_generate_senticgcn_dataset(self):
+        with mock.patch("sgnlp.models.sentic_gcn.tokenization.SenticGCNTokenizer") as MockClass:
+            fake_tokenizer = MockClass(return_value={"input_ids": [1.0, 2.0, 3.0, 4.0, 5.0]})
+            dataset_gen = SenticGCNDatasetGenerator(self.cfg, fake_tokenizer)
+            dataset = dataset_gen._read_raw_dataset(self.cfg.dataset_train)
+            data = dataset_gen._generate_senticgcn_dataset(dataset)
+            self.assertEqual(len(data), 5)
+            for data_row in data:
+                keys = data_row.keys()
+                self.assertTrue("text_indices" in keys)
+                self.assertTrue("aspect_indices" in keys)
+                self.assertTrue("left_indices" in keys)
+                self.assertTrue("polarity" in keys)
+                self.assertTrue("sdat_graph" in keys)
+
+    def test_generate_senticgcn_bert_dataset(self):
+        with mock.patch("sgnlp.models.sentic_gcn.tokenization.SenticGCNBertTokenizer") as MockClass:
+            fake_tokenizer = MockClass(return_value={"input_ids": [1.0, 2.0, 3.0, 4.0, 5.0]})
+            dataset_gen = SenticGCNDatasetGenerator(self.cfg, fake_tokenizer)
+            dataset = dataset_gen._read_raw_dataset(self.cfg.dataset_train)
+            data = dataset_gen._generate_senticgcnbert_dataset(dataset)
+            self.assertEqual(len(data), 5)
+            for data_row in data:
+                keys = data_row.keys()
+                self.assertTrue("text_indices" in keys)
+                self.assertTrue("aspect_indices" in keys)
+                self.assertTrue("left_indices" in keys)
+                self.assertTrue("text_bert_indices" in keys)
+                self.assertTrue("bert_segment_indices" in keys)
+                self.assertTrue("polarity" in keys)
+                self.assertTrue("sdat_graph" in keys)
+
+    def test_generate_dataset(self):
+        for model_type in ["senticgcn", "senticgcnbert"]:
+            self.cfg.model = model_type
+            class_path = (
+                "sgnlp.models.sentic_gcn.tokenization.SenticGCNTokenizer"
+                if model_type == "senticgcn"
+                else "sgnlp.models.sentic_gcn.tokenization.SenticGCNBertTokenizer"
+            )
+            with mock.patch(class_path) as MockClass:
+                fake_tokenizer = MockClass(return_value={"input_ids": [1.0, 2.0, 3.0, 4.0, 5.0]})
+                dataset_gen = SenticGCNDatasetGenerator(self.cfg, fake_tokenizer)
+                train_data, val_data, test_data = dataset_gen.generate_datasets()
+                self.assertEqual(type(train_data), SenticGCNDataset)
+                self.assertEqual(type(val_data), SenticGCNDataset)
+                self.assertEqual(type(test_data), SenticGCNDataset)
+                self.assertEqual(len(train_data), 5)
+                self.assertEqual(len(val_data), 5)
+                self.assertEqual(len(test_data), 5)
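+
+# Each record in the .raw files appears to span three lines (sentence, aspect
+# term, polarity label), so the 15-line test_train.raw yields the 5 samples
+# asserted throughout this class. The network-bound tests in this suite are
+# marked slow and can be selected with the pytest marker, e.g.:
+#   pytest -m slow tests/sentic_gcn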