Commit

update transformer
sunlanchang committed Jun 10, 2020
1 parent 0288f98 commit 0b85f05
Showing 2 changed files with 280 additions and 30 deletions.
2 changes: 1 addition & 1 deletion LSTM_gender_multi_input.py
@@ -38,7 +38,7 @@ def get_train_val():
     tokenizer.fit_on_texts(f)
     f.close()
     creative_id_seq = []
-    with open('word2vec/userid_creative_ids.txt') as f:
+    with open('word2vec/userid_creative_ids.txt', 'r') as f:
         for text in f:
             creative_id_seq.append(text.strip())

308 changes: 279 additions & 29 deletions test.py
@@ -1,31 +1,281 @@
# %%
import numpy
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
# %%
numpy.random.seed(7)
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# %%
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length,
                    input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])
model.summary()
# model.fit(X_train, y_train, validation_data=(
#     X_test, y_test), epochs=1, batch_size=64)
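# quick smoke test: run a single padded review through the untrained model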
model.predict(X_test[0:1])
import random
import unittest

from transformers import is_torch_available

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


if is_torch_available():
    from transformers import (
        BertConfig,
        BertModel,
        BertForMaskedLM,
        BertForNextSentencePrediction,
        BertForPreTraining,
        BertForQuestionAnswering,
        BertForSequenceClassification,
        BertForTokenClassification,
        BertForMultipleChoice,
    )
    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST


# %%
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
EPOCHS = 10


# %%
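# Build a toy vocab for BertTokenizer from the first 90 users' creative-id
# sequences: the special tokens first, then one unique creative_id per line.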
creative_id_seq = []
cnt = 0
with open('word2vec/userid_creative_ids.txt', 'r') as f:
    for text in f:
        creative_id_seq.append(text.strip())
        cnt += 1
        if cnt == 90:
            break
with open('tmp/tmp.txt', 'w') as f:
    f.write('[PAD]\n[UNK]\n[CLS]\n[SEP]\n')
    s = set()
    for seq in creative_id_seq:
        seq = seq.split(' ')
        s = s | set(seq)
    for e in s:
        f.write(str(e) + '\n')


# %%
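# gender/age labels in user.csv are 1-based; shift them to 0-based class ids
# for CrossEntropyLoss.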
user_train = pd.read_csv(
    'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
Y_gender = user_train['gender'].values
Y_age = user_train['age'].values
Y_gender = Y_gender - 1
Y_age = Y_age - 1
# Y_age = to_categorical(Y_age)


# %%
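# BertTokenizer can be constructed straight from a plain-text vocab file
# (one token per line); tokenize a small sample to sanity-check it.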
tokenizer = BertTokenizer('tmp/tmp.txt')
print(tokenizer.get_vocab())
sample_txt = '456 1 23 456 89 89'
tokenizer.tokenize(sample_txt)


# %%

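# encode_plus pads/truncates to max_length and returns tensors ready for the
# model; pad_to_max_length is the transformers 2.x/3.x spelling (newer
# versions use padding='max_length').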
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length=32,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return PyTorch tensors
)
encoding.keys()
encoding['input_ids']
encoding['attention_mask']
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])


# %%


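# BERT encoder plus dropout and a linear head over the pooled [CLS] output.
# Unpacking `_, pooled_output = self.bert(...)` relies on the tuple return of
# transformers 2.x/3.x (4.x returns a ModelOutput unless return_dict=False).
# Note the custom vocab ids must stay within the checkpoint's embedding size,
# since the embedding matrix is not resized here.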
class SentimentClassifier(nn.Module):

    def __init__(self, n_classes=10):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        # self.bert = model
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)
# %%


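# Dataset over raw id-sequence strings and integer targets; each item is
# tokenized on the fly and padded/truncated to max_len.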
class GPReviewDataset(Dataset):

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }


# %%


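# Helper expecting a DataFrame with 'content' and 'sentiment' columns; unused
# below, where the Dataset is built directly from the id sequences.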
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        reviews=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )


# tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# %%
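# Model, optimizer, and loss function; the LR scheduler needs the number of
# training steps, so it is built further down, after the DataLoader.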
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = SentimentClassifier().to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
loss_fn = nn.CrossEntropyLoss().to(device)


# %%


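# One full pass over the loader: forward, cross-entropy loss, backward with
# gradient clipping, then optimizer and LR-scheduler steps.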
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    model = model.train()

    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        print(outputs.shape)

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)


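# %%
# eval_model is called in the training loop below but never defined in this
# file; a minimal sketch, assuming batches shaped like GPReviewDataset items
# and mirroring train_epoch without the backward pass (hypothetical helper,
# not part of the original script):
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            losses.append(loss_fn(outputs, targets).item())
            correct_predictions += torch.sum(preds == targets)
    return correct_predictions.double() / n_examples, np.mean(losses)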
# %%
ds = GPReviewDataset(creative_id_seq[:90], Y_age[:90], tokenizer, 100)
dataloader = DataLoader(ds, batch_size=1)

# The LR scheduler needs the total number of training steps, so build it here,
# once the DataLoader exists.
total_steps = len(dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


# %%
# train_epoch(model, dataloader, loss_fn, optimizer,
# device, scheduler, len(dataloader))

history = defaultdict(list)
best_accuracy = 0

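# Training loop: run EPOCHS passes, track accuracy/loss in `history`, and
# checkpoint the best model state.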
for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        dataloader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(ds)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    # No validation loader (val_data_loader/df_val) is built in this script,
    # so the eval step stays disabled; it would mirror the training call.
    # val_acc, val_loss = eval_model(
    #     model,
    #     val_data_loader,
    #     loss_fn,
    #     device,
    #     len(df_val)
    # )
    # print(f'Val loss {val_loss} accuracy {val_acc}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)

    # Checkpoint on training accuracy, since no validation accuracy is
    # computed here.
    if train_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = train_acc
