To use KoBERT, install the following libraries:
!pip install mxnet-cu101
!pip install gluonnlp pandas tqdm
!pip install sentencepiece==0.1.85
!pip install transformers==2.1.1
!pip install torch==1.3.1
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
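After installation, you can optionally confirm the pinned versions took effect (a small check, not from the original post):
import torch, transformers, gluonnlp
print(torch.__version__, transformers.__version__, gluonnlp.__version__)  # expect torch 1.3.1 and transformers 2.1.1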
import:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import WarmupLinearSchedule
drive mount:
from google.colab import drive
drive.mount('/content/drive')
cuda:
## When using a GPU
device = torch.device("cuda:0")
get model and vocab file:
bertmodel, vocab = get_pytorch_kobert_model()
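As a quick sanity check (optional; the exact numbers depend on the KoBERT release you installed), you can inspect what came back:
print(type(bertmodel))  # the pretrained BERT encoder
print(len(vocab))       # size of the KoBERT SentencePiece vocabulary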
Setting:
# Import all libs
import os
import numpy as np
import pandas as pd
import re
import csv
from collections import namedtuple
from sklearn.model_selection import train_test_split
import nltk
from sklearn.pipeline import Pipeline
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
import sys
# Set paths and global variables
BASE_DIR = '/content/drive/My Drive'
DATA_DIR = os.path.join(BASE_DIR,'Allganize_data/cremafactory_review_analysis/crema_sejin')
DATA = os.path.join(DATA_DIR, 'datayouhave1.tsv')
DATA = os.path.join(DATA_DIR, 'datayouhave2.tsv')
DATA_TEST = os.path.join(DATA_DIR, 'datayouhave3.tsv')
DATA_TRAIN = os.path.join(DATA_DIR, 'datayouhave4.tsv')
LABELS = [-1, 0, 1]
CATEGS = []
CATEGS_CNT = len(CATEGS)
data preprocessing:
Preprocess the data yourself.
get kobert tokenizer:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
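To see what the tokenizer does, here is a minimal sketch with a made-up sentence; vocab[tokens] maps the subword tokens back to their ids:
sample = "배송이 정말 빨라요"  # made-up example review
tokens = tok(sample)           # SentencePiece subword tokens
ids = vocab[tokens]            # corresponding vocabulary ids
print(tokens)
print(ids)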
setting parameters:
max_len = 128
batch_size = 8
warmup_ratio = 0.1
num_epochs = 15
max_grad_norm = 1
log_interval = 200
learning_rate = 2e-5
For batch generation we used a BatchGenerator in Keras; in PyTorch, we use a Dataset instead. In particular, when building the BERTSentenceTransform, set pair=False if you feed a single sentence and pair=True if you feed a sentence pair.
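For reference, here is a rough sketch (the category/review strings are made up) of what BERTSentenceTransform returns for a sentence pair: the padded token ids, the valid length, and the segment ids that mark which tokens belong to which sentence.
example_transform = nlp.data.BERTSentenceTransform(tok, max_seq_length=max_len, pad=True, pair=True)
token_ids, valid_length, segment_ids = example_transform(['배송', '배송이 정말 빨라요'])
print(token_ids.shape, valid_length, segment_ids.shape)  # (max_len,), scalar, (max_len,)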
class BertDataset(Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, data=train_data, batch_size=batch_size, bert_tokenizer=tok, max_len=max_len, pad=True, pair=True):
        'Initialization'
        self.data = data
        self.review_counts = len(data)
        self.transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        #self.sentences = [transform([dt[1], dt[0]]) for dt in data.values]
        #self.labels = data['score']

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        #def to_categorical(y, num_classes):
        #    return np.eye(len(LABELS), dtype='uint8')[y]
        #y = to_categorical(int(self.labels.iloc[index]) + 1, 3)
        # score '2' means the review is irrelevant to the category
        if self.data.iloc[index]['score'] == '2':
            related = np.int32(0)
        else:
            related = np.int32(1)
        sentence = self.transform([self.data.iloc[index]['category'], self.data.iloc[index]['review']])
        score = np.int32(self.data.iloc[index]['score']) + np.int32(1)
        return sentence + (related, score, )
In the DataLoader, if you set shuffle=True, PyTorch reshuffles the data at every epoch:
data_train = BertDataset(data=train_data)
data_test = BertDataset(data=test_data)
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)
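If you want to double-check the pipeline before training (an optional sketch), pull one batch and look at its shapes:
token_ids, valid_length, segment_ids, related, score = next(iter(train_dataloader))
print(token_ids.shape)    # (batch_size, max_len)
print(segment_ids.shape)  # (batch_size, max_len)
print(related[:4], score[:4])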
Similar to Keras, we can define the model as below:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=3,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        #self.dr_rate = dr_rate
        self.layer1 = nn.Linear(hidden_size, 192)
        self.special1 = nn.Dropout(p=0.5)
        self.special2 = nn.Linear(hidden_size, 2)
        self.special3 = nn.Dropout(p=0.3)
        self.special4 = nn.Linear(192, 4)
        #if dr_rate:
        #    self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length, batch_mode=True):
        attention_mask = torch.zeros_like(token_ids)
        if batch_mode:
            for i, v in enumerate(valid_length):
                attention_mask[i][:v] = 1
            return attention_mask.float()
        else:
            attention_mask[:valid_length] = 1
            return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids, batch_mode=True):
        attention_mask = self.gen_attention_mask(token_ids, valid_length, batch_mode)
        encoded_layer, pooler = self.bert(input_ids=token_ids.to(device), token_type_ids=segment_ids.long().to(device), attention_mask=attention_mask.float().to(device))
        # two heads on the pooled output: out1 = relevance (2 classes), out2 = score (4 classes)
        hidden1 = self.special1(pooler)
        out1 = self.special2(hidden1)
        hidden2 = self.special3(pooler)
        hidden2 = self.layer1(hidden2)
        out2 = self.special4(hidden2)
        #out = nn.functional.relu(hidden)
        return out1, out2
model = BERTClassifier(bertmodel).to(device)
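The model has two heads: out1 (2 logits) predicts whether the review is relevant to the category, and out2 (4 logits) predicts the shifted score. A quick forward pass on dummy inputs (a sketch; the values are arbitrary) shows the output shapes:
dummy_ids = torch.randint(0, len(vocab), (2, max_len))
dummy_valid = torch.tensor([10, 20])
dummy_segments = torch.zeros(2, max_len).long()
out1, out2 = model(dummy_ids.to(device), dummy_valid, dummy_segments.to(device))
print(out1.shape, out2.shape)  # torch.Size([2, 2]) and torch.Size([2, 4])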
Set up the optimizer, loss functions, and learning-rate scheduler:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn1 = nn.BCEWithLogitsLoss()
loss_fn2 = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=t_total)
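Note that WarmupLinearSchedule only exists in older transformers releases such as the 2.1.1 pinned above. If you ever upgrade transformers, the rough equivalent (a sketch for newer versions only) is:
# from transformers import get_linear_schedule_with_warmup
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)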
You have to write the accuracy function yourself.
def calc_accuracy(X, Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
    return train_acc
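A tiny usage example with made-up logits for a batch of two samples (both argmax predictions match the labels, so the accuracy is 1.0):
logits = torch.tensor([[0.1, 2.0], [1.5, 0.2]])
labels = torch.tensor([1, 0])
print(calc_accuracy(logits, labels))  # 1.0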
Training:
You must set model.train() before training and model.eval() before evaluation!!!!
for e in range(num_epochs):
    train_acc_relevant = 0.0
    train_acc_score = 0.0
    test_acc_relevant = 0.0
    test_acc_score = 0.0
    model.train()
    loss_list = []
    for batch_id, (token_ids, valid_length, segment_ids, related, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.long().to(device)
        related = related.long().to(device)
        out1, out2 = model(token_ids, valid_length, segment_ids)
        loss1 = loss_fn2(out1, related)
        loss2 = loss_fn2(out2, label)
        loss = loss1 + loss2
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # update learning rate schedule
        if batch_id % 10 == 0:
            loss_list.append(loss.item())
        train_acc_relevant += calc_accuracy(out1, related)
        train_acc_score += calc_accuracy(out2, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc relevant {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc_relevant / (batch_id+1)))
            print("epoch {} batch id {} loss {} train acc score {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc_score / (batch_id+1)))
    #print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    print("mean loss = ", sum(loss_list) / len(loss_list))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, related, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.long().to(device)
        related = related.long().to(device)
        out1, out2 = model(token_ids, valid_length, segment_ids)
        test_acc_relevant += calc_accuracy(out1, related)
        test_acc_score += calc_accuracy(out2, label)
    print("epoch {} test acc relevant {}".format(e+1, test_acc_relevant / (batch_id+1)))
    print("epoch {} test acc score {}".format(e+1, test_acc_score / (batch_id+1)))
Saving Model:
If you want to load the model later, you must define a class with the same name and structure as above before loading (the same goes for the tokenizer).
PATH = 'pathyouwant'
torch.save(model, PATH)
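Loading it back later looks like this (a sketch; remember the class definition must already be in scope, because torch.save(model, PATH) pickles the whole object). Saving only the state_dict is the more portable alternative.
model = torch.load(PATH, map_location=device)
model.eval()
# more portable alternative:
# torch.save(model.state_dict(), PATH)
# model = BERTClassifier(bertmodel).to(device)
# model.load_state_dict(torch.load(PATH, map_location=device))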
Confusion matrix:
RELEVANT_LABELS = [0,1]
SCORE_LABELS = [-1,0,1,2]
model.eval()
relevant_true = []
relevant_pred = []
sentiment_true = []
sentiment_pred = []
for batch_id, (token_ids, valid_length, segment_ids, related, label) in enumerate(tqdm_notebook(test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length = valid_length
    label = label.long().to(device)
    related = related.long().to(device)
    out1, out2 = model(token_ids, valid_length, segment_ids)
    _, related_index = torch.max(out1, 1)
    _, label_index = torch.max(out2, 1)
    for idx in range(len(label)):
        sentiment_true.append(SCORE_LABELS[int(label[idx])])
        relevant_true.append(RELEVANT_LABELS[int(related[idx])])
        sentiment_pred.append(SCORE_LABELS[int(label_index[idx])])
        relevant_pred.append(RELEVANT_LABELS[int(related_index[idx])])
print(sentiment_true)
print(sentiment_pred)
print(relevant_true)
print(relevant_pred)
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter
print('Sentiment confusion matrix')
print('Labels: [-1, 0, 1, 2] (2 means not labeled because the review is irrelevant)')
print(confusion_matrix(sentiment_true, sentiment_pred, labels=[-1,0,1,2]))
print(classification_report(sentiment_true, sentiment_pred, digits=3))
print('Relevance confusion matrix')
print(confusion_matrix(relevant_true, relevant_pred, labels=[0,1]))
print(classification_report(relevant_true, relevant_pred, digits=3))
To run inference, you should define a new Dataset and DataLoader. (I don't know the exact reason, but even with model.eval(), running inference on individual samples one at a time gave terrible outputs.)
class InferenceBertDataset(Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, data=train_data, batch_size=batch_size, bert_tokenizer=tok, category='review', max_len=max_len, pad=True, pair=True):
        'Initialization'
        self.data = data
        self.review_counts = len(data)
        self.transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        #self.sentences = [transform([dt[1], dt[0]]) for dt in data.values]
        self.category = category
        #self.labels = data['score']

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        'Generates one sample of data'
        sentence = self.transform([self.category, self.data.iloc[index]['review']])
        return sentence
#inference data preprocessing
PATHtoData = os.path.join('somethingelse','something')
PathToSave = os.path.join('som','daf')
# The TSV has no header row, so pandas treats the first review as the column name;
# rename the column and append that first review back into the data.
inference_data = pd.read_csv(PATHtoData, sep='\t')
first_line = inference_data.columns[0]
inference_data.columns = ['review']
inference_data = inference_data.append({'review': first_line}, ignore_index=True)
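A simpler alternative (a sketch, assuming the TSV really has no header row) is to tell pandas up front that there is no header, which avoids the column-name workaround above:
inference_data = pd.read_csv(PATHtoData, sep='\t', header=None, names=['review'])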
Run inference and save the results:
from collections import defaultdict
model.eval()
inference_dict = defaultdict(list)
transform = nlp.data.BERTSentenceTransform(tok, max_seq_length=max_len, pad=True, pair=True)
relevant_true = []
relevant_pred = []
s_true = []
s_pred = []
for i in range(len(inference_data)):
    inference_dict['review'].append(inference_data.iloc[i]['review'])
for category in CATEGS:
    data_inference = InferenceBertDataset(data=inference_data, category=category)
    inference_dataloader = torch.utils.data.DataLoader(data_inference, batch_size=batch_size, num_workers=5)
    for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(inference_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        out1, out2 = model(token_ids, valid_length, segment_ids)
        max_vals, max_indices = torch.max(out1, 1)
        score_max_vals, score_max_indices = torch.max(out2, 1)
        for idx in range(len(token_ids)):
            if RELEVANT_LABELS[int(max_indices[idx])] == 1 and SCORE_LABELS[int(score_max_indices[idx])] != 2:
                result = str(SCORE_LABELS[int(score_max_indices[idx])])
            else:
                result = 'NaN'
            inference_dict[category].append(result)
inference_df = pd.DataFrame(inference_dict)
inference_df.to_csv(PathToSave)