add BERT train code
Browse files- BERT_inference.py +217 -0
BERT_inference.py
CHANGED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import csv
|
5 |
+
import torch.nn as nn
|
6 |
+
from torch.optim.lr_scheduler import ReduceLROnPlateau
|
7 |
+
from torch.utils.data import TensorDataset, DataLoader
|
8 |
+
from transformers import BertTokenizer,BertConfig,AdamW
|
9 |
+
from sklearn.metrics import accuracy_score
|
10 |
+
from sklearn.metrics import classification_report
|
11 |
+
from tqdm import tqdm
|
12 |
+
import torch
|
13 |
+
import transformers
|
14 |
+
from torch.utils.data import Dataset, DataLoader
|
15 |
+
|
16 |
+
# %%
|
17 |
+
|
18 |
+
class MyDataSet(Dataset):
    """Minimal map-style dataset around an already-loaded collection.

    In this script it wraps the pandas DataFrame read from the source CSV so
    that ``torch.utils.data.random_split`` can draw index subsets from it.
    """

    def __init__(self, loaded_data):
        """Keep a reference to ``loaded_data`` (no copy is made)."""
        self.data = loaded_data

    def __len__(self):
        """Return the number of items (rows, when wrapping a DataFrame)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the item at position ``idx``.

        pandas objects are indexed positionally through ``.iloc``: plain
        ``frame[idx]`` would look up a *column* labelled ``idx`` and raise
        ``KeyError`` for an ordinary integer position. Any other container
        (list, tuple, tensor, ...) is indexed directly.
        """
        positional = getattr(self.data, "iloc", None)
        if positional is not None:
            return positional[idx]
        return self.data[idx]
|
27 |
+
|
28 |
+
# Source CSV; a random 10% sample is kept and rows with a missing value in
# column "2" are discarded.
Data_path = "/kaggle/input/inference/train.csv"
Totle_data = (
    pd.read_csv(Data_path)
    .sample(frac=0.1)
    .dropna(axis=0, subset=["2"])
)
custom_dataset = MyDataSet(Totle_data)

# Split proportions: 60% train, 10% validation, and whatever is left for test
# (taking the remainder guarantees the three sizes add up to the total).
train_size = int(len(custom_dataset) * 0.6)
validate_size = int(len(custom_dataset) * 0.1)
test_size = len(custom_dataset) - validate_size - train_size
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(
    custom_dataset,
    [train_size, validate_size, test_size],
)

# Output files for the three splits.
train_data_path = "Bert_Try.csv"
dev_data_path = "Bert_Dev.csv"
test_data_path = "Bert_Test.csv"

# random_split only yields positional indices; turn each subset back into the
# matching DataFrame rows.
train_dataset = Totle_data.iloc[train_dataset.indices]
validate_dataset = Totle_data.iloc[validate_dataset.indices]
test_dataset = Totle_data.iloc[test_dataset.indices]

# index=False drops the row index; header=True keeps the column names as the
# first line of each file.
for _frame, _csv_path in (
    (train_dataset, train_data_path),
    (validate_dataset, dev_data_path),
    (test_dataset, test_data_path),
):
    _frame.to_csv(_csv_path, index=False, header=True)
|
52 |
+
|
53 |
+
# %%
|
54 |
+
# Quick sanity check of the training split that was just written to disk.
data = pd.read_csv(train_data_path)
# ``head`` must be *called*; a bare ``data.head`` only references the method.
data.head()
|
56 |
+
|
57 |
+
# %%
|
58 |
+
|
59 |
+
class BertClassificationModel(nn.Module):
    """Pretrained BERT encoder followed by one linear classification layer.

    Args:
        pretrained_weights: Name or path handed to
            ``transformers.BertModel.from_pretrained``.
        num_labels: Number of output classes of the linear layer.
    """

    def __init__(self, pretrained_weights="bert-base-chinese", num_labels=3):
        super().__init__()
        # Load the pretrained encoder.
        self.bert = transformers.BertModel.from_pretrained(pretrained_weights)
        # Fine-tune the whole encoder, not just the classification head.
        for param in self.bert.parameters():
            param.requires_grad = True
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so other BERT checkpoints work as well.
        self.dense = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return one score per class for every sequence in the batch.

        The three arguments are passed to the encoder by keyword unchanged.
        The result is the raw output of the linear layer (no softmax).
        """
        bert_output = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        # Second element of the encoder output: the pooled sequence
        # representation (see the transformers BertModel documentation).
        bert_cls_hidden_state = bert_output[1]
        # Project the pooled vector onto the class scores.
        return self.dense(bert_cls_hidden_state)
|
78 |
+
|
79 |
+
# %%
|
80 |
+
|
81 |
+
def encoder(max_len, vocab_path, text_list):
    """Tokenize ``text_list`` into the three tensors the BERT model takes.

    Args:
        max_len: Passed to the tokenizer as ``max_length``; truncation is
            enabled.
        vocab_path: Accepted for compatibility but not used; the tokenizer
            is always loaded from ``"bert-base-chinese"``.
        text_list: The texts to encode.

    Returns:
        The tuple ``(input_ids, token_type_ids, attention_mask)`` taken from
        the tokenizer output, requested as PyTorch tensors.
    """
    # Load the pretrained tokenizer.
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    encoded = bert_tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='pt',  # ask for PyTorch tensors
    )
    wanted_keys = ('input_ids', 'token_type_ids', 'attention_mask')
    return tuple(encoded[key] for key in wanted_keys)
|
96 |
+
|
97 |
+
# %%
|
98 |
+
# Maps the textual class label found in the CSV to its integer class id.
labels2dict = {"neutral": 0, "entailment": 1, "contradiction": 2}


def load_data(path, max_len=150):
    """Read a labelled CSV file and return it as a ``TensorDataset``.

    The first row is treated as a header and skipped. For every other row,
    column 0 is the label name (a key of ``labels2dict``) and column 1 is the
    text; change the indices below if the file layout differs.

    Args:
        path: CSV file to read.
        max_len: Maximum token length forwarded to ``encoder``.

    Returns:
        ``TensorDataset(input_ids, token_type_ids, attention_mask, labels)``.

    Raises:
        KeyError: If a row carries a label that is not in ``labels2dict``.
    """
    text_list = []
    labels = []
    # ``with`` closes the file even on error. ``newline=""`` is what the csv
    # module asks for, and utf-8 matches what ``DataFrame.to_csv`` writes by
    # default.
    with open(path, newline="", encoding="utf-8") as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row
        for row in reader:
            labels.append(labels2dict[row[0]])
            text_list.append(row[1])

    # Turn the raw texts into the three tensors the model takes.
    input_ids, token_type_ids, attention_mask = encoder(
        max_len=max_len,
        vocab_path="/root/Bert/bert-base-chinese/vocab.txt",
        text_list=text_list,
    )
    # Bundle the encoder outputs with the labels.
    return TensorDataset(
        input_ids, token_type_ids, attention_mask, torch.tensor(labels)
    )
|
119 |
+
|
120 |
+
# %%
|
121 |
+
#设定batch_size
|
122 |
+
# Mini-batch size shared by all three loaders.
batch_size = 16

# CSV files holding the train / validation / test splits.
train_data_path, dev_data_path, test_data_path = (
    "Bert_Try.csv",
    "Bert_Dev.csv",
    "Bert_Test.csv",
)

# Encode every split into a TensorDataset (train, then dev, then test).
train_data, dev_data, test_data = (
    load_data(split_path)
    for split_path in (train_data_path, dev_data_path, test_data_path)
)

# Wrap each dataset in a DataLoader; only the test loader keeps file order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
|
135 |
+
|
136 |
+
# %%
|
137 |
+
def dev(model, dev_loader):
    """Return the classification accuracy of ``model`` over ``dev_loader``.

    The model is moved to the module-level ``device`` and evaluated in eval
    mode with gradients disabled. Its previous train/eval mode is restored
    before returning, so calling this in the middle of training does not
    leave the model stuck in eval mode.

    Args:
        model: Callable as ``model(input_ids, token_type_ids, attention_mask)``
            returning one row of class scores per example.
        dev_loader: Iterable of
            ``(input_ids, token_type_ids, attention_mask, labels)`` batches.

    Returns:
        Fraction of correctly classified examples, or ``0.0`` when the
        loader yields no examples.
    """
    model.to(device)
    was_training = model.training
    model.eval()

    correct = 0
    total = 0
    try:
        with torch.no_grad():
            # Iterating the loader directly (rather than ``enumerate``) lets
            # tqdm see its length and show real progress.
            for batch in tqdm(dev_loader, desc='Dev Itreation:'):
                input_ids, token_type_ids, attention_mask, labels = (
                    tensor.to(device) for tensor in batch
                )
                out_put = model(input_ids, token_type_ids, attention_mask)
                predict = out_put.argmax(dim=1)
                correct += (predict == labels).sum().item()
                total += labels.size(0)
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    return correct / total if total else 0.0
|
153 |
+
|
154 |
+
# %%
|
155 |
+
|
156 |
+
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def train(model, train_loader, dev_loader, total_epochs=10, log_every=10,
          eval_every=200, save_path='bert_model.pkl'):
    """Fine-tune ``model`` and keep the checkpoint that validates best.

    Args:
        model: Network called as
            ``model(input_ids, token_type_ids, attention_mask)``.
        train_loader: Yields
            ``(input_ids, token_type_ids, attention_mask, labels)`` batches.
        dev_loader: Validation batches, passed to ``dev``.
        total_epochs: Number of passes over ``train_loader``.
        log_every: Print training accuracy and loss every this many steps.
        eval_every: Run validation every this many steps.
        save_path: Where the best model (by validation accuracy) is saved
            with ``torch.save``.

    The reported training accuracy is cumulative since the start of
    training; it is not reset between epochs.
    """
    model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss()

    # No weight decay for biases and LayerNorm parameters.
    named_params = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in named_params
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in named_params
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    optimizer_params = {'lr': 1e-5, 'eps': 1e-6, 'correct_bias': False}
    optimizer = AdamW(optimizer_grouped_parameters, **optimizer_params)
    # mode='max': the monitored value is an accuracy, so higher is better.
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5,
                                  min_lr=1e-7, patience=5, verbose=True,
                                  threshold=0.0001, eps=1e-08)

    bestAcc = 0
    correct = 0
    total = 0
    steps_per_epoch = len(train_loader)
    print('Training and verification begin!')
    for epoch in range(total_epochs):
        for step, batch in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids, token_type_ids, attention_mask, labels = (
                tensor.to(device) for tensor in batch
            )
            out_put = model(input_ids, token_type_ids, attention_mask)
            loss = criterion(out_put, labels)
            predict = out_put.detach().argmax(dim=1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
            loss.backward()
            optimizer.step()

            # Periodic training progress report.
            if (step + 1) % log_every == 0:
                train_acc = correct / total
                print("Train Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,loss:{:.6f}".format(epoch + 1, total_epochs, step + 1, steps_per_epoch, train_acc*100, loss.item()))

            # Periodic validation; save the model whenever it improves.
            if (step + 1) % eval_every == 0:
                train_acc = correct / total
                acc = dev(model, dev_loader)
                # Validation switches the model to eval mode; go back to
                # train mode so dropout stays active for the next steps.
                model.train()
                if bestAcc < acc:
                    bestAcc = acc
                    torch.save(model, save_path)
                print("DEV Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,bestAcc{:.6f}%,dev_acc{:.6f} %,loss:{:.6f}".format(epoch + 1, total_epochs, step + 1, steps_per_epoch, train_acc*100, bestAcc*100, acc*100, loss.item()))
        # Once per epoch: lower the learning rate if the best validation
        # accuracy has stopped improving.
        scheduler.step(bestAcc)
|
209 |
+
|
210 |
+
# %%
|
211 |
+
|
212 |
+
# Location of a previously saved checkpoint. It is not loaded here; to resume
# from it instead of starting fresh, use ``model = torch.load(path)``.
path = '/kaggle/input/inference/bert_model.pkl'

# Build a fresh classifier on top of the pretrained encoder ...
model = BertClassificationModel()
# ... and fine-tune it, validating against the dev split along the way.
train(model, train_loader=train_loader, dev_loader=dev_loader)
|