PyTorch in Practice: A Chatbot

Data Preprocessing

We use the movie scripts from the Cornell Movie-Dialogs Corpus as the chatbot's training data. After downloading and unpacking it, we mainly work with two files: movie_lines.txt and movie_conversations.txt. Both are structured text files that use " +++$+++ " as the field separator. The former contains the fields (lineID, characterID, movieID, character, text); the latter has the structure (character1ID, character2ID, movieID, utteranceIDs), where utteranceIDs is a list of lineIDs pointing back into the former file. Grouping lines by these IDs reconstructs the conversational context of each movie script.
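To make the format concrete, splitting a record of movie_lines.txt on the separator yields the five fields in order (the record shown here is purely illustrative, not copied from the corpus):

# Illustrative record only -- not an actual line from the corpus.
sline = "L1000 +++$+++ u0 +++$+++ m0 +++$+++ ALICE +++$+++ They do not!\n"
print(sline.split(' +++$+++ '))
# ['L1000', 'u0', 'm0', 'ALICE', 'They do not!\n']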

def rebuild_corpus_format(movie_lines_file, movie_convers_file, output_file):
    # Split each line of movie_lines.txt into a dict with the fields
    # (lineID, characterID, movieID, character, text)
    MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
    fin = open(movie_lines_file, encoding='iso-8859-1')
    movie_lines = {}
    for sline in fin:
        values = sline.split(' +++$+++ ')
        line_obj = {}
        for i, field in enumerate(MOVIE_LINES_FIELDS):
            line_obj[field] = values[i]
        movie_lines[line_obj["lineID"]] = line_obj
    fin.close()

    # Group the entries of movie_lines into conversations according to movie_conversations.txt
    MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
    fin = open(movie_convers_file, encoding='iso-8859-1')
    conversations = []
    for sline in fin:
        values = sline.split(' +++$+++ ')
        conv_obj = {}
        for i, field in enumerate(MOVIE_CONVERSATIONS_FIELDS):
            conv_obj[field] = values[i]
        # utteranceIDs is stored as a Python list literal, e.g. "['L194', 'L195']"
        line_ids = eval(conv_obj["utteranceIDs"])
        conv_obj["lines"] = []
        for line_id in line_ids:
            conv_obj['lines'].append(movie_lines[line_id])
        conversations.append(conv_obj)
    fin.close()

    def remove_punc(string):
        # Strip punctuation and lowercase the text
        punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~\t'''
        no_punct = ""
        for char in string:
            if char not in punctuations:
                no_punct = no_punct + char
        return no_punct.lower()

    # Extract sentence pairs from each conversation
    fout = open(output_file, 'w', encoding='utf-8')
    for conversation in conversations:
        for i in range(len(conversation["lines"]) - 1):
            input_line = remove_punc(conversation["lines"][i]["text"].strip())
            target_line = remove_punc(conversation["lines"][i + 1]["text"].strip())
            if input_line and target_line:
                fout.write("%s\t%s\n" % (input_line, target_line))
    fout.close()
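As a quick sanity check, the function can be invoked like this; the exact paths are assumptions about where the unpacked corpus lives and where the paired output should be written:

# Hypothetical paths -- adjust to wherever the corpus was unpacked.
rebuild_corpus_format('./data/Chatbot/movie_lines.txt',
                      './data/Chatbot/movie_conversations.txt',
                      './data/Chatbot/formatted_movie_lines.txt')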

We save the grouped data with one pair per line in the format "question[TAB]answer". From this file we split the data into training, validation, and test sets, build the vocabulary from the training set, and convert every pair into a sequence of integer token IDs:

import json, pickle
from collections import Counter
from sklearn.model_selection import train_test_split


def encode_pair_data(src_list, tgt_list, word_map, max_len):
    pairs_encoded = []
    for pair in zip(src_list, tgt_list):
        # Questions are padded to max_len; answers get <bos> ... <eos> plus padding
        qus = [word_map.get(word, word_map['<unk>']) for word in pair[0]] + [word_map['<pad>']] * (
                max_len - len(pair[0]))
        ans = [word_map['<bos>']] + [word_map.get(word, word_map['<unk>']) for word in pair[1]] + \
              [word_map['<eos>']] + [word_map['<pad>']] * (max_len - len(pair[1]))
        pairs_encoded.append([qus, ans])
    return pairs_encoded


def construct_corpus(input_file, output_dir, min_word_freq, max_len):
    src_list = []
    tgt_list = []
    with open(input_file, encoding='utf-8') as fin:
        for sline in fin:
            src, tgt = sline.strip().split('\t')
            src_list.append(src.split()[:max_len])
            tgt_list.append(tgt.split()[:max_len])
    src_train, src_test, tgt_train, tgt_test = train_test_split(src_list, tgt_list, test_size=0.1, shuffle=True)
    src_train, src_eval, tgt_train, tgt_eval = train_test_split(src_train, tgt_train, test_size=0.1, shuffle=True)

    # Build the vocabulary from the training split only
    word_freq = Counter()
    for pair in zip(src_train, tgt_train):
        word_freq.update(pair[0])
        word_freq.update(pair[1])

    words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    word_map = {k: v + 1 for v, k in enumerate(words)}
    word_map['<unk>'] = len(word_map) + 1
    word_map['<bos>'] = len(word_map) + 1
    word_map['<eos>'] = len(word_map) + 1
    word_map['<pad>'] = 0
    print("Total words are: {}".format(len(word_map)))

    with open(output_dir + '/vocab.json', 'w', encoding='utf-8') as fout:
        json.dump(word_map, fout)

    train_data = encode_pair_data(src_train, tgt_train, word_map, max_len)
    with open(output_dir + '/train_data.pkl', 'wb') as fout:
        pickle.dump(train_data, fout)
    eval_data = encode_pair_data(src_eval, tgt_eval, word_map, max_len)
    with open(output_dir + '/eval_data.pkl', 'wb') as fout:
        pickle.dump(eval_data, fout)
    test_data = encode_pair_data(src_test, tgt_test, word_map, max_len)
    with open(output_dir + '/test_data.pkl', 'wb') as fout:
        pickle.dump(test_data, fout)

We keep only words that occur more than 5 times in the vocabulary and cap every sequence at a maximum length of 25 tokens.
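A call along the following lines (the input path is the hypothetical pair file from the earlier sketch) writes vocab.json plus the three pickled splits into the data directory used by the loaders below:

# Hypothetical invocation: keep words with frequency > 5, truncate to 25 tokens.
construct_corpus('./data/Chatbot/formatted_movie_lines.txt', './data/Chatbot',
                 min_word_freq=5, max_len=25)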

Loading the Data

The preprocessing above already handled most of the conversion work, so loading the data only requires building a Dataset and a DataLoader:

import pickle

import torch
from torch.utils.data import Dataset
import torch.utils.data

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class ChatbotDataset(Dataset):
    def __init__(self, data_file):
        # Each pickle file holds a list of [question_ids, reply_ids] pairs
        with open(data_file, 'rb') as fin:
            self.pairs = pickle.load(fin)
        self.dataset_size = len(self.pairs)

    def __getitem__(self, i):
        question = torch.LongTensor(self.pairs[i][0]).to(device)
        reply = torch.LongTensor(self.pairs[i][1]).to(device)

        return question, reply

    def __len__(self):
        return self.dataset_size


train_loader = torch.utils.data.DataLoader(ChatbotDataset('./data/Chatbot/train_data.pkl'), batch_size=64, shuffle=True)
eval_loader = torch.utils.data.DataLoader(ChatbotDataset('./data/Chatbot/eval_data.pkl'), batch_size=64, shuffle=True)
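A quick way to confirm the loaders behave as expected is to pull one batch and look at its shapes; with max_len = 25 the questions should be padded to 25 tokens and the replies to 27 (the extra <bos>/<eos>). This is just a sanity-check sketch, assuming the pickle files above exist:

# Questions are padded to max_len = 25, replies to max_len + 2 = 27.
questions, replies = next(iter(train_loader))
print(questions.shape)  # torch.Size([64, 25])
print(replies.shape)    # torch.Size([64, 27])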

Defining the Network

A chatbot is, at its core, another Seq2Seq task. Here we pick a model different from the one used for machine translation earlier: the Transformer. The network is itself an encoder-decoder architecture, but unlike RNN-style sequence models it uses self-attention to model the sequence, which makes training much easier to parallelize. As before, we gather the hyperparameters the network needs into a Config class:

class Config:
    def __init__(self):
        self.vocab_size = 16103         # vocabulary size
        self.d_model = 200              # input dimension of the Transformer encoder/decoder (equals embed_size here)
        self.n_head = 20                # number of attention heads
        self.num_encoder_layers = 2     # number of encoder layers
        self.num_decoder_layers = 2     # number of decoder layers
        self.dim_feedforward = 200      # dimension of the position-wise feed-forward layer (equals d_model here)
        self.dropout = 0.1              # dropout rate
        self.embed_size = 200           # word-embedding dimension
        self.maxlen = 27                # maximum sequence length = max_len + 2 (<bos>, <eos>)
        self.epochs = 200               # number of training epochs
        self.batch_size = 64            # batch size
        self.learning_rate = 1e-4       # learning rate
        self.summary_step = 2000        # run a validation summary every this many training steps

Because the Transformer gives up the position sensitivity that recurrent models get for free, we need an additional positional embedding, added onto the token's semantic embedding, to supply position information. We implement the positional encoding with the sin/cos scheme from the original paper:
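For reference, the sin/cos encoding from "Attention Is All You Need" that the code below implements is

$$
PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right), \qquad
PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)
$$

where $pos$ is the token position and $i$ indexes the embedding dimensions; the code computes the factor $10000^{-2i/d_{\text{model}}}$ as div_term via exp/log for numerical convenience.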

import math

import torch
from torch import nn


class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len):
        super().__init__()

        # Precompute the sin/cos position encodings once; they are not trained
        pe = torch.zeros(max_len, d_model).float()
        pe.requires_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Return the encodings for the first seq_len positions, shape [1, seq_len, d_model]
        return self.pe[:, :x.size(1)]


class Embeddings(nn.Module):
    def __init__(self, config):
        super(Embeddings, self).__init__()
        self.token_embedding = nn.Embedding(config.vocab_size, config.embed_size, padding_idx=0)
        self.pos_embedding = PositionalEmbedding(d_model=config.embed_size, max_len=config.maxlen)

    def forward(self, x):
        # Token embeddings plus (broadcast) positional embeddings
        token_embed = self.token_embedding(x)
        pos_embed = self.pos_embedding(x)
        return token_embed + pos_embed
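A minimal shape check (assuming the Config defined above) shows that a batch of token IDs maps to [batch_size, seq_len, embed_size]:

# Sketch of a shape check; zeros stand in for real token IDs.
emb = Embeddings(Config())
dummy_ids = torch.zeros(2, 27, dtype=torch.long)
print(emb(dummy_ids).shape)  # torch.Size([2, 27, 200])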

In the section "Common PyTorch Network Layers" we already covered the basic use of the Transformer layers. Here we need the full Transformer network for the encoder-decoder task; the part that deserves attention is how the masks are configured:

class TransformerChatbot(nn.Module):
    def __init__(self, config):
        super(TransformerChatbot, self).__init__()
        # Token embedding + positional embedding
        self.input_embedding = Embeddings(config=config)
        # The full Transformer network (encoder + decoder)
        self.transformer = torch.nn.Transformer(d_model=config.d_model, nhead=config.n_head,
                                                num_encoder_layers=config.num_encoder_layers,
                                                num_decoder_layers=config.num_decoder_layers,
                                                dim_feedforward=config.dim_feedforward, dropout=config.dropout)
        # Project the decoder output (dimension d_model, equal to dim_feedforward here) onto the vocabulary
        self.proj_vocab_layer = nn.Linear(in_features=config.dim_feedforward, out_features=config.vocab_size)

        self.apply(self._initialize)

    def forward(self, enc_input, dec_input):
        # enc_input / dec_input hold token IDs of shape [batch_size, seq_len];
        # the embeddings have shape [batch_size, seq_len, embed_dim]
        x_enc_embed = self.input_embedding(enc_input.long())
        x_dec_embed = self.input_embedding(dec_input.long())
        # Mask the padding positions of the source and target sequences
        src_key_padding_mask = enc_input == 0
        tgt_key_padding_mask = dec_input == 0
        # Mask the padding positions of the encoder memory
        memory_key_padding_mask = src_key_padding_mask
        # In the decoder, mask every position after the current one
        tgt_mask = self.transformer.generate_square_subsequent_mask(dec_input.size(1))

        # Swap axes to [seq_len, batch_size, embed_dim], the default layout of nn.Transformer
        x_enc_embed = x_enc_embed.permute(1, 0, 2)
        x_dec_embed = x_dec_embed.permute(1, 0, 2)

        feature = self.transformer(src=x_enc_embed,
                                   tgt=x_dec_embed,
                                   src_key_padding_mask=src_key_padding_mask,
                                   tgt_key_padding_mask=tgt_key_padding_mask,
                                   memory_key_padding_mask=memory_key_padding_mask,
                                   tgt_mask=tgt_mask.to(device))

        logits = self.proj_vocab_layer(feature)
        # Swap axes back to [batch_size, seq_len, vocab_size]
        logits = logits.permute(1, 0, 2)

        return logits

    def _initialize(self, layer):
        if isinstance(layer, nn.Linear):
            nn.init.kaiming_uniform_(layer.weight)
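To make the masking behavior concrete: the subsequent mask is an additive mask with -inf above the diagonal, so each decoder position can attend only to itself and earlier positions. The sketch below builds the same pattern that generate_square_subsequent_mask produces, here with torch.triu:

# Illustration of the subsequent mask for a length-4 target sequence.
mask = torch.triu(torch.full((4, 4), float('-inf')), diagonal=1)
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])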

As the code shows, the main chatbot network is very simple to assemble: Embedding -> Transformer -> Linear and we are done. Recent versions of nn.Transformer also provide a batch_first argument, which removes the need for the axis permutations above.
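For reference, on a PyTorch release that supports it (roughly 1.9 and later; check your installed version), the same network could be constructed with batch_first=True, in which case the tensors stay in [batch_size, seq_len, d_model] layout throughout. A minimal sketch:

# Sketch only: with batch_first=True the permute(1, 0, 2) calls in forward()
# can be dropped, since inputs and outputs stay [batch_size, seq_len, d_model].
transformer = torch.nn.Transformer(d_model=200, nhead=20,
                                   num_encoder_layers=2, num_decoder_layers=2,
                                   dim_feedforward=200, dropout=0.1,
                                   batch_first=True)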

Training

The training loop is similar to the earlier hands-on examples; during the iterations we save every model whose training accuracy crosses a given threshold, so we can pick one later:

from torch import optim
from tqdm import tqdm

def train(model_dir):
    model_config = Config()
    # Build the model
    model = TransformerChatbot(config=model_config)
    model.to(device)
    # Loss function (padding positions are ignored)
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    # Optimizer
    opt = optim.Adam(params=model.parameters(), lr=model_config.learning_rate)

    best_train_acc = 0.95
    # Training loop
    for epoch in tqdm(range(model_config.epochs), desc='epoch', total=model_config.epochs):
        print("epoch : {}, lr: {}".format(epoch, opt.param_groups[0]['lr']))
        tr_loss = 0
        model.train()

        for step, (question, reply) in enumerate(train_loader):
            opt.zero_grad()
            # Teacher forcing: decoder input is the reply shifted right, the target is shifted left
            enc_input, dec_input, dec_output = question, reply[:, :-1], reply[:, 1:]
            y_pred = model(enc_input, dec_input)
            y_pred = y_pred.reshape(-1, y_pred.size(-1))
            dec_output = dec_output.contiguous().view(-1).long()
            real_value_index = dec_output != 0
            mb_loss = loss_fn(y_pred[real_value_index], dec_output[real_value_index])
            mb_loss.backward()
            opt.step()

            with torch.no_grad():
                mb_acc = acc(y_pred, dec_output)

            tr_loss += mb_loss.item()
            tr_acc = mb_acc.item()
            tr_loss_avg = tr_loss / (step + 1)
            tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}
            total_step = epoch * len(train_loader) + step

            # Evaluate on the validation set
            if total_step % model_config.summary_step == 0 and total_step != 0:
                model.eval()
                print("eval: ")
                val_summary = evaluate(model, eval_loader, {'loss': loss_fn, 'acc': acc})
                tqdm.write('epoch : {}, step : {}, tr_loss: {:.3f}, val_loss: {:.3f}, '
                           'tr_acc: {:.2%}, val_acc: {:.2%}'.format(epoch + 1, total_step,
                                                                    tr_summary['loss'],
                                                                    val_summary['loss'],
                                                                    tr_summary['acc'],
                                                                    val_summary['acc']))
                val_loss = val_summary['loss']
                is_best = tr_acc > best_train_acc
                # Save the checkpoint
                if is_best:
                    best_train_acc = tr_acc
                    print("[Best model Save] train_acc: {}, "
                          "train_loss: {}, val_loss: {}".format(tr_summary['acc'],
                                                                tr_summary['loss'],
                                                                val_loss))
                    state = {'epoch': epoch + 1,
                             'model_state_dict': model.to(torch.device('cpu')).state_dict(),
                             'opt_state_dict': opt.state_dict()}
                    summary = {'train': tr_summary, 'validation': val_summary}
                    with open(model_dir + '/summary-' + str(epoch) + '-' + str(total_step) + '.json', mode='w') as fout:
                        json.dump(summary, fout, indent=4)
                    torch.save(state, model_dir + '/model-' + str(epoch) + '-acc-' + str(best_train_acc) + '.pth')

                model.to(device)
                model.train()
            else:
                if step % 50 == 0:
                    print('epoch : {}, step : {}, '
                          'tr_loss: {:.3f}, tr_acc: {:.2%}'.format(epoch + 1, total_step,
                                                                   tr_summary['loss'], tr_summary['acc']))

The evaluation and accuracy helpers used above are implemented as follows:

def acc(yhat, y):
    # Token-level accuracy over non-padding positions
    with torch.no_grad():
        yhat = yhat.max(dim=-1)[1]
        acc = (yhat == y).float()[y != 0].mean()
    return acc

def correct_sum(y_pred, dec_output):
    # Count correctly predicted non-padding tokens and how many tokens were compared
    with torch.no_grad():
        y_pred = y_pred.max(dim=-1)[1]
        correct_elms = (y_pred == dec_output).float()[dec_output != 0]
        correct_sum = correct_elms.sum().to(torch.device('cpu')).numpy()
        num_correct_elms = len(correct_elms)
    return correct_sum, num_correct_elms


def evaluate(model, data_loader, metrics):
    if model.training:
        model.eval()

    summary = {metric: 0 for metric in metrics}
    num_correct_elms = 0
    for step, (question, reply) in enumerate(data_loader):
        enc_input, dec_input, dec_output = question, reply[:, :-1], reply[:, 1:]

        with torch.no_grad():
            y_pred = model(enc_input, dec_input)
            y_pred = y_pred.reshape(-1, y_pred.size(-1))
            dec_output = dec_output.contiguous().view(-1).long()
            for metric in metrics:
                if metric == 'acc':
                    _correct_sum, _num_correct_elms = correct_sum(y_pred, dec_output)
                    summary[metric] += _correct_sum
                    num_correct_elms += _num_correct_elms
                else:
                    summary[metric] += metrics[metric](y_pred, dec_output).item()

    for metric in metrics:
        if metric == 'acc':
            summary[metric] /= num_correct_elms
        else:
            summary[metric] /= len(data_loader.dataset)

    return summary

Using the Model

When using the model, note that even though the input is a single sentence, it still has to be packed into a tensor of shape [batch_size, seq_len], and the decoder input must be initialized with a <bos> token as its first word.

def decoding_from_token_id(y_pred, reverse_word_map):
    # Convert the argmax token IDs of the first sequence in the batch back into words
    list_of_pred_ids = y_pred.max(dim=-1)[1].tolist()[0]
    str_lst = [reverse_word_map[t] for t in list_of_pred_ids]
    return ' '.join(str_lst)

def predict():
    config = Config()
    # Load the vocabulary
    with open('./data/Chatbot/vocab.json', encoding='utf-8') as fin:
        word_map = json.load(fin)
    reverse_word_map = {value: key for key, value in word_map.items()}

    input = 'what can i do for you'
    print('[INPUT]: ' + input)
    input_word_lst = input.split(' ')
    question = [word_map.get(word, word_map['<unk>']) for word in input_word_lst] + [word_map['<pad>']] * (
            config.maxlen - len(input_word_lst))
    input_batch_lst = []
    input_batch_lst.append(question)

    model = TransformerChatbot(config=config)
    checkpoint = torch.load('./model/Chatbot/model-196-acc-0.3378582298755646.pth', map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model_state_dict'])

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.eval()

    enc_input = torch.LongTensor(input_batch_lst).to(device)
    dec_input = torch.LongTensor([[word_map['<bos>']]]).to(device)

    # Greedy decoding: feed the generated tokens back in until <eos> or the length limit
    for i in range(config.maxlen):
        y_pred = model(enc_input, dec_input)
        y_pred_ids = y_pred.max(dim=-1)[1]
        if (y_pred_ids[0, -1] == word_map['<eos>']).to(torch.device('cpu')).numpy():
            print('[OUTPUT]: ' + decoding_from_token_id(y_pred, reverse_word_map))
            break

        # Append the newly generated token and move back to the model's device
        dec_input = torch.cat(
            [dec_input.to(torch.device('cpu')), y_pred_ids[0, -1].unsqueeze(0).unsqueeze(0).to(torch.device('cpu'))],
            dim=-1).to(device)

        if i == config.maxlen - 1:
            print('[OUTPUT]: ' + decoding_from_token_id(y_pred, reverse_word_map))

Feeding in "what can i do for you", the chatbot replies "i want to talk to you <eos>".