# DyNet Implementation
import argparse
import random
import sys
import numpy as np
# PyTorch Implementation
import argparse
import random
import sys
import numpy as np
# Tensorflow Implementation
import argparse
import random
import sys
import numpy as np
# Shared hyperparameters. Previously these were defined three identical
# times (one copy per framework section of the interleaved source); they
# are defined once here. Final values are unchanged.
PAD = "__PAD__"             # padding symbol, index 0 in both vocabularies
UNK = "__UNK__"             # out-of-vocabulary symbol, index 1 in the word vocabulary
DIM_EMBEDDING = 100         # word embedding size (must match the GloVe file below)
LSTM_HIDDEN = 100           # hidden units per LSTM direction
BATCH_SIZE = 10
LEARNING_RATE = 0.015
LEARNING_DECAY_RATE = 0.05  # lr is rescaled by 1 / (1 + rate * epoch) each epoch
EPOCHS = 100
KEEP_PROB = 0.5             # dropout keep probability (dropout rate = 1 - KEEP_PROB)
GLOVE = "../data/glove.6B.100d.txt"
# Used by the DyNet trainer configuration and the PyTorch optimizer; the
# TensorFlow version does not apply weight decay (see note at the bottom
# of the page).
WEIGHT_DECAY = 1e-8
# Framework setup. This file interleaves three implementations (DyNet,
# PyTorch, TensorFlow); each framework's global configuration happens here.
# DyNet: configuration must be set BEFORE `import dynet`.
import dynet_config
# mem: MB of pre-allocated memory; autobatch=0 disables automatic batching;
# weight_decay applies L2 regularisation inside the trainer.
dynet_config.set(mem=256, autobatch=0, weight_decay=WEIGHT_DECAY,random_seed=0)
# dynet_config.set_gpu() for when we want to run with GPUs
import dynet as dy
# PyTorch: fix the RNG seed for reproducibility.
import torch
torch.manual_seed(0)
# TensorFlow: the v1-style graph/session API is used below.
import tensorflow as tf
# Data reading.
# NOTE(review): the original source contained three interleaved, unindented
# copies of read_data and simplify_token (a SyntaxError); they are
# reconstructed here as a single properly indented copy of each. The logic
# is unchanged.
def read_data(filename):
    """Read a tagged corpus into a list of (tokens, tags) pairs.

    Example input line:
    Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ
    """
    content = []
    with open(filename) as data_src:
        for line in data_src:
            # Each whitespace-separated item is "token|tag".
            t_p = [w.split("|") for w in line.strip().split()]
            tokens = [v[0] for v in t_p]
            tags = [v[1] for v in t_p]
            content.append((tokens, tags))
    return content

def simplify_token(token):
    """Return the token with every digit replaced by '0'.

    This collapses all numbers onto shared forms (e.g. "1984" and "2017"
    both become "0000"), reducing vocabulary sparsity.
    """
    chars = []
    for char in token:
        if char.isdigit():
            chars.append("0")
        else:
            chars.append(char)
    return ''.join(chars)
def main():
# NOTE(review): from here on the source interleaves three copies of the
# same program (apparently DyNet, PyTorch, TensorFlow, in that order) with
# all indentation stripped, so this is not valid Python as written. The
# code below is left byte-identical; comments only mark the repetition.
# Parse command-line arguments (three identical copies, one per framework).
parser = argparse.ArgumentParser(description='POS tagger.')
parser.add_argument('training_data')
parser.add_argument('dev_data')
args = parser.parse_args()
train = read_data(args.training_data)
dev = read_data(args.dev_data)
# (second copy)
parser = argparse.ArgumentParser(description='POS tagger.')
parser.add_argument('training_data')
parser.add_argument('dev_data')
args = parser.parse_args()
train = read_data(args.training_data)
dev = read_data(args.dev_data)
# (third copy)
parser = argparse.ArgumentParser(description='POS tagger.')
parser.add_argument('training_data')
parser.add_argument('dev_data')
args = parser.parse_args()
train = read_data(args.training_data)
dev = read_data(args.dev_data)
# Build token/tag <-> id mappings; PAD is id 0, UNK is word id 1.
# Make indices
id_to_token = [PAD, UNK]
token_to_id = {PAD: 0, UNK: 1}
id_to_tag = [PAD]
tag_to_id = {PAD: 0}
# Make indices
id_to_token = [PAD, UNK]
token_to_id = {PAD: 0, UNK: 1}
id_to_tag = [PAD]
tag_to_id = {PAD: 0}
# Make indices
id_to_token = [PAD, UNK]
token_to_id = {PAD: 0, UNK: 1}
id_to_tag = [PAD]
tag_to_id = {PAD: 0}
# Vocabulary built over train + dev (so dev words are never UNK).
for tokens, tags in train + dev:
for token in tokens:
token = simplify_token(token)
if token not in token_to_id:
token_to_id[token] = len(token_to_id)
id_to_token.append(token)
for tag in tags:
if tag not in tag_to_id:
tag_to_id[tag] = len(tag_to_id)
id_to_tag.append(tag)
NWORDS = len(token_to_id)
NTAGS = len(tag_to_id)
# Load pre-trained GloVe vectors
for tokens, tags in train + dev:
for token in tokens:
token = simplify_token(token)
if token not in token_to_id:
token_to_id[token] = len(token_to_id)
id_to_token.append(token)
for tag in tags:
if tag not in tag_to_id:
tag_to_id[tag] = len(tag_to_id)
id_to_tag.append(tag)
NWORDS = len(token_to_id)
NTAGS = len(tag_to_id)
# Load pre-trained GloVe vectors
for tokens, tags in train + dev:
for token in tokens:
token = simplify_token(token)
if token not in token_to_id:
token_to_id[token] = len(token_to_id)
id_to_token.append(token)
for tag in tags:
if tag not in tag_to_id:
tag_to_id[tag] = len(tag_to_id)
id_to_tag.append(tag)
NWORDS = len(token_to_id)
NTAGS = len(tag_to_id)
# Load pre-trained GloVe vectors
# Each GloVe line is: word v1 v2 ... v100 (three identical copies again).
pretrained = {}
for line in open(GLOVE):
parts = line.strip().split()
word = parts[0]
vector = [float(v) for v in parts[1:]]
pretrained[word] = vector
pretrained = {}
for line in open(GLOVE):
parts = line.strip().split()
word = parts[0]
vector = [float(v) for v in parts[1:]]
pretrained[word] = vector
pretrained = {}
for line in open(GLOVE):
parts = line.strip().split()
word = parts[0]
vector = [float(v) for v in parts[1:]]
pretrained[word] = vector
# Build one embedding row per vocabulary word; words without a GloVe
# vector get a uniform random vector with the Xavier-style scale below.
pretrained_list = []
scale = np.sqrt(3.0 / DIM_EMBEDDING)
for word in id_to_token:
# apply lower() because all GloVe vectors are for lowercase words
if word.lower() in pretrained:
pretrained_list.append(np.array(pretrained[word.lower()]))
else:
pretrained_list = []
scale = np.sqrt(3.0 / DIM_EMBEDDING)
for word in id_to_token:
# apply lower() because all GloVe vectors are for lowercase words
if word.lower() in pretrained:
pretrained_list.append(np.array(pretrained[word.lower()]))
else:
pretrained_list = []
scale = np.sqrt(3.0 / DIM_EMBEDDING)
for word in id_to_token:
# apply lower() because all GloVe vectors are for lowercase words
if word.lower() in pretrained:
pretrained_list.append(np.array(pretrained[word.lower()]))
else:
random_vector = np.random.uniform(-scale, scale, [DIM_EMBEDDING])
pretrained_list.append(random_vector)
random_vector = np.random.uniform(-scale, scale, [DIM_EMBEDDING])
pretrained_list.append(random_vector)
random_vector = np.random.uniform(-scale, scale, [DIM_EMBEDDING])
pretrained_list.append(random_vector)
# Model creation
# Model creation
# Model creation
# --- DyNet model creation ---
model = dy.ParameterCollection()
# Create word embeddings and initialise
pEmbedding = model.add_lookup_parameters((NWORDS, DIM_EMBEDDING))
pEmbedding.init_from_array(np.array(pretrained_list))
# Create LSTM parameters
# stdv matches PyTorch's default LSTM initialisation range (1/sqrt(hidden)).
stdv = 1.0 / np.sqrt(LSTM_HIDDEN)
f_lstm = dy.VanillaLSTMBuilder(1, DIM_EMBEDDING, LSTM_HIDDEN, model,
forget_bias=(np.random.random_sample() - 0.5) * 2 * stdv)
b_lstm = dy.VanillaLSTMBuilder(1, DIM_EMBEDDING, LSTM_HIDDEN, model,
forget_bias=(np.random.random_sample() - 0.5) * 2 * stdv)
# Create output layer
pOutput = model.add_parameters((NTAGS, 2 * LSTM_HIDDEN))
# Set recurrent dropout values (not used in this case)
f_lstm.set_dropouts(0.0, 0.0)
b_lstm.set_dropouts(0.0, 0.0)
# Initialise LSTM parameters
# Parameters [0][0], [0][1], [0][2] are input weights, recurrent weights,
# and bias respectively (4*hidden rows: one block per LSTM gate).
f_lstm.get_parameters()[0][0].set_value(
np.random.uniform(-stdv, stdv, [4 * LSTM_HIDDEN, DIM_EMBEDDING]))
f_lstm.get_parameters()[0][1].set_value(
np.random.uniform(-stdv, stdv, [4 * LSTM_HIDDEN, LSTM_HIDDEN]))
f_lstm.get_parameters()[0][2].set_value(
np.random.uniform(-stdv, stdv, [4 * LSTM_HIDDEN]))
b_lstm.get_parameters()[0][0].set_value(
np.random.uniform(-stdv, stdv, [4 * LSTM_HIDDEN, DIM_EMBEDDING]))
b_lstm.get_parameters()[0][1].set_value(
np.random.uniform(-stdv, stdv, [4 * LSTM_HIDDEN, LSTM_HIDDEN]))
b_lstm.get_parameters()[0][2].set_value(
np.random.uniform(-stdv, stdv, [4 * LSTM_HIDDEN]))
# Create the trainer
trainer = dy.SimpleSGDTrainer(model, learning_rate=LEARNING_RATE)
# Disable gradient clipping (negative threshold means "no clipping").
trainer.set_clip_threshold(-1)
# --- PyTorch model creation (rebinds `model`, clobbering the DyNet one) ---
model = TaggerModel(NWORDS, NTAGS, pretrained_list, id_to_token)
# Create optimizer and configure the learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE,
weight_decay=WEIGHT_DECAY)
# Scheduler multiplies the base lr by 1 / (1 + decay * epoch).
rescale_lr = lambda epoch: 1 / (1 + LEARNING_DECAY_RATE * epoch)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
lr_lambda=rescale_lr)
# This line creates a new graph and makes it the default graph for
# operations to be registered to. It is not necessary here because we only
# have one graph, but it is considered good practice (there is more
# discussion on Stack Overflow).
# --- TensorFlow graph construction (continuation of the interleaved main) ---
with tf.Graph().as_default():
# Define inputs
# Placeholders: (batch, max_len) int ids for words and gold tags, per-sentence
# lengths, a 0/1 padding mask, plus scalar keep_prob and learning rate.
e_input = tf.placeholder(tf.int32, [None, None], name='input')
e_lengths = tf.placeholder(tf.int32, [None], name='lengths')
e_mask = tf.placeholder(tf.int32, [None, None], name='mask')
e_gold_output = tf.placeholder(tf.int32, [None, None],
name='gold_output')
e_keep_prob = tf.placeholder(tf.float32, name='keep_prob')
e_learning_rate = tf.placeholder(tf.float32, name='learning_rate')
# Define word embedding
glove_init = tf.constant_initializer(np.array(pretrained_list))
e_embedding = tf.get_variable("embedding", [NWORDS, DIM_EMBEDDING],
initializer=glove_init)
e_embed = tf.nn.embedding_lookup(e_embedding, e_input)
# Define LSTM cells
e_cell_f = tf.contrib.rnn.BasicLSTMCell(LSTM_HIDDEN)
e_cell_f = tf.contrib.rnn.DropoutWrapper(e_cell_f,
input_keep_prob=e_keep_prob, output_keep_prob=e_keep_prob)
# Recurrent dropout options
# variational_recurrent=True, dtype=tf.float32,
# input_size=DIM_EMBEDDING)
# Multi-layer cell creation
# e_cell_f = tf.contrib.rnn.MultiRNNCell([e_cell_f])
e_cell_b = tf.contrib.rnn.BasicLSTMCell(LSTM_HIDDEN)
e_cell_b = tf.contrib.rnn.DropoutWrapper(e_cell_b,
input_keep_prob=e_keep_prob, output_keep_prob=e_keep_prob)
e_initial_state_f = e_cell_f.zero_state(BATCH_SIZE, dtype=tf.float32)
# NOTE(review): this uses e_cell_f for the BACKWARD initial state; it looks
# like it should be e_cell_b.zero_state(...) — confirm (harmless in practice
# only because both cells have identically shaped zero states).
e_initial_state_b = e_cell_f.zero_state(BATCH_SIZE, dtype=tf.float32)
e_lstm_outputs, e_final_state = tf.nn.bidirectional_dynamic_rnn(
cell_fw=e_cell_f, cell_bw=e_cell_b, inputs=e_embed,
initial_state_fw=e_initial_state_f,
initial_state_bw=e_initial_state_b,
sequence_length=e_lengths, dtype=tf.float32)
# Concatenate forward and backward outputs along the feature axis.
e_lstm_outputs_merged = tf.concat(e_lstm_outputs, 2)
# Define output layer
e_predictions = tf.contrib.layers.fully_connected(e_lstm_outputs_merged,
NTAGS, activation_fn=None)
# Define loss and update
# Summed cross-entropy; the mask zeroes out padding positions.
e_loss = tf.losses.sparse_softmax_cross_entropy(e_gold_output,
e_predictions, weights=e_mask,
reduction=tf.losses.Reduction.SUM)
e_train = tf.train.GradientDescentOptimizer(e_learning_rate).minimize(e_loss)
# Update with gradient clipping
# e_optimiser = tf.train.GradientDescentOptimizer(LEARNING_RATE)
# e_gradients = e_optimiser.compute_gradients(e_loss)
# e_clipped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
# for grad, var in e_gradients]
# e_train = e_optimiser.apply_gradients(e_gradients)
# Define output
e_auto_output = tf.argmax(e_predictions, 2, output_type=tf.int32)
# Do training
config = tf.ConfigProto(
device_count = {'GPU': 0},
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.8)
)
with tf.Session(config=config) as sess:
sess.run(tf.global_variables_initializer())
# `expressions` bundles what do_pass needs; three copies, one per
# framework — each assignment clobbers the previous one.
expressions = (pEmbedding, pOutput, f_lstm, b_lstm, trainer)
expressions = (model, optimizer)
expressions = [
e_auto_output, e_gold_output, e_input, e_keep_prob, e_lengths,
e_loss, e_train, e_mask, e_learning_rate, sess
]
# Training loop (three interleaved copies).
for epoch in range(EPOCHS):
random.shuffle(train)
for epoch in range(EPOCHS):
random.shuffle(train)
for epoch in range(EPOCHS):
random.shuffle(train)
# Update learning rate
# DyNet: set the decayed rate directly on the trainer.
trainer.learning_rate = LEARNING_RATE / (1+ LEARNING_DECAY_RATE * epoch)
# Update learning rate
# PyTorch: scheduler applies the same decay; switch to train mode.
scheduler.step()
model.train()
model.zero_grad()
# Determine the current learning rate
# TensorFlow: the decayed rate is fed into the graph via do_pass.
current_lr = LEARNING_RATE / (1+ LEARNING_DECAY_RATE * epoch)
loss, tacc = do_pass(train, token_to_id, tag_to_id, expressions, True)
loss, tacc = do_pass(train, token_to_id, tag_to_id, expressions,
True)
loss, tacc = do_pass(train, token_to_id, tag_to_id, expressions,
True, current_lr)
model.eval()
_, dacc = do_pass(dev, token_to_id, tag_to_id, expressions, False)
print("{} loss {} t-acc {} d-acc {}".format(epoch, loss, tacc, dacc))
_, dacc = do_pass(dev, token_to_id, tag_to_id, expressions, False)
print("{} loss {} t-acc {} d-acc {}".format(epoch, loss,
tacc, dacc))
_, dacc = do_pass(dev, token_to_id, tag_to_id, expressions,
False)
print("{} loss {} t-acc {} d-acc {}".format(epoch, loss, tacc,
dacc))
# Save / load / final evaluation: DyNet version.
# Save model
model.save("tagger.dy.model")
# Load model
model.populate("tagger.dy.model")
# Evaluation pass.
_, test_acc = do_pass(dev, token_to_id, tag_to_id, expressions, False)
print("Test Accuracy: {:.3f}".format(test_acc))
# PyTorch version.
# Save model
torch.save(model.state_dict(), "tagger.pt.model")
# Load model
model.load_state_dict(torch.load('tagger.pt.model'))
# Evaluation pass.
_, test_acc = do_pass(dev, token_to_id, tag_to_id, expressions, False)
print("Test Accuracy: {:.3f}".format(test_acc))
# TensorFlow version.
# Save model
saver = tf.train.Saver()
saver.save(sess, "./tagger.tf.model")
# Load model
saver.restore(sess, "./tagger.tf.model")
# Evaluation pass.
_, test_acc = do_pass(dev, token_to_id, tag_to_id, expressions,
False)
print("Test Accuracy: {:.3f}".format(test_acc))
class TaggerModel(torch.nn.Module):
    """Bidirectional LSTM part-of-speech tagger (PyTorch version).

    Pipeline: word embedding -> dropout -> BiLSTM -> dropout -> linear
    projection to tag scores. The loss is summed cross-entropy over token
    positions, ignoring padding (tag id 0).

    NOTE(review): in the reviewed source every line of this class was
    flattened to column 0 (a SyntaxError); the indentation is reconstructed
    here with the statements themselves unchanged.
    """

    def __init__(self, nwords, ntags, pretrained_list, id_to_token):
        """Build the model.

        nwords: vocabulary size (the embedding shape is actually taken from
            pretrained_list, which must have one row per word).
        ntags: number of output tags.
        pretrained_list: initial embedding vectors, one per vocabulary word.
        id_to_token: accepted for interface compatibility; not used here.
        """
        super().__init__()
        # Create word embeddings, initialised from the pre-trained vectors
        # and fine-tuned during training (freeze=False).
        pretrained_tensor = torch.FloatTensor(pretrained_list)
        self.word_embedding = torch.nn.Embedding.from_pretrained(
                pretrained_tensor, freeze=False)
        # Input dropout (torch.nn.Dropout takes a drop rate, hence 1 - keep).
        self.word_dropout = torch.nn.Dropout(1 - KEEP_PROB)
        # Single-layer bidirectional LSTM over the embedded words.
        self.lstm = torch.nn.LSTM(DIM_EMBEDDING, LSTM_HIDDEN, num_layers=1,
                batch_first=True, bidirectional=True)
        # Dropout on the LSTM outputs.
        self.lstm_output_dropout = torch.nn.Dropout(1 - KEEP_PROB)
        # Final projection from concatenated directions to tag scores.
        self.hidden_to_tag = torch.nn.Linear(LSTM_HIDDEN * 2, ntags)

    def forward(self, sentences, labels, lengths, cur_batch_size):
        """Compute (summed loss, predicted tag ids) for one padded batch.

        sentences, labels: LongTensors of shape (batch, max_length).
        lengths: true sentence lengths — assumed sorted in decreasing order,
            as pack_padded_sequence requires (the caller sorts each batch).
        cur_batch_size: number of sentences in the batch.
        """
        max_length = sentences.size(1)
        # Look up word vectors.
        word_vectors = self.word_embedding(sentences)
        # Apply input dropout.
        dropped_word_vectors = self.word_dropout(word_vectors)
        # Pack so the LSTM skips padding; third positional arg is batch_first.
        packed_words = torch.nn.utils.rnn.pack_padded_sequence(
                dropped_word_vectors, lengths, True)
        lstm_out, _ = self.lstm(packed_words, None)
        # Unpack back to a padded (batch, max_length, 2*hidden) tensor.
        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out,
                batch_first=True, total_length=max_length)
        # Apply output dropout.
        lstm_out_dropped = self.lstm_output_dropout(lstm_out)
        # Matrix multiply to get scores for each tag.
        output_scores = self.hidden_to_tag(lstm_out_dropped)
        # Flatten so every token position is one classification example;
        # padding positions (label 0) are excluded from the loss.
        output_scores = output_scores.view(cur_batch_size * max_length, -1)
        flat_labels = labels.view(cur_batch_size * max_length)
        loss_function = torch.nn.CrossEntropyLoss(ignore_index=0,
                reduction='sum')
        loss = loss_function(output_scores, flat_labels)
        predicted_tags = torch.argmax(output_scores, 1)
        predicted_tags = predicted_tags.view(cur_batch_size, max_length)
        return loss, predicted_tags
# NOTE(review): three interleaved, unindented do_pass definitions follow
# (DyNet, PyTorch, TensorFlow). Code left byte-identical; comments only.
# DyNet signature.
def do_pass(data, token_to_id, tag_to_id, expressions, train):
pEmbedding, pOutput, f_lstm, b_lstm, trainer = expressions
# Loop over batches
loss = 0
match = 0
total = 0
for start in range(0, len(data), BATCH_SIZE):
# PyTorch signature.
def do_pass(data, token_to_id, tag_to_id, expressions, train):
model, optimizer = expressions
# Loop over batches
loss = 0
match = 0
total = 0
for start in range(0, len(data), BATCH_SIZE):
# TensorFlow signature (extra lr argument, fed into the graph).
def do_pass(data, token_to_id, tag_to_id, expressions, train, lr=0.0):
e_auto_output, e_gold_output, e_input, e_keep_prob, e_lengths, e_loss, \
e_train, e_mask, e_learning_rate, session = expressions
# Loop over batches
loss = 0
match = 0
total = 0
for start in range(0, len(data), BATCH_SIZE):
# Sort each batch longest-first (needed for pack_padded_sequence).
batch = data[start : start + BATCH_SIZE]
batch.sort(key = lambda x: -len(x[0]))
batch = data[start : start + BATCH_SIZE]
batch.sort(key = lambda x: -len(x[0]))
batch = data[start : start + BATCH_SIZE]
batch.sort(key = lambda x: -len(x[0]))
# Periodic progress logging.
if start % 4000 == 0 and start > 0:
print(loss, match / total)
sys.stdout.flush()
if start % 4000 == 0 and start > 0:
print(loss, match / total)
sys.stdout.flush()
if start % 4000 == 0 and start > 0:
print(loss, match / total)
sys.stdout.flush()
# Process batch
# DyNet: fresh computation graph per batch.
dy.renew_cg()
loss_expressions = []
predicted = []
# Prepare inputs
# PyTorch: zero-padded id matrices.
cur_batch_size = len(batch)
max_length = len(batch[0][0])
lengths = [len(v[0]) for v in batch]
input_array = torch.zeros((cur_batch_size, max_length)).long()
output_array = torch.zeros((cur_batch_size, max_length)).long()
# Add empty sentences to fill the batch
# TensorFlow: pad the batch itself to BATCH_SIZE (fixed-size initial states).
batch += [([], []) for _ in range(BATCH_SIZE - len(batch))]
# Prepare inputs
max_length = len(batch[0][0])
input_array = np.zeros([len(batch), max_length])
output_array = np.zeros([len(batch), max_length])
lengths = np.array([len(v[0]) for v in batch])
mask = np.zeros([len(batch), max_length])
# NOTE(review): unknown words fall back to id 0 (PAD) rather than 1 (UNK);
# this only matters if a token is outside train+dev — confirm intended.
for n, (tokens, tags) in enumerate(batch):
token_ids = [token_to_id.get(simplify_token(t), 0) for t in tokens]
tag_ids = [tag_to_id[t] for t in tags]
for n, (tokens, tags) in enumerate(batch):
token_ids = [token_to_id.get(simplify_token(t), 0) for t in tokens]
tag_ids = [tag_to_id[t] for t in tags]
for n, (tokens, tags) in enumerate(batch):
token_ids = [token_to_id.get(simplify_token(t), 0) for t in tokens]
tag_ids = [tag_to_id[t] for t in tags]
# --- DyNet per-sentence computation ---
# Look up word embeddings
wembs = [dy.lookup(pEmbedding, w) for w in token_ids]
# Apply dropout
if train:
wembs = [dy.dropout(w, 1.0 - KEEP_PROB) for w in wembs]
# Feed words into the LSTM
# We pull out the output vector from the cell state at each step.
# Run the forward and backward LSTMs over the sentence (DyNet).
f_init = f_lstm.initial_state()
f_lstm_output = [x.output() for x in f_init.add_inputs(wembs)]
rev_embs = reversed(wembs)
b_init = b_lstm.initial_state()
b_lstm_output = [x.output() for x in b_init.add_inputs(rev_embs)]
# For each output, calculate the output and loss
pred_tags = []
# b_lstm_output is in reversed token order, so reverse it to align with f.
for f, b, t in zip(f_lstm_output, reversed(b_lstm_output), tag_ids):
# Combine the outputs
combined = dy.concatenate([f,b])
# Apply dropout
if train:
combined = dy.dropout(combined, 1.0 - KEEP_PROB)
# Matrix multiply to get scores for each tag
r_t = pOutput * combined
# Calculate cross-entropy loss
if train:
err = dy.pickneglogsoftmax(r_t, t)
loss_expressions.append(err)
# Calculate the highest scoring tag
# npvalue() forces evaluation of the expression so argmax can run on it.
chosen = np.argmax(r_t.npvalue())
pred_tags.append(chosen)
predicted.append(pred_tags)
# combine the losses for the batch, do an update, and record the loss
if train:
loss_for_batch = dy.esum(loss_expressions)
loss_for_batch.backward()
trainer.update()
loss += loss_for_batch.scalar_value()
# --- PyTorch batch execution ---
input_array[n, :len(tokens)] = torch.LongTensor(token_ids)
output_array[n, :len(tags)] = torch.LongTensor(tag_ids)
# Construct computation
batch_loss, output = model(input_array, output_array, lengths,
cur_batch_size)
# Run computations
if train:
batch_loss.backward()
optimizer.step()
model.zero_grad()
loss += batch_loss.item()
predicted = output.cpu().data.numpy()
# --- TensorFlow batch execution ---
input_array[n, :len(tokens)] = token_ids
output_array[n, :len(tags)] = tag_ids
mask[n, :len(tokens)] = np.ones([len(tokens)])
# Dropout is disabled at evaluation time via keep_prob = 1.
cur_keep_prob = KEEP_PROB if train else 1.0
feed = {
e_input: input_array,
e_gold_output: output_array,
e_mask: mask,
e_keep_prob: cur_keep_prob,
e_lengths: lengths,
e_learning_rate: lr
}
# Define the computations needed
todo = [e_auto_output]
if train:
todo.append(e_loss)
todo.append(e_train)
# Run computations
outcomes = session.run(todo, feed_dict=feed)
# Get outputs
predicted = outcomes[0]
if train:
loss += outcomes[1]
# Update the number of correct tags and total tags
# Gold tags are strings; map them to ids before comparing to predictions.
for (_, g), a in zip(batch, predicted):
total += len(g)
for gt, at in zip(g, a):
gt = tag_to_id[gt]
if gt == at:
match += 1
return loss, match / total
# Script entry point (three identical copies from the interleave).
if __name__ == '__main__':
main()
# Update the number of correct tags and total tags
for (_, g), a in zip(batch, predicted):
total += len(g)
for gt, at in zip(g, a):
gt = tag_to_id[gt]
if gt == at:
match += 1
return loss, match / total
if __name__ == '__main__':
main()
# Update the number of correct tags and total tags
for (_, g), a in zip(batch, predicted):
total += len(g)
for gt, at in zip(g, a):
gt = tag_to_id[gt]
if gt == at:
match += 1
return loss, match / total
if __name__ == '__main__':
main()