Word Embedding Training in Python
The backpropagation gradient calculation should be straightforward once you have implemented an affine (fully connected) neural network or a convolutional neural network (CNN) with a Softmax classifier. One example showing the gradient calculation and backpropagation can be found here.
"You shall know a word by the company it keeps" - John Rupert Firth¶
Word embedding has become very popular since Tomas Mikolov's Word2vec paper was published. We are going to compress words from a high-dimensional, sparse space (the vocabulary) into a lower-dimensional, continuous vector space. It is remarkable that certain linguistic and semantic relations are encoded in this space and can be manipulated with linear operations. For example, vec('Rome') is close to vec('Paris') - vec('France') + vec('Italy').
Continuous Bag of Words (CBOW) and Skip-gram are the two models commonly used to extract these relations into vectors. Compared with other optimization problems, word embedding training has one particular difficulty: the vocabulary is too large to evaluate the classifier over it efficiently, yet an evaluation is needed at every training step. To get around the expensive evaluation of the classifier cost (such as Softmax prediction plus cross-entropy loss) over the complete vocabulary, the negative sampling method can be used to approximate the Softmax probabilities.
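For reference, here is a compact statement of the per-pair negative-sampling objective that the code below optimizes (my own notation, not defined elsewhere in this notebook):

$$ J = -\log \sigma(u_o^\top v_c) - \sum_{k=1}^{K} \log \sigma(-u_k^\top v_c) $$

where $v_c$ is the input ("center") vector, $u_o$ is the output vector of the observed context word, the $u_k$ are output vectors of $K$ randomly drawn negative samples, and $\sigma$ is the logistic function. This is the quantity computed by the negative_sampling_cost function later in the notebook.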
This project documents some naive implementations of word embedding training. Many functions would need to be tuned further in order to build an efficient deep learning framework.
Part of the dataset-processing code is adapted from the TensorFlow word2vec tutorial.
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import scipy.io
import scipy.misc
import time
import sys
import os
import glob
import random
from random import randrange
import zipfile
import collections
import cPickle as pickle
from sklearn.manifold import TSNE
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
# Setup Matplotlib and autoreload
%matplotlib inline
%load_ext autoreload
%autoreload 2
# Because we are not using any existing framework or package, we need to define all the building blocks in Python ourselves
#
def normalize_vecs(x):
""" Row normalization function
Normalize each row of input matrix
- Inputs:
x: vectors (matrix)
- Outputs:
x: normalized vectors (matrix)
"""
xl_vec = np.sqrt(np.sum(x**2, axis=1, keepdims=True))
x = x/xl_vec
return x
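# Quick sanity check: after normalization every row should have unit length
check_vecs = normalize_vecs(np.random.randn(4, 3))
print np.sum(check_vecs**2, axis=1) # expect values very close to 1.0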
# Sigmoid logistic function
def logistic(x):
"""
Apply logistic function to input vector
Inputs:
- vector x with shape (N, D)
Outputs:
- out # Output vector of shape (N, D)
- dx # gradient with regarding to x, shape (N, D)
"""
out = 1/ (1 + np.exp(-x))
dx = out * (1 - out)
return out, dx
# Softmax function
def softmax(x):
"""
Calculate softmax based probability for given input vector
"""
    x = x - np.max(x, axis=x.ndim-1, keepdims=True) # Numerically safe: subtract the row max without modifying the caller's array
exp_x = np.exp(x)
p = exp_x / np.sum(exp_x, axis=x.ndim-1, keepdims=True)
return p
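# Quick sanity check of softmax: rows sum to one, and shifting the logits by a
# constant does not change the result (this is what the max subtraction buys us)
p_check = softmax(np.array([[1.0, 2.0, 3.0]]))
print p_check # roughly [[0.09 0.24 0.67]]
print np.sum(p_check, axis=1) # [ 1.]
print softmax(np.array([[1001.0, 1002.0, 1003.0]])) # same probabilities, no overflow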
# Softmax cost layer
def softmax_cost(x, w, target_idx):
"""
Calculate softmax cost and gradient
    No bias term in this implementation
Inputs
- x # vector from previous layer, such as hidden layer
- w # weight for last layer, so called "output vectors"
- target_idx # Target word index, ground truth
Outputs
    - cost # Softmax + cross-entropy cost for the current prediction
    - dw # Gradient with respect to the weights
    - dx # Gradient with respect to the input vector
"""
V, D = w.shape # V: vocabulary size, D: Vector dimension
x = x.reshape((1, D))
# x.shape is (1, D)
prob = softmax(x.dot(w.T)) # (1, V)
    cost = -np.log(prob[0, target_idx]) # Cross-entropy loss; the ground-truth distribution is one-hot
dcost = prob # (1, V)
dcost[0, target_idx] -= 1
dw = dcost.T.dot(x) # (V,1) * (1, D) ==> (V, D)
dx = dcost.dot(w) # (1,V) * (V, D) ==> (1, D)
return cost, dw, dx.flatten()
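# Quick sanity check of softmax_cost on tiny random data (throwaway names):
# with near-zero weights the prediction is near-uniform, so the cost should be
# close to ln(V), and the gradient shapes should match w and x
np.random.seed(0)
V_chk, D_chk = 7, 5
x_chk = np.random.randn(D_chk)
w_chk = 0.01 * np.random.randn(V_chk, D_chk)
c_chk, dw_chk, dx_chk = softmax_cost(x_chk, w_chk, 2)
print c_chk, np.log(V_chk) # the two values should be close
print dw_chk.shape, dx_chk.shape # (7, 5) (5,)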
# Naive negative sample generator to randomly pick negative samples
# The paper recommends sampling with probability proportional to unigram frequency raised to the 3/4 power; here we just use a uniform distribution (see the sketch after this function)
def get_random_samples(target_idx, data_size, sample_size):
"""
Randomly pick sample_size of samples for negative sampling
"""
indices = [None] * sample_size
for i in xrange(sample_size):
new_idx = random.randint(0, data_size - 1)
while new_idx == target_idx:
new_idx = random.randint(0, data_size - 1)
indices[i] = new_idx
return indices
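# For reference, a minimal sketch of the frequency-based sampler described in the
# paper: sample negatives with probability proportional to unigram count^(3/4).
# This is only an illustration and is not used below; 'word_counts' is assumed to
# be an array of per-word counts (e.g. built with collections.Counter).
def get_weighted_samples(target_idx, word_counts, sample_size):
    probs = np.asarray(word_counts, dtype=np.float64) ** 0.75
    probs /= probs.sum()
    indices = []
    while len(indices) < sample_size:
        idx = np.random.choice(len(probs), p=probs)
        if idx != target_idx:
            indices.append(idx)
    return indices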
# Negative sampling cost layer
def negative_sampling_cost(x, w, target_idx, sample_size=10):
"""
    Replaces the softmax cost layer in order to handle a large vocabulary size.
    Randomly picked negative samples are used to approximate the full prediction and provide feedback.
Inputs
- x # vector from previous layer, such as hidden layer
- w # weight for last layer, so called "output vectors"
- target_idx # Target word index, ground truth
- sample_size # Size for negative sampling
Outputs
    - cost # Negative-sampling cost for the current prediction
    - dw # Gradient with respect to the weights
    - dx # Gradient with respect to the input vector
"""
V, D = w.shape
dw = np.zeros_like(w)
dx = np.zeros_like(x)
indices = [target_idx]
indices.extend(get_random_samples(target_idx, V, sample_size))
#print "indices = {}".format(indices)
labels = np.array([1] + [-1 for i in range(sample_size)]) # (sample_size + 1,)
wvecs = w[indices] # (sample_size+1, D)
z = np.dot(wvecs, x.T) # (sample_size + 1, D) dot (D) ==> (sample_size + 1)
z = z * labels
scores, _ = logistic(z) # (sample_size + 1)
    cost = - np.sum(np.log(scores)) # A scalar
# Calculate gradients
dz = labels * (scores -1) # (sample_size + 1)
dz = dz.reshape((1, sample_size + 1))
dx = dz.dot(wvecs) # (sample_size + 1) dot (sample_size + 1, D) ==> (1, D)
dwvecs = dz.T.dot(x.reshape((1, D))) # (sample_size + 1, 1) dot (1, D) ==> (sample_size + 1, D)
# Copy gradient for sample vectors to complete dW array
for i in range(sample_size + 1):
dw[indices[i]] += dwvecs[i,:]
return cost, dw, dx.flatten()
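# Quick sanity check of negative_sampling_cost (throwaway names; samples are random):
# with near-zero weights every score is about 0.5, so the cost should be close to
# (sample_size + 1) * ln(2)
np.random.seed(1); random.seed(1)
xn_chk = np.random.randn(5)
wn_chk = 0.01 * np.random.randn(7, 5)
cn_chk, dwn_chk, dxn_chk = negative_sampling_cost(xn_chk, wn_chk, 2, sample_size=3)
print cn_chk, 4 * np.log(2) # the two values should be close
print dwn_chk.shape, dxn_chk.shape # (7, 5) (5,)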
# skipgram function
def skipgram(center_word, context_size,
context_words, input_vectors, output_vectors,
cost_func=negative_sampling_cost):
"""
Skip-Gram model
- Inputs:
center_word: index of current center word
context_size: size of context window
    context_words: list of word indices for the context words
input_vectors: word vectors from input to hidden layer, for lookup
output_vectors: word vectors from hidden layer to final cost function
cost_func: cost function, such as softmax_cost or negative_sampling_cost
- Outputs:
cost: evaluated cost value
grad_in: gradient with regard to input_vectors
grad_out: gradient with regard to output_vectors
"""
cost = 0
grad_in = np.zeros_like(input_vectors)
grad_out = np.zeros_like(output_vectors)
center_word_vec = input_vectors[center_word, :]
for target_word_idx in context_words:
c_cost, c_dw, c_dx = cost_func(center_word_vec, output_vectors, target_word_idx)
cost += c_cost
grad_in[center_word, :] += c_dx
grad_out += c_dw
return cost, grad_in, grad_out
# continuous bag of words
def cbow(center_word, context_size,
context_words, input_vectors, output_vectors,
cost_func=negative_sampling_cost):
"""
CBOW model
- Inputs:
center_word: index of current center word
context_size: size of context window
    context_words: list of word indices for the context words
input_vectors: word vectors from input to hidden layer, for lookup
output_vectors: word vectors from hidden layer to final cost function
cost_func: cost function, such as softmax_cost or negative_sampling_cost
- Outputs:
cost: evaluated cost value
grad_in: gradient with regard to input_vectors
grad_out: gradient with regard to output_vectors
"""
cost = 0
grad_in = np.zeros_like(input_vectors)
grad_out = np.zeros_like(output_vectors)
V, D = input_vectors.shape
onehot = np.zeros((2*context_size, V))
target_word_idx = center_word
#print context_size, context_words
for i, w_idx in enumerate(context_words):
onehot[i, w_idx] += 1
d = np.dot(onehot, input_vectors) # (2C, V) dot (V, D) ==> (2C, D)
    cbow_vec = np.sum(d, axis=0)/(2 * context_size) # Average the context vectors, shape (D,)
cost, grad_out, grad_cbow_vec = cost_func(cbow_vec, output_vectors, target_word_idx)
for word in context_words:
grad_in[word] += grad_cbow_vec/(2 * context_size)
return cost, grad_in, grad_out
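# Quick illustrative run of both models on tiny random vectors (throwaway setup):
# skip-gram sums one cost per context word, while CBOW scores a single averaged vector
np.random.seed(2); random.seed(2)
in_chk = normalize_vecs(np.random.randn(5, 3))
out_chk = normalize_vecs(np.random.randn(5, 3))
sg_cost, _, _ = skipgram(0, 1, [1, 2], in_chk, out_chk, softmax_cost)
cb_cost, _, _ = cbow(0, 1, [1, 2], in_chk, out_chk, softmax_cost)
print sg_cost, cb_cost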
# Check gradient
def gradcheck_naive(f, x):
"""
Gradient check for a function f
- f should be a function that takes a single argument and outputs the cost and its gradients
- x is the point (numpy array) to check the gradient at
"""
rndstate = random.getstate()
random.setstate(rndstate)
fx, grad = f(x) # Evaluate function value at original point
h = 1e-8
# Iterate over all indexes in x
it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
while not it.finished:
ix = it.multi_index
org_fx = fx
org_x = x[ix]
x[ix] = org_x + 0.5*h
random.setstate(rndstate)
fx_a, _ = f(x)
x[ix] = org_x - 0.5*h
random.setstate(rndstate)
fx_b, _ = f(x)
fx = org_fx
x[ix] = org_x
numgrad = (fx_a - fx_b) / h
# Compare gradients
reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
if reldiff > 1e-5:
print "Gradient check failed."
print "First gradient error found at index %s" % str(ix)
print "Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad)
return
it.iternext() # Step
print "Gradient check passed!"
def getRandomContext(context_size, dataset):
    """ Pick a random center word from the dataset and return it together with its surrounding context window. """
    span = len(dataset)
centerword_idx = random.randint(0, span-1)
context = dataset[max(0, centerword_idx - context_size): centerword_idx]
if centerword_idx + 1 < span:
context += dataset[centerword_idx + 1: min(span, centerword_idx + context_size + 1)]
#print "centerword = {}, context = {}".format(dataset[centerword_idx], context)
    return dataset[centerword_idx], context # the dataset already stores vocabulary indices, so return the context slice directly
def word2vec_sgd_batch(model, word_vectors, dataset, context_size, word2vec_cost_func=negative_sampling_cost):
""" Run word2vec stochastic gradient descent for a set (batchsize) of randomly picked data"""
batchsize = 50
cost = 0.0
grad = np.zeros(word_vectors.shape)
N = word_vectors.shape[0]
input_vectors = word_vectors[:N/2, :]
output_vectors = word_vectors[N/2:,:]
for i in xrange(batchsize):
c_size = random.randint(1, context_size)
centerword, context = getRandomContext(c_size, dataset)
c, gin, gout = model(centerword, c_size, context, input_vectors, output_vectors, word2vec_cost_func)
cost += c/ batchsize
grad[:N/2, :] += gin / batchsize
grad[N/2:, :] += gout /batchsize
return cost, grad
### Check gradients with a dummy dataset
#dummy_dataset = ['abacus', 'b', 'c', 'd', 'e']
dummy_dataset = [0, 1, 2, 3, 4]
dummy_dict = dict([("abacus", 0), ("b", 1), ("c", 2), ("d", 3), ("e", 4)])
# Fix random seeds for numerical gradient check
random.seed(123456)
np.random.seed(654321)
dummy_vectors = normalize_vecs(np.random.randn(10, 3))
# Run Gradient check
gradcheck_naive(lambda vec: word2vec_sgd_batch(skipgram, vec, dummy_dataset, 5), dummy_vectors)
gradcheck_naive(lambda vec: word2vec_sgd_batch(skipgram, vec, dummy_dataset, 5, softmax_cost), dummy_vectors)
gradcheck_naive(lambda vec: word2vec_sgd_batch(cbow, vec, dummy_dataset, 5), dummy_vectors)
gradcheck_naive(lambda vec: word2vec_sgd_batch(cbow, vec, dummy_dataset, 5, softmax_cost), dummy_vectors)
# Time to get a real dataset to play with!
# We will use the 'text8' dataset, a cleaned text-only excerpt of English Wikipedia trimmed to the first 10^8 bytes
dataset_url = 'http://mattmahoney.net/dc/text8.zip'
expected_bytes = 31344016
# TODO: add automatic retry with delay if a network connection error occurs
def may_need_download(url_link, dest_directory, expected_bytes):
"""
Download files if not available locally
Args:
url_link: the target file for downloading
dest_directory: local directory to save downloaded file
expected_bytes: expected file size, used to make sure downloading is correct
Return:
        filepath: local path of the (possibly downloaded) file
"""
if not os.path.exists(dest_directory):
os.makedirs(dest_directory)
filename = url_link.split('/')[-1]
filepath = os.path.join(dest_directory, filename)
if not os.path.exists(filepath):
def _progress(count, block_size, total_size):
sys.stdout.write('\r>>> Downloading %s %.1f%%'
%(filename, float(count * block_size) / float(total_size) * 100.0))
sys.stdout.flush()
filepath, _ = urllib.request.urlretrieve(url_link, filepath, _progress)
print '\n'
statinfo = os.stat(filepath)
if statinfo.st_size == expected_bytes:
print 'Successfully downloaded', filename, statinfo.st_size, 'bytes.'
else:
raise Exception('File ' + filepath + ' is damaged. Please try to download it again')
return filepath
dataset_file = may_need_download(dataset_url, 'dataset', expected_bytes)
# Read data into a list of strings
def read_data(filename):
"""
    Extract the file; the result is a huge list of strings
"""
with zipfile.ZipFile(filename) as f:
data = f.read(f.namelist()[0]).split()
return data
words_list = read_data(dataset_file)
print "Dataset type %s, with %d words" % (type(words_list), len(words_list))
# Process data
def build_dataset(words_list, vocabulary_size):
count = [['UNK', -1]]
count.extend(collections.Counter(words_list).most_common(vocabulary_size - 1))
dictionary = dict()
for word, _ in count:
dictionary[word] = len(dictionary)
data = list()
unk_count = 0
for word in words_list:
if word in dictionary:
index = dictionary[word]
else:
index = 0 # map to unknown 'unk'
unk_count += 1
data.append(index)
count[0][1] = unk_count
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
return data, count, dictionary, reverse_dictionary
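# Quick illustrative run of build_dataset on a tiny word list (throwaway data):
# infrequent words collapse into 'UNK' and the text is re-encoded as indices
toy_words = ['the', 'quick', 'brown', 'fox', 'the', 'lazy', 'dog', 'the']
toy_data, toy_count, toy_dict, toy_rdict = build_dataset(toy_words, 4)
print toy_count # [['UNK', unk_count], ('the', 3), ...]
print toy_data # the word list re-encoded as dictionary indices
print toy_rdict # index -> word mapping, used for inspection later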
### Get Vocabulary ready
vocabulary_size = 20000
dataset, count, dictionary, r_dictionary = build_dataset(words_list, vocabulary_size)
del words_list
# Check some samples of the dataset
print 'Most common words:', count[:5]
print 'Sample data:', dataset[:10], [r_dictionary[i] for i in dataset[:10]]
# Add parameter saving and restoring capability
def save_params(iter, params, cost_history, target_dir):
""" Function to save paramerters for later to restore. """
if not os.path.exists(target_dir):
os.makedirs(target_dir)
filepath = os.path.join(target_dir, "saved_params_%d.npy" % iter)
with open(filepath, "w") as f:
pickle.dump(params, f)
pickle.dump(cost_history, f)
pickle.dump(random.getstate(), f)
def load_saved_params(target_dir):
""" Restore saved parameters and set iteration number. """
start_point = 0
for f in glob.glob("{}/saved_params_*.npy".format(target_dir)):
iter = int(os.path.splitext(os.path.basename(f))[0].split("_")[-1])
if iter > start_point :
start_point = iter
if start_point > 0:
with open("{}/saved_params_{}.npy".format(target_dir, start_point), "r") as f:
params = pickle.load(f)
cost_history = pickle.load(f)
state = pickle.load(f)
return start_point, params, cost_history, state
else:
return start_point, None, [], None
def sgd_optimizer(func, x0, step, iterations,
PRINT_EVERY=100,
SAVE_PARAMS_EVERY=2000,
use_saved=False,
save_dir='saved_params_dir'):
"""
Stochastic Gradient Descent Optimizer
Inputs:
- func: the function to optimize, it should take a single
argument and yield two outputs, a cost and the gradient
with respect to the arguments
- x0: the initial point to start SGD from
- step: the step size for SGD
- iterations: total iterations to run SGD with
- save_dir: directory for saving parameters
Output:
- x: the parameter value after SGD finishes
- cost_history: the history of cost function value
"""
# Anneal learning rate every several iterations
ANNEAL_EVERY = 5000
x = x0
start_iter = 0
cost_history = []
if use_saved:
start_iter , old_x, cost_history, state = load_saved_params(save_dir)
if start_iter > 0:
            x = old_x
step *= 0.5 ** (start_iter / ANNEAL_EVERY)
if state:
random.setstate(state)
else:
pass
expcost = None
for iter in xrange(start_iter + 1, iterations + 1):
cost = None
cost, grad = func(x)
x -= step * grad
cost_history.append(cost)
if iter % PRINT_EVERY == 0:
if not expcost:
expcost = cost
else:
expcost = .95 * expcost + .05 * cost
print "\r>>> iter %d: %f" % (iter, expcost)
if iter % SAVE_PARAMS_EVERY == 0 and use_saved:
save_params(iter, x, cost_history, save_dir)
if iter % ANNEAL_EVERY == 0:
step *= 0.5
print "\n"
return x, cost_history
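# Before the real training run, a quick toy check that the optimizer works:
# minimizing f(v) = sum(v^2) (gradient 2v) should drive the cost toward zero
toy_x, toy_hist = sgd_optimizer(lambda v: (np.sum(v ** 2), 2 * v),
                                np.array([3.0, 4.0]), 0.1, 200)
print toy_x # should be close to [0, 0]
print toy_hist[-1] # final cost, close to 0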
embedding_size = 64
word_vectors = normalize_vecs(np.random.randn(2*vocabulary_size, embedding_size))
context_window_size = 1
trained_word_vector, cost_history = sgd_optimizer(lambda vec:
word2vec_sgd_batch(skipgram,
vec, dataset,
context_window_size),
word_vectors,
1, 30000, use_saved=True)
#print len(dataset)
# Plot training cost curve
plt.plot(cost_history, 'o')
plt.xlabel('Iteration')
plt.ylabel('Cost')
# Plot the results
final_embeddings = trained_word_vector[:vocabulary_size, :] + trained_word_vector[vocabulary_size:, :]
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
plt.figure(figsize=(18, 18))
for i, label in enumerate(labels):
x, y = low_dim_embs[i, :]
plt.scatter(x, y)
plt.annotate(label,
xy=(x,y),
xytext=(5, 2),
textcoords='offset points',
ha='right',
va='bottom')
plt.savefig(filename)
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
labels = [r_dictionary[i] for i in xrange(plot_only)]
plot_with_labels(low_dim_embs, labels)
# Hand pick several words to show the distances
visualizeWords = ["a", "an", "the", "one", "first", "two", "three", "four",
"good", "great", "well", "poor", "bad", "evil", "god"]
visualizeIdx = [dictionary[word] for word in visualizeWords]
visualizeVecs = final_embeddings[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])
for i in xrange(len(visualizeWords)):
plt.text(coord[i,0], coord[i,1], visualizeWords[i],
bbox=dict(facecolor='green', alpha=0.1))
plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))
plt.savefig('q3_word_vectors.png')
plt.show()
# Check similarity of embedding words for top words
norm = np.sqrt(np.sum(np.square(final_embeddings), 1, keepdims=True))
normalized_embeddings = final_embeddings / norm
# Randomly pick some words from among the most frequent ones
valid_size = 16
valid_example = np.random.choice(50, valid_size, replace=False)
valid_matrix = normalized_embeddings[valid_example,:]
similarity = valid_matrix.dot(normalized_embeddings.T)
for i in xrange(valid_size):
valid_word = r_dictionary[valid_example[i]]
top_k = 8 # number of nearest neighbors
nearest = (similarity[i,:]).argsort()[-top_k-1:-1]
close_words = [r_dictionary[k] for k in nearest]
print "'{}' are closest words to '{}'".format(', '.join(close_words), valid_word)