import numpy as np
from sklearn.preprocessing import normalize
import requests
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2
class WikiEmbedding:
    """Word embeddings loaded from a word2vec-style text dump.

    The file's first line is expected to be a ``<vocab_size> <dim>`` header;
    each following line is ``<word> <v1> ... <v_dim>``.  Rows of ``E`` are
    L2-normalized, so dot products against ``E`` are cosine similarities.
    """

    def __init__(self, fname):
        """Load embeddings from *fname*, skipping malformed lines.

        Lines that fail to decode as UTF-8, carry the wrong number of
        components, or contain non-numeric components are dropped, and the
        matrix is trimmed so that row ``k`` always corresponds to
        ``idx2w[k]`` (the original code left skipped rows as zeros, which
        silently misaligned ``E`` and ``idx2w``).
        """
        self.w2idx = {}
        self.idx2w = []
        with open(fname, 'rb') as f:
            header = next(f)
            try:
                # word2vec text header: "<vocab_size> <dim>".
                m, n = (int(tok) for tok in header.split())
            except ValueError:
                # Fall back to the known size of this Wikipedia dump.
                m, n = 1828514, 100
            self.E = np.zeros((m, n))
            row = 0  # next free row; advanced only on a successful parse
            for line in f:
                try:
                    parts = line.decode('utf8').strip().split(' ')
                except UnicodeDecodeError:
                    continue
                # The raw dumps contain occasional truncated lines (e.g. 99
                # of 100 components), which previously raised ValueError on
                # assignment into E — skip them instead.
                if len(parts) != n + 1:
                    continue
                try:
                    vec = np.array(parts[1:], dtype=float)
                except ValueError:
                    continue  # non-numeric component
                self.E[row] = vec
                self.w2idx[parts[0]] = row
                self.idx2w.append(parts[0])
                row += 1
            self.E = self.E[:row]  # drop rows that were never filled
        # Zero-safe L2 row normalization, behaviorally equivalent to
        # sklearn.preprocessing.normalize but dependency-free.
        norms = np.linalg.norm(self.E, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        self.E = self.E / norms
        self.idx2w = np.array(self.idx2w)

    def most_similar(self, w, n=10, min_similarity=0.5):
        """
        Find the top-N most similar words to w, based on cosine similarity.
        As a speed optimization, only consider neighbors with a similarity
        above min_similarity.

        ``w`` may be a word (looked up in the vocabulary) or a raw vector.
        NOTE(review): the top-ranked hit is dropped (``[1:]``) on the
        assumption that it is the query word itself; when a vector not in
        the vocabulary is passed, the best match is silently discarded.
        """
        if isinstance(w, str):
            w = self.E[self.w2idx[w]]
        scores = self.E.dot(w)
        # Only consider neighbors above the threshold.
        min_idxs = np.where(scores > min_similarity)
        ranking = np.argsort(-scores[min_idxs])[1:(n + 1)]
        nn_ws = self.idx2w[min_idxs][ranking]
        nn_scores = scores[min_idxs][ranking]
        return list(zip(list(nn_ws), list(nn_scores)))
# [notebook output] The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
en_embedding = WikiEmbedding('2017-01-01_2017-01-30_en_100')
# [notebook output — recorded error]
# ValueError Traceback (most recent call last)
# <ipython-input-6-b0aa5fcb03e7> in <module>()
# ----> 1 en_embedding = WikiEmbedding('2017-01-01_2017-01-30_en_100')
# <ipython-input-5-c87c2313e6c5> in __init__(self, fname)
#      29     continue
#      30     w = l[0]
# ---> 31     self.E[i] = np.array(l[1:])
#      32     self.w2idx[w] = i
#      33     self.idx2w.append(w)
# ValueError: could not broadcast input array from shape (99) into shape (100)
en_embedding.most_similar('Word2vec')