import pandas as pd
from sklearn.decomposition import PCA
from gensim.models import KeyedVectors


def load_embeddings(path, binary=False, randomPCA=False, limit=None):
    """Load word2vec-format embeddings, project them to 2-D with PCA,
    and return a case-insensitive DataFrame of words, vectors, and
    PCA coordinates."""
    if randomPCA:
        # Randomized SVD is faster on large vocabularies, at a small
        # cost in precision.
        pca = PCA(n_components=2, copy=False, whiten=False,
                  svd_solver='randomized', iterated_power='auto')
    else:
        pca = PCA(n_components=2)

    print("--------> PATH:", path)
    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)

    # Cased vocabulary: words, their unit-normalized vectors, and the
    # 2-D PCA projection of those vectors.
    cased_words = model.index_to_key
    cased_emb = model.get_normed_vectors()
    cased_pca = pca.fit_transform(cased_emb)

    df_cased = pd.DataFrame(
        zip(cased_words, cased_emb, cased_pca),
        columns=['word', 'embedding', 'pca']
    )

    # Lowercase every word, then keep only the first occurrence among
    # case variants (drop_duplicates keeps the first row by default).
    df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
    df_uncased = df_cased.drop_duplicates(subset='word')

    return df_uncased


# load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)
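
# Usage sketch, expanding the commented-out call above. It assumes a
# word2vec-format file exists at 'data/fasttext-sbwc.100k.vec'; `limit`
# caps loading to the first (most frequent) 1000 vocabulary entries.
if __name__ == '__main__':
    df = load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)
    print(df.shape)   # rows = unique lowercased words
    print(df.head())  # columns: word, embedding, pca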