gensim - word2vec实战

介绍如何利用 gensim 库建立简单的 word2vec 模型。

# -*- coding: utf-8 -*-
import gensim
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import os
import logging
import jieba
import re
import multiprocessing
import sys
# Python 2 only: re-import `sys` and make UTF-8 the interpreter-wide default
# encoding (`reload` is not a builtin on Python 3).
# NOTE(review): presumably required so the implicit str/unicode conversions
# further down (writing the jieba tokens to disk, formatting the similar
# words) do not fail on Chinese text -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf-8')

# Logging: emit "<time>: <level>: <message>" records and let everything from
# INFO upwards through the root logger.
LOG_FORMAT = '%(asctime)s: %(levelname)s: %(message)s'
logging.basicConfig(format=LOG_FORMAT)
logging.getLogger().setLevel(logging.INFO)

# get input file, text format
# Validate the command line *before* reading sys.argv[1]; otherwise a missing
# argument raises IndexError and the usage message is never reached.
if len(sys.argv) < 2:
    # No module docstring is visible in this script, so formatting
    # globals()['__doc__'] would fail on None; print an explicit usage line.
    sys.stderr.write('Usage: python %s <input_text_file>\n' % sys.argv[0])
    sys.exit(1)

inp = sys.argv[1]
# NOTE: `input` shadows the builtin of the same name; the name is kept because
# the segmentation loop that follows reads from it.
input = open(inp, 'r')
# Segmented text (space-separated tokens, one line per input line) goes here.
output = open('output.seq', 'w')

# read file and separate words
# Stream the input line by line (instead of readlines(), which loads the whole
# corpus into memory), segment each line with jieba and write the tokens
# separated by single spaces.
try:
    for line in input:
        line = line.strip('\n')
        seg_list = jieba.cut(line)
        output.write(' '.join(seg_list) + '\n')
finally:
    # Release both handles even if segmentation fails part-way through.
    input.close()
    output.close()

# Reopen the segmented file for reading; this handle is what gets passed to
# LineSentence when the model is built.
output = open('output.seq', 'r')

# Build the word2vec model from the segmented corpus.
#   size      - dimensionality of the feature vectors
#   window    - maximum distance between the current and predicted word
#               within a sentence
#   min_count - words whose total frequency is lower than this are ignored
#   workers   - set to the CPU count reported by multiprocessing
sentences = LineSentence(output)
n_workers = multiprocessing.cpu_count()
model = Word2Vec(
    sentences,
    size=100,
    window=3,
    min_count=5,
    workers=n_workers,
)

# save model
# Full model, reloadable with Word2Vec.load().
model.save('output.model')
# Plain-text vectors in word2vec format. gensim releases that keep the word
# vectors on `model.wv` reject the model-level save_word2vec_format(), so use
# `model.wv` when it exists and fall back to the model itself on older
# releases that do not have that attribute.
vectors = getattr(model, 'wv', model)
vectors.save_word2vec_format('output.vector', binary=False)

# test
# Reload the model saved above and list the nearest neighbours of a sample word.
model = gensim.models.Word2Vec.load('output.model')
# Query through `model.wv` when the installed gensim provides it (the
# model-level most_similar() is deprecated there); older releases without
# `wv` fall back to the model itself.
lookup = getattr(model, 'wv', model)
for word, similarity in lookup.most_similar([u'奖励']):
    # A parenthesised single-argument print prints the same text on Python 2
    # and is also valid syntax on Python 3.
    print("Word: {}\t Similarity: {}".format(word, similarity))

更多代码

徐阿衡 wechat
欢迎关注:徐阿衡的微信公众号
客官,打个赏呗~