# 根据这篇文章，使用 jieba 分词技术实现中文分词，并统计词频。
# load stopwords
from collections import Counter

import jieba
import xlwt
# Load the stopword list: one stopword per line, whitespace stripped.
# `with` guarantees the file handle is closed even if reading raises.
with open("C://Users//Administrator//Desktop//NLPdata//stopwords.txt", 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f]
# Read the whole article into one string.
# txt.read() is equivalent to ''.join(txt.readlines()) but simpler;
# `with` guarantees the handle is closed even on error.
with open("C://Users//Administrator//Desktop//NLPdata//LiaoNing.txt", 'r', encoding='utf-8') as txt:
    text = txt.read()
# Segment the text with jieba and count word frequencies, skipping
# stopwords.  Counter is a single O(n) pass; the original called
# segment.count(ele) for every token, which is O(n^2).
segment = jieba.lcut(text)
stopword_set = set(stopwords)  # O(1) membership test instead of O(n) list scan
word_counts = Counter(tok for tok in segment if tok not in stopword_set)

# Sort (word, count) pairs by frequency, highest first.
# (Avoids shadowing the builtin `dict`, which the original did.)
sorted_counts = word_counts.most_common()

# Create a new Excel file and write one (word, count) pair per row:
# column 0 = word, column 1 = frequency.  Then save it.
excel = xlwt.Workbook(encoding='utf-8')
sheet1 = excel.add_sheet('data')
for row, (word, count) in enumerate(sorted_counts):
    sheet1.write(row, 0, word)
    sheet1.write(row, 1, count)
excel.save('C://Users//Administrator//Desktop//NLPdata//LiaoNing_data.xls')