(原创文章,转载请标明出处) 主要特点如下: 1. 制作了两个自定义字典(三国演义人物名、三国演义官职一览表)
《三国演义》词频统计
使用Jieba分词,统计《三国演义》的词频,最后生成词云
# 2. Uses a stop-word dictionary.
#
# Word-frequency analysis of "Romance of the Three Kingdoms" (三国演义):
# tokenize with jieba (plus custom user dictionaries), remove stop words,
# count frequencies, export a CSV, and render a word cloud with pyecharts.
import re
import jieba
import csv
from collections import Counter
from pyecharts import options as opts
from pyecharts.charts import Page, WordCloud
from pyecharts.globals import SymbolType


def ReadText(filename):
    """Read and return the entire contents of *filename* (UTF-8)."""
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()
    return text


def CutWords(text, *filelist):
    """Tokenize *text* with jieba and return words longer than one character.

    *filelist*: paths of custom user dictionaries loaded into jieba before
    tokenizing (e.g. character names, official titles).
    """
    # Keep only CJK ideographs. BUGFIX: the original pattern
    # '([u4e00-u9fa5]+)' had lost its backslashes and the '^' negation,
    # so it deleted the ASCII characters u/4/e/f/a/0-9 instead of
    # stripping non-Chinese text as its comment claimed.
    text = re.sub(r'[^\u4e00-\u9fa5]+', '', text)
    for file in filelist:
        jieba.load_userdict(file)
    words = jieba.lcut(text)  # segment into words
    words = [word for word in words if len(word) > 1]  # drop single-char tokens
    return words


def StopWords(words, stopfile):
    """Return *words* with every entry listed in *stopfile* removed.

    *stopfile*: UTF-8 text file with one stop word per line.
    """
    with open(stopfile, 'r', encoding='utf-8') as f:
        stoplist = f.readlines()
    # BUGFIX: the original called strip('n') (a lost backslash for '\n'),
    # which removed literal letters 'n' from stop words. strip() removes
    # the newline and any stray whitespace. A set gives O(1) membership
    # tests instead of scanning a list per word.
    stopset = {stop.strip() for stop in stoplist}
    words = [word for word in words if word not in stopset]
    return words


def WriteCSV(filename, freqdict, num=0):
    """Write the *num* most frequent words of Counter *freqdict* to a CSV.

    num=0 (the default) writes every word. Output columns: word, count.
    """
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        if num == 0:
            num = len(freqdict.keys())  # default: include all words
        freqlist = freqdict.most_common(num)  # list of (word, count) tuples
        writer = csv.writer(f)
        writer.writerow(('词汇', '词频'))
        for word, count in freqlist:
            writer.writerow((word, count))


if __name__ == '__main__':
    text = ReadText('三国演义.txt')
    text = re.sub('曰', '', text)  # manually drop the classical "said" marker
    filelist = ['三国演义人物名.txt', '三国演义官职一览表.txt']  # user dictionaries
    words = CutWords(text, filelist[0], filelist[1])
    newwords = StopWords(words, stopfile='stop_words.txt')
    wordfreq = Counter(newwords)  # word -> frequency
    WriteCSV('三国演义词频统计.csv', wordfreq, num=50)  # Top 50

    # Render the word-cloud chart to a standalone HTML page.
    wordcloud = WordCloud()
    wordcloud.add('', wordfreq.most_common(50), word_size_range=[20, 100])
    wordcloud.set_global_opts(title_opts=opts.TitleOpts(title='三国演义词云Top50'))
    wordcloud.render('三国演义词云图Top50.html')
本网页所有视频内容由 imoviebox边看边下-网页视频下载, iurlBox网页地址收藏管理器 下载并得到。
ImovieBox网页视频下载器 下载地址: ImovieBox网页视频下载器-最新版本下载
本文章由: imapbox邮箱云存储,邮箱网盘,ImageBox 图片批量下载器,网页图片批量下载专家,网页图片批量下载器,获取到文章图片,imoviebox网页视频批量下载器,下载视频内容,为您提供.
阅读和此文章类似的: 全球云计算