Python常用功能函数系列总结(二)
常用函数二:文本分词
方式一:jieba分词+停用词+自定义词典
# -*- coding: utf-8 -*-
"""
Datetime: 2020/06/25
Author: Zhang Yafei
Description: Text segmentation.

Input:
    stop-word file path
    user-dictionary file path
    path of the Excel file to segment
    sheet name (optional)
    name of the column to segment
    name of the column that receives the segmentation result
    output file name
Output:
    segmentation result written to a file
"""
import os
import re

import jieba
import pandas as pd

# Directory the example at the bottom of this script writes its result into.
if not os.path.exists('res'):
    os.mkdir('res')

# Matches every run of characters that is neither an ASCII letter/digit nor
# inside U+4E00-U+9FA5. Compiled once instead of on every call.
_NON_WORD_RE = re.compile(r"[^0-9a-zA-Z\u4e00-\u9fa5]+")


class TextCut(object):
    """Segment text with jieba, filtering by stop words and (optionally) a word list."""

    def __init__(self, dictionary=None, stopwords=None, ):
        """
        :param dictionary: path of a jieba user dictionary; loaded into jieba when given
        :param stopwords: path of a UTF-8 stop-word file, one word per line
        """
        self.dictionary = dictionary
        # Filled from the dictionary file the first time it is needed by ``cut2``.
        self.word_list = None
        if self.dictionary:
            jieba.load_userdict(self.dictionary)
        if stopwords:
            with open(stopwords, 'r', encoding='utf-8') as swf:
                # A set makes the per-token membership test O(1).
                self.stopwords = {line.strip() for line in swf}
        else:
            self.stopwords = None

    @staticmethod
    def clean_txt(raw):
        """Replace every run of characters matched by ``_NON_WORD_RE`` with one space."""
        return _NON_WORD_RE.sub(' ', raw)

    def _load_word_list(self):
        """Return the stripped lines of the dictionary file as a set.

        :raises ValueError: if no dictionary path was given to the constructor
        """
        if not self.dictionary:
            raise ValueError('a dictionary file is required to filter by word list')
        with open(self.dictionary, encoding='utf-8') as f:
            # NOTE(review): whole lines are used as words, as in the original;
            # lines carrying extra columns (frequency / tag) will never match a
            # token - confirm the dictionary holds one bare word per line.
            return {word.strip() for word in f}

    def _tokens(self, text):
        """Return the jieba tokens of *text* that survive the common filters.

        Tokens are dropped when they are blank, a single character long, or a
        stop word. Non-string cells (e.g. NaN from an empty Excel cell) yield
        no tokens instead of raising.
        """
        if not isinstance(text, str):
            return []
        sentence = self.clean_txt(text.strip().replace('\n', ''))
        # ``self.stopwords`` is None when no stop-word file was supplied.
        stopwords = self.stopwords or ()
        return [
            tok for tok in jieba.cut(sentence)
            if tok.strip() and len(tok) > 1 and tok not in stopwords
        ]

    def cut(self, text):
        """Segment *text* and return the kept tokens joined by single spaces."""
        return ' '.join(self._tokens(text))

    def cut2(self, text):
        """Like :meth:`cut`, but keep only tokens present in the dictionary word list."""
        if self.word_list is None:
            self.word_list = self._load_word_list()
        return ' '.join(tok for tok in self._tokens(text) if tok in self.word_list)

    def run(self, file_path, col_name, new_col_name, to_file, sheet_name=None, word_in_dict=False):
        """Segment one column of an Excel sheet and save the sheet with the result column.

        :param file_path: Excel file to read
        :param col_name: column holding the text to segment
        :param new_col_name: column that receives the segmentation result
        :param to_file: Excel file to write
        :param sheet_name: sheet to read; the first sheet when omitted
        :param word_in_dict: keep only tokens listed in the dictionary file
        :raises ValueError: if ``word_in_dict`` is set but no dictionary was given
        """
        if sheet_name:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        else:
            df = pd.read_excel(file_path)
        if word_in_dict:
            # Reloaded on every run so edits to the dictionary file are picked up.
            self.word_list = self._load_word_list()
            df[new_col_name] = df[col_name].apply(self.cut2)
        else:
            df[new_col_name] = df[col_name].apply(self.cut)
        df.to_excel(to_file, index=False)
        print('######### 处理完成 ############')


if __name__ == "__main__":
    # 1. Segmentation
    text_cut = TextCut(stopwords='data/stopwords.txt', dictionary='data/word_dict.txt')
    text_cut.run(file_path='data/山西政策.xlsx', sheet_name='1.21-2.20', col_name='全文',
                 new_col_name='全文分词', to_file='res/山西政策_分词.xlsx')
方式二:jieba分词+信息熵合并
# -*- coding: utf-8 -*-
"""
Datetime: 2020/03/01
Author: Zhang Yafei
Description: Merge adjacent segmented words into candidate new words, based on
how often the pair occurs relative to its member words.
"""
from collections import Counter
from functools import reduce

from pandas import read_excel, DataFrame


class InfoEntropyMerge(object):
    """Score adjacent word pairs and collect the high-scoring ones as merged words."""

    def __init__(self, data, stopwords='data/stopwords.txt'):
        """
        :param data: iterable of token lists, one list per document
        :param stopwords: path of a UTF-8 stop-word file (one word per line);
            pass a falsy value to disable stop-word filtering
        """
        self.data = data
        self.words_freq_one = {}
        self.words_freq_two = {}
        self.entropy_words_dict = {}
        if stopwords:
            with open(stopwords, 'r', encoding='utf-8') as f:
                self.stopwords = {line.strip() for line in f}
        else:
            self.stopwords = None

    @staticmethod
    def _save_freq(freq, word_freq_file):
        """Write a Counter to an Excel file as ``word``/``freq`` columns, most frequent first."""
        ranked = freq.most_common()
        words_df = DataFrame(data={'word': [word for word, _ in ranked],
                                   'freq': [count for _, count in ranked]})
        words_df.to_excel(word_freq_file, index=False)

    def count_word_freq_one(self, save_to_file=False, word_freq_file=None):
        """Count how often every non-empty word occurs in ``self.data``.

        :param save_to_file: also write the counts to ``word_freq_file``
        :param word_freq_file: Excel path; nothing is written when it is empty
        """
        self.words_freq_one = Counter(
            word for word_list in self.data for word in word_list if word
        )
        # Same guard as ``count_word_freq_two``: saving needs both the flag and a path.
        if save_to_file and word_freq_file:
            self._save_freq(self.words_freq_one, word_freq_file)

    def count_freq(self, word1, word2):
        """Record one occurrence of the adjacent pair ``(word1, word2)``.

        Returns ``word2`` so the method can be used as a ``reduce`` step.

        :param word1: left word of the pair
        :param word2: right word of the pair
        :return: ``word2``
        """
        pair = (word1, word2)
        self.words_freq_two[pair] = self.words_freq_two.get(pair, 0) + 1
        return word2

    def count_word_freq_two(self, save_to_file=False, word_freq_file=None):
        """Count how often every pair of adjacent words occurs in ``self.data``.

        The counts are rebuilt from scratch on every call, so calling the
        method twice does not double them. Empty and one-word documents
        contribute no pairs.

        :param save_to_file: also write the counts to ``word_freq_file``
        :param word_freq_file: Excel path; nothing is written when it is empty
        """
        pair_counts = Counter()
        for word_list in self.data:
            words = list(word_list)
            # zip over the shifted list yields nothing for fewer than two
            # words, where ``reduce`` without an initial value would raise.
            pair_counts.update(zip(words, words[1:]))
        self.words_freq_two = pair_counts
        if save_to_file and word_freq_file:
            self._save_freq(pair_counts, word_freq_file)

    @staticmethod
    def is_chinese(word):
        """Return True when at least one character of *word* lies in U+4E00-U+9FFF."""
        return any('\u4e00' <= ch <= '\u9fff' for ch in word)

    def clac_entropy(self, save_to_file=False, dict_path='data/entropy_dict.txt'):
        """Score every adjacent pair and keep those scoring above 0.5.

        The score computed here is
        ``count(w1, w2) / max(count(w1), count(w2))``.
        NOTE(review): the original docstring stated ``min`` in the denominator
        while the code has always divided by the ``max``; the code's behaviour
        is kept - confirm which one is intended.

        When stop words are loaded, a pair is additionally required to contain
        no stop word and each word must contain a Chinese character. Kept pairs
        are stored in ``self.entropy_words_dict`` keyed by the concatenated word.

        Word and pair counts are computed on demand if they are still empty.

        :param save_to_file: prepend the kept words to the file at ``dict_path``
        :param dict_path: dictionary file; created if it does not exist yet
        :return: None
        """
        if not self.words_freq_one:
            self.count_word_freq_one()
        if not self.words_freq_two:
            self.count_word_freq_two()

        for (word1, word2), freq_two in self.words_freq_two.items():
            freq_one_max = max(self.words_freq_one.get(word1, 0),
                               self.words_freq_one.get(word2, 0))
            if not freq_one_max:
                # Both members were skipped by the single-word count (empty strings).
                continue
            w1_w2_entropy = freq_two / freq_one_max
            if w1_w2_entropy <= 0.5:
                continue
            if self.stopwords:
                if word1 in self.stopwords or word2 in self.stopwords:
                    continue
                if not (self.is_chinese(word1) and self.is_chinese(word2)):
                    continue
            self.entropy_words_dict[word1 + word2] = w1_w2_entropy

        print('信息熵大于0.5的词语组合:\n', self.entropy_words_dict)
        if save_to_file and dict_path:
            # Read whatever is already there so the new words can be written
            # in front of it; a missing file simply has no previous content.
            try:
                with open(dict_path, mode='r', encoding='utf-8') as f:
                    content = f.read()
            except FileNotFoundError:
                content = ''
            with open(dict_path, mode='w', encoding='utf-8') as f:
                f.writelines(word + '\n' for word in self.entropy_words_dict)
                f.write(content)
            print(f'成功将信息熵大于0.5的词语保存到了{dict_path}中')


def data_read(path, col_name):
    """Read an Excel file and return the non-null cells of *col_name* split on whitespace."""
    df = read_excel(path)
    texts = df.loc[df[col_name].notna(), col_name].str.split()
    return texts


if __name__ == '__main__':
    text_list = data_read(path='res/国家政策_分词.xlsx', col_name='全文分词')
    info_entro = InfoEntropyMerge(data=text_list)
    info_entro.count_word_freq_one()
    info_entro.count_word_freq_two()
    info_entro.clac_entropy(save_to_file=False, dict_path='data/entropy_dict.txt')
经验分享:若有好的词典和停用词,优先选用方式一,否则选择方式二。
常用函数三:词频统计
# -*- coding: utf-8 -*-
"""
Datetime: 2020/06/25
Author: Zhang Yafei
Description: Count word frequencies.

Input:
    file name
    column name
    separator
Output:
    word-frequency result written to a file
"""
from collections import Counter

import pandas as pd


def _word_freq_frame(df, col_name, sep):
    """Return a ``word``/``freq`` DataFrame for one column, most frequent word first."""
    cells = df.loc[df[col_name].notna(), col_name].str.split(sep)
    words_freq = Counter(word for word_list in cells for word in word_list if word)
    ranked = words_freq.most_common()
    return pd.DataFrame(data={'word': [word for word, _ in ranked],
                              'freq': [count for _, count in ranked]})


def count_word_freq(file_path, col_name, to_file, sep='; ', multi_table=False):
    """
    Count word frequencies in one column of an Excel file.

    :param file_path: path of the Excel file to read
    :param col_name: name of the column whose words are counted
    :param to_file: path of the Excel file to write
    :param sep: separator between words inside a cell; it is now honoured in
        both modes (the single-sheet mode used to ignore it and always split
        on whitespace). Pass ``None`` to split on whitespace.
    :param multi_table: when true, process every sheet of the workbook and
        write one result sheet per input sheet, keeping the sheet names
    :return: None
    """
    if multi_table:
        # The header row must be parsed so that ``col_name`` can be looked up;
        # reading with ``header=None`` would leave only integer column labels.
        datas = pd.read_excel(file_path, sheet_name=None)
        # The context manager saves and closes the workbook on exit; an
        # explicit ``writer.save()`` is redundant and gone in pandas 2.0.
        with pd.ExcelWriter(to_file) as writer:
            for sheet_name, df in datas.items():
                words_df = _word_freq_frame(df, col_name, sep)
                words_df.to_excel(excel_writer=writer, sheet_name=sheet_name, index=False)
    else:
        df = pd.read_excel(file_path)
        _word_freq_frame(df, col_name, sep).to_excel(to_file, index=False)


if __name__ == '__main__':
    # Count word frequencies of the "keyword" column in every sheet of
    # data.xlsx, splitting cells on the default '; ' separator, and save the
    # result to res.xlsx.
    count_word_freq(file_path='data.xlsx', col_name='keyword', to_file='res.xlsx', multi_table=True)
经验分享:注意输入格式为 Excel 文件,这也是我学习生活中常用的处理方式,直接拿去用,非常方便。
另外,在我之前的一篇博客中,我介绍了Python统计词频常用的几种方式,不同的场景可以满足你各自的需求。博客传送门:https://www.cnblogs.com/zhangyafei/p/10653977.html
相关推荐
夜斗不是神 2020-11-17
huavhuahua 2020-11-20
Yasin 2020-11-16
xiaoseyihe 2020-11-16
千锋 2020-11-15
diyanpython 2020-11-12
chunjiekid 2020-11-10
wordmhg 2020-11-06
世事一场大梦 2020-11-17
xiaoseyihe 2020-11-16
Morelia 2020-11-03
CloudXli 2020-11-03
文山羊 2020-10-31
comtop0 2020-10-31
pythonxuexi 2020-10-30
三石 2020-10-29
chaochao 2020-10-27
PythonMaker 2020-10-27