# coding:utf-8
# @File: chinese_analyzer.py
# @Author: 州的先生
# @Date: 2020/11/22
# Blog: zmister.com

from whoosh.compat import u, text_type
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.tokenizers import default_pattern
from whoosh.lang.porter import stem
from whoosh.analysis import Tokenizer, Token
from whoosh.util.text import rcompile
import jieba


class ChineseTokenizer(Tokenizer):
    """
    Extracts tokens from text. In the default branch (``gaps=False``) the text
    is segmented with jieba in full mode rather than by matching the regular
    expression; with ``gaps=True`` the expression is used to split the text.

    >>> rex = ChineseTokenizer()
    >>> [token.text for token in rex(u("我来到北京清华大学"))]
    ["我", "来到", "北京", "清华", "清华大学", "华大", "大学"]
    """

    def __init__(self, expression=default_pattern, gaps=False):
        r"""
        :param expression: A regex object or string, defaulting to
            rcompile(r"\w+(\.?\w+)*"). Each match of the expression equals a
            token. Group 0 (the entire matched text) is used as the text of
            the token. If you require more complicated handling of the
            expression match, simply write your own tokenizer.
        :param gaps: If True, the tokenizer *splits* on the expression rather
            than matching on it.
        """
        self.expression = rcompile(expression)
        self.gaps = gaps

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.expression.pattern == other.expression.pattern:
                return True
        return False

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            with start_pos=2 the tokens will be numbered 2,3,4,... instead of
            0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, with start_char=2 the text "aaa bbb" will have
            character offsets (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: If True, the text should be tokenized.
        """
        # Raise if the input is not a unicode string.
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default branch. The original RegexTokenizer loop, in which
            # expression matches are used as tokens, is kept here for
            # reference:
            # for pos, match in enumerate(self.expression.finditer(value)):
            #     t.text = match.group(0)
            #     t.boost = 1.0
            #     if keeporiginal:
            #         t.original = t.text
            #     t.stopped = False
            #     if positions:
            #         t.pos = start_pos + pos
            #     if chars:
            #         t.startchar = start_char + match.start()
            #         t.endchar = start_char + match.end()
            #     yield t

            # Instead, segment the text with jieba in full mode
            # (cut_all=True), which also emits overlapping segments.
            seglist = jieba.cut(value, cut_all=True)
            for pos, w in enumerate(seglist):
                t.original = t.text = w
                t.boost = 1.0
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    # str.find() locates only the first occurrence of w, so
                    # character offsets are approximate for repeated words.
                    idx = value.find(w)
                    t.startchar = start_char + idx
                    t.endchar = start_char + idx + len(w)
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end
                    yield t
                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t


def ChineseAnalyzer(expression=default_pattern, stoplist=None, minsize=2,
                    maxsize=None, gaps=False, stemfn=stem, ignore=None,
                    cachesize=50000):
    """Composes a ChineseTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = ChineseAnalyzer(stoplist=STOP_WORDS)
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression used to extract tokens (only
        relevant when gaps=True; the default branch segments text with jieba).
    :param stoplist: A list of stop words. Set this to None to disable
        stop-word filtering.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression rather
        than matching on it.
    :param ignore: A set of words to not stem.
    :param cachesize: The maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be, but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """
    ret = ChineseTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)
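

if __name__ == "__main__":
    # Minimal usage sketch: build a small whoosh index with ChineseAnalyzer
    # and run one query against it. Assumes whoosh and jieba are installed;
    # the "indexdir" directory name and the field names are illustrative only.
    import os

    from whoosh.fields import ID, TEXT, Schema
    from whoosh.index import create_in
    from whoosh.qparser import QueryParser

    analyzer = ChineseAnalyzer()
    schema = Schema(path=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))

    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = create_in("indexdir", schema)

    # Index a single document; its content is segmented by ChineseTokenizer.
    writer = ix.writer()
    writer.add_document(path=u"/doc1", content=u"我来到北京清华大学")
    writer.commit()

    # Query for one of the segments produced by jieba's full-mode cut.
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(u"北京")
        for hit in searcher.search(query):
            print(hit["path"], hit["content"])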