# coding:utf-8
# @文件: chinese_analyzer.py
# @创建者:州的先生
# @日期:2020/11/22
# 博客地址:zmister.com

import jieba
from whoosh.analysis import Tokenizer, Token


class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with jieba full mode."""

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        """Yield one Token per word jieba finds in ``value``.

        Parameters mirror ``whoosh.analysis.Tokenizer.__call__``:
        ``positions``/``chars`` toggle position and character-offset
        tracking; ``start_pos``/``start_char`` offset the reported values.
        Per Whoosh convention a single Token instance is reused and
        mutated for every word (consumers must copy what they keep).
        """
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        # jieba full mode (cut_all=True) emits possibly-overlapping words
        # in non-decreasing order of their start offset in `value`.
        search_from = 0   # start offset of the previously emitted word
        prev_word = None
        for w in jieba.cut(value, cut_all=True):
            # BUG FIX: the original used bare value.find(w), which always
            # returns the FIRST occurrence, so repeated words all reported
            # the same (wrong) pos/startchar/endchar; it also called
            # find() up to three times per word.  Searching from the
            # previous word's start (one past it for an identical repeat)
            # locates the correct occurrence in a single find() call.
            offset = value.find(
                w, search_from + 1 if w == prev_word else search_from)
            if offset == -1:
                # Defensive fallback; not expected for jieba output.
                offset = max(value.find(w), 0)
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                # NOTE(review): like the original (and jieba's own whoosh
                # analyzer), pos is char-offset based, not a token ordinal.
                t.pos = start_pos + offset
            if chars:
                t.startchar = start_char + offset
                t.endchar = start_char + offset + len(w)
            yield t
            search_from = offset
            prev_word = w


def ChineseAnalyzer():
    """Factory returning a ChineseTokenizer for use as a Whoosh analyzer."""
    # BUG FIX: the original line carried stray scrape residue (" |") after
    # the call, which made it a syntax error.
    return ChineseTokenizer()