Optimize document full-text search

yangjian 2021-02-08 21:13:37 +08:00
parent 5a79dcbe4c
commit 4e1c9d5f14
2 changed files with 155 additions and 15 deletions


@@ -4,26 +4,152 @@
# Date: 2020/11/22
# Blog: zmister.com
import jieba
from whoosh.compat import u, text_type
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.tokenizers import default_pattern
from whoosh.lang.porter import stem
from whoosh.analysis import Tokenizer, Token
from whoosh.util.text import rcompile

class ChineseTokenizer(Tokenizer):
    """
    Uses a regular expression to extract tokens from text.
    >>> rex = ChineseTokenizer()
    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
    ["hi", "there", "3.141", "big", "time", "under_score"]
    """

    def __init__(self, expression=default_pattern, gaps=False):
        """
        :param expression: A regular expression object or string, defaulting to
            rcompile(r"\w+(\.?\w+)*"). Each match of the expression equals a token:
            group 0 (the entire matched text) is used as the token text. If you
            need more complex handling of the match, write your own tokenizer.
        :param gaps: If True, the tokenizer *splits* on the expression rather
            than matching it.
        """
        self.expression = rcompile(expression)
        self.gaps = gaps

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.expression.pattern == other.expression.pattern:
                return True
        return False
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            with start_pos=2 the tokens will be numbered 2, 3, 4, ... instead of 0, 1, 2, ...
        :param start_char: The offset of the first character of the first token.
            For example, with start_char=2 the text "aaa bbb" will have chars
            (2, 5), (6, 9) instead of (0, 3), (4, 7).
        :param tokenize: If True, the text should be tokenized.
        """
        # The input must be a unicode string, otherwise raise an AssertionError
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: jieba segmentation results are used as tokens.
            # (The original whoosh code used regex matches instead:)
            # for pos, match in enumerate(self.expression.finditer(value)):
            #     t.text = match.group(0)
            #     t.boost = 1.0
            #     if keeporiginal:
            #         t.original = t.text
            #     t.stopped = False
            #     if positions:
            #         t.pos = start_pos + pos
            #     if chars:
            #         t.startchar = start_char + match.start()
            #         t.endchar = start_char + match.end()
            #     yield t
            seglist = jieba.cut(value, cut_all=True)
            for w in seglist:
                t.original = t.text = w
                t.boost = 1.0
                if positions:
                    # cut_all=True yields overlapping segments, so each segment is
                    # located with find(); repeated words resolve to the first hit
                    t.pos = start_pos + value.find(w)
                if chars:
                    t.startchar = start_char + value.find(w)
                    t.endchar = start_char + value.find(w) + len(w)
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end
                    yield t
                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t


def ChineseAnalyzer(expression=default_pattern, stoplist=None,
                    minsize=2, maxsize=None, gaps=False, stemfn=stem,
                    ignore=None, cachesize=50000):
    """Composes a ChineseTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = ChineseAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern used to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression rather
        than matching it.
    :param ignore: A set of words to not stem.
    :param cachesize: The maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be, but the more memory it will use.
        Use None for no cache, or -1 for an unbounded cache.
    """
    ret = ChineseTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)
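
For reference, a minimal usage sketch of the analyzer above against a throwaway Whoosh index (not part of this commit; the index directory, schema, and field names are illustrative):

# Usage sketch only: exercises ChineseAnalyzer outside of MrDoc/Haystack.
import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser

schema = Schema(doc_id=ID(stored=True),
                content=TEXT(stored=True, analyzer=ChineseAnalyzer()))
os.makedirs("tmp_whoosh_idx", exist_ok=True)
ix = create_in("tmp_whoosh_idx", schema)

with ix.writer() as writer:
    # Index one short Chinese document; jieba segments it at index time.
    writer.add_document(doc_id="1", content="MrDoc 支持文档的全文搜索")

with ix.searcher() as searcher:
    # The query string is run through the same analyzer before matching.
    query = QueryParser("content", ix.schema).parse("全文搜索")
    print([hit["doc_id"] for hit in searcher.search(query)])  # expect ["1"]

Because the tokenizer calls jieba with cut_all=True, the index stores every candidate segmentation of the text, which trades some precision for better recall on short Chinese queries.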


@@ -5,13 +5,26 @@
# Blog: zmister.com
# from haystack.generic_views import SearchView
from haystack.generic_views import SearchView as BaseSearchView
from django.db.models import Q
from haystack.views import SearchView
from haystack.query import SearchQuerySet
from app_doc.models import *
import datetime


class DocSearchView2(BaseSearchView):
    def get_queryset(self):
        queryset = super(DocSearchView2, self).get_queryset()
        # further filter queryset based on some set of criteria
        return queryset.filter(pub_date__gte=datetime.date(2015, 1, 1))

    def get_context_data(self, *args, **kwargs):
        context = super(DocSearchView2, self).get_context_data(*args, **kwargs)
        # do something
        return context
# Document search - Haystack-based full-text search
class DocSearchView(SearchView):
    results_per_page = 10
@@ -66,18 +79,19 @@ class DocSearchView(SearchView):
            view_list = list(set(open_list).union(set(colla_list)))  # merge the two project ID lists above
        else:
            view_list = [i.id for i in Project.objects.filter(role=0)]  # public projects
        if len(view_list) > 0:
            sqs = SearchQuerySet().filter(
                top_doc__in=view_list
            ).filter(
                modify_time__gte=start_date,
                modify_time__lte=end_date)
        else:
            sqs = SearchQuerySet().filter(
                top_doc__in=None
            ).filter(
                modify_time__gte=start_date,
                modify_time__lte=end_date)
        self.form = self.build_form(form_kwargs={'searchqueryset': sqs})
        self.query = self.get_query().replace("\n", '').replace("\r", '')
        self.results = self.get_results()
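
For completeness, a sketch of how a project would point Haystack at a Whoosh backend that uses the analyzer above (not part of this commit; the dotted ENGINE path and index directory are assumptions and should be replaced with the project's actual custom backend module):

# settings.py sketch: wiring Haystack to a Whoosh backend with ChineseAnalyzer.
# The ENGINE path below is an assumed custom backend module (e.g. a local
# whoosh_cn_backend.py that swaps in ChineseAnalyzer); adjust it to the real path.
import os

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # standard Django settings constant

HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'app_doc.search.whoosh_cn_backend.WhooshEngine',
        'PATH': os.path.join(BASE_DIR, 'whoosh_index'),
    },
}
# After changing the analyzer, rebuild the index so existing documents are
# re-tokenized:  python manage.py rebuild_index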