From 4e1c9d5f14064a0930636e73cdb7bf75d936c9c8 Mon Sep 17 00:00:00 2001
From: yangjian
Date: Mon, 8 Feb 2021 21:13:37 +0800
Subject: [PATCH] Optimize full-text document search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app_doc/search/chinese_analyzer.py | 150 ++++++++++++++++++++++++++---
 app_doc/views_search.py            |  20 +++-
 2 files changed, 155 insertions(+), 15 deletions(-)

diff --git a/app_doc/search/chinese_analyzer.py b/app_doc/search/chinese_analyzer.py
index b579884..5d7f85f 100644
--- a/app_doc/search/chinese_analyzer.py
+++ b/app_doc/search/chinese_analyzer.py
@@ -4,26 +4,152 @@
 # #日期:2020/11/22
 # 博客地址:zmister.com
-import jieba
+from whoosh.compat import u, text_type
+from whoosh.analysis.filters import LowercaseFilter
+from whoosh.analysis.filters import StopFilter, STOP_WORDS
+from whoosh.analysis.morph import StemFilter
+from whoosh.analysis.tokenizers import default_pattern
+from whoosh.lang.porter import stem
 from whoosh.analysis import Tokenizer, Token
+from whoosh.util.text import rcompile
+import jieba
+
 
 class ChineseTokenizer(Tokenizer):
-    def __call__(self, value, positions=False, chars=False,
-                 keeporiginal=False, removestops=True,
-                 start_pos=0, start_char=0, mode='', **kwargs):
+    """
+    Uses a regular expression to extract tokens from text.
+    >>> rex = ChineseTokenizer()
+    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
+    ["hi", "there", "3.141", "big", "time", "under_score"]
+    """
+    def __init__(self, expression=default_pattern, gaps=False):
+        """
+        :param expression: a regular expression object or string, defaulting to
+            rcompile(r"\w+(\.?\w+)*"). Each match of the expression yields one
+            token. Group 0 (the whole matched text) is used as the token text.
+            If you need more complex handling of the match, simply write your
+            own tokenizer.
+        :param gaps: if True, the tokenizer *splits* on the expression rather
+            than matching it.
+        """
+        self.expression = rcompile(expression)
+        self.gaps = gaps
+    def __eq__(self, other):
+        if self.__class__ is other.__class__:
+            if self.expression.pattern == other.expression.pattern:
+                return True
+        return False
+    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
+                 removestops=True, start_pos=0, start_char=0, tokenize=True,
+                 mode='', **kwargs):
+        """
+        :param value: the Unicode string to tokenize.
+        :param positions: whether to record token positions in the token.
+        :param chars: whether to record character offsets in the token.
+        :param start_pos: the position number of the first token. For example,
+            if you set start_pos=2, the token positions will be 2, 3, 4, ...
+            instead of 0, 1, 2, ...
+        :param start_char: the offset of the first character of the first
+            token. For example, if you set start_char=2, the text "aaa bbb"
+            will have character ranges (2, 5), (6, 9) instead of (0, 3), (4, 7).
+        :param tokenize: if True, the text should be tokenized.
+        """
+        # Raise if the value passed in is not a (unicode) text string
+        assert isinstance(value, text_type), "%s is not unicode" % repr(value)
         t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
-        seglist = jieba.cut(value, cut_all=True)
-        for w in seglist:
-            t.original = t.text = w
+        if not tokenize:
+            t.original = t.text = value
             t.boost = 1.0
             if positions:
-                t.pos = start_pos + value.find(w)
+                t.pos = start_pos
             if chars:
-                t.startchar = start_char + value.find(w)
-                t.endchar = start_char + value.find(w) + len(w)
+                t.startchar = start_char
+                t.endchar = start_char + len(value)
             yield t
+        elif not self.gaps:
+            # The default: expression matches are used as tokens.
+            # The upstream regex loop is kept below for reference; the tokens
+            # actually come from jieba's full-mode segmentation.
+            # for pos, match in enumerate(self.expression.finditer(value)):
+            #     t.text = match.group(0)
+            #     t.boost = 1.0
+            #     if keeporiginal:
+            #         t.original = t.text
+            #     t.stopped = False
+            #     if positions:
+            #         t.pos = start_pos + pos
+            #     if chars:
+            #         t.startchar = start_char + match.start()
+            #         t.endchar = start_char + match.end()
+            #     yield t
+            seglist = jieba.cut(value, cut_all=True)
+            for w in seglist:
+                t.original = t.text = w
+                t.boost = 1.0
+                if positions:
+                    t.pos = start_pos + value.find(w)
+                if chars:
+                    t.startchar = start_char + value.find(w)
+                    t.endchar = start_char + value.find(w) + len(w)
+                yield t
+        else:
+            # When gaps=True, iterate through the matches and
+            # yield the text between them.
+            prevend = 0
+            pos = start_pos
+            for match in self.expression.finditer(value):
+                start = prevend
+                end = match.start()
+                text = value[start:end]
+                if text:
+                    t.text = text
+                    t.boost = 1.0
+                    if keeporiginal:
+                        t.original = t.text
+                    t.stopped = False
+                    if positions:
+                        t.pos = pos
+                        pos += 1
+                    if chars:
+                        t.startchar = start_char + start
+                        t.endchar = start_char + end
+                    yield t
+                prevend = match.end()
+            # If the last "gap" was before the end of the text,
+            # yield the last bit of text as a final token.
+            if prevend < len(value):
+                t.text = value[prevend:]
+                t.boost = 1.0
+                if keeporiginal:
+                    t.original = t.text
+                t.stopped = False
+                if positions:
+                    t.pos = pos
+                if chars:
+                    t.startchar = prevend
+                    t.endchar = len(value)
+                yield t
 
-def ChineseAnalyzer():
-    return ChineseTokenizer()
\ No newline at end of file
+def ChineseAnalyzer(expression=default_pattern, stoplist=None,
+                    minsize=2, maxsize=None, gaps=False, stemfn=stem,
+                    ignore=None, cachesize=50000):
+    """Composes a ChineseTokenizer with a lower case filter, an optional stop
+    filter, and a stemming filter.
+
+    >>> ana = ChineseAnalyzer()
+    >>> [token.text for token in ana("Testing is testing and testing")]
+    ["test", "test", "test"]
+    :param expression: the regular expression used to extract tokens.
+    :param stoplist: a list of stop words. Set this to None to disable
+        the stop word filter.
+    :param minsize: words smaller than this are removed from the stream.
+    :param maxsize: words longer than this are removed from the stream.
+    :param gaps: if True, the tokenizer *splits* on the expression rather
+        than matching it.
+    :param ignore: a set of words that should not be stemmed.
+    :param cachesize: the maximum number of stemmed words to cache. The larger
+        this number, the faster stemming will be, but the more memory it will
+        use. Use None for no caching, or -1 for an unbounded cache.
+    """
+    ret = ChineseTokenizer(expression=expression, gaps=gaps)
+    chain = ret | LowercaseFilter()
+    if stoplist is not None:
+        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
+                                   maxsize=maxsize)
+    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
+                              cachesize=cachesize)
\ No newline at end of file
diff --git a/app_doc/views_search.py b/app_doc/views_search.py
index b65ab4c..6dcdf85 100644
--- a/app_doc/views_search.py
+++ b/app_doc/views_search.py
@@ -5,13 +5,26 @@
 # 博客地址:zmister.com
 
-# from haystack.generic_views import SearchView
+from haystack.generic_views import SearchView as BaseSearchView
 from django.db.models import Q
 from haystack.views import SearchView
 from haystack.query import SearchQuerySet
 from app_doc.models import *
 import datetime
+
+class DocSearchView2(BaseSearchView):
+
+    def get_queryset(self):
+        queryset = super(DocSearchView2, self).get_queryset()
+        # further filter queryset based on some set of criteria
+        return queryset.filter(pub_date__gte=datetime.date(2015, 1, 1))
+
+    def get_context_data(self, *args, **kwargs):
+        context = super(DocSearchView2, self).get_context_data(*args, **kwargs)
+        # do something
+        return context
+
 
 # 文档搜索 - 基于Haystack全文搜索
 class DocSearchView(SearchView):
     results_per_page = 10
@@ -66,18 +79,19 @@ class DocSearchView(SearchView):
             view_list = list(set(open_list).union(set(colla_list))) # 合并上述两个文集ID列表
         else:
             view_list = [i.id for i in Project.objects.filter(role=0)] # 公开文集
+
         if len(view_list) > 0:
             sqs = SearchQuerySet().filter(
                 top_doc__in=view_list
             ).filter(
                 modify_time__gte=start_date,
-                modify_time__lte=end_date).order_by('-modify_time')
+                modify_time__lte=end_date)
         else:
             sqs = SearchQuerySet().filter(
                 top_doc__in=None
             ).filter(
                 modify_time__gte=start_date,
-                modify_time__lte=end_date).order_by('-modify_time')
+                modify_time__lte=end_date)
         self.form = self.build_form(form_kwargs={'searchqueryset': sqs})
         self.query = self.get_query().replace("\n",'').replace("\r",'')
         self.results = self.get_results()
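
Usage note (reviewer sketch, not part of the patch): the snippet below shows one way the rewritten analyzer chain could be exercised on its own, assuming whoosh and jieba are installed and that app_doc.search.chinese_analyzer imports cleanly outside a Django context. The sample text, schema, and field names are illustrative assumptions only, not asserted results.

    # Compose the analyzer: ChineseTokenizer | LowercaseFilter | StemFilter
    from whoosh.fields import ID, TEXT, Schema

    from app_doc.search.chinese_analyzer import ChineseAnalyzer

    ana = ChineseAnalyzer()

    # Tokenize mixed Chinese/English text: jieba's full-mode segmentation
    # handles the Chinese, while the lowercase and stem filters act on the
    # Latin tokens.
    print([token.text for token in ana("MrDoc 文档全文搜索 Testing")])

    # The same analyzer can back a whoosh TEXT field for an index schema.
    schema = Schema(id=ID(stored=True, unique=True),
                    content=TEXT(analyzer=ChineseAnalyzer()))

Because ChineseAnalyzer keeps whoosh's tokenizer/filter structure and only swaps the non-gaps branch for jieba segmentation, the downstream lowercase, stop, and stem filters apply unchanged.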