Optimize document full-text search

yangjian 2021-02-08 21:13:37 +08:00
parent 5a79dcbe4c
commit 4e1c9d5f14
2 changed files with 155 additions and 15 deletions


@@ -4,26 +4,152 @@
# Date: 2020/11/22
# Blog: zmister.com
import jieba
from whoosh.compat import u, text_type
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.tokenizers import default_pattern
from whoosh.lang.porter import stem
from whoosh.analysis import Tokenizer, Token
from whoosh.util.text import rcompile

class ChineseTokenizer(Tokenizer):
    """
    Uses a regular expression to extract tokens from text.
    >>> rex = ChineseTokenizer()
    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
    ["hi", "there", "3.141", "big", "time", "under_score"]
    """

    def __init__(self, expression=default_pattern, gaps=False):
        """
        :param expression: A regular expression object or string, defaulting to
            rcompile(r"\w+(\.?\w+)*"). Each match of the expression equals a token:
            group 0 (the entire matched text) is used as the token text. If you
            need more complex handling of the match, write your own tokenizer.
        :param gaps: If True, the tokenizer *splits* on the expression rather
            than matching it.
        """
        self.expression = rcompile(expression)
        self.gaps = gaps

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.expression.pattern == other.expression.pattern:
                return True
        return False
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            with start_pos=2 the tokens will be numbered 2, 3, 4, ... instead of 0, 1, 2, ...
        :param start_char: The offset of the first character of the first token.
            For example, with start_char=2 the text "aaa bbb" will have chars
            (2, 5), (6, 9) instead of (0, 3), (4, 7).
        :param tokenize: If True, the text should be tokenized.
        """
        # The input must be a unicode string, otherwise raise an AssertionError
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: jieba segmentation results are used as tokens.
            # (The original whoosh code used regex matches instead:)
            # for pos, match in enumerate(self.expression.finditer(value)):
            #     t.text = match.group(0)
            #     t.boost = 1.0
            #     if keeporiginal:
            #         t.original = t.text
            #     t.stopped = False
            #     if positions:
            #         t.pos = start_pos + pos
            #     if chars:
            #         t.startchar = start_char + match.start()
            #         t.endchar = start_char + match.end()
            #     yield t
            seglist = jieba.cut(value, cut_all=True)
            for w in seglist:
                t.original = t.text = w
                t.boost = 1.0
                if positions:
                    # cut_all=True yields overlapping segments, so each segment is
                    # located with find(); repeated words resolve to the first hit
                    t.pos = start_pos + value.find(w)
                if chars:
                    t.startchar = start_char + value.find(w)
                    t.endchar = start_char + value.find(w) + len(w)
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end
                    yield t
                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t


def ChineseAnalyzer(expression=default_pattern, stoplist=None,
                    minsize=2, maxsize=None, gaps=False, stemfn=stem,
                    ignore=None, cachesize=50000):
    """Composes a ChineseTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = ChineseAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern used to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression rather
        than matching it.
    :param ignore: A set of words to not stem.
    :param cachesize: The maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be, but the more memory it will use.
        Use None for no cache, or -1 for an unbounded cache.
    """
    ret = ChineseTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)
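
For reference, a minimal usage sketch of the analyzer above against a throwaway Whoosh index (not part of this commit; the index directory, schema, and field names are illustrative):

# Usage sketch only: exercises ChineseAnalyzer outside of MrDoc/Haystack.
import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser

schema = Schema(doc_id=ID(stored=True),
                content=TEXT(stored=True, analyzer=ChineseAnalyzer()))
os.makedirs("tmp_whoosh_idx", exist_ok=True)
ix = create_in("tmp_whoosh_idx", schema)

with ix.writer() as writer:
    # Index one short Chinese document; jieba segments it at index time.
    writer.add_document(doc_id="1", content="MrDoc 支持文档的全文搜索")

with ix.searcher() as searcher:
    # The query string is run through the same analyzer before matching.
    query = QueryParser("content", ix.schema).parse("全文搜索")
    print([hit["doc_id"] for hit in searcher.search(query)])  # expect ["1"]

Because the tokenizer calls jieba with cut_all=True, the index stores every candidate segmentation of the text, which trades some precision for better recall on short Chinese queries.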


@@ -5,13 +5,26 @@
# Blog: zmister.com
# from haystack.generic_views import SearchView
from haystack.generic_views import SearchView as BaseSearchView
from django.db.models import Q
from haystack.views import SearchView
from haystack.query import SearchQuerySet
from app_doc.models import *
import datetime


class DocSearchView2(BaseSearchView):
    def get_queryset(self):
        queryset = super(DocSearchView2, self).get_queryset()
        # further filter queryset based on some set of criteria
        return queryset.filter(pub_date__gte=datetime.date(2015, 1, 1))

    def get_context_data(self, *args, **kwargs):
        context = super(DocSearchView2, self).get_context_data(*args, **kwargs)
        # do something
        return context
# Document search - Haystack-based full-text search
class DocSearchView(SearchView):
    results_per_page = 10
@@ -66,18 +79,19 @@ class DocSearchView(SearchView):
            view_list = list(set(open_list).union(set(colla_list)))  # merge the two project ID lists above
        else:
            view_list = [i.id for i in Project.objects.filter(role=0)]  # public projects
        if len(view_list) > 0:
            sqs = SearchQuerySet().filter(
                top_doc__in=view_list
            ).filter(
                modify_time__gte=start_date,
                modify_time__lte=end_date)
        else:
            sqs = SearchQuerySet().filter(
                top_doc__in=None
            ).filter(
                modify_time__gte=start_date,
                modify_time__lte=end_date)
        self.form = self.build_form(form_kwargs={'searchqueryset': sqs})
        self.query = self.get_query().replace("\n", '').replace("\r", '')
        self.results = self.get_results()
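
For completeness, a sketch of how a project would point Haystack at a Whoosh backend that uses the analyzer above (not part of this commit; the dotted ENGINE path and index directory are assumptions and should be replaced with the project's actual custom backend module):

# settings.py sketch: wiring Haystack to a Whoosh backend with ChineseAnalyzer.
# The ENGINE path below is an assumed custom backend module (e.g. a local
# whoosh_cn_backend.py that swaps in ChineseAnalyzer); adjust it to the real path.
import os

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # standard Django settings constant

HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'app_doc.search.whoosh_cn_backend.WhooshEngine',
        'PATH': os.path.join(BASE_DIR, 'whoosh_index'),
    },
}
# After changing the analyzer, rebuild the index so existing documents are
# re-tokenized:  python manage.py rebuild_index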