# coding:utf-8
# Chinese tokenizer/analyzer for Whoosh, backed by jieba.

import jieba
from whoosh.analysis import Tokenizer, Token


class ChineseTokenizer(Tokenizer):
    """Whoosh Tokenizer that segments Chinese text with jieba (full mode)."""

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        # cut_all=True: full mode, emits every dictionary word found in the
        # text (tokens may overlap) — suits index-time recall.
        seglist = jieba.cut(value, cut_all=True)
        # Track the next search offset per word so a word that occurs more
        # than once gets the position of each occurrence.  The original
        # code used a bare ``value.find(w)``, which always returns the
        # FIRST occurrence, producing wrong pos/startchar/endchar for any
        # repeated token.
        next_start = {}
        for w in seglist:
            idx = value.find(w, next_start.get(w, 0))
            if idx == -1:
                # Overlapping full-mode tokens can arrive out of strict
                # left-to-right order; fall back to the first occurrence.
                idx = value.find(w)
            next_start[w] = idx + 1
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + idx
            if chars:
                t.startchar = start_char + idx
                t.endchar = start_char + idx + len(w)
            yield t


def ChineseAnalyzer():
    """Return a jieba-backed analyzer instance for Whoosh TEXT fields."""
    return ChineseTokenizer()
# coding:utf-8
# Custom haystack highlighter that tokenizes the query with the jieba
# analyzer, so multi-character Chinese search terms are highlighted.
from haystack.utils import Highlighter
from django.utils.html import strip_tags
from app_doc.search.chinese_analyzer import ChineseAnalyzer as StemmingAnalyzer


class MyHighLighter(Highlighter):
    """Highlighter whose query terms come from jieba segmentation
    instead of whitespace splitting, with a custom excerpt window."""

    def __init__(self, query, **kwargs):
        self.query = query

        if "max_length" in kwargs:
            self.max_length = int(kwargs["max_length"])

        if "html_tag" in kwargs:
            self.html_tag = kwargs["html_tag"]

        if "css_class" in kwargs:
            self.css_class = kwargs["css_class"]

        # Segment the query with jieba; blank/whitespace tokens are dropped.
        sa = StemmingAnalyzer()
        self.query_words = set(
            token.text.lower() for token in sa(query)
            if token.text.replace(" ", "") != ""
        )

    def highlight(self, text_block):
        """Strip HTML, locate the query words, pick a window and render it."""
        self.text_block = strip_tags(text_block)
        highlight_locations = self.find_highlightable_words()
        start_offset, end_offset = self.find_window(highlight_locations)
        # Short texts are shown from the beginning.
        if len(text_block) < self.max_length:
            start_offset = 0
        return self.render_html(highlight_locations, start_offset, end_offset)

    def find_window(self, highlight_locations):
        """Choose the (start, end) slice of ``self.text_block`` that packs
        in the most query-word hits (densest window of ``max_length``)."""
        best_start = 0
        best_end = self.max_length

        # No words located at all: show the head of the text.
        if not len(highlight_locations):
            return (best_start, best_end)

        # Flatten {word: [offsets]} into a single offset list.
        words_found = []
        for word, offset_list in highlight_locations.items():
            if len(offset_list):
                words_found.extend(offset_list)

        if not len(words_found):
            return (best_start, best_end)

        # Exactly one hit: window starts there, clamped to the text end.
        if len(words_found) == 1:
            best_start = words_found[0]
            best_end = words_found[0] + self.max_length

            if best_end > len(self.text_block):
                # Slide the window back so it stays max_length wide.
                current_length = len(self.text_block) - best_start
                move_forward_steps = self.max_length - current_length
                best_start = best_start - move_forward_steps
                if best_start < 0:
                    best_start = 0
            return (best_start, best_end)

        # Multiple hits: scan for the densest max_length window.
        words_found = sorted(words_found)

        highest_density = 0

        # words_found[:-1][0] in the original is just the first offset.
        if words_found[0] > self.max_length:
            best_start = words_found[0]
            best_end = best_start + self.max_length

        for count, start in enumerate(words_found[:-1]):
            current_density = 1

            for end in words_found[count + 1:]:
                if end - start < self.max_length:
                    current_density += 1
                else:
                    current_density = 0

            # Strictly greater: earlier windows win ties.
            if current_density > highest_density:
                best_start = start
                best_end = start + self.max_length
                highest_density = current_density
                # Back off a little so the first hit has leading context.
                # NOTE(review): in the garbled source the indentation of
                # this adjustment is ambiguous (inside this if vs. after
                # the loop) — placed inside, matching its adjacency to the
                # density update; confirm against upstream history.
                if best_start < 10:
                    best_start = 0
                    best_end = 200
                else:
                    best_start -= 10
                    best_end -= 10

        return (best_start, best_end)

    def render_html(self, highlight_locations=None, start_offset=None, end_offset=None):
        """Wrap each located term in ``<html_tag>``…``</html_tag>`` within
        the chosen window and add ellipses at clipped edges."""
        # Chop the block down to the proper window.
        text = self.text_block[start_offset:end_offset]

        # Invert highlight_locations to a (window-relative loc, term) list.
        term_list = []
        for term, locations in highlight_locations.items():
            term_list += [(loc - start_offset, term) for loc in locations]
        loc_to_term = sorted(term_list)

        # Prepare the highlight wrappers.
        if self.css_class:
            hl_start = '<%s class="%s">' % (self.html_tag, self.css_class)
        else:
            hl_start = "<%s>" % (self.html_tag)
        # FIX: the closing tag was lost in the source ('"" % self.html_tag');
        # without it every highlighted term leaves the tag unclosed.
        hl_end = "</%s>" % self.html_tag

        # Copy text up to each match, then the match wrapped in the tag.
        highlighted_chunk = ""
        matched_so_far = 0
        prev = 0
        prev_str = ""

        for cur, cur_str in loc_to_term:
            # The window text may differ in case from the query term.
            actual_term = text[cur:cur + len(cur_str)]

            # Guard against stale highlight_locations.
            if actual_term.lower() == cur_str:
                # Skip terms overlapping the previous highlight.
                if cur < prev + len(prev_str):
                    continue

                highlighted_chunk += (
                    text[prev + len(prev_str):cur] + hl_start + actual_term + hl_end
                )
                prev = cur
                prev_str = cur_str
                matched_so_far = cur + len(actual_term)

        # The tail after the last match.
        highlighted_chunk += text[matched_so_far:]

        # FIX: the leading ellipsis was lost in the source ('"%s"');
        # restored to match the trailing-ellipsis branch below and the
        # upstream haystack Highlighter.
        if start_offset > 0:
            highlighted_chunk = "...%s" % highlighted_chunk

        if end_offset < len(self.text_block):
            highlighted_chunk = "%s..." % highlighted_chunk

        return highlighted_chunk
from whoosh import index
# Swap Whoosh's StemmingAnalyzer for the jieba-based Chinese analyzer;
# the alias keeps the rest of the (vendored haystack) backend unchanged.
from app_doc.search.chinese_analyzer import ChineseAnalyzer as StemmingAnalyzer
from whoosh.fields import ID as WHOOSH_ID
from whoosh.fields import (
    BOOLEAN,
    DATETIME,
    IDLIST,
    KEYWORD,
    NGRAM,
    NGRAMWORDS,
    NUMERIC,
    Schema,
    TEXT,
)
from whoosh.filedb.filestore import FileStorage, RamStorage
from whoosh.highlight import highlight as whoosh_highlight
from whoosh.highlight import ContextFragmenter, HtmlFormatter
from whoosh.qparser import QueryParser, FuzzyTermPlugin
from whoosh.searching import ResultsPage
from whoosh.writing import AsyncWriter


# FIX: the named groups were stripped from the source ("(?P\d{4})" etc.);
# _to_python() reads possible_datetime.groupdict()["year"]/["month"]/... so
# the (?P<name>...) groups are required.  Also made a raw string.
DATETIME_REGEX = re.compile(
    r"^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T"
    r"(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$"
)
# Thread-local holder for the optional in-memory (RAM) index store.
LOCALS = threading.local()
LOCALS.RAM_STORE = None


class WhooshHtmlFormatter(HtmlFormatter):
    """
    This is a HtmlFormatter simpler than the whoosh.HtmlFormatter.
    We use it to have consistent results across backends. Specifically,
    Solr, Xapian and Elasticsearch are using this formatting.
    """

    # FIX: the closing </%(tag)s> was lost in the source; without it every
    # highlighted fragment emits an unclosed tag.
    template = "<%(tag)s>%(t)s</%(tag)s>"


class WhooshSearchBackend(BaseSearchBackend):
    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
+ RESERVED_CHARACTERS = ( + "\\", + "+", + "-", + "&&", + "||", + "!", + "(", + ")", + "{", + "}", + "[", + "]", + "^", + '"', + "~", + "*", + "?", + ":", + ".", + ) + + def __init__(self, connection_alias, **connection_options): + super(WhooshSearchBackend, self).__init__( + connection_alias, **connection_options + ) + self.setup_complete = False + self.use_file_storage = True + self.post_limit = getattr(connection_options, "POST_LIMIT", 128 * 1024 * 1024) + self.path = connection_options.get("PATH") + + if connection_options.get("STORAGE", "file") != "file": + self.use_file_storage = False + + if self.use_file_storage and not self.path: + raise ImproperlyConfigured( + "You must specify a 'PATH' in your settings for connection '%s'." + % connection_alias + ) + + self.log = logging.getLogger("haystack") + + def setup(self): + """ + Defers loading until needed. + """ + from haystack import connections + + new_index = False + + # Make sure the index is there. + if self.use_file_storage and not os.path.exists(self.path): + os.makedirs(self.path) + new_index = True + + if self.use_file_storage and not os.access(self.path, os.W_OK): + raise IOError( + "The path to your Whoosh index '%s' is not writable for the current user/group." 
+ % self.path + ) + + if self.use_file_storage: + self.storage = FileStorage(self.path) + else: + global LOCALS + + if getattr(LOCALS, "RAM_STORE", None) is None: + LOCALS.RAM_STORE = RamStorage() + + self.storage = LOCALS.RAM_STORE + + self.content_field_name, self.schema = self.build_schema( + connections[self.connection_alias].get_unified_index().all_searchfields() + ) + self.parser = QueryParser(self.content_field_name, schema=self.schema) + self.parser.add_plugins([FuzzyTermPlugin]) + + if new_index is True: + self.index = self.storage.create_index(self.schema) + else: + try: + self.index = self.storage.open_index(schema=self.schema) + except index.EmptyIndexError: + self.index = self.storage.create_index(self.schema) + + self.setup_complete = True + + def build_schema(self, fields): + schema_fields = { + ID: WHOOSH_ID(stored=True, unique=True), + DJANGO_CT: WHOOSH_ID(stored=True), + DJANGO_ID: WHOOSH_ID(stored=True), + } + # Grab the number of keys that are hard-coded into Haystack. + # We'll use this to (possibly) fail slightly more gracefully later. 
+ initial_key_count = len(schema_fields) + content_field_name = "" + + for field_name, field_class in fields.items(): + if field_class.is_multivalued: + if field_class.indexed is False: + schema_fields[field_class.index_fieldname] = IDLIST( + stored=True, field_boost=field_class.boost + ) + else: + schema_fields[field_class.index_fieldname] = KEYWORD( + stored=True, + commas=True, + scorable=True, + field_boost=field_class.boost, + ) + elif field_class.field_type in ["date", "datetime"]: + schema_fields[field_class.index_fieldname] = DATETIME( + stored=field_class.stored, sortable=True + ) + elif field_class.field_type == "integer": + schema_fields[field_class.index_fieldname] = NUMERIC( + stored=field_class.stored, + numtype=int, + field_boost=field_class.boost, + ) + elif field_class.field_type == "float": + schema_fields[field_class.index_fieldname] = NUMERIC( + stored=field_class.stored, + numtype=float, + field_boost=field_class.boost, + ) + elif field_class.field_type == "boolean": + # Field boost isn't supported on BOOLEAN as of 1.8.2. + schema_fields[field_class.index_fieldname] = BOOLEAN( + stored=field_class.stored + ) + elif field_class.field_type == "ngram": + schema_fields[field_class.index_fieldname] = NGRAM( + minsize=3, + maxsize=15, + stored=field_class.stored, + field_boost=field_class.boost, + ) + elif field_class.field_type == "edge_ngram": + schema_fields[field_class.index_fieldname] = NGRAMWORDS( + minsize=2, + maxsize=15, + at="start", + stored=field_class.stored, + field_boost=field_class.boost, + ) + else: + schema_fields[field_class.index_fieldname] = TEXT( + stored=True, + analyzer=StemmingAnalyzer(), + field_boost=field_class.boost, + sortable=True, + ) + + if field_class.document is True: + content_field_name = field_class.index_fieldname + schema_fields[field_class.index_fieldname].spelling = True + + # Fail more gracefully than relying on the backend to die if no fields + # are found. 
+ if len(schema_fields) <= initial_key_count: + raise SearchBackendError( + "No fields were found in any search_indexes. Please correct this before attempting to search." + ) + + return (content_field_name, Schema(**schema_fields)) + + def update(self, index, iterable, commit=True): + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + writer = AsyncWriter(self.index) + + for obj in iterable: + try: + doc = index.full_prepare(obj) + except SkipDocument: + self.log.debug("Indexing for object `%s` skipped", obj) + else: + # Really make sure it's unicode, because Whoosh won't have it any + # other way. + for key in doc: + doc[key] = self._from_python(doc[key]) + + # Document boosts aren't supported in Whoosh 2.5.0+. + if "boost" in doc: + del doc["boost"] + + try: + writer.update_document(**doc) + except Exception as e: + if not self.silently_fail: + raise + + # We'll log the object identifier but won't include the actual object + # to avoid the possibility of that generating encoding errors while + # processing the log message: + self.log.error( + "%s while preparing object for update" % e.__class__.__name__, + exc_info=True, + extra={"data": {"index": index, "object": get_identifier(obj)}}, + ) + + if len(iterable) > 0: + # For now, commit no matter what, as we run into locking issues otherwise. 
+ writer.commit() + + def remove(self, obj_or_string, commit=True): + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + whoosh_id = get_identifier(obj_or_string) + + try: + self.index.delete_by_query(q=self.parser.parse('%s:"%s"' % (ID, whoosh_id))) + except Exception as e: + if not self.silently_fail: + raise + + self.log.error( + "Failed to remove document '%s' from Whoosh: %s", + whoosh_id, + e, + exc_info=True, + ) + + def clear(self, models=None, commit=True): + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + + if models is not None: + assert isinstance(models, (list, tuple)) + + try: + if models is None: + self.delete_index() + else: + models_to_delete = [] + + for model in models: + models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model))) + + self.index.delete_by_query( + q=self.parser.parse(" OR ".join(models_to_delete)) + ) + except Exception as e: + if not self.silently_fail: + raise + + if models is not None: + self.log.error( + "Failed to clear Whoosh index of models '%s': %s", + ",".join(models_to_delete), + e, + exc_info=True, + ) + else: + self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True) + + def delete_index(self): + # Per the Whoosh mailing list, if wiping out everything from the index, + # it's much more efficient to simply delete the index files. + if self.use_file_storage and os.path.exists(self.path): + shutil.rmtree(self.path) + elif not self.use_file_storage: + self.storage.clean() + + # Recreate everything. + self.setup() + + def optimize(self): + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + self.index.optimize() + + def calculate_page(self, start_offset=0, end_offset=None): + # Prevent against Whoosh throwing an error. Requires an end_offset + # greater than 0. + if end_offset is not None and end_offset <= 0: + end_offset = 1 + + # Determine the page. 
+ page_num = 0 + + if end_offset is None: + end_offset = 1000000 + + if start_offset is None: + start_offset = 0 + + page_length = end_offset - start_offset + + if page_length and page_length > 0: + page_num = int(start_offset / page_length) + + # Increment because Whoosh uses 1-based page numbers. + page_num += 1 + return page_num, page_length + + @log_query + def search( + self, + query_string, + sort_by=None, + start_offset=0, + end_offset=None, + fields="", + highlight=False, + facets=None, + date_facets=None, + query_facets=None, + narrow_queries=None, + spelling_query=None, + within=None, + dwithin=None, + distance_point=None, + models=None, + limit_to_registered_models=None, + result_class=None, + **kwargs + ): + if not self.setup_complete: + self.setup() + + # A zero length query should return no results. + if len(query_string) == 0: + return {"results": [], "hits": 0} + + query_string = force_str(query_string) + + # A one-character query (non-wildcard) gets nabbed by a stopwords + # filter and should yield zero results. + if len(query_string) <= 1 and query_string != "*": + return {"results": [], "hits": 0} + + reverse = False + + if sort_by is not None: + # Determine if we need to reverse the results and if Whoosh can + # handle what it's being asked to sort by. Reversing is an + # all-or-nothing action, unfortunately. 
+ sort_by_list = [] + reverse_counter = 0 + + for order_by in sort_by: + if order_by.startswith("-"): + reverse_counter += 1 + + if reverse_counter and reverse_counter != len(sort_by): + raise SearchBackendError( + "Whoosh requires all order_by fields" + " to use the same sort direction" + ) + + for order_by in sort_by: + if order_by.startswith("-"): + sort_by_list.append(order_by[1:]) + + if len(sort_by_list) == 1: + reverse = True + else: + sort_by_list.append(order_by) + + if len(sort_by_list) == 1: + reverse = False + + sort_by = sort_by_list + + if facets is not None: + warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2) + + if date_facets is not None: + warnings.warn( + "Whoosh does not handle date faceting.", Warning, stacklevel=2 + ) + + if query_facets is not None: + warnings.warn( + "Whoosh does not handle query faceting.", Warning, stacklevel=2 + ) + + narrowed_results = None + self.index = self.index.refresh() + + if limit_to_registered_models is None: + limit_to_registered_models = getattr( + settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True + ) + + if models and len(models): + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + # Using narrow queries, limit the results to only models handled + # with the current routers. + model_choices = self.build_models_list() + else: + model_choices = [] + + if len(model_choices) > 0: + if narrow_queries is None: + narrow_queries = set() + + narrow_queries.add( + " OR ".join(["%s:%s" % (DJANGO_CT, rm) for rm in model_choices]) + ) + + narrow_searcher = None + + if narrow_queries is not None: + # Potentially expensive? I don't see another way to do it in Whoosh... 
+ narrow_searcher = self.index.searcher() + + for nq in narrow_queries: + recent_narrowed_results = narrow_searcher.search( + self.parser.parse(force_str(nq)), limit=None + ) + + if len(recent_narrowed_results) <= 0: + return {"results": [], "hits": 0} + + if narrowed_results: + narrowed_results.filter(recent_narrowed_results) + else: + narrowed_results = recent_narrowed_results + + self.index = self.index.refresh() + + if self.index.doc_count(): + searcher = self.index.searcher() + parsed_query = self.parser.parse(query_string) + + # In the event of an invalid/stopworded query, recover gracefully. + if parsed_query is None: + return {"results": [], "hits": 0} + + page_num, page_length = self.calculate_page(start_offset, end_offset) + + search_kwargs = { + "pagelen": page_length, + "sortedby": sort_by, + "reverse": reverse, + } + + # Handle the case where the results have been narrowed. + if narrowed_results is not None: + search_kwargs["filter"] = narrowed_results + + try: + raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs) + except ValueError: + if not self.silently_fail: + raise + + return {"results": [], "hits": 0, "spelling_suggestion": None} + + # Because as of Whoosh 2.5.1, it will return the wrong page of + # results if you request something too high. 
:( + if raw_page.pagenum < page_num: + return {"results": [], "hits": 0, "spelling_suggestion": None} + + results = self._process_results( + raw_page, + highlight=highlight, + query_string=query_string, + spelling_query=spelling_query, + result_class=result_class, + ) + searcher.close() + + if hasattr(narrow_searcher, "close"): + narrow_searcher.close() + + return results + else: + if self.include_spelling: + if spelling_query: + spelling_suggestion = self.create_spelling_suggestion( + spelling_query + ) + else: + spelling_suggestion = self.create_spelling_suggestion(query_string) + else: + spelling_suggestion = None + + return { + "results": [], + "hits": 0, + "spelling_suggestion": spelling_suggestion, + } + + def more_like_this( + self, + model_instance, + additional_query_string=None, + start_offset=0, + end_offset=None, + models=None, + limit_to_registered_models=None, + result_class=None, + **kwargs + ): + if not self.setup_complete: + self.setup() + + field_name = self.content_field_name + narrow_queries = set() + narrowed_results = None + self.index = self.index.refresh() + + if limit_to_registered_models is None: + limit_to_registered_models = getattr( + settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True + ) + + if models and len(models): + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + # Using narrow queries, limit the results to only models handled + # with the current routers. + model_choices = self.build_models_list() + else: + model_choices = [] + + if len(model_choices) > 0: + if narrow_queries is None: + narrow_queries = set() + + narrow_queries.add( + " OR ".join(["%s:%s" % (DJANGO_CT, rm) for rm in model_choices]) + ) + + if additional_query_string and additional_query_string != "*": + narrow_queries.add(additional_query_string) + + narrow_searcher = None + + if narrow_queries is not None: + # Potentially expensive? I don't see another way to do it in Whoosh... 
+ narrow_searcher = self.index.searcher() + + for nq in narrow_queries: + recent_narrowed_results = narrow_searcher.search( + self.parser.parse(force_str(nq)), limit=None + ) + + if len(recent_narrowed_results) <= 0: + return {"results": [], "hits": 0} + + if narrowed_results: + narrowed_results.filter(recent_narrowed_results) + else: + narrowed_results = recent_narrowed_results + + page_num, page_length = self.calculate_page(start_offset, end_offset) + + self.index = self.index.refresh() + raw_results = EmptyResults() + + searcher = None + if self.index.doc_count(): + query = "%s:%s" % (ID, get_identifier(model_instance)) + searcher = self.index.searcher() + parsed_query = self.parser.parse(query) + results = searcher.search(parsed_query) + + if len(results): + raw_results = results[0].more_like_this(field_name, top=end_offset) + + # Handle the case where the results have been narrowed. + if narrowed_results is not None and hasattr(raw_results, "filter"): + raw_results.filter(narrowed_results) + + try: + raw_page = ResultsPage(raw_results, page_num, page_length) + except ValueError: + if not self.silently_fail: + raise + + return {"results": [], "hits": 0, "spelling_suggestion": None} + + # Because as of Whoosh 2.5.1, it will return the wrong page of + # results if you request something too high. :( + if raw_page.pagenum < page_num: + return {"results": [], "hits": 0, "spelling_suggestion": None} + + results = self._process_results(raw_page, result_class=result_class) + + if searcher: + searcher.close() + + if hasattr(narrow_searcher, "close"): + narrow_searcher.close() + + return results + + def _process_results( + self, + raw_page, + highlight=False, + query_string="", + spelling_query=None, + result_class=None, + ): + from haystack import connections + + results = [] + + # It's important to grab the hits first before slicing. Otherwise, this + # can cause pagination failures. 
+ hits = len(raw_page) + + if result_class is None: + result_class = SearchResult + + facets = {} + spelling_suggestion = None + unified_index = connections[self.connection_alias].get_unified_index() + indexed_models = unified_index.get_indexed_models() + + for doc_offset, raw_result in enumerate(raw_page): + score = raw_page.score(doc_offset) or 0 + app_label, model_name = raw_result[DJANGO_CT].split(".") + additional_fields = {} + model = haystack_get_model(app_label, model_name) + + if model and model in indexed_models: + for key, value in raw_result.items(): + index = unified_index.get_index(model) + string_key = str(key) + + if string_key in index.fields and hasattr( + index.fields[string_key], "convert" + ): + # Special-cased due to the nature of KEYWORD fields. + if index.fields[string_key].is_multivalued: + if value is None or len(value) is 0: + additional_fields[string_key] = [] + else: + additional_fields[string_key] = value.split(",") + else: + additional_fields[string_key] = index.fields[ + string_key + ].convert(value) + else: + additional_fields[string_key] = self._to_python(value) + + del (additional_fields[DJANGO_CT]) + del (additional_fields[DJANGO_ID]) + + if highlight: + sa = StemmingAnalyzer() + formatter = WhooshHtmlFormatter("em") + terms = [token.text for token in sa(query_string)] + + whoosh_result = whoosh_highlight( + additional_fields.get(self.content_field_name), + terms, + sa, + ContextFragmenter(), + formatter, + ) + additional_fields["highlighted"] = { + self.content_field_name: [whoosh_result] + } + + result = result_class( + app_label, + model_name, + raw_result[DJANGO_ID], + score, + **additional_fields + ) + results.append(result) + else: + hits -= 1 + + if self.include_spelling: + if spelling_query: + spelling_suggestion = self.create_spelling_suggestion(spelling_query) + else: + spelling_suggestion = self.create_spelling_suggestion(query_string) + + return { + "results": results, + "hits": hits, + "facets": facets, + 
"spelling_suggestion": spelling_suggestion, + } + + def create_spelling_suggestion(self, query_string): + spelling_suggestion = None + reader = self.index.reader() + corrector = reader.corrector(self.content_field_name) + cleaned_query = force_str(query_string) + + if not query_string: + return spelling_suggestion + + # Clean the string. + for rev_word in self.RESERVED_WORDS: + cleaned_query = cleaned_query.replace(rev_word, "") + + for rev_char in self.RESERVED_CHARACTERS: + cleaned_query = cleaned_query.replace(rev_char, "") + + # Break it down. + query_words = cleaned_query.split() + suggested_words = [] + + for word in query_words: + suggestions = corrector.suggest(word, limit=1) + + if len(suggestions) > 0: + suggested_words.append(suggestions[0]) + + spelling_suggestion = " ".join(suggested_words) + return spelling_suggestion + + def _from_python(self, value): + """ + Converts Python values to a string for Whoosh. + + Code courtesy of pysolr. + """ + if hasattr(value, "strftime"): + if not hasattr(value, "hour"): + value = datetime(value.year, value.month, value.day, 0, 0, 0) + elif isinstance(value, bool): + if value: + value = "true" + else: + value = "false" + elif isinstance(value, (list, tuple)): + value = ",".join([force_str(v) for v in value]) + elif isinstance(value, (int, float)): + # Leave it alone. + pass + else: + value = force_str(value) + return value + + def _to_python(self, value): + """ + Converts values from Whoosh to native Python values. + + A port of the same method in pysolr, as they deal with data the same way. 
+ """ + if value == "true": + return True + elif value == "false": + return False + + if value and isinstance(value, str): + possible_datetime = DATETIME_REGEX.search(value) + + if possible_datetime: + date_values = possible_datetime.groupdict() + + for dk, dv in date_values.items(): + date_values[dk] = int(dv) + + return datetime( + date_values["year"], + date_values["month"], + date_values["day"], + date_values["hour"], + date_values["minute"], + date_values["second"], + ) + + try: + # Attempt to use json to load the values. + converted_value = json.loads(value) + + # Try to handle most built-in types. + if isinstance( + converted_value, + (list, tuple, set, dict, int, float, complex), + ): + return converted_value + except: + # If it fails (SyntaxError or its ilk) or we don't trust it, + # continue on. + pass + + return value + + +class WhooshSearchQuery(BaseSearchQuery): + def _convert_datetime(self, date): + if hasattr(date, "hour"): + return force_str(date.strftime("%Y%m%d%H%M%S")) + else: + return force_str(date.strftime("%Y%m%d000000")) + + def clean(self, query_fragment): + """ + Provides a mechanism for sanitizing user input before presenting the + value to the backend. + + Whoosh 1.X differs here in that you can no longer use a backslash + to escape reserved characters. Instead, the whole word should be + quoted. + """ + words = query_fragment.split() + cleaned_words = [] + + for word in words: + if word in self.backend.RESERVED_WORDS: + word = word.replace(word, word.lower()) + + for char in self.backend.RESERVED_CHARACTERS: + if char in word: + word = "'%s'" % word + break + + cleaned_words.append(word) + + return " ".join(cleaned_words) + + def build_query_fragment(self, field, filter_type, value): + from haystack import connections + + query_frag = "" + is_datetime = False + + if not hasattr(value, "input_type_name"): + # Handle when we've got a ``ValuesListQuerySet``... 
+ if hasattr(value, "values_list"): + value = list(value) + + if hasattr(value, "strftime"): + is_datetime = True + + if isinstance(value, str) and value != " ": + # It's not an ``InputType``. Assume ``Clean``. + value = Clean(value) + else: + value = PythonData(value) + + # Prepare the query using the InputType. + prepared_value = value.prepare(self) + + if not isinstance(prepared_value, (set, list, tuple)): + # Then convert whatever we get back to what pysolr wants if needed. + prepared_value = self.backend._from_python(prepared_value) + + # 'content' is a special reserved word, much like 'pk' in + # Django's ORM layer. It indicates 'no special field'. + if field == "content": + index_fieldname = "" + else: + index_fieldname = "%s:" % connections[ + self._using + ].get_unified_index().get_index_fieldname(field) + + filter_types = { + "content": "%s", + "contains": "*%s*", + "endswith": "*%s", + "startswith": "%s*", + "exact": "%s", + "gt": "{%s to}", + "gte": "[%s to]", + "lt": "{to %s}", + "lte": "[to %s]", + "fuzzy": "%s~{}/%d".format(FUZZY_WHOOSH_MAX_EDITS), + } + + if value.post_process is False: + query_frag = prepared_value + else: + if filter_type in [ + "content", + "contains", + "startswith", + "endswith", + "fuzzy", + ]: + if value.input_type_name == "exact": + query_frag = prepared_value + else: + # Iterate over terms & incorportate the converted form of each into the query. 
+ terms = [] + + if isinstance(prepared_value, str): + possible_values = prepared_value.split(" ") + else: + if is_datetime is True: + prepared_value = self._convert_datetime(prepared_value) + + possible_values = [prepared_value] + + for possible_value in possible_values: + possible_value_str = self.backend._from_python( + possible_value + ) + if filter_type == "fuzzy": + terms.append( + filter_types[filter_type] % ( + possible_value_str, + min( + FUZZY_WHOOSH_MIN_PREFIX, + len(possible_value_str) + ) + ) + ) + else: + terms.append( + filter_types[filter_type] % possible_value_str + ) + + if len(terms) == 1: + query_frag = terms[0] + else: + query_frag = "(%s)" % " AND ".join(terms) + elif filter_type == "in": + in_options = [] + + for possible_value in prepared_value: + is_datetime = False + + if hasattr(possible_value, "strftime"): + is_datetime = True + + pv = self.backend._from_python(possible_value) + + if is_datetime is True: + pv = self._convert_datetime(pv) + + if isinstance(pv, str) and not is_datetime: + in_options.append('"%s"' % pv) + else: + in_options.append("%s" % pv) + + query_frag = "(%s)" % " OR ".join(in_options) + elif filter_type == "range": + start = self.backend._from_python(prepared_value[0]) + end = self.backend._from_python(prepared_value[1]) + + if hasattr(prepared_value[0], "strftime"): + start = self._convert_datetime(start) + + if hasattr(prepared_value[1], "strftime"): + end = self._convert_datetime(end) + + query_frag = "[%s to %s]" % (start, end) + elif filter_type == "exact": + if value.input_type_name == "exact": + query_frag = prepared_value + else: + prepared_value = Exact(prepared_value).prepare(self) + query_frag = filter_types[filter_type] % prepared_value + else: + if is_datetime is True: + prepared_value = self._convert_datetime(prepared_value) + + query_frag = filter_types[filter_type] % prepared_value + + if len(query_frag) and not isinstance(value, Raw): + if not query_frag.startswith("(") and not 
query_frag.endswith(")"): + query_frag = "(%s)" % query_frag + + return "%s%s" % (index_fieldname, query_frag) + + +class WhooshEngine(BaseEngine): + backend = WhooshSearchBackend + query = WhooshSearchQuery diff --git a/app_doc/search_indexes.py b/app_doc/search_indexes.py new file mode 100644 index 0000000..5447d6b --- /dev/null +++ b/app_doc/search_indexes.py @@ -0,0 +1,20 @@ +# coding:utf-8 +# @文件: search_indexes.py +# @创建者:州的先生 +# #日期:2020/11/22 +# 博客地址:zmister.com + +from haystack import indexes +from app_doc.models import * + +# 文档索引 +class DocIndex(indexes.SearchIndex,indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + top_doc = indexes.IntegerField(model_attr='top_doc') + modify_time = indexes.DateTimeField(model_attr='modify_time') + + def get_model(self): + return Doc + + def index_queryset(self, using=None): + return self.get_model().objects.filter(status=1) \ No newline at end of file diff --git a/app_doc/urls.py b/app_doc/urls.py index cd5b63c..50d875d 100644 --- a/app_doc/urls.py +++ b/app_doc/urls.py @@ -1,5 +1,5 @@ -from django.urls import path,re_path -from app_doc import views,views_user,util_upload_img,import_views +from django.urls import path,re_path,include +from app_doc import views,views_user,views_search,util_upload_img,import_views urlpatterns = [ path('',views.project_list,name='pro_list'),# 文档首页 @@ -22,6 +22,7 @@ urlpatterns = [ path('manage_project_import/',import_views.import_project,name="import_project"), # 导入文集 path('manage_project_doc_sort/',import_views.project_doc_sort,name='project_doc_sort'), # 导入文集文档排序 path('manage_project_transfer//',views.manage_project_transfer,name='manage_pro_transfer'), # 文集转让 + path('manage_pro_doc_sort//',views.manage_project_doc_sort,name='manage_pro_doc_sort'), # 文集排序 #################文档相关 path('project-/doc-/', views.doc, name='doc'), # 文档浏览页 path('create_doc/', views.create_doc, name="create_doc"), # 新建文档 @@ -57,6 +58,8 @@ urlpatterns = [ 
path('user/center_menu/',views_user.user_center_menu,name="user_center_menu"), # 个人中心菜单数据 path('upload_doc_img/',util_upload_img.upload_img,name="upload_doc_img"), # 上传图片 path('search/',views.search,name="search"), # 搜索功能 + # path('doc_search/', include('haystack.urls')), # 全文检索框架 + path('doc_search/', views_search.DocSearchView(),name="doc_search"), # 全文检索框架 path('manage_overview/',views.manage_overview,name="manage_overview"), # 个人中心概览 path('manage_self/',views.manage_self,name="manage_self"), # 个人设置 ] \ No newline at end of file diff --git a/template/app_doc/search.html b/template/app_doc/search.html index 267e085..770ee64 100644 --- a/template/app_doc/search.html +++ b/template/app_doc/search.html @@ -6,6 +6,7 @@ + @@ -23,320 +24,6 @@ 觅道搜索{% if site_name != None and site_name != '' %} - {{site_name}}{% endif %} @@ -345,9 +32,9 @@
-
- - + + +
diff --git a/template/app_doc/search_result.html b/template/app_doc/search_result.html index 8c2f060..5336d0a 100644 --- a/template/app_doc/search_result.html +++ b/template/app_doc/search_result.html @@ -13,109 +13,9 @@ {{ kw }} - 觅道搜索 - {% if site_name != None and site_name != '' %}{{ site_name }}{% endif %} + @@ -239,7 +139,7 @@
@@ -312,7 +212,7 @@
{{ result.intro|get_key_context:kw }}

- {{ result.create_user }} - {{result.modify_time}}

+ {% if result.create_user.first_name != '' %}{{ result.create_user.first_name }}{% else %}{{result.create_user}}{% endif %} - {{result.modify_time}}

{% endfor %} diff --git a/template/search/indexes/app_doc/doc_text.txt b/template/search/indexes/app_doc/doc_text.txt new file mode 100644 index 0000000..c999f32 --- /dev/null +++ b/template/search/indexes/app_doc/doc_text.txt @@ -0,0 +1,2 @@ +{{object.name}} +{{object.pre_content}} \ No newline at end of file diff --git a/template/search/search.html b/template/search/search.html new file mode 100644 index 0000000..d1d0de7 --- /dev/null +++ b/template/search/search.html @@ -0,0 +1,327 @@ +{% load static %} + + + + + + + + + + + + {{ query }} - 觅道搜索 - {% if site_name != None and site_name != '' %}{{ site_name }}{% endif %} + + + + + + + + +
+
+ +
+ +
+ +
+
+
+ + + +
+
+
+ +
+ +
+
+
+ + + +
+ +
+
+
+
+ + +
+
+ +
+
+
+
+
+ + +
+ +
+
+
+ +
+ +
+ 文档 + 文集 + 标签 +
+
+ +
+ +
+
+
+
+ +
+ + +{% if query != '' %} +
+ 觅道文档在 + {% if date_range == 'recent1' %} + 近1天内 + {% elif date_range == 'recent7' %} + 近7天内 + {% elif date_range == 'recent30' %} + 近30天内 + {% elif date_range == 'recent365' %} + 近一年内 + {% else %} + 全部时间内 + {% endif %} + 搜索到 {{ page.paginator.count }} 条结果 +
+{% endif %} + + +
+ + {% if query %} + {% load doc_filter %} + {% load highlight %} + {% for result in page.object_list %} +
+ +

+ {% highlight result.object.name with query %} +

+ + {% highlight result.object.pre_content with query %} + +

+ {{ result.object.top_doc | get_doc_top }} - {{result.object.modify_time}}

+
+ {% empty %} +

没有搜索结果。

+ {% endfor %} + {% else %} + {# Show some example queries to run, maybe query syntax, something else? #} + {% endif %} + +
+ + + +
+
+ + {% if page.has_previous %} + 上一页 + {% else %} + 上一页 + {% endif %} + + + + {{ page.number }}/{{ page.paginator.num_pages }} + + + {% if page.has_next %} + 下一页 + {% else %} + 下一页 + {% endif %} +
+
+ + + +{% include 'app_doc/foot_base.html' %} + + + + +{% block custom_script %} + + + {% if debug %} + {% else %} + {{ static_code | safe }} + {% endif %} + +{% endblock %} + + \ No newline at end of file