3564 - Fix leading wildcard search

This commit is contained in:
Count Infinity 2024-11-20 21:40:54 -07:00
parent 1ad1cf9fc1
commit 0845ba8523
2 changed files with 176 additions and 50 deletions

View file

@ -5,6 +5,7 @@ command test-suite).
"""
from unittest import mock
from parameterized import parameterized
from evennia.help import filehelp
from evennia.help import utils as help_utils
@ -140,3 +141,56 @@ class TestFileHelp(TestCase):
self.assertEqual(HELP_ENTRY_DICTS[inum].get("aliases", []), helpentry.aliases)
self.assertEqual(HELP_ENTRY_DICTS[inum]["category"].lower(), helpentry.help_category)
self.assertEqual(HELP_ENTRY_DICTS[inum]["text"], helpentry.entrytext)
class HelpUtils(TestCase):
    """Tests for the help-search utilities in `evennia.help.utils`."""

    def setUp(self):
        # a small corpus of file-based help entries to search against
        entry_specs = [
            ("*examine", ["*exam", "*ex", "@examine"], "building", "Lorem ipsum examine"),
            ("inventory", [], "general", "A character's inventory"),
            ("userpassword", [], "admin", "change the password of an account"),
        ]
        self.candidate_entries = [
            filehelp.FileHelpEntry(
                key=key,
                aliases=aliases,
                help_category=category,
                entrytext=text,
                lock_storage="",
            )
            for key, aliases, category, text in entry_specs
        ]

    @parameterized.expand(
        [
            ("*examine", "*examine", "Leading wildcard should return exact matches."),
            ("@examine", "*examine", "Aliases should return an entry."),
            ("inventory", "inventory", "It should return exact matches."),
            ("inv*", "inventory", "Trailing wildcard search should return an entry."),
            ("userpaZZword~2", "userpassword", "Fuzzy matching should return an entry."),
            (
                "*word",
                "userpassword",
                "Leading wildcard should return an entry when no exact match.",
            ),
        ]
    )
    def test_help_search_with_index(self, search_term, expected_entry_key, error_msg):
        """Test search terms return correct entries"""
        # the entry (or entries) we expect the search to surface
        expected = [
            entry for entry in self.candidate_entries if entry.key == expected_entry_key
        ]
        found, _ = help_utils.help_search_with_index(search_term, self.candidate_entries)
        self.assertEqual(found, expected, error_msg)

View file

@ -9,26 +9,8 @@ This is used primarily by the default `help` command.
import re
from django.conf import settings
from lunr.stemmer import stemmer
# these are words that Lunr normally ignores but which we want to find
# since we use them (e.g. as command names).
# Lunr's default ignore-word list is found here:
# https://github.com/yeraydiazdiaz/lunr.py/blob/master/lunr/stop_word_filter.py
_LUNR_STOP_WORD_FILTER_EXCEPTIONS = [
"about",
"might",
"get",
"who",
"say",
"where",
] + settings.LUNR_STOP_WORD_FILTER_EXCEPTIONS
_LUNR = None
_LUNR_EXCEPTION = None
_LUNR_GET_BUILDER = None
_LUNR_BUILDER_PIPELINE = None
_RE_HELP_SUBTOPICS_START = re.compile(r"^\s*?#\s*?subtopics\s*?$", re.I + re.M)
_RE_HELP_SUBTOPIC_SPLIT = re.compile(r"^\s*?(\#{2,6}\s*?\w+?[a-z0-9 \-\?!,\.]*?)$", re.M + re.I)
@ -37,6 +19,123 @@ _RE_HELP_SUBTOPIC_PARSE = re.compile(r"^(?P<nesting>\#{2,6})\s*?(?P<name>.*?)$",
MAX_SUBTOPIC_NESTING = 5
def wildcard_stemmer(token, i, tokens):
    """
    Custom LUNR stemmer that returns both the original and stemmed token
    if the token contains a leading wildcard (*).

    Args:
        token (str): The input token to be stemmed
        i (int): Index of current token. Unused here but required by LUNR.
        tokens (list): List of tokens being processed. Unused here but
            required by LUNR.

    Returns:
        list: A list containing the stemmed token plus the original token
            when it has a leading '*'; otherwise just the stemmed token.

    """
    # keep an untouched copy first - the lunr stemmer mutates the token
    unstemmed = token.clone()
    stemmed = stemmer(token)
    if unstemmed.string.startswith("*"):
        # leading wildcard: make both the raw and the stemmed form findable
        return [unstemmed, stemmed]
    return stemmed
class LunrSearch:
    """
    Singleton class for managing Lunr search index configuration and initialization.
    """

    # these are words that Lunr normally ignores but which we want to find
    # since we use them (e.g. as command names).
    # Lunr's default ignore-word list is found here:
    # https://github.com/yeraydiazdiaz/lunr.py/blob/master/lunr/stop_word_filter.py
    _LUNR_STOP_WORD_FILTER_EXCEPTIONS = [
        "about",
        "might",
        "get",
        "who",
        "say",
        "where",
    ] + settings.LUNR_STOP_WORD_FILTER_EXCEPTIONS

    # cached singleton instance; populated on first instantiation
    _instance = None

    def __new__(cls):
        """
        Ensure only one instance of the class is created (Singleton)
        """
        if not cls._instance:
            cls._instance = super(LunrSearch, cls).__new__(cls)
            # _initialize runs only once, on first construction
            cls._instance._initialize()
        return cls._instance

    def _initialize(self):
        """
        Lazy load Lunr libraries and set up custom configuration

        we have to delay-load lunr because it messes with logging if it's imported
        before twisted's logging has been set up
        """
        # Lunr-related imports
        from lunr import get_default_builder
        from lunr import lunr
        from lunr import stop_word_filter
        from lunr.exceptions import QueryParseError
        from lunr.stemmer import stemmer
        from lunr.pipeline import Pipeline

        # Store imported modules as instance attributes
        self.get_default_builder = get_default_builder
        self.lunr = lunr
        self.stop_word_filter = stop_word_filter
        self.QueryParseError = QueryParseError
        self.default_stemmer = stemmer

        # build self.custom_stop_words_filter before the pipeline tuple below uses it
        self._setup_stop_words_filter()
        self.custom_builder_pipeline = (self.custom_stop_words_filter, wildcard_stemmer)

        # Register custom stemmer if we want to serialize.
        Pipeline.register_function(wildcard_stemmer, "wildcard_stemmer")

    def _setup_stop_words_filter(self):
        """
        Create a custom stop words filter, removing specified exceptions
        """
        # copy so we don't mutate lunr's module-level word list
        stop_words = self.stop_word_filter.WORDS.copy()
        for ignore_word in self._LUNR_STOP_WORD_FILTER_EXCEPTIONS:
            try:
                stop_words.remove(ignore_word)
            except ValueError:
                # exception word was not in lunr's default list - nothing to do
                pass
        self.custom_stop_words_filter = self.stop_word_filter.generate_stop_word_filter(stop_words)

    def index(self, ref, fields, documents):
        """
        Creates a Lunr searchable index.

        Args:
            ref (str): Unique identifier field within a document
            fields (list): A list of Lunr field mappings
                ``{"field_name": str, "boost": int}``. See the Lunr documentation
                for more details.
            documents (list[dict]): This is the body of possible entities to search.
                Each dict should have all keys in the `fields` arg.

        Returns: A lunr.Index object

        """
        # Create and configure builder; swap the default pipeline for our
        # custom stop-word filter and wildcard-aware stemmer
        builder = self.get_default_builder()
        builder.pipeline.reset()
        builder.pipeline.add(*self.custom_builder_pipeline)
        return self.lunr(ref, fields, documents, builder=builder)
def help_search_with_index(query, candidate_entries, suggestion_maxnum=5, fields=None):
"""
Lunr-powered fast index search and suggestion wrapper. See https://lunrjs.com/.
@ -57,31 +156,7 @@ def help_search_with_index(query, candidate_entries, suggestion_maxnum=5, fields
how many suggestions are included.
"""
global _LUNR, _LUNR_EXCEPTION, _LUNR_BUILDER_PIPELINE, _LUNR_GET_BUILDER
if not _LUNR:
# we have to delay-load lunr because it messes with logging if it's imported
# before twisted's logging has been set up
from lunr import get_default_builder as _LUNR_GET_BUILDER
from lunr import lunr as _LUNR
from lunr import stop_word_filter
from lunr.exceptions import QueryParseError as _LUNR_EXCEPTION
from lunr.stemmer import stemmer
# from lunr.trimmer import trimmer
# pre-create a lunr index-builder pipeline where we've removed some of
# the stop-words from the default in lunr.
stop_words = stop_word_filter.WORDS
for ignore_word in _LUNR_STOP_WORD_FILTER_EXCEPTIONS:
try:
stop_words.remove(ignore_word)
except ValueError:
pass
custom_stop_words_filter = stop_word_filter.generate_stop_word_filter(stop_words)
# _LUNR_BUILDER_PIPELINE = (trimmer, custom_stop_words_filter, stemmer)
_LUNR_BUILDER_PIPELINE = (custom_stop_words_filter, stemmer)
from lunr.exceptions import QueryParseError
indx = [cnd.search_index_entry for cnd in candidate_entries]
mapping = {indx[ix]["key"]: cand for ix, cand in enumerate(candidate_entries)}
@ -94,16 +169,13 @@ def help_search_with_index(query, candidate_entries, suggestion_maxnum=5, fields
{"field_name": "tags", "boost": 5},
]
# build the search index
builder = _LUNR_GET_BUILDER()
builder.pipeline.reset()
builder.pipeline.add(*_LUNR_BUILDER_PIPELINE)
lunr_search = LunrSearch()
search_index = _LUNR(ref="key", fields=fields, documents=indx, builder=builder)
search_index = lunr_search.index(ref="key", fields=fields, documents=indx)
try:
matches = search_index.search(query)[:suggestion_maxnum]
except _LUNR_EXCEPTION:
except QueryParseError:
# this is a user-input problem
matches = []