From a9ad82d005d39f8f7f006a6153e6cd2e80ea3367 Mon Sep 17 00:00:00 2001 From: Kelketek Rritaa Date: Sat, 15 Feb 2014 18:41:55 -0600 Subject: [PATCH] Added a 'regexable' mode for ANSIString, fixed a few bugs with it. Refactored with metaclass and added comments. Resolves #481, Resolves #480 --- src/utils/ansi.py | 595 ++++++++++++++++++++++++++++++---------------- 1 file changed, 395 insertions(+), 200 deletions(-) diff --git a/src/utils/ansi.py b/src/utils/ansi.py index 4abdd39332..2fd2a71924 100644 --- a/src/utils/ansi.py +++ b/src/utils/ansi.py @@ -174,11 +174,11 @@ class ANSIParser(object): strip_ansi flag instead removes all ansi markup. """ - if hasattr(string, 'raw_string'): + if hasattr(string, '_raw_string'): if strip_ansi: - return string.clean_string + return string.clean() else: - return string.raw_string + return string.raw() if not string: return '' self.do_xterm256 = xterm256 @@ -322,6 +322,10 @@ def group(lst, n): def _spacing_preflight(func): + """ + This wrapper function is used to do some preflight checks on functions used + for padding ANSIStrings. + """ def wrapped(self, width, fillchar=None): if fillchar is None: fillchar = " " @@ -336,195 +340,14 @@ def _spacing_preflight(func): return wrapped -class ANSIString(unicode): - """ - String-like object that is aware of ANSI codes. - - This isn't especially efficient, as it doesn't really have an - understanding of what the codes mean in order to eliminate - redundant characters, but a proper parser would have to be written for - that. - - Take note of the instructions at the bottom of the module, which modify - this class. - """ - - def __new__(cls, *args, **kwargs): - """ - When creating a new ANSIString, you may use a custom parser that has - the same attributes as the standard one, and you may declare the - string to be handled as already decoded. It is important not to double - decode strings, as escapes can only be respected once. 
- """ - string = to_str(args[0], force_string=True) - if not isinstance(string, basestring): - string = str(string) - parser = kwargs.get('parser', ANSI_PARSER) - decoded = kwargs.get('decoded', False) or hasattr(string, 'raw_string') - if not decoded: - string = parser.parse_ansi(string) - return super(ANSIString, cls).__new__(ANSIString, string, 'utf-8') - - def __repr__(self): - return "ANSIString(%s, decoded=True)" % repr(self.raw_string) - - def __init__(self, *args, **kwargs): - self.parser = kwargs.pop('parser', ANSI_PARSER) - super(ANSIString, self).__init__(*args, **kwargs) - self.raw_string = unicode(self) - self.clean_string = unicode(self.parser.parse_ansi( - self.raw_string, strip_ansi=True), 'utf-8') - self._code_indexes, self._char_indexes = self._get_indexes() - - def __len__(self): - return len(self.clean_string) - - def __add__(self, other): - if not isinstance(other, basestring): - return NotImplemented - return ANSIString(self.raw_string + getattr( - other, 'raw_string', other), decoded=True) - - def __radd__(self, other): - if not isinstance(other, basestring): - return NotImplemented - return ANSIString(getattr( - other, 'raw_string', other) + self.raw_string, decoded=True) - - def __getslice__(self, i, j): - return self.__getitem__(slice(i, j)) - - def _slice(self, item): - slice_indexes = self._char_indexes[item] - if not slice_indexes: - return ANSIString('') - try: - string = self[item.start].raw_string - except IndexError: - return ANSIString('') - last_mark = slice_indexes[0] - for i in slice_indexes[1:]: - for index in range(last_mark, i): - if index in self._code_indexes: - string += self.raw_string[index] - last_mark = i - try: - string += self.raw_string[i] - except IndexError: - pass - return ANSIString(string, decoded=True) - - def __getitem__(self, item): - if isinstance(item, slice): - return self._slice(item) - try: - item = self._char_indexes[item] - except IndexError: - raise IndexError("ANSIString index out of range.") - 
clean = self.raw_string[item] - - result = '' - for index in range(0, item + 1): - if index in self._code_indexes: - result += self.raw_string[index] - return ANSIString(result + clean, decoded=True) - - def rsplit(self, sep=None, maxsplit=None): - return self.split(sep, maxsplit, reverse=True) - - def split(self, sep=None, maxsplit=None, reverse=False): - if hasattr(sep, 'clean_string'): - sep = sep.clean_string - args = [sep] - if maxsplit is not None: - args.append(maxsplit) - if reverse: - parent_result = self.clean_string.rsplit(*args) - else: - parent_result = self.clean_string.split(*args) - current_index = 0 - result = [] - for section in parent_result: - result.append(self[current_index:current_index + len(section)]) - current_index += (len(section)) + len(sep) - return result - - def partition(self, sep, reverse=False): - if hasattr(sep, 'clean_string'): - sep = sep.clean_string - if reverse: - parent_result = self.clean_string.rpartition(sep) - else: - parent_result = self.clean_string.partition(sep) - current_index = 0 - result = tuple() - for section in parent_result: - result += (self[current_index:current_index + len(section)],) - current_index += len(section) - return result - - def _get_indexes(self): - matches = [ - (match.start(), match.end()) - for match in self.parser.ansi_regex.finditer(self.raw_string)] - code_indexes = [] - # These are all the indexes which hold code characters. - for start, end in matches: - code_indexes.extend(range(start, end)) - - if not code_indexes: - # Plain string, no ANSI codes. - return code_indexes, range(0, len(self.raw_string)) - flat_ranges = [] - # We need to get the ones between them, but the code might start at - # the beginning, and there might be codes at the end. - for tup in matches: - flat_ranges.extend(tup) - # Is the beginning of the string a code character? - if flat_ranges[0] == 0: - flat_ranges.pop(0) - else: - flat_ranges.insert(0, 0) - # How about the end? 
- end_index = (len(self.raw_string) - 1) - if flat_ranges[-1] == end_index: - flat_ranges.pop() - else: - flat_ranges.append(end_index) - char_indexes = [] - for start, end in list(group(flat_ranges, 2)): - char_indexes.extend(range(start, end)) - # The end character will be left off if it's a normal character. Fix - # that here. - if end_index in flat_ranges: - char_indexes.append(end_index) - return code_indexes, char_indexes - - @_spacing_preflight - def center(self, width, fillchar, difference): - remainder = difference % 2 - difference /= 2 - spacing = difference * fillchar - result = spacing + self + spacing + (remainder * fillchar) - return result - - @_spacing_preflight - def ljust(self, width, fillchar, difference): - return self + (difference * fillchar) - - @_spacing_preflight - def rjust(self, width, fillchar, difference): - return (difference * fillchar) + self - - def _query_super(func_name): """ Have the string class handle this with the cleaned string instead of ANSIString. """ - def query_func(self, *args, **kwargs): - return getattr(self.clean_string, func_name)(*args, **kwargs) - return query_func + def wrapped(self, *args, **kwargs): + return getattr(self.clean(), func_name)(*args, **kwargs) + return wrapped def _on_raw(func_name): @@ -536,7 +359,7 @@ def _on_raw(func_name): try: string = args.pop(0) if hasattr(string, 'raw_string'): - args.insert(0, string.raw_string) + args.insert(0, string.raw()) else: args.insert(0, string) except IndexError: @@ -566,16 +389,388 @@ def _transform(func_name): return ANSIString(''.join(to_string), decoded=True) return wrapped +class ANSIMeta(type): + """ + Many functions on ANSIString are just light wrappers around the unicode + base class. We apply them here, as part of the class's construction.
+ """ + def __init__(cls, *args, **kwargs): + for func_name in [ + 'count', 'startswith', 'endswith', 'find', 'index', 'isalnum', + 'isalpha', 'isdigit', 'islower', 'isspace', 'istitle', 'isupper', + 'rfind', 'rindex', '__len__']: + setattr(cls, func_name, _query_super(func_name)) + for func_name in [ + '__mul__', '__mod__', 'expandtabs', '__rmul__', 'join', + 'decode', 'replace', 'format']: + setattr(cls, func_name, _on_raw(func_name)) + for func_name in [ + 'capitalize', 'translate', 'lower', 'upper', 'swapcase']: + setattr(cls, func_name, _transform(func_name)) + super(ANSIMeta, cls).__init__(*args, **kwargs) -for func_name in [ - 'count', 'startswith', 'endswith', 'find', 'index', 'isalnum', - 'isalpha', 'isdigit', 'islower', 'isspace', 'istitle', 'isupper', - 'rfind', 'rindex']: - setattr(ANSIString, func_name, _query_super(func_name)) -for func_name in [ - '__mul__', '__mod__', 'expandtabs', '__rmul__', 'join', - 'decode', 'replace', 'format']: - setattr(ANSIString, func_name, _on_raw(func_name)) -for func_name in [ - 'capitalize', 'translate', 'lower', 'upper', 'swapcase']: - setattr(ANSIString, func_name, _transform(func_name)) + +class ANSIString(unicode): + """ + String-like object that is aware of ANSI codes. + + This isn't especially efficient, as it doesn't really have an + understanding of what the codes mean in order to eliminate + redundant characters. This could be made as an enhancement to ANSI_PARSER. + + If one is going to use ANSIString, one should generally avoid converting + away from it until one is about to send information on the wire. This is + because escape sequences in the string may otherwise already be decoded, + and taken literally the second time around. + + Please refer to the Metaclass, ANSIMeta, which is used to apply wrappers + for several of the methods that need not be defined directly here. 
+ """ + __metaclass__ = ANSIMeta + + def __new__(cls, *args, **kwargs): + """ + When creating a new ANSIString, you may use a custom parser that has + the same attributes as the standard one, and you may declare the + string to be handled as already decoded. It is important not to double + decode strings, as escapes can only be respected once. + + If the regexable flag is set, using __getitem__, such as when getting + an index or slicing, will return the result from the raw string. If + this flag is set False, it will intelligently skip ANSI escapes. + + ANSIString('{rHello{g, W{yorld', regexable=True)[0] will return the + first byte of the escape sequence before 'Hello', while + ANSIString('{rHello{g, W{yorld')[0] will return a red 'H'. + + When a regexable ANSIString is sliced, the result is returned as a + non-regexable ANSI String. This ensures that usage of regexable + ANSIStrings is an explicit choice. + + Why all this complication with the regexable flag? + + The reason is that while we are able to subclass the unicode object in + Python, the byte representation of the string in memory cannot be + changed and still exists under the hood. This doesn't matter for things + coded in pure Python, but since Regexes need to be mindful of + performance, the module that handles them operates directly on the + memory representation of the string in order to do matching. It is thus + completely unaware of our customizations to the class. Interestingly, + however, while the re module does its matching on the raw string, it + slices the string using the object's methods. This means that running + a regex on an ANSIString would return matches at bogus indexes, since + the __getitem__ method of ANSIString skips ANSI escape sequences, which + were part of the raw data regex was matching against. + + So, if you need to use regex on an ANSIString, make sure you get it in + regexable mode first, and be ready to deal with a few edge cases. 
+ """ + string = to_str(args[0], force_string=True) + if not isinstance(string, basestring): + string = str(string) + parser = kwargs.get('parser', ANSI_PARSER) + regexable = kwargs.get('regexable', False) + decoded = kwargs.get('decoded', False) or hasattr(string, 'raw_string') + if not decoded: + string = parser.parse_ansi(string) + if isinstance(string, unicode): + string = super(ANSIString, cls).__new__(ANSIString, string) + else: + string = super(ANSIString, cls).__new__(ANSIString, string, 'utf-8') + string._regexable = regexable + return string + + def __repr__(self): + """ + Let's make the repr the command that would actually be used to + construct this object, for convenience and reference. + """ + if self._regexable: + reg = ', regexable=True' + else: + reg = '' + return "ANSIString(%s, decoded=True%s)" % (repr(self._raw_string), reg) + + def __init__(self, *args, **kwargs): + """ + When the ANSIString is first initialized, a few internal variables + have to be set. + + The first is the parser. It is possible to replace Evennia's standard + ANSI parser with one of your own syntax if you wish, so long as it + implements the same interface. + + The second is the _raw_string. It should be noted that the ANSIStrings + are unicode based. This seemed more reasonable than basing it off of + the string class, because if someone were to use a unicode character, + the benefits of knowing the indexes of the ANSI characters would be + negated by the fact that a character within the string might require + more than one byte to be represented. The raw string is, then, a + unicode object rather than a true encoded string. If you need the + encoded string for sending over the wire, try using the .encode() + method. + + The third thing to set is the _clean_string. This is a unicode object + that is devoid of all ANSI Escapes. + + Finally, _code_indexes and _char_indexes are defined. 
These are lookup + tables for which characters in the raw string are related to ANSI + escapes, and which are for the readable text. + """ + self.parser = kwargs.pop('parser', ANSI_PARSER) + super(ANSIString, self).__init__(*args, **kwargs) + self._raw_string = unicode(self) + self._clean_string = unicode(self.parser.parse_ansi( + self._raw_string, strip_ansi=True), 'utf-8') + self._code_indexes, self._char_indexes = self._get_indexes() + + def __add__(self, other): + """ + We have to be careful when adding two strings not to reprocess things + that don't need to be reprocessed, lest we end up with escapes being + interpreted literally. + """ + if not isinstance(other, basestring): + return NotImplemented + return ANSIString(self._raw_string + getattr( + other, 'raw_string', other), decoded=True) + + def __radd__(self, other): + """ + Likewise, if we're on the other end. + """ + if not isinstance(other, basestring): + return NotImplemented + return ANSIString(getattr( + other, 'raw_string', other) + self._raw_string, decoded=True) + + def __getslice__(self, i, j): + """ + This function is deprecated, so we just make it call the proper + function. + """ + return self.__getitem__(slice(i, j)) + + def _slice(self, slc): + """ + This function takes a slice() object. + + Slices have to be handled specially. Not only are they able to specify + a start and end with [x:y], but many forget that they can also specify + an interval with [x:y:z]. As a result, not only do we have to track + the ANSI Escapes that have played before the start of the slice, we + must also replay any in these intervals, should they exist. + + Thankfully, slicing the _char_indexes table gives us the actual + indexes that need slicing in the raw string. We can check between + those indexes to figure out what escape characters need to be + replayed.
+ """ + slice_indexes = self._char_indexes[slc] + if not slice_indexes: + return ANSIString('') + try: + string = self[slc.start]._raw_string + except IndexError: + return ANSIString('') + last_mark = slice_indexes[0] + # Check between the slice intervals for escape sequences. + for i in slice_indexes[1:]: + for index in range(last_mark, i): + if index in self._code_indexes: + string += self._raw_string[index] + last_mark = i + try: + string += self._raw_string[i] + except IndexError: + pass + return ANSIString(string, decoded=True) + + def __getitem__(self, item): + """ + Gateway for slices and getting specific indexes in the ANSIString. If + this is a regexable ANSIString, it will get the data from the raw + string instead, bypassing ANSIString's intelligent escape skipping, + for reasons explained in the __new__ method's docstring. + """ + if self._regexable: + return ANSIString(self._raw_string[item], decoded=True) + if isinstance(item, slice): + # Slices must be handled specially. + return self._slice(item) + try: + item = self._char_indexes[item] + except IndexError: + raise IndexError("ANSIString index out of range.") + + clean = self._raw_string[item] + result = '' + # Get the character they're after, and replay all escape sequences + # previous to it. + for index in range(0, item + 1): + if index in self._code_indexes: + result += self._raw_string[index] + return ANSIString(result + clean, decoded=True) + + def rsplit(self, sep=None, maxsplit=None): + """ + Like split, but from the end of the string, rather than the beginning. + """ + return self.split(sep, maxsplit, reverse=True) + + def split(self, sep=None, maxsplit=None, reverse=False): + """ + Splits in a manner similar to the standard string split method. First, + we split the clean string. Then we measure each section of the result + to figure out where they start and end, and replay any escapes that + would have occurred before that.
+ """ + if hasattr(sep, 'clean_string'): + sep = sep.clean_string + args = [sep] + if maxsplit is not None: + args.append(maxsplit) + if reverse: + parent_result = self._clean_string.rsplit(*args) + else: + parent_result = self._clean_string.split(*args) + # Might be None. + sep = sep or '' + current_index = 0 + result = [] + for section in parent_result: + result.append(self[current_index:current_index + len(section)]) + current_index += (len(section)) + len(sep) + return result + + def clean(self): + """ + Return a unicode object without the ANSI escapes. + """ + return self._clean_string + + def raw(self): + """ + Return a unicode object with the ANSI escapes. + """ + return self._raw_string + + def is_regexable(self): + """ + State whether or not this ANSIString is a 'regexable' ANSIString. + Regexable ANSIStrings return indexes from _raw_string when sliced. + """ + return self._regexable + + def regexable(self): + """ + Return the regexable version of this ANSIString. + """ + return ANSIString(self, decoded=True, regexable=True) + + def non_regexable(self): + """ + Return the non-regexable version of this ANSIString. + """ + return ANSIString(self, decoded=True) + + def partition(self, sep, reverse=False): + """ + Similar to split, but always creates a tuple with three items: + 1. The part before the separator + 2. The separator itself. + 3. The part after. + + We use the same techniques we used in split() to make sure each are + colored. 
+ """ + if hasattr(sep, '_clean_string'): + sep = sep.clean() + if reverse: + parent_result = self._clean_string.rpartition(sep) + else: + parent_result = self._clean_string.partition(sep) + current_index = 0 + result = tuple() + for section in parent_result: + result += (self[current_index:current_index + len(section)],) + current_index += len(section) + return result + + def _get_indexes(self): + """ + Two tables need to be made, one which contains the indexes of all + readable characters, and one which contains the indexes of all ANSI + escapes. It's important to remember that ANSI escapes require more + than one character at a time, though no readable character needs more + than one character, since the unicode base class abstracts that away + from us. However, several readable characters can be placed in a row. + + We must use regexes here to figure out where all the escape sequences + are hiding in the string. Then we use the ranges of their starts and + ends to create a final, comprehensive list of all indexes which are + dedicated to code, and all dedicated to text. + + It's possible that only one of these tables is actually needed, the + other assumed to be what isn't in the first. + """ + matches = [ + (match.start(), match.end()) + for match in self.parser.ansi_regex.finditer(self._raw_string)] + code_indexes = [] + # These are all the indexes which hold code characters. + for start, end in matches: + code_indexes.extend(range(start, end)) + + if not code_indexes: + # Plain string, no ANSI codes. + return code_indexes, range(0, len(self._raw_string)) + flat_ranges = [] + # We need to get the ones between them, but the code might start at + # the beginning, and there might be codes at the end. + for tup in matches: + flat_ranges.extend(tup) + # Is the beginning of the string a code character? + if flat_ranges[0] == 0: + flat_ranges.pop(0) + else: + flat_ranges.insert(0, 0) + # How about the end?
+ end_index = (len(self._raw_string) - 1) + if flat_ranges[-1] == end_index: + flat_ranges.pop() + else: + flat_ranges.append(end_index) + char_indexes = [] + for start, end in list(group(flat_ranges, 2)): + char_indexes.extend(range(start, end)) + # The end character will be left off if it's a normal character. Fix + # that here. + if end_index in flat_ranges: + char_indexes.append(end_index) + return code_indexes, char_indexes + + @_spacing_preflight + def center(self, width, fillchar, difference): + """ + Center some text with some spaces padding both sides. + """ + remainder = difference % 2 + difference /= 2 + spacing = difference * fillchar + result = spacing + self + spacing + (remainder * fillchar) + return result + + @_spacing_preflight + def ljust(self, width, fillchar, difference): + """ + Left justify some text. + """ + return self + (difference * fillchar) + + @_spacing_preflight + def rjust(self, width, fillchar, difference): + """ + Right justify some text. + """ + return (difference * fillchar) + self