# -*- coding: utf-8 -*-
import re
import unittest

from common import escape, is_whitespace, unescape


class AccountedString(object):
    """ Retain full accountability of edits made to a string, by recording
    a series of edit actions (either deletions or insertions).

    The pre-edit text is kept untouched in ``_pre``; the post-edit text is
    produced on demand by replaying ``actions`` (see ``__unicode__``)."""

    def __init__(self, pre, config, encoding='ascii'):
        """ initialise with pre as the pre-edit version.

        If pre is not a unicode string, then it will be converted into
        one, assuming the specified encoding. config is stored as-is on
        the instance; nothing in this class reads it.  Construction runs
        the _build hook and then normalises whitespace."""
        super(AccountedString, self).__init__()
        self._pre = pre if isinstance(pre, unicode) else unicode(pre, encoding)
        self.config = config
        self.actions = list()
        self._build()
        self._normalise_whitespace()

    def __unicode__(self):
        """ create a unicode version of the string with edits applied """
        return unicode(_PostString(self))

    def _build(self):
        """ hook called by __init__ before whitespace normalisation; does
        nothing here (presumably overridden by subclasses that register
        their own actions -- confirm against callers) """
        pass

    @staticmethod
    def _leading_whitespace(start, statuses):
        """ True if every position before start is marked deleted ('d'),
        i.e. nothing survives in front of the range starting at start """
        return all(status == 'd' for status in statuses[:start])

    def _make_statuses(self):
        """ build one status flag per character of the pre-edit string:

        'w' whitespace (per common.is_whitespace), 'r' any other retained
        character, 'd' covered by a Deletion, 'i' covered by the
        non-whitespace part of an Insertion.  Actions are applied in the
        order they appear in self.actions, later ones overwriting earlier
        ones. """
        statuses = ['w' if is_whitespace(char) else 'r'
                    for char in self._pre]
        limit = len(statuses)
        for action in self.actions:
            start = action.index
            # clamp to the string: _PostString tolerates actions that run
            # up to / past the end, so this must not raise IndexError
            stop = min(start + len(action), limit)
            if isinstance(action, Deletion):
                for pos in range(start, stop):
                    statuses[pos] = 'd'
            elif isinstance(action, Insertion):
                text = unicode(action)
                for pos in range(start, stop):
                    if is_whitespace(text[pos - start]):
                        statuses[pos] = 'w'
                    else:
                        statuses[pos] = 'i'
        return statuses

    def _normalise_whitespace(self, call_again=True):
        """ register the actions needed to strip leading/trailing
        whitespace and to reduce every other whitespace run to one ' '.

        A run longer than one character is cut down to its last
        character on the first pass; the second pass (call_again) then
        replaces that survivor with ' ' if it is some other whitespace
        character.  self.actions is left sorted. """
        statuses = self._make_statuses()
        for start, end in self._whitespace_ranges(statuses):
            if (self._leading_whitespace(start, statuses)
                    or self._trailing_whitespace(end, statuses)):
                # nothing survives on one side: drop the whole run
                self.register(Deletion(start, length=end - start + 1))
            elif start != end:
                # keep only the last character of the run
                self.register(Deletion(start, length=end - start))
            elif self._pre[start] != ' ':
                # single non-space whitespace: swap it for a space
                self.register(Deletion(start, length=1))
                self.register(Insertion(start, ' '))
        self.actions = sorted(self.actions)
        if call_again:
            self._normalise_whitespace(False)

    @staticmethod
    def _trailing_whitespace(end, statuses):
        """ True if every position after end is marked deleted ('d') """
        return all(status == 'd' for status in statuses[end + 1:])

    @staticmethod
    def _whitespace_ranges(statuses):
        """ yield (start, end) pairs, both inclusive, for each maximal run
        of 'w' entries in statuses """
        start = None
        for i, status in enumerate(statuses):
            if status == 'w':
                if start is None:
                    start = i
            elif start is not None:
                yield start, i - 1
                start = None
        if start is not None:
            # the final run reaches the end of the string
            yield start, len(statuses) - 1

    def register(self, action):
        """ register an action to apply to the string """
        assert isinstance(action, Action)
        self.actions.append(action)

    @staticmethod
    def revert(post_string, actions):
        """ apply each action's revert() to post_string, in sorted order,
        and return the result """
        pre_string = post_string
        for action in sorted(actions):
            pre_string = action.revert(pre_string)
        return pre_string


class _PostString(object):
    """ creates a post-edit version of an accounted string """

    def __init__(self, account):
        """ replay account.actions (sorted) over account._pre.

        Actions read and update two attributes of this object while they
        execute: _i (current location in the pre-edit version) and
        _string (the post-edit version built so far). """
        super(_PostString, self).__init__()
        # store the pre-edit version so actions may refer to it
        self._pre = account._pre
        if not account.actions:
            # if there are no actions then just use the pre-edit version
            self._string = account._pre
            return
        # _i tracks the location in the pre-edit version
        self._i = 0
        # _string holds the constructed post-edit version
        self._string = u''
        queue = sorted(account.actions)
        end = len(self._pre)
        pos = 0
        while self._i < end and pos < len(queue):
            action = queue[pos]
            if self._i == action.index:
                action.execute(self)
                pos += 1
            else:
                # no action to take here - copy the current
                # character, and move _i on to the next
                self._string += self._pre[self._i]
                self._i += 1
        # done with in-range edits - copy the rest of the pre-edit
        # version (empty if the loop already consumed it)
        self._string += self._pre[self._i:]
        # insertions anchored exactly at the end of the pre-edit version
        # are never reached by the loop above; apply them here so that
        # appended text is not silently dropped
        for action in queue[pos:]:
            if isinstance(action, Insertion) and action.index == end:
                action.execute(self)

    def __unicode__(self):
        """ the post-edit text, with newlines replaced by spaces """
        return self._string.replace('\n', ' ')


class Action(object):
    """ an abstract class for deletions and insertions """

    # serialised form: @<index><symbol>"<escaped content>"
    FORMAT = u'@{index}{symbol}"{content}"'
    # content may be empty so that tostring() output always parses back
    PATTERN = re.compile(r'@(\d+)([\+\-])"(.*)"$', re.UNICODE)

    def __init__(self, index, content=u''):
        """ index is the location in the pre-edit string; content is the
        text inserted or deleted there """
        super(Action, self).__init__()
        self.index = index
        self.content = content

    def __cmp__(self, other):
        """ order actions by index---if the indices are the same then
        order insertions first.

        NOTE: this never returns 0, so under Python 2 no two actions
        compare equal with ==. """
        if self.index != other.index:
            return -1 if self.index < other.index else 1
        return -1 if isinstance(self, Insertion) else 1

    def __len__(self):
        """ number of characters of content """
        return len(self.content)

    def __unicode__(self):
        return self.content

    @staticmethod
    def _get_action_type(symbol):
        """ return the direct Action subclass whose SYMBOL equals symbol,
        or None if there is none """
        for action_type in Action.__subclasses__():
            if symbol == action_type.SYMBOL:
                return action_type
        return None

    def execute(self, post_string):
        """ apply this action to a _PostString under construction """
        raise NotImplementedError

    @staticmethod
    def fromstring(string):
        """ parse the form produced by tostring() back into an action.

        Raises ValueError if string does not match PATTERN or names an
        unknown action symbol. """
        match = Action.PATTERN.match(string)
        if match is None:
            raise ValueError('malformed action string: %r' % (string,))
        index = int(match.group(1))
        action_type = Action._get_action_type(match.group(2))
        if action_type is None:
            raise ValueError('unknown action symbol in: %r' % (string,))
        content = unescape(match.group(3))
        return action_type(index, content)

    def revert(self, post_string):
        """ undo this action on a plain string and return the result """
        raise NotImplementedError

    def tostring(self):
        """ serialise as FORMAT, passing the content through
        common.escape """
        return Action.FORMAT.format(index=self.index,
                                    symbol=self.SYMBOL,
                                    content=escape(self.content))


class Deletion(Action):
    """ removal of len(self) characters starting at index """

    SYMBOL = u'-'

    def __init__(self, index, content=None, length=None):
        """ give either the deleted content or just its length; at least
        one of the two is required """
        assert not (content is None and length is None)
        super(Deletion, self).__init__(index)
        if content is not None:
            self.content = content
        else:
            # create a substring of wildcards because we don't
            # know the actual substring yet
            self.content = ' ' * length

    def execute(self, post):
        # start the deletion at the current index
        start = post._i
        # move the index tracker to just beyond the deletion,
        # so that characters skipped will not be copied across
        # by the _PostString constructor
        post._i += len(self)
        # record what was deleted so that it can be reverted later
        self.content = post._pre[start:post._i]

    def revert(self, post):
        """ return post with the deleted content put back at index """
        start = self.index
        return post[:start] + self.content + post[start:]


class Insertion(Action):
    """ addition of content at index """

    SYMBOL = u'+'

    def execute(self, post):
        # copy the insertion to the _PostString in progress
        post._string += unicode(self)

    def revert(self, post):
        """ return post with len(self) characters removed, starting at
        index """
        start = self.index
        end = start + len(self)
        return post[:start] + post[end:]


class _TestAccounting(unittest.TestCase):
    simple = u'The quick fox.'

    def assertReverted(self, accounted_string):
        """ undo all insertions, then all deletions, on the post-edit
        text and check that the pre-edit text comes back """
        reverted = unicode(accounted_string)
        for action_type in (Insertion, Deletion):
            for action in accounted_string.actions:
                if isinstance(action, action_type):
                    reverted = action.revert(reverted)
        self.assertEqual(accounted_string._pre, reverted)

    def test_unmodified(self):
        test = AccountedString(self.simple, None)
        self.assertEqual(self.simple, unicode(test))

    def test_insert(self):
        test = AccountedString(self.simple, None)
        # index 10 is the 'f' of 'fox.'
        test.register(Insertion(10, 'brown '))
        self.assertEqual(u'The quick brown fox.', unicode(test))
        self.assertReverted(test)

    def test_insert_at_end(self):
        test = AccountedString(u'The quick fox', None)
        # index 13 == len(pre): the insertion is an append
        test.register(Insertion(13, '.'))
        self.assertEqual(u'The quick fox.', unicode(test))
        self.assertReverted(test)

    def test_deletion(self):
        test = AccountedString(u'The quick fox.', None)
        test.register(Deletion(4, content=''))
        test.register(Deletion(12, content=''))
        self.assertEqual(u'The quick fox.', unicode(test))
        self.assertReverted(test)

    def test_mix(self):
        test = AccountedString(u'This should do it: sudo reboot!', None)
        # 'sudo reboot' occupies indices 19-29 (11 characters)
        test.register(Deletion(19, length=11))
        test.register(Insertion(19, ''))
        self.assertEqual(u'This should do it: !', unicode(test))
        self.assertReverted(test)

    def test_normalisation(self):
        test = AccountedString(u'blah\nblah', None)
        test._normalise_whitespace()
        self.assertEqual(u'blah blah', unicode(test))