1*eb8dc403SDave Cobbley''' 2*eb8dc403SDave CobbleySimple Diff for Python version 1.0 3*eb8dc403SDave Cobbley 4*eb8dc403SDave CobbleyAnnotate two versions of a list with the values that have been 5*eb8dc403SDave Cobbleychanged between the versions, similar to unix's `diff` but with 6*eb8dc403SDave Cobbleya dead-simple Python interface. 7*eb8dc403SDave Cobbley 8*eb8dc403SDave Cobbley(C) Paul Butler 2008-2012 <http://www.paulbutler.org/> 9*eb8dc403SDave CobbleyMay be used and distributed under the zlib/libpng license 10*eb8dc403SDave Cobbley<http://www.opensource.org/licenses/zlib-license.php> 11*eb8dc403SDave Cobbley''' 12*eb8dc403SDave Cobbley 13*eb8dc403SDave Cobbley__all__ = ['diff', 'string_diff', 'html_diff'] 14*eb8dc403SDave Cobbley__version__ = '1.0' 15*eb8dc403SDave Cobbley 16*eb8dc403SDave Cobbley 17*eb8dc403SDave Cobbleydef diff(old, new): 18*eb8dc403SDave Cobbley ''' 19*eb8dc403SDave Cobbley Find the differences between two lists. Returns a list of pairs, where the 20*eb8dc403SDave Cobbley first value is in ['+','-','='] and represents an insertion, deletion, or 21*eb8dc403SDave Cobbley no change for that list. The second value of the pair is the list 22*eb8dc403SDave Cobbley of elements. 23*eb8dc403SDave Cobbley 24*eb8dc403SDave Cobbley Params: 25*eb8dc403SDave Cobbley old the old list of immutable, comparable values (ie. a list 26*eb8dc403SDave Cobbley of strings) 27*eb8dc403SDave Cobbley new the new list of immutable, comparable values 28*eb8dc403SDave Cobbley 29*eb8dc403SDave Cobbley Returns: 30*eb8dc403SDave Cobbley A list of pairs, with the first part of the pair being one of three 31*eb8dc403SDave Cobbley strings ('-', '+', '=') and the second part being a list of values from 32*eb8dc403SDave Cobbley the original old and/or new lists. The first part of the pair 33*eb8dc403SDave Cobbley corresponds to whether the list of values is a deletion, insertion, or 34*eb8dc403SDave Cobbley unchanged, respectively. 35*eb8dc403SDave Cobbley 36*eb8dc403SDave Cobbley Examples: 37*eb8dc403SDave Cobbley >>> diff([1,2,3,4],[1,3,4]) 38*eb8dc403SDave Cobbley [('=', [1]), ('-', [2]), ('=', [3, 4])] 39*eb8dc403SDave Cobbley 40*eb8dc403SDave Cobbley >>> diff([1,2,3,4],[2,3,4,1]) 41*eb8dc403SDave Cobbley [('-', [1]), ('=', [2, 3, 4]), ('+', [1])] 42*eb8dc403SDave Cobbley 43*eb8dc403SDave Cobbley >>> diff('The quick brown fox jumps over the lazy dog'.split(), 44*eb8dc403SDave Cobbley ... 'The slow blue cheese drips over the lazy carrot'.split()) 45*eb8dc403SDave Cobbley ... # doctest: +NORMALIZE_WHITESPACE 46*eb8dc403SDave Cobbley [('=', ['The']), 47*eb8dc403SDave Cobbley ('-', ['quick', 'brown', 'fox', 'jumps']), 48*eb8dc403SDave Cobbley ('+', ['slow', 'blue', 'cheese', 'drips']), 49*eb8dc403SDave Cobbley ('=', ['over', 'the', 'lazy']), 50*eb8dc403SDave Cobbley ('-', ['dog']), 51*eb8dc403SDave Cobbley ('+', ['carrot'])] 52*eb8dc403SDave Cobbley 53*eb8dc403SDave Cobbley ''' 54*eb8dc403SDave Cobbley 55*eb8dc403SDave Cobbley # Create a map from old values to their indices 56*eb8dc403SDave Cobbley old_index_map = dict() 57*eb8dc403SDave Cobbley for i, val in enumerate(old): 58*eb8dc403SDave Cobbley old_index_map.setdefault(val,list()).append(i) 59*eb8dc403SDave Cobbley 60*eb8dc403SDave Cobbley # Find the largest substring common to old and new. 61*eb8dc403SDave Cobbley # We use a dynamic programming approach here. 62*eb8dc403SDave Cobbley # 63*eb8dc403SDave Cobbley # We iterate over each value in the `new` list, calling the 64*eb8dc403SDave Cobbley # index `inew`. At each iteration, `overlap[i]` is the 65*eb8dc403SDave Cobbley # length of the largest suffix of `old[:i]` equal to a suffix 66*eb8dc403SDave Cobbley # of `new[:inew]` (or unset when `old[i]` != `new[inew]`). 67*eb8dc403SDave Cobbley # 68*eb8dc403SDave Cobbley # At each stage of iteration, the new `overlap` (called 69*eb8dc403SDave Cobbley # `_overlap` until the original `overlap` is no longer needed) 70*eb8dc403SDave Cobbley # is built from the old one. 71*eb8dc403SDave Cobbley # 72*eb8dc403SDave Cobbley # If the length of overlap exceeds the largest substring 73*eb8dc403SDave Cobbley # seen so far (`sub_length`), we update the largest substring 74*eb8dc403SDave Cobbley # to the overlapping strings. 75*eb8dc403SDave Cobbley 76*eb8dc403SDave Cobbley overlap = dict() 77*eb8dc403SDave Cobbley # `sub_start_old` is the index of the beginning of the largest overlapping 78*eb8dc403SDave Cobbley # substring in the old list. `sub_start_new` is the index of the beginning 79*eb8dc403SDave Cobbley # of the same substring in the new list. `sub_length` is the length that 80*eb8dc403SDave Cobbley # overlaps in both. 81*eb8dc403SDave Cobbley # These track the largest overlapping substring seen so far, so naturally 82*eb8dc403SDave Cobbley # we start with a 0-length substring. 83*eb8dc403SDave Cobbley sub_start_old = 0 84*eb8dc403SDave Cobbley sub_start_new = 0 85*eb8dc403SDave Cobbley sub_length = 0 86*eb8dc403SDave Cobbley 87*eb8dc403SDave Cobbley for inew, val in enumerate(new): 88*eb8dc403SDave Cobbley _overlap = dict() 89*eb8dc403SDave Cobbley for iold in old_index_map.get(val,list()): 90*eb8dc403SDave Cobbley # now we are considering all values of iold such that 91*eb8dc403SDave Cobbley # `old[iold] == new[inew]`. 92*eb8dc403SDave Cobbley _overlap[iold] = (iold and overlap.get(iold - 1, 0)) + 1 93*eb8dc403SDave Cobbley if(_overlap[iold] > sub_length): 94*eb8dc403SDave Cobbley # this is the largest substring seen so far, so store its 95*eb8dc403SDave Cobbley # indices 96*eb8dc403SDave Cobbley sub_length = _overlap[iold] 97*eb8dc403SDave Cobbley sub_start_old = iold - sub_length + 1 98*eb8dc403SDave Cobbley sub_start_new = inew - sub_length + 1 99*eb8dc403SDave Cobbley overlap = _overlap 100*eb8dc403SDave Cobbley 101*eb8dc403SDave Cobbley if sub_length == 0: 102*eb8dc403SDave Cobbley # If no common substring is found, we return an insert and delete... 103*eb8dc403SDave Cobbley return (old and [('-', old)] or []) + (new and [('+', new)] or []) 104*eb8dc403SDave Cobbley else: 105*eb8dc403SDave Cobbley # ...otherwise, the common substring is unchanged and we recursively 106*eb8dc403SDave Cobbley # diff the text before and after that substring 107*eb8dc403SDave Cobbley return diff(old[ : sub_start_old], new[ : sub_start_new]) + \ 108*eb8dc403SDave Cobbley [('=', new[sub_start_new : sub_start_new + sub_length])] + \ 109*eb8dc403SDave Cobbley diff(old[sub_start_old + sub_length : ], 110*eb8dc403SDave Cobbley new[sub_start_new + sub_length : ]) 111*eb8dc403SDave Cobbley 112*eb8dc403SDave Cobbley 113*eb8dc403SDave Cobbleydef string_diff(old, new): 114*eb8dc403SDave Cobbley ''' 115*eb8dc403SDave Cobbley Returns the difference between the old and new strings when split on 116*eb8dc403SDave Cobbley whitespace. Considers punctuation a part of the word 117*eb8dc403SDave Cobbley 118*eb8dc403SDave Cobbley This function is intended as an example; you'll probably want 119*eb8dc403SDave Cobbley a more sophisticated wrapper in practice. 120*eb8dc403SDave Cobbley 121*eb8dc403SDave Cobbley Params: 122*eb8dc403SDave Cobbley old the old string 123*eb8dc403SDave Cobbley new the new string 124*eb8dc403SDave Cobbley 125*eb8dc403SDave Cobbley Returns: 126*eb8dc403SDave Cobbley the output of `diff` on the two strings after splitting them 127*eb8dc403SDave Cobbley on whitespace (a list of change instructions; see the docstring 128*eb8dc403SDave Cobbley of `diff`) 129*eb8dc403SDave Cobbley 130*eb8dc403SDave Cobbley Examples: 131*eb8dc403SDave Cobbley >>> string_diff('The quick brown fox', 'The fast blue fox') 132*eb8dc403SDave Cobbley ... # doctest: +NORMALIZE_WHITESPACE 133*eb8dc403SDave Cobbley [('=', ['The']), 134*eb8dc403SDave Cobbley ('-', ['quick', 'brown']), 135*eb8dc403SDave Cobbley ('+', ['fast', 'blue']), 136*eb8dc403SDave Cobbley ('=', ['fox'])] 137*eb8dc403SDave Cobbley 138*eb8dc403SDave Cobbley ''' 139*eb8dc403SDave Cobbley return diff(old.split(), new.split()) 140*eb8dc403SDave Cobbley 141*eb8dc403SDave Cobbley 142*eb8dc403SDave Cobbleydef html_diff(old, new): 143*eb8dc403SDave Cobbley ''' 144*eb8dc403SDave Cobbley Returns the difference between two strings (as in stringDiff) in 145*eb8dc403SDave Cobbley HTML format. HTML code in the strings is NOT escaped, so you 146*eb8dc403SDave Cobbley will get weird results if the strings contain HTML. 147*eb8dc403SDave Cobbley 148*eb8dc403SDave Cobbley This function is intended as an example; you'll probably want 149*eb8dc403SDave Cobbley a more sophisticated wrapper in practice. 150*eb8dc403SDave Cobbley 151*eb8dc403SDave Cobbley Params: 152*eb8dc403SDave Cobbley old the old string 153*eb8dc403SDave Cobbley new the new string 154*eb8dc403SDave Cobbley 155*eb8dc403SDave Cobbley Returns: 156*eb8dc403SDave Cobbley the output of the diff expressed with HTML <ins> and <del> 157*eb8dc403SDave Cobbley tags. 158*eb8dc403SDave Cobbley 159*eb8dc403SDave Cobbley Examples: 160*eb8dc403SDave Cobbley >>> html_diff('The quick brown fox', 'The fast blue fox') 161*eb8dc403SDave Cobbley 'The <del>quick brown</del> <ins>fast blue</ins> fox' 162*eb8dc403SDave Cobbley ''' 163*eb8dc403SDave Cobbley con = {'=': (lambda x: x), 164*eb8dc403SDave Cobbley '+': (lambda x: "<ins>" + x + "</ins>"), 165*eb8dc403SDave Cobbley '-': (lambda x: "<del>" + x + "</del>")} 166*eb8dc403SDave Cobbley return " ".join([(con[a])(" ".join(b)) for a, b in string_diff(old, new)]) 167*eb8dc403SDave Cobbley 168*eb8dc403SDave Cobbley 169*eb8dc403SDave Cobbleydef check_diff(old, new): 170*eb8dc403SDave Cobbley ''' 171*eb8dc403SDave Cobbley This tests that diffs returned by `diff` are valid. You probably won't 172*eb8dc403SDave Cobbley want to use this function, but it's provided for documentation and 173*eb8dc403SDave Cobbley testing. 174*eb8dc403SDave Cobbley 175*eb8dc403SDave Cobbley A diff should satisfy the property that the old input is equal to the 176*eb8dc403SDave Cobbley elements of the result annotated with '-' or '=' concatenated together. 177*eb8dc403SDave Cobbley Likewise, the new input is equal to the elements of the result annotated 178*eb8dc403SDave Cobbley with '+' or '=' concatenated together. This function compares `old`, 179*eb8dc403SDave Cobbley `new`, and the results of `diff(old, new)` to ensure this is true. 180*eb8dc403SDave Cobbley 181*eb8dc403SDave Cobbley Tests: 182*eb8dc403SDave Cobbley >>> check_diff('ABCBA', 'CBABA') 183*eb8dc403SDave Cobbley >>> check_diff('Foobarbaz', 'Foobarbaz') 184*eb8dc403SDave Cobbley >>> check_diff('Foobarbaz', 'Boobazbam') 185*eb8dc403SDave Cobbley >>> check_diff('The quick brown fox', 'Some quick brown car') 186*eb8dc403SDave Cobbley >>> check_diff('A thick red book', 'A quick blue book') 187*eb8dc403SDave Cobbley >>> check_diff('dafhjkdashfkhasfjsdafdasfsda', 'asdfaskjfhksahkfjsdha') 188*eb8dc403SDave Cobbley >>> check_diff('88288822828828288282828', '88288882882828282882828') 189*eb8dc403SDave Cobbley >>> check_diff('1234567890', '24689') 190*eb8dc403SDave Cobbley ''' 191*eb8dc403SDave Cobbley old = list(old) 192*eb8dc403SDave Cobbley new = list(new) 193*eb8dc403SDave Cobbley result = diff(old, new) 194*eb8dc403SDave Cobbley _old = [val for (a, vals) in result if (a in '=-') for val in vals] 195*eb8dc403SDave Cobbley assert old == _old, 'Expected %s, got %s' % (old, _old) 196*eb8dc403SDave Cobbley _new = [val for (a, vals) in result if (a in '=+') for val in vals] 197*eb8dc403SDave Cobbley assert new == _new, 'Expected %s, got %s' % (new, _new) 198*eb8dc403SDave Cobbley 199