1'''
2Simple Diff for Python version 1.0
3
4Annotate two versions of a list with the values that have been
5changed between the versions, similar to unix's `diff` but with
6a dead-simple Python interface.
7
8(C) Paul Butler 2008-2012 <http://www.paulbutler.org/>
9May be used and distributed under the zlib/libpng license
10<http://www.opensource.org/licenses/zlib-license.php>
11'''
12
13__all__ = ['diff', 'string_diff', 'html_diff']
14__version__ = '1.0'
15
16
17def diff(old, new):
18    '''
19    Find the differences between two lists. Returns a list of pairs, where the
20    first value is in ['+','-','='] and represents an insertion, deletion, or
21    no change for that list. The second value of the pair is the list
22    of elements.
23
24    Params:
25        old     the old list of immutable, comparable values (ie. a list
26                of strings)
27        new     the new list of immutable, comparable values
28
29    Returns:
30        A list of pairs, with the first part of the pair being one of three
31        strings ('-', '+', '=') and the second part being a list of values from
32        the original old and/or new lists. The first part of the pair
33        corresponds to whether the list of values is a deletion, insertion, or
34        unchanged, respectively.
35
36    Examples:
37        >>> diff([1,2,3,4],[1,3,4])
38        [('=', [1]), ('-', [2]), ('=', [3, 4])]
39
40        >>> diff([1,2,3,4],[2,3,4,1])
41        [('-', [1]), ('=', [2, 3, 4]), ('+', [1])]
42
43        >>> diff('The quick brown fox jumps over the lazy dog'.split(),
44        ...      'The slow blue cheese drips over the lazy carrot'.split())
45        ... # doctest: +NORMALIZE_WHITESPACE
46        [('=', ['The']),
47         ('-', ['quick', 'brown', 'fox', 'jumps']),
48         ('+', ['slow', 'blue', 'cheese', 'drips']),
49         ('=', ['over', 'the', 'lazy']),
50         ('-', ['dog']),
51         ('+', ['carrot'])]
52
53    '''
54
55    # Create a map from old values to their indices
56    old_index_map = dict()
57    for i, val in enumerate(old):
58        old_index_map.setdefault(val,list()).append(i)
59
60    # Find the largest substring common to old and new.
61    # We use a dynamic programming approach here.
62    #
63    # We iterate over each value in the `new` list, calling the
64    # index `inew`. At each iteration, `overlap[i]` is the
65    # length of the largest suffix of `old[:i]` equal to a suffix
66    # of `new[:inew]` (or unset when `old[i]` != `new[inew]`).
67    #
68    # At each stage of iteration, the new `overlap` (called
69    # `_overlap` until the original `overlap` is no longer needed)
70    # is built from the old one.
71    #
72    # If the length of overlap exceeds the largest substring
73    # seen so far (`sub_length`), we update the largest substring
74    # to the overlapping strings.
75
76    overlap = dict()
77    # `sub_start_old` is the index of the beginning of the largest overlapping
78    # substring in the old list. `sub_start_new` is the index of the beginning
79    # of the same substring in the new list. `sub_length` is the length that
80    # overlaps in both.
81    # These track the largest overlapping substring seen so far, so naturally
82    # we start with a 0-length substring.
83    sub_start_old = 0
84    sub_start_new = 0
85    sub_length = 0
86
87    for inew, val in enumerate(new):
88        _overlap = dict()
89        for iold in old_index_map.get(val,list()):
90            # now we are considering all values of iold such that
91            # `old[iold] == new[inew]`.
92            _overlap[iold] = (iold and overlap.get(iold - 1, 0)) + 1
93            if(_overlap[iold] > sub_length):
94                # this is the largest substring seen so far, so store its
95                # indices
96                sub_length = _overlap[iold]
97                sub_start_old = iold - sub_length + 1
98                sub_start_new = inew - sub_length + 1
99        overlap = _overlap
100
101    if sub_length == 0:
102        # If no common substring is found, we return an insert and delete...
103        return (old and [('-', old)] or []) + (new and [('+', new)] or [])
104    else:
105        # ...otherwise, the common substring is unchanged and we recursively
106        # diff the text before and after that substring
107        return diff(old[ : sub_start_old], new[ : sub_start_new]) + \
108               [('=', new[sub_start_new : sub_start_new + sub_length])] + \
109               diff(old[sub_start_old + sub_length : ],
110                       new[sub_start_new + sub_length : ])
111
112
113def string_diff(old, new):
114    '''
115    Returns the difference between the old and new strings when split on
116    whitespace. Considers punctuation a part of the word
117
118    This function is intended as an example; you'll probably want
119    a more sophisticated wrapper in practice.
120
121    Params:
122        old     the old string
123        new     the new string
124
125    Returns:
126        the output of `diff` on the two strings after splitting them
127        on whitespace (a list of change instructions; see the docstring
128        of `diff`)
129
130    Examples:
131        >>> string_diff('The quick brown fox', 'The fast blue fox')
132        ... # doctest: +NORMALIZE_WHITESPACE
133        [('=', ['The']),
134         ('-', ['quick', 'brown']),
135         ('+', ['fast', 'blue']),
136         ('=', ['fox'])]
137
138    '''
139    return diff(old.split(), new.split())
140
141
142def html_diff(old, new):
143    '''
144    Returns the difference between two strings (as in stringDiff) in
145    HTML format. HTML code in the strings is NOT escaped, so you
146    will get weird results if the strings contain HTML.
147
148    This function is intended as an example; you'll probably want
149    a more sophisticated wrapper in practice.
150
151    Params:
152        old     the old string
153        new     the new string
154
155    Returns:
156        the output of the diff expressed with HTML <ins> and <del>
157        tags.
158
159    Examples:
160        >>> html_diff('The quick brown fox', 'The fast blue fox')
161        'The <del>quick brown</del> <ins>fast blue</ins> fox'
162    '''
163    con = {'=': (lambda x: x),
164           '+': (lambda x: "<ins>" + x + "</ins>"),
165           '-': (lambda x: "<del>" + x + "</del>")}
166    return " ".join([(con[a])(" ".join(b)) for a, b in string_diff(old, new)])
167
168
169def check_diff(old, new):
170    '''
171    This tests that diffs returned by `diff` are valid. You probably won't
172    want to use this function, but it's provided for documentation and
173    testing.
174
175    A diff should satisfy the property that the old input is equal to the
176    elements of the result annotated with '-' or '=' concatenated together.
177    Likewise, the new input is equal to the elements of the result annotated
178    with '+' or '=' concatenated together. This function compares `old`,
179    `new`, and the results of `diff(old, new)` to ensure this is true.
180
181    Tests:
182        >>> check_diff('ABCBA', 'CBABA')
183        >>> check_diff('Foobarbaz', 'Foobarbaz')
184        >>> check_diff('Foobarbaz', 'Boobazbam')
185        >>> check_diff('The quick brown fox', 'Some quick brown car')
186        >>> check_diff('A thick red book', 'A quick blue book')
187        >>> check_diff('dafhjkdashfkhasfjsdafdasfsda', 'asdfaskjfhksahkfjsdha')
188        >>> check_diff('88288822828828288282828', '88288882882828282882828')
189        >>> check_diff('1234567890', '24689')
190    '''
191    old = list(old)
192    new = list(new)
193    result = diff(old, new)
194    _old = [val for (a, vals) in result if (a in '=-') for val in vals]
195    assert old == _old, 'Expected %s, got %s' % (old, _old)
196    _new = [val for (a, vals) in result if (a in '=+') for val in vals]
197    assert new == _new, 'Expected %s, got %s' % (new, _new)
198
199