feat(parser/diff): patience diff from rogpeppe/go-internal

2025-03-28 06:50:37 -04:00 · 2025-03-28 06:50:37 -04:00 · a3a0d99be6
commit a3a0d99be6
parent e644b180a9
5 changed files with 241 additions and 228 deletions
--- a/internal/parser/diff/diff.go
+++ b/internal/parser/diff/diff.go
@@ -1,10 +1,5 @@
 package diff

-import (
-	"fmt"
-	"strings"
-)
-
 // compareStrings compares two strings character by character, optionally ignoring whitespace.
 func compareStrings(str1, str2 string, compareSpace bool) bool {
 	if compareSpace {
@@ -49,33 +44,3 @@ func isWhitespace(b byte) bool {
 		b == 0x85 ||
 		b == 0xA0
 }
-
-func formatDiff(oldList []string, newList []string, ops []Op[string]) string {
-	var result []string
-	i, j := 0, 0
-	for _, op := range ops {
-		if op.OpType == OpDelete {
-			for i < op.OldPos {
-				result = append(result, "  "+oldList[i])
-				i++
-				j++
-			}
-			result = append(result, "- "+fmt.Sprint(op.Elem))
-			i++
-		} else if op.OpType == OpInsert {
-			for j < op.NewPos {
-				result = append(result, "  "+newList[j])
-				i++
-				j++
-			}
-			result = append(result, "+ "+fmt.Sprint(op.Elem))
-			j++
-		}
-	}
-	for i < len(oldList) && j < len(newList) {
-		result = append(result, "  "+oldList[i])
-		i++
-		j++
-	}
-	return strings.Join(result, "\n")
-}
--- a/internal/parser/diff/myers.go
+++ b/internal/parser/diff/myers.go
@@ -1,134 +0,0 @@
-package diff
-
-// source: https://github.com/MFAshby/myers
-// Myer's diff algorithm in golang
-// Ported from https://blog.robertelder.org/diff-algorithm/
-
-type OpType int
-
-const (
-	OpInsert OpType = iota
-	OpDelete
-)
-
-type Op[T any] struct {
-	OpType OpType // Insert or delete, as above
-	OldPos int    // Position in the old list of item to be inserted or deleted
-	NewPos int    // Position in the _new_ list of item to be inserted
-	Elem   T      // Actual value to be inserted or deleted
-}
-
-// Returns a minimal list of differences between 2 lists e and f
-// requiring O(min(len(e),len(f))) space and O(min(len(e),len(f)) * D)
-// worst-case execution time where D is the number of differences.
-func myersDiff[T any](e, f []T, equals func(T, T) bool) []Op[T] {
-	return diffInternal(e, f, equals, 0, 0)
-}
-
-func diffInternal[T any](e, f []T, equals func(T, T) bool, i, j int) []Op[T] {
-	N := len(e)
-	M := len(f)
-	L := N + M
-	Z := 2*min(N, M) + 2
-	switch {
-	case N > 0 && M > 0:
-		w := N - M
-		g := make([]int, Z)
-		p := make([]int, Z)
-
-		hMax := L/2 + L%2 + 1
-		for h := range hMax {
-			for r := range 2 {
-				var c, d []int
-				var o, m int
-				if r == 0 {
-					c = g
-					d = p
-					o = 1
-					m = 1
-				} else {
-					c = p
-					d = g
-					o = 0
-					m = -1
-				}
-				kMin := -(h - 2*max(0, h-M))
-				kMax := h - 2*max(0, h-N) + 1
-				for k := kMin; k < kMax; k += 2 {
-					var a int
-					if k == -h || k != h && c[pyMod((k-1), Z)] < c[pyMod((k+1), Z)] {
-						a = c[pyMod((k+1), Z)]
-					} else {
-						a = c[pyMod((k-1), Z)] + 1
-					}
-					b := a - k
-					s, t := a, b
-
-					for a < N && b < M && equals(e[(1-o)*N+m*a+(o-1)], f[(1-o)*M+m*b+(o-1)]) {
-						a, b = a+1, b+1
-					}
-					c[pyMod(k, Z)] = a
-					z := -(k - w)
-					if pyMod(L, 2) == o && z >= -(h-o) && z <= h-o && c[pyMod(k, Z)]+d[pyMod(z, Z)] >= N {
-						var D, x, y, u, v int
-						if o == 1 {
-							D = 2*h - 1
-							x = s
-							y = t
-							u = a
-							v = b
-						} else {
-							D = 2 * h
-							x = N - a
-							y = M - b
-							u = N - s
-							v = M - t
-						}
-						switch {
-						case D > 1 || (x != u && y != v):
-							return append(diffInternal(e[0:x], f[0:y], equals, i, j), diffInternal(e[u:N], f[v:M], equals, i+u, j+v)...)
-						case M > N:
-							return diffInternal(make([]T, 0), f[N:M], equals, i+N, j+N)
-						case M < N:
-							return diffInternal(e[M:N], make([]T, 0), equals, i+M, j+M)
-						default:
-							return make([]Op[T], 0)
-						}
-					}
-				}
-			}
-		}
-	case N > 0:
-		res := make([]Op[T], N)
-		for n := range N {
-			res[n] = Op[T]{OpDelete, i + n, -1, e[n]}
-		}
-		return res
-	default:
-		res := make([]Op[T], M)
-		for n := range M {
-			res[n] = Op[T]{OpInsert, i, j + n, f[n]}
-		}
-		return res
-	}
-	panic("Should never hit this!")
-}
-
-/**
- * The remainder op in python always matches the sign of the _denominator_
- * e.g -1%3 = 2.
- * In golang it matches the sign of the numerator.
- * See https://en.wikipedia.org/wiki/Modulo_operation#Variants_of_the_definition
- */
-func pyMod(x, y int) int {
-	return (x%y + y) % y
-}
-
-// Let us map element in same way as in
-
-// Convenient wrapper for string lists
-func myersDiffStr(e, f []string, compareSpace bool) []Op[string] {
-	return myersDiff[string](e, f, func(s1, s2 string) bool {
-		return compareStrings(s1, s2, compareSpace)
-	})
-}
--- a/internal/parser/diff/myers_test.go
+++ b/internal/parser/diff/myers_test.go
@@ -1,49 +0,0 @@
-package diff
-
-import (
-	"reflect"
-	t "testing"
-)
-
-type TestCase struct {
-	l1  []string
-	l2  []string
-	exp []Op[string]
-}
-
-func TestDiff(t *t.T) {
-	A := "A"
-	B := "B"
-	C := "C"
-	testCases := []TestCase{
-		{[]string{}, []string{}, []Op[string]{}},
-		{[]string{}, []string{"foo"}, []Op[string]{{OpInsert, 0, 0, "foo"}}},
-		{[]string{"foo"}, []string{}, []Op[string]{{OpDelete, 0, -1, "foo"}}},
-		{[]string{"foo", "bar", "baz"}, []string{"foo", "bar", "baz"}, []Op[string]{}},
-		{[]string{"foo", "bar", "baz"}, []string{"foo", "baz"}, []Op[string]{{OpDelete, 1, -1, "bar"}}},
-		{[]string{"baz"}, []string{"foo", "baz"}, []Op[string]{{OpInsert, 0, 0, "foo"}}},
-		{[]string{"bar", "baz"}, []string{"foo", "baz"}, []Op[string]{{OpDelete, 0, -1, "bar"}, {OpInsert, 1, 0, "foo"}}},
-		{[]string{"foo", "bar", "baz"}, []string{"foo", "bar"}, []Op[string]{{OpDelete, 2, -1, "baz"}}},
-		{
-			[]string{A, B, C, A, B, B, A},
-			[]string{C, B, A, B, A, C},
-			[]Op[string]{{OpDelete, 0, -1, A}, {OpInsert, 1, 0, C}, {OpDelete, 2, -1, C}, {OpDelete, 5, -1, B}, {OpInsert, 7, 5, C}},
-		},
-		{
-			[]string{C, A, B, A, B, A, B, A, B, A, B, A, B, C},
-			[]string{B, A, B, A, B, A, B, A, B, A, B, A, B, A},
-			[]Op[string]{{OpDelete, 0, -1, C}, {OpInsert, 1, 0, B}, {OpDelete, 13, -1, C}, {OpInsert, 14, 13, A}},
-		},
-		{
-			[]string{B},
-			[]string{A, B, C, B, A},
-			[]Op[string]{{OpInsert, 0, 0, A}, {OpInsert, 0, 1, B}, {OpInsert, 0, 2, C}, {OpInsert, 1, 4, A}},
-		},
-	}
-	for _, c := range testCases {
-		act := myersDiffStr(c.l1, c.l2, true)
-		if !reflect.DeepEqual(c.exp, act) {
-			t.Errorf("Failed diff, expected %v actual %v\n", c.exp, act)
-		}
-	}
-}
--- a/internal/parser/diff/parser.go
+++ b/internal/parser/diff/parser.go
@@ -80,16 +80,8 @@ func (*Diff) Run(results []stage.ExecutorResult, confAny any) (
 							resultStr = resultStr[:output.MaxDiffLength]
 							truncated = true
 						}
-						answerLines := strings.Split(answerStr, "\n")
-						resultLines := strings.Split(resultStr, "\n")
-						// Generate Myers diff
-						diffOps := myersDiffStr(answerLines, resultLines,
-							output.CompareSpace)
-						// Generate diff block with surrounding context
-						diffOutput := formatDiff(
-							answerLines,
-							resultLines,
-							diffOps,
+						diffOutput := patienceDiff(
+							answerStr, resultStr, output.CompareSpace,
 						)
 						diffOutput = strings.TrimSuffix(diffOutput, "\n  ")
 						if truncated {
--- a/internal/parser/diff/patience.go
+++ b/internal/parser/diff/patience.go
@@ -0,0 +1,239 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package diff
+
+// modified from https://github.com/rogpeppe/go-internal/blob/master/diff/diff.go
+
+import (
+	"bytes"
+	"sort"
+	"strings"
+)
+
+// A pair is a pair of values tracked for both the x and y side of a diff.
+// It is typically a pair of line indexes.
+type pair struct{ x, y int }
+
+// patienceDiff returns an anchored diff of the two texts old and new,
+// one line per entry prefixed with "- ", "+ " or "  " and with no "@@" hunk
+// headers. If old and new are identical, it returns an empty string.
+//
+// Unix diff implementations typically look for a diff with
+// the smallest number of lines inserted and removed,
+// which can in the worst case take time quadratic in the
+// number of lines in the texts. As a result, many implementations
+// either can be made to run for a long time or cut off the search
+// after a predetermined amount of work.
+//
+// In contrast, this implementation looks for a diff with the
+// smallest number of “unique” lines inserted and removed,
+// where unique means a line that appears just once in both old and new.
+// We call this an “anchored diff” because the unique lines anchor
+// the chosen matching regions. An anchored diff is usually clearer
+// than a standard diff, because the algorithm does not try to
+// reuse unrelated blank lines or closing braces.
+// The algorithm also guarantees to run in O(n log n) time
+// instead of the standard O(n²) time.
+//
+// Some systems call this approach a “patience diff,” named for
+// the “patience sorting” algorithm, itself named for a solitaire card game.
+// We avoid that name for two reasons. First, the name has been used
+// for a few different variants of the algorithm, so it is imprecise.
+// Second, the name is frequently interpreted as meaning that you have
+// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
+// when in fact the algorithm is faster than the standard one.
+func patienceDiff(old, new string, compareSpace bool) string {
+	x := strings.SplitAfter(old, "\n")
+	y := strings.SplitAfter(new, "\n")
+
+	// Buffer that accumulates the diff output (no file header is printed).
+	var out bytes.Buffer
+
+	// Loop over matches to consider,
+	// expanding each match to include surrounding lines,
+	// and then printing diff chunks.
+	// To avoid setup/teardown cases outside the loop,
+	// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
+	// in the sequence of matches.
+	var (
+		done  pair     // printed up to x[:done.x] and y[:done.y]
+		chunk pair     // start lines of current chunk
+		count pair     // number of lines from each side in current chunk
+		ctext []string // lines for current chunk
+	)
+	for _, m := range tgs(x, y) {
+		if m.x < done.x {
+			// Already handled scanning forward from earlier match.
+			continue
+		}
+
+		// Expand matching lines as far as possible,
+		// establishing that x[start.x:end.x] == y[start.y:end.y].
+		// Note that on the first (or last) iteration we may (or definitely do)
+		// have an empty match: start.x==end.x and start.y==end.y.
+		start := m
+		for start.x > done.x && start.y > done.y && compareStrings(x[start.x-1], y[start.y-1], compareSpace) {
+			start.x--
+			start.y--
+		}
+		end := m
+		for end.x < len(x) && end.y < len(y) && compareStrings(x[end.x], y[end.y], compareSpace) {
+			end.x++
+			end.y++
+		}
+
+		// Emit the mismatched lines before start into this chunk.
+		// (No effect on first sentinel iteration, when start = {0,0}.)
+		for _, s := range x[done.x:start.x] {
+			ctext = append(ctext, "- "+s)
+			count.x++
+		}
+		for _, s := range y[done.y:start.y] {
+			ctext = append(ctext, "+ "+s)
+			count.y++
+		}
+
+		// If we're not at EOF and have too few common lines,
+		// the chunk includes all the common lines and continues.
+		const C = 3 // number of context lines
+		if (end.x < len(x) || end.y < len(y)) &&
+			(end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
+			for _, s := range x[start.x:end.x] {
+				ctext = append(ctext, "  "+s)
+				count.x++
+				count.y++
+			}
+			done = end
+			continue
+		}
+
+		// End chunk with common lines for context.
+		if len(ctext) > 0 {
+			n := min(end.x-start.x, C)
+			for _, s := range x[start.x : start.x+n] {
+				ctext = append(ctext, "  "+s)
+				count.x++
+				count.y++
+			}
+			done = pair{start.x + n, start.y + n}
+
+			// Format and emit chunk.
+			// Convert line numbers to 1-indexed.
+			// Special case: empty file shows up as 0,0 not 1,0.
+			if count.x > 0 {
+				chunk.x++
+			}
+			if count.y > 0 {
+				chunk.y++
+			}
+			// We do not need this line
+			// fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
+			for _, s := range ctext {
+				out.WriteString(s)
+			}
+			count.x = 0
+			count.y = 0
+			ctext = ctext[:0]
+		}
+
+		// If we reached EOF, we're done.
+		if end.x >= len(x) && end.y >= len(y) {
+			break
+		}
+
+		// Otherwise start a new chunk.
+		chunk = pair{end.x - C, end.y - C}
+		for _, s := range x[chunk.x:end.x] {
+			ctext = append(ctext, "  "+s)
+			count.x++
+			count.y++
+		}
+		done = end
+	}
+
+	return out.String()
+}
+
+// tgs returns the pairs of indexes of the longest common subsequence
+// of unique lines in x and y, where a unique line is one that appears
+// once in x and once in y.
+//
+// The longest common subsequence algorithm is as described in
+// Thomas G. Szymanski, “A Special Case of the Maximal Common
+// Subsequence Problem,” Princeton TR #170 (January 1975),
+// available at https://research.swtch.com/tgs170.pdf.
+func tgs(x, y []string) []pair {
+	// Count the number of times each string appears in x and y.
+	// We only care about 0, 1, many, counted as 0, -1, -2
+	// for the x side and 0, -4, -8 for the y side.
+	// Using negative numbers now lets us distinguish positive line numbers later.
+	m := make(map[string]int)
+	for _, s := range x {
+		if c := m[s]; c > -2 {
+			m[s] = c - 1
+		}
+	}
+	for _, s := range y {
+		if c := m[s]; c > -8 {
+			m[s] = c - 4
+		}
+	}
+
+	// Now unique strings can be identified by m[s] = -1+-4.
+	//
+	// Gather the indexes of those strings in x and y, building:
+	//	xi[i] = increasing indexes of unique strings in x.
+	//	yi[i] = increasing indexes of unique strings in y.
+	//	inv[i] = index j such that x[xi[i]] = y[yi[j]].
+	var xi, yi, inv []int
+	for i, s := range y {
+		if m[s] == -1+-4 {
+			m[s] = len(yi)
+			yi = append(yi, i)
+		}
+	}
+	for i, s := range x {
+		if j, ok := m[s]; ok && j >= 0 {
+			xi = append(xi, i)
+			inv = append(inv, j)
+		}
+	}
+
+	// Apply Algorithm A from Szymanski's paper.
+	// In those terms, A = J = inv and B = [0, n).
+	// We add sentinel pairs {0,0}, and {len(x),len(y)}
+	// to the returned sequence, to help the processing loop.
+	J := inv
+	n := len(xi)
+	T := make([]int, n)
+	L := make([]int, n)
+	for i := range T {
+		T[i] = n + 1
+	}
+	for i := range n {
+		k := sort.Search(n, func(k int) bool {
+			return T[k] >= J[i]
+		})
+		T[k] = J[i]
+		L[i] = k + 1
+	}
+	k := 0
+	for _, v := range L {
+		if k < v {
+			k = v
+		}
+	}
+	seq := make([]pair, 2+k)
+	seq[1+k] = pair{len(x), len(y)} // sentinel at end
+	lastj := n
+	for i := n - 1; i >= 0; i-- {
+		if L[i] == k && J[i] < lastj {
+			seq[k] = pair{xi[i], yi[J[i]]}
+			k--
+		}
+	}
+	seq[0] = pair{0, 0} // sentinel at start
+	return seq
+}