From a3a0d99be65c43ad6fc4a59f819e2ee0b35c57e8 Mon Sep 17 00:00:00 2001 From: Boming Zhang Date: Fri, 28 Mar 2025 06:50:37 -0400 Subject: [PATCH] feat(parser/diff): patience diff from rogpeppe/go-internal --- internal/parser/diff/diff.go | 35 ----- internal/parser/diff/myers.go | 134 ---------------- internal/parser/diff/myers_test.go | 49 ------ internal/parser/diff/parser.go | 12 +- internal/parser/diff/patience.go | 239 +++++++++++++++++++++++++++++ 5 files changed, 241 insertions(+), 228 deletions(-) delete mode 100644 internal/parser/diff/myers.go delete mode 100644 internal/parser/diff/myers_test.go create mode 100644 internal/parser/diff/patience.go diff --git a/internal/parser/diff/diff.go b/internal/parser/diff/diff.go index 3e22e8b..25f0695 100644 --- a/internal/parser/diff/diff.go +++ b/internal/parser/diff/diff.go @@ -1,10 +1,5 @@ package diff -import ( - "fmt" - "strings" -) - // compareStrings compares two strings character by character, optionally ignoring whitespace. func compareStrings(str1, str2 string, compareSpace bool) bool { if compareSpace { @@ -49,33 +44,3 @@ func isWhitespace(b byte) bool { b == 0x85 || b == 0xA0 } - -func formatDiff(oldList []string, newList []string, ops []Op[string]) string { - var result []string - i, j := 0, 0 - for _, op := range ops { - if op.OpType == OpDelete { - for i < op.OldPos { - result = append(result, " "+oldList[i]) - i++ - j++ - } - result = append(result, "- "+fmt.Sprint(op.Elem)) - i++ - } else if op.OpType == OpInsert { - for j < op.NewPos { - result = append(result, " "+newList[j]) - i++ - j++ - } - result = append(result, "+ "+fmt.Sprint(op.Elem)) - j++ - } - } - for i < len(oldList) && j < len(newList) { - result = append(result, " "+oldList[i]) - i++ - j++ - } - return strings.Join(result, "\n") -} diff --git a/internal/parser/diff/myers.go b/internal/parser/diff/myers.go deleted file mode 100644 index a52a6a8..0000000 --- a/internal/parser/diff/myers.go +++ /dev/null @@ -1,134 +0,0 @@ -package diff 
- -// source: https://github.com/MFAshby/myers -// Myer's diff algorithm in golang -// Ported from https://blog.robertelder.org/diff-algorithm/ - -type OpType int - -const ( - OpInsert OpType = iota - OpDelete -) - -type Op[T any] struct { - OpType OpType // Insert or delete, as above - OldPos int // Position in the old list of item to be inserted or deleted - NewPos int // Position in the _new_ list of item to be inserted - Elem T // Actual value to be inserted or deleted -} - -// Returns a minimal list of differences between 2 lists e and f -// requiring O(min(len(e),len(f))) space and O(min(len(e),len(f)) * D) -// worst-case execution time where D is the number of differences. -func myersDiff[T any](e, f []T, equals func(T, T) bool) []Op[T] { - return diffInternal(e, f, equals, 0, 0) -} - -func diffInternal[T any](e, f []T, equals func(T, T) bool, i, j int) []Op[T] { - N := len(e) - M := len(f) - L := N + M - Z := 2*min(N, M) + 2 - switch { - case N > 0 && M > 0: - w := N - M - g := make([]int, Z) - p := make([]int, Z) - - hMax := L/2 + L%2 + 1 - for h := range hMax { - for r := range 2 { - var c, d []int - var o, m int - if r == 0 { - c = g - d = p - o = 1 - m = 1 - } else { - c = p - d = g - o = 0 - m = -1 - } - kMin := -(h - 2*max(0, h-M)) - kMax := h - 2*max(0, h-N) + 1 - for k := kMin; k < kMax; k += 2 { - var a int - if k == -h || k != h && c[pyMod((k-1), Z)] < c[pyMod((k+1), Z)] { - a = c[pyMod((k+1), Z)] - } else { - a = c[pyMod((k-1), Z)] + 1 - } - b := a - k - s, t := a, b - - for a < N && b < M && equals(e[(1-o)*N+m*a+(o-1)], f[(1-o)*M+m*b+(o-1)]) { - a, b = a+1, b+1 - } - c[pyMod(k, Z)] = a - z := -(k - w) - if pyMod(L, 2) == o && z >= -(h-o) && z <= h-o && c[pyMod(k, Z)]+d[pyMod(z, Z)] >= N { - var D, x, y, u, v int - if o == 1 { - D = 2*h - 1 - x = s - y = t - u = a - v = b - } else { - D = 2 * h - x = N - a - y = M - b - u = N - s - v = M - t - } - switch { - case D > 1 || (x != u && y != v): - return append(diffInternal(e[0:x], f[0:y], equals, i, 
j), diffInternal(e[u:N], f[v:M], equals, i+u, j+v)...) - case M > N: - return diffInternal(make([]T, 0), f[N:M], equals, i+N, j+N) - case M < N: - return diffInternal(e[M:N], make([]T, 0), equals, i+M, j+M) - default: - return make([]Op[T], 0) - } - } - } - } - } - case N > 0: - res := make([]Op[T], N) - for n := range N { - res[n] = Op[T]{OpDelete, i + n, -1, e[n]} - } - return res - default: - res := make([]Op[T], M) - for n := range M { - res[n] = Op[T]{OpInsert, i, j + n, f[n]} - } - return res - } - panic("Should never hit this!") -} - -/** - * The remainder op in python always matches the sign of the _denominator_ - * e.g -1%3 = 2. - * In golang it matches the sign of the numerator. - * See https://en.wikipedia.org/wiki/Modulo_operation#Variants_of_the_definition - */ -func pyMod(x, y int) int { - return (x%y + y) % y -} - -// Let us map element in same way as in - -// Convenient wrapper for string lists -func myersDiffStr(e, f []string, compareSpace bool) []Op[string] { - return myersDiff[string](e, f, func(s1, s2 string) bool { - return compareStrings(s1, s2, compareSpace) - }) -} diff --git a/internal/parser/diff/myers_test.go b/internal/parser/diff/myers_test.go deleted file mode 100644 index 6d331c4..0000000 --- a/internal/parser/diff/myers_test.go +++ /dev/null @@ -1,49 +0,0 @@ -package diff - -import ( - "reflect" - t "testing" -) - -type TestCase struct { - l1 []string - l2 []string - exp []Op[string] -} - -func TestDiff(t *t.T) { - A := "A" - B := "B" - C := "C" - testCases := []TestCase{ - {[]string{}, []string{}, []Op[string]{}}, - {[]string{}, []string{"foo"}, []Op[string]{{OpInsert, 0, 0, "foo"}}}, - {[]string{"foo"}, []string{}, []Op[string]{{OpDelete, 0, -1, "foo"}}}, - {[]string{"foo", "bar", "baz"}, []string{"foo", "bar", "baz"}, []Op[string]{}}, - {[]string{"foo", "bar", "baz"}, []string{"foo", "baz"}, []Op[string]{{OpDelete, 1, -1, "bar"}}}, - {[]string{"baz"}, []string{"foo", "baz"}, []Op[string]{{OpInsert, 0, 0, "foo"}}}, - 
{[]string{"bar", "baz"}, []string{"foo", "baz"}, []Op[string]{{OpDelete, 0, -1, "bar"}, {OpInsert, 1, 0, "foo"}}}, - {[]string{"foo", "bar", "baz"}, []string{"foo", "bar"}, []Op[string]{{OpDelete, 2, -1, "baz"}}}, - { - []string{A, B, C, A, B, B, A}, - []string{C, B, A, B, A, C}, - []Op[string]{{OpDelete, 0, -1, A}, {OpInsert, 1, 0, C}, {OpDelete, 2, -1, C}, {OpDelete, 5, -1, B}, {OpInsert, 7, 5, C}}, - }, - { - []string{C, A, B, A, B, A, B, A, B, A, B, A, B, C}, - []string{B, A, B, A, B, A, B, A, B, A, B, A, B, A}, - []Op[string]{{OpDelete, 0, -1, C}, {OpInsert, 1, 0, B}, {OpDelete, 13, -1, C}, {OpInsert, 14, 13, A}}, - }, - { - []string{B}, - []string{A, B, C, B, A}, - []Op[string]{{OpInsert, 0, 0, A}, {OpInsert, 0, 1, B}, {OpInsert, 0, 2, C}, {OpInsert, 1, 4, A}}, - }, - } - for _, c := range testCases { - act := myersDiffStr(c.l1, c.l2, true) - if !reflect.DeepEqual(c.exp, act) { - t.Errorf("Failed diff, expected %v actual %v\n", c.exp, act) - } - } -} diff --git a/internal/parser/diff/parser.go b/internal/parser/diff/parser.go index 70e958e..a2d945e 100644 --- a/internal/parser/diff/parser.go +++ b/internal/parser/diff/parser.go @@ -80,16 +80,8 @@ func (*Diff) Run(results []stage.ExecutorResult, confAny any) ( resultStr = resultStr[:output.MaxDiffLength] truncated = true } - answerLines := strings.Split(answerStr, "\n") - resultLines := strings.Split(resultStr, "\n") - // Generate Myers diff - diffOps := myersDiffStr(answerLines, resultLines, - output.CompareSpace) - // Generate diff block with surrounding context - diffOutput := formatDiff( - answerLines, - resultLines, - diffOps, + diffOutput := patienceDiff( + answerStr, resultStr, output.CompareSpace, ) diffOutput = strings.TrimSuffix(diffOutput, "\n ") if truncated { diff --git a/internal/parser/diff/patience.go b/internal/parser/diff/patience.go new file mode 100644 index 0000000..2aa5b1e --- /dev/null +++ b/internal/parser/diff/patience.go @@ -0,0 +1,239 @@ +// Copyright 2022 The Go Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package diff + +// modified from https://github.com/rogpeppe/go-internal/blob/master/diff/diff.go + +import ( + "bytes" + "sort" + "strings" +) + +// A pair is a pair of values tracked for both the x and y side of a diff. +// It is typically a pair of line indexes. +type pair struct{ x, y int } + +// patienceDiff returns an anchored diff of the two texts old and new +// as removed (“-”), added (“+”) and context lines, without file or hunk headers. If old and new are identical, +// patienceDiff returns an empty string (no output). +// +// Unix diff implementations typically look for a diff with +// the smallest number of lines inserted and removed, +// which can in the worst case take time quadratic in the +// number of lines in the texts. As a result, many implementations +// either can be made to run for a long time or cut off the search +// after a predetermined amount of work. +// +// In contrast, this implementation looks for a diff with the +// smallest number of “unique” lines inserted and removed, +// where unique means a line that appears just once in both old and new. +// We call this an “anchored diff” because the unique lines anchor +// the chosen matching regions. An anchored diff is usually clearer +// than a standard diff, because the algorithm does not try to +// reuse unrelated blank lines or closing braces. +// The algorithm also guarantees to run in O(n log n) time +// instead of the standard O(n²) time. +// +// Some systems call this approach a “patience diff,” named for +// the “patience sorting” algorithm, itself named for a solitaire card game. +// We avoid that name for two reasons. First, the name has been used +// for a few different variants of the algorithm, so it is imprecise. 
+// Second, the name is frequently interpreted as meaning that you have +// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm, +// when in fact the algorithm is faster than the standard one. +func patienceDiff(old, new string, compareSpace bool) string { + x := strings.SplitAfter(old, "\n") + y := strings.SplitAfter(new, "\n") + + // Print diff header. + var out bytes.Buffer + + // Loop over matches to consider, + // expanding each match to include surrounding lines, + // and then printing diff chunks. + // To avoid setup/teardown cases outside the loop, + // tgs returns a leading {0,0} and trailing {len(x), len(y)} pair + // in the sequence of matches. + var ( + done pair // printed up to x[:done.x] and y[:done.y] + chunk pair // start lines of current chunk + count pair // number of lines from each side in current chunk + ctext []string // lines for current chunk + ) + for _, m := range tgs(x, y) { + if m.x < done.x { + // Already handled scanning forward from earlier match. + continue + } + + // Expand matching lines as far as possible, + // establishing that x[start.x:end.x] == y[start.y:end.y]. + // Note that on the first (or last) iteration we may (or definitely do) + // have an empty match: start.x==end.x and start.y==end.y. + start := m + for start.x > done.x && start.y > done.y && compareStrings(x[start.x-1], y[start.y-1], compareSpace) { + start.x-- + start.y-- + } + end := m + for end.x < len(x) && end.y < len(y) && compareStrings(x[end.x], y[end.y], compareSpace) { + end.x++ + end.y++ + } + + // Emit the mismatched lines before start into this chunk. + // (No effect on first sentinel iteration, when start = {0,0}.) + for _, s := range x[done.x:start.x] { + ctext = append(ctext, "- "+s) + count.x++ + } + for _, s := range y[done.y:start.y] { + ctext = append(ctext, "+ "+s) + count.y++ + } + + // If we're not at EOF and have too few common lines, + // the chunk includes all the common lines and continues. 
+ const C = 3 // number of context lines + if (end.x < len(x) || end.y < len(y)) && + (end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) { + for _, s := range x[start.x:end.x] { + ctext = append(ctext, " "+s) + count.x++ + count.y++ + } + done = end + continue + } + + // End chunk with common lines for context. + if len(ctext) > 0 { + n := min(end.x-start.x, C) + for _, s := range x[start.x : start.x+n] { + ctext = append(ctext, " "+s) + count.x++ + count.y++ + } + done = pair{start.x + n, start.y + n} + + // Format and emit chunk. + // Convert line numbers to 1-indexed. + // Special case: empty file shows up as 0,0 not 1,0. + if count.x > 0 { + chunk.x++ + } + if count.y > 0 { + chunk.y++ + } + // We do not need this line + // fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y) + for _, s := range ctext { + out.WriteString(s) + } + count.x = 0 + count.y = 0 + ctext = ctext[:0] + } + + // If we reached EOF, we're done. + if end.x >= len(x) && end.y >= len(y) { + break + } + + // Otherwise start a new chunk. + chunk = pair{end.x - C, end.y - C} + for _, s := range x[chunk.x:end.x] { + ctext = append(ctext, " "+s) + count.x++ + count.y++ + } + done = end + } + + return out.String() +} + +// tgs returns the pairs of indexes of the longest common subsequence +// of unique lines in x and y, where a unique line is one that appears +// once in x and once in y. +// +// The longest common subsequence algorithm is as described in +// Thomas G. Szymanski, “A Special Case of the Maximal Common +// Subsequence Problem,” Princeton TR #170 (January 1975), +// available at https://research.swtch.com/tgs170.pdf. +func tgs(x, y []string) []pair { + // Count the number of times each string appears in a and b. + // We only care about 0, 1, many, counted as 0, -1, -2 + // for the x side and 0, -4, -8 for the y side. + // Using negative numbers now lets us distinguish positive line numbers later. 
+ m := make(map[string]int) + for _, s := range x { + if c := m[s]; c > -2 { + m[s] = c - 1 + } + } + for _, s := range y { + if c := m[s]; c > -8 { + m[s] = c - 4 + } + } + + // Now unique strings can be identified by m[s] = -1+-4. + // + // Gather the indexes of those strings in x and y, building: + // xi[i] = increasing indexes of unique strings in x. + // yi[i] = increasing indexes of unique strings in y. + // inv[i] = index j such that x[xi[i]] = y[yi[j]]. + var xi, yi, inv []int + for i, s := range y { + if m[s] == -1+-4 { + m[s] = len(yi) + yi = append(yi, i) + } + } + for i, s := range x { + if j, ok := m[s]; ok && j >= 0 { + xi = append(xi, i) + inv = append(inv, j) + } + } + + // Apply Algorithm A from Szymanski's paper. + // In those terms, A = J = inv and B = [0, n). + // We add sentinel pairs {0,0}, and {len(x),len(y)} + // to the returned sequence, to help the processing loop. + J := inv + n := len(xi) + T := make([]int, n) + L := make([]int, n) + for i := range T { + T[i] = n + 1 + } + for i := range n { + k := sort.Search(n, func(k int) bool { + return T[k] >= J[i] + }) + T[k] = J[i] + L[i] = k + 1 + } + k := 0 + for _, v := range L { + if k < v { + k = v + } + } + seq := make([]pair, 2+k) + seq[1+k] = pair{len(x), len(y)} // sentinel at end + lastj := n + for i := n - 1; i >= 0; i-- { + if L[i] == k && J[i] < lastj { + seq[k] = pair{xi[i], yi[J[i]]} + k-- + } + } + seq[0] = pair{0, 0} // sentinel at start + return seq +}