feat(parser/diff): patience diff from peter-evans/patience

2025-03-28 07:36:08 -04:00 · 2025-03-28 07:36:08 -04:00 · ac75b19801
commit ac75b19801
parent d56fc12e3a
5 changed files with 38 additions and 248 deletions
--- a/go.mod
+++ b/go.mod
@ -12,6 +12,7 @@ require (
 	github.com/koding/multiconfig v0.0.0-20171124222453-69c27309b2d7
 	github.com/mcuadros/go-defaults v1.2.0
 	github.com/mitchellh/mapstructure v1.5.0
+	github.com/peter-evans/patience v0.3.0
 	google.golang.org/grpc v1.71.0
 	google.golang.org/protobuf v1.36.5
 )
--- a/go.sum
+++ b/go.sum
@ -76,6 +76,8 @@ github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RR
 github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
 github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k=
 github.com/onsi/gomega v1.34.1/go.mod h1:kU1QgUvBDLXBJq618Xvm2LUX6rSAfRaFRTcdOeDLwwY=
+github.com/peter-evans/patience v0.3.0 h1:rX0JdJeepqdQl1Sk9c9uvorjYYzL2TfgLX1adqYm9cA=
+github.com/peter-evans/patience v0.3.0/go.mod h1:Kmxu5sY1NmBLFSStvXjX1wS9mIv7wMcP/ubucyMOAu0=
 github.com/pjbgf/sha1cd v0.3.2 h1:a9wb0bp1oC2TGwStyn0Umc/IGKQnEgF0vVaZ8QF8eo4=
 github.com/pjbgf/sha1cd v0.3.2/go.mod h1:zQWigSxVmsHEZow5qaLtPYxpcKMMQpa09ixqBxuCS6A=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
--- a/internal/parser/diff/diff.go
+++ b/internal/parser/diff/diff.go
@ -1,5 +1,12 @@
 package diff

+import (
+	"fmt"
+	"strings"
+
+	"github.com/peter-evans/patience"
+)
+
 // compareStrings compares two strings character by character, optionally ignoring whitespace.
 func compareStrings(str1, str2 string, compareSpace bool) bool {
 	if compareSpace {
@ -44,3 +51,26 @@ func isWhitespace(b byte) bool {
 		b == 0x85 ||
 		b == 0xA0
 }
+
+// typeSymbol maps a patience.DiffType to its two-character line prefix; unknown types panic.
+func typeSymbol(t patience.DiffType) string {
+	if t == patience.Equal {
+		return "  "
+	}
+	if t == patience.Insert {
+		return "+ "
+	}
+	if t == patience.Delete {
+		return "- "
+	}
+	panic("unknown DiffType")
+}
+
+// DiffText renders diffs as text: each entry is its type symbol followed by its text, joined by "\n".
+func DiffText(diffs []patience.DiffLine) string {
+	rendered := make([]string, 0, len(diffs))
+	for _, line := range diffs {
+		rendered = append(rendered, fmt.Sprintf("%s%s", typeSymbol(line.Type), line.Text))
+	}
+	return strings.Join(rendered, "\n")
+}
--- a/internal/parser/diff/parser.go
+++ b/internal/parser/diff/parser.go
@ -7,6 +7,7 @@ import (
 	"strings"

 	"github.com/joint-online-judge/JOJ3/internal/stage"
+	"github.com/peter-evans/patience"
 )

 func (*Diff) Run(results []stage.ExecutorResult, confAny any) (
@ -80,9 +81,10 @@ func (*Diff) Run(results []stage.ExecutorResult, confAny any) (
 							resultStr = resultStr[:output.MaxDiffLength]
 							truncated = true
 						}
-						diffOutput := patienceDiff(
-							answerStr, resultStr, output.CompareSpace,
-						)
+						answerLines := strings.Split(answerStr, "\n")
+						resultLines := strings.Split(resultStr, "\n")
+						diffs := patience.Diff(answerLines, resultLines)
+						diffOutput := DiffText(diffs)
 						diffOutput = strings.TrimSuffix(diffOutput, "\n  ")
 						if truncated {
 							diffOutput += "\n\n(truncated)"
--- a/internal/parser/diff/patience.go
+++ b/internal/parser/diff/patience.go
@ -1,245 +0,0 @@
-// Copyright 2022 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package diff
-
-// modified from https://github.com/rogpeppe/go-internal/blob/master/diff/diff.go
-
-import (
-	"bytes"
-	"sort"
-	"strings"
-)
-
-// A pair is a pair of values tracked for both the x and y side of a diff.
-// It is typically a pair of line indexes.
-type pair struct{ x, y int }
-
-// Diff returns an anchored diff of the two texts old and new
-// in the “unified diff” format. If old and new are identical,
-// Diff returns a nil slice (no output).
-//
-// Unix diff implementations typically look for a diff with
-// the smallest number of lines inserted and removed,
-// which can in the worst case take time quadratic in the
-// number of lines in the texts. As a result, many implementations
-// either can be made to run for a long time or cut off the search
-// after a predetermined amount of work.
-//
-// In contrast, this implementation looks for a diff with the
-// smallest number of “unique” lines inserted and removed,
-// where unique means a line that appears just once in both old and new.
-// We call this an “anchored diff” because the unique lines anchor
-// the chosen matching regions. An anchored diff is usually clearer
-// than a standard diff, because the algorithm does not try to
-// reuse unrelated blank lines or closing braces.
-// The algorithm also guarantees to run in O(n log n) time
-// instead of the standard O(n²) time.
-//
-// Some systems call this approach a “patience diff,” named for
-// the “patience sorting” algorithm, itself named for a solitaire card game.
-// We avoid that name for two reasons. First, the name has been used
-// for a few different variants of the algorithm, so it is imprecise.
-// Second, the name is frequently interpreted as meaning that you have
-// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
-// when in fact the algorithm is faster than the standard one.
-func patienceDiff(old, new string, compareSpace bool) string {
-	if len(old) != 0 && old[len(old)-1] != '\n' {
-		old += "\n"
-	}
-	if len(new) != 0 && new[len(new)-1] != '\n' {
-		new += "\n"
-	}
-	x := strings.SplitAfter(old, "\n")
-	y := strings.SplitAfter(new, "\n")
-
-	// Print diff header.
-	var out bytes.Buffer
-
-	// Loop over matches to consider,
-	// expanding each match to include surrounding lines,
-	// and then printing diff chunks.
-	// To avoid setup/teardown cases outside the loop,
-	// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
-	// in the sequence of matches.
-	var (
-		done  pair     // printed up to x[:done.x] and y[:done.y]
-		chunk pair     // start lines of current chunk
-		count pair     // number of lines from each side in current chunk
-		ctext []string // lines for current chunk
-	)
-	for _, m := range tgs(x, y) {
-		if m.x < done.x {
-			// Already handled scanning forward from earlier match.
-			continue
-		}
-
-		// Expand matching lines as far possible,
-		// establishing that x[start.x:end.x] == y[start.y:end.y].
-		// Note that on the first (or last) iteration we may (or definitely do)
-		// have an empty match: start.x==end.x and start.y==end.y.
-		start := m
-		for start.x > done.x && start.y > done.y && compareStrings(x[start.x-1], y[start.y-1], compareSpace) {
-			start.x--
-			start.y--
-		}
-		end := m
-		for end.x < len(x) && end.y < len(y) && compareStrings(x[end.x], y[end.y], compareSpace) {
-			end.x++
-			end.y++
-		}
-
-		// Emit the mismatched lines before start into this chunk.
-		// (No effect on first sentinel iteration, when start = {0,0}.)
-		for _, s := range x[done.x:start.x] {
-			ctext = append(ctext, "- "+s)
-			count.x++
-		}
-		for _, s := range y[done.y:start.y] {
-			ctext = append(ctext, "+ "+s)
-			count.y++
-		}
-
-		// If we're not at EOF and have too few common lines,
-		// the chunk includes all the common lines and continues.
-		const C = 3 // number of context lines
-		if (end.x < len(x) || end.y < len(y)) &&
-			(end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
-			for _, s := range x[start.x:end.x] {
-				ctext = append(ctext, "  "+s)
-				count.x++
-				count.y++
-			}
-			done = end
-			continue
-		}
-
-		// End chunk with common lines for context.
-		if len(ctext) > 0 {
-			n := min(end.x-start.x, C)
-			for _, s := range x[start.x : start.x+n] {
-				ctext = append(ctext, "  "+s)
-				count.x++
-				count.y++
-			}
-			done = pair{start.x + n, start.y + n}
-
-			// Format and emit chunk.
-			// Convert line numbers to 1-indexed.
-			// Special case: empty file shows up as 0,0 not 1,0.
-			if count.x > 0 {
-				chunk.x++
-			}
-			if count.y > 0 {
-				chunk.y++
-			}
-			// We do not need this line
-			// fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
-			for _, s := range ctext {
-				out.WriteString(s)
-			}
-			count.x = 0
-			count.y = 0
-			ctext = ctext[:0]
-		}
-
-		// If we reached EOF, we're done.
-		if end.x >= len(x) && end.y >= len(y) {
-			break
-		}
-
-		// Otherwise start a new chunk.
-		chunk = pair{end.x - C, end.y - C}
-		for _, s := range x[chunk.x:end.x] {
-			ctext = append(ctext, "  "+s)
-			count.x++
-			count.y++
-		}
-		done = end
-	}
-
-	return out.String()
-}
-
-// tgs returns the pairs of indexes of the longest common subsequence
-// of unique lines in x and y, where a unique line is one that appears
-// once in x and once in y.
-//
-// The longest common subsequence algorithm is as described in
-// Thomas G. Szymanski, “A Special Case of the Maximal Common
-// Subsequence Problem,” Princeton TR #170 (January 1975),
-// available at https://research.swtch.com/tgs170.pdf.
-func tgs(x, y []string) []pair {
-	// Count the number of times each string appears in a and b.
-	// We only care about 0, 1, many, counted as 0, -1, -2
-	// for the x side and 0, -4, -8 for the y side.
-	// Using negative numbers now lets us distinguish positive line numbers later.
-	m := make(map[string]int)
-	for _, s := range x {
-		if c := m[s]; c > -2 {
-			m[s] = c - 1
-		}
-	}
-	for _, s := range y {
-		if c := m[s]; c > -8 {
-			m[s] = c - 4
-		}
-	}
-
-	// Now unique strings can be identified by m[s] = -1+-4.
-	//
-	// Gather the indexes of those strings in x and y, building:
-	//	xi[i] = increasing indexes of unique strings in x.
-	//	yi[i] = increasing indexes of unique strings in y.
-	//	inv[i] = index j such that x[xi[i]] = y[yi[j]].
-	var xi, yi, inv []int
-	for i, s := range y {
-		if m[s] == -1+-4 {
-			m[s] = len(yi)
-			yi = append(yi, i)
-		}
-	}
-	for i, s := range x {
-		if j, ok := m[s]; ok && j >= 0 {
-			xi = append(xi, i)
-			inv = append(inv, j)
-		}
-	}
-
-	// Apply Algorithm A from Szymanski's paper.
-	// In those terms, A = J = inv and B = [0, n).
-	// We add sentinel pairs {0,0}, and {len(x),len(y)}
-	// to the returned sequence, to help the processing loop.
-	J := inv
-	n := len(xi)
-	T := make([]int, n)
-	L := make([]int, n)
-	for i := range T {
-		T[i] = n + 1
-	}
-	for i := range n {
-		k := sort.Search(n, func(k int) bool {
-			return T[k] >= J[i]
-		})
-		T[k] = J[i]
-		L[i] = k + 1
-	}
-	k := 0
-	for _, v := range L {
-		if k < v {
-			k = v
-		}
-	}
-	seq := make([]pair, 2+k)
-	seq[1+k] = pair{len(x), len(y)} // sentinel at end
-	lastj := n
-	for i := n - 1; i >= 0; i-- {
-		if L[i] == k && J[i] < lastj {
-			seq[k] = pair{xi[i], yi[J[i]]}
-			k--
-		}
-	}
-	seq[0] = pair{0, 0} // sentinel at start
-	return seq
-}