From ac75b19801c1b619ba7790dcc7569a84db9ca630 Mon Sep 17 00:00:00 2001 From: Boming Zhang Date: Fri, 28 Mar 2025 07:36:08 -0400 Subject: [PATCH] feat(parser/diff): patience diff from peter-evans/patience --- go.mod | 1 + go.sum | 2 + internal/parser/diff/diff.go | 30 ++++ internal/parser/diff/parser.go | 8 +- internal/parser/diff/patience.go | 245 ------------------------------- 5 files changed, 38 insertions(+), 248 deletions(-) delete mode 100644 internal/parser/diff/patience.go diff --git a/go.mod b/go.mod index 13661a8..fdd7b2f 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/koding/multiconfig v0.0.0-20171124222453-69c27309b2d7 github.com/mcuadros/go-defaults v1.2.0 github.com/mitchellh/mapstructure v1.5.0 + github.com/peter-evans/patience v0.3.0 google.golang.org/grpc v1.71.0 google.golang.org/protobuf v1.36.5 ) diff --git a/go.sum b/go.sum index b254936..75ce526 100644 --- a/go.sum +++ b/go.sum @@ -76,6 +76,8 @@ github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RR github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k= github.com/onsi/gomega v1.34.1/go.mod h1:kU1QgUvBDLXBJq618Xvm2LUX6rSAfRaFRTcdOeDLwwY= +github.com/peter-evans/patience v0.3.0 h1:rX0JdJeepqdQl1Sk9c9uvorjYYzL2TfgLX1adqYm9cA= +github.com/peter-evans/patience v0.3.0/go.mod h1:Kmxu5sY1NmBLFSStvXjX1wS9mIv7wMcP/ubucyMOAu0= github.com/pjbgf/sha1cd v0.3.2 h1:a9wb0bp1oC2TGwStyn0Umc/IGKQnEgF0vVaZ8QF8eo4= github.com/pjbgf/sha1cd v0.3.2/go.mod h1:zQWigSxVmsHEZow5qaLtPYxpcKMMQpa09ixqBxuCS6A= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= diff --git a/internal/parser/diff/diff.go b/internal/parser/diff/diff.go index 25f0695..8c94432 100644 --- a/internal/parser/diff/diff.go +++ b/internal/parser/diff/diff.go @@ -1,5 +1,12 @@ package diff +import ( + "fmt" + "strings" + + "github.com/peter-evans/patience" +) + // compareStrings compares two strings character by character, optionally ignoring whitespace. func compareStrings(str1, str2 string, compareSpace bool) bool { if compareSpace { @@ -44,3 +51,26 @@ func isWhitespace(b byte) bool { b == 0x85 || b == 0xA0 } + +// typeSymbol returns the associated symbol of a DiffType. +func typeSymbol(t patience.DiffType) string { + switch t { + case patience.Equal: + return " " + case patience.Insert: + return "+ " + case patience.Delete: + return "- " + default: + panic("unknown DiffType") + } +} + +// DiffText returns the source and destination texts (all equalities, insertions and deletions). +func DiffText(diffs []patience.DiffLine) string { + s := make([]string, len(diffs)) + for i, l := range diffs { + s[i] = fmt.Sprintf("%s%s", typeSymbol(l.Type), l.Text) + } + return strings.Join(s, "\n") +} diff --git a/internal/parser/diff/parser.go b/internal/parser/diff/parser.go index a2d945e..8a89e65 100644 --- a/internal/parser/diff/parser.go +++ b/internal/parser/diff/parser.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/joint-online-judge/JOJ3/internal/stage" + "github.com/peter-evans/patience" ) func (*Diff) Run(results []stage.ExecutorResult, confAny any) ( @@ -80,9 +81,10 @@ func (*Diff) Run(results []stage.ExecutorResult, confAny any) ( resultStr = resultStr[:output.MaxDiffLength] truncated = true } - diffOutput := patienceDiff( - answerStr, resultStr, output.CompareSpace, - ) + answerLines := strings.Split(answerStr, "\n") + resultLines := strings.Split(resultStr, "\n") + diffs := patience.Diff(answerLines, resultLines) + diffOutput := DiffText(diffs) diffOutput = strings.TrimSuffix(diffOutput, "\n ") if truncated { diffOutput += "\n\n(truncated)" diff --git a/internal/parser/diff/patience.go b/internal/parser/diff/patience.go deleted file mode 100644 index f6e56e6..0000000 --- a/internal/parser/diff/patience.go +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package diff - -// modified from https://github.com/rogpeppe/go-internal/blob/master/diff/diff.go - -import ( - "bytes" - "sort" - "strings" -) - -// A pair is a pair of values tracked for both the x and y side of a diff. -// It is typically a pair of line indexes. -type pair struct{ x, y int } - -// Diff returns an anchored diff of the two texts old and new -// in the “unified diff” format. If old and new are identical, -// Diff returns a nil slice (no output). -// -// Unix diff implementations typically look for a diff with -// the smallest number of lines inserted and removed, -// which can in the worst case take time quadratic in the -// number of lines in the texts. As a result, many implementations -// either can be made to run for a long time or cut off the search -// after a predetermined amount of work. -// -// In contrast, this implementation looks for a diff with the -// smallest number of “unique” lines inserted and removed, -// where unique means a line that appears just once in both old and new. -// We call this an “anchored diff” because the unique lines anchor -// the chosen matching regions. An anchored diff is usually clearer -// than a standard diff, because the algorithm does not try to -// reuse unrelated blank lines or closing braces. -// The algorithm also guarantees to run in O(n log n) time -// instead of the standard O(n²) time. -// -// Some systems call this approach a “patience diff,” named for -// the “patience sorting” algorithm, itself named for a solitaire card game. -// We avoid that name for two reasons. First, the name has been used -// for a few different variants of the algorithm, so it is imprecise. -// Second, the name is frequently interpreted as meaning that you have -// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm, -// when in fact the algorithm is faster than the standard one. -func patienceDiff(old, new string, compareSpace bool) string { - if len(old) != 0 && old[len(old)-1] != '\n' { - old += "\n" - } - if len(new) != 0 && new[len(new)-1] != '\n' { - new += "\n" - } - x := strings.SplitAfter(old, "\n") - y := strings.SplitAfter(new, "\n") - - // Print diff header. - var out bytes.Buffer - - // Loop over matches to consider, - // expanding each match to include surrounding lines, - // and then printing diff chunks. - // To avoid setup/teardown cases outside the loop, - // tgs returns a leading {0,0} and trailing {len(x), len(y)} pair - // in the sequence of matches. - var ( - done pair // printed up to x[:done.x] and y[:done.y] - chunk pair // start lines of current chunk - count pair // number of lines from each side in current chunk - ctext []string // lines for current chunk - ) - for _, m := range tgs(x, y) { - if m.x < done.x { - // Already handled scanning forward from earlier match. - continue - } - - // Expand matching lines as far possible, - // establishing that x[start.x:end.x] == y[start.y:end.y]. - // Note that on the first (or last) iteration we may (or definitely do) - // have an empty match: start.x==end.x and start.y==end.y. - start := m - for start.x > done.x && start.y > done.y && compareStrings(x[start.x-1], y[start.y-1], compareSpace) { - start.x-- - start.y-- - } - end := m - for end.x < len(x) && end.y < len(y) && compareStrings(x[end.x], y[end.y], compareSpace) { - end.x++ - end.y++ - } - - // Emit the mismatched lines before start into this chunk. - // (No effect on first sentinel iteration, when start = {0,0}.) - for _, s := range x[done.x:start.x] { - ctext = append(ctext, "- "+s) - count.x++ - } - for _, s := range y[done.y:start.y] { - ctext = append(ctext, "+ "+s) - count.y++ - } - - // If we're not at EOF and have too few common lines, - // the chunk includes all the common lines and continues. - const C = 3 // number of context lines - if (end.x < len(x) || end.y < len(y)) && - (end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) { - for _, s := range x[start.x:end.x] { - ctext = append(ctext, " "+s) - count.x++ - count.y++ - } - done = end - continue - } - - // End chunk with common lines for context. - if len(ctext) > 0 { - n := min(end.x-start.x, C) - for _, s := range x[start.x : start.x+n] { - ctext = append(ctext, " "+s) - count.x++ - count.y++ - } - done = pair{start.x + n, start.y + n} - - // Format and emit chunk. - // Convert line numbers to 1-indexed. - // Special case: empty file shows up as 0,0 not 1,0. - if count.x > 0 { - chunk.x++ - } - if count.y > 0 { - chunk.y++ - } - // We do not need this line - // fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y) - for _, s := range ctext { - out.WriteString(s) - } - count.x = 0 - count.y = 0 - ctext = ctext[:0] - } - - // If we reached EOF, we're done. - if end.x >= len(x) && end.y >= len(y) { - break - } - - // Otherwise start a new chunk. - chunk = pair{end.x - C, end.y - C} - for _, s := range x[chunk.x:end.x] { - ctext = append(ctext, " "+s) - count.x++ - count.y++ - } - done = end - } - - return out.String() -} - -// tgs returns the pairs of indexes of the longest common subsequence -// of unique lines in x and y, where a unique line is one that appears -// once in x and once in y. -// -// The longest common subsequence algorithm is as described in -// Thomas G. Szymanski, “A Special Case of the Maximal Common -// Subsequence Problem,” Princeton TR #170 (January 1975), -// available at https://research.swtch.com/tgs170.pdf. -func tgs(x, y []string) []pair { - // Count the number of times each string appears in a and b. - // We only care about 0, 1, many, counted as 0, -1, -2 - // for the x side and 0, -4, -8 for the y side. - // Using negative numbers now lets us distinguish positive line numbers later. - m := make(map[string]int) - for _, s := range x { - if c := m[s]; c > -2 { - m[s] = c - 1 - } - } - for _, s := range y { - if c := m[s]; c > -8 { - m[s] = c - 4 - } - } - - // Now unique strings can be identified by m[s] = -1+-4. - // - // Gather the indexes of those strings in x and y, building: - // xi[i] = increasing indexes of unique strings in x. - // yi[i] = increasing indexes of unique strings in y. - // inv[i] = index j such that x[xi[i]] = y[yi[j]]. - var xi, yi, inv []int - for i, s := range y { - if m[s] == -1+-4 { - m[s] = len(yi) - yi = append(yi, i) - } - } - for i, s := range x { - if j, ok := m[s]; ok && j >= 0 { - xi = append(xi, i) - inv = append(inv, j) - } - } - - // Apply Algorithm A from Szymanski's paper. - // In those terms, A = J = inv and B = [0, n). - // We add sentinel pairs {0,0}, and {len(x),len(y)} - // to the returned sequence, to help the processing loop. - J := inv - n := len(xi) - T := make([]int, n) - L := make([]int, n) - for i := range T { - T[i] = n + 1 - } - for i := range n { - k := sort.Search(n, func(k int) bool { - return T[k] >= J[i] - }) - T[k] = J[i] - L[i] = k + 1 - } - k := 0 - for _, v := range L { - if k < v { - k = v - } - } - seq := make([]pair, 2+k) - seq[1+k] = pair{len(x), len(y)} // sentinel at end - lastj := n - for i := n - 1; i >= 0; i-- { - if L[i] == k && J[i] < lastj { - seq[k] = pair{xi[i], yi[J[i]]} - k-- - } - } - seq[0] = pair{0, 0} // sentinel at start - return seq -}