This commit adds support for whitelisted characters during repo healthcheck/non-ascii file check. Supported by an extra switch to `repo-health-checker`, `-whitelistedChars`. The argument takes a comma-separated list of non-ASCII characters and ignores them during repo healthcheck. Illegal cmdline input is logged by the logger. Co-Authored-By: GitHub Copilot <noreply@microsoft.com> <details> <summary>Copilot Prompt</summary> <br> This is a repo for an online judge orchestrator system «JOJ3». Under `cmd/` lies a source directory for a Go command, `repo-health-checker`. You can tell from its name that it checks the repo for stuff like repo size, commit message, non-ASCII character usage, etc. before sending the work to the actual judging and grading system. Now, I want the non-ASCII character checking function of the repo health checker to be flexible - it shall accept a list of non-ASCII characters and deem them acceptable. ## Your task - Accept this new cmdline arg. In `cmd/repo-health-checker/main.go`, accept a new command line flag `-whitelisted-chars`, which shall take exactly one string of comma-separated non-ASCII characters. This string shall be passed to the actual healthcheck package. - Respect this list while scanning the files. In `pkg/healthcheck/nonascii.go`, function `getNonASCII()`, we utilize a bufio *Scanner* to scan through all files for non-ASCII characters. We would like the list of acceptable chars to be passed from the cmdline to here, and modify the scanner logic to actually accept the corresponding characters. - Error handling and reporting. This command line arg, `-whitelisted-chars`, could be completely absent; in which case, no characters shall be escaped by default. The comma-separated list passed to the command may contain ASCII characters or multiple characters that are not properly separated; in which case, ignore that element, and report the incident via the SLog logging framework used in this project. - Test your work. 
Create new testcases under `examples/healthcheck/` to reflect this change. Refer to `examples/healthcheck/asciifile/` to learn how to configure the repo health checker. Integrate your work into the Go test framework such that it could be invoked by running `make test` at the terminal. - Note: Use `git init` to init your testcase directory and make an initial commit - this project, JOJ3, only runs in Git repos. ## Notes - Directory structure. `cmd/` for invokable commands, `pkg/` for the actual logic, `internal` - something you don't need to worry about. - JOJ3 vs. Health Check. `joj3` is a separate executable; in this session we are only working on the `repo-health-checker`. - Extras. Make sure to read `README.md` and the directory structure before you go; also, create a to-do list before you execute your plan. </details> Reviewed-on: #100 Reviewed-by: 张泊明518370910136 <bomingzh@sjtu.edu.cn> Co-authored-by: Mack Wang <mac-wang@sjtu.edu.cn> Co-committed-by: Mack Wang <mac-wang@sjtu.edu.cn>
188 lines
4.7 KiB
Go
188 lines
4.7 KiB
Go
package healthcheck
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
"github.com/go-git/go-git/v5"
|
|
"github.com/go-git/go-git/v5/plumbing/format/gitattributes"
|
|
)
|
|
|
|
// parseWhitelistedChars converts the comma-separated list of allowed
// characters received from the command line into a set of runes.
//
// Each element is stripped of surrounding ASCII whitespace and must then be
// exactly one valid non-ASCII character. Elements that are empty, hold more
// than one character, are not valid UTF-8, or are plain ASCII are skipped and
// reported through slog.Warn. The returned map is never nil; it is empty when
// csv is blank or contains no valid element.
func parseWhitelistedChars(csv string) map[rune]struct{} {
	// Only ASCII whitespace is treated as padding. strings.TrimSpace would
	// also strip non-ASCII spaces such as U+00A0 or U+3000, which made it
	// impossible to whitelist those characters.
	const padding = " \t\r\n\v\f"

	whitelist := make(map[rune]struct{})
	if strings.Trim(csv, padding) == "" {
		return whitelist
	}

	for _, raw := range strings.Split(csv, ",") {
		elem := strings.Trim(raw, padding)
		if elem == "" {
			slog.Warn("ignoring invalid whitelisted-chars element", "element", raw, "reason", "empty element")
			continue
		}

		if utf8.RuneCountInString(elem) != 1 {
			slog.Warn("ignoring invalid whitelisted-chars element", "element", elem, "reason", "element must be exactly one character")
			continue
		}

		ch, _ := utf8.DecodeRuneInString(elem)
		// An undecodable byte yields utf8.RuneError. A literal U+FFFD is
		// rejected here as well because it compares equal to that value.
		if ch == utf8.RuneError {
			slog.Warn("ignoring invalid whitelisted-chars element", "element", elem, "reason", "invalid utf-8 rune")
			continue
		}
		if ch <= unicode.MaxASCII {
			slog.Warn("ignoring invalid whitelisted-chars element", "element", elem, "reason", "ASCII characters are not allowed")
			continue
		}

		whitelist[ch] = struct{}{}
	}

	return whitelist
}
|
|
|
|
// getSubmodulePathsFromGoGit uses the go-git library to open the repository
|
|
// at the given root path and retrieve a list of all submodule paths.
|
|
// It returns a set of submodule paths for efficient lookup.
|
|
func getSubmodulePathsFromGoGit(root string) (map[string]struct{}, error) {
|
|
submodulePaths := make(map[string]struct{})
|
|
|
|
// Open the git repository at the given path.
|
|
repo, err := git.PlainOpen(root)
|
|
if err != nil {
|
|
if err == git.ErrRepositoryNotExists {
|
|
return submodulePaths, nil
|
|
}
|
|
return nil, fmt.Errorf("error opening git repository: %w", err)
|
|
}
|
|
|
|
worktree, err := repo.Worktree()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error getting worktree: %w", err)
|
|
}
|
|
|
|
// Get the list of submodules.
|
|
submodules, err := worktree.Submodules()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error getting submodules: %w", err)
|
|
}
|
|
|
|
for _, sm := range submodules {
|
|
submodulePaths[filepath.ToSlash(sm.Config().Path)] = struct{}{}
|
|
}
|
|
|
|
return submodulePaths, nil
|
|
}
|
|
|
|
// getNonASCII retrieves a list of files in the specified root directory that contain non-ASCII characters.
|
|
// It searches for non-ASCII characters in each file's content and returns a list of paths to files containing non-ASCII characters.
|
|
func getNonASCII(root string, whitelist map[rune]struct{}) ([]string, error) {
|
|
var nonASCII []string
|
|
gitattrExist := true
|
|
var matcher gitattributes.Matcher
|
|
_, err := os.Stat(".gitattributes")
|
|
if os.IsNotExist(err) {
|
|
gitattrExist = false
|
|
}
|
|
|
|
submodules, err := getSubmodulePathsFromGoGit(root)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if gitattrExist {
|
|
fs := os.DirFS(".")
|
|
f, err := fs.Open(".gitattributes")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
attribute, err := gitattributes.ReadAttributes(f, nil, true)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
matcher = gitattributes.NewMatcher(attribute)
|
|
}
|
|
|
|
err = filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
relPath, err := filepath.Rel(root, path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if info.IsDir() {
|
|
if info.Name() == ".git" {
|
|
return filepath.SkipDir
|
|
}
|
|
if _, isSubmodule := submodules[relPath]; isSubmodule {
|
|
return filepath.SkipDir
|
|
}
|
|
return nil
|
|
}
|
|
|
|
if gitattrExist {
|
|
ret, matched := matcher.Match(strings.Split(relPath, "/"), nil)
|
|
if matched && ret["text"].IsUnset() && !ret["text"].IsSet() {
|
|
return nil
|
|
}
|
|
}
|
|
|
|
file, err := os.Open(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer file.Close()
|
|
|
|
scanner := bufio.NewScanner(file)
|
|
for scanner.Scan() {
|
|
cont := true
|
|
for _, c := range scanner.Text() {
|
|
if _, ok := whitelist[c]; ok {
|
|
continue
|
|
}
|
|
if c > unicode.MaxASCII {
|
|
nonASCII = append(nonASCII, "\t"+path)
|
|
cont = false
|
|
break
|
|
}
|
|
}
|
|
if !cont {
|
|
break
|
|
}
|
|
}
|
|
|
|
return nil
|
|
})
|
|
|
|
return nonASCII, err
|
|
}
|
|
|
|
// NonASCIIFiles checks for non-ASCII characters in files within the specified root directory.
|
|
// It prints a message with the paths to files containing non-ASCII characters, if any.
|
|
// Additionally it accept a list of whitelisted characters that are allowed, repo-wide.
|
|
func NonASCIIFiles(root, whitelistedChars string) error {
|
|
whitelist := parseWhitelistedChars(whitelistedChars)
|
|
nonASCII, err := getNonASCII(root, whitelist)
|
|
if err != nil {
|
|
slog.Error("getting non-ascii", "err", err)
|
|
return fmt.Errorf("error getting non-ascii: %w", err)
|
|
}
|
|
if len(nonASCII) > 0 {
|
|
return fmt.Errorf("Non-ASCII characters found in the following files:\n%s",
|
|
strings.Join(nonASCII, "\n"))
|
|
}
|
|
return nil
|
|
}
|