Introduce a new .slinkignore file format to allow users to specify paths and URLs to ignore during scanning. Update the CollectURLs and CollectURLsProgress functions to respect these ignore rules. Add tests to verify the functionality of the .slinkignore file, ensuring that specified paths and URLs are excluded from results. Update README.md to document the new feature and its usage.
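
A minimal .slinkignore sketch, inferred from the loader in this file (ignorePaths takes gitignore-style path patterns, ignoreURLs takes substring or doublestar URL patterns); the values here are illustrative only:

    {
      "ignorePaths": ["vendor/**", "testdata/"],
      "ignoreURLs": ["https://example.com/**", "localhost"]
    }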
package fsurls

import (
	"bufio"
	"encoding/json"
	"fmt"
	"io"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"

	"github.com/bmatcuk/doublestar/v4"
	ignore "github.com/sabhiram/go-gitignore"
)

// URL patterns from various contexts
var bareURLRegex = regexp.MustCompile(`(?i)\bhttps?://[^\s<>\[\]{}"']+`)
var mdLinkRegex = regexp.MustCompile(`(?is)!?\[[^\]]*\]\((.*?)\)`) // captures (url)
var angleURLRegex = regexp.MustCompile(`(?i)<(https?://[^>\s]+)>`)
var quotedURLRegex = regexp.MustCompile(`(?i)"(https?://[^"\s]+)"|'(https?://[^'\s]+)'`)
var htmlHrefRegex = regexp.MustCompile(`(?i)href\s*=\s*"([^"]+)"|href\s*=\s*'([^']+)'`)
var htmlSrcRegex = regexp.MustCompile(`(?i)src\s*=\s*"([^"]+)"|src\s*=\s*'([^']+)'`)

// Markdown code sections to ignore when extracting autolinks
var mdFencedCodeRegex = regexp.MustCompile("(?s)```[\\s\\S]*?```")
var mdInlineCodeRegex = regexp.MustCompile("`[^`]+`")

// Strict hostname validation: labels 1-63 chars, alnum & hyphen, not start/end hyphen, at least one dot, simple TLD
var hostnameRegex = regexp.MustCompile(`^(?i)([a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)(?:\.[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)+$`)
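
// Illustrative matches for hostnameRegex (examples assumed, not in the original):
// "example.com" and "sub.example.co.uk" pass; "localhost" fails (no dot), and
// "-bad.com" fails (label starts with a hyphen).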

func isDebugEnv() bool {
	if os.Getenv("SLINKY_DEBUG") == "1" {
		return true
	}
	if strings.EqualFold(os.Getenv("ACTIONS_STEP_DEBUG"), "true") {
		return true
	}
	if os.Getenv("RUNNER_DEBUG") == "1" {
		return true
	}
	return false
}

// CollectURLs walks the directory tree rooted at rootPath and collects URLs found in
// text-based files matching any of the provided glob patterns (doublestar ** supported).
// If globs is empty, all files are considered. Respects .gitignore if present and respectGitignore=true.
// Returns a map from URL -> sorted unique list of file paths that contained it.
func CollectURLs(rootPath string, globs []string, respectGitignore bool) (map[string][]string, error) {
	if strings.TrimSpace(rootPath) == "" {
		rootPath = "."
	}
	cleanRoot := filepath.Clean(rootPath)

	st, _ := os.Stat(cleanRoot)
	isFileRoot := st != nil && !st.IsDir()

	var ign *ignore.GitIgnore
	if !isFileRoot && respectGitignore {
		ign = loadGitIgnore(cleanRoot)
	}
	// Load optional .slinkignore config
	slPathIgnore, slURLPatterns := loadSlinkyIgnore(cleanRoot)

	var patterns []string
	for _, g := range globs {
		g = strings.TrimSpace(g)
		if g == "" {
			continue
		}
		patterns = append(patterns, g)
	}

	shouldInclude := func(rel string) bool {
		if len(patterns) == 0 {
			return true
		}
		for _, p := range patterns {
			ok, _ := doublestar.PathMatch(p, rel)
			if ok {
				return true
			}
		}
		return false
	}

	urlToFiles := make(map[string]map[string]struct{})

	// 2 MiB max file size to avoid huge/binary files
	const maxSize = 2 * 1024 * 1024

	// Walk the filesystem
	walkFn := func(path string, d os.DirEntry, err error) error {
		if isDebugEnv() {
			fmt.Printf("::debug:: Walking path: %s\n", path)
		}

		if err != nil {
			return nil
		}
		rel, rerr := filepath.Rel(cleanRoot, path)
		if rerr != nil {
			rel = path
		}
		rel = filepath.ToSlash(rel)
		if d.IsDir() {
			base := filepath.Base(path)
			if base == ".git" {
				return filepath.SkipDir
			}
			return nil
		}
		if (ign != nil && ign.MatchesPath(rel)) || (slPathIgnore != nil && slPathIgnore.MatchesPath(rel)) {
			return nil
		}
		info, ierr := d.Info()
		if ierr != nil {
			return nil
		}
		if info.Size() > maxSize {
			return nil
		}
		if isFileRoot && rel == "." {
			rel = filepath.ToSlash(filepath.Base(path))
		}
		if !shouldInclude(rel) {
			return nil
		}

		// Debug: announce file being parsed; GitHub shows ::debug only in debug runs
		if isDebugEnv() {
			fmt.Printf("::debug:: Scanned File: %s\n", rel)
		}

		f, ferr := os.Open(path)
		if ferr != nil {
			return nil
		}
		defer f.Close()
		br := bufio.NewReader(f)
		// Read up to maxSize bytes
		var b strings.Builder
		read := int64(0)
		for {
			chunk, cerr := br.ReadString('\n')
			b.WriteString(chunk)
			read += int64(len(chunk))
			if cerr == io.EOF || read > maxSize {
				break
			}
			if cerr != nil {
				break
			}
		}
		content := b.String()
		// Skip if likely binary (NUL present)
		if strings.IndexByte(content, '\x00') >= 0 {
			return nil
		}

		candidates := extractCandidates(rel, content)
		if len(candidates) == 0 {
			return nil
		}
		for _, raw := range candidates {
			u := sanitizeURLToken(raw)
			if u == "" {
				continue
			}
			if isURLIgnored(u, slURLPatterns) {
				continue
			}
			fileSet, ok := urlToFiles[u]
			if !ok {
				fileSet = make(map[string]struct{})
				urlToFiles[u] = fileSet
			}
			fileSet[rel] = struct{}{}
		}
		return nil
	}

	_ = filepath.WalkDir(cleanRoot, walkFn)

	// Convert to sorted slices
	result := make(map[string][]string, len(urlToFiles))
	for u, files := range urlToFiles {
		var list []string
		for fp := range files {
			list = append(list, fp)
		}
		sort.Strings(list)
		result[u] = list
	}
	return result, nil
}
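
// exampleCollectURLs is an illustrative sketch, not part of the original API:
// it shows one way a caller might scan Markdown files under the current
// directory, honor .gitignore, and print each URL with its referencing files.
func exampleCollectURLs() {
	urls, err := CollectURLs(".", []string{"**/*.md"}, true)
	if err != nil {
		return
	}
	for u, files := range urls {
		fmt.Printf("%s -> %v\n", u, files)
	}
}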

// CollectURLsProgress is like CollectURLs but invokes onFile(relPath) for each included file.
func CollectURLsProgress(rootPath string, globs []string, respectGitignore bool, onFile func(string)) (map[string][]string, error) {
	if strings.TrimSpace(rootPath) == "" {
		rootPath = "."
	}
	cleanRoot := filepath.Clean(rootPath)

	st, _ := os.Stat(cleanRoot)
	isFileRoot := st != nil && !st.IsDir()

	var ign *ignore.GitIgnore
	if !isFileRoot && respectGitignore {
		ign = loadGitIgnore(cleanRoot)
	}
	slPathIgnore, slURLPatterns := loadSlinkyIgnore(cleanRoot)

	var patterns []string
	for _, g := range globs {
		g = strings.TrimSpace(g)
		if g == "" {
			continue
		}
		patterns = append(patterns, g)
	}

	shouldInclude := func(rel string) bool {
		if len(patterns) == 0 {
			return true
		}
		for _, p := range patterns {
			ok, _ := doublestar.PathMatch(p, rel)
			if ok {
				return true
			}
		}
		return false
	}

	urlToFiles := make(map[string]map[string]struct{})

	// 2 MiB max file size to avoid huge/binary files
	const maxSize = 2 * 1024 * 1024

	walkFn := func(path string, d os.DirEntry, err error) error {
		if err != nil {
			return nil
		}
		rel, rerr := filepath.Rel(cleanRoot, path)
		if rerr != nil {
			rel = path
		}
		rel = filepath.ToSlash(rel)
		if d.IsDir() {
			base := filepath.Base(path)
			if base == ".git" {
				return filepath.SkipDir
			}
			return nil
		}
		if (ign != nil && ign.MatchesPath(rel)) || (slPathIgnore != nil && slPathIgnore.MatchesPath(rel)) {
			return nil
		}
		info, ierr := d.Info()
		if ierr != nil {
			return nil
		}
		if info.Size() > maxSize {
			return nil
		}
		if isFileRoot && rel == "." {
			rel = filepath.ToSlash(filepath.Base(path))
		}
		if !shouldInclude(rel) {
			return nil
		}

		if onFile != nil {
			onFile(rel)
		}

		f, ferr := os.Open(path)
		if ferr != nil {
			return nil
		}
		defer f.Close()
		br := bufio.NewReader(f)
		var b strings.Builder
		read := int64(0)
		for {
			chunk, cerr := br.ReadString('\n')
			b.WriteString(chunk)
			read += int64(len(chunk))
			if cerr == io.EOF || read > maxSize {
				break
			}
			if cerr != nil {
				break
			}
		}
		content := b.String()
		if strings.IndexByte(content, '\x00') >= 0 {
			return nil
		}

		candidates := extractCandidates(rel, content)
		if len(candidates) == 0 {
			return nil
		}
		for _, raw := range candidates {
			u := sanitizeURLToken(raw)
			if u == "" {
				continue
			}
			if isURLIgnored(u, slURLPatterns) {
				continue
			}
			fileSet, ok := urlToFiles[u]
			if !ok {
				fileSet = make(map[string]struct{})
				urlToFiles[u] = fileSet
			}
			fileSet[rel] = struct{}{}
		}
		return nil
	}

	_ = filepath.WalkDir(cleanRoot, walkFn)

	result := make(map[string][]string, len(urlToFiles))
	for u, files := range urlToFiles {
		var list []string
		for fp := range files {
			list = append(list, fp)
		}
		sort.Strings(list)
		result[u] = list
	}
	return result, nil
}
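
// exampleCollectURLsProgress is an illustrative sketch, not part of the
// original API: the onFile callback fires once per included file, which a
// caller could use to drive a progress indicator.
func exampleCollectURLsProgress() {
	urls, _ := CollectURLsProgress(".", nil, true, func(rel string) {
		fmt.Printf("scanning %s\n", rel)
	})
	fmt.Printf("found %d unique URLs\n", len(urls))
}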

func sanitizeURLToken(s string) string {
	s = strings.TrimSpace(s)
	// Strip surrounding angle brackets or quotes
	if strings.HasPrefix(s, "<") && strings.HasSuffix(s, ">") {
		s = strings.TrimSuffix(strings.TrimPrefix(s, "<"), ">")
	}
	if (strings.HasPrefix(s, "\"") && strings.HasSuffix(s, "\"")) || (strings.HasPrefix(s, "'") && strings.HasSuffix(s, "'")) {
		s = strings.TrimSuffix(strings.TrimPrefix(s, string(s[0])), string(s[0]))
	}
	// Trim obvious invalid chars at both ends and balance brackets/parentheses
	s = trimDelimiters(s)
	low := strings.ToLower(s)
	if !(strings.HasPrefix(low, "http://") || strings.HasPrefix(low, "https://")) {
		return ""
	}
	// Parse and validate hostname strictly
	u, err := url.Parse(s)
	if err != nil || u == nil {
		return ""
	}
	host := u.Hostname()
	if host == "" {
		return ""
	}
	// Reject placeholders like [tenant] or {tenant}
	if strings.ContainsAny(host, "[]{}") {
		return ""
	}
	// Must match strict hostname rules
	if !hostnameRegex.MatchString(host) {
		return ""
	}
	return s
}

func trimTrailingDelimiters(s string) string {
	for {
		if s == "" {
			return s
		}
		last := s[len(s)-1]
		// Preserve closing brackets/parens if balanced; only strip if unmatched
		switch last {
		case ')':
			opens := strings.Count(s, "(")
			closes := strings.Count(s, ")")
			if closes > opens {
				s = s[:len(s)-1]
				continue
			}
		case ']':
			opens := strings.Count(s, "[")
			closes := strings.Count(s, "]")
			if closes > opens {
				s = s[:len(s)-1]
				continue
			}
		case '}':
			opens := strings.Count(s, "{")
			closes := strings.Count(s, "}")
			if closes > opens {
				s = s[:len(s)-1]
				continue
			}
		case '>':
			opens := strings.Count(s, "<")
			closes := strings.Count(s, ">")
			if closes > opens {
				s = s[:len(s)-1]
				continue
			}
		default:
			// Common trailing punctuation and markdown emphasis markers that are not part of URLs
			if strings.ContainsRune(",.;:!?]'\"*_~`", rune(last)) {
				s = s[:len(s)-1]
				continue
			}
		}
		return s
	}
}

func trimLeadingDelimiters(s string) string {
	for {
		if s == "" {
			return s
		}
		first := s[0]
		// Strip common leading punctuation/formatting not valid at URL start
		if strings.ContainsRune("'\"*_~`,;:!?)]}.", rune(first)) {
			s = s[1:]
			continue
		}
		// If starts with unmatched opening bracket, drop it
		switch first {
		case '(':
			opens := strings.Count(s, "(")
			closes := strings.Count(s, ")")
			if opens > closes {
				s = s[1:]
				continue
			}
		case '[':
			opens := strings.Count(s, "[")
			closes := strings.Count(s, "]")
			if opens > closes {
				s = s[1:]
				continue
			}
		case '{':
			opens := strings.Count(s, "{")
			closes := strings.Count(s, "}")
			if opens > closes {
				s = s[1:]
				continue
			}
		case '<':
			opens := strings.Count(s, "<")
			closes := strings.Count(s, ">")
			if opens > closes {
				s = s[1:]
				continue
			}
		}
		return s
	}
}

// trimDelimiters trims invalid leading/trailing delimiters until the string stabilizes.
func trimDelimiters(s string) string {
	prev := ""
	for s != prev {
		prev = s
		s = trimLeadingDelimiters(s)
		s = trimTrailingDelimiters(s)
	}
	return s
}
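
// Illustrative trace (assumed, not in the original): trailing punctuation and
// an unmatched closing bracket are stripped while balanced pairs survive, so
// trimDelimiters("https://example.com).") returns "https://example.com".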

func extractCandidates(rel string, content string) []string {
	var out []string

	ext := strings.ToLower(filepath.Ext(rel))

	appendFromDual := func(matches [][]string) {
		for _, m := range matches {
			if len(m) > 2 {
				if m[1] != "" {
					out = append(out, m[1])
				} else if m[2] != "" {
					out = append(out, m[2])
				}
			}
		}
	}

	isMarkdown := ext == ".md" || ext == ".markdown" || ext == ".mdx"
	isHTML := ext == ".html" || ext == ".htm" || ext == ".xhtml"

	switch {
	case isMarkdown:
		// Remove fenced and inline code before scanning for URLs
		withoutFences := mdFencedCodeRegex.ReplaceAllString(content, "")
		withoutInline := mdInlineCodeRegex.ReplaceAllString(withoutFences, "")

		for _, m := range mdLinkRegex.FindAllStringSubmatch(withoutInline, -1) {
			if len(m) > 1 {
				out = append(out, m[1])
			}
		}
		for _, m := range angleURLRegex.FindAllStringSubmatch(withoutInline, -1) {
			if len(m) > 1 {
				out = append(out, m[1])
			}
		}
		appendFromDual(quotedURLRegex.FindAllStringSubmatch(withoutInline, -1))
		out = append(out, bareURLRegex.FindAllString(withoutInline, -1)...)

	case isHTML:
		appendFromDual(htmlHrefRegex.FindAllStringSubmatch(content, -1))
		appendFromDual(htmlSrcRegex.FindAllStringSubmatch(content, -1))

	default:
		for _, m := range angleURLRegex.FindAllStringSubmatch(content, -1) {
			if len(m) > 1 {
				out = append(out, m[1])
			}
		}
		appendFromDual(quotedURLRegex.FindAllStringSubmatch(content, -1))
		out = append(out, bareURLRegex.FindAllString(content, -1)...)
	}

	return out
}
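
// Illustrative behavior (examples assumed, not in the original): in a .md
// file, "[docs](https://example.com/docs)" yields the candidate
// "https://example.com/docs", while a URL wrapped in backticks is dropped with
// its inline code span before scanning.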

func loadGitIgnore(root string) *ignore.GitIgnore {
	var lines []string
	gi := filepath.Join(root, ".gitignore")
	if b, err := os.ReadFile(gi); err == nil {
		lines = append(lines, strings.Split(string(b), "\n")...)
	}
	ge := filepath.Join(root, ".git", "info", "exclude")
	if b, err := os.ReadFile(ge); err == nil {
		lines = append(lines, strings.Split(string(b), "\n")...)
	}
	if len(lines) == 0 {
		return nil
	}
	return ignore.CompileIgnoreLines(lines...)
}

// .slinkignore support
type slinkyIgnore struct {
	IgnorePaths []string `json:"ignorePaths"`
	IgnoreURLs  []string `json:"ignoreURLs"`
}

func loadSlinkyIgnore(root string) (*ignore.GitIgnore, []string) {
	cfgPath := filepath.Join(root, ".slinkignore")
	b, err := os.ReadFile(cfgPath)
	if err != nil || len(b) == 0 {
		return nil, nil
	}
	var cfg slinkyIgnore
	if jerr := json.Unmarshal(b, &cfg); jerr != nil {
		return nil, nil
	}
	var ign *ignore.GitIgnore
	if len(cfg.IgnorePaths) > 0 {
		ign = ignore.CompileIgnoreLines(cfg.IgnorePaths...)
	}
	var urlPatterns []string
	for _, p := range cfg.IgnoreURLs {
		p = strings.TrimSpace(p)
		if p != "" {
			urlPatterns = append(urlPatterns, p)
		}
	}
	return ign, urlPatterns
}

func isURLIgnored(u string, patterns []string) bool {
	if len(patterns) == 0 {
		return false
	}
	for _, p := range patterns {
		if p == "" {
			continue
		}
		// simple contains or wildcard suffix/prefix match
		if p == u || strings.Contains(u, p) {
			return true
		}
		// doublestar path-like match for full URL string
		if ok, _ := doublestar.PathMatch(p, u); ok {
			return true
		}
	}
	return false
}
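
// Illustrative behavior (examples assumed, not in the original): with patterns
// ["example.com"], "https://example.com/page" is ignored via the substring
// check; with ["https://example.com/**"], the same URL is ignored via the
// doublestar match.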