slinky/internal/fsurls/fsurls.go
Commit c42386c8e8 (Luke Hagar, 2025-09-19 18:12:37 +00:00): Add debug logging for ignored files and .slinkignore detection in fsurls.go
Enhances the CollectURLs and CollectURLsProgress functions to emit debug output when files are ignored by ignore patterns, and adds debug statements that log the detection of .slinkignore files and the compilation of ignore patterns. This improves traceability during URL collection and helps when debugging the ignore logic.


package fsurls

import (
"bufio"
"encoding/json"
"fmt"
"io"
"net/url"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"github.com/bmatcuk/doublestar/v4"
ignore "github.com/sabhiram/go-gitignore"
)
// URL patterns from various contexts
var bareURLRegex = regexp.MustCompile(`(?i)\bhttps?://[^\s<>\[\]{}"']+`)
var mdLinkRegex = regexp.MustCompile(`(?is)!?\[[^\]]*\]\((.*?)\)`) // captures (url)
var angleURLRegex = regexp.MustCompile(`(?i)<(https?://[^>\s]+)>`)
var quotedURLRegex = regexp.MustCompile(`(?i)"(https?://[^"\s]+)"|'(https?://[^'\s]+)'`)
var htmlHrefRegex = regexp.MustCompile(`(?i)href\s*=\s*"([^"]+)"|href\s*=\s*'([^']+)'`)
var htmlSrcRegex = regexp.MustCompile(`(?i)src\s*=\s*"([^"]+)"|src\s*=\s*'([^']+)'`)
// Strict hostname validation: labels 1-63 chars, alnum & hyphen, not start/end hyphen, at least one dot, simple TLD
var hostnameRegex = regexp.MustCompile(`^(?i)([a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)(?:\.[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)+$`)
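// Illustratively, this pattern accepts hosts such as "example.com" or
// "api.sub-domain.io", and rejects single-label hosts like "localhost",
// labels that begin or end with a hyphen, and bracketed placeholders such as
// "[tenant].example.com".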
// isDebugEnv reports whether debug logging is enabled via SLINKY_DEBUG=1,
// GitHub Actions step debug (ACTIONS_STEP_DEBUG=true), or runner diagnostic
// logging (RUNNER_DEBUG=1).
func isDebugEnv() bool {
if os.Getenv("SLINKY_DEBUG") == "1" {
return true
}
if strings.EqualFold(os.Getenv("ACTIONS_STEP_DEBUG"), "true") {
return true
}
if os.Getenv("RUNNER_DEBUG") == "1" {
return true
}
return false
}
// CollectURLs walks the directory tree rooted at rootPath and collects URLs found in
// text-based files matching any of the provided glob patterns (doublestar ** supported).
// If globs is empty, all files are considered. When respectGitignore is true, .gitignore
// and .git/info/exclude are honored; an optional .slinkignore config can further exclude
// paths and URL patterns. Returns a map from URL to a sorted, de-duplicated list of
// "path|line|col" locations where the URL was found.
func CollectURLs(rootPath string, globs []string, respectGitignore bool) (map[string][]string, error) {
if strings.TrimSpace(rootPath) == "" {
rootPath = "."
}
cleanRoot := filepath.Clean(rootPath)
st, _ := os.Stat(cleanRoot)
isFileRoot := st != nil && !st.IsDir()
var ign *ignore.GitIgnore
if !isFileRoot && respectGitignore {
ign = loadGitIgnore(cleanRoot)
}
// Load optional .slinkignore config
slPathIgnore, slURLPatterns := loadSlinkyIgnore(cleanRoot)
var patterns []string
for _, g := range globs {
g = strings.TrimSpace(g)
if g == "" {
continue
}
patterns = append(patterns, g)
}
shouldInclude := func(rel string) bool {
if len(patterns) == 0 {
return true
}
for _, p := range patterns {
ok, _ := doublestar.PathMatch(p, rel)
if ok {
return true
}
}
return false
}
urlToFiles := make(map[string]map[string]struct{})
// 2 MiB max file size to avoid huge/binary files
const maxSize = 2 * 1024 * 1024
// Walk the filesystem
walkFn := func(path string, d os.DirEntry, err error) error {
if err != nil {
return nil
}
rel, rerr := filepath.Rel(cleanRoot, path)
if rerr != nil {
rel = path
}
rel = filepath.ToSlash(rel)
if d.IsDir() {
base := filepath.Base(path)
if base == ".git" {
return filepath.SkipDir
}
return nil
}
// Always skip any .slinkignore file from scanning
if filepath.Base(path) == ".slinkignore" {
return nil
}
if (ign != nil && ign.MatchesPath(rel)) || (slPathIgnore != nil && slPathIgnore.MatchesPath(rel)) {
if isDebugEnv() {
fmt.Printf("::debug:: Ignoring file: %s\n", rel)
}
return nil
}
info, ierr := d.Info()
if ierr != nil {
return nil
}
if info.Size() > maxSize {
return nil
}
if isFileRoot && rel == "." {
rel = filepath.ToSlash(filepath.Base(path))
}
if !shouldInclude(rel) {
return nil
}
f, ferr := os.Open(path)
if ferr != nil {
return nil
}
defer f.Close()
br := bufio.NewReader(f)
// Read up to maxSize bytes
var b strings.Builder
read := int64(0)
for {
chunk, cerr := br.ReadString('\n')
b.WriteString(chunk)
read += int64(len(chunk))
if cerr == io.EOF || read > maxSize {
break
}
if cerr != nil {
break
}
}
content := b.String()
// Skip if likely binary (NUL present)
if strings.IndexByte(content, '\x00') >= 0 {
return nil
}
matches := extractCandidateMatches(content)
if len(matches) == 0 {
return nil
}
for _, m := range matches {
u := sanitizeURLToken(m.URL)
if u == "" {
continue
}
if isURLIgnored(u, slURLPatterns) {
continue
}
line, col := computeLineCol(content, m.Offset)
source := fmt.Sprintf("%s|%d|%d", rel, line, col)
fileSet, ok := urlToFiles[u]
if !ok {
fileSet = make(map[string]struct{})
urlToFiles[u] = fileSet
}
fileSet[source] = struct{}{}
}
return nil
}
if err := filepath.WalkDir(cleanRoot, walkFn); err != nil {
return nil, err
}
// Convert to sorted slices
result := make(map[string][]string, len(urlToFiles))
for u, files := range urlToFiles {
var list []string
for fp := range files {
list = append(list, fp)
}
sort.Strings(list)
result[u] = list
}
return result, nil
}
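// exampleCollectURLs is an illustrative sketch, not part of the original API
// surface: it shows one plausible call to CollectURLs (the root path and glob
// here are assumptions) and how the "path|line|col" sources can be printed.
func exampleCollectURLs() {
urls, err := CollectURLs(".", []string{"**/*.md"}, true)
if err != nil {
return
}
for u, sources := range urls {
fmt.Printf("%s -> %v\n", u, sources)
}
}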
// CollectURLsProgress is like CollectURLs but invokes onFile(relPath) for each
// included file just before it is scanned; onFile may be nil.
func CollectURLsProgress(rootPath string, globs []string, respectGitignore bool, onFile func(string)) (map[string][]string, error) {
if strings.TrimSpace(rootPath) == "" {
rootPath = "."
}
cleanRoot := filepath.Clean(rootPath)
st, _ := os.Stat(cleanRoot)
isFileRoot := st != nil && !st.IsDir()
var ign *ignore.GitIgnore
if !isFileRoot && respectGitignore {
ign = loadGitIgnore(cleanRoot)
}
slPathIgnore, slURLPatterns := loadSlinkyIgnore(cleanRoot)
var patterns []string
for _, g := range globs {
g = strings.TrimSpace(g)
if g == "" {
continue
}
patterns = append(patterns, g)
}
shouldInclude := func(rel string) bool {
if len(patterns) == 0 {
return true
}
for _, p := range patterns {
ok, _ := doublestar.PathMatch(p, rel)
if ok {
return true
}
}
return false
}
urlToFiles := make(map[string]map[string]struct{})
// 2 MiB max file size to avoid huge/binary files
const maxSize = 2 * 1024 * 1024
walkFn := func(path string, d os.DirEntry, err error) error {
if err != nil {
return nil
}
rel, rerr := filepath.Rel(cleanRoot, path)
if rerr != nil {
rel = path
}
rel = filepath.ToSlash(rel)
if d.IsDir() {
base := filepath.Base(path)
if base == ".git" {
return filepath.SkipDir
}
return nil
}
// Always skip any .slinkignore file from scanning
if filepath.Base(path) == ".slinkignore" {
return nil
}
if (ign != nil && ign.MatchesPath(rel)) || (slPathIgnore != nil && slPathIgnore.MatchesPath(rel)) {
if isDebugEnv() {
fmt.Printf("::debug:: Ignoring file: %s\n", rel)
}
return nil
}
info, ierr := d.Info()
if ierr != nil {
return nil
}
if info.Size() > maxSize {
return nil
}
if isFileRoot && rel == "." {
rel = filepath.ToSlash(filepath.Base(path))
}
if !shouldInclude(rel) {
return nil
}
if onFile != nil {
onFile(rel)
}
f, ferr := os.Open(path)
if ferr != nil {
return nil
}
defer f.Close()
br := bufio.NewReader(f)
var b strings.Builder
read := int64(0)
for {
chunk, cerr := br.ReadString('\n')
b.WriteString(chunk)
read += int64(len(chunk))
if cerr == io.EOF || read > maxSize {
break
}
if cerr != nil {
break
}
}
content := b.String()
if strings.IndexByte(content, '\x00') >= 0 {
return nil
}
matches := extractCandidateMatches(content)
if len(matches) == 0 {
return nil
}
for _, m := range matches {
u := sanitizeURLToken(m.URL)
if u == "" {
continue
}
if isURLIgnored(u, slURLPatterns) {
continue
}
line, col := computeLineCol(content, m.Offset)
source := fmt.Sprintf("%s|%d|%d", rel, line, col)
fileSet, ok := urlToFiles[u]
if !ok {
fileSet = make(map[string]struct{})
urlToFiles[u] = fileSet
}
fileSet[source] = struct{}{}
}
return nil
}
if err := filepath.WalkDir(cleanRoot, walkFn); err != nil {
return nil, err
}
result := make(map[string][]string, len(urlToFiles))
for u, files := range urlToFiles {
var list []string
for fp := range files {
list = append(list, fp)
}
sort.Strings(list)
result[u] = list
}
return result, nil
}
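// exampleCollectURLsProgress is a hypothetical sketch of the progress variant;
// the callback simply mirrors each scanned path, which is how a CLI might
// drive a progress display.
func exampleCollectURLsProgress() {
results, _ := CollectURLsProgress(".", nil, true, func(rel string) {
fmt.Println("scanning:", rel)
})
fmt.Printf("found %d distinct URLs\n", len(results))
}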
// sanitizeURLToken normalizes a raw URL candidate: it strips wrapping angle
// brackets or quotes, trims stray delimiters, and returns "" unless the result
// is an http(s) URL whose hostname passes strict validation.
func sanitizeURLToken(s string) string {
s = strings.TrimSpace(s)
// Strip surrounding angle brackets or quotes
if strings.HasPrefix(s, "<") && strings.HasSuffix(s, ">") {
s = strings.TrimSuffix(strings.TrimPrefix(s, "<"), ">")
}
if (strings.HasPrefix(s, "\"") && strings.HasSuffix(s, "\"")) || (strings.HasPrefix(s, "'") && strings.HasSuffix(s, "'")) {
s = strings.TrimSuffix(strings.TrimPrefix(s, string(s[0])), string(s[0]))
}
// Trim obvious invalid chars at both ends and balance brackets/parentheses
s = trimDelimiters(s)
low := strings.ToLower(s)
if !(strings.HasPrefix(low, "http://") || strings.HasPrefix(low, "https://")) {
return ""
}
// Parse and validate hostname strictly
u, err := url.Parse(s)
if err != nil || u == nil {
return ""
}
host := u.Hostname()
if host == "" {
return ""
}
// Reject placeholders like [tenant] or {tenant}
if strings.ContainsAny(host, "[]{}") {
return ""
}
// Must match strict hostname rules
if !hostnameRegex.MatchString(host) {
return ""
}
return s
}
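// Illustrative behavior on hypothetical inputs:
//
//	sanitizeURLToken("<https://example.com/docs>")   // "https://example.com/docs"
//	sanitizeURLToken("https://{tenant}.example.com") // "" (placeholder host)
//	sanitizeURLToken("ftp://example.com")            // "" (not http/https)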
// trimTrailingDelimiters strips trailing punctuation that is unlikely to be
// part of a URL, removing closing brackets only when they are unmatched
// within the token.
func trimTrailingDelimiters(s string) string {
for {
if s == "" {
return s
}
last := s[len(s)-1]
// Preserve closing brackets/parens if balanced; only strip if unmatched
switch last {
case ')':
if strings.Count(s, ")") > strings.Count(s, "(") {
s = s[:len(s)-1]
continue
}
case ']':
if strings.Count(s, "]") > strings.Count(s, "[") {
s = s[:len(s)-1]
continue
}
case '}':
if strings.Count(s, "}") > strings.Count(s, "{") {
s = s[:len(s)-1]
continue
}
case '>':
if strings.Count(s, ">") > strings.Count(s, "<") {
s = s[:len(s)-1]
continue
}
default:
// Common trailing punctuation and markdown emphasis markers that are not part of URLs
if strings.ContainsRune(",.;:!?]'\"*_~`", rune(last)) {
s = s[:len(s)-1]
continue
}
}
return s
}
}
// trimLeadingDelimiters strips leading punctuation that cannot start a URL,
// removing opening brackets only when they are unmatched within the token.
func trimLeadingDelimiters(s string) string {
for {
if s == "" {
return s
}
first := s[0]
// Strip common leading punctuation/formatting not valid at URL start
if strings.ContainsRune("'\"*_~`,;:!?)]}.", rune(first)) {
s = s[1:]
continue
}
// If the token starts with an unmatched opening bracket, drop it
switch first {
case '(':
if strings.Count(s, "(") > strings.Count(s, ")") {
s = s[1:]
continue
}
case '[':
if strings.Count(s, "[") > strings.Count(s, "]") {
s = s[1:]
continue
}
case '{':
if strings.Count(s, "{") > strings.Count(s, "}") {
s = s[1:]
continue
}
case '<':
if strings.Count(s, "<") > strings.Count(s, ">") {
s = s[1:]
continue
}
}
return s
}
}
// trimDelimiters trims invalid leading/trailing delimiters until the string stabilizes.
func trimDelimiters(s string) string {
prev := ""
for s != prev {
prev = s
s = trimLeadingDelimiters(s)
s = trimTrailingDelimiters(s)
}
return s
}
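// For instance (hypothetical tokens):
//
//	trimDelimiters("(https://example.com/a_(b))") // unchanged: brackets balanced
//	trimDelimiters("https://example.com/page).")  // "https://example.com/page"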
// matchCandidate holds a URL and its byte offset within the content
type matchCandidate struct {
URL string
Offset int
}
// computeLineCol returns 1-based line and column given a byte offset
func computeLineCol(content string, offset int) (int, int) {
if offset < 0 {
return 1, 1
}
if offset > len(content) {
offset = len(content)
}
line := 1
col := 1
for i := 0; i < offset; i++ {
if content[i] == '\n' {
line++
col = 1
} else {
col++
}
}
return line, col
}
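// For example, in the content "ab\ncd", offset 3 (the byte 'c') maps to
// line 2, column 1.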
// extractCandidateMatches finds URL-like tokens with their offsets for line/col mapping
func extractCandidateMatches(content string) []matchCandidate {
var out []matchCandidate
// Markdown links: capture group 1 is the URL inside (...)
if subs := mdLinkRegex.FindAllStringSubmatchIndex(content, -1); len(subs) > 0 {
for _, idx := range subs {
if len(idx) >= 4 && idx[2] >= 0 && idx[3] >= 0 {
url := content[idx[2]:idx[3]]
out = append(out, matchCandidate{URL: url, Offset: idx[2]})
}
}
}
// HTML href
if subs := htmlHrefRegex.FindAllStringSubmatchIndex(content, -1); len(subs) > 0 {
for _, idx := range subs {
// groups 1 and 2 are alternatives
if len(idx) >= 4 && idx[2] >= 0 && idx[3] >= 0 {
url := content[idx[2]:idx[3]]
out = append(out, matchCandidate{URL: url, Offset: idx[2]})
} else if len(idx) >= 6 && idx[4] >= 0 && idx[5] >= 0 {
url := content[idx[4]:idx[5]]
out = append(out, matchCandidate{URL: url, Offset: idx[4]})
}
}
}
// HTML src
if subs := htmlSrcRegex.FindAllStringSubmatchIndex(content, -1); len(subs) > 0 {
for _, idx := range subs {
if len(idx) >= 4 && idx[2] >= 0 && idx[3] >= 0 {
url := content[idx[2]:idx[3]]
out = append(out, matchCandidate{URL: url, Offset: idx[2]})
} else if len(idx) >= 6 && idx[4] >= 0 && idx[5] >= 0 {
url := content[idx[4]:idx[5]]
out = append(out, matchCandidate{URL: url, Offset: idx[4]})
}
}
}
// Angle autolinks <http://...>
if subs := angleURLRegex.FindAllStringSubmatchIndex(content, -1); len(subs) > 0 {
for _, idx := range subs {
if len(idx) >= 4 && idx[2] >= 0 && idx[3] >= 0 {
url := content[idx[2]:idx[3]]
out = append(out, matchCandidate{URL: url, Offset: idx[2]})
}
}
}
// Quoted URLs
if subs := quotedURLRegex.FindAllStringSubmatchIndex(content, -1); len(subs) > 0 {
for _, idx := range subs {
if len(idx) >= 4 && idx[2] >= 0 && idx[3] >= 0 {
url := content[idx[2]:idx[3]]
out = append(out, matchCandidate{URL: url, Offset: idx[2]})
} else if len(idx) >= 6 && idx[4] >= 0 && idx[5] >= 0 {
url := content[idx[4]:idx[5]]
out = append(out, matchCandidate{URL: url, Offset: idx[4]})
}
}
}
// Bare URLs
if spans := bareURLRegex.FindAllStringIndex(content, -1); len(spans) > 0 {
for _, sp := range spans {
url := content[sp[0]:sp[1]]
out = append(out, matchCandidate{URL: url, Offset: sp[0]})
}
}
return out
}
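// As an illustration, in `see [docs](https://example.com/a).` both the
// Markdown matcher and the bare-URL matcher report a candidate starting at
// the same byte offset; after sanitizing, the duplicates collapse into a
// single line/column entry downstream.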
// loadGitIgnore reads .gitignore and .git/info/exclude under root (either may
// be absent) and compiles their combined patterns, returning nil when no
// patterns are found.
func loadGitIgnore(root string) *ignore.GitIgnore {
var lines []string
gi := filepath.Join(root, ".gitignore")
if isDebugEnv() {
fmt.Printf("::debug:: Checking for .gitignore at: %s\n", gi)
}
if _, err := os.Stat(gi); err == nil {
if isDebugEnv() {
fmt.Printf("::debug:: Reading .gitignore from: %s\n", gi)
}
if b, err := os.ReadFile(gi); err == nil {
for ln := range strings.SplitSeq(string(b), "\n") {
lines = append(lines, ln)
}
}
} else if isDebugEnv() {
fmt.Printf("::debug:: .gitignore not found at: %s\n", gi)
}
ge := filepath.Join(root, ".git", "info", "exclude")
if isDebugEnv() {
fmt.Printf("::debug:: Checking for .git/info/exclude at: %s\n", ge)
}
if _, err := os.Stat(ge); err == nil {
if isDebugEnv() {
fmt.Printf("::debug:: Reading .git/info/exclude from: %s\n", ge)
}
if b, err := os.ReadFile(ge); err == nil {
for ln := range strings.SplitSeq(string(b), "\n") {
lines = append(lines, ln)
}
}
} else if isDebugEnv() {
fmt.Printf("::debug:: .git/info/exclude not found at: %s\n", ge)
}
if len(lines) == 0 {
if isDebugEnv() {
fmt.Printf("::debug:: No ignore patterns found in .gitignore or .git/info/exclude\n")
}
return nil
}
if isDebugEnv() {
fmt.Printf("::debug:: Compiling .gitignore and .git/info/exclude patterns\n")
}
return ignore.CompileIgnoreLines(lines...)
}
// .slinkignore support
// slinkyIgnore models the optional .slinkignore JSON config; either field may
// be omitted, in which case encoding/json simply leaves it at its zero value.
type slinkyIgnore struct {
IgnorePaths []string `json:"ignorePaths"`
IgnoreURLs []string `json:"ignoreURLs"`
}
// loadSlinkyIgnore locates the nearest .slinkignore (searching upward from
// root), parses it as JSON (tolerating trailing commas), and returns a path
// matcher for ignorePaths plus the raw ignoreURLs patterns.
func loadSlinkyIgnore(root string) (*ignore.GitIgnore, []string) {
cfgPath := findSlinkyConfig(root)
if cfgPath == "" {
return nil, nil
}
b, err := os.ReadFile(cfgPath)
if err != nil || len(b) == 0 {
return nil, nil
}
var cfg slinkyIgnore
// First attempt strict JSON
if jerr := json.Unmarshal(b, &cfg); jerr != nil {
// Try a relaxed pass: strip trailing commas before ] or }
relaxed := regexp.MustCompile(`,\s*([}\]])`).ReplaceAll(b, []byte("$1"))
if jerr2 := json.Unmarshal(relaxed, &cfg); jerr2 != nil {
// Emit a GitHub Actions warning so users see misconfigurations
fmt.Printf("::warning:: Failed to parse .slinkignore at %s: %v\n", cfgPath, jerr)
return nil, nil
}
}
if isDebugEnv() {
fmt.Println("::debug:: Loaded .slinkignore")
fmt.Printf("::debug:: IgnorePaths: %v\n", cfg.IgnorePaths)
fmt.Printf("::debug:: IgnoreURLs: %v\n", cfg.IgnoreURLs)
}
var ign *ignore.GitIgnore
if len(cfg.IgnorePaths) > 0 {
var lines []string
for _, p := range cfg.IgnorePaths {
p = strings.TrimSpace(p)
if p == "" {
continue
}
lines = append(lines, p)
// Add a recursive variant to match anywhere
if !strings.HasPrefix(p, "**/") {
lines = append(lines, "**/"+p)
}
// If likely a directory name, add a catch-all under it
base := strings.TrimSuffix(p, "/")
if base != "" && !strings.ContainsAny(base, "*?[]") {
// Heuristic: directory-like if it has no '.' in the last segment or explicitly ends with '/'
last := filepath.Base(base)
if strings.HasSuffix(p, "/") || !strings.Contains(last, ".") {
lines = append(lines, "**/"+base+"/**")
}
}
}
if isDebugEnv() {
fmt.Printf("::debug:: Compiled ignore patterns: %v\n", lines)
}
ign = ignore.CompileIgnoreLines(lines...)
if isDebugEnv() {
fmt.Printf("::debug:: Ignore matcher created successfully\n")
}
}
var urlPatterns []string
for _, p := range cfg.IgnoreURLs {
p = strings.TrimSpace(p)
if p != "" {
urlPatterns = append(urlPatterns, p)
}
}
return ign, urlPatterns
}
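// As an illustration of the expansion above, an ignorePaths entry "vendor"
// yields the patterns "vendor", "**/vendor", and "**/vendor/**" (the directory
// is ignored at any depth), while "*.min.js" yields only "*.min.js" and
// "**/*.min.js" because it contains wildcard characters.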
// findSlinkyConfig searches upward from root for a .slinkignore file
func findSlinkyConfig(root string) string {
cur := root
for {
cfg := filepath.Join(cur, ".slinkignore")
if st, err := os.Stat(cfg); err == nil && !st.IsDir() {
if isDebugEnv() {
fmt.Printf("::debug:: Found .slinkignore at: %s\n", cfg)
}
return cfg
}
parent := filepath.Dir(cur)
if parent == cur || strings.TrimSpace(parent) == "" {
break
}
cur = parent
}
if isDebugEnv() {
fmt.Printf("::debug:: No .slinkignore file found starting from: %s\n", root)
}
return ""
}
// isURLIgnored reports whether u matches any .slinkignore URL pattern.
// Patterns without wildcards match exactly or as substrings; patterns with
// '*' or '?' are tried as doublestar globs (with '*' widened to '**' so it
// spans slashes) and then as anchored regexes.
func isURLIgnored(u string, patterns []string) bool {
if len(patterns) == 0 {
return false
}
for _, raw := range patterns {
p := strings.TrimSpace(raw)
if p == "" {
continue
}
// No wildcards: exact or substring match
if !strings.ContainsAny(p, "*?") {
if u == p || strings.Contains(u, p) {
return true
}
continue
}
// Glob-style: allow '*' to span slashes by converting '*' -> '**'
dsPat := strings.ReplaceAll(p, "*", "**")
if ok, _ := doublestar.PathMatch(dsPat, u); ok {
return true
}
// Regex fallback: '*' -> '.*', '?' -> '.'
if re, err := wildcardToRegex(p); err == nil && re.MatchString(u) {
return true
}
}
return false
}
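// Illustrative matches against hypothetical patterns:
//
//	isURLIgnored("https://example.com/x", []string{"example.com"})       // true (substring)
//	isURLIgnored("https://example.com/a/b", []string{"https://*/b"})     // true ('*' spans slashes)
//	isURLIgnored("https://example.org", []string{"https://example.com"}) // false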
// wildcardToRegex converts a simple wildcard pattern to an anchored regexp,
// mapping '*' to ".*" and '?' to ".".
func wildcardToRegex(pattern string) (*regexp.Regexp, error) {
escaped := regexp.QuoteMeta(pattern)
escaped = strings.ReplaceAll(escaped, "\\*", ".*")
escaped = strings.ReplaceAll(escaped, "\\?", ".")
return regexp.Compile("^" + escaped + "$")
}
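// For example, the hypothetical pattern "https://example.com/*" compiles to
// ^https://example\.com/.*$ and so matches any URL under that host.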