Refactor URL extraction in fsurls.go: replace extractCandidates with extractCandidateMatches, which records each match's byte offset so every occurrence can be mapped to a line and column. Update WriteMarkdown to parse the resulting path|line|col source keys and emit report links that display the file path and jump to the exact line.

Luke Hagar
2025-09-13 01:14:55 +00:00
parent 7c52360fbe
commit 2df17e8247
2 changed files with 118 additions and 78 deletions

fsurls.go

@@ -24,10 +24,6 @@ var quotedURLRegex = regexp.MustCompile(`(?i)"(https?://[^"\s]+)"|'(https?://[^'
 var htmlHrefRegex = regexp.MustCompile(`(?i)href\s*=\s*"([^"]+)"|href\s*=\s*'([^']+)'`)
 var htmlSrcRegex = regexp.MustCompile(`(?i)src\s*=\s*"([^"]+)"|src\s*=\s*'([^']+)'`)
-// Markdown code sections to ignore when extracting autolinks
-var mdFencedCodeRegex = regexp.MustCompile("(?s)```[\\s\\S]*?```")
-var mdInlineCodeRegex = regexp.MustCompile("`[^`]+`")
 // Strict hostname validation: labels 1-63 chars, alnum & hyphen, not start/end hyphen, at least one dot, simple TLD
 var hostnameRegex = regexp.MustCompile(`^(?i)([a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)(?:\.[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)+$`)
@@ -160,24 +156,26 @@ func CollectURLs(rootPath string, globs []string, respectGitignore bool) (map[st
 			return nil
 		}
-		candidates := extractCandidates(rel, content)
-		if len(candidates) == 0 {
+		matches := extractCandidateMatches(content)
+		if len(matches) == 0 {
 			return nil
 		}
-		for _, raw := range candidates {
-			u := sanitizeURLToken(raw)
+		for _, m := range matches {
+			u := sanitizeURLToken(m.URL)
 			if u == "" {
 				continue
 			}
 			if isURLIgnored(u, slURLPatterns) {
 				continue
 			}
+			line, col := computeLineCol(content, m.Offset)
+			source := fmt.Sprintf("%s|%d|%d", rel, line, col)
 			fileSet, ok := urlToFiles[u]
 			if !ok {
 				fileSet = make(map[string]struct{})
 				urlToFiles[u] = fileSet
 			}
-			fileSet[rel] = struct{}{}
+			fileSet[source] = struct{}{}
 		}
 		return nil
 	}
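Both collectors now record each occurrence under a composite key instead of the bare relative path. A minimal sketch of that key's round trip, using a hypothetical path and position (not taken from the repository):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Build the key the way CollectURLs does; "docs/readme.md", 12 and 5 are made-up values.
	source := fmt.Sprintf("%s|%d|%d", "docs/readme.md", 12, 5)
	fmt.Println(source) // docs/readme.md|12|5

	// WriteMarkdown later splits the key back into path and line for link formatting.
	parts := strings.Split(source, "|")
	fmt.Println(parts[0], parts[1]) // docs/readme.md 12
}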
@@ -301,24 +299,26 @@ func CollectURLsProgress(rootPath string, globs []string, respectGitignore bool,
 			return nil
 		}
-		candidates := extractCandidates(rel, content)
-		if len(candidates) == 0 {
+		matches := extractCandidateMatches(content)
+		if len(matches) == 0 {
 			return nil
 		}
-		for _, raw := range candidates {
-			u := sanitizeURLToken(raw)
+		for _, m := range matches {
+			u := sanitizeURLToken(m.URL)
 			if u == "" {
 				continue
 			}
 			if isURLIgnored(u, slURLPatterns) {
 				continue
 			}
+			line, col := computeLineCol(content, m.Offset)
+			source := fmt.Sprintf("%s|%d|%d", rel, line, col)
 			fileSet, ok := urlToFiles[u]
 			if !ok {
 				fileSet = make(map[string]struct{})
 				urlToFiles[u] = fileSet
 			}
-			fileSet[rel] = struct{}{}
+			fileSet[source] = struct{}{}
 		}
 		return nil
 	}
@@ -477,75 +477,101 @@ func trimDelimiters(s string) string {
 }
 
 func extractCandidates(rel string, content string) []string {
-	var out []string
-
-	lowerRel := strings.ToLower(rel)
-	ext := strings.ToLower(filepath.Ext(lowerRel))
-
-	appendFromDual := func(matches [][]string) {
-		for _, m := range matches {
-			if len(m) > 2 {
-				if m[1] != "" {
-					out = append(out, m[1])
-				} else if m[2] != "" {
-					out = append(out, m[2])
-				}
-			}
-		}
-	}
-
-	isMarkdown := ext == ".md" || ext == ".markdown" || ext == ".mdx"
-	isHTML := ext == ".html" || ext == ".htm" || ext == ".xhtml"
-
-	switch {
-	case isMarkdown:
-		// Remove fenced and inline code before scanning for URLs
-		withoutFences := mdFencedCodeRegex.ReplaceAllString(content, "")
-		withoutInline := mdInlineCodeRegex.ReplaceAllString(withoutFences, "")
-		for _, m := range mdLinkRegex.FindAllStringSubmatch(withoutInline, -1) {
-			if len(m) > 1 {
-				out = append(out, m[1])
-			}
-		}
-		for _, m := range angleURLRegex.FindAllStringSubmatch(withoutInline, -1) {
-			if len(m) > 1 {
-				out = append(out, m[1])
-			}
-		}
-		for _, m := range quotedURLRegex.FindAllStringSubmatch(withoutInline, -1) {
-			if len(m) > 2 {
-				if m[1] != "" {
-					out = append(out, m[1])
-				} else if m[2] != "" {
-					out = append(out, m[2])
-				}
-			}
-		}
-		out = append(out, bareURLRegex.FindAllString(withoutInline, -1)...)
-	case isHTML:
-		appendFromDual(htmlHrefRegex.FindAllStringSubmatch(content, -1))
-		appendFromDual(htmlSrcRegex.FindAllStringSubmatch(content, -1))
-	default:
-		for _, m := range angleURLRegex.FindAllStringSubmatch(content, -1) {
-			if len(m) > 1 {
-				out = append(out, m[1])
-			}
-		}
-		for _, m := range quotedURLRegex.FindAllStringSubmatch(content, -1) {
-			if len(m) > 2 {
-				if m[1] != "" {
-					out = append(out, m[1])
-				} else if m[2] != "" {
-					out = append(out, m[2])
-				}
-			}
-		}
-		out = append(out, bareURLRegex.FindAllString(content, -1)...)
-	}
+	return nil
+}
+
+// matchCandidate holds a URL and its byte offset within the content
+type matchCandidate struct {
+	URL    string
+	Offset int
+}
+
+// computeLineCol returns 1-based line and column given a byte offset
+func computeLineCol(content string, offset int) (int, int) {
+	if offset < 0 {
+		return 1, 1
+	}
+	if offset > len(content) {
+		offset = len(content)
+	}
+	line := 1
+	col := 1
+	for i := 0; i < offset; i++ {
+		if content[i] == '\n' {
+			line++
+			col = 1
+		} else {
+			col++
+		}
+	}
+	return line, col
+}
+
+// extractCandidateMatches finds URL-like tokens with their offsets for line/col mapping
+func extractCandidateMatches(content string) []matchCandidate {
+	var out []matchCandidate
+	// Markdown links: capture group 1 is the URL inside (...)
+	if subs := mdLinkRegex.FindAllStringSubmatchIndex(content, -1); len(subs) > 0 {
+		for _, idx := range subs {
+			if len(idx) >= 4 && idx[2] >= 0 && idx[3] >= 0 {
+				url := content[idx[2]:idx[3]]
+				out = append(out, matchCandidate{URL: url, Offset: idx[2]})
+			}
+		}
+	}
+	// HTML href
+	if subs := htmlHrefRegex.FindAllStringSubmatchIndex(content, -1); len(subs) > 0 {
+		for _, idx := range subs {
+			// groups 1 and 2 are alternatives
+			if len(idx) >= 4 && idx[2] >= 0 && idx[3] >= 0 {
+				url := content[idx[2]:idx[3]]
+				out = append(out, matchCandidate{URL: url, Offset: idx[2]})
+			} else if len(idx) >= 6 && idx[4] >= 0 && idx[5] >= 0 {
+				url := content[idx[4]:idx[5]]
+				out = append(out, matchCandidate{URL: url, Offset: idx[4]})
+			}
+		}
+	}
+	// HTML src
+	if subs := htmlSrcRegex.FindAllStringSubmatchIndex(content, -1); len(subs) > 0 {
+		for _, idx := range subs {
+			if len(idx) >= 4 && idx[2] >= 0 && idx[3] >= 0 {
+				url := content[idx[2]:idx[3]]
+				out = append(out, matchCandidate{URL: url, Offset: idx[2]})
+			} else if len(idx) >= 6 && idx[4] >= 0 && idx[5] >= 0 {
+				url := content[idx[4]:idx[5]]
+				out = append(out, matchCandidate{URL: url, Offset: idx[4]})
+			}
+		}
+	}
+	// Angle autolinks <http://...>
+	if subs := angleURLRegex.FindAllStringSubmatchIndex(content, -1); len(subs) > 0 {
+		for _, idx := range subs {
+			if len(idx) >= 4 && idx[2] >= 0 && idx[3] >= 0 {
+				url := content[idx[2]:idx[3]]
+				out = append(out, matchCandidate{URL: url, Offset: idx[2]})
+			}
+		}
+	}
+	// Quoted URLs
+	if subs := quotedURLRegex.FindAllStringSubmatchIndex(content, -1); len(subs) > 0 {
+		for _, idx := range subs {
+			if len(idx) >= 4 && idx[2] >= 0 && idx[3] >= 0 {
+				url := content[idx[2]:idx[3]]
+				out = append(out, matchCandidate{URL: url, Offset: idx[2]})
+			} else if len(idx) >= 6 && idx[4] >= 0 && idx[5] >= 0 {
+				url := content[idx[4]:idx[5]]
+				out = append(out, matchCandidate{URL: url, Offset: idx[4]})
+			}
+		}
+	}
+	// Bare URLs
+	if spans := bareURLRegex.FindAllStringIndex(content, -1); len(spans) > 0 {
+		for _, sp := range spans {
+			url := content[sp[0]:sp[1]]
+			out = append(out, matchCandidate{URL: url, Offset: sp[0]})
+		}
+	}
 	return out
 }
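The index-based regexp calls above return flat offset pairs: idx[0] and idx[1] span the whole match, idx[2] and idx[3] the first capture group, and so on, with -1 for groups that did not participate; those start offsets are what feed the Offset field. A self-contained sketch of the offset-to-position mapping, with the helper copied from the diff and a made-up input string:

package main

import "fmt"

// computeLineCol mirrors the helper above: 1-based line and column for a byte offset.
func computeLineCol(content string, offset int) (int, int) {
	if offset < 0 {
		return 1, 1
	}
	if offset > len(content) {
		offset = len(content)
	}
	line, col := 1, 1
	for i := 0; i < offset; i++ {
		if content[i] == '\n' {
			line++
			col = 1
		} else {
			col++
		}
	}
	return line, col
}

func main() {
	// Hypothetical file content; the URL starts at byte offset 10.
	content := "intro\nsee https://example.com for details\n"
	line, col := computeLineCol(content, 10)
	fmt.Println(line, col) // 2 5
}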

Markdown report writer (WriteMarkdown)

@@ -145,10 +145,24 @@ func WriteMarkdown(path string, results []web.Result, s Summary) (string, error)
 		}
 		sort.Strings(files)
 		for _, fn := range files {
-			if strings.TrimSpace(s.RepoBlobBaseURL) != "" {
-				buf.WriteString(fmt.Sprintf(" - [%s](%s/%s)\n", escapeMD(fn), strings.TrimRight(s.RepoBlobBaseURL, "/"), escapeLinkPath(fn)))
+			display := fn
+			linkPath := fn
+			if parts := strings.Split(fn, "|"); len(parts) >= 2 {
+				p := parts[0]
+				line := strings.TrimSpace(parts[1])
+				display = p
+				if line != "" {
+					linkPath = fmt.Sprintf("%s#L%s", escapeLinkPath(p), line)
+				} else {
+					linkPath = escapeLinkPath(p)
+				}
 			} else {
-				buf.WriteString(fmt.Sprintf(" - [%s](./%s)\n", escapeMD(fn), escapeLinkPath(fn)))
+				linkPath = escapeLinkPath(linkPath)
+			}
+			if strings.TrimSpace(s.RepoBlobBaseURL) != "" {
+				buf.WriteString(fmt.Sprintf(" - [%s](%s/%s)\n", escapeMD(display), strings.TrimRight(s.RepoBlobBaseURL, "/"), linkPath))
+			} else {
+				buf.WriteString(fmt.Sprintf(" - [%s](./%s)\n", escapeMD(display), linkPath))
 			}
 		}
 	}
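Given a hypothetical occurrence key and blob base URL, the new branch would render a bullet like the one below (escaping elided; all values are illustrative, not from the actual report):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Made-up inputs: fn comes from the collector's path|line|col keys.
	fn := "internal/fsurls.go|42|7"
	base := "https://example.com/owner/repo/blob/main"

	parts := strings.Split(fn, "|")
	display, line := parts[0], parts[1]
	linkPath := fmt.Sprintf("%s#L%s", display, line) // escapeLinkPath omitted in this sketch
	fmt.Printf(" - [%s](%s/%s)\n", display, strings.TrimRight(base, "/"), linkPath)
	// Output:  - [internal/fsurls.go](https://example.com/owner/repo/blob/main/internal/fsurls.go#L42)
}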