mirror of
https://github.com/LukeHagar/slinky.git
synced 2025-12-06 12:47:45 +00:00
Enhance URL extraction in fsurls.go by introducing new trimming functions for leading and trailing delimiters, improving URL sanitization. Update regex patterns for better markdown handling and adjust the extractCandidates function to support relative paths. Add tests to validate URL sanitization and preservation of balanced parentheses.
This commit is contained in:
@@ -16,13 +16,17 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// URL patterns from various contexts
|
// URL patterns from various contexts
|
||||||
var bareURLRegex = regexp.MustCompile(`(?i)\bhttps?://[^\s<>()\[\]{}"']+`)
|
var bareURLRegex = regexp.MustCompile(`(?i)\bhttps?://[^\s<>\[\]{}"']+`)
|
||||||
var mdLinkRegex = regexp.MustCompile(`(?is)!?\[[^\]]*\]\((.*?)\)`) // captures (url)
|
var mdLinkRegex = regexp.MustCompile(`(?is)!?\[[^\]]*\]\((.*?)\)`) // captures (url)
|
||||||
var angleURLRegex = regexp.MustCompile(`(?i)<(https?://[^>\s]+)>`)
|
var angleURLRegex = regexp.MustCompile(`(?i)<(https?://[^>\s]+)>`)
|
||||||
var quotedURLRegex = regexp.MustCompile(`(?i)"(https?://[^"\s]+)"|'(https?://[^'\s]+)'`)
|
var quotedURLRegex = regexp.MustCompile(`(?i)"(https?://[^"\s]+)"|'(https?://[^'\s]+)'`)
|
||||||
var htmlHrefRegex = regexp.MustCompile(`(?i)href\s*=\s*"([^"]+)"|href\s*=\s*'([^']+)'`)
|
var htmlHrefRegex = regexp.MustCompile(`(?i)href\s*=\s*"([^"]+)"|href\s*=\s*'([^']+)'`)
|
||||||
var htmlSrcRegex = regexp.MustCompile(`(?i)src\s*=\s*"([^"]+)"|src\s*=\s*'([^']+)'`)
|
var htmlSrcRegex = regexp.MustCompile(`(?i)src\s*=\s*"([^"]+)"|src\s*=\s*'([^']+)'`)
|
||||||
|
|
||||||
|
// Markdown code sections to ignore when extracting autolinks
|
||||||
|
var mdFencedCodeRegex = regexp.MustCompile("(?s)```[\\s\\S]*?```")
|
||||||
|
var mdInlineCodeRegex = regexp.MustCompile("`[^`]+`")
|
||||||
|
|
||||||
// Strict hostname validation: labels 1-63 chars, alnum & hyphen, not start/end hyphen, at least one dot, simple TLD
|
// Strict hostname validation: labels 1-63 chars, alnum & hyphen, not start/end hyphen, at least one dot, simple TLD
|
||||||
var hostnameRegex = regexp.MustCompile(`^(?i)([a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)(?:\.[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)+$`)
|
var hostnameRegex = regexp.MustCompile(`^(?i)([a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)(?:\.[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)+$`)
|
||||||
|
|
||||||
@@ -153,7 +157,7 @@ func CollectURLs(rootPath string, globs []string, respectGitignore bool) (map[st
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
candidates := extractCandidates(content)
|
candidates := extractCandidates(rel, content)
|
||||||
if len(candidates) == 0 {
|
if len(candidates) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -290,7 +294,7 @@ func CollectURLsProgress(rootPath string, globs []string, respectGitignore bool,
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
candidates := extractCandidates(content)
|
candidates := extractCandidates(rel, content)
|
||||||
if len(candidates) == 0 {
|
if len(candidates) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -332,8 +336,8 @@ func sanitizeURLToken(s string) string {
|
|||||||
if (strings.HasPrefix(s, "\"") && strings.HasSuffix(s, "\"")) || (strings.HasPrefix(s, "'") && strings.HasSuffix(s, "'")) {
|
if (strings.HasPrefix(s, "\"") && strings.HasSuffix(s, "\"")) || (strings.HasPrefix(s, "'") && strings.HasSuffix(s, "'")) {
|
||||||
s = strings.TrimSuffix(strings.TrimPrefix(s, string(s[0])), string(s[0]))
|
s = strings.TrimSuffix(strings.TrimPrefix(s, string(s[0])), string(s[0]))
|
||||||
}
|
}
|
||||||
// Trim trailing punctuation and balance parentheses
|
// Trim obvious invalid chars at both ends and balance brackets/parentheses
|
||||||
s = trimTrailingDelimiters(s)
|
s = trimDelimiters(s)
|
||||||
low := strings.ToLower(s)
|
low := strings.ToLower(s)
|
||||||
if !(strings.HasPrefix(low, "http://") || strings.HasPrefix(low, "https://")) {
|
if !(strings.HasPrefix(low, "http://") || strings.HasPrefix(low, "https://")) {
|
||||||
return ""
|
return ""
|
||||||
@@ -364,62 +368,174 @@ func trimTrailingDelimiters(s string) string {
|
|||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
last := s[len(s)-1]
|
last := s[len(s)-1]
|
||||||
if strings.ContainsRune(").,;:!?]'\"}", rune(last)) {
|
// Preserve closing brackets/parens if balanced; only strip if unmatched
|
||||||
s = s[:len(s)-1]
|
switch last {
|
||||||
continue
|
case ')':
|
||||||
}
|
|
||||||
if last == ')' {
|
|
||||||
open := strings.Count(s, "(")
|
open := strings.Count(s, "(")
|
||||||
close := strings.Count(s, ")")
|
close := strings.Count(s, ")")
|
||||||
if close > open {
|
if close > open {
|
||||||
s = s[:len(s)-1]
|
s = s[:len(s)-1]
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
case ']':
|
||||||
|
open := strings.Count(s, "[")
|
||||||
|
close := strings.Count(s, "]")
|
||||||
|
if close > open {
|
||||||
|
s = s[:len(s)-1]
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
case '}':
|
||||||
|
open := strings.Count(s, "{")
|
||||||
|
close := strings.Count(s, "}")
|
||||||
|
if close > open {
|
||||||
|
s = s[:len(s)-1]
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
case '>':
|
||||||
|
open := strings.Count(s, "<")
|
||||||
|
close := strings.Count(s, ">")
|
||||||
|
if close > open {
|
||||||
|
s = s[:len(s)-1]
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
// Common trailing punctuation and markdown emphasis markers that are not part of URLs
|
||||||
|
if strings.ContainsRune(",.;:!?]'\"*_~`", rune(last)) {
|
||||||
|
s = s[:len(s)-1]
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractCandidates(content string) []string {
|
func trimLeadingDelimiters(s string) string {
|
||||||
|
for {
|
||||||
|
if s == "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
first := s[0]
|
||||||
|
// Strip common leading punctuation/formatting not valid at URL start
|
||||||
|
if strings.ContainsRune("'\"*_~`,;:!?)]}.", rune(first)) {
|
||||||
|
s = s[1:]
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// If starts with unmatched opening bracket, drop it
|
||||||
|
switch first {
|
||||||
|
case '(':
|
||||||
|
open := strings.Count(s, "(")
|
||||||
|
close := strings.Count(s, ")")
|
||||||
|
if open > close {
|
||||||
|
s = s[1:]
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
case '[':
|
||||||
|
open := strings.Count(s, "[")
|
||||||
|
close := strings.Count(s, "]")
|
||||||
|
if open > close {
|
||||||
|
s = s[1:]
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
case '{':
|
||||||
|
open := strings.Count(s, "{")
|
||||||
|
close := strings.Count(s, "}")
|
||||||
|
if open > close {
|
||||||
|
s = s[1:]
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
case '<':
|
||||||
|
open := strings.Count(s, "<")
|
||||||
|
close := strings.Count(s, ">")
|
||||||
|
if open > close {
|
||||||
|
s = s[1:]
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// trimDelimiters trims invalid leading/trailing delimiters until the string stabilizes.
|
||||||
|
func trimDelimiters(s string) string {
|
||||||
|
prev := ""
|
||||||
|
for s != prev {
|
||||||
|
prev = s
|
||||||
|
s = trimLeadingDelimiters(s)
|
||||||
|
s = trimTrailingDelimiters(s)
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractCandidates(rel string, content string) []string {
|
||||||
var out []string
|
var out []string
|
||||||
for _, m := range mdLinkRegex.FindAllStringSubmatch(content, -1) {
|
|
||||||
if len(m) > 1 {
|
lowerRel := strings.ToLower(rel)
|
||||||
out = append(out, m[1])
|
ext := strings.ToLower(filepath.Ext(lowerRel))
|
||||||
}
|
|
||||||
}
|
appendFromDual := func(matches [][]string) {
|
||||||
for _, m := range htmlHrefRegex.FindAllStringSubmatch(content, -1) {
|
for _, m := range matches {
|
||||||
if len(m) > 2 {
|
if len(m) > 2 {
|
||||||
if m[1] != "" {
|
if m[1] != "" {
|
||||||
out = append(out, m[1])
|
out = append(out, m[1])
|
||||||
} else if m[2] != "" {
|
} else if m[2] != "" {
|
||||||
out = append(out, m[2])
|
out = append(out, m[2])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for _, m := range htmlSrcRegex.FindAllStringSubmatch(content, -1) {
|
|
||||||
if len(m) > 2 {
|
isMarkdown := ext == ".md" || ext == ".markdown" || ext == ".mdx"
|
||||||
if m[1] != "" {
|
isHTML := ext == ".html" || ext == ".htm" || ext == ".xhtml"
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case isMarkdown:
|
||||||
|
// Remove fenced and inline code before scanning for URLs
|
||||||
|
withoutFences := mdFencedCodeRegex.ReplaceAllString(content, "")
|
||||||
|
withoutInline := mdInlineCodeRegex.ReplaceAllString(withoutFences, "")
|
||||||
|
|
||||||
|
for _, m := range mdLinkRegex.FindAllStringSubmatch(withoutInline, -1) {
|
||||||
|
if len(m) > 1 {
|
||||||
out = append(out, m[1])
|
out = append(out, m[1])
|
||||||
} else if m[2] != "" {
|
|
||||||
out = append(out, m[2])
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
for _, m := range angleURLRegex.FindAllStringSubmatch(withoutInline, -1) {
|
||||||
for _, m := range angleURLRegex.FindAllStringSubmatch(content, -1) {
|
if len(m) > 1 {
|
||||||
if len(m) > 1 {
|
|
||||||
out = append(out, m[1])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, m := range quotedURLRegex.FindAllStringSubmatch(content, -1) {
|
|
||||||
if len(m) > 2 {
|
|
||||||
if m[1] != "" {
|
|
||||||
out = append(out, m[1])
|
out = append(out, m[1])
|
||||||
} else if m[2] != "" {
|
|
||||||
out = append(out, m[2])
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for _, m := range quotedURLRegex.FindAllStringSubmatch(withoutInline, -1) {
|
||||||
|
if len(m) > 2 {
|
||||||
|
if m[1] != "" {
|
||||||
|
out = append(out, m[1])
|
||||||
|
} else if m[2] != "" {
|
||||||
|
out = append(out, m[2])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out = append(out, bareURLRegex.FindAllString(withoutInline, -1)...)
|
||||||
|
|
||||||
|
case isHTML:
|
||||||
|
appendFromDual(htmlHrefRegex.FindAllStringSubmatch(content, -1))
|
||||||
|
appendFromDual(htmlSrcRegex.FindAllStringSubmatch(content, -1))
|
||||||
|
|
||||||
|
default:
|
||||||
|
for _, m := range angleURLRegex.FindAllStringSubmatch(content, -1) {
|
||||||
|
if len(m) > 1 {
|
||||||
|
out = append(out, m[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, m := range quotedURLRegex.FindAllStringSubmatch(content, -1) {
|
||||||
|
if len(m) > 2 {
|
||||||
|
if m[1] != "" {
|
||||||
|
out = append(out, m[1])
|
||||||
|
} else if m[2] != "" {
|
||||||
|
out = append(out, m[2])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out = append(out, bareURLRegex.FindAllString(content, -1)...)
|
||||||
}
|
}
|
||||||
out = append(out, bareURLRegex.FindAllString(content, -1)...)
|
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -27,6 +27,22 @@ func TestCollectURLs_FromCodeFiles(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Ensure sanitizer trims emphasis and punctuation
|
||||||
|
if _, ok := urls["https://sailpoint.api.identitynow.com/v2024"]; !ok {
|
||||||
|
t.Fatalf("expected sanitized emphasized URL to be collected without trailing *")
|
||||||
|
}
|
||||||
|
if _, ok := urls["https://example.com/path"]; !ok {
|
||||||
|
t.Fatalf("expected URL with trailing ) to be trimmed")
|
||||||
|
}
|
||||||
|
if _, ok := urls["https://example.com/foo"]; !ok {
|
||||||
|
t.Fatalf("expected URL with trailing , to be trimmed")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Balanced parens should be preserved
|
||||||
|
if _, ok := urls["https://example.com/q?(x)"]; !ok {
|
||||||
|
t.Fatalf("expected URL with balanced parentheses to be preserved")
|
||||||
|
}
|
||||||
|
|
||||||
// Placeholder patterns should be excluded by strict validation
|
// Placeholder patterns should be excluded by strict validation
|
||||||
placeholders := []string{
|
placeholders := []string{
|
||||||
"https://[tenant].api.identitynow.com",
|
"https://[tenant].api.identitynow.com",
|
||||||
|
|||||||
@@ -165,21 +165,8 @@ func WriteMarkdown(path string, results []web.Result, s Summary) (string, error)
|
|||||||
return path, nil
|
return path, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func escapeMD(s string) string { return html.EscapeString(s) }
|
func escapeMD(s string) string {
|
||||||
|
return html.EscapeString(s)
|
||||||
func formatSourcesList(srcs []string) string {
|
|
||||||
if len(srcs) == 0 {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
var b strings.Builder
|
|
||||||
b.WriteString("<ul>\n")
|
|
||||||
for _, s := range srcs {
|
|
||||||
b.WriteString(" <li><code>")
|
|
||||||
b.WriteString(escapeMD(s))
|
|
||||||
b.WriteString("</code></li>\n")
|
|
||||||
}
|
|
||||||
b.WriteString("</ul>")
|
|
||||||
return b.String()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func escapeLinkPath(p string) string {
|
func escapeLinkPath(p string) string {
|
||||||
|
|||||||
17
testdata/test18.md
vendored
Normal file
17
testdata/test18.md
vendored
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
# Emphasis and punctuation edge cases
|
||||||
|
|
||||||
|
This line has an emphasized URL: *https://sailpoint.api.identitynow.com/v2024*
|
||||||
|
|
||||||
|
This one has trailing punctuation: https://example.com/path), https://example.com/foo,
|
||||||
|
|
||||||
|
Balanced parentheses should remain: https://example.com/q?(x)
|
||||||
|
|
||||||
|
Inline code should be ignored: `https://ignore.me/inside/code`
|
||||||
|
|
||||||
|
Fenced code should be ignored:
|
||||||
|
|
||||||
|
```
|
||||||
|
curl https://ignore.me/in/fenced/code
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user