mirror of https://github.com/LukeHagar/slinky.git (synced 2025-12-06 04:21:20 +00:00)
testing some optimizations
@@ -7,7 +7,9 @@ COPY . .

 RUN CGO_ENABLED=0 go build -o /usr/local/bin/slinky ./

 FROM alpine:3.20

-RUN apk add --no-cache curl jq ca-certificates
+# jq is used in entrypoint.sh for parsing GitHub event JSON
+# ca-certificates is needed for HTTPS requests
+RUN apk add --no-cache jq ca-certificates

 COPY --from=build /usr/local/bin/slinky /usr/local/bin/slinky
 COPY entrypoint.sh /entrypoint.sh
 RUN chmod +x /entrypoint.sh
@@ -16,15 +16,18 @@ jobs:
     runs-on: ubuntu-latest
     permissions:
       contents: read
-      pull-requests: write
+      pull-requests: write # Only needed if comment-pr is enabled
     steps:
       - uses: actions/checkout@v4
       - name: Run Slinky
         uses: LukeHagar/slinky@v1
         with:
           targets: "docs/,README.md,**/*.md"
+          # comment-pr: true # Optional: post results as PR comment (requires GITHUB_TOKEN)
 ```

+**Note:** The `GITHUB_TOKEN` is automatically provided by GitHub Actions and is only required for PR comment functionality. Core link checking works without it. If you disable PR comments (`comment-pr: false`), you can remove the `pull-requests: write` permission.
+
 ### Inputs

 - **targets**: Comma-separated paths and patterns to scan. Can be directories, files, or glob patterns (e.g. `docs/,api-specs/**/*.yaml,README.md`). Default: `**/*`
@@ -53,6 +53,7 @@ runs:
     INPUT_FAIL_ON_FAILURES: ${{ inputs.fail_on_failures }}
+    INPUT_COMMENT_PR: ${{ inputs.comment_pr }}
     INPUT_STEP_SUMMARY: ${{ inputs.step_summary }}
     GITHUB_TOKEN: ${{ github.token }}

 outputs:
   json_path:
cmd/check.go (125 changed lines)
@@ -158,9 +158,51 @@ func init() {
 		fmt.Printf("::debug:: Root: %s\n", displayRoot)
 	}

+	// Validate and clamp numeric inputs
+	if maxConcurrency < 1 {
+		maxConcurrency = 1
+	} else if maxConcurrency > 100 {
+		maxConcurrency = 100
+	}
+	if timeoutSeconds < 1 {
+		timeoutSeconds = 1
+	} else if timeoutSeconds > 300 {
+		timeoutSeconds = 300 // Max 5 minutes
+	}
+
 	// Build config
 	timeout := time.Duration(timeoutSeconds) * time.Second
-	cfg := web.Config{MaxConcurrency: maxConcurrency, RequestTimeout: timeout}
+
+	// Set up URL cache if cache path is provided via environment variable
+	var urlCache *web.URLCache
+	if cachePath := os.Getenv("SLINKY_CACHE_PATH"); cachePath != "" {
+		cacheTTL := 24 // Default 24 hours
+		if ttlStr := os.Getenv("SLINKY_CACHE_TTL_HOURS"); ttlStr != "" {
+			if ttl, err := time.ParseDuration(ttlStr + "h"); err == nil && ttl > 0 {
+				cacheTTL = int(ttl.Hours())
+			}
+		}
+		urlCache = web.NewURLCache(cachePath, cacheTTL)
+		if err := urlCache.Load(); err != nil {
+			if shouldDebug() {
+				fmt.Printf("::debug:: Failed to load cache: %v\n", err)
+			}
+		}
+		// Save cache when done
+		defer func() {
+			if err := urlCache.Save(); err != nil {
+				if shouldDebug() {
+					fmt.Printf("::debug:: Failed to save cache: %v\n", err)
+				}
+			}
+		}()
+	}
+
+	cfg := web.Config{
+		MaxConcurrency: maxConcurrency,
+		RequestTimeout: timeout,
+		Cache:          urlCache,
+	}

 	// Prepare URL list
 	var urls []string
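The TTL override above accepts fractional hours because it round-trips through `time.ParseDuration` with an `"h"` suffix appended. A minimal standalone sketch of that parsing rule (the sample values are hypothetical, not from the repo):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Mirrors the SLINKY_CACHE_TTL_HOURS handling: invalid or non-positive
	// values fall back to the 24-hour default; fractions truncate.
	for _, ttlStr := range []string{"24", "1.5", "0", "bogus"} {
		if ttl, err := time.ParseDuration(ttlStr + "h"); err == nil && ttl > 0 {
			fmt.Printf("%q -> %d hour(s)\n", ttlStr, int(ttl.Hours())) // "1.5" truncates to 1
		} else {
			fmt.Printf("%q -> default 24 hours\n", ttlStr)
		}
	}
}
```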
@@ -275,9 +317,16 @@ func init() {
 	}

 	// If running on a PR, post or update the comment(s), chunking as needed
-	if ghOK && strings.TrimSpace(finalMDPath) != "" {
+	// Check if PR commenting is enabled (default to true if not set)
+	commentPR := true
+	if val := os.Getenv("INPUT_COMMENT_PR"); val != "" {
+		commentPR = strings.EqualFold(val, "true")
+	}
+	if ghOK && commentPR && strings.TrimSpace(finalMDPath) != "" {
 		b, rerr := os.ReadFile(finalMDPath)
-		if rerr == nil {
+		if rerr != nil {
+			fmt.Printf("::warning:: Failed to read markdown report for PR comment: %v\n", rerr)
+		} else {
 			full := string(b)
 			if shouldDebug() {
 				fmt.Printf("::debug:: Report size (chars): %d\n", len(full))
@@ -286,7 +335,10 @@ func init() {
 			if shouldDebug() {
 				fmt.Printf("::debug:: Posting %d chunk(s)\n", len(chunks))
 			}
-			_ = upsertPRComments(ghRepo, ghPR, ghToken, chunks)
+			if err := upsertPRComments(ghRepo, ghPR, ghToken, chunks); err != nil {
+				// Non-critical error: log warning but don't fail the run
+				fmt.Printf("::warning:: Failed to post PR comment: %v\n", err)
+			}
 		}
 	}
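For reference, the INPUT_COMMENT_PR parsing in the hunk above defaults to enabled when the variable is unset, and treats anything other than a case-insensitive "true" as a disable. A standalone sketch with hypothetical values:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// Same logic as check.go: unset -> true; otherwise case-insensitive "true".
	for _, v := range []string{"", "true", "TRUE", "false", "yes"} {
		os.Setenv("INPUT_COMMENT_PR", v)
		commentPR := true
		if val := os.Getenv("INPUT_COMMENT_PR"); val != "" {
			commentPR = strings.EqualFold(val, "true")
		}
		fmt.Printf("%-7q -> %v\n", v, commentPR) // "" true, "TRUE" true, "yes" false
	}
}
```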
@@ -440,32 +492,52 @@ func chunkMarkdownByURL(body string) []string {
 }

 // upsertPRComments deletes any existing slinky comments and posts the new chunked comments in order.
+// Returns error if critical failures occur, but individual comment failures are logged and ignored.
 func upsertPRComments(repo string, prNumber int, token string, chunks []string) error {
 	apiBase := "https://api.github.com"
 	listURL := fmt.Sprintf("%s/repos/%s/issues/%d/comments?per_page=100", apiBase, repo, prNumber)
-	req, _ := http.NewRequest(http.MethodGet, listURL, nil)
+	req, err := http.NewRequest(http.MethodGet, listURL, nil)
+	if err != nil {
+		return fmt.Errorf("failed to create request: %w", err)
+	}
 	req.Header.Set("Authorization", "Bearer "+token)
 	req.Header.Set("Accept", "application/vnd.github+json")
 	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
-		return err
+		return fmt.Errorf("failed to list comments: %w", err)
 	}
 	defer resp.Body.Close()

+	if resp.StatusCode >= 400 {
+		return fmt.Errorf("failed to list comments: HTTP %d", resp.StatusCode)
+	}
+
 	var comments []struct {
 		ID   int    `json:"id"`
 		Body string `json:"body"`
 	}
-	b, _ := io.ReadAll(resp.Body)
-	_ = json.Unmarshal(b, &comments)
+	b, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return fmt.Errorf("failed to read comments response: %w", err)
+	}
+	if err := json.Unmarshal(b, &comments); err != nil {
+		// Non-critical: continue even if we can't parse existing comments
+		if shouldDebug() {
+			fmt.Printf("::debug:: Failed to parse comments: %v\n", err)
+		}
+	}

 	// Delete all existing slinky-report comments to avoid stale entries
 	for _, c := range comments {
 		if strings.Contains(c.Body, "<!-- slinky-report -->") {
 			delURL := fmt.Sprintf("%s/repos/%s/issues/comments/%d", apiBase, repo, c.ID)
-			dReq, _ := http.NewRequest(http.MethodDelete, delURL, nil)
+			dReq, err := http.NewRequest(http.MethodDelete, delURL, nil)
+			if err != nil {
+				continue // Skip if we can't create request
+			}
 			dReq.Header.Set("Authorization", "Bearer "+token)
 			dReq.Header.Set("Accept", "application/vnd.github+json")
-			_, _ = http.DefaultClient.Do(dReq)
+			_, _ = http.DefaultClient.Do(dReq) // Non-critical: ignore delete errors
 		}
 	}
@@ -473,14 +545,39 @@ func upsertPRComments(repo string, prNumber int, token string, chunks []string)
 	for idx, chunk := range chunks {
 		body := fmt.Sprintf("%s\n%s", "<!-- slinky-report -->", chunk)
 		postURL := fmt.Sprintf("%s/repos/%s/issues/%d/comments", apiBase, repo, prNumber)
-		payload, _ := json.Marshal(map[string]string{"body": body})
-		req, _ = http.NewRequest(http.MethodPost, postURL, bytes.NewReader(payload))
+		payload, err := json.Marshal(map[string]string{"body": body})
+		if err != nil {
+			if shouldDebug() {
+				fmt.Printf("::debug:: Failed to marshal comment payload: %v\n", err)
+			}
+			continue
+		}
+		req, err := http.NewRequest(http.MethodPost, postURL, bytes.NewReader(payload))
+		if err != nil {
+			if shouldDebug() {
+				fmt.Printf("::debug:: Failed to create POST request: %v\n", err)
+			}
+			continue
+		}
 		req.Header.Set("Authorization", "Bearer "+token)
 		req.Header.Set("Accept", "application/vnd.github+json")
 		req.Header.Set("Content-Type", "application/json")
-		res, _ := http.DefaultClient.Do(req)
+		res, err := http.DefaultClient.Do(req)
+		if err != nil {
+			if shouldDebug() {
+				fmt.Printf("::debug:: Failed to post chunk %d/%d: %v\n", idx+1, len(chunks), err)
+			}
+			continue
+		}
 		res.Body.Close()
+		if res.StatusCode >= 400 {
+			if shouldDebug() {
+				fmt.Printf("::debug:: Failed to post chunk %d/%d: HTTP %d\n", idx+1, len(chunks), res.StatusCode)
+			}
+			continue
+		}
 		if shouldDebug() {
-			fmt.Printf("::debug:: Posted chunk %d/%d: %v\n", idx+1, len(chunks), res)
+			fmt.Printf("::debug:: Posted chunk %d/%d successfully\n", idx+1, len(chunks))
 		}
 	}
 	return nil
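chunkMarkdownByURL is referenced but its body is not shown in this diff. GitHub caps issue comment bodies at 65,536 characters, which is presumably why the report is split before posting. A naive size-based splitter for illustration only — per its name, the real function splits on URL entry boundaries so a report line is never cut in half:

```go
package main

import "fmt"

// naiveChunks splits s into pieces of at most max bytes.
// Illustrative only; not the actual chunkMarkdownByURL implementation.
func naiveChunks(s string, max int) []string {
	var chunks []string
	for len(s) > max {
		chunks = append(chunks, s[:max])
		s = s[max:]
	}
	return append(chunks, s)
}

func main() {
	report := "<!-- slinky-report -->\n- https://example.com OK\n- https://example.org 404\n"
	for i, c := range naiveChunks(report, 40) {
		fmt.Printf("chunk %d: %d bytes\n", i+1, len(c))
	}
}
```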
@@ -1,93 +1,55 @@
 #!/bin/sh
 set -eu

-# Set up environment variables for GitHub blob base URL
+# Set up GitHub blob base URL for PR links
 if [ -n "${INPUT_REPO_BLOB_BASE:-}" ]; then
   export SLINKY_REPO_BLOB_BASE_URL="${INPUT_REPO_BLOB_BASE}"
 elif [ -n "${GITHUB_REPOSITORY:-}" ]; then
   COMMIT_SHA="${GITHUB_SHA:-}"
   if [ -n "${GITHUB_EVENT_PATH:-}" ] && command -v jq >/dev/null 2>&1; then
-    PR_HEAD_SHA="$(jq -r '.pull_request.head.sha // empty' "$GITHUB_EVENT_PATH" || true)"
-    if [ -n "$PR_HEAD_SHA" ]; then
-      COMMIT_SHA="$PR_HEAD_SHA"
-    fi
+    PR_HEAD_SHA="$(jq -r '.pull_request.head.sha // empty' "$GITHUB_EVENT_PATH" 2>/dev/null || true)"
+    [ -n "$PR_HEAD_SHA" ] && COMMIT_SHA="$PR_HEAD_SHA"
   fi
-  if [ -n "$COMMIT_SHA" ]; then
-    export SLINKY_REPO_BLOB_BASE_URL="https://github.com/${GITHUB_REPOSITORY}/blob/${COMMIT_SHA}"
-  fi
+  [ -n "$COMMIT_SHA" ] && export SLINKY_REPO_BLOB_BASE_URL="https://github.com/${GITHUB_REPOSITORY}/blob/${COMMIT_SHA}"
 fi

-# Build command arguments
-set -- check
+# Build slinky command arguments
+ARGS="check"

-# Add optional flags
-if [ -n "${INPUT_CONCURRENCY:-}" ]; then
-  set -- "$@" --concurrency "${INPUT_CONCURRENCY}"
-fi
+# Optional flags
+[ -n "${INPUT_CONCURRENCY:-}" ] && ARGS="$ARGS --concurrency ${INPUT_CONCURRENCY}"
+[ -n "${INPUT_TIMEOUT:-}" ] && ARGS="$ARGS --timeout ${INPUT_TIMEOUT}"
+[ -n "${INPUT_JSON_OUT:-}" ] && ARGS="$ARGS --json-out ${INPUT_JSON_OUT}"
+[ -n "${INPUT_MD_OUT:-}" ] && ARGS="$ARGS --md-out ${INPUT_MD_OUT}"
+[ -n "${INPUT_REPO_BLOB_BASE:-}" ] && ARGS="$ARGS --repo-blob-base ${INPUT_REPO_BLOB_BASE}"

-if [ -n "${INPUT_TIMEOUT:-}" ]; then
-  set -- "$@" --timeout "${INPUT_TIMEOUT}"
-fi
+# Boolean flags with defaults
+[ "${INPUT_FAIL_ON_FAILURES:-true}" = "true" ] && ARGS="$ARGS --fail-on-failures=true" || ARGS="$ARGS --fail-on-failures=false"
+[ "${INPUT_RESPECT_GITIGNORE:-true}" = "true" ] && ARGS="$ARGS --respect-gitignore=true" || ARGS="$ARGS --respect-gitignore=false"

-if [ -n "${INPUT_JSON_OUT:-}" ]; then
-  set -- "$@" --json-out "${INPUT_JSON_OUT}"
-fi
-
-if [ -n "${INPUT_MD_OUT:-}" ]; then
-  set -- "$@" --md-out "${INPUT_MD_OUT}"
-fi
-
-if [ -n "${INPUT_REPO_BLOB_BASE:-}" ]; then
-  set -- "$@" --repo-blob-base "${INPUT_REPO_BLOB_BASE}"
-fi
-
-if [ "${INPUT_FAIL_ON_FAILURES:-true}" = "true" ]; then
-  set -- "$@" --fail-on-failures=true
-else
-  set -- "$@" --fail-on-failures=false
-fi
-
-if [ "${INPUT_RESPECT_GITIGNORE:-true}" = "true" ]; then
-  set -- "$@" --respect-gitignore=true
-else
-  set -- "$@" --respect-gitignore=false
-fi

-# Add targets
+# Add targets (comma-separated glob patterns)
 if [ -n "${INPUT_TARGETS:-}" ]; then
-  # Split comma-separated targets and add each one
   IFS=','
   for target in $INPUT_TARGETS; do
-    target=$(echo "$target" | xargs) # trim whitespace
-    if [ -n "$target" ]; then
-      set -- "$@" "$target"
-    fi
+    target=$(echo "$target" | xargs)
+    [ -n "$target" ] && ARGS="$ARGS $target"
   done
   unset IFS
 else
   # Default: scan everything
-  set -- "$@" "**/*"
+  ARGS="$ARGS **/*"
 fi

-# Debug output
-if [ "${ACTIONS_STEP_DEBUG:-}" = "true" ]; then
-  printf "::debug:: CLI Args: slinky %s\n" "$*"
-fi
+[ "${ACTIONS_STEP_DEBUG:-}" = "true" ] && printf "::debug:: Running: slinky %s\n" "$ARGS"

-# Execute the command
-set +e
-slinky "$@"
-SLINKY_EXIT_CODE=$?
-set -e
+# Execute slinky (crawl repo with glob input, filter via .slinkignore)
+slinky $ARGS
+EXIT_CODE=$?

 # Expose outputs
 if [ -n "${GITHUB_OUTPUT:-}" ]; then
-  if [ -n "${INPUT_JSON_OUT:-}" ]; then
-    echo "json_path=${INPUT_JSON_OUT}" >> "$GITHUB_OUTPUT"
-  fi
-  if [ -n "${INPUT_MD_OUT:-}" ]; then
-    echo "md_path=${INPUT_MD_OUT}" >> "$GITHUB_OUTPUT"
-  fi
+  [ -n "${INPUT_JSON_OUT:-}" ] && echo "json_path=${INPUT_JSON_OUT}" >> "$GITHUB_OUTPUT"
+  [ -n "${INPUT_MD_OUT:-}" ] && echo "md_path=${INPUT_MD_OUT}" >> "$GITHUB_OUTPUT"
 fi

 # Append report to job summary if requested
@@ -95,4 +57,4 @@ if [ "${INPUT_STEP_SUMMARY:-true}" = "true" ] && [ -n "${GITHUB_STEP_SUMMARY:-}"
   cat "${INPUT_MD_OUT}" >> "$GITHUB_STEP_SUMMARY"
 fi

-exit ${SLINKY_EXIT_CODE:-0}
+exit ${EXIT_CODE:-0}
@@ -1,7 +1,6 @@
 package fsurls

 import (
-	"bufio"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -247,22 +246,14 @@ func CollectURLsWithIgnoreConfig(rootPath string, globs []string, respectGitigno
 			return nil
 		}
 		defer f.Close()
-		br := bufio.NewReader(f)
-		// Read up to maxSize bytes
-		var b strings.Builder
-		read := int64(0)
-		for {
-			chunk, cerr := br.ReadString('\n')
-			b.WriteString(chunk)
-			read += int64(len(chunk))
-			if cerr == io.EOF || read > maxSize {
-				break
-			}
-			if cerr != nil {
-				break
-			}
-		}
-		content := b.String()
+		// Read up to maxSize bytes efficiently using LimitReader
+		limitedReader := io.LimitReader(f, maxSize)
+		contentBytes, readErr := io.ReadAll(limitedReader)
+		if readErr != nil {
+			// Non-critical error: skip file and continue
+			return nil
+		}
+		content := string(contentBytes)
 		// Skip if likely binary (NUL present)
 		if strings.IndexByte(content, '\x00') >= 0 {
 			return nil
@@ -428,21 +419,14 @@ func CollectURLsProgressWithIgnoreConfig(rootPath string, globs []string, respec
 			return nil
 		}
 		defer f.Close()
-		br := bufio.NewReader(f)
-		var b strings.Builder
-		read := int64(0)
-		for {
-			chunk, cerr := br.ReadString('\n')
-			b.WriteString(chunk)
-			read += int64(len(chunk))
-			if cerr == io.EOF || read > maxSize {
-				break
-			}
-			if cerr != nil {
-				break
-			}
-		}
-		content := b.String()
+		// Read up to maxSize bytes efficiently using LimitReader
+		limitedReader := io.LimitReader(f, maxSize)
+		contentBytes, readErr := io.ReadAll(limitedReader)
+		if readErr != nil {
+			// Non-critical error: skip file and continue
+			return nil
+		}
+		content := string(contentBytes)
 		if strings.IndexByte(content, '\x00') >= 0 {
 			return nil
 		}
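The replacement reads at most maxSize bytes in a single call instead of looping line by line. One behavioral nuance: the old loop could overshoot maxSize by up to one line (the size check ran after appending), while io.LimitReader cuts exactly at the limit. A minimal standalone sketch of the truncation behavior:

```go
package main

import (
	"fmt"
	"io"
	"strings"
)

func main() {
	// io.LimitReader stops after exactly N bytes, no line buffering needed.
	const maxSize = 10
	r := strings.NewReader("0123456789abcdef")
	content, err := io.ReadAll(io.LimitReader(r, maxSize))
	if err != nil {
		panic(err)
	}
	fmt.Printf("%q (%d bytes)\n", content, len(content)) // "0123456789" (10 bytes)
}
```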
internal/web/cache.go (new file, 132 lines)
@@ -0,0 +1,132 @@
package web

import (
	"encoding/json"
	"fmt"
	"os"
	"time"
)

// CacheEntry represents a cached URL check result
type CacheEntry struct {
	URL     string    `json:"url"`
	OK      bool      `json:"ok"`
	Status  int       `json:"status"`
	ErrMsg  string    `json:"error,omitempty"`
	Checked time.Time `json:"checked"`
}

// URLCache manages URL result caching
type URLCache struct {
	entries map[string]CacheEntry
	ttl     time.Duration
	path    string
}

// NewURLCache creates a new URL cache with optional file path
func NewURLCache(cachePath string, ttlHours int) *URLCache {
	ttl := time.Duration(ttlHours) * time.Hour
	if ttl <= 0 {
		ttl = 24 * time.Hour // Default 24 hours
	}
	return &URLCache{
		entries: make(map[string]CacheEntry),
		ttl:     ttl,
		path:    cachePath,
	}
}

// Load loads cache entries from file if path is set
func (c *URLCache) Load() error {
	if c.path == "" {
		return nil // No cache file specified
	}

	data, err := os.ReadFile(c.path)
	if err != nil {
		if os.IsNotExist(err) {
			return nil // Cache file doesn't exist yet, that's OK
		}
		return fmt.Errorf("failed to read cache file: %w", err)
	}

	var entries []CacheEntry
	if err := json.Unmarshal(data, &entries); err != nil {
		// Non-critical: if cache is corrupted, start fresh
		return nil
	}

	now := time.Now()
	c.entries = make(map[string]CacheEntry, len(entries))
	for _, entry := range entries {
		// Only load entries that haven't expired
		if now.Sub(entry.Checked) < c.ttl {
			c.entries[entry.URL] = entry
		}
	}

	return nil
}

// Get retrieves a cached result for a URL
func (c *URLCache) Get(url string) (CacheEntry, bool) {
	entry, ok := c.entries[url]
	if !ok {
		return CacheEntry{}, false
	}

	// Check if entry has expired
	if time.Since(entry.Checked) >= c.ttl {
		delete(c.entries, url)
		return CacheEntry{}, false
	}

	return entry, true
}

// Set stores a result in the cache
func (c *URLCache) Set(url string, ok bool, status int, errMsg string) {
	c.entries[url] = CacheEntry{
		URL:     url,
		OK:      ok,
		Status:  status,
		ErrMsg:  errMsg,
		Checked: time.Now(),
	}
}

// Save saves cache entries to file if path is set
func (c *URLCache) Save() error {
	if c.path == "" {
		return nil // No cache file specified
	}

	// Convert map to slice for JSON serialization
	entries := make([]CacheEntry, 0, len(c.entries))
	for _, entry := range c.entries {
		entries = append(entries, entry)
	}

	data, err := json.MarshalIndent(entries, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal cache: %w", err)
	}

	// Write to temp file first, then rename (atomic write)
	tmpPath := c.path + ".tmp"
	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
		return fmt.Errorf("failed to write cache file: %w", err)
	}

	if err := os.Rename(tmpPath, c.path); err != nil {
		return fmt.Errorf("failed to rename cache file: %w", err)
	}

	return nil
}

// Clear removes all entries from the cache
func (c *URLCache) Clear() {
	c.entries = make(map[string]CacheEntry)
}
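A minimal sketch of how this cache API fits together, assuming a hypothetical import path (as an internal/ package it is only importable from within the module itself, as check.go does):

```go
package main

import (
	"fmt"

	"github.com/LukeHagar/slinky/internal/web" // assumed module path, for illustration
)

func main() {
	cache := web.NewURLCache("/tmp/slinky-cache.json", 24) // 24-hour TTL
	if err := cache.Load(); err != nil {
		fmt.Println("load failed:", err)
	}
	if entry, ok := cache.Get("https://example.com"); ok {
		fmt.Println("cache hit:", entry.OK, entry.Status)
	} else {
		// Pretend we just checked the URL over the network.
		cache.Set("https://example.com", true, 200, "")
	}
	if err := cache.Save(); err != nil { // atomic write via .tmp + rename
		fmt.Println("save failed:", err)
	}
}
```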
@@ -2,9 +2,11 @@ package web

 import (
 	"context"
+	"fmt"
 	"net"
 	"net/http"
 	"sort"
+	"sync/atomic"
 	"time"
 )
@@ -13,13 +15,18 @@
 func CheckURLs(ctx context.Context, urls []string, sources map[string][]string, out chan<- Result, stats chan<- Stats, cfg Config) {
 	defer close(out)

-	// Build HTTP client similar to crawler
+	// Build HTTP client with optimized connection pooling
+	// Increase MaxIdleConns to handle many unique domains efficiently
+	maxIdleConns := cfg.MaxConcurrency * 4
+	if maxIdleConns < 100 {
+		maxIdleConns = 100 // Minimum 100 idle connections for better performance across domains
+	}
 	transport := &http.Transport{
 		Proxy:                 http.ProxyFromEnvironment,
 		DialContext:           (&net.Dialer{Timeout: 2 * time.Second, KeepAlive: 30 * time.Second}).DialContext,
 		TLSHandshakeTimeout:   5 * time.Second,
 		ExpectContinueTimeout: 1 * time.Second,
-		MaxIdleConns:          cfg.MaxConcurrency * 2,
+		MaxIdleConns:          maxIdleConns,
 		MaxIdleConnsPerHost:   cfg.MaxConcurrency,
 		MaxConnsPerHost:       cfg.MaxConcurrency,
 		IdleConnTimeout:       30 * time.Second,
@@ -31,16 +38,11 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,
 	jobs := make(chan job, len(urls))
 	done := make(chan struct{})

-	// Seed jobs
-	unique := make(map[string]struct{}, len(urls))
+	// Seed jobs (URLs are already deduplicated in check.go, so no need to deduplicate here)
 	for _, u := range urls {
 		if u == "" {
 			continue
 		}
-		if _, ok := unique[u]; ok {
-			continue
-		}
-		unique[u] = struct{}{}
 		jobs <- job{url: u}
 	}
 	close(jobs)
@@ -49,8 +51,9 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,
 	if concurrency <= 0 {
 		concurrency = 8
 	}
-	processed := 0
-	pending := len(unique)
+	// Use atomic counters to avoid race conditions
+	var processed int64
+	var pending int64 = int64(len(urls))

 	worker := func() {
 		for j := range jobs {
@@ -59,15 +62,47 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,
 				return
 			default:
 			}
-			ok, status, resp, err := fetchWithMethod(ctx, client, http.MethodGet, j.url)
-			if resp != nil && resp.Body != nil {
-				resp.Body.Close()
-			}
-			// Treat 401/403/408/429 as valid links
-			if status == http.StatusUnauthorized || status == http.StatusForbidden || status == http.StatusRequestTimeout || status == http.StatusTooManyRequests {
-				ok = true
-				err = nil
-			}
+
+			var ok bool
+			var status int
+			var err error
+			var cacheHit bool
+
+			// Check cache first if available
+			if cfg.Cache != nil {
+				if cached, found := cfg.Cache.Get(j.url); found {
+					ok = cached.OK
+					status = cached.Status
+					err = nil
+					if cached.ErrMsg != "" {
+						err = fmt.Errorf("%s", cached.ErrMsg)
+					}
+					cacheHit = true
+				}
+			}
+
+			// If not cached, fetch from network
+			if !cacheHit {
+				var resp *http.Response
+				ok, status, resp, err = fetchWithMethod(ctx, client, http.MethodGet, j.url)
+				if resp != nil && resp.Body != nil {
+					resp.Body.Close()
+				}
+				// Status code handling is now done in fetchWithMethod:
+				// - 200-299, 401, 403 are OK (page exists)
+				// - 404, DNS errors, connection refused are bad (flagged)
+				// - 408, 429, 5xx are retried then flagged if still failing
+
+				// Store in cache
+				if cfg.Cache != nil {
+					errMsg := ""
+					if err != nil {
+						errMsg = err.Error()
+					}
+					cfg.Cache.Set(j.url, ok, status, errMsg)
+				}
+			}

 			// Check context before sending result
 			select {
 			case <-ctx.Done():
@@ -82,16 +117,17 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,

 			// Send result with context check
 			select {
-			case out <- Result{URL: j.url, OK: ok, Status: status, Err: err, ErrMsg: errString(err), Method: http.MethodGet, Sources: cloneAndSort(srcs)}:
+			case out <- Result{URL: j.url, OK: ok, Status: status, Err: err, ErrMsg: errString(err), Method: http.MethodGet, Sources: cloneAndSort(srcs), CacheHit: cacheHit}:
 			case <-ctx.Done():
 				return
 			}

-			processed++
-			pending--
+			// Atomically update counters
+			proc := atomic.AddInt64(&processed, 1)
+			pend := atomic.AddInt64(&pending, -1)
 			if stats != nil {
 				select {
-				case stats <- Stats{Pending: pending, Processed: processed}:
+				case stats <- Stats{Pending: int(pend), Processed: int(proc)}:
 				default:
 				}
 			}
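A standalone illustration of why the counters moved to sync/atomic: with multiple workers, a plain processed++ is a data race, while atomic.AddInt64 always yields the exact total. A minimal sketch:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

func main() {
	var processed int64
	var wg sync.WaitGroup
	for w := 0; w < 8; w++ { // 8 workers, like the default concurrency above
		wg.Add(1)
		go func() {
			defer wg.Done()
			for i := 0; i < 1000; i++ {
				atomic.AddInt64(&processed, 1) // plain processed++ here would race
			}
		}()
	}
	wg.Wait()
	fmt.Println(atomic.LoadInt64(&processed)) // always 8000; unsynchronized ++ may lose updates
}
```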
@@ -3,34 +3,182 @@ package web

 import (
 	"context"
+	"errors"
+	"fmt"
 	"net"
 	"net/http"
+	"net/url"
+	"strings"
 	"time"
 )

 const browserUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36"
+const maxRedirects = 10
+const maxRetries = 3

+// fetchWithMethod performs HTTP request with retry logic and redirect following.
+// Returns: (isOK, statusCode, response, error)
+// Status code handling:
+//   - OK (don't flag): 200-299, 401 (Unauthorized), 403 (Forbidden) - page exists
+//   - Bad (flag): 404 (Not Found), DNS errors, connection refused
+//   - Retry then flag: 408 (Timeout), 429 (Rate Limited), 5xx (Server Errors)
 func fetchWithMethod(ctx context.Context, client *http.Client, method string, raw string) (bool, int, *http.Response, error) {
-	req, err := http.NewRequestWithContext(ctx, method, raw, nil)
+	var lastErr error
+	var lastResp *http.Response
+
+	// Retry logic for transient errors
+	for attempt := 0; attempt <= maxRetries; attempt++ {
+		if attempt > 0 {
+			// Exponential backoff: 1s, 2s, 4s
+			backoff := time.Duration(1<<uint(attempt-1)) * time.Second
+			select {
+			case <-ctx.Done():
+				return false, 0, nil, ctx.Err()
+			case <-time.After(backoff):
+			}
+		}
+
+		req, err := http.NewRequestWithContext(ctx, method, raw, nil)
 		if err != nil {
 			return false, 0, nil, err
 		}
 		req.Header.Set("User-Agent", browserUA)
 		req.Header.Set("Accept", "*/*")

 		resp, err := client.Do(req)
 		if err != nil {
+			lastErr = err
+			// Check if error is retryable
+			if isDNSError(err) || isRefused(err) {
+				// Non-retryable: DNS errors and connection refused are permanent failures
+				return false, 404, nil, simpleError("host not found")
+			}
+			if isTimeout(err) {
+				// Retryable: timeouts may be transient
+				if attempt < maxRetries {
+					continue
+				}
+				return false, 408, nil, simpleError("request timeout")
+			}
+			// Other network errors: retry if we have attempts left
+			if attempt < maxRetries {
+				continue
+			}
+			return false, 0, nil, err
+		}
+
+		// Follow redirects and check final status
+		finalResp, finalStatus, redirectErr := followRedirects(ctx, client, resp, raw, 0)
+		if redirectErr != nil {
+			// Redirect error: retry if transient
+			if attempt < maxRetries && (isTimeout(redirectErr) || isRetryableStatus(finalStatus)) {
+				if finalResp != nil && finalResp.Body != nil {
+					finalResp.Body.Close()
+				}
+				continue
+			}
+			if finalResp != nil && finalResp.Body != nil {
+				finalResp.Body.Close()
+			}
+			return false, finalStatus, nil, redirectErr
+		}
+
+		lastResp = finalResp
+		status := finalResp.StatusCode
+
+		// Check if status is retryable
+		if isRetryableStatus(status) && attempt < maxRetries {
+			if finalResp.Body != nil {
+				finalResp.Body.Close()
+			}
+			continue
+		}
+
+		// Determine if link is OK based on status code
+		isOK := isOKStatus(status)
+		return isOK, status, finalResp, nil
+	}
+
+	// All retries exhausted
+	if lastResp != nil && lastResp.Body != nil {
+		lastResp.Body.Close()
+	}
+	return false, 0, nil, fmt.Errorf("max retries exceeded: %w", lastErr)
+}
+
+// followRedirects follows redirects up to maxRedirects, checking for loops.
+func followRedirects(ctx context.Context, client *http.Client, resp *http.Response, originalURL string, depth int) (*http.Response, int, error) {
+	if depth > maxRedirects {
+		return resp, resp.StatusCode, simpleError("too many redirects")
+	}
+
+	status := resp.StatusCode
+	if status < 300 || status >= 400 {
+		// Not a redirect
+		return resp, status, nil
+	}
+
+	// Handle redirect
+	location := resp.Header.Get("Location")
+	if location == "" {
+		return resp, status, nil
+	}
+
+	// Resolve relative URLs
+	baseURL, err := url.Parse(originalURL)
+	if err != nil {
-		return false, 0, nil, err
+		return resp, status, err
+	}
+	redirectURL, err := baseURL.Parse(location)
+	if err != nil {
+		return resp, status, err
+	}
+
+	// Check for redirect loop (simple check: same URL)
+	if redirectURL.String() == originalURL {
+		return resp, status, simpleError("redirect loop detected")
+	}
+
+	// Close previous response body
+	if resp.Body != nil {
+		resp.Body.Close()
+	}
+
+	// Follow redirect
+	req, err := http.NewRequestWithContext(ctx, "GET", redirectURL.String(), nil)
+	if err != nil {
+		return resp, status, err
+	}
+	req.Header.Set("User-Agent", browserUA)
+	req.Header.Set("Accept", "*/*")
-	resp, err := client.Do(req)
+
+	newResp, err := client.Do(req)
+	if err != nil {
-		if isDNSError(err) {
-			return false, 404, nil, simpleError("host not found")
-		}
-		if isTimeout(err) {
-			return false, 408, nil, simpleError("request timeout")
-		}
-		if isRefused(err) {
-			return false, 503, nil, simpleError("connection refused")
-		}
-		return false, 0, nil, err
+		return resp, status, err
+	}
-	return resp.StatusCode >= 200 && resp.StatusCode < 400, resp.StatusCode, resp, nil
+
+	// Recursively follow redirects
+	return followRedirects(ctx, client, newResp, redirectURL.String(), depth+1)
 }

+// isOKStatus determines if a status code indicates the link is valid.
+// 200-299: Success
+// 401: Unauthorized (page exists, just requires auth)
+// 403: Forbidden (page exists, just requires permissions)
+func isOKStatus(status int) bool {
+	if status >= 200 && status < 300 {
+		return true
+	}
+	if status == http.StatusUnauthorized || status == http.StatusForbidden {
+		return true
+	}
+	return false
+}
+
+// isRetryableStatus determines if a status code should trigger a retry.
+func isRetryableStatus(status int) bool {
+	return status == http.StatusRequestTimeout || // 408
+		status == http.StatusTooManyRequests || // 429
+		status >= 500 // 5xx server errors
+}

 func errString(e error) string {
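The backoff arithmetic in the retry loop above doubles the wait before each retry (attempt 0 runs immediately). A tiny sketch of the schedule it produces:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Same expression as fetchWithMethod: sleeps before attempts 1..3.
	for attempt := 1; attempt <= 3; attempt++ {
		backoff := time.Duration(1<<uint(attempt-1)) * time.Second
		fmt.Printf("attempt %d: backoff %v\n", attempt, backoff) // 1s, 2s, 4s
	}
}
```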
@@ -26,4 +26,5 @@ type Config struct {
 	RequestTimeout time.Duration
 	MaxRetries429  int
 	Exclude        []string
+	Cache          *URLCache // Optional URL result cache
 }