testing some optimizations

Luke Hagar
2025-11-14 20:33:15 +00:00
parent 4fb0c9b375
commit bd21420b3e
10 changed files with 511 additions and 145 deletions

View File

@@ -7,7 +7,9 @@ COPY . .
RUN CGO_ENABLED=0 go build -o /usr/local/bin/slinky ./
FROM alpine:3.20
RUN apk add --no-cache curl jq ca-certificates
# jq is used in entrypoint.sh for parsing GitHub event JSON
# ca-certificates is needed for HTTPS requests
RUN apk add --no-cache jq ca-certificates
COPY --from=build /usr/local/bin/slinky /usr/local/bin/slinky
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

View File

@@ -16,15 +16,18 @@ jobs:
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
pull-requests: write # Only needed if comment-pr is enabled
steps:
- uses: actions/checkout@v4
- name: Run Slinky
uses: LukeHagar/slinky@v1
with:
targets: "docs/,README.md,**/*.md"
# comment-pr: true # Optional: post results as PR comment (requires GITHUB_TOKEN)
```
**Note:** The `GITHUB_TOKEN` is automatically provided by GitHub Actions and is only required for PR comment functionality. Core link checking works without it. If you disable PR comments (`comment-pr: false`), you can remove the `pull-requests: write` permission.
### Inputs
- **targets**: Comma-separated paths and patterns to scan. Can be directories, files, or glob patterns (e.g. `docs/,api-specs/**/*.yaml,README.md`). Default: `**/*`

View File

@@ -53,6 +53,7 @@ runs:
INPUT_FAIL_ON_FAILURES: ${{ inputs.fail_on_failures }}
INPUT_COMMENT_PR: ${{ inputs.comment_pr }}
INPUT_STEP_SUMMARY: ${{ inputs.step_summary }}
GITHUB_TOKEN: ${{ github.token }}
outputs:
json_path:

View File

@@ -158,9 +158,51 @@ func init() {
fmt.Printf("::debug:: Root: %s\n", displayRoot)
}
// Validate and clamp numeric inputs
if maxConcurrency < 1 {
maxConcurrency = 1
} else if maxConcurrency > 100 {
maxConcurrency = 100
}
if timeoutSeconds < 1 {
timeoutSeconds = 1
} else if timeoutSeconds > 300 {
timeoutSeconds = 300 // Max 5 minutes
}
// Build config
timeout := time.Duration(timeoutSeconds) * time.Second
cfg := web.Config{MaxConcurrency: maxConcurrency, RequestTimeout: timeout}
// Set up URL cache if cache path is provided via environment variable
var urlCache *web.URLCache
if cachePath := os.Getenv("SLINKY_CACHE_PATH"); cachePath != "" {
cacheTTL := 24 // Default 24 hours
if ttlStr := os.Getenv("SLINKY_CACHE_TTL_HOURS"); ttlStr != "" {
if ttl, err := time.ParseDuration(ttlStr + "h"); err == nil && ttl > 0 {
cacheTTL = int(ttl.Hours())
}
}
urlCache = web.NewURLCache(cachePath, cacheTTL)
if err := urlCache.Load(); err != nil {
if shouldDebug() {
fmt.Printf("::debug:: Failed to load cache: %v\n", err)
}
}
// Save cache when done
defer func() {
if err := urlCache.Save(); err != nil {
if shouldDebug() {
fmt.Printf("::debug:: Failed to save cache: %v\n", err)
}
}
}()
}
cfg := web.Config{
MaxConcurrency: maxConcurrency,
RequestTimeout: timeout,
Cache: urlCache,
}
// Prepare URL list
var urls []string
@@ -275,9 +317,16 @@ func init() {
}
// If running on a PR, post or update the comment(s), chunking as needed
if ghOK && strings.TrimSpace(finalMDPath) != "" {
// Check if PR commenting is enabled (default to true if not set)
commentPR := true
if val := os.Getenv("INPUT_COMMENT_PR"); val != "" {
commentPR = strings.EqualFold(val, "true")
}
if ghOK && commentPR && strings.TrimSpace(finalMDPath) != "" {
b, rerr := os.ReadFile(finalMDPath)
if rerr == nil {
if rerr != nil {
fmt.Printf("::warning:: Failed to read markdown report for PR comment: %v\n", rerr)
} else {
full := string(b)
if shouldDebug() {
fmt.Printf("::debug:: Report size (chars): %d\n", len(full))
@@ -286,7 +335,10 @@ func init() {
if shouldDebug() {
fmt.Printf("::debug:: Posting %d chunk(s)\n", len(chunks))
}
_ = upsertPRComments(ghRepo, ghPR, ghToken, chunks)
if err := upsertPRComments(ghRepo, ghPR, ghToken, chunks); err != nil {
// Non-critical error: log warning but don't fail the run
fmt.Printf("::warning:: Failed to post PR comment: %v\n", err)
}
}
}
@@ -440,32 +492,52 @@ func chunkMarkdownByURL(body string) []string {
}
// upsertPRComments deletes any existing slinky comments and posts the new chunked comments in order.
// Returns error if critical failures occur, but individual comment failures are logged and ignored.
func upsertPRComments(repo string, prNumber int, token string, chunks []string) error {
apiBase := "https://api.github.com"
listURL := fmt.Sprintf("%s/repos/%s/issues/%d/comments?per_page=100", apiBase, repo, prNumber)
req, _ := http.NewRequest(http.MethodGet, listURL, nil)
req, err := http.NewRequest(http.MethodGet, listURL, nil)
if err != nil {
return fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("Authorization", "Bearer "+token)
req.Header.Set("Accept", "application/vnd.github+json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return err
return fmt.Errorf("failed to list comments: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
return fmt.Errorf("failed to list comments: HTTP %d", resp.StatusCode)
}
var comments []struct {
ID int `json:"id"`
Body string `json:"body"`
}
b, _ := io.ReadAll(resp.Body)
_ = json.Unmarshal(b, &comments)
b, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed to read comments response: %w", err)
}
if err := json.Unmarshal(b, &comments); err != nil {
// Non-critical: continue even if we can't parse existing comments
if shouldDebug() {
fmt.Printf("::debug:: Failed to parse comments: %v\n", err)
}
}
// Delete all existing slinky-report comments to avoid stale entries
for _, c := range comments {
if strings.Contains(c.Body, "<!-- slinky-report -->") {
delURL := fmt.Sprintf("%s/repos/%s/issues/comments/%d", apiBase, repo, c.ID)
dReq, _ := http.NewRequest(http.MethodDelete, delURL, nil)
dReq, err := http.NewRequest(http.MethodDelete, delURL, nil)
if err != nil {
continue // Skip if we can't create request
}
dReq.Header.Set("Authorization", "Bearer "+token)
dReq.Header.Set("Accept", "application/vnd.github+json")
_, _ = http.DefaultClient.Do(dReq)
_, _ = http.DefaultClient.Do(dReq) // Non-critical: ignore delete errors
}
}
@@ -473,14 +545,39 @@ func upsertPRComments(repo string, prNumber int, token string, chunks []string)
for idx, chunk := range chunks {
body := fmt.Sprintf("%s\n%s", "<!-- slinky-report -->", chunk)
postURL := fmt.Sprintf("%s/repos/%s/issues/%d/comments", apiBase, repo, prNumber)
payload, _ := json.Marshal(map[string]string{"body": body})
req, _ = http.NewRequest(http.MethodPost, postURL, bytes.NewReader(payload))
payload, err := json.Marshal(map[string]string{"body": body})
if err != nil {
if shouldDebug() {
fmt.Printf("::debug:: Failed to marshal comment payload: %v\n", err)
}
continue
}
req, err := http.NewRequest(http.MethodPost, postURL, bytes.NewReader(payload))
if err != nil {
if shouldDebug() {
fmt.Printf("::debug:: Failed to create POST request: %v\n", err)
}
continue
}
req.Header.Set("Authorization", "Bearer "+token)
req.Header.Set("Accept", "application/vnd.github+json")
req.Header.Set("Content-Type", "application/json")
res, _ := http.DefaultClient.Do(req)
res, err := http.DefaultClient.Do(req)
if err != nil {
if shouldDebug() {
fmt.Printf("::debug:: Failed to post chunk %d/%d: %v\n", idx+1, len(chunks), err)
}
continue
}
res.Body.Close()
if res.StatusCode >= 400 {
if shouldDebug() {
fmt.Printf("::debug:: Failed to post chunk %d/%d: HTTP %d\n", idx+1, len(chunks), res.StatusCode)
}
continue
}
if shouldDebug() {
fmt.Printf("::debug:: Posted chunk %d/%d: %v\n", idx+1, len(chunks), res)
fmt.Printf("::debug:: Posted chunk %d/%d successfully\n", idx+1, len(chunks))
}
}
return nil
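
The chunking helper `chunkMarkdownByURL` is referenced in the hunks above but its body is not part of this diff. As a rough, hypothetical illustration of the idea (splitting the markdown report at per-URL sections so each PR comment body stays under GitHub's 65,536-character limit), here is a hedged sketch; the heading delimiter, the `maxCommentChars` constant, and the function name are assumptions, not the actual implementation:

```go
package main

import (
	"fmt"
	"strings"
)

// assumed safety margin below GitHub's 65,536-character comment body limit
const maxCommentChars = 60000

// Hypothetical sketch only: the real chunkMarkdownByURL is not shown in this diff.
// The idea, per the calls above, is to split the report at per-URL sections so
// that each posted PR comment stays under the size limit.
func chunkMarkdownByURLSketch(body string) []string {
	sections := strings.Split(body, "\n## ") // assumes one "## <url>" heading per URL
	var chunks []string
	var cur strings.Builder
	for i, s := range sections {
		if i > 0 {
			s = "\n## " + s // restore the delimiter removed by Split
		}
		if cur.Len() > 0 && cur.Len()+len(s) > maxCommentChars {
			chunks = append(chunks, cur.String()) // close the current chunk before it overflows
			cur.Reset()
		}
		cur.WriteString(s) // an oversized single section still becomes one oversized chunk
	}
	if cur.Len() > 0 {
		chunks = append(chunks, cur.String())
	}
	return chunks
}

func main() {
	report := "# Slinky report\n\n## https://example.com\nOK\n\n## https://example.org\n404"
	fmt.Println(len(chunkMarkdownByURLSketch(report)), "chunk(s)")
}
```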

View File

@@ -1,93 +1,55 @@
#!/bin/sh
set -eu
# Set up environment variables for GitHub blob base URL
# Set up GitHub blob base URL for PR links
if [ -n "${INPUT_REPO_BLOB_BASE:-}" ]; then
export SLINKY_REPO_BLOB_BASE_URL="${INPUT_REPO_BLOB_BASE}"
elif [ -n "${GITHUB_REPOSITORY:-}" ]; then
COMMIT_SHA="${GITHUB_SHA:-}"
if [ -n "${GITHUB_EVENT_PATH:-}" ] && command -v jq >/dev/null 2>&1; then
PR_HEAD_SHA="$(jq -r '.pull_request.head.sha // empty' "$GITHUB_EVENT_PATH" || true)"
if [ -n "$PR_HEAD_SHA" ]; then
COMMIT_SHA="$PR_HEAD_SHA"
fi
fi
if [ -n "$COMMIT_SHA" ]; then
export SLINKY_REPO_BLOB_BASE_URL="https://github.com/${GITHUB_REPOSITORY}/blob/${COMMIT_SHA}"
PR_HEAD_SHA="$(jq -r '.pull_request.head.sha // empty' "$GITHUB_EVENT_PATH" 2>/dev/null || true)"
[ -n "$PR_HEAD_SHA" ] && COMMIT_SHA="$PR_HEAD_SHA"
fi
[ -n "$COMMIT_SHA" ] && export SLINKY_REPO_BLOB_BASE_URL="https://github.com/${GITHUB_REPOSITORY}/blob/${COMMIT_SHA}"
fi
# Build command arguments
set -- check
# Build slinky command arguments
ARGS="check"
# Add optional flags
if [ -n "${INPUT_CONCURRENCY:-}" ]; then
set -- "$@" --concurrency "${INPUT_CONCURRENCY}"
fi
# Optional flags
[ -n "${INPUT_CONCURRENCY:-}" ] && ARGS="$ARGS --concurrency ${INPUT_CONCURRENCY}"
[ -n "${INPUT_TIMEOUT:-}" ] && ARGS="$ARGS --timeout ${INPUT_TIMEOUT}"
[ -n "${INPUT_JSON_OUT:-}" ] && ARGS="$ARGS --json-out ${INPUT_JSON_OUT}"
[ -n "${INPUT_MD_OUT:-}" ] && ARGS="$ARGS --md-out ${INPUT_MD_OUT}"
[ -n "${INPUT_REPO_BLOB_BASE:-}" ] && ARGS="$ARGS --repo-blob-base ${INPUT_REPO_BLOB_BASE}"
if [ -n "${INPUT_TIMEOUT:-}" ]; then
set -- "$@" --timeout "${INPUT_TIMEOUT}"
fi
# Boolean flags with defaults
[ "${INPUT_FAIL_ON_FAILURES:-true}" = "true" ] && ARGS="$ARGS --fail-on-failures=true" || ARGS="$ARGS --fail-on-failures=false"
[ "${INPUT_RESPECT_GITIGNORE:-true}" = "true" ] && ARGS="$ARGS --respect-gitignore=true" || ARGS="$ARGS --respect-gitignore=false"
if [ -n "${INPUT_JSON_OUT:-}" ]; then
set -- "$@" --json-out "${INPUT_JSON_OUT}"
fi
if [ -n "${INPUT_MD_OUT:-}" ]; then
set -- "$@" --md-out "${INPUT_MD_OUT}"
fi
if [ -n "${INPUT_REPO_BLOB_BASE:-}" ]; then
set -- "$@" --repo-blob-base "${INPUT_REPO_BLOB_BASE}"
fi
if [ "${INPUT_FAIL_ON_FAILURES:-true}" = "true" ]; then
set -- "$@" --fail-on-failures=true
else
set -- "$@" --fail-on-failures=false
fi
if [ "${INPUT_RESPECT_GITIGNORE:-true}" = "true" ]; then
set -- "$@" --respect-gitignore=true
else
set -- "$@" --respect-gitignore=false
fi
# Add targets
# Add targets (comma-separated glob patterns)
if [ -n "${INPUT_TARGETS:-}" ]; then
# Split comma-separated targets and add each one
IFS=','
for target in $INPUT_TARGETS; do
target=$(echo "$target" | xargs) # trim whitespace
if [ -n "$target" ]; then
set -- "$@" "$target"
fi
target=$(echo "$target" | xargs)
[ -n "$target" ] && ARGS="$ARGS $target"
done
unset IFS
else
# Default: scan everything
set -- "$@" "**/*"
ARGS="$ARGS **/*"
fi
# Debug output
if [ "${ACTIONS_STEP_DEBUG:-}" = "true" ]; then
printf "::debug:: CLI Args: slinky %s\n" "$*"
fi
[ "${ACTIONS_STEP_DEBUG:-}" = "true" ] && printf "::debug:: Running: slinky %s\n" "$ARGS"
# Execute the command
set +e
slinky "$@"
SLINKY_EXIT_CODE=$?
set -e
# Execute slinky (crawl repo with glob input, filter via .slinkignore)
slinky $ARGS
EXIT_CODE=$?
# Expose outputs
if [ -n "${GITHUB_OUTPUT:-}" ]; then
if [ -n "${INPUT_JSON_OUT:-}" ]; then
echo "json_path=${INPUT_JSON_OUT}" >> "$GITHUB_OUTPUT"
fi
if [ -n "${INPUT_MD_OUT:-}" ]; then
echo "md_path=${INPUT_MD_OUT}" >> "$GITHUB_OUTPUT"
fi
[ -n "${INPUT_JSON_OUT:-}" ] && echo "json_path=${INPUT_JSON_OUT}" >> "$GITHUB_OUTPUT"
[ -n "${INPUT_MD_OUT:-}" ] && echo "md_path=${INPUT_MD_OUT}" >> "$GITHUB_OUTPUT"
fi
# Append report to job summary if requested
@@ -95,4 +57,4 @@ if [ "${INPUT_STEP_SUMMARY:-true}" = "true" ] && [ -n "${GITHUB_STEP_SUMMARY:-}"
cat "${INPUT_MD_OUT}" >> "$GITHUB_STEP_SUMMARY"
fi
exit ${SLINKY_EXIT_CODE:-0}
exit ${EXIT_CODE:-0}

View File

@@ -1,7 +1,6 @@
package fsurls
import (
"bufio"
"encoding/json"
"fmt"
"io"
@@ -247,22 +246,14 @@ func CollectURLsWithIgnoreConfig(rootPath string, globs []string, respectGitigno
return nil
}
defer f.Close()
br := bufio.NewReader(f)
// Read up to maxSize bytes
var b strings.Builder
read := int64(0)
for {
chunk, cerr := br.ReadString('\n')
b.WriteString(chunk)
read += int64(len(chunk))
if cerr == io.EOF || read > maxSize {
break
}
if cerr != nil {
break
}
// Read up to maxSize bytes efficiently using LimitReader
limitedReader := io.LimitReader(f, maxSize)
contentBytes, readErr := io.ReadAll(limitedReader)
if readErr != nil {
// Non-critical error: skip file and continue
return nil
}
content := b.String()
content := string(contentBytes)
// Skip if likely binary (NUL present)
if strings.IndexByte(content, '\x00') >= 0 {
return nil
@@ -428,21 +419,14 @@ func CollectURLsProgressWithIgnoreConfig(rootPath string, globs []string, respec
return nil
}
defer f.Close()
br := bufio.NewReader(f)
var b strings.Builder
read := int64(0)
for {
chunk, cerr := br.ReadString('\n')
b.WriteString(chunk)
read += int64(len(chunk))
if cerr == io.EOF || read > maxSize {
break
}
if cerr != nil {
break
}
// Read up to maxSize bytes efficiently using LimitReader
limitedReader := io.LimitReader(f, maxSize)
contentBytes, readErr := io.ReadAll(limitedReader)
if readErr != nil {
// Non-critical error: skip file and continue
return nil
}
content := b.String()
content := string(contentBytes)
if strings.IndexByte(content, '\x00') >= 0 {
return nil
}

internal/web/cache.go Normal file
View File

@@ -0,0 +1,132 @@
package web
import (
"encoding/json"
"fmt"
"os"
"time"
)
// CacheEntry represents a cached URL check result
type CacheEntry struct {
URL string `json:"url"`
OK bool `json:"ok"`
Status int `json:"status"`
ErrMsg string `json:"error,omitempty"`
Checked time.Time `json:"checked"`
}
// URLCache manages URL result caching
type URLCache struct {
entries map[string]CacheEntry
ttl time.Duration
path string
}
// NewURLCache creates a new URL cache with optional file path
func NewURLCache(cachePath string, ttlHours int) *URLCache {
ttl := time.Duration(ttlHours) * time.Hour
if ttl <= 0 {
ttl = 24 * time.Hour // Default 24 hours
}
return &URLCache{
entries: make(map[string]CacheEntry),
ttl: ttl,
path: cachePath,
}
}
// Load loads cache entries from file if path is set
func (c *URLCache) Load() error {
if c.path == "" {
return nil // No cache file specified
}
data, err := os.ReadFile(c.path)
if err != nil {
if os.IsNotExist(err) {
return nil // Cache file doesn't exist yet, that's OK
}
return fmt.Errorf("failed to read cache file: %w", err)
}
var entries []CacheEntry
if err := json.Unmarshal(data, &entries); err != nil {
// Non-critical: if cache is corrupted, start fresh
return nil
}
now := time.Now()
c.entries = make(map[string]CacheEntry, len(entries))
for _, entry := range entries {
// Only load entries that haven't expired
if now.Sub(entry.Checked) < c.ttl {
c.entries[entry.URL] = entry
}
}
return nil
}
// Get retrieves a cached result for a URL
func (c *URLCache) Get(url string) (CacheEntry, bool) {
entry, ok := c.entries[url]
if !ok {
return CacheEntry{}, false
}
// Check if entry has expired
if time.Since(entry.Checked) >= c.ttl {
delete(c.entries, url)
return CacheEntry{}, false
}
return entry, true
}
// Set stores a result in the cache
func (c *URLCache) Set(url string, ok bool, status int, errMsg string) {
c.entries[url] = CacheEntry{
URL: url,
OK: ok,
Status: status,
ErrMsg: errMsg,
Checked: time.Now(),
}
}
// Save saves cache entries to file if path is set
func (c *URLCache) Save() error {
if c.path == "" {
return nil // No cache file specified
}
// Convert map to slice for JSON serialization
entries := make([]CacheEntry, 0, len(c.entries))
for _, entry := range c.entries {
entries = append(entries, entry)
}
data, err := json.MarshalIndent(entries, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal cache: %w", err)
}
// Write to temp file first, then rename (atomic write)
tmpPath := c.path + ".tmp"
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
return fmt.Errorf("failed to write cache file: %w", err)
}
if err := os.Rename(tmpPath, c.path); err != nil {
return fmt.Errorf("failed to rename cache file: %w", err)
}
return nil
}
// Clear removes all entries from the cache
func (c *URLCache) Clear() {
c.entries = make(map[string]CacheEntry)
}

View File

@@ -2,9 +2,11 @@ package web
import (
"context"
"fmt"
"net"
"net/http"
"sort"
"sync/atomic"
"time"
)
@@ -13,13 +15,18 @@ import (
func CheckURLs(ctx context.Context, urls []string, sources map[string][]string, out chan<- Result, stats chan<- Stats, cfg Config) {
defer close(out)
// Build HTTP client similar to crawler
// Build HTTP client with optimized connection pooling
// Increase MaxIdleConns to handle many unique domains efficiently
maxIdleConns := cfg.MaxConcurrency * 4
if maxIdleConns < 100 {
maxIdleConns = 100 // Minimum 100 idle connections for better performance across domains
}
transport := &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{Timeout: 2 * time.Second, KeepAlive: 30 * time.Second}).DialContext,
TLSHandshakeTimeout: 5 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
MaxIdleConns: cfg.MaxConcurrency * 2,
MaxIdleConns: maxIdleConns,
MaxIdleConnsPerHost: cfg.MaxConcurrency,
MaxConnsPerHost: cfg.MaxConcurrency,
IdleConnTimeout: 30 * time.Second,
@@ -31,16 +38,11 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,
jobs := make(chan job, len(urls))
done := make(chan struct{})
// Seed jobs
unique := make(map[string]struct{}, len(urls))
// Seed jobs (URLs are already deduplicated in check.go, so no need to deduplicate here)
for _, u := range urls {
if u == "" {
continue
}
if _, ok := unique[u]; ok {
continue
}
unique[u] = struct{}{}
jobs <- job{url: u}
}
close(jobs)
@@ -49,8 +51,9 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,
if concurrency <= 0 {
concurrency = 8
}
processed := 0
pending := len(unique)
// Use atomic counters to avoid race conditions
var processed int64
var pending int64 = int64(len(urls))
worker := func() {
for j := range jobs {
@@ -59,15 +62,47 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,
return
default:
}
ok, status, resp, err := fetchWithMethod(ctx, client, http.MethodGet, j.url)
if resp != nil && resp.Body != nil {
resp.Body.Close()
var ok bool
var status int
var err error
var cacheHit bool
// Check cache first if available
if cfg.Cache != nil {
if cached, found := cfg.Cache.Get(j.url); found {
ok = cached.OK
status = cached.Status
err = nil
if cached.ErrMsg != "" {
err = fmt.Errorf("%s", cached.ErrMsg)
}
cacheHit = true
}
}
// Treat 401/403/408/429 as valid links
if status == http.StatusUnauthorized || status == http.StatusForbidden || status == http.StatusRequestTimeout || status == http.StatusTooManyRequests {
ok = true
err = nil
// If not cached, fetch from network
if !cacheHit {
var resp *http.Response
ok, status, resp, err = fetchWithMethod(ctx, client, http.MethodGet, j.url)
if resp != nil && resp.Body != nil {
resp.Body.Close()
}
// Status code handling is now done in fetchWithMethod:
// - 200-299, 401, 403 are OK (page exists)
// - 404, DNS errors, connection refused are bad (flagged)
// - 408, 429, 5xx are retried then flagged if still failing
// Store in cache
if cfg.Cache != nil {
errMsg := ""
if err != nil {
errMsg = err.Error()
}
cfg.Cache.Set(j.url, ok, status, errMsg)
}
}
// Check context before sending result
select {
case <-ctx.Done():
@@ -82,16 +117,17 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,
// Send result with context check
select {
case out <- Result{URL: j.url, OK: ok, Status: status, Err: err, ErrMsg: errString(err), Method: http.MethodGet, Sources: cloneAndSort(srcs)}:
case out <- Result{URL: j.url, OK: ok, Status: status, Err: err, ErrMsg: errString(err), Method: http.MethodGet, Sources: cloneAndSort(srcs), CacheHit: cacheHit}:
case <-ctx.Done():
return
}
processed++
pending--
// Atomically update counters
proc := atomic.AddInt64(&processed, 1)
pend := atomic.AddInt64(&pending, -1)
if stats != nil {
select {
case stats <- Stats{Pending: pending, Processed: processed}:
case stats <- Stats{Pending: int(pend), Processed: int(proc)}:
default:
}
}

View File

@@ -3,34 +3,182 @@ package web
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"net/url"
"strings"
"time"
)
const browserUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36"
const maxRedirects = 10
const maxRetries = 3
// fetchWithMethod performs HTTP request with retry logic and redirect following.
// Returns: (isOK, statusCode, response, error)
// Status code handling:
// - OK (don't flag): 200-299, 401 (Unauthorized), 403 (Forbidden) - page exists
// - Bad (flag): 404 (Not Found), DNS errors, connection refused
// - Retry then flag: 408 (Timeout), 429 (Rate Limited), 5xx (Server Errors)
func fetchWithMethod(ctx context.Context, client *http.Client, method string, raw string) (bool, int, *http.Response, error) {
req, err := http.NewRequestWithContext(ctx, method, raw, nil)
var lastErr error
var lastResp *http.Response
// Retry logic for transient errors
for attempt := 0; attempt <= maxRetries; attempt++ {
if attempt > 0 {
// Exponential backoff: 1s, 2s, 4s
backoff := time.Duration(1<<uint(attempt-1)) * time.Second
select {
case <-ctx.Done():
return false, 0, nil, ctx.Err()
case <-time.After(backoff):
}
}
req, err := http.NewRequestWithContext(ctx, method, raw, nil)
if err != nil {
return false, 0, nil, err
}
req.Header.Set("User-Agent", browserUA)
req.Header.Set("Accept", "*/*")
resp, err := client.Do(req)
if err != nil {
lastErr = err
// Check if error is retryable
if isDNSError(err) || isRefused(err) {
// Non-retryable: DNS errors and connection refused are permanent failures
return false, 404, nil, simpleError("host not found")
}
if isTimeout(err) {
// Retryable: timeouts may be transient
if attempt < maxRetries {
continue
}
return false, 408, nil, simpleError("request timeout")
}
// Other network errors: retry if we have attempts left
if attempt < maxRetries {
continue
}
return false, 0, nil, err
}
// Follow redirects and check final status
finalResp, finalStatus, redirectErr := followRedirects(ctx, client, resp, raw, 0)
if redirectErr != nil {
// Redirect error: retry if transient
if attempt < maxRetries && (isTimeout(redirectErr) || isRetryableStatus(finalStatus)) {
if finalResp != nil && finalResp.Body != nil {
finalResp.Body.Close()
}
continue
}
if finalResp != nil && finalResp.Body != nil {
finalResp.Body.Close()
}
return false, finalStatus, nil, redirectErr
}
lastResp = finalResp
status := finalResp.StatusCode
// Check if status is retryable
if isRetryableStatus(status) && attempt < maxRetries {
if finalResp.Body != nil {
finalResp.Body.Close()
}
continue
}
// Determine if link is OK based on status code
isOK := isOKStatus(status)
return isOK, status, finalResp, nil
}
// All retries exhausted
if lastResp != nil && lastResp.Body != nil {
lastResp.Body.Close()
}
return false, 0, nil, fmt.Errorf("max retries exceeded: %w", lastErr)
}
// followRedirects follows redirects up to maxRedirects, checking for loops.
func followRedirects(ctx context.Context, client *http.Client, resp *http.Response, originalURL string, depth int) (*http.Response, int, error) {
if depth > maxRedirects {
return resp, resp.StatusCode, simpleError("too many redirects")
}
status := resp.StatusCode
if status < 300 || status >= 400 {
// Not a redirect
return resp, status, nil
}
// Handle redirect
location := resp.Header.Get("Location")
if location == "" {
return resp, status, nil
}
// Resolve relative URLs
baseURL, err := url.Parse(originalURL)
if err != nil {
return false, 0, nil, err
return resp, status, err
}
redirectURL, err := baseURL.Parse(location)
if err != nil {
return resp, status, err
}
// Check for redirect loop (simple check: same URL)
if redirectURL.String() == originalURL {
return resp, status, simpleError("redirect loop detected")
}
// Close previous response body
if resp.Body != nil {
resp.Body.Close()
}
// Follow redirect
req, err := http.NewRequestWithContext(ctx, "GET", redirectURL.String(), nil)
if err != nil {
return resp, status, err
}
req.Header.Set("User-Agent", browserUA)
req.Header.Set("Accept", "*/*")
resp, err := client.Do(req)
newResp, err := client.Do(req)
if err != nil {
if isDNSError(err) {
return false, 404, nil, simpleError("host not found")
}
if isTimeout(err) {
return false, 408, nil, simpleError("request timeout")
}
if isRefused(err) {
return false, 503, nil, simpleError("connection refused")
}
return false, 0, nil, err
return resp, status, err
}
return resp.StatusCode >= 200 && resp.StatusCode < 400, resp.StatusCode, resp, nil
// Recursively follow redirects
return followRedirects(ctx, client, newResp, redirectURL.String(), depth+1)
}
// isOKStatus determines if a status code indicates the link is valid.
// 200-299: Success
// 401: Unauthorized (page exists, just requires auth)
// 403: Forbidden (page exists, just requires permissions)
func isOKStatus(status int) bool {
if status >= 200 && status < 300 {
return true
}
if status == http.StatusUnauthorized || status == http.StatusForbidden {
return true
}
return false
}
// isRetryableStatus determines if a status code should trigger a retry.
func isRetryableStatus(status int) bool {
return status == http.StatusRequestTimeout || // 408
status == http.StatusTooManyRequests || // 429
status >= 500 // 5xx server errors
}
func errString(e error) string {

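To make the policy above concrete, a small illustrative snippet (same package, `fmt` already imported) that runs a few common status codes through the two helpers; 3xx codes are absent because followRedirects resolves redirects before classification:

```go
// Illustrative only: exercises isOKStatus / isRetryableStatus as defined above.
func classifyExamples() {
	for _, code := range []int{200, 204, 401, 403, 404, 408, 429, 500, 503} {
		switch {
		case isOKStatus(code):
			fmt.Printf("%d -> OK (link treated as valid)\n", code)
		case isRetryableStatus(code):
			fmt.Printf("%d -> retried, flagged if it keeps failing\n", code)
		default:
			fmt.Printf("%d -> flagged as broken\n", code)
		}
	}
}
```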
View File

@@ -26,4 +26,5 @@ type Config struct {
RequestTimeout time.Duration
MaxRetries429 int
Exclude []string
Cache *URLCache // Optional URL result cache
}
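
Putting the pieces together, a minimal sketch of how a caller might wire the optional cache into a check run using the CheckURLs signature shown above; the import path, file path, TTL, and channel sizes are illustrative assumptions, not the check command's actual wiring:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/LukeHagar/slinky/internal/web" // module path assumed
)

func main() {
	cache := web.NewURLCache(".slinky-cache.json", 24) // path and TTL are illustrative
	_ = cache.Load()                                   // a missing or corrupted cache file just starts fresh
	defer cache.Save()

	cfg := web.Config{
		MaxConcurrency: 16,
		RequestTimeout: 10 * time.Second,
		Cache:          cache, // leave nil to disable caching entirely
	}

	urls := []string{"https://example.com"}
	out := make(chan web.Result, len(urls))
	stats := make(chan web.Stats, len(urls))

	// CheckURLs closes out when it finishes, so the range loop below terminates.
	go web.CheckURLs(context.Background(), urls, map[string][]string{}, out, stats, cfg)
	for res := range out {
		fmt.Println(res.URL, res.OK, res.Status, res.CacheHit)
	}
}
```

Leaving Cache nil keeps the previous behavior, since the worker only consults the cache when cfg.Cache is non-nil.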