mirror of https://github.com/LukeHagar/slinky.git (synced 2025-12-06 04:21:20 +00:00)
testing some optimizations
@@ -7,7 +7,9 @@ COPY . .

 RUN CGO_ENABLED=0 go build -o /usr/local/bin/slinky ./

 FROM alpine:3.20

-RUN apk add --no-cache curl jq ca-certificates
+# jq is used in entrypoint.sh for parsing GitHub event JSON
+# ca-certificates is needed for HTTPS requests
+RUN apk add --no-cache jq ca-certificates

 COPY --from=build /usr/local/bin/slinky /usr/local/bin/slinky
 COPY entrypoint.sh /entrypoint.sh
 RUN chmod +x /entrypoint.sh
@@ -16,15 +16,18 @@ jobs:
     runs-on: ubuntu-latest
     permissions:
       contents: read
-      pull-requests: write
+      pull-requests: write # Only needed if comment-pr is enabled
     steps:
       - uses: actions/checkout@v4
       - name: Run Slinky
         uses: LukeHagar/slinky@v1
         with:
           targets: "docs/,README.md,**/*.md"
+          # comment-pr: true # Optional: post results as PR comment (requires GITHUB_TOKEN)
 ```

+**Note:** The `GITHUB_TOKEN` is automatically provided by GitHub Actions and is only required for PR comment functionality. Core link checking works without it. If you disable PR comments (`comment-pr: false`), you can remove the `pull-requests: write` permission.
+
 ### Inputs

 - **targets**: Comma-separated paths and patterns to scan. Can be directories, files, or glob patterns (e.g. `docs/,api-specs/**/*.yaml,README.md`). Default: `**/*`
@@ -53,6 +53,7 @@ runs:
     INPUT_FAIL_ON_FAILURES: ${{ inputs.fail_on_failures }}
+    INPUT_COMMENT_PR: ${{ inputs.comment_pr }}
     INPUT_STEP_SUMMARY: ${{ inputs.step_summary }}
     GITHUB_TOKEN: ${{ github.token }}

 outputs:
   json_path:
cmd/check.go (125 changed lines)
@@ -158,9 +158,51 @@ func init() {
 		fmt.Printf("::debug:: Root: %s\n", displayRoot)
 	}

+	// Validate and clamp numeric inputs
+	if maxConcurrency < 1 {
+		maxConcurrency = 1
+	} else if maxConcurrency > 100 {
+		maxConcurrency = 100
+	}
+	if timeoutSeconds < 1 {
+		timeoutSeconds = 1
+	} else if timeoutSeconds > 300 {
+		timeoutSeconds = 300 // Max 5 minutes
+	}
+
 	// Build config
 	timeout := time.Duration(timeoutSeconds) * time.Second
-	cfg := web.Config{MaxConcurrency: maxConcurrency, RequestTimeout: timeout}
+
+	// Set up URL cache if cache path is provided via environment variable
+	var urlCache *web.URLCache
+	if cachePath := os.Getenv("SLINKY_CACHE_PATH"); cachePath != "" {
+		cacheTTL := 24 // Default 24 hours
+		if ttlStr := os.Getenv("SLINKY_CACHE_TTL_HOURS"); ttlStr != "" {
+			if ttl, err := time.ParseDuration(ttlStr + "h"); err == nil && ttl > 0 {
+				cacheTTL = int(ttl.Hours())
+			}
+		}
+		urlCache = web.NewURLCache(cachePath, cacheTTL)
+		if err := urlCache.Load(); err != nil {
+			if shouldDebug() {
+				fmt.Printf("::debug:: Failed to load cache: %v\n", err)
+			}
+		}
+		// Save cache when done
+		defer func() {
+			if err := urlCache.Save(); err != nil {
+				if shouldDebug() {
+					fmt.Printf("::debug:: Failed to save cache: %v\n", err)
+				}
+			}
+		}()
+	}
+
+	cfg := web.Config{
+		MaxConcurrency: maxConcurrency,
+		RequestTimeout: timeout,
+		Cache:          urlCache,
+	}

 	// Prepare URL list
 	var urls []string
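The TTL override above accepts fractional hours because it round-trips through `time.ParseDuration` with an `"h"` suffix appended. A minimal standalone sketch of that parsing rule (the sample values are hypothetical, not from the repo):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Mirrors the SLINKY_CACHE_TTL_HOURS handling: invalid or non-positive
	// values fall back to the 24-hour default; fractions truncate.
	for _, ttlStr := range []string{"24", "1.5", "0", "bogus"} {
		if ttl, err := time.ParseDuration(ttlStr + "h"); err == nil && ttl > 0 {
			fmt.Printf("%q -> %d hour(s)\n", ttlStr, int(ttl.Hours())) // "1.5" truncates to 1
		} else {
			fmt.Printf("%q -> default 24 hours\n", ttlStr)
		}
	}
}
```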
@@ -275,9 +317,16 @@ func init() {
 	}

 	// If running on a PR, post or update the comment(s), chunking as needed
-	if ghOK && strings.TrimSpace(finalMDPath) != "" {
+	// Check if PR commenting is enabled (default to true if not set)
+	commentPR := true
+	if val := os.Getenv("INPUT_COMMENT_PR"); val != "" {
+		commentPR = strings.EqualFold(val, "true")
+	}
+	if ghOK && commentPR && strings.TrimSpace(finalMDPath) != "" {
 		b, rerr := os.ReadFile(finalMDPath)
-		if rerr == nil {
+		if rerr != nil {
+			fmt.Printf("::warning:: Failed to read markdown report for PR comment: %v\n", rerr)
+		} else {
 			full := string(b)
 			if shouldDebug() {
 				fmt.Printf("::debug:: Report size (chars): %d\n", len(full))
@@ -286,7 +335,10 @@ func init() {
 			if shouldDebug() {
 				fmt.Printf("::debug:: Posting %d chunk(s)\n", len(chunks))
 			}
-			_ = upsertPRComments(ghRepo, ghPR, ghToken, chunks)
+			if err := upsertPRComments(ghRepo, ghPR, ghToken, chunks); err != nil {
+				// Non-critical error: log warning but don't fail the run
+				fmt.Printf("::warning:: Failed to post PR comment: %v\n", err)
+			}
 		}
 	}
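For reference, the INPUT_COMMENT_PR parsing in the hunk above defaults to enabled when the variable is unset, and treats anything other than a case-insensitive "true" as a disable. A standalone sketch with hypothetical values:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// Same logic as check.go: unset -> true; otherwise case-insensitive "true".
	for _, v := range []string{"", "true", "TRUE", "false", "yes"} {
		os.Setenv("INPUT_COMMENT_PR", v)
		commentPR := true
		if val := os.Getenv("INPUT_COMMENT_PR"); val != "" {
			commentPR = strings.EqualFold(val, "true")
		}
		fmt.Printf("%-7q -> %v\n", v, commentPR) // "" true, "TRUE" true, "yes" false
	}
}
```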
@@ -440,32 +492,52 @@ func chunkMarkdownByURL(body string) []string {
 }

 // upsertPRComments deletes any existing slinky comments and posts the new chunked comments in order.
+// Returns error if critical failures occur, but individual comment failures are logged and ignored.
 func upsertPRComments(repo string, prNumber int, token string, chunks []string) error {
 	apiBase := "https://api.github.com"
 	listURL := fmt.Sprintf("%s/repos/%s/issues/%d/comments?per_page=100", apiBase, repo, prNumber)
-	req, _ := http.NewRequest(http.MethodGet, listURL, nil)
+	req, err := http.NewRequest(http.MethodGet, listURL, nil)
+	if err != nil {
+		return fmt.Errorf("failed to create request: %w", err)
+	}
 	req.Header.Set("Authorization", "Bearer "+token)
 	req.Header.Set("Accept", "application/vnd.github+json")
 	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
-		return err
+		return fmt.Errorf("failed to list comments: %w", err)
 	}
 	defer resp.Body.Close()

+	if resp.StatusCode >= 400 {
+		return fmt.Errorf("failed to list comments: HTTP %d", resp.StatusCode)
+	}
+
 	var comments []struct {
 		ID   int    `json:"id"`
 		Body string `json:"body"`
 	}
-	b, _ := io.ReadAll(resp.Body)
-	_ = json.Unmarshal(b, &comments)
+	b, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return fmt.Errorf("failed to read comments response: %w", err)
+	}
+	if err := json.Unmarshal(b, &comments); err != nil {
+		// Non-critical: continue even if we can't parse existing comments
+		if shouldDebug() {
+			fmt.Printf("::debug:: Failed to parse comments: %v\n", err)
+		}
+	}

 	// Delete all existing slinky-report comments to avoid stale entries
 	for _, c := range comments {
 		if strings.Contains(c.Body, "<!-- slinky-report -->") {
 			delURL := fmt.Sprintf("%s/repos/%s/issues/comments/%d", apiBase, repo, c.ID)
-			dReq, _ := http.NewRequest(http.MethodDelete, delURL, nil)
+			dReq, err := http.NewRequest(http.MethodDelete, delURL, nil)
+			if err != nil {
+				continue // Skip if we can't create request
+			}
 			dReq.Header.Set("Authorization", "Bearer "+token)
 			dReq.Header.Set("Accept", "application/vnd.github+json")
-			_, _ = http.DefaultClient.Do(dReq)
+			_, _ = http.DefaultClient.Do(dReq) // Non-critical: ignore delete errors
 		}
 	}
@@ -473,14 +545,39 @@ func upsertPRComments(repo string, prNumber int, token string, chunks []string)
 	for idx, chunk := range chunks {
 		body := fmt.Sprintf("%s\n%s", "<!-- slinky-report -->", chunk)
 		postURL := fmt.Sprintf("%s/repos/%s/issues/%d/comments", apiBase, repo, prNumber)
-		payload, _ := json.Marshal(map[string]string{"body": body})
-		req, _ = http.NewRequest(http.MethodPost, postURL, bytes.NewReader(payload))
+		payload, err := json.Marshal(map[string]string{"body": body})
+		if err != nil {
+			if shouldDebug() {
+				fmt.Printf("::debug:: Failed to marshal comment payload: %v\n", err)
+			}
+			continue
+		}
+		req, err := http.NewRequest(http.MethodPost, postURL, bytes.NewReader(payload))
+		if err != nil {
+			if shouldDebug() {
+				fmt.Printf("::debug:: Failed to create POST request: %v\n", err)
+			}
+			continue
+		}
 		req.Header.Set("Authorization", "Bearer "+token)
 		req.Header.Set("Accept", "application/vnd.github+json")
 		req.Header.Set("Content-Type", "application/json")
-		res, _ := http.DefaultClient.Do(req)
+		res, err := http.DefaultClient.Do(req)
+		if err != nil {
+			if shouldDebug() {
+				fmt.Printf("::debug:: Failed to post chunk %d/%d: %v\n", idx+1, len(chunks), err)
+			}
+			continue
+		}
 		res.Body.Close()
+		if res.StatusCode >= 400 {
+			if shouldDebug() {
+				fmt.Printf("::debug:: Failed to post chunk %d/%d: HTTP %d\n", idx+1, len(chunks), res.StatusCode)
+			}
+			continue
+		}
 		if shouldDebug() {
-			fmt.Printf("::debug:: Posted chunk %d/%d: %v\n", idx+1, len(chunks), res)
+			fmt.Printf("::debug:: Posted chunk %d/%d successfully\n", idx+1, len(chunks))
 		}
 	}
 	return nil
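chunkMarkdownByURL is referenced but its body is not shown in this diff. GitHub caps issue comment bodies at 65,536 characters, which is presumably why the report is split before posting. A naive size-based splitter for illustration only — per its name, the real function splits on URL entry boundaries so a report line is never cut in half:

```go
package main

import "fmt"

// naiveChunks splits s into pieces of at most max bytes.
// Illustrative only; not the actual chunkMarkdownByURL implementation.
func naiveChunks(s string, max int) []string {
	var chunks []string
	for len(s) > max {
		chunks = append(chunks, s[:max])
		s = s[max:]
	}
	return append(chunks, s)
}

func main() {
	report := "<!-- slinky-report -->\n- https://example.com OK\n- https://example.org 404\n"
	for i, c := range naiveChunks(report, 40) {
		fmt.Printf("chunk %d: %d bytes\n", i+1, len(c))
	}
}
```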
@@ -1,93 +1,55 @@
 #!/bin/sh
 set -eu

-# Set up environment variables for GitHub blob base URL
+# Set up GitHub blob base URL for PR links
 if [ -n "${INPUT_REPO_BLOB_BASE:-}" ]; then
   export SLINKY_REPO_BLOB_BASE_URL="${INPUT_REPO_BLOB_BASE}"
 elif [ -n "${GITHUB_REPOSITORY:-}" ]; then
   COMMIT_SHA="${GITHUB_SHA:-}"
   if [ -n "${GITHUB_EVENT_PATH:-}" ] && command -v jq >/dev/null 2>&1; then
-    PR_HEAD_SHA="$(jq -r '.pull_request.head.sha // empty' "$GITHUB_EVENT_PATH" || true)"
-    if [ -n "$PR_HEAD_SHA" ]; then
-      COMMIT_SHA="$PR_HEAD_SHA"
-    fi
+    PR_HEAD_SHA="$(jq -r '.pull_request.head.sha // empty' "$GITHUB_EVENT_PATH" 2>/dev/null || true)"
+    [ -n "$PR_HEAD_SHA" ] && COMMIT_SHA="$PR_HEAD_SHA"
   fi
-  if [ -n "$COMMIT_SHA" ]; then
-    export SLINKY_REPO_BLOB_BASE_URL="https://github.com/${GITHUB_REPOSITORY}/blob/${COMMIT_SHA}"
-  fi
+  [ -n "$COMMIT_SHA" ] && export SLINKY_REPO_BLOB_BASE_URL="https://github.com/${GITHUB_REPOSITORY}/blob/${COMMIT_SHA}"
 fi

-# Build command arguments
-set -- check
+# Build slinky command arguments
+ARGS="check"

-# Add optional flags
-if [ -n "${INPUT_CONCURRENCY:-}" ]; then
-  set -- "$@" --concurrency "${INPUT_CONCURRENCY}"
-fi
+# Optional flags
+[ -n "${INPUT_CONCURRENCY:-}" ] && ARGS="$ARGS --concurrency ${INPUT_CONCURRENCY}"
+[ -n "${INPUT_TIMEOUT:-}" ] && ARGS="$ARGS --timeout ${INPUT_TIMEOUT}"
+[ -n "${INPUT_JSON_OUT:-}" ] && ARGS="$ARGS --json-out ${INPUT_JSON_OUT}"
+[ -n "${INPUT_MD_OUT:-}" ] && ARGS="$ARGS --md-out ${INPUT_MD_OUT}"
+[ -n "${INPUT_REPO_BLOB_BASE:-}" ] && ARGS="$ARGS --repo-blob-base ${INPUT_REPO_BLOB_BASE}"

-if [ -n "${INPUT_TIMEOUT:-}" ]; then
-  set -- "$@" --timeout "${INPUT_TIMEOUT}"
-fi
+# Boolean flags with defaults
+[ "${INPUT_FAIL_ON_FAILURES:-true}" = "true" ] && ARGS="$ARGS --fail-on-failures=true" || ARGS="$ARGS --fail-on-failures=false"
+[ "${INPUT_RESPECT_GITIGNORE:-true}" = "true" ] && ARGS="$ARGS --respect-gitignore=true" || ARGS="$ARGS --respect-gitignore=false"

-if [ -n "${INPUT_JSON_OUT:-}" ]; then
-  set -- "$@" --json-out "${INPUT_JSON_OUT}"
-fi
-
-if [ -n "${INPUT_MD_OUT:-}" ]; then
-  set -- "$@" --md-out "${INPUT_MD_OUT}"
-fi
-
-if [ -n "${INPUT_REPO_BLOB_BASE:-}" ]; then
-  set -- "$@" --repo-blob-base "${INPUT_REPO_BLOB_BASE}"
-fi
-
-if [ "${INPUT_FAIL_ON_FAILURES:-true}" = "true" ]; then
-  set -- "$@" --fail-on-failures=true
-else
-  set -- "$@" --fail-on-failures=false
-fi
-
-if [ "${INPUT_RESPECT_GITIGNORE:-true}" = "true" ]; then
-  set -- "$@" --respect-gitignore=true
-else
-  set -- "$@" --respect-gitignore=false
-fi

-# Add targets
+# Add targets (comma-separated glob patterns)
 if [ -n "${INPUT_TARGETS:-}" ]; then
-  # Split comma-separated targets and add each one
   IFS=','
   for target in $INPUT_TARGETS; do
-    target=$(echo "$target" | xargs) # trim whitespace
-    if [ -n "$target" ]; then
-      set -- "$@" "$target"
-    fi
+    target=$(echo "$target" | xargs)
+    [ -n "$target" ] && ARGS="$ARGS $target"
   done
   unset IFS
 else
   # Default: scan everything
-  set -- "$@" "**/*"
+  ARGS="$ARGS **/*"
 fi

-# Debug output
-if [ "${ACTIONS_STEP_DEBUG:-}" = "true" ]; then
-  printf "::debug:: CLI Args: slinky %s\n" "$*"
-fi
+[ "${ACTIONS_STEP_DEBUG:-}" = "true" ] && printf "::debug:: Running: slinky %s\n" "$ARGS"

-# Execute the command
-set +e
-slinky "$@"
-SLINKY_EXIT_CODE=$?
-set -e
+# Execute slinky (crawl repo with glob input, filter via .slinkignore)
+slinky $ARGS
+EXIT_CODE=$?

 # Expose outputs
 if [ -n "${GITHUB_OUTPUT:-}" ]; then
-  if [ -n "${INPUT_JSON_OUT:-}" ]; then
-    echo "json_path=${INPUT_JSON_OUT}" >> "$GITHUB_OUTPUT"
-  fi
-  if [ -n "${INPUT_MD_OUT:-}" ]; then
-    echo "md_path=${INPUT_MD_OUT}" >> "$GITHUB_OUTPUT"
-  fi
+  [ -n "${INPUT_JSON_OUT:-}" ] && echo "json_path=${INPUT_JSON_OUT}" >> "$GITHUB_OUTPUT"
+  [ -n "${INPUT_MD_OUT:-}" ] && echo "md_path=${INPUT_MD_OUT}" >> "$GITHUB_OUTPUT"
 fi

 # Append report to job summary if requested
@@ -95,4 +57,4 @@ if [ "${INPUT_STEP_SUMMARY:-true}" = "true" ] && [ -n "${GITHUB_STEP_SUMMARY:-}"
   cat "${INPUT_MD_OUT}" >> "$GITHUB_STEP_SUMMARY"
 fi

-exit ${SLINKY_EXIT_CODE:-0}
+exit ${EXIT_CODE:-0}
@@ -1,7 +1,6 @@
 package fsurls

 import (
-	"bufio"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -247,22 +246,14 @@ func CollectURLsWithIgnoreConfig(rootPath string, globs []string, respectGitigno
 			return nil
 		}
 		defer f.Close()
-		br := bufio.NewReader(f)
-		// Read up to maxSize bytes
-		var b strings.Builder
-		read := int64(0)
-		for {
-			chunk, cerr := br.ReadString('\n')
-			b.WriteString(chunk)
-			read += int64(len(chunk))
-			if cerr == io.EOF || read > maxSize {
-				break
-			}
-			if cerr != nil {
-				break
-			}
-		}
-		content := b.String()
+		// Read up to maxSize bytes efficiently using LimitReader
+		limitedReader := io.LimitReader(f, maxSize)
+		contentBytes, readErr := io.ReadAll(limitedReader)
+		if readErr != nil {
+			// Non-critical error: skip file and continue
+			return nil
+		}
+		content := string(contentBytes)
 		// Skip if likely binary (NUL present)
 		if strings.IndexByte(content, '\x00') >= 0 {
 			return nil
@@ -428,21 +419,14 @@ func CollectURLsProgressWithIgnoreConfig(rootPath string, globs []string, respec
 			return nil
 		}
 		defer f.Close()
-		br := bufio.NewReader(f)
-		var b strings.Builder
-		read := int64(0)
-		for {
-			chunk, cerr := br.ReadString('\n')
-			b.WriteString(chunk)
-			read += int64(len(chunk))
-			if cerr == io.EOF || read > maxSize {
-				break
-			}
-			if cerr != nil {
-				break
-			}
-		}
-		content := b.String()
+		// Read up to maxSize bytes efficiently using LimitReader
+		limitedReader := io.LimitReader(f, maxSize)
+		contentBytes, readErr := io.ReadAll(limitedReader)
+		if readErr != nil {
+			// Non-critical error: skip file and continue
+			return nil
+		}
+		content := string(contentBytes)
 		if strings.IndexByte(content, '\x00') >= 0 {
 			return nil
 		}
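The replacement reads at most maxSize bytes in a single call instead of looping line by line. One behavioral nuance: the old loop could overshoot maxSize by up to one line (the size check ran after appending), while io.LimitReader cuts exactly at the limit. A minimal standalone sketch of the truncation behavior:

```go
package main

import (
	"fmt"
	"io"
	"strings"
)

func main() {
	// io.LimitReader stops after exactly N bytes, no line buffering needed.
	const maxSize = 10
	r := strings.NewReader("0123456789abcdef")
	content, err := io.ReadAll(io.LimitReader(r, maxSize))
	if err != nil {
		panic(err)
	}
	fmt.Printf("%q (%d bytes)\n", content, len(content)) // "0123456789" (10 bytes)
}
```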
internal/web/cache.go (new file, 132 lines)
@@ -0,0 +1,132 @@
package web

import (
	"encoding/json"
	"fmt"
	"os"
	"time"
)

// CacheEntry represents a cached URL check result
type CacheEntry struct {
	URL     string    `json:"url"`
	OK      bool      `json:"ok"`
	Status  int       `json:"status"`
	ErrMsg  string    `json:"error,omitempty"`
	Checked time.Time `json:"checked"`
}

// URLCache manages URL result caching
type URLCache struct {
	entries map[string]CacheEntry
	ttl     time.Duration
	path    string
}

// NewURLCache creates a new URL cache with optional file path
func NewURLCache(cachePath string, ttlHours int) *URLCache {
	ttl := time.Duration(ttlHours) * time.Hour
	if ttl <= 0 {
		ttl = 24 * time.Hour // Default 24 hours
	}
	return &URLCache{
		entries: make(map[string]CacheEntry),
		ttl:     ttl,
		path:    cachePath,
	}
}

// Load loads cache entries from file if path is set
func (c *URLCache) Load() error {
	if c.path == "" {
		return nil // No cache file specified
	}

	data, err := os.ReadFile(c.path)
	if err != nil {
		if os.IsNotExist(err) {
			return nil // Cache file doesn't exist yet, that's OK
		}
		return fmt.Errorf("failed to read cache file: %w", err)
	}

	var entries []CacheEntry
	if err := json.Unmarshal(data, &entries); err != nil {
		// Non-critical: if cache is corrupted, start fresh
		return nil
	}

	now := time.Now()
	c.entries = make(map[string]CacheEntry, len(entries))
	for _, entry := range entries {
		// Only load entries that haven't expired
		if now.Sub(entry.Checked) < c.ttl {
			c.entries[entry.URL] = entry
		}
	}

	return nil
}

// Get retrieves a cached result for a URL
func (c *URLCache) Get(url string) (CacheEntry, bool) {
	entry, ok := c.entries[url]
	if !ok {
		return CacheEntry{}, false
	}

	// Check if entry has expired
	if time.Since(entry.Checked) >= c.ttl {
		delete(c.entries, url)
		return CacheEntry{}, false
	}

	return entry, true
}

// Set stores a result in the cache
func (c *URLCache) Set(url string, ok bool, status int, errMsg string) {
	c.entries[url] = CacheEntry{
		URL:     url,
		OK:      ok,
		Status:  status,
		ErrMsg:  errMsg,
		Checked: time.Now(),
	}
}

// Save saves cache entries to file if path is set
func (c *URLCache) Save() error {
	if c.path == "" {
		return nil // No cache file specified
	}

	// Convert map to slice for JSON serialization
	entries := make([]CacheEntry, 0, len(c.entries))
	for _, entry := range c.entries {
		entries = append(entries, entry)
	}

	data, err := json.MarshalIndent(entries, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal cache: %w", err)
	}

	// Write to temp file first, then rename (atomic write)
	tmpPath := c.path + ".tmp"
	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
		return fmt.Errorf("failed to write cache file: %w", err)
	}

	if err := os.Rename(tmpPath, c.path); err != nil {
		return fmt.Errorf("failed to rename cache file: %w", err)
	}

	return nil
}

// Clear removes all entries from the cache
func (c *URLCache) Clear() {
	c.entries = make(map[string]CacheEntry)
}
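A minimal sketch of how this cache API fits together, assuming a hypothetical import path (as an internal/ package it is only importable from within the module itself, as check.go does):

```go
package main

import (
	"fmt"

	"github.com/LukeHagar/slinky/internal/web" // assumed module path, for illustration
)

func main() {
	cache := web.NewURLCache("/tmp/slinky-cache.json", 24) // 24-hour TTL
	if err := cache.Load(); err != nil {
		fmt.Println("load failed:", err)
	}
	if entry, ok := cache.Get("https://example.com"); ok {
		fmt.Println("cache hit:", entry.OK, entry.Status)
	} else {
		// Pretend we just checked the URL over the network.
		cache.Set("https://example.com", true, 200, "")
	}
	if err := cache.Save(); err != nil { // atomic write via .tmp + rename
		fmt.Println("save failed:", err)
	}
}
```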
@@ -2,9 +2,11 @@ package web

 import (
 	"context"
+	"fmt"
 	"net"
 	"net/http"
 	"sort"
+	"sync/atomic"
 	"time"
 )
@@ -13,13 +15,18 @@
 func CheckURLs(ctx context.Context, urls []string, sources map[string][]string, out chan<- Result, stats chan<- Stats, cfg Config) {
 	defer close(out)

-	// Build HTTP client similar to crawler
+	// Build HTTP client with optimized connection pooling
+	// Increase MaxIdleConns to handle many unique domains efficiently
+	maxIdleConns := cfg.MaxConcurrency * 4
+	if maxIdleConns < 100 {
+		maxIdleConns = 100 // Minimum 100 idle connections for better performance across domains
+	}
 	transport := &http.Transport{
 		Proxy:                 http.ProxyFromEnvironment,
 		DialContext:           (&net.Dialer{Timeout: 2 * time.Second, KeepAlive: 30 * time.Second}).DialContext,
 		TLSHandshakeTimeout:   5 * time.Second,
 		ExpectContinueTimeout: 1 * time.Second,
-		MaxIdleConns:          cfg.MaxConcurrency * 2,
+		MaxIdleConns:          maxIdleConns,
 		MaxIdleConnsPerHost:   cfg.MaxConcurrency,
 		MaxConnsPerHost:       cfg.MaxConcurrency,
 		IdleConnTimeout:       30 * time.Second,
@@ -31,16 +38,11 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,
 	jobs := make(chan job, len(urls))
 	done := make(chan struct{})

-	// Seed jobs
-	unique := make(map[string]struct{}, len(urls))
+	// Seed jobs (URLs are already deduplicated in check.go, so no need to deduplicate here)
 	for _, u := range urls {
 		if u == "" {
 			continue
 		}
-		if _, ok := unique[u]; ok {
-			continue
-		}
-		unique[u] = struct{}{}
 		jobs <- job{url: u}
 	}
 	close(jobs)
@@ -49,8 +51,9 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,
 	if concurrency <= 0 {
 		concurrency = 8
 	}
-	processed := 0
-	pending := len(unique)
+	// Use atomic counters to avoid race conditions
+	var processed int64
+	var pending int64 = int64(len(urls))

 	worker := func() {
 		for j := range jobs {
@@ -59,15 +62,47 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,
 				return
 			default:
 			}
-			ok, status, resp, err := fetchWithMethod(ctx, client, http.MethodGet, j.url)
-			if resp != nil && resp.Body != nil {
-				resp.Body.Close()
-			}
-			// Treat 401/403/408/429 as valid links
-			if status == http.StatusUnauthorized || status == http.StatusForbidden || status == http.StatusRequestTimeout || status == http.StatusTooManyRequests {
-				ok = true
-				err = nil
-			}
+
+			var ok bool
+			var status int
+			var err error
+			var cacheHit bool
+
+			// Check cache first if available
+			if cfg.Cache != nil {
+				if cached, found := cfg.Cache.Get(j.url); found {
+					ok = cached.OK
+					status = cached.Status
+					err = nil
+					if cached.ErrMsg != "" {
+						err = fmt.Errorf("%s", cached.ErrMsg)
+					}
+					cacheHit = true
+				}
+			}
+
+			// If not cached, fetch from network
+			if !cacheHit {
+				var resp *http.Response
+				ok, status, resp, err = fetchWithMethod(ctx, client, http.MethodGet, j.url)
+				if resp != nil && resp.Body != nil {
+					resp.Body.Close()
+				}
+				// Status code handling is now done in fetchWithMethod:
+				// - 200-299, 401, 403 are OK (page exists)
+				// - 404, DNS errors, connection refused are bad (flagged)
+				// - 408, 429, 5xx are retried then flagged if still failing
+
+				// Store in cache
+				if cfg.Cache != nil {
+					errMsg := ""
+					if err != nil {
+						errMsg = err.Error()
+					}
+					cfg.Cache.Set(j.url, ok, status, errMsg)
+				}
+			}

 			// Check context before sending result
 			select {
 			case <-ctx.Done():
@@ -82,16 +117,17 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string,

 			// Send result with context check
 			select {
-			case out <- Result{URL: j.url, OK: ok, Status: status, Err: err, ErrMsg: errString(err), Method: http.MethodGet, Sources: cloneAndSort(srcs)}:
+			case out <- Result{URL: j.url, OK: ok, Status: status, Err: err, ErrMsg: errString(err), Method: http.MethodGet, Sources: cloneAndSort(srcs), CacheHit: cacheHit}:
 			case <-ctx.Done():
 				return
 			}

-			processed++
-			pending--
+			// Atomically update counters
+			proc := atomic.AddInt64(&processed, 1)
+			pend := atomic.AddInt64(&pending, -1)
 			if stats != nil {
 				select {
-				case stats <- Stats{Pending: pending, Processed: processed}:
+				case stats <- Stats{Pending: int(pend), Processed: int(proc)}:
 				default:
 				}
 			}
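A standalone illustration of why the counters moved to sync/atomic: with multiple workers, a plain processed++ is a data race, while atomic.AddInt64 always yields the exact total. A minimal sketch:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

func main() {
	var processed int64
	var wg sync.WaitGroup
	for w := 0; w < 8; w++ { // 8 workers, like the default concurrency above
		wg.Add(1)
		go func() {
			defer wg.Done()
			for i := 0; i < 1000; i++ {
				atomic.AddInt64(&processed, 1) // plain processed++ here would race
			}
		}()
	}
	wg.Wait()
	fmt.Println(atomic.LoadInt64(&processed)) // always 8000; unsynchronized ++ may lose updates
}
```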
@@ -3,34 +3,182 @@ package web

 import (
 	"context"
+	"errors"
+	"fmt"
 	"net"
 	"net/http"
+	"net/url"
+	"strings"
 	"time"
 )

 const browserUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36"
+const maxRedirects = 10
+const maxRetries = 3

+// fetchWithMethod performs HTTP request with retry logic and redirect following.
+// Returns: (isOK, statusCode, response, error)
+// Status code handling:
+//   - OK (don't flag): 200-299, 401 (Unauthorized), 403 (Forbidden) - page exists
+//   - Bad (flag): 404 (Not Found), DNS errors, connection refused
+//   - Retry then flag: 408 (Timeout), 429 (Rate Limited), 5xx (Server Errors)
 func fetchWithMethod(ctx context.Context, client *http.Client, method string, raw string) (bool, int, *http.Response, error) {
-	req, err := http.NewRequestWithContext(ctx, method, raw, nil)
+	var lastErr error
+	var lastResp *http.Response
+
+	// Retry logic for transient errors
+	for attempt := 0; attempt <= maxRetries; attempt++ {
+		if attempt > 0 {
+			// Exponential backoff: 1s, 2s, 4s
+			backoff := time.Duration(1<<uint(attempt-1)) * time.Second
+			select {
+			case <-ctx.Done():
+				return false, 0, nil, ctx.Err()
+			case <-time.After(backoff):
+			}
+		}
+
+		req, err := http.NewRequestWithContext(ctx, method, raw, nil)
 		if err != nil {
 			return false, 0, nil, err
 		}
 		req.Header.Set("User-Agent", browserUA)
 		req.Header.Set("Accept", "*/*")

 		resp, err := client.Do(req)
 		if err != nil {
+			lastErr = err
+			// Check if error is retryable
+			if isDNSError(err) || isRefused(err) {
+				// Non-retryable: DNS errors and connection refused are permanent failures
+				return false, 404, nil, simpleError("host not found")
+			}
+			if isTimeout(err) {
+				// Retryable: timeouts may be transient
+				if attempt < maxRetries {
+					continue
+				}
+				return false, 408, nil, simpleError("request timeout")
+			}
+			// Other network errors: retry if we have attempts left
+			if attempt < maxRetries {
+				continue
+			}
+			return false, 0, nil, err
+		}
+
+		// Follow redirects and check final status
+		finalResp, finalStatus, redirectErr := followRedirects(ctx, client, resp, raw, 0)
+		if redirectErr != nil {
+			// Redirect error: retry if transient
+			if attempt < maxRetries && (isTimeout(redirectErr) || isRetryableStatus(finalStatus)) {
+				if finalResp != nil && finalResp.Body != nil {
+					finalResp.Body.Close()
+				}
+				continue
+			}
+			if finalResp != nil && finalResp.Body != nil {
+				finalResp.Body.Close()
+			}
+			return false, finalStatus, nil, redirectErr
+		}
+
+		lastResp = finalResp
+		status := finalResp.StatusCode
+
+		// Check if status is retryable
+		if isRetryableStatus(status) && attempt < maxRetries {
+			if finalResp.Body != nil {
+				finalResp.Body.Close()
+			}
+			continue
+		}
+
+		// Determine if link is OK based on status code
+		isOK := isOKStatus(status)
+		return isOK, status, finalResp, nil
+	}
+
+	// All retries exhausted
+	if lastResp != nil && lastResp.Body != nil {
+		lastResp.Body.Close()
+	}
+	return false, 0, nil, fmt.Errorf("max retries exceeded: %w", lastErr)
+}
+
+// followRedirects follows redirects up to maxRedirects, checking for loops.
+func followRedirects(ctx context.Context, client *http.Client, resp *http.Response, originalURL string, depth int) (*http.Response, int, error) {
+	if depth > maxRedirects {
+		return resp, resp.StatusCode, simpleError("too many redirects")
+	}
+
+	status := resp.StatusCode
+	if status < 300 || status >= 400 {
+		// Not a redirect
+		return resp, status, nil
+	}
+
+	// Handle redirect
+	location := resp.Header.Get("Location")
+	if location == "" {
+		return resp, status, nil
+	}
+
+	// Resolve relative URLs
+	baseURL, err := url.Parse(originalURL)
+	if err != nil {
-		return false, 0, nil, err
+		return resp, status, err
+	}
+	redirectURL, err := baseURL.Parse(location)
+	if err != nil {
+		return resp, status, err
+	}
+
+	// Check for redirect loop (simple check: same URL)
+	if redirectURL.String() == originalURL {
+		return resp, status, simpleError("redirect loop detected")
+	}
+
+	// Close previous response body
+	if resp.Body != nil {
+		resp.Body.Close()
+	}
+
+	// Follow redirect
+	req, err := http.NewRequestWithContext(ctx, "GET", redirectURL.String(), nil)
+	if err != nil {
+		return resp, status, err
+	}
+	req.Header.Set("User-Agent", browserUA)
+	req.Header.Set("Accept", "*/*")
-	resp, err := client.Do(req)
+
+	newResp, err := client.Do(req)
+	if err != nil {
-		if isDNSError(err) {
-			return false, 404, nil, simpleError("host not found")
-		}
-		if isTimeout(err) {
-			return false, 408, nil, simpleError("request timeout")
-		}
-		if isRefused(err) {
-			return false, 503, nil, simpleError("connection refused")
-		}
-		return false, 0, nil, err
+		return resp, status, err
+	}
-	return resp.StatusCode >= 200 && resp.StatusCode < 400, resp.StatusCode, resp, nil
+
+	// Recursively follow redirects
+	return followRedirects(ctx, client, newResp, redirectURL.String(), depth+1)
 }

+// isOKStatus determines if a status code indicates the link is valid.
+// 200-299: Success
+// 401: Unauthorized (page exists, just requires auth)
+// 403: Forbidden (page exists, just requires permissions)
+func isOKStatus(status int) bool {
+	if status >= 200 && status < 300 {
+		return true
+	}
+	if status == http.StatusUnauthorized || status == http.StatusForbidden {
+		return true
+	}
+	return false
+}
+
+// isRetryableStatus determines if a status code should trigger a retry.
+func isRetryableStatus(status int) bool {
+	return status == http.StatusRequestTimeout || // 408
+		status == http.StatusTooManyRequests || // 429
+		status >= 500 // 5xx server errors
+}

 func errString(e error) string {
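The backoff arithmetic in the retry loop above doubles the wait before each retry (attempt 0 runs immediately). A tiny sketch of the schedule it produces:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Same expression as fetchWithMethod: sleeps before attempts 1..3.
	for attempt := 1; attempt <= 3; attempt++ {
		backoff := time.Duration(1<<uint(attempt-1)) * time.Second
		fmt.Printf("attempt %d: backoff %v\n", attempt, backoff) // 1s, 2s, 4s
	}
}
```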
@@ -26,4 +26,5 @@ type Config struct {
 	RequestTimeout time.Duration
 	MaxRetries429  int
 	Exclude        []string
+	Cache          *URLCache // Optional URL result cache
 }