slinky/cmd/check.go

package cmd

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"github.com/spf13/cobra"

	"slinky/internal/fsurls"
	"slinky/internal/report"
	"slinky/internal/web"
)

// SerializableResult mirrors web.Result for JSON output, with the error
// represented as a plain string message.
type SerializableResult struct {
	URL         string   `json:"url"`
	OK          bool     `json:"ok"`
	Status      int      `json:"status"`
	ErrMsg      string   `json:"error"`
	Method      string   `json:"method"`
	ContentType string   `json:"contentType"`
	Sources     []string `json:"sources"`
}
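
// init registers the "check" subcommand on the root command and binds its flags.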
func init() {
	checkCmd := &cobra.Command{
		Use:   "check [targets...]",
		Short: "Scan for URLs and validate them (headless)",
		Args:  cobra.ArbitraryArgs,
		RunE: func(cmd *cobra.Command, args []string) error {
			// Parse targets: allow comma-separated chunks
			var raw []string
			for _, a := range args {
				for part := range strings.SplitSeq(a, ",") {
					p := strings.TrimSpace(part)
					if p != "" {
						raw = append(raw, toSlash(p))
					}
				}
			}
			if len(raw) == 0 {
				raw = []string{"**/*"}
			}
			// Separate into globs (relative to ".") and concrete paths (dirs/files)
			var globPatterns []string
			type pathRoot struct {
				path  string
				isDir bool
			}
			var roots []pathRoot
			for _, t := range raw {
				if hasGlobMeta(t) {
					globPatterns = append(globPatterns, t)
					continue
				}
				if fi, err := os.Stat(t); err == nil {
					roots = append(roots, pathRoot{path: t, isDir: fi.IsDir()})
				} else {
					// If stat fails, treat as glob pattern under "."
					globPatterns = append(globPatterns, t)
				}
			}

			// Debug: show effective targets
			if shouldDebug() {
				fmt.Printf("::debug:: Roots: %s\n", strings.Join(func() []string {
					var out []string
					for _, r := range roots {
						out = append(out, r.path)
					}
					return out
				}(), ","))
				fmt.Printf("::debug:: Glob patterns: %s\n", strings.Join(globPatterns, ","))
			}
			// Aggregate URL->files across all targets
			agg := make(map[string]map[string]struct{})
			merge := func(res map[string][]string, prefix string, isDir bool) {
				for u, files := range res {
					set, ok := agg[u]
					if !ok {
						set = make(map[string]struct{})
						agg[u] = set
					}
					for _, fp := range files {
						var merged string
						if prefix == "" {
							merged = fp
						} else if isDir {
							merged = toSlash(filepath.Join(prefix, fp))
						} else {
							// File root: keep the concrete file path
							merged = toSlash(prefix)
						}
						set[merged] = struct{}{}
					}
				}
			}
			// 1) Collect for globs under current dir
			if len(globPatterns) > 0 {
				res, err := fsurls.CollectURLs(".", globPatterns, respectGitignore)
				if err != nil {
					return err
				}
				merge(res, "", true)
			}

			// 2) Collect for each concrete root
			for _, r := range roots {
				clean := toSlash(filepath.Clean(r.path))
				if r.isDir {
					res, err := fsurls.CollectURLs(r.path, []string{"**/*"}, respectGitignore)
					if err != nil {
						return err
					}
					merge(res, clean, true)
				} else {
					res, err := fsurls.CollectURLs(r.path, nil, respectGitignore)
					if err != nil {
						return err
					}
					merge(res, clean, false)
				}
			}
			// Convert aggregator to final map with sorted file lists
			urlToFiles := make(map[string][]string, len(agg))
			for u, set := range agg {
				var files []string
				for f := range set {
					files = append(files, f)
				}
				sort.Strings(files)
				urlToFiles[u] = files
			}

			// Derive display root; we use "." when multiple roots to avoid confusion
			displayRoot := "."
			if len(roots) == 1 && len(globPatterns) == 0 {
				displayRoot = roots[0].path
			}
			if shouldDebug() {
				fmt.Printf("::debug:: Root: %s\n", displayRoot)
			}
			// Build config
			timeout := time.Duration(timeoutSeconds) * time.Second
			cfg := web.Config{MaxConcurrency: maxConcurrency, RequestTimeout: timeout}

			// Prepare URL list
			var urls []string
			for u := range urlToFiles {
				urls = append(urls, u)
			}
			sort.Strings(urls)

			// If no URLs found, exit early
			if len(urls) == 0 {
				fmt.Println("No URLs found.")
				return nil
			}
			// Run checks
			startedAt := time.Now()
			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()
			results := make(chan web.Result, 256)
			go web.CheckURLs(ctx, urls, urlToFiles, results, nil, cfg)

			var total, okCount, failCount int
			totalURLs := len(urls)
			lastPctLogged := 0
			var failures []SerializableResult
			var failedResults []web.Result
			for r := range results {
				total++
				if r.OK {
					okCount++
				} else {
					failCount++
				}

				// Progress notices every 5%
				if totalURLs > 0 {
					pct := (total * 100) / totalURLs
					for pct >= lastPctLogged+5 && lastPctLogged < 100 {
						lastPctLogged += 5
						fmt.Printf("::progress:: %d%% (%d/%d)\n", lastPctLogged, total, totalURLs)
					}
				}

				// Emit GitHub Actions debug log for each URL.
				// These lines appear only when step debug logging is enabled via the
				// repository/organization secret ACTIONS_STEP_DEBUG=true.
				if shouldDebug() {
					fmt.Printf("::debug:: Scanned URL: %s status=%d ok=%v err=%s sources=%d\n", r.URL, r.Status, r.OK, r.ErrMsg, len(r.Sources))
				}

				if jsonOut != "" && !r.OK {
					failures = append(failures, SerializableResult{
						URL:         r.URL,
						OK:          r.OK,
						Status:      r.Status,
						ErrMsg:      r.ErrMsg,
						Method:      r.Method,
						ContentType: r.ContentType,
						Sources:     r.Sources,
					})
				}
				if !r.OK {
					failedResults = append(failedResults, r)
				}
			}
			// Write JSON if requested (failures only)
			if jsonOut != "" {
				f, ferr := os.Create(jsonOut)
				if ferr != nil {
					return ferr
				}
				enc := json.NewEncoder(f)
				enc.SetIndent("", " ")
				if err := enc.Encode(failures); err != nil {
					_ = f.Close()
					return err
				}
				_ = f.Close()
			}
			// Build report summary
			base := repoBlobBase
			if strings.TrimSpace(base) == "" {
				base = os.Getenv("SLINKY_REPO_BLOB_BASE_URL")
			}
			summary := report.Summary{
				RootPath:        displayRoot,
				StartedAt:       startedAt,
				FinishedAt:      time.Now(),
				Processed:       total,
				OK:              okCount,
				Fail:            failCount,
				FilesScanned:    countFiles(urlToFiles),
				JSONPath:        jsonOut,
				RepoBlobBaseURL: base,
			}
			// Ensure we have a markdown file if needed for PR comment
			mdPath := mdOut
			ghRepo, ghPR, ghToken, ghOK := detectGitHubPR()
			var finalMDPath string
			if strings.TrimSpace(mdPath) != "" {
				if _, err := report.WriteMarkdown(mdPath, failedResults, summary); err != nil {
					return err
				}
				finalMDPath = mdPath
			} else if ghOK {
				p, err := report.WriteMarkdown("", failedResults, summary)
				if err != nil {
					return err
				}
				finalMDPath = p
			}
			// If running on a PR, post or update the comment(s), chunking as needed
			if ghOK && strings.TrimSpace(finalMDPath) != "" {
				b, rerr := os.ReadFile(finalMDPath)
				if rerr == nil {
					full := string(b)
					if shouldDebug() {
						fmt.Printf("::debug:: Report size (chars): %d\n", len(full))
					}
					chunks := chunkMarkdownByURL(full)
					if shouldDebug() {
						fmt.Printf("::debug:: Posting %d chunk(s)\n", len(chunks))
					}
					_ = upsertPRComments(ghRepo, ghPR, ghToken, chunks)
				}
			}

			fmt.Printf("Checked %d URLs: %d OK, %d failed\n", total, okCount, failCount)
			if failOnFailures && failCount > 0 {
				return fmt.Errorf("%d links failed", failCount)
			}
			return nil
		},
	}
	checkCmd.Flags().IntVar(&maxConcurrency, "concurrency", 16, "maximum concurrent requests")
	checkCmd.Flags().StringVar(&jsonOut, "json-out", "", "path to write failed results as a JSON array")
	checkCmd.Flags().StringVar(&mdOut, "md-out", "", "path to write Markdown report for PR comment")
	checkCmd.Flags().StringVar(&repoBlobBase, "repo-blob-base", "", "override GitHub blob base URL (e.g. https://github.com/owner/repo/blob/<sha>)")
	checkCmd.Flags().IntVar(&timeoutSeconds, "timeout", 10, "HTTP request timeout in seconds")
	checkCmd.Flags().BoolVar(&failOnFailures, "fail-on-failures", true, "exit non-zero if any links fail")
	checkCmd.Flags().BoolVar(&respectGitignore, "respect-gitignore", true, "respect .gitignore while scanning")
	rootCmd.AddCommand(checkCmd)
}
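
// Package-level storage for check command flag values bound in init.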
var (
	timeoutSeconds   int
	failOnFailures   bool
	repoBlobBase     string
	respectGitignore bool
)
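
// toSlash trims whitespace, converts the path to forward slashes, and strips a
// leading "./" prefix.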
func toSlash(p string) string {
	p = strings.TrimSpace(p)
	if p == "" {
		return p
	}
	p = filepath.ToSlash(p)
	if after, ok := strings.CutPrefix(p, "./"); ok {
		p = after
	}
	return p
}
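
// hasGlobMeta reports whether s contains glob metacharacters (*, ?, or [).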
func hasGlobMeta(s string) bool {
	return strings.ContainsAny(s, "*?[")
}
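
// countFiles returns the number of distinct files that contributed at least one URL.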
func countFiles(urlToFiles map[string][]string) int {
	seen := make(map[string]struct{})
	for _, files := range urlToFiles {
		for _, f := range files {
			seen[f] = struct{}{}
		}
	}
	return len(seen)
}
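
// detectGitHubPR inspects the GitHub Actions environment (GITHUB_REPOSITORY,
// GITHUB_TOKEN, GITHUB_EVENT_PATH) and the event payload to decide whether the
// run targets a pull request. It returns the repository, PR number, and token,
// with ok set only when all three are available and the event has a PR number.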
func detectGitHubPR() (repo string, prNumber int, token string, ok bool) {
	repo = os.Getenv("GITHUB_REPOSITORY")
	token = os.Getenv("GITHUB_TOKEN")
	eventPath := os.Getenv("GITHUB_EVENT_PATH")
	if repo == "" || eventPath == "" || token == "" {
		return "", 0, "", false
	}
	data, err := os.ReadFile(eventPath)
	if err != nil {
		return "", 0, "", false
	}
	var ev struct {
		PullRequest struct {
			Number int `json:"number"`
		} `json:"pull_request"`
	}
	_ = json.Unmarshal(data, &ev)
	if ev.PullRequest.Number == 0 {
		return "", 0, "", false
	}
	return repo, ev.PullRequest.Number, token, true
}

// chunkMarkdownByURL splits markdown into chunks under GitHub's comment body limit,
// keeping whole URL entries together. Only the first chunk includes the original
// header and the "Failures by URL" section header. Subsequent chunks have no headers.
func chunkMarkdownByURL(body string) []string {
	const maxBody = 65000
	lines := strings.Split(body, "\n")

	// locate failures header
	failIdx := -1
	for i, ln := range lines {
		if strings.TrimSpace(ln) == "### Failures by URL" {
			failIdx = i
			break
		}
	}
	if failIdx < 0 {
		// no failures section; return as single chunk
		return []string{body}
	}
	preamble := strings.Join(lines[:failIdx+1], "\n") + "\n"
	entryLines := lines[failIdx+1:]

	// build entries by URL block, starting at lines with "- " at column 0
	type entry struct {
		text   string
		length int
	}
	var entries []entry
	for i := 0; i < len(entryLines); {
		// skip leading blank lines
		for i < len(entryLines) && strings.TrimSpace(entryLines[i]) == "" {
			i++
		}
		if i >= len(entryLines) {
			break
		}
		if !strings.HasPrefix(entryLines[i], "- ") {
			// if unexpected, include line as is
			entries = append(entries, entry{text: entryLines[i] + "\n", length: len(entryLines[i]) + 1})
			i++
			continue
		}
		start := i
		i++
		for i < len(entryLines) && !strings.HasPrefix(entryLines[i], "- ") {
			i++
		}
		block := strings.Join(entryLines[start:i], "\n") + "\n"
		entries = append(entries, entry{text: block, length: len(block)})
	}

	var chunks []string
	// start first chunk with full preamble
	cur := preamble
	curLen := len(cur)
	for _, e := range entries {
		if curLen+e.length > maxBody && curLen > len(preamble) {
			// flush current chunk, start new without headers
			chunks = append(chunks, cur)
			cur = ""
			curLen = 0
		}
		// A single entry larger than maxBody is placed as is; GitHub will
		// typically still accept it, since entries are usually far smaller.
		cur += e.text
		curLen += e.length
	}
	if strings.TrimSpace(cur) != "" {
		chunks = append(chunks, cur)
	}
	return chunks
}

// upsertPRComments deletes any existing slinky comments and posts the new chunked comments in order.
func upsertPRComments(repo string, prNumber int, token string, chunks []string) error {
	apiBase := "https://api.github.com"
	listURL := fmt.Sprintf("%s/repos/%s/issues/%d/comments?per_page=100", apiBase, repo, prNumber)
	req, _ := http.NewRequest(http.MethodGet, listURL, nil)
	req.Header.Set("Authorization", "Bearer "+token)
	req.Header.Set("Accept", "application/vnd.github+json")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	var comments []struct {
		ID   int    `json:"id"`
		Body string `json:"body"`
	}
	b, _ := io.ReadAll(resp.Body)
	_ = json.Unmarshal(b, &comments)
	// Delete all existing slinky-report comments to avoid stale entries
	for _, c := range comments {
		if strings.Contains(c.Body, "<!-- slinky-report -->") {
			delURL := fmt.Sprintf("%s/repos/%s/issues/comments/%d", apiBase, repo, c.ID)
			dReq, _ := http.NewRequest(http.MethodDelete, delURL, nil)
			dReq.Header.Set("Authorization", "Bearer "+token)
			dReq.Header.Set("Accept", "application/vnd.github+json")
			if dRes, dErr := http.DefaultClient.Do(dReq); dErr == nil {
				_ = dRes.Body.Close()
			}
		}
	}
	// Post new comments in order
	for idx, chunk := range chunks {
		body := fmt.Sprintf("%s\n%s", "<!-- slinky-report -->", chunk)
		postURL := fmt.Sprintf("%s/repos/%s/issues/%d/comments", apiBase, repo, prNumber)
		payload, _ := json.Marshal(map[string]string{"body": body})
		req, _ = http.NewRequest(http.MethodPost, postURL, bytes.NewReader(payload))
		req.Header.Set("Authorization", "Bearer "+token)
		req.Header.Set("Accept", "application/vnd.github+json")
		req.Header.Set("Content-Type", "application/json")
		res, rErr := http.DefaultClient.Do(req)
		if shouldDebug() {
			fmt.Printf("::debug:: Posted chunk %d/%d: %v\n", idx+1, len(chunks), res)
		}
		if rErr == nil {
			_ = res.Body.Close()
		}
	}
	return nil
}