#!/usr/bin/env python3
"""Collect SKILL.md directories from public GitHub repos for offline SkillHub import."""

from __future__ import annotations

import argparse
import datetime as dt
import json
import os
import re
import shutil
import sys
import urllib.error
import urllib.parse
import urllib.request
import zipfile
from pathlib import Path


# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]

# Default locations, all anchored at ROOT; main() uses them as CLI option defaults.
DEFAULT_SOURCES = ROOT.joinpath("config", "github-skill-sources.json")
DEFAULT_OUTPUT = ROOT.joinpath("seed-skills", "github")
DEFAULT_DOWNLOADS = ROOT.joinpath("downloads", "github")
DEFAULT_REPORTS = ROOT.joinpath("reports")

# (kind, compiled regex) pairs. scan_directory() records one finding of the
# given kind for every scanned file in which the regex matches anywhere.
# All patterns are case-insensitive and, because "." does not cross newlines,
# each match is confined to a single line.
DANGEROUS_PATTERNS = [
    # The trailing \b keeps "| sha256sum" / "| shasum" from counting as "| sh".
    ("shell-download", re.compile(r"\b(curl|wget)\b.+\|\s*(sh|bash)\b", re.I)),
    ("recursive-delete", re.compile(r"\brm\s+-rf\b", re.I)),
    ("sudo", re.compile(r"\bsudo\b", re.I)),
    ("eval", re.compile(r"\beval\s+[`\"'$]", re.I)),
    # The flag must be preceded by whitespace. A \b in front of "-" can never
    # match after a space (both are non-word characters), which previously
    # made "base64 -d" and "base64 --decode" undetectable.
    ("base64-decode", re.compile(r"\bbase64\b.*?\s(-d|--decode)\b", re.I)),
    ("netcat", re.compile(r"\b(nc|netcat)\b\s+", re.I)),
    ("private-key", re.compile(r"BEGIN [A-Z ]*PRIVATE KEY", re.I)),
    ("secret-token", re.compile(r"\b(api[_-]?key|secret|token)\s*[:=]\s*['\"][^'\"]{8,}", re.I)),
]

# Lower-cased file suffixes that scan_directory() treats as text and scans;
# files with any other suffix are skipped entirely. The empty string admits
# files for which Path.suffix is empty.
TEXT_EXTENSIONS = {
    "",
    ".bash",
    ".css",
    ".csv",
    ".env",
    ".html",
    ".ini",
    ".js",
    ".json",
    ".jsonl",
    ".md",
    ".mjs",
    ".py",
    ".rb",
    ".sh",
    ".sql",
    ".toml",
    ".ts",
    ".txt",
    ".xml",
    ".yaml",
    ".yml",
}


def log(message: str) -> None:
    """Print *message* on its own line to standard output and flush right away."""
    print(message, end="\n", flush=True)


def normalize_path(value: str) -> str:
    """Trim surrounding whitespace, then surrounding slashes; empty or "." becomes "."."""
    trimmed = value.strip().strip("/")
    if trimmed == "" or trimmed == ".":
        return "."
    return trimmed


def slugify(value: str, fallback: str = "skill") -> str:
    """Turn *value* into a lowercase slug of ``a-z``, ``0-9``, ``.`` and ``-``.

    Underscores become dashes, every other run of disallowed characters
    becomes a single dash, repeated dashes are collapsed, and leading or
    trailing dots and dashes are removed. *fallback* is returned unchanged
    when nothing is left.
    """
    lowered = value.lower().replace("_", "-")
    dashed = re.sub(r"[^a-z0-9.-]+", "-", lowered)
    collapsed = re.sub(r"-{2,}", "-", dashed)
    slug = collapsed.strip(".-")
    if slug:
        return slug
    return fallback


def request_bytes(url: str, token: str | None = None) -> bytes:
    headers = {"User-Agent": "skillhub-offline-collector/1.0"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=60) as response:
        return response.read()


def request_json(url: str, token: str | None = None) -> dict:
    """Fetch *url* with request_bytes() and parse the UTF-8 decoded body as JSON."""
    payload = request_bytes(url, token)
    document = payload.decode("utf-8")
    return json.loads(document)


def download_archive(repo: str, ref: str, downloads_dir: Path, token: str | None) -> Path:
    owner, name = repo.split("/", 1)
    archive_name = f"{slugify(owner)}--{slugify(name)}--{slugify(ref)}.zip"
    archive_path = downloads_dir / archive_name
    if archive_path.exists() and archive_path.stat().st_size > 0:
        log(f"Using cached archive: {archive_path}")
        return archive_path

    encoded_ref = urllib.parse.quote(ref, safe="")
    url = f"https://codeload.github.com/{owner}/{name}/zip/{encoded_ref}"
    log(f"Downloading {repo}@{ref}")
    archive_path.parent.mkdir(parents=True, exist_ok=True)
    archive_path.write_bytes(request_bytes(url, token))
    return archive_path


def fetch_repo_metadata(repo: str, token: str | None) -> dict:
    """Fetch selected repository fields from the GitHub REST API, best effort.

    Returns a dict with ``repo`` plus default branch, star/fork/open-issue
    counts, SPDX license id, HTML URL and last push time. When the lookup
    fails, returns ``{"repo": repo, "metadata_error": <message>}`` instead of
    raising, so a metadata problem does not stop the import.
    """
    try:
        data = request_json(f"https://api.github.com/repos/{repo}", token)
    except (OSError, ValueError) as exc:
        # OSError covers URLError/HTTPError as well as timeouts and connection
        # resets raised while reading the body; ValueError covers invalid JSON
        # and invalid UTF-8.
        return {"repo": repo, "metadata_error": str(exc)}
    if not isinstance(data, dict):
        return {"repo": repo, "metadata_error": f"unexpected response type: {type(data).__name__}"}

    license_info = data.get("license")
    return {
        "repo": repo,
        "default_branch": data.get("default_branch"),
        "stargazers_count": data.get("stargazers_count"),
        "forks_count": data.get("forks_count"),
        "open_issues_count": data.get("open_issues_count"),
        "license": license_info.get("spdx_id") if isinstance(license_info, dict) else None,
        "html_url": data.get("html_url"),
        "pushed_at": data.get("pushed_at"),
    }


def extract_archive(archive: Path, work_dir: Path) -> Path:
    """Unpack *archive* into a freshly recreated *work_dir*.

    Any existing *work_dir* is removed first. Returns the single directory
    found directly under *work_dir* after extraction.

    Raises:
        RuntimeError: if extraction does not leave exactly one top-level directory.
    """
    if work_dir.exists():
        shutil.rmtree(work_dir)
    work_dir.mkdir(parents=True)
    with zipfile.ZipFile(archive) as bundle:
        bundle.extractall(work_dir)

    top_level_dirs = []
    for entry in work_dir.iterdir():
        if entry.is_dir():
            top_level_dirs.append(entry)
    if len(top_level_dirs) == 1:
        return top_level_dirs[0]
    raise RuntimeError(f"Unexpected archive root for {archive}")


def is_under(path: str, base: str) -> bool:
    """Report whether *path* equals *base* or lies below it.

    Both arguments are passed through normalize_path() first; a *base* of
    "." contains every path.
    """
    root = normalize_path(base)
    candidate = normalize_path(path)
    if root == ".":
        return True
    if candidate == root:
        return True
    return candidate.startswith(root + "/")


def selected(rel_dir: str, include: list[str], exclude: list[str]) -> bool:
    """Report whether *rel_dir* is under some include path and under no exclude path.

    A falsy *include* is treated as ["."] (everything); a falsy *exclude*
    excludes nothing.
    """
    include_roots = [normalize_path(item) for item in include or ["."]]
    exclude_roots = [normalize_path(item) for item in exclude or []]
    if not any(is_under(rel_dir, root) for root in include_roots):
        return False
    return not any(is_under(rel_dir, root) for root in exclude_roots)


def parse_frontmatter(text: str) -> dict:
    """Parse the leading ``---`` delimited front matter of *text* into a flat dict.

    This is a deliberately small line-based reader, not a YAML parser. Only
    unindented ``key: value`` lines are read; keys are lower-cased and values
    are stripped of whitespace and surrounding quotes. A value that is a YAML
    block scalar indicator (``|`` or ``>``, optionally with modifiers) takes
    the indented lines that follow, joined with single spaces.

    Returns an empty dict when *text* does not start with ``---`` (an initial
    byte-order mark is ignored) or when no closing ``---`` line exists.
    """
    # A UTF-8 BOM decodes to U+FEFF and would otherwise hide the opening delimiter.
    text = text.lstrip("\ufeff")
    if not text.startswith("---"):
        return {}

    body: list[str] = []
    for line in text.splitlines()[1:]:
        if line.strip() == "---":
            break
        body.append(line)
    else:
        # No closing delimiter: the leading "---" is not front matter, so do
        # not mistake "Word: text" lines in the document body for metadata.
        return {}

    data: dict[str, str] = {}
    index = 0
    while index < len(body):
        match = re.match(r"^([A-Za-z0-9_-]+):\s*(.*)$", body[index])
        index += 1
        if not match:
            continue
        key = match.group(1).strip().lower()
        value = match.group(2).strip()
        if re.fullmatch(r"[|>][-+0-9]{0,2}", value):
            # Block scalar: consume the following blank or indented lines.
            parts: list[str] = []
            while index < len(body) and (not body[index].strip() or body[index][0] in " \t"):
                piece = body[index].strip()
                if piece:
                    parts.append(piece)
                index += 1
            data[key] = " ".join(parts)
        else:
            data[key] = value.strip("\"'")
    return data


def first_heading(text: str) -> str | None:
    for line in text.splitlines():
        match = re.match(r"^#\s+(.+?)\s*$", line)
        if match:
            return match.group(1).strip()
    return None


def first_sentence(text: str) -> str | None:
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#") or line == "---" or ":" in line[:32]:
            continue
        return line[:240]
    return None


def read_skill_metadata(skill_file: Path) -> dict:
    """Read *skill_file* and derive the metadata used for import.

    The title comes from the front matter ``name``, else the first level-one
    heading, else the containing folder name. The description comes from the
    front matter ``description``, else the first usable body line, else a
    generic sentence naming the folder.

    Returns a dict with ``name`` (slug), ``title``, ``description``,
    ``license``, the parsed ``frontmatter`` and the full file ``text``.
    Undecodable bytes in the file are replaced rather than raising.
    """
    text = skill_file.read_text(encoding="utf-8", errors="replace")
    frontmatter = parse_frontmatter(text)
    folder = skill_file.parent.name
    name = frontmatter.get("name") or first_heading(text) or folder
    description = (
        frontmatter.get("description")
        or first_sentence(text)
        or f"Imported skill from {folder}."
    )
    return {
        # Slugify the fallback too: slugify() returns it unchanged, so a raw
        # folder name could otherwise leak spaces or uppercase into the slug
        # when the title contains no ASCII letters or digits.
        "name": slugify(name, slugify(folder)),
        "title": name,
        "description": description,
        "license": frontmatter.get("license"),
        "frontmatter": frontmatter,
        "text": text,
    }


def ensure_package_json(
    skill_dir: Path,
    metadata: dict,
    repo_metadata: dict,
    source_repo: str,
    source_ref: str,
    source_path: str,
) -> None:
    """Create or complete ``package.json`` in *skill_dir*.

    Fields already present in an existing package.json are kept; missing
    ``name``, ``version``, ``description``, ``keywords`` and ``license`` are
    filled in from *metadata* and *repo_metadata*. The ``source`` entry is
    always overwritten with the repo, ref, path and GitHub tree URL.

    An existing package.json that is not valid UTF-8 JSON, or whose top level
    is not an object, is treated as absent and replaced.
    """
    package_path = skill_dir / "package.json"
    package: dict = {}
    if package_path.exists():
        try:
            loaded = json.loads(package_path.read_text(encoding="utf-8"))
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            loaded = None
        if isinstance(loaded, dict):
            package = loaded

    owner, _, repo_name = source_repo.partition("/")
    keywords = ["github-import", owner]
    if repo_name:
        keywords.append(repo_name)

    package.setdefault("name", metadata["name"])
    package.setdefault("version", "0.1.0")
    package.setdefault("description", metadata["description"])
    package.setdefault("keywords", keywords)
    package.setdefault("license", metadata.get("license") or repo_metadata.get("license") or "UNKNOWN")

    # Percent-encode ref and path so spaces and similar characters do not
    # break the link; a skill at the repository root ("." path) links to the
    # tree itself rather than ".../tree/<ref>/.".
    url = f"https://github.com/{source_repo}/tree/{urllib.parse.quote(source_ref, safe='/')}"
    if source_path not in ("", "."):
        url = f"{url}/{urllib.parse.quote(source_path, safe='/')}"
    package["source"] = {
        "repo": source_repo,
        "ref": source_ref,
        "path": source_path,
        "url": url,
    }
    package_path.write_text(json.dumps(package, ensure_ascii=True, indent=2) + "\n", encoding="utf-8")


def find_skill_dirs(root: Path, include: list[str], exclude: list[str]) -> list[Path]:
    """Return the sorted, de-duplicated directories under *root* that hold a skill file.

    A skill file is any regular file whose name equals ``skill.md`` ignoring
    case. A directory is kept only when its path relative to *root* passes
    selected() for the given *include* and *exclude* lists.
    """
    matches: set[Path] = set()
    for candidate in root.rglob("*"):
        if candidate.is_file() and candidate.name.lower() == "skill.md":
            parent_rel = candidate.parent.relative_to(root).as_posix()
            if selected(parent_rel, include, exclude):
                matches.add(candidate.parent)
    return sorted(matches)


def scan_directory(skill_dir: Path) -> list[dict]:
    """Scan text-like files under *skill_dir* for simple risk patterns.

    Only files whose lower-cased suffix is in TEXT_EXTENSIONS are considered.
    Files larger than 1 MiB are reported as ``large-file`` and not read. Every
    other file is decoded as UTF-8 (undecodable bytes dropped) and yields one
    finding per DANGEROUS_PATTERNS entry that matches anywhere in it.

    Returns a list of ``{"file", "kind", "detail"}`` dicts, where ``file`` is
    the POSIX path relative to *skill_dir*, ordered by file path.
    """
    findings: list[dict] = []
    # Sorted so the findings order does not depend on filesystem iteration
    # order, matching the sorted output of find_skill_dirs().
    for path in sorted(skill_dir.rglob("*")):
        if not path.is_file():
            continue
        if path.suffix.lower() not in TEXT_EXTENSIONS:
            continue
        rel = path.relative_to(skill_dir).as_posix()
        size = path.stat().st_size
        if size > 1024 * 1024:
            findings.append({"file": rel, "kind": "large-file", "detail": f"{size} bytes"})
            continue
        text = path.read_text(encoding="utf-8", errors="ignore")
        for kind, pattern in DANGEROUS_PATTERNS:
            if pattern.search(text):
                findings.append({"file": rel, "kind": kind, "detail": "pattern matched"})
    return findings


def _catalog_package_dir(target_dir: Path) -> str:
    """Return *target_dir* relative to ROOT when possible, else as an absolute POSIX path."""
    for candidate in (target_dir, target_dir.resolve()):
        try:
            return candidate.relative_to(ROOT).as_posix()
        except ValueError:
            continue
    return target_dir.resolve().as_posix()


def collect(args: argparse.Namespace) -> tuple[list[dict], list[dict]]:
    """Import every selected skill from the configured GitHub sources.

    Reads the JSON file at ``args.sources`` and, for each entry in its
    ``sources`` list, fetches repository metadata, downloads and unpacks the
    archive, copies each selected skill directory into
    ``args.output/<repo slug>/<skill slug>``, completes its package.json and
    scans it. ``GITHUB_TOKEN`` from the environment, if set, is used for
    requests. ``args.limit_per_repo`` (when non-zero) caps the skills taken
    per repository.

    Writes ``_catalog.json`` and ``_sources.json`` into ``args.output`` and
    returns ``(catalog, repo_summaries)``.
    """
    config = json.loads(args.sources.read_text(encoding="utf-8"))
    token = os.environ.get("GITHUB_TOKEN")
    output_root = args.output
    downloads_dir = args.downloads
    work_root = ROOT / ".tmp" / "github-skill-collector"

    output_root.mkdir(parents=True, exist_ok=True)
    downloads_dir.mkdir(parents=True, exist_ok=True)

    catalog: list[dict] = []
    repo_summaries: list[dict] = []

    for source in config["sources"]:
        repo = source["repo"]
        ref = source.get("ref", "main")
        namespace = source.get("namespace") or slugify(repo.replace("/", "-"))
        # NOTE(review): slugify() collapses the "--" inserted here to a single
        # "-", so e.g. "a-b/c" and "a/b-c" share one slug. Left unchanged
        # because the slug determines existing output directory names.
        repo_slug = slugify(repo.replace("/", "--"))
        repo_output = output_root / repo_slug

        repo_metadata = fetch_repo_metadata(repo, token)
        archive = download_archive(repo, ref, downloads_dir, token)
        extracted = extract_archive(archive, work_root / repo_slug)
        skill_dirs = find_skill_dirs(extracted, source.get("include", ["."]), source.get("exclude", []))
        if args.limit_per_repo:
            skill_dirs = skill_dirs[: args.limit_per_repo]

        # Clear the previous import only once the new archive is downloaded
        # and unpacked, so a failed download does not destroy existing output.
        if repo_output.exists():
            shutil.rmtree(repo_output)
        repo_output.mkdir(parents=True)

        repo_summary = {
            "repo": repo,
            "ref": ref,
            "namespace": namespace,
            "selection_reason": source.get("selection_reason"),
            "license_note": source.get("license_note"),
            "metadata": repo_metadata,
            "skill_count": len(skill_dirs),
        }
        repo_summaries.append(repo_summary)
        log(f"Found {len(skill_dirs)} skills in {repo}")

        used_slugs: set[str] = set()
        for source_dir in skill_dirs:
            rel_dir = source_dir.relative_to(extracted).as_posix()
            source_skill_file = next(
                item for item in source_dir.iterdir() if item.is_file() and item.name.lower() == "skill.md"
            )
            metadata = read_skill_metadata(source_skill_file)
            skill_slug = metadata["name"]
            if skill_slug in used_slugs:
                skill_slug = slugify(f"{source_dir.parent.name}-{skill_slug}")
            # The parent-prefixed slug can itself be taken; add a numeric
            # suffix until it is unique so copytree() never hits an existing
            # directory.
            base_slug = skill_slug
            suffix = 2
            while skill_slug in used_slugs:
                skill_slug = f"{base_slug}-{suffix}"
                suffix += 1
            used_slugs.add(skill_slug)

            target_dir = repo_output / skill_slug
            shutil.copytree(source_dir, target_dir)
            # On a case-sensitive filesystem a source file named e.g.
            # "SKILL.md" does not satisfy this check, so its text is written
            # again under the lowercase name.
            # NOTE(review): confirm the importer expects lowercase "skill.md".
            target_skill_file = target_dir / "skill.md"
            if not target_skill_file.exists():
                target_skill_file.write_text(metadata["text"], encoding="utf-8")
            ensure_package_json(target_dir, metadata, repo_metadata, repo, ref, rel_dir)

            findings = scan_directory(target_dir)
            catalog.append(
                {
                    "name": skill_slug,
                    "title": metadata["title"],
                    "description": metadata["description"],
                    "namespace": namespace,
                    "source_repo": repo,
                    "source_ref": ref,
                    "source_path": rel_dir,
                    # Falls back to an absolute path when --output is outside ROOT.
                    "package_dir": _catalog_package_dir(target_dir),
                    "license": metadata.get("license") or repo_metadata.get("license") or "UNKNOWN",
                    "security_findings": findings,
                }
            )

    catalog_path = output_root / "_catalog.json"
    catalog_path.write_text(json.dumps(catalog, ensure_ascii=True, indent=2) + "\n", encoding="utf-8")
    repo_path = output_root / "_sources.json"
    repo_path.write_text(json.dumps(repo_summaries, ensure_ascii=True, indent=2) + "\n", encoding="utf-8")
    return catalog, repo_summaries


def write_report(catalog: list[dict], repos: list[dict], reports_dir: Path) -> Path:
    """Write the Markdown import report and return its path.

    The report is written to ``github-skill-import-report.md`` inside
    *reports_dir* (created if missing) and has three sections: the sources
    with their repository metadata, the skills that have security findings,
    and the list of all imported skills. Metadata values that are missing or
    null are shown as ``unknown``; a missing or null selection reason or
    license note is shown as ``not specified``.
    """

    def shown(value: object, fallback: str) -> object:
        # Only None is replaced, so a legitimate 0 (e.g. zero stars) is kept.
        return fallback if value is None else value

    reports_dir.mkdir(parents=True, exist_ok=True)
    report_path = reports_dir / "github-skill-import-report.md"
    lines = [
        "# GitHub Skill Import Report",
        "",
        f"Generated at: {dt.datetime.now(dt.timezone.utc).isoformat()}",
        "",
        "## Sources",
        "",
    ]
    for repo in repos:
        meta = repo.get("metadata") or {}
        stars = shown(meta.get("stargazers_count"), "unknown")
        forks = shown(meta.get("forks_count"), "unknown")
        license_id = shown(meta.get("license"), "unknown")
        lines.extend(
            [
                f"- {repo['repo']} ({repo['namespace']}): {repo['skill_count']} skills",
                f"  - Stars: {stars}; forks: {forks}; license: {license_id}",
                f"  - Reason: {shown(repo.get('selection_reason'), 'not specified')}",
                f"  - License note: {shown(repo.get('license_note'), 'not specified')}",
            ]
        )
    lines.extend(["", "## Skill Findings", ""])
    risky = [item for item in catalog if item["security_findings"]]
    if not risky:
        lines.append("No simple static risk patterns matched in imported files.")
    else:
        for item in risky:
            lines.append(f"- {item['namespace']}/{item['name']} from {item['source_repo']}/{item['source_path']}")
            for finding in item["security_findings"]:
                lines.append(f"  - {finding['kind']}: {finding['file']}")
    lines.extend(
        [
            "",
            "## Imported Skills",
            "",
        ]
    )
    for item in catalog:
        lines.append(f"- {item['namespace']}/{item['name']} - {item['description']}")
    report_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    return report_path


def main() -> int:
    """Parse the command line, run the import and write the report.

    Returns 0 on success. Any exception raised while collecting or writing
    the report is printed to standard error and turned into exit status 1.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    path_options = (
        ("--sources", DEFAULT_SOURCES),
        ("--output", DEFAULT_OUTPUT),
        ("--downloads", DEFAULT_DOWNLOADS),
        ("--reports", DEFAULT_REPORTS),
    )
    for flag, default in path_options:
        cli.add_argument(flag, type=Path, default=default)
    cli.add_argument("--limit-per-repo", type=int, default=0, help="For smoke tests; 0 imports all found skills.")
    options = cli.parse_args()

    try:
        catalog, repos = collect(options)
        report_path = write_report(catalog, repos, options.reports)
    except Exception as exc:
        print(f"collect-github-skills failed: {exc}", file=sys.stderr)
        return 1
    else:
        log(f"Imported {len(catalog)} skills into {options.output}")
        log(f"Report: {report_path}")
        return 0


# Run the CLI only when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())