Auditor/theauditor/agent_template_validator.py

"""Agent template validator - ensures templates comply with SOP permissions."""

import json
import re
from pathlib import Path
from typing import Dict, List, Any, Tuple, Optional
import yaml


class TemplateValidator:
    """Validates agent templates for SOP compliance and structure."""

    # Tools that allow code modification
    WRITE_TOOLS = {"Write", "Edit", "MultiEdit", "NotebookEdit"}

    # Agents allowed to modify code
    ALLOWED_EDITOR_AGENTS = {"coder", "documentation-manager", "implementation-specialist"}

    # Required frontmatter fields
    REQUIRED_FIELDS = {"name", "description", "tools", "model"}

    def __init__(self, template_dir: str = None):
        """Initialize validator with template directory."""
        if template_dir:
            self.template_dir = Path(template_dir)
        else:
            # Default to agent_templates relative to module
            self.template_dir = Path(__file__).parent.parent / "agent_templates"

        self.violations = []
        self.warnings = []

    def _extract_frontmatter(self, content: str) -> Optional[Dict[str, Any]]:
        """Extract YAML frontmatter from markdown file.

        Args:
            content: File content

        Returns:
            Parsed frontmatter dict or None if not found
        """
        # Match frontmatter between --- markers
        pattern = r'^---\s*\n(.*?)\n---\s*\n'
        match = re.match(pattern, content, re.DOTALL)

        if not match:
            return None

        try:
            frontmatter_text = match.group(1)
            return yaml.safe_load(frontmatter_text)
        except yaml.YAMLError as e:
            self.violations.append(f"Invalid YAML frontmatter: {e}")
            return None

    def _parse_tools(self, tools_value: Any) -> List[str]:
        """Parse tools from frontmatter value.

        Args:
            tools_value: Tools field from frontmatter

        Returns:
            List of tool names
        """
        if isinstance(tools_value, str):
            # Comma-separated string
            return [t.strip() for t in tools_value.split(',')]
        elif isinstance(tools_value, list):
            return tools_value
        else:
            return []

    def _check_sop_permissions(
        self,
        template_name: str,
        frontmatter: Dict[str, Any]
    ) -> List[str]:
        """Check SOP permission rules.

        Args:
            template_name: Name of template file
            frontmatter: Parsed frontmatter

        Returns:
            List of violations found
        """
        violations = []

        # Get name and description, ensuring they're strings
        agent_name = frontmatter.get("name", "")
        if not isinstance(agent_name, str):
            agent_name = str(agent_name) if agent_name else ""
        # Skip validation for templates with placeholders
        if "{" in agent_name or "}" in agent_name:
            # This is a template with placeholders, not a real agent
            return []
        agent_name = agent_name.lower()

        description = frontmatter.get("description", "")
        if not isinstance(description, str):
            description = str(description) if description else ""
        description = description.lower()

        tools = self._parse_tools(frontmatter.get("tools", ""))

        # Check if agent has write tools
        has_write_tools = any(tool in self.WRITE_TOOLS for tool in tools)

        # Check compliance/legal agents first (they have stricter rules)
        is_compliance_agent = (
            "compliance" in agent_name or
            "compliance" in description or
            "legal" in agent_name or
            "legal" in description
        )

        if is_compliance_agent and has_write_tools:
            violations.append(
                f"Compliance/legal agent '{agent_name}' must not have write tools, "
                f"found: {self.WRITE_TOOLS & set(tools)}"
            )
        elif has_write_tools:
            # For non-compliance agents, check if they're allowed to have write tools
            is_allowed_editor = any(
                allowed in agent_name
                for allowed in self.ALLOWED_EDITOR_AGENTS
            )

            if not is_allowed_editor:
                violations.append(
                    f"Agent '{agent_name}' has write tools ({self.WRITE_TOOLS & set(tools)}) "
                    f"but is not in allowed editor list: {self.ALLOWED_EDITOR_AGENTS}"
                )

        return violations

    def _check_internal_links(
        self,
        content: str,
        template_path: Path
    ) -> List[str]:
        """Check internal repository links are valid.

        Args:
            content: Template content
            template_path: Path to template file

        Returns:
            List of broken links
        """
        broken_links = []

        # Find markdown links and references to repo paths
        link_patterns = [
            r'\[.*?\]\((\/[^)]+)\)',  # Markdown links with absolute paths
            r'`(\/[^`]+)`',  # Code blocks with paths
            r'"(\/[^"]+)"',  # Quoted paths
            r"'(\/[^']+)'",  # Single-quoted paths
        ]

        for pattern in link_patterns:
            for match in re.finditer(pattern, content):
                path_str = match.group(1)

                # Skip URLs and anchors
                if path_str.startswith('http') or path_str.startswith('#'):
                    continue

                # Check if path exists relative to repo root
                repo_root = template_path.parent.parent
                full_path = repo_root / path_str.lstrip('/')

                if not full_path.exists():
                    broken_links.append(f"Broken internal link: {path_str}")

        return broken_links

    def validate_template(self, template_path: Path) -> Dict[str, Any]:
        """Validate a single template file.

        Args:
            template_path: Path to template markdown file

        Returns:
            Validation result dict
        """
        result = {
            "path": str(template_path),
            "valid": True,
            "violations": [],
            "warnings": []
        }

        try:
            with open(template_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except IOError as e:
            result["valid"] = False
            result["violations"].append(f"Cannot read file: {e}")
            return result

        # Extract frontmatter
        frontmatter = self._extract_frontmatter(content)

        if frontmatter is None:
            result["valid"] = False
            result["violations"].append("No valid frontmatter found")
            return result

        # Check required fields
        missing_fields = self.REQUIRED_FIELDS - set(frontmatter.keys())
        if missing_fields:
            result["valid"] = False
            result["violations"].append(
                f"Missing required frontmatter fields: {missing_fields}"
            )

        # Check SOP permissions
        sop_violations = self._check_sop_permissions(
            template_path.name,
            frontmatter
        )
        if sop_violations:
            result["valid"] = False
            result["violations"].extend(sop_violations)

        # Check internal links
        broken_links = self._check_internal_links(content, template_path)
        if broken_links:
            result["warnings"].extend(broken_links)

        # Check for tool typos/inconsistencies
        tools = self._parse_tools(frontmatter.get("tools", ""))
        known_tools = {
            "Bash", "Glob", "Grep", "LS", "Read", "Edit", "Write",
            "MultiEdit", "NotebookEdit", "WebFetch", "TodoWrite",
            "WebSearch", "BashOutput", "KillBash", "Task", "ExitPlanMode"
        }

        unknown_tools = set(tools) - known_tools
        if unknown_tools:
            result["warnings"].append(
                f"Unknown tools found: {unknown_tools}"
            )

        return result

    def validate_all(self, source_dir: Optional[str] = None) -> Dict[str, Any]:
        """Validate all templates in directory.

        Args:
            source_dir: Directory containing templates (default: self.template_dir)

        Returns:
            Validation summary
        """
        if source_dir:
            template_dir = Path(source_dir)
        else:
            template_dir = self.template_dir

        if not template_dir.exists():
            return {
                "valid": False,
                "error": f"Template directory not found: {template_dir}",
                "templates": []
            }

        results = []
        all_valid = True
        total_violations = 0
        total_warnings = 0

        # Find all .md files
        for template_path in template_dir.glob("*.md"):
            result = self.validate_template(template_path)
            results.append(result)

            if not result["valid"]:
                all_valid = False

            total_violations += len(result["violations"])
            total_warnings += len(result["warnings"])

        return {
            "valid": all_valid,
            "templates_checked": len(results),
            "total_violations": total_violations,
            "total_warnings": total_warnings,
            "templates": results
        }

    def generate_report(
        self,
        validation_results: Dict[str, Any],
        format: str = "json"
    ) -> str:
        """Generate validation report.

        Args:
            validation_results: Results from validate_all()
            format: Output format ('json' or 'text')

        Returns:
            Formatted report string
        """
        if format == "json":
            return json.dumps(validation_results, indent=2, sort_keys=True)

        # Text format
        lines = []
        lines.append("=== Agent Template Validation Report ===\n")
        lines.append(f"Templates checked: {validation_results['templates_checked']}")
        lines.append(f"Total violations: {validation_results['total_violations']}")
        lines.append(f"Total warnings: {validation_results['total_warnings']}")
        lines.append(f"Overall status: {'PASS' if validation_results['valid'] else 'FAIL'}\n")

        for template in validation_results.get("templates", []):
            lines.append(f"\n{template['path']}:")
            lines.append(f"  Status: {'✓' if template['valid'] else '✗'}")

            if template["violations"]:
                lines.append("  Violations:")
                for v in template["violations"]:
                    lines.append(f"    - {v}")

            if template["warnings"]:
                lines.append("  Warnings:")
                for w in template["warnings"]:
                    lines.append(f"    - {w}")

        return "\n".join(lines)


# Module-level convenience function
def validate_templates(source_dir: str) -> Tuple[bool, Dict[str, Any]]:
    """Validate all templates in directory.

    Args:
        source_dir: Directory containing agent templates

    Returns:
        Tuple of (all_valid, validation_results)
    """
    validator = TemplateValidator()
    results = validator.validate_all(source_dir)
    return results["valid"], results