trueref/tests/quality/phaser_rag_eval.py

#!/usr/bin/env python3
"""
Phaser RAG Quality Evaluation Suite
====================================
Simulates an LLM querying TrueRef for Phaser documentation and guidance.
Tests are designed to be hard and objective: each defines exact expected content
fragments and/or expected source files that MUST appear in the top-k results.

Scoring metrics per test:
  file@1    - expected file appeared as hit #1
  file@3    - expected file appeared in hits 1-3
  file@5    - expected file appeared in hits 1-5
  content@5 - at least one expected content fragment found across the top-5 hits combined
  content@1 - expected content fragment found in hit #1

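Overall suite scores (each averaged over the tests that returned without error):
  MRR      - Mean Reciprocal Rank of the first expected-file hit
  P@1/3/5  - Fraction of tests whose expected file appears within the top 1, 3 or 5 hits
  C@5      - Fraction of tests with at least one expected content fragment in the top-5 hits
  Req      - Fraction of tests where ALL required fragments appear in the returned hits

The run reports PASS only when MRR >= 0.5 and Req >= 0.8.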

Run:
  python3 phaser_rag_eval.py [--base-url http://localhost:18080] [--verbose]
"""

import argparse
import json
import sys
import time
from dataclasses import dataclass, field
from typing import Optional
import urllib.request
import urllib.error

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# ID of the Phaser repository on the TrueRef server. It is used to build the
# versions URL and is sent as ``repoId`` in the scope of every search request.
REPO_ID = "50010965-aa3f-45f4-bb8d-72a0d50bf0db"

# Version tag -> version ID. The hard-coded IDs are fallbacks: at startup run()
# asks the server for this repository's versions and overwrites the entry of
# every tag listed here that the server reports with status "INDEXED". If that
# request fails, a warning is printed and these values are used unchanged.
VERSIONS = {
    "v4.1.0":  "6c6a00f5-0945-4fd7-b62c-c0e69f14effe",
    "v3.88.0": "d032d4d4-e6bc-4c9d-9c3c-8853e4a1cdc9",
    "v3.85.2": "d1cf906e-54b9-416f-bd5b-9432d69d9935",
    "v3.60.0": "95d0a8e2-9071-4986-85d4-59ae97893353",
}

# ---------------------------------------------------------------------------
# Test definition
# ---------------------------------------------------------------------------
@dataclass
class TestCase:
    """One retrieval test: a query plus the files and fragments its hits are scored against."""
    # Short identifier shown in the report, e.g. "T01".
    id: str
    # Human-readable description shown in the report.
    name: str
    # Natural-language question sent as the search text.
    query: str
    version: str                        # key into VERSIONS
    # Optional topic; it is added to the search request only when set.
    topic: Optional[str] = None
    # Substrings of filePath. The first hit whose path contains any of them
    # determines the file rank.
    expected_files: list[str] = field(default_factory=list)
    # Fragments looked up case-insensitively in the hit contents. For each one
    # that is found, the rank of the first hit containing it is recorded;
    # content@k holds when at least one fragment was found within the top k.
    expected_content: list[str] = field(default_factory=list)
    # Fragments that must ALL appear (case-insensitively) somewhere in the
    # returned hits (stricter than expected_content).
    required_content: list[str] = field(default_factory=list)
    # Sent to the server as "maxHits" and "tokensBudget".
    max_hits: int = 10
    tokens_budget: int = 6000
    # Optional: minimum rerank score the top hit should exceed
    # NOTE(review): nothing in the visible code reads this field, so the
    # threshold is currently not enforced.
    min_score: Optional[float] = None

# ---------------------------------------------------------------------------
# Test definitions — 25 hard, objective cases
# ---------------------------------------------------------------------------
# Every entry is sent to the search endpoint by run() and scored by evaluate().
# expected_files entries are plain substrings of a hit's filePath;
# expected_content / required_content entries are plain substrings of a hit's
# content and are compared case-insensitively.
TESTS: list[TestCase] = [

    # ── 1. Tween system: basic config properties ──────────────────────────
    TestCase(
        id="T01",
        name="Tween config: yoyo/hold/repeatDelay properties",
        query="What properties can I set in a TweenBuilderConfig to make a tween yoyo with a hold and repeat delay?",
        version="v4.1.0",
        topic="tweens",
        expected_files=["tweens/builders/TweenBuilder.js", "tweens/typedefs"],
        expected_content=["yoyo", "hold", "repeatDelay"],
        required_content=["yoyo", "repeatDelay"],
    ),

    # ── 2. Tween system: onComplete / onUpdate callbacks ──────────────────
    TestCase(
        id="T02",
        name="Tween callbacks: onComplete and onUpdate signatures",
        query="How do I use onComplete and onUpdate callbacks in a Phaser tween? What arguments do they receive?",
        version="v4.1.0",
        topic="tweens",
        expected_files=["tweens/"],
        expected_content=["onComplete", "onUpdate", "onStart"],
        required_content=["onComplete"],
    ),

    # ── 3. Arcade physics: setCollideWorldBounds signature ────────────────
    TestCase(
        id="T03",
        name="Arcade physics: setCollideWorldBounds signature",
        query="What are the parameters of setCollideWorldBounds in Phaser Arcade physics? Can I pass bounceX and bounceY to set bounce on world edges?",
        version="v4.1.0",
        topic="physics",
        expected_files=["physics/arcade/Body.js"],
        expected_content=["setCollideWorldBounds", "bounceX", "bounceY", "onWorldBounds"],
        required_content=["setCollideWorldBounds", "bounceX"],
    ),

    # ── 4. Arcade physics: addCollider vs addOverlap ──────────────────────
    TestCase(
        id="T04",
        name="Arcade physics: addCollider vs addOverlap difference",
        query="What is the difference between addCollider and addOverlap in Phaser's Arcade physics World? How do I add a callback?",
        version="v4.1.0",
        topic="physics",
        expected_files=["physics/arcade/World.js"],
        expected_content=["addCollider", "addOverlap", "collideCallback", "processCallback"],
        required_content=["addCollider", "addOverlap"],
    ),

    # ── 5. Camera: shake parameters ───────────────────────────────────────
    TestCase(
        id="T05",
        name="Camera shake: duration, intensity, force, callback",
        query="How do I make the camera shake in Phaser? What parameters does camera.shake accept?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/Camera.js"],
        expected_content=["shake", "duration", "intensity", "force", "callback"],
        required_content=["shake", "intensity"],
    ),

    # ── 6. Camera: startFollow with lerp ─────────────────────────────────
    TestCase(
        id="T06",
        name="Camera follow: startFollow lerpX lerpY parameters",
        query="How do I make the Phaser camera follow a player with smooth lerp? What are the lerpX and lerpY parameters?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/Camera.js"],
        expected_content=["startFollow", "lerpX", "lerpY", "roundPixels"],
        required_content=["startFollow", "lerpX"],
    ),

    # ── 7. Camera: setDeadzone ────────────────────────────────────────────
    TestCase(
        id="T07",
        name="Camera deadzone: setDeadzone width/height",
        query="How does camera deadzone work in Phaser? How do I create a rectangular deadzone so the camera only moves when the player exits it?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/Camera.js"],
        expected_content=["setDeadzone", "deadzone"],
        required_content=["setDeadzone"],
    ),

    # ── 8. Scene: pass data when starting another scene ───────────────────
    TestCase(
        id="T08",
        name="Scene management: pass data on scene.start",
        query="How do I pass data to another scene when calling scene.start() or scene.launch()? How does the init method receive it?",
        version="v4.1.0",
        topic="scenes",
        expected_files=["scene/"],
        expected_content=["init", "data", "start", "launch"],
        required_content=["init"],
    ),

    # ── 9. Animation system: chaining animations ──────────────────────────
    TestCase(
        id="T09",
        name="Animation chaining: chain() and playAfterRepeat()",
        query="How can I chain multiple animations so one plays after another finishes in Phaser? What is the chain() method?",
        version="v4.1.0",
        topic="animations",
        expected_files=["gameobjects/sprite/Sprite.js", "animations/"],
        expected_content=["chain", "playAfterRepeat", "playAfterDelay"],
        required_content=["chain"],
    ),

    # ── 10. Animation system: events ─────────────────────────────────────
    TestCase(
        id="T10",
        name="Animation events: ANIMATION_COMPLETE, ANIMATION_START",
        query="What events does the Phaser animation system emit? How do I listen for when an animation completes on a specific sprite?",
        version="v4.1.0",
        topic="animations",
        expected_files=["animations/events/"],
        expected_content=["ANIMATION_COMPLETE", "ANIMATION_START", "ANIMATION_STOP"],
        required_content=["ANIMATION_COMPLETE"],
    ),

    # ── 11. Input: pointer events ─────────────────────────────────────────
    TestCase(
        id="T11",
        name="Input: setInteractive + pointerdown/pointerover events",
        query="How do I call setInteractive on a game object and listen for pointerdown and pointerover events in Phaser?",
        version="v4.1.0",
        topic="input",
        expected_files=["input/"],
        expected_content=["pointerdown", "pointerover", "pointerout", "setInteractive"],
        required_content=["setInteractive", "pointerdown"],
    ),

    # ── 12. Input: keyboard cursor keys ──────────────────────────────────
    # NOTE(review): "up", "down", "left" and "right" are very short fragments.
    # Matching is a case-insensitive substring test, so they also match inside
    # longer words (e.g. "update"), which makes content@k easy to satisfy here.
    TestCase(
        id="T12",
        name="Input: createCursorKeys and keyboard key states",
        query="How do I read arrow key input in Phaser? How does createCursorKeys() work and how do I check if a key is down?",
        version="v4.1.0",
        topic="input",
        expected_files=["input/keyboard/"],
        expected_content=["createCursorKeys", "isDown", "up", "down", "left", "right"],
        required_content=["createCursorKeys"],
    ),

    # ── 13. Loader: atlas and texture keys ───────────────────────────────
    # NOTE(review): min_score is not read anywhere in the visible code, so the
    # 0.7 threshold below is currently not enforced.
    TestCase(
        id="T13",
        name="Loader: load.atlas config object and frame keys",
        query="How do I load a texture atlas in Phaser? What are the arguments to this.load.atlas() and how do I use frame keys?",
        version="v4.1.0",
        topic="loader",
        expected_files=["loader/filetypes/AtlasJSONFile.js", "loader/"],
        expected_content=["atlas", "textureURL", "atlasURL", "frameConfig"],
        required_content=["atlas"],
        min_score=0.7,
    ),

    # ── 14. Tilemaps: setCollisionBetween ────────────────────────────────
    TestCase(
        id="T14",
        name="Tilemap: setCollisionBetween start/stop parameters",
        query="How do I set collision on a range of tile indices in a Phaser tilemap? What does setCollisionBetween do?",
        version="v4.1.0",
        topic="tilemaps",
        expected_files=["tilemaps/Tilemap.js", "tilemaps/"],
        expected_content=["setCollisionBetween", "start", "stop", "collides", "recalculateFaces"],
        required_content=["setCollisionBetween"],
    ),

    # ── 15. Tilemaps: createFromObjects ──────────────────────────────────
    TestCase(
        id="T15",
        name="Tilemap: createFromObjects from Tiled object layer",
        query="How do I convert Tiled object layer objects into Phaser game objects? How does createFromObjects work?",
        version="v4.1.0",
        topic="tilemaps",
        expected_files=["tilemaps/Tilemap.js"],
        expected_content=["createFromObjects", "objectLayerName"],
        required_content=["createFromObjects"],
    ),

    # ── 16. RenderTexture: beginDraw / endDraw (v3 API) ──────────────────
    TestCase(
        id="T16",
        name="RenderTexture v3: beginDraw / batchDraw / endDraw pattern",
        query="How do I use beginDraw and endDraw on a Phaser RenderTexture for batch drawing? What is the workflow?",
        version="v3.85.2",
        topic="rendering",
        expected_files=["textures/DynamicTexture.js"],
        expected_content=["beginDraw", "endDraw", "batchDraw", "batchDrawFrame"],
        required_content=["beginDraw", "endDraw"],
    ),

    # ── 17. Masking: BitmapMask vs GeometryMask (v3 API) ──────────────────
    TestCase(
        id="T17",
        name="Masking v3: createBitmapMask vs createGeometryMask",
        query="What is the difference between a BitmapMask and a GeometryMask in Phaser? How do I create and apply them?",
        version="v3.85.2",
        topic="rendering",
        expected_files=["gameobjects/components/Mask.js", "display/mask/"],
        expected_content=["createBitmapMask", "createGeometryMask", "setMask", "BitmapMask", "GeometryMask"],
        required_content=["BitmapMask", "GeometryMask"],
    ),

    # ── 18. Groups: getFirstDead / getFirstAlive pool pattern ─────────────
    TestCase(
        id="T18",
        name="Group: object pool with getFirstDead / getFirstAlive",
        query="How do I implement an object pool in Phaser using a Group? What are getFirstDead and getFirstAlive?",
        version="v4.1.0",
        topic="gameobjects",
        expected_files=["gameobjects/group/Group.js"],
        expected_content=["getFirstDead", "getFirstAlive", "createIfNull", "countActive"],
        required_content=["getFirstDead", "getFirstAlive"],
    ),

    # ── 19. Matter.js: fromVertices custom body shape ─────────────────────
    TestCase(
        id="T19",
        name="Matter.js: custom body shape with fromVertices",
        query="How do I create a custom polygon physics body from vertices in Phaser's Matter.js physics?",
        version="v4.1.0",
        topic="physics",
        expected_files=["physics/matter-js/Factory.js", "physics/matter-js/"],
        expected_content=["fromVertices", "vertexSets", "options"],
        required_content=["fromVertices"],
    ),

    # ── 20. Game config: FPS limit / target ───────────────────────────────
    TestCase(
        id="T20",
        name="Game config: fps.target and fps.limit settings",
        query="How do I configure the target frame rate and FPS limit in the Phaser game config? What is the difference between target and limit?",
        version="v4.1.0",
        topic="core",
        expected_files=["core/TimeStep.js", "core/Config.js"],
        expected_content=["targetFps", "fpsLimit", "target", "fps"],
        required_content=["targetFps"],
    ),

    # ── 21. Scale Manager: ScaleModes ────────────────────────────────────
    TestCase(
        id="T21",
        name="Scale Manager: FIT vs ENVELOP scale modes",
        query="What scale modes are available in Phaser's Scale Manager? How does FIT differ from ENVELOP? How do I make a responsive game?",
        version="v4.1.0",
        topic="scale",
        expected_files=["scale/"],
        expected_content=["FIT", "ENVELOP", "ScaleManager", "autoCenter"],
        required_content=["FIT"],
    ),

    # ── 22. Data Manager: set/get/events ──────────────────────────────────
    TestCase(
        id="T22",
        name="Data Manager: set/get and CHANGE_DATA event",
        query="How does the Phaser Data Manager work? How do I watch for data changes using events on a game object's data?",
        version="v4.1.0",
        topic="data",
        expected_files=["data/DataManager.js", "data/"],
        expected_content=["CHANGE_DATA", "set", "get", "events"],
        required_content=["CHANGE_DATA"],
    ),

    # ── 23. Depth sort: setDepth and displayList ──────────────────────────
    TestCase(
        id="T23",
        name="Depth sorting: setDepth and display list ordering",
        query="How does Phaser handle rendering order (z-order)? How do I use setDepth to control which objects render on top?",
        version="v4.1.0",
        topic="rendering",
        expected_files=["gameobjects/"],
        expected_content=["setDepth", "depth", "displayList"],
        required_content=["setDepth"],
    ),

    # ── 24. Version diff: v3.60 TweenChain (new in 3.60) ─────────────────
    TestCase(
        id="T24",
        name="Version-specific: TweenChain introduced in v3.60",
        query="How do I create a sequence of tweens that play one after another using TweenChain in Phaser 3.60+?",
        version="v3.60.0",
        topic="tweens",
        expected_files=["tweens/"],
        expected_content=["TweenChain", "chain"],
        required_content=["TweenChain"],
    ),

    # ── 25. Hard adversarial: camera.ignore() ─────────────────────────────
    TestCase(
        id="T25",
        name="Camera: ignore() to exclude game objects from a camera",
        query="How do I make a game object invisible to a specific camera in Phaser while remaining visible to others? What is camera.ignore()?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/"],
        expected_content=["ignore", "camera"],
        required_content=["ignore"],
    ),
]

# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------

def post_json(url: str, payload: dict) -> dict:
    """Send ``payload`` to ``url`` as a JSON POST and return the parsed JSON reply.

    The request waits at most 30 seconds. Any failure (connection problem,
    HTTP error status, undecodable or non-JSON body) propagates to the caller
    as the exception raised by ``urllib`` or ``json``.
    """
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json", "Accept": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        raw_body = response.read()
    return json.loads(raw_body.decode())


def get_json(url: str) -> dict | list:
    """Fetch ``url`` with a GET request and return its body parsed as JSON.

    The request waits at most 15 seconds. Errors raised by ``urllib`` or by
    JSON decoding are not caught here.
    """
    request = urllib.request.Request(url, headers={"Accept": "application/json"})
    with urllib.request.urlopen(request, timeout=15) as response:
        raw_body = response.read()
    return json.loads(raw_body.decode())


# ---------------------------------------------------------------------------
# Evaluation logic
# ---------------------------------------------------------------------------

@dataclass
class TestResult:
    """Outcome of one test: the raw hits plus the scores derived from them.

    ``test``, ``hits``, ``elapsed_ms`` and ``error`` are supplied when the
    result is created; the remaining fields start at their defaults and are
    filled in by ``evaluate()``.
    """
    test: TestCase
    hits: list[dict]
    elapsed_ms: float
    # Error message when the request failed; None for a successful request.
    error: Optional[str] = None

    # Computed by evaluate()
    file_rank: Optional[int] = None   # 1-based rank of first expected-file match
    content_ranks: list[int] = field(default_factory=list)  # 1-based ranks where content found
    required_found: bool = False      # True when every required fragment was found
    top_score: Optional[float] = None  # score of hit #1, None when unavailable

    def file_at(self, k: int) -> bool:
        """Return True when an expected file was found at rank ``k`` or better."""
        return self.file_rank is not None and self.file_rank <= k

    def content_at(self, k: int) -> bool:
        """Return True when at least one expected fragment was found within the top ``k`` hits."""
        return any(rank <= k for rank in self.content_ranks)

    def mrr(self) -> float:
        """Return the reciprocal of the file rank, or 0.0 when no expected file was found."""
        if self.file_rank is None:
            return 0.0
        return 1.0 / self.file_rank

    def summary_line(self) -> str:
        """Format this result as one row of the summary table."""
        f1 = "✓" if self.file_at(1) else "·"
        f3 = "✓" if self.file_at(3) else "·"
        f5 = "✓" if self.file_at(5) else "·"
        c5 = "✓" if self.content_at(5) else "·"
        req = "✓" if self.required_found else "✗"
        rank_str = f"rank={self.file_rank}" if self.file_rank is not None else "NOT FOUND"
        # Compare against None explicitly: a score of exactly 0.0 is a real
        # value and must still be shown, not treated as "no score".
        score_str = f"score={self.top_score:.3f}" if self.top_score is not None else ""
        ms_str = f"{self.elapsed_ms:.0f}ms"
        return (
            f"[{self.test.id}] {self.test.name[:52]:<52} "
            f"f@1={f1} f@3={f3} f@5={f5} c@5={c5} req={req}  "
            f"{rank_str:>12}  {score_str}  {ms_str}"
        )


def evaluate(result: TestResult, verbose: bool = False) -> None:
    """Score ``result.hits`` against ``result.test`` and store the outcome on ``result``.

    The following fields of ``result`` are (re)computed:

    * ``top_score``      - the ``score`` value of the first hit (may be None).
    * ``file_rank``      - 1-based position of the first hit whose ``filePath``
      contains any ``expected_files`` substring, or None when there is none.
    * ``content_ranks``  - for every ``expected_content`` fragment that is
      found, the 1-based position of the first hit containing it.
    * ``required_found`` - True when every ``required_content`` fragment occurs
      somewhere in the contents of the returned hits.

    Content comparisons are case-insensitive substring tests. Hits whose
    ``filePath``, ``content`` or ``score`` is missing or null are tolerated:
    they are treated as an empty path, empty content and (for display only) a
    score of 0.0. Calling the function again on the same result recomputes the
    fields instead of appending to them.

    When ``result.hits`` is empty the function returns immediately and leaves
    ``result`` unchanged.

    Args:
        result: The result to score; it is modified in place.
        verbose: When True, print the query, the expectations and the first
            five hits.
    """
    hits = result.hits
    if not hits:
        return

    test = result.test
    result.top_score = hits[0].get("score")

    # Normalise every hit once: null/missing values become empty strings and
    # the content is lower-cased here rather than once per fragment.
    paths = [hit.get("filePath") or "" for hit in hits]
    contents = [(hit.get("content") or "").lower() for hit in hits]

    def matches_expected_file(path: str) -> bool:
        return any(expected in path for expected in test.expected_files)

    # File rank: position of the first hit whose path matches an expected file.
    result.file_rank = next(
        (rank for rank, path in enumerate(paths, start=1) if matches_expected_file(path)),
        None,
    )

    # Content ranks: for each expected fragment, the first hit containing it.
    # The list is rebuilt so that repeated evaluation does not duplicate ranks.
    result.content_ranks = []
    for fragment in test.expected_content:
        needle = fragment.lower()
        for rank, content in enumerate(contents, start=1):
            if needle in content:
                result.content_ranks.append(rank)
                break

    # Required content: ALL required fragments must appear somewhere in the
    # returned hits (their number is bounded by the request's maxHits).
    all_content = " ".join(contents)
    result.required_found = all(
        required.lower() in all_content for required in test.required_content
    )

    if verbose:
        print(f"\n{'─'*80}")
        print(f"[{test.id}] {test.name}")
        print(f"  Query: {test.query}")
        print(f"  Expected files: {test.expected_files}")
        print(f"  Expected content: {test.expected_content}")
        print("  Top hits:")
        for rank, (hit, path) in enumerate(zip(hits[:5], paths), start=1):
            shown_path = hit.get("filePath") or "?"
            # A null score would break the numeric format below.
            score = hit.get("score") or 0.0
            snip = (hit.get("content") or "")[:100].replace("\n", " ")
            marker = " ← FILE MATCH" if matches_expected_file(path) else ""
            print(f"    [{rank}] score={score:.3f}  {shown_path}{marker}")
            print(f"         {snip}")


# ---------------------------------------------------------------------------
# Main runner
# ---------------------------------------------------------------------------

def run(base_url: str, verbose: bool) -> None:
    """Run every case in TESTS against the server at ``base_url`` and print a report.

    Steps:

    1. Refresh VERSIONS from the server: entries whose tag the server lists
       with status "INDEXED" are overwritten in place. A failure here only
       prints a warning and the hard-coded IDs are used.
    2. Send one search request per test, time it, and score the response with
       ``evaluate()``. A test whose version has no ID is skipped; a test whose
       request or evaluation raises is recorded as an error.
    3. Print the per-test table, the aggregate metrics and the list of tests
       that missed file@5 or the required-content check.

    Aggregates are averaged over the tests that completed without error;
    skipped and errored tests are not part of the denominator.

    Args:
        base_url: Server base URL; a trailing slash is ignored.
        verbose: Passed to ``evaluate()`` to print per-test hit details.

    Raises:
        SystemExit: Always. The exit status is 0 when MRR >= 0.5 and required
            recall >= 0.8, and 1 otherwise, including the case where no test
            produced a valid result.
    """
    base_url = base_url.rstrip("/")
    search_url = f"{base_url}/api/search"
    versions_url = f"{base_url}/api/repos/{REPO_ID}/versions"

    print("TrueRef Phaser RAG Evaluation Suite")
    print(f"Server : {base_url}")
    print(f"Tests  : {len(TESTS)}")
    print()

    # Resolve version IDs from server (in case they differ)
    try:
        all_versions = get_json(versions_url)
        live_map = {v["tag"]: v["id"] for v in all_versions if v.get("status") == "INDEXED"}
        for tag in VERSIONS.keys() & live_map.keys():
            VERSIONS[tag] = live_map[tag]
    except Exception as e:
        # Best effort: fall back to the hard-coded IDs.
        print(f"WARN: could not refresh version IDs: {e}")

    results: list[TestResult] = []

    for tc in TESTS:
        version_id = VERSIONS.get(tc.version)
        if not version_id:
            print(f"SKIP [{tc.id}]: version {tc.version} not available")
            continue

        payload = {
            "text": tc.query,
            "scope": [{"repoId": REPO_ID, "versionId": version_id}],
            "maxHits": tc.max_hits,
            "tokensBudget": tc.tokens_budget,
        }
        if tc.topic:
            payload["topic"] = tc.topic

        # perf_counter is monotonic, so the latency cannot be distorted by
        # wall-clock adjustments during the request.
        started = time.perf_counter()
        try:
            resp = post_json(search_url, payload)
            elapsed = (time.perf_counter() - started) * 1000
            # "hits": null is treated like an absent key.
            hits = resp.get("hits") or []
            tr = TestResult(test=tc, hits=hits, elapsed_ms=elapsed)
            evaluate(tr, verbose=verbose)
        except Exception as e:
            elapsed = (time.perf_counter() - started) * 1000
            # Some exceptions stringify to ""; fall back to the class name so
            # the result is still recognised as an error below.
            message = str(e) or type(e).__name__
            tr = TestResult(test=tc, hits=[], elapsed_ms=elapsed, error=message)
            print(f"ERROR [{tc.id}]: {message}")

        results.append(tr)

    # ── Summary table ─────────────────────────────────────────────────────
    print()
    print("=" * 110)
    print(f"{'TEST ID + NAME':<56} {'f@1':>4} {'f@3':>4} {'f@5':>4} {'c@5':>4} {'req':>4}  {'file rank':>12}  {'score':>10}  {'ms':>6}")
    print("=" * 110)

    for tr in results:
        if tr.error:
            print(f"[{tr.test.id}] {'ERROR: ' + tr.test.name[:45]:<52}  ERROR: {tr.error[:40]}")
        else:
            print(tr.summary_line())

    # ── Aggregate metrics ─────────────────────────────────────────────────
    valid = [tr for tr in results if not tr.error]
    n = len(valid)
    if n == 0:
        # Nothing could be measured (e.g. server unreachable or every test
        # skipped): that must not look like a passing run to the caller.
        print("\nNo valid results.")
        sys.exit(1)

    mrr         = sum(tr.mrr() for tr in valid) / n
    p_at_1      = sum(1 for tr in valid if tr.file_at(1)) / n
    p_at_3      = sum(1 for tr in valid if tr.file_at(3)) / n
    p_at_5      = sum(1 for tr in valid if tr.file_at(5)) / n
    content_at5 = sum(1 for tr in valid if tr.content_at(5)) / n
    req_recall  = sum(1 for tr in valid if tr.required_found) / n
    avg_ms      = sum(tr.elapsed_ms for tr in valid) / n

    print("=" * 110)
    print()
    print("Aggregate metrics:")
    print(f"  MRR (file)           : {mrr:.4f}  ({mrr*100:.1f}%)")
    print(f"  Precision@1 (file)   : {p_at_1:.4f}  ({p_at_1*100:.1f}%)")
    print(f"  Precision@3 (file)   : {p_at_3:.4f}  ({p_at_3*100:.1f}%)")
    print(f"  Precision@5 (file)   : {p_at_5:.4f}  ({p_at_5*100:.1f}%)")
    print(f"  Content recall@5     : {content_at5:.4f}  ({content_at5*100:.1f}%)")
    print(f"  Required recall      : {req_recall:.4f}  ({req_recall*100:.1f}%)  ← hardest: ALL required fragments in top-10")
    print(f"  Avg query latency    : {avg_ms:.0f} ms")
    print()

    # ── Failure analysis ──────────────────────────────────────────────────
    failures = [tr for tr in valid if not tr.file_at(5) or not tr.required_found]
    if failures:
        print(f"Improvement targets ({len(failures)} tests below par):")
        for tr in failures:
            issues = []
            if not tr.file_at(5):
                issues.append(f"file not in top-5 (rank={tr.file_rank})")
            if not tr.required_found:
                # Join once, with the same null-safe handling evaluate() uses
                # for hit content.
                hit_text = " ".join((h.get("content") or "") for h in tr.hits).lower()
                missing = [r for r in tr.test.required_content
                           if r.lower() not in hit_text]
                issues.append(f"required content missing: {missing}")
            print(f"  [{tr.test.id}] {tr.test.name}: {'; '.join(issues)}")
    else:
        print("All tests passed file@5 and required-content checks.")

    # Exit code: 0 if MRR ≥ 0.5 AND required recall ≥ 0.8, else 1
    if mrr >= 0.5 and req_recall >= 0.8:
        print("\nResult: PASS")
        sys.exit(0)
    else:
        print(f"\nResult: FAIL  (MRR={mrr:.3f} threshold=0.5, req_recall={req_recall:.3f} threshold=0.8)")
        sys.exit(1)


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry: read the two command-line options and hand them to run().
    cli = argparse.ArgumentParser(description="Phaser RAG quality evaluation")
    cli.add_argument(
        "--base-url",
        default="http://localhost:18080",
        help="TrueRef server base URL",
    )
    cli.add_argument(
        "--verbose",
        action="store_true",
        help="Print per-test hit details",
    )
    options = cli.parse_args()
    run(base_url=options.base_url, verbose=options.verbose)