#!/usr/bin/env python3 """ Phaser RAG Quality Evaluation Suite ==================================== Simulates an LLM querying TrueRef for Phaser documentation and guidance. Tests are designed to be hard and objective: each defines exact expected content fragments and/or expected source files that MUST appear in the top-k results. Scoring metrics per test: file@1 - expected file appeared as hit #1 file@3 - expected file appeared in hits 1-3 file@5 - expected file appeared in hits 1-5 content@5 - at least one expected content fragment found across the top-5 hits combined content@1 - expected content fragment found in hit #1 Overall suite scores: MRR - Mean Reciprocal Rank (file position) P@1..5 - Precision@k for file hits C@5 - Content recall across top-5 Run: python3 phaser_rag_eval.py [--base-url http://localhost:18080] [--verbose] """ import argparse import json import sys import time from dataclasses import dataclass, field from typing import Optional import urllib.request import urllib.error # --------------------------------------------------------------------------- # Config # --------------------------------------------------------------------------- REPO_ID = "50010965-aa3f-45f4-bb8d-72a0d50bf0db" # Version IDs pinned to specific tags (fetched at startup if not found) VERSIONS = { "v4.1.0": "6c6a00f5-0945-4fd7-b62c-c0e69f14effe", "v3.88.0": "d032d4d4-e6bc-4c9d-9c3c-8853e4a1cdc9", "v3.85.2": "d1cf906e-54b9-416f-bd5b-9432d69d9935", "v3.60.0": "95d0a8e2-9071-4986-85d4-59ae97893353", } # --------------------------------------------------------------------------- # Test definition # --------------------------------------------------------------------------- @dataclass class TestCase: id: str name: str query: str version: str # key into VERSIONS topic: Optional[str] = None expected_files: list[str] = field(default_factory=list) # substrings of filePath expected_content: list[str] = field(default_factory=list) # substrings that MUST appear required_content: list[str] = field(default_factory=list) # ALL of these must appear (stricter) max_hits: int = 10 tokens_budget: int = 6000 # Optional: minimum rerank score the top hit should exceed min_score: Optional[float] = None # --------------------------------------------------------------------------- # Test definitions — 25 hard, objective cases # --------------------------------------------------------------------------- TESTS: list[TestCase] = [ # ── 1. Tween system: basic config properties ────────────────────────── TestCase( id="T01", name="Tween config: yoyo/hold/repeatDelay properties", query="What properties can I set in a TweenBuilderConfig to make a tween yoyo with a hold and repeat delay?", version="v4.1.0", topic="tweens", expected_files=["tweens/builders/TweenBuilder.js", "tweens/typedefs"], expected_content=["yoyo", "hold", "repeatDelay"], required_content=["yoyo", "repeatDelay"], ), # ── 2. Tween system: onComplete / onUpdate callbacks ────────────────── TestCase( id="T02", name="Tween callbacks: onComplete and onUpdate signatures", query="How do I use onComplete and onUpdate callbacks in a Phaser tween? What arguments do they receive?", version="v4.1.0", topic="tweens", expected_files=["tweens/"], expected_content=["onComplete", "onUpdate", "onStart"], required_content=["onComplete"], ), # ── 3. Arcade physics: setCollideWorldBounds signature ──────────────── TestCase( id="T03", name="Arcade physics: setCollideWorldBounds signature", query="What are the parameters of setCollideWorldBounds in Phaser Arcade physics? Can I pass bounceX and bounceY to set bounce on world edges?", version="v4.1.0", topic="physics", expected_files=["physics/arcade/Body.js"], expected_content=["setCollideWorldBounds", "bounceX", "bounceY", "onWorldBounds"], required_content=["setCollideWorldBounds", "bounceX"], ), # ── 4. Arcade physics: addCollider vs addOverlap ────────────────────── TestCase( id="T04", name="Arcade physics: addCollider vs addOverlap difference", query="What is the difference between addCollider and addOverlap in Phaser's Arcade physics World? How do I add a callback?", version="v4.1.0", topic="physics", expected_files=["physics/arcade/World.js"], expected_content=["addCollider", "addOverlap", "collideCallback", "processCallback"], required_content=["addCollider", "addOverlap"], ), # ── 5. Camera: shake parameters ─────────────────────────────────────── TestCase( id="T05", name="Camera shake: duration, intensity, force, callback", query="How do I make the camera shake in Phaser? What parameters does camera.shake accept?", version="v4.1.0", topic="camera", expected_files=["cameras/2d/Camera.js"], expected_content=["shake", "duration", "intensity", "force", "callback"], required_content=["shake", "intensity"], ), # ── 6. Camera: startFollow with lerp ───────────────────────────────── TestCase( id="T06", name="Camera follow: startFollow lerpX lerpY parameters", query="How do I make the Phaser camera follow a player with smooth lerp? What are the lerpX and lerpY parameters?", version="v4.1.0", topic="camera", expected_files=["cameras/2d/Camera.js"], expected_content=["startFollow", "lerpX", "lerpY", "roundPixels"], required_content=["startFollow", "lerpX"], ), # ── 7. Camera: setDeadzone ──────────────────────────────────────────── TestCase( id="T07", name="Camera deadzone: setDeadzone width/height", query="How does camera deadzone work in Phaser? How do I create a rectangular deadzone so the camera only moves when the player exits it?", version="v4.1.0", topic="camera", expected_files=["cameras/2d/Camera.js"], expected_content=["setDeadzone", "deadzone"], required_content=["setDeadzone"], ), # ── 8. Scene: pass data when starting another scene ─────────────────── TestCase( id="T08", name="Scene management: pass data on scene.start", query="How do I pass data to another scene when calling scene.start() or scene.launch()? How does the init method receive it?", version="v4.1.0", topic="scenes", expected_files=["scene/"], expected_content=["init", "data", "start", "launch"], required_content=["init"], ), # ── 9. Animation system: chaining animations ────────────────────────── TestCase( id="T09", name="Animation chaining: chain() and playAfterRepeat()", query="How can I chain multiple animations so one plays after another finishes in Phaser? What is the chain() method?", version="v4.1.0", topic="animations", expected_files=["gameobjects/sprite/Sprite.js", "animations/"], expected_content=["chain", "playAfterRepeat", "playAfterDelay"], required_content=["chain"], ), # ── 10. Animation system: events ───────────────────────────────────── TestCase( id="T10", name="Animation events: ANIMATION_COMPLETE, ANIMATION_START", query="What events does the Phaser animation system emit? How do I listen for when an animation completes on a specific sprite?", version="v4.1.0", topic="animations", expected_files=["animations/events/"], expected_content=["ANIMATION_COMPLETE", "ANIMATION_START", "ANIMATION_STOP"], required_content=["ANIMATION_COMPLETE"], ), # ── 11. Input: pointer events ───────────────────────────────────────── TestCase( id="T11", name="Input: setInteractive + pointerdown/pointerover events", query="How do I call setInteractive on a game object and listen for pointerdown and pointerover events in Phaser?", version="v4.1.0", topic="input", expected_files=["input/"], expected_content=["pointerdown", "pointerover", "pointerout", "setInteractive"], required_content=["setInteractive", "pointerdown"], ), # ── 12. Input: keyboard cursor keys ────────────────────────────────── TestCase( id="T12", name="Input: createCursorKeys and keyboard key states", query="How do I read arrow key input in Phaser? How does createCursorKeys() work and how do I check if a key is down?", version="v4.1.0", topic="input", expected_files=["input/keyboard/"], expected_content=["createCursorKeys", "isDown", "up", "down", "left", "right"], required_content=["createCursorKeys"], ), # ── 13. Loader: atlas and texture keys ─────────────────────────────── TestCase( id="T13", name="Loader: load.atlas config object and frame keys", query="How do I load a texture atlas in Phaser? What are the arguments to this.load.atlas() and how do I use frame keys?", version="v4.1.0", topic="loader", expected_files=["loader/filetypes/AtlasJSONFile.js", "loader/"], expected_content=["atlas", "textureURL", "atlasURL", "frameConfig"], required_content=["atlas"], min_score=0.7, ), # ── 14. Tilemaps: setCollisionBetween ──────────────────────────────── TestCase( id="T14", name="Tilemap: setCollisionBetween start/stop parameters", query="How do I set collision on a range of tile indices in a Phaser tilemap? What does setCollisionBetween do?", version="v4.1.0", topic="tilemaps", expected_files=["tilemaps/Tilemap.js", "tilemaps/"], expected_content=["setCollisionBetween", "start", "stop", "collides", "recalculateFaces"], required_content=["setCollisionBetween"], ), # ── 15. Tilemaps: createFromObjects ────────────────────────────────── TestCase( id="T15", name="Tilemap: createFromObjects from Tiled object layer", query="How do I convert Tiled object layer objects into Phaser game objects? How does createFromObjects work?", version="v4.1.0", topic="tilemaps", expected_files=["tilemaps/Tilemap.js"], expected_content=["createFromObjects", "objectLayerName"], required_content=["createFromObjects"], ), # ── 16. RenderTexture: beginDraw / endDraw (v3 API) ────────────────── TestCase( id="T16", name="RenderTexture v3: beginDraw / batchDraw / endDraw pattern", query="How do I use beginDraw and endDraw on a Phaser RenderTexture for batch drawing? What is the workflow?", version="v3.85.2", topic="rendering", expected_files=["textures/DynamicTexture.js"], expected_content=["beginDraw", "endDraw", "batchDraw", "batchDrawFrame"], required_content=["beginDraw", "endDraw"], ), # ── 17. Masking: BitmapMask vs GeometryMask (v3 API) ────────────────── TestCase( id="T17", name="Masking v3: createBitmapMask vs createGeometryMask", query="What is the difference between a BitmapMask and a GeometryMask in Phaser? How do I create and apply them?", version="v3.85.2", topic="rendering", expected_files=["gameobjects/components/Mask.js", "display/mask/"], expected_content=["createBitmapMask", "createGeometryMask", "setMask", "BitmapMask", "GeometryMask"], required_content=["BitmapMask", "GeometryMask"], ), # ── 18. Groups: getFirstDead / getFirstAlive pool pattern ───────────── TestCase( id="T18", name="Group: object pool with getFirstDead / getFirstAlive", query="How do I implement an object pool in Phaser using a Group? What are getFirstDead and getFirstAlive?", version="v4.1.0", topic="gameobjects", expected_files=["gameobjects/group/Group.js"], expected_content=["getFirstDead", "getFirstAlive", "createIfNull", "countActive"], required_content=["getFirstDead", "getFirstAlive"], ), # ── 19. Matter.js: fromVertices custom body shape ───────────────────── TestCase( id="T19", name="Matter.js: custom body shape with fromVertices", query="How do I create a custom polygon physics body from vertices in Phaser's Matter.js physics?", version="v4.1.0", topic="physics", expected_files=["physics/matter-js/Factory.js", "physics/matter-js/"], expected_content=["fromVertices", "vertexSets", "options"], required_content=["fromVertices"], ), # ── 20. Game config: FPS limit / target ─────────────────────────────── TestCase( id="T20", name="Game config: fps.target and fps.limit settings", query="How do I configure the target frame rate and FPS limit in the Phaser game config? What is the difference between target and limit?", version="v4.1.0", topic="core", expected_files=["core/TimeStep.js", "core/Config.js"], expected_content=["targetFps", "fpsLimit", "target", "fps"], required_content=["targetFps"], ), # ── 21. Scale Manager: ScaleModes ──────────────────────────────────── TestCase( id="T21", name="Scale Manager: FIT vs ENVELOP scale modes", query="What scale modes are available in Phaser's Scale Manager? How does FIT differ from ENVELOP? How do I make a responsive game?", version="v4.1.0", topic="scale", expected_files=["scale/"], expected_content=["FIT", "ENVELOP", "ScaleManager", "autoCenter"], required_content=["FIT"], ), # ── 22. Data Manager: set/get/events ────────────────────────────────── TestCase( id="T22", name="Data Manager: set/get and CHANGE_DATA event", query="How does the Phaser Data Manager work? How do I watch for data changes using events on a game object's data?", version="v4.1.0", topic="data", expected_files=["data/DataManager.js", "data/"], expected_content=["CHANGE_DATA", "set", "get", "events"], required_content=["CHANGE_DATA"], ), # ── 23. Depth sort: setDepth and displayList ────────────────────────── TestCase( id="T23", name="Depth sorting: setDepth and display list ordering", query="How does Phaser handle rendering order (z-order)? How do I use setDepth to control which objects render on top?", version="v4.1.0", topic="rendering", expected_files=["gameobjects/"], expected_content=["setDepth", "depth", "displayList"], required_content=["setDepth"], ), # ── 24. Version diff: v3.60 TweenChain (new in 3.60) ───────────────── TestCase( id="T24", name="Version-specific: TweenChain introduced in v3.60", query="How do I create a sequence of tweens that play one after another using TweenChain in Phaser 3.60+?", version="v3.60.0", topic="tweens", expected_files=["tweens/"], expected_content=["TweenChain", "chain"], required_content=["TweenChain"], ), # ── 25. Hard adversarial: camera.ignore() ───────────────────────────── TestCase( id="T25", name="Camera: ignore() to exclude game objects from a camera", query="How do I make a game object invisible to a specific camera in Phaser while remaining visible to others? What is camera.ignore()?", version="v4.1.0", topic="camera", expected_files=["cameras/2d/"], expected_content=["ignore", "camera"], required_content=["ignore"], ), ] # --------------------------------------------------------------------------- # HTTP helpers # --------------------------------------------------------------------------- def post_json(url: str, payload: dict) -> dict: body = json.dumps(payload).encode() req = urllib.request.Request( url, data=body, headers={"Content-Type": "application/json", "Accept": "application/json"}, method="POST", ) with urllib.request.urlopen(req, timeout=30) as resp: return json.loads(resp.read().decode()) def get_json(url: str) -> dict | list: req = urllib.request.Request(url, headers={"Accept": "application/json"}) with urllib.request.urlopen(req, timeout=15) as resp: return json.loads(resp.read().decode()) # --------------------------------------------------------------------------- # Evaluation logic # --------------------------------------------------------------------------- @dataclass class TestResult: test: TestCase hits: list[dict] elapsed_ms: float error: Optional[str] = None # Computed below file_rank: Optional[int] = None # 1-based rank of first expected-file match content_ranks: list[int] = field(default_factory=list) # 1-based ranks where content found required_found: bool = False top_score: Optional[float] = None def file_at(self, k: int) -> bool: return self.file_rank is not None and self.file_rank <= k def content_at(self, k: int) -> bool: return any(r <= k for r in self.content_ranks) def mrr(self) -> float: if self.file_rank is None: return 0.0 return 1.0 / self.file_rank def summary_line(self) -> str: f1 = "✓" if self.file_at(1) else "·" f3 = "✓" if self.file_at(3) else "·" f5 = "✓" if self.file_at(5) else "·" c5 = "✓" if self.content_at(5) else "·" req = "✓" if self.required_found else "✗" rank_str = f"rank={self.file_rank}" if self.file_rank else "NOT FOUND" score_str = f"score={self.top_score:.3f}" if self.top_score else "" ms_str = f"{self.elapsed_ms:.0f}ms" return ( f"[{self.test.id}] {self.test.name[:52]:<52} " f"f@1={f1} f@3={f3} f@5={f5} c@5={c5} req={req} " f"{rank_str:>12} {score_str} {ms_str}" ) def evaluate(result: TestResult, verbose: bool = False) -> None: hits = result.hits if not hits: return result.top_score = hits[0].get("score") if hits else None # File rank: position of first hit whose filePath matches any expected_files substring for i, hit in enumerate(hits): fp = hit.get("filePath", "") if any(ef in fp for ef in result.test.expected_files): result.file_rank = i + 1 break # Content rank: for each expected_content fragment, find the first hit that contains it combined_content = {i: (hit.get("content") or "") for i, hit in enumerate(hits)} for fragment in result.test.expected_content: for i, content in combined_content.items(): if fragment.lower() in content.lower(): result.content_ranks.append(i + 1) break # Required content: ALL required fragments must appear somewhere in top-10 all_content = " ".join(combined_content.values()).lower() result.required_found = all( r.lower() in all_content for r in result.test.required_content ) if verbose: print(f"\n{'─'*80}") print(f"[{result.test.id}] {result.test.name}") print(f" Query: {result.test.query}") print(f" Expected files: {result.test.expected_files}") print(f" Expected content: {result.test.expected_content}") print(f" Top hits:") for i, hit in enumerate(hits[:5]): fp = hit.get("filePath", "?") score = hit.get("score", 0.0) snip = (hit.get("content") or "")[:100].replace("\n", " ") marker = " ← FILE MATCH" if any(ef in fp for ef in result.test.expected_files) else "" print(f" [{i+1}] score={score:.3f} {fp}{marker}") print(f" {snip}") # --------------------------------------------------------------------------- # Main runner # --------------------------------------------------------------------------- def run(base_url: str, verbose: bool) -> None: base_url = base_url.rstrip("/") search_url = f"{base_url}/api/search" versions_url = f"{base_url}/api/repos/{REPO_ID}/versions" print(f"TrueRef Phaser RAG Evaluation Suite") print(f"Server : {base_url}") print(f"Tests : {len(TESTS)}") print() # Resolve version IDs from server (in case they differ) try: all_versions = get_json(versions_url) live_map = {v["tag"]: v["id"] for v in all_versions if v.get("status") == "INDEXED"} for tag in list(VERSIONS.keys()): if tag in live_map: VERSIONS[tag] = live_map[tag] except Exception as e: print(f"WARN: could not refresh version IDs: {e}") results: list[TestResult] = [] for tc in TESTS: version_id = VERSIONS.get(tc.version) if not version_id: print(f"SKIP [{tc.id}]: version {tc.version} not available") continue payload = { "text": tc.query, "scope": [{"repoId": REPO_ID, "versionId": version_id}], "maxHits": tc.max_hits, "tokensBudget": tc.tokens_budget, } if tc.topic: payload["topic"] = tc.topic t0 = time.time() try: resp = post_json(search_url, payload) elapsed = (time.time() - t0) * 1000 hits = resp.get("hits", []) tr = TestResult(test=tc, hits=hits, elapsed_ms=elapsed) evaluate(tr, verbose=verbose) except Exception as e: elapsed = (time.time() - t0) * 1000 tr = TestResult(test=tc, hits=[], elapsed_ms=elapsed, error=str(e)) print(f"ERROR [{tc.id}]: {e}") results.append(tr) # ── Summary table ───────────────────────────────────────────────────── print() print("=" * 110) print(f"{'TEST ID + NAME':<56} {'f@1':>4} {'f@3':>4} {'f@5':>4} {'c@5':>4} {'req':>4} {'file rank':>12} {'score':>10} {'ms':>6}") print("=" * 110) for tr in results: if tr.error: print(f"[{tr.test.id}] {'ERROR: ' + tr.test.name[:45]:<52} ERROR: {tr.error[:40]}") else: print(tr.summary_line()) # ── Aggregate metrics ───────────────────────────────────────────────── valid = [tr for tr in results if not tr.error] n = len(valid) if n == 0: print("\nNo valid results.") return mrr = sum(tr.mrr() for tr in valid) / n p_at_1 = sum(1 for tr in valid if tr.file_at(1)) / n p_at_3 = sum(1 for tr in valid if tr.file_at(3)) / n p_at_5 = sum(1 for tr in valid if tr.file_at(5)) / n content_at5 = sum(1 for tr in valid if tr.content_at(5)) / n req_recall = sum(1 for tr in valid if tr.required_found) / n avg_ms = sum(tr.elapsed_ms for tr in valid) / n print("=" * 110) print() print("Aggregate metrics:") print(f" MRR (file) : {mrr:.4f} ({mrr*100:.1f}%)") print(f" Precision@1 (file) : {p_at_1:.4f} ({p_at_1*100:.1f}%)") print(f" Precision@3 (file) : {p_at_3:.4f} ({p_at_3*100:.1f}%)") print(f" Precision@5 (file) : {p_at_5:.4f} ({p_at_5*100:.1f}%)") print(f" Content recall@5 : {content_at5:.4f} ({content_at5*100:.1f}%)") print(f" Required recall : {req_recall:.4f} ({req_recall*100:.1f}%) ← hardest: ALL required fragments in top-10") print(f" Avg query latency : {avg_ms:.0f} ms") print() # ── Failure analysis ────────────────────────────────────────────────── failures = [tr for tr in valid if not tr.file_at(5) or not tr.required_found] if failures: print(f"Improvement targets ({len(failures)} tests below par):") for tr in failures: issues = [] if not tr.file_at(5): issues.append(f"file not in top-5 (rank={tr.file_rank})") if not tr.required_found: missing = [r for r in tr.test.required_content if r.lower() not in " ".join(h.get("content","") for h in tr.hits).lower()] issues.append(f"required content missing: {missing}") print(f" [{tr.test.id}] {tr.test.name}: {'; '.join(issues)}") else: print("All tests passed file@5 and required-content checks.") # Exit code: 0 if MRR ≥ 0.5 AND required recall ≥ 0.8, else 1 if mrr >= 0.5 and req_recall >= 0.8: print("\nResult: PASS") sys.exit(0) else: print(f"\nResult: FAIL (MRR={mrr:.3f} threshold=0.5, req_recall={req_recall:.3f} threshold=0.8)") sys.exit(1) # --------------------------------------------------------------------------- # Entry point # --------------------------------------------------------------------------- if __name__ == "__main__": parser = argparse.ArgumentParser(description="Phaser RAG quality evaluation") parser.add_argument("--base-url", default="http://localhost:18080", help="TrueRef server base URL") parser.add_argument("--verbose", action="store_true", help="Print per-test hit details") args = parser.parse_args() run(args.base_url, args.verbose)