Files
trueref/tests/quality/phaser_rag_eval.py
moze c5f950c2c0
Some checks failed
Build and publish Docker image / Build and push (push) Failing after 1m27s
Initial commit: trueref v0.1.0-SNAPSHOT
Java 21 / Spring Boot 3.5.3 multi-module Maven project.
Hybrid BM25+HNSW search with RRF, cross-encoder reranker,
ONNX Runtime 1.22.0 (CPU + CUDA 12 GPU variants).
2026-05-06 00:49:16 +02:00

612 lines
27 KiB
Python

#!/usr/bin/env python3
"""
Phaser RAG Quality Evaluation Suite
====================================
Simulates an LLM querying TrueRef for Phaser documentation and guidance.
Tests are designed to be hard and objective: each defines exact expected content
fragments and/or expected source files that MUST appear in the top-k results.
Scoring metrics per test:
  file@1    - an expected file appeared as hit #1
  file@3    - an expected file appeared in hits 1-3
  file@5    - an expected file appeared in hits 1-5
  content@5 - at least one expected content fragment found across the top-5 hits combined
  content@1 - an expected content fragment found in hit #1

Overall suite scores:
  MRR       - Mean Reciprocal Rank (position of the first expected-file hit)
  P@1/3/5   - fraction of tests whose expected file appears in the top 1, 3 or 5 hits
  C@5       - Content recall across top-5
  Required  - fraction of tests where ALL required fragments appear in the returned hits

Run:
  python3 phaser_rag_eval.py [--base-url http://localhost:18080] [--verbose]
"""
import argparse
import json
import sys
import time
from dataclasses import dataclass, field
from typing import Optional
import urllib.request
import urllib.error
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Repository ID: sent as "repoId" in every search scope and used to build the
# versions endpoint URL in run().
REPO_ID = "50010965-aa3f-45f4-bb8d-72a0d50bf0db"
# Pinned tag -> version ID fallbacks. At startup run() fetches the server's
# version list and overwrites the entry for every tag reported as INDEXED;
# tags the server does not report (or a failed fetch) keep the ID pinned here.
VERSIONS = {
    "v4.1.0": "6c6a00f5-0945-4fd7-b62c-c0e69f14effe",
    "v3.88.0": "d032d4d4-e6bc-4c9d-9c3c-8853e4a1cdc9",
    "v3.85.2": "d1cf906e-54b9-416f-bd5b-9432d69d9935",
    "v3.60.0": "95d0a8e2-9071-4986-85d4-59ae97893353",
}
# ---------------------------------------------------------------------------
# Test definition
# ---------------------------------------------------------------------------
@dataclass
class TestCase:
    """Declarative description of one retrieval test.

    Attributes:
        id: Short identifier shown in the report (e.g. "T01").
        name: Human-readable title shown in the report.
        query: Natural-language question sent as the search text.
        version: Tag name used as a key into VERSIONS.
        topic: Optional topic hint; only sent with the request when set.
        expected_files: Substrings of ``filePath``; the first hit whose path
            contains any of them determines the file rank.
        expected_content: Content fragments, matched case-insensitively; a
            single fragment found within the top-k hits satisfies content@k.
        required_content: Stricter list; every fragment must occur somewhere
            in the returned hits.
        max_hits: Value sent as ``maxHits``.
        tokens_budget: Value sent as ``tokensBudget``.
        min_score: Optional minimum rerank score the top hit should exceed.
    """

    # --- identity and request ---
    id: str
    name: str
    query: str
    version: str
    topic: Optional[str] = None

    # --- expectations ---
    expected_files: list[str] = field(default_factory=list)
    expected_content: list[str] = field(default_factory=list)
    required_content: list[str] = field(default_factory=list)

    # --- request limits ---
    max_hits: int = 10
    tokens_budget: int = 6000

    # --- optional score floor ---
    min_score: Optional[float] = None
# ---------------------------------------------------------------------------
# Test definitions — 25 hard, objective cases
# ---------------------------------------------------------------------------
# Each entry is sent to the search endpoint by run() and scored by evaluate():
#   - expected_files   : path substrings; the first hit whose filePath contains
#                        any of them sets the file rank (file@k, MRR).
#   - expected_content : fragments matched case-insensitively; one match within
#                        the top-k hits is enough for content@k.
#   - required_content : fragments that must ALL occur somewhere in the hits.
TESTS: list[TestCase] = [
    # ── 1. Tween system: basic config properties ──────────────────────────
    TestCase(
        id="T01",
        name="Tween config: yoyo/hold/repeatDelay properties",
        query="What properties can I set in a TweenBuilderConfig to make a tween yoyo with a hold and repeat delay?",
        version="v4.1.0",
        topic="tweens",
        expected_files=["tweens/builders/TweenBuilder.js", "tweens/typedefs"],
        expected_content=["yoyo", "hold", "repeatDelay"],
        required_content=["yoyo", "repeatDelay"],
    ),
    # ── 2. Tween system: onComplete / onUpdate callbacks ──────────────────
    TestCase(
        id="T02",
        name="Tween callbacks: onComplete and onUpdate signatures",
        query="How do I use onComplete and onUpdate callbacks in a Phaser tween? What arguments do they receive?",
        version="v4.1.0",
        topic="tweens",
        expected_files=["tweens/"],
        expected_content=["onComplete", "onUpdate", "onStart"],
        required_content=["onComplete"],
    ),
    # ── 3. Arcade physics: setCollideWorldBounds signature ────────────────
    TestCase(
        id="T03",
        name="Arcade physics: setCollideWorldBounds signature",
        query="What are the parameters of setCollideWorldBounds in Phaser Arcade physics? Can I pass bounceX and bounceY to set bounce on world edges?",
        version="v4.1.0",
        topic="physics",
        expected_files=["physics/arcade/Body.js"],
        expected_content=["setCollideWorldBounds", "bounceX", "bounceY", "onWorldBounds"],
        required_content=["setCollideWorldBounds", "bounceX"],
    ),
    # ── 4. Arcade physics: addCollider vs addOverlap ──────────────────────
    TestCase(
        id="T04",
        name="Arcade physics: addCollider vs addOverlap difference",
        query="What is the difference between addCollider and addOverlap in Phaser's Arcade physics World? How do I add a callback?",
        version="v4.1.0",
        topic="physics",
        expected_files=["physics/arcade/World.js"],
        expected_content=["addCollider", "addOverlap", "collideCallback", "processCallback"],
        required_content=["addCollider", "addOverlap"],
    ),
    # ── 5. Camera: shake parameters ───────────────────────────────────────
    TestCase(
        id="T05",
        name="Camera shake: duration, intensity, force, callback",
        query="How do I make the camera shake in Phaser? What parameters does camera.shake accept?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/Camera.js"],
        expected_content=["shake", "duration", "intensity", "force", "callback"],
        required_content=["shake", "intensity"],
    ),
    # ── 6. Camera: startFollow with lerp ─────────────────────────────────
    TestCase(
        id="T06",
        name="Camera follow: startFollow lerpX lerpY parameters",
        query="How do I make the Phaser camera follow a player with smooth lerp? What are the lerpX and lerpY parameters?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/Camera.js"],
        expected_content=["startFollow", "lerpX", "lerpY", "roundPixels"],
        required_content=["startFollow", "lerpX"],
    ),
    # ── 7. Camera: setDeadzone ────────────────────────────────────────────
    TestCase(
        id="T07",
        name="Camera deadzone: setDeadzone width/height",
        query="How does camera deadzone work in Phaser? How do I create a rectangular deadzone so the camera only moves when the player exits it?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/Camera.js"],
        expected_content=["setDeadzone", "deadzone"],
        required_content=["setDeadzone"],
    ),
    # ── 8. Scene: pass data when starting another scene ───────────────────
    TestCase(
        id="T08",
        name="Scene management: pass data on scene.start",
        query="How do I pass data to another scene when calling scene.start() or scene.launch()? How does the init method receive it?",
        version="v4.1.0",
        topic="scenes",
        expected_files=["scene/"],
        expected_content=["init", "data", "start", "launch"],
        required_content=["init"],
    ),
    # ── 9. Animation system: chaining animations ──────────────────────────
    TestCase(
        id="T09",
        name="Animation chaining: chain() and playAfterRepeat()",
        query="How can I chain multiple animations so one plays after another finishes in Phaser? What is the chain() method?",
        version="v4.1.0",
        topic="animations",
        expected_files=["gameobjects/sprite/Sprite.js", "animations/"],
        expected_content=["chain", "playAfterRepeat", "playAfterDelay"],
        required_content=["chain"],
    ),
    # ── 10. Animation system: events ─────────────────────────────────────
    TestCase(
        id="T10",
        name="Animation events: ANIMATION_COMPLETE, ANIMATION_START",
        query="What events does the Phaser animation system emit? How do I listen for when an animation completes on a specific sprite?",
        version="v4.1.0",
        topic="animations",
        expected_files=["animations/events/"],
        expected_content=["ANIMATION_COMPLETE", "ANIMATION_START", "ANIMATION_STOP"],
        required_content=["ANIMATION_COMPLETE"],
    ),
    # ── 11. Input: pointer events ─────────────────────────────────────────
    TestCase(
        id="T11",
        name="Input: setInteractive + pointerdown/pointerover events",
        query="How do I call setInteractive on a game object and listen for pointerdown and pointerover events in Phaser?",
        version="v4.1.0",
        topic="input",
        expected_files=["input/"],
        expected_content=["pointerdown", "pointerover", "pointerout", "setInteractive"],
        required_content=["setInteractive", "pointerdown"],
    ),
    # ── 12. Input: keyboard cursor keys ──────────────────────────────────
    TestCase(
        id="T12",
        name="Input: createCursorKeys and keyboard key states",
        query="How do I read arrow key input in Phaser? How does createCursorKeys() work and how do I check if a key is down?",
        version="v4.1.0",
        topic="input",
        expected_files=["input/keyboard/"],
        expected_content=["createCursorKeys", "isDown", "up", "down", "left", "right"],
        required_content=["createCursorKeys"],
    ),
    # ── 13. Loader: atlas and texture keys ───────────────────────────────
    TestCase(
        id="T13",
        name="Loader: load.atlas config object and frame keys",
        query="How do I load a texture atlas in Phaser? What are the arguments to this.load.atlas() and how do I use frame keys?",
        version="v4.1.0",
        topic="loader",
        expected_files=["loader/filetypes/AtlasJSONFile.js", "loader/"],
        expected_content=["atlas", "textureURL", "atlasURL", "frameConfig"],
        required_content=["atlas"],
        # NOTE(review): evaluate() and run() in this file never read min_score,
        # so this threshold is currently not enforced — confirm intent.
        min_score=0.7,
    ),
    # ── 14. Tilemaps: setCollisionBetween ────────────────────────────────
    TestCase(
        id="T14",
        name="Tilemap: setCollisionBetween start/stop parameters",
        query="How do I set collision on a range of tile indices in a Phaser tilemap? What does setCollisionBetween do?",
        version="v4.1.0",
        topic="tilemaps",
        expected_files=["tilemaps/Tilemap.js", "tilemaps/"],
        expected_content=["setCollisionBetween", "start", "stop", "collides", "recalculateFaces"],
        required_content=["setCollisionBetween"],
    ),
    # ── 15. Tilemaps: createFromObjects ──────────────────────────────────
    TestCase(
        id="T15",
        name="Tilemap: createFromObjects from Tiled object layer",
        query="How do I convert Tiled object layer objects into Phaser game objects? How does createFromObjects work?",
        version="v4.1.0",
        topic="tilemaps",
        expected_files=["tilemaps/Tilemap.js"],
        expected_content=["createFromObjects", "objectLayerName"],
        required_content=["createFromObjects"],
    ),
    # ── 16. RenderTexture: beginDraw / endDraw (v3 API) ──────────────────
    TestCase(
        id="T16",
        name="RenderTexture v3: beginDraw / batchDraw / endDraw pattern",
        query="How do I use beginDraw and endDraw on a Phaser RenderTexture for batch drawing? What is the workflow?",
        version="v3.85.2",
        topic="rendering",
        expected_files=["textures/DynamicTexture.js"],
        expected_content=["beginDraw", "endDraw", "batchDraw", "batchDrawFrame"],
        required_content=["beginDraw", "endDraw"],
    ),
    # ── 17. Masking: BitmapMask vs GeometryMask (v3 API) ──────────────────
    TestCase(
        id="T17",
        name="Masking v3: createBitmapMask vs createGeometryMask",
        query="What is the difference between a BitmapMask and a GeometryMask in Phaser? How do I create and apply them?",
        version="v3.85.2",
        topic="rendering",
        expected_files=["gameobjects/components/Mask.js", "display/mask/"],
        expected_content=["createBitmapMask", "createGeometryMask", "setMask", "BitmapMask", "GeometryMask"],
        required_content=["BitmapMask", "GeometryMask"],
    ),
    # ── 18. Groups: getFirstDead / getFirstAlive pool pattern ─────────────
    TestCase(
        id="T18",
        name="Group: object pool with getFirstDead / getFirstAlive",
        query="How do I implement an object pool in Phaser using a Group? What are getFirstDead and getFirstAlive?",
        version="v4.1.0",
        topic="gameobjects",
        expected_files=["gameobjects/group/Group.js"],
        expected_content=["getFirstDead", "getFirstAlive", "createIfNull", "countActive"],
        required_content=["getFirstDead", "getFirstAlive"],
    ),
    # ── 19. Matter.js: fromVertices custom body shape ─────────────────────
    TestCase(
        id="T19",
        name="Matter.js: custom body shape with fromVertices",
        query="How do I create a custom polygon physics body from vertices in Phaser's Matter.js physics?",
        version="v4.1.0",
        topic="physics",
        expected_files=["physics/matter-js/Factory.js", "physics/matter-js/"],
        expected_content=["fromVertices", "vertexSets", "options"],
        required_content=["fromVertices"],
    ),
    # ── 20. Game config: FPS limit / target ───────────────────────────────
    TestCase(
        id="T20",
        name="Game config: fps.target and fps.limit settings",
        query="How do I configure the target frame rate and FPS limit in the Phaser game config? What is the difference between target and limit?",
        version="v4.1.0",
        topic="core",
        expected_files=["core/TimeStep.js", "core/Config.js"],
        expected_content=["targetFps", "fpsLimit", "target", "fps"],
        required_content=["targetFps"],
    ),
    # ── 21. Scale Manager: ScaleModes ────────────────────────────────────
    TestCase(
        id="T21",
        name="Scale Manager: FIT vs ENVELOP scale modes",
        query="What scale modes are available in Phaser's Scale Manager? How does FIT differ from ENVELOP? How do I make a responsive game?",
        version="v4.1.0",
        topic="scale",
        expected_files=["scale/"],
        expected_content=["FIT", "ENVELOP", "ScaleManager", "autoCenter"],
        required_content=["FIT"],
    ),
    # ── 22. Data Manager: set/get/events ──────────────────────────────────
    TestCase(
        id="T22",
        name="Data Manager: set/get and CHANGE_DATA event",
        query="How does the Phaser Data Manager work? How do I watch for data changes using events on a game object's data?",
        version="v4.1.0",
        topic="data",
        expected_files=["data/DataManager.js", "data/"],
        expected_content=["CHANGE_DATA", "set", "get", "events"],
        required_content=["CHANGE_DATA"],
    ),
    # ── 23. Depth sort: setDepth and displayList ──────────────────────────
    TestCase(
        id="T23",
        name="Depth sorting: setDepth and display list ordering",
        query="How does Phaser handle rendering order (z-order)? How do I use setDepth to control which objects render on top?",
        version="v4.1.0",
        topic="rendering",
        expected_files=["gameobjects/"],
        expected_content=["setDepth", "depth", "displayList"],
        required_content=["setDepth"],
    ),
    # ── 24. Version diff: v3.60 TweenChain (new in 3.60) ─────────────────
    TestCase(
        id="T24",
        name="Version-specific: TweenChain introduced in v3.60",
        query="How do I create a sequence of tweens that play one after another using TweenChain in Phaser 3.60+?",
        version="v3.60.0",
        topic="tweens",
        expected_files=["tweens/"],
        expected_content=["TweenChain", "chain"],
        required_content=["TweenChain"],
    ),
    # ── 25. Hard adversarial: camera.ignore() ─────────────────────────────
    TestCase(
        id="T25",
        name="Camera: ignore() to exclude game objects from a camera",
        query="How do I make a game object invisible to a specific camera in Phaser while remaining visible to others? What is camera.ignore()?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/"],
        expected_content=["ignore", "camera"],
        required_content=["ignore"],
    ),
]
# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------
def post_json(url: str, payload: dict) -> dict:
    """POST *payload* as a JSON body to *url* and return the decoded JSON reply.

    The request advertises JSON for both the body and the accepted response
    and gives up after 30 seconds. Errors raised by ``urllib`` or by JSON
    decoding propagate to the caller.
    """
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json", "Accept": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        raw = response.read()
    return json.loads(raw.decode())
def get_json(url: str) -> dict | list:
req = urllib.request.Request(url, headers={"Accept": "application/json"})
with urllib.request.urlopen(req, timeout=15) as resp:
return json.loads(resp.read().decode())
# ---------------------------------------------------------------------------
# Evaluation logic
# ---------------------------------------------------------------------------
@dataclass
class TestResult:
    """Raw search response for one test case plus the metrics derived from it.

    Attributes:
        test: The test case that produced this result.
        hits: Hit dicts returned by the search endpoint (empty on error).
        elapsed_ms: Request latency in milliseconds.
        error: Error message when the request or scoring failed, else None.
        file_rank: 1-based rank of the first hit matching an expected file,
            or None when no hit matched.
        content_ranks: For each expected content fragment that was found,
            the 1-based rank of the first hit containing it.
        required_found: True when every required fragment was found.
        top_score: Score of the first hit, or None when unavailable.
    """

    # Quoted so the class does not need TestCase resolved at definition time.
    test: "TestCase"
    hits: list[dict]
    elapsed_ms: float
    error: Optional[str] = None
    # Computed by evaluate()
    file_rank: Optional[int] = None
    content_ranks: list[int] = field(default_factory=list)
    required_found: bool = False
    top_score: Optional[float] = None

    def file_at(self, k: int) -> bool:
        """Return True when an expected file was found within the top *k* hits."""
        return self.file_rank is not None and self.file_rank <= k

    def content_at(self, k: int) -> bool:
        """Return True when any expected fragment was found within the top *k* hits."""
        return any(r <= k for r in self.content_ranks)

    def mrr(self) -> float:
        """Return the reciprocal of the file rank, or 0.0 when no file matched."""
        if self.file_rank is None:
            return 0.0
        return 1.0 / self.file_rank

    def summary_line(self) -> str:
        """Format this result as one row of the summary table."""

        def mark(passed: bool, miss: str = "·") -> str:
            # A visible glyph for both outcomes keeps every column readable.
            return "✓" if passed else miss

        f1 = mark(self.file_at(1))
        f3 = mark(self.file_at(3))
        f5 = mark(self.file_at(5))
        c5 = mark(self.content_at(5))
        # Missing required content is a hard failure, so it gets a stronger marker.
        req = mark(self.required_found, miss="✗")
        rank_str = f"rank={self.file_rank}" if self.file_rank is not None else "NOT FOUND"
        # Compare against None: a score of exactly 0.0 is still worth showing.
        score_str = f"score={self.top_score:.3f}" if self.top_score is not None else ""
        ms_str = f"{self.elapsed_ms:.0f}ms"
        return (
            f"[{self.test.id}] {self.test.name[:52]:<52} "
            f"f@1={f1} f@3={f3} f@5={f5} c@5={c5} req={req} "
            f"{rank_str:>12} {score_str} {ms_str}"
        )
def evaluate(result: "TestResult", verbose: bool = False) -> None:
    """Score ``result.hits`` against ``result.test`` and store the metrics on *result*.

    Sets ``top_score``, ``file_rank``, ``content_ranks`` and
    ``required_found``. When there are no hits the result is left untouched.

    Args:
        result: Result whose ``hits`` and ``test`` are read and whose metric
            fields are overwritten.
        verbose: When True, print the query, the expectations and the top
            five hits.
    """
    hits = result.hits
    if not hits:
        return
    test = result.test
    result.top_score = hits[0].get("score")

    # `or ""` also covers fields that are present but null in the response.
    paths = [hit.get("filePath") or "" for hit in hits]
    # Lower-case each hit once instead of once per (fragment, hit) pair.
    texts = [(hit.get("content") or "").lower() for hit in hits]

    def matches_expected_file(path: str) -> bool:
        return any(ef in path for ef in test.expected_files)

    # File rank: 1-based position of the first hit whose path matches.
    result.file_rank = next(
        (rank for rank, path in enumerate(paths, start=1) if matches_expected_file(path)),
        None,
    )

    # Content ranks: for each expected fragment, the first hit containing it.
    # Built fresh so evaluating the same result twice does not duplicate entries.
    content_ranks = []
    for fragment in test.expected_content:
        needle = fragment.lower()
        rank = next(
            (rank for rank, text in enumerate(texts, start=1) if needle in text),
            None,
        )
        if rank is not None:
            content_ranks.append(rank)
    result.content_ranks = content_ranks

    # Required content: ALL fragments must appear somewhere across the hits.
    all_content = " ".join(texts)
    result.required_found = all(r.lower() in all_content for r in test.required_content)

    if verbose:
        print(f"\n{'─' * 80}")
        print(f"[{test.id}] {test.name}")
        print(f" Query: {test.query}")
        print(f" Expected files: {test.expected_files}")
        print(f" Expected content: {test.expected_content}")
        print(" Top hits:")
        for rank, hit in enumerate(hits[:5], start=1):
            fp = hit.get("filePath") or "?"
            score = hit.get("score") or 0.0
            snip = (hit.get("content") or "")[:100].replace("\n", " ")
            marker = " ← FILE MATCH" if matches_expected_file(fp) else ""
            print(f" [{rank}] score={score:.3f} {fp}{marker}")
            print(f" {snip}")
# ---------------------------------------------------------------------------
# Main runner
# ---------------------------------------------------------------------------
def run(base_url: str, verbose: bool) -> None:
    """Run every test in TESTS against the server and print the report.

    Refreshes VERSIONS from the server (best effort), posts one search per
    test case, prints a per-test table, aggregate metrics and a list of
    failing tests, then terminates the process.

    Args:
        base_url: Server base URL; a trailing slash is ignored.
        verbose: Forwarded to evaluate() to print per-test hit details.

    Raises:
        SystemExit: Code 0 when MRR >= 0.5 and required recall >= 0.8,
            code 1 otherwise. Not raised when no test produced a valid result.
    """
    base_url = base_url.rstrip("/")
    search_url = f"{base_url}/api/search"
    versions_url = f"{base_url}/api/repos/{REPO_ID}/versions"

    print("TrueRef Phaser RAG Evaluation Suite")
    print(f"Server : {base_url}")
    print(f"Tests : {len(TESTS)}")
    print()

    # Resolve version IDs from server (in case they differ). Best effort: on
    # any failure the pinned IDs are kept and only a warning is printed.
    try:
        all_versions = get_json(versions_url)
        live_map = {v["tag"]: v["id"] for v in all_versions if v.get("status") == "INDEXED"}
        for tag in VERSIONS:
            if tag in live_map:
                VERSIONS[tag] = live_map[tag]
    except Exception as e:
        print(f"WARN: could not refresh version IDs: {e}")

    results: list[TestResult] = []
    for tc in TESTS:
        version_id = VERSIONS.get(tc.version)
        if not version_id:
            print(f"SKIP [{tc.id}]: version {tc.version} not available")
            continue
        payload = {
            "text": tc.query,
            "scope": [{"repoId": REPO_ID, "versionId": version_id}],
            "maxHits": tc.max_hits,
            "tokensBudget": tc.tokens_budget,
        }
        if tc.topic:
            payload["topic"] = tc.topic

        # perf_counter is monotonic, so latency is immune to wall-clock jumps.
        t0 = time.perf_counter()
        try:
            resp = post_json(search_url, payload)
            elapsed = (time.perf_counter() - t0) * 1000
            hits = resp.get("hits", [])
            tr = TestResult(test=tc, hits=hits, elapsed_ms=elapsed)
            evaluate(tr, verbose=verbose)
        except Exception as e:
            # One failing test must not abort the suite; record it and go on.
            elapsed = (time.perf_counter() - t0) * 1000
            tr = TestResult(test=tc, hits=[], elapsed_ms=elapsed, error=str(e))
            print(f"ERROR [{tc.id}]: {e}")
        results.append(tr)

    # ── Summary table ─────────────────────────────────────────────────────
    print()
    print("=" * 110)
    print(f"{'TEST ID + NAME':<56} {'f@1':>4} {'f@3':>4} {'f@5':>4} {'c@5':>4} {'req':>4} {'file rank':>12} {'score':>10} {'ms':>6}")
    print("=" * 110)
    for tr in results:
        if tr.error:
            print(f"[{tr.test.id}] {'ERROR: ' + tr.test.name[:45]:<52} ERROR: {tr.error[:40]}")
        else:
            print(tr.summary_line())

    # ── Aggregate metrics ─────────────────────────────────────────────────
    # Errored tests are excluded from every average below.
    valid = [tr for tr in results if not tr.error]
    n = len(valid)
    if n == 0:
        print("\nNo valid results.")
        return
    mrr = sum(tr.mrr() for tr in valid) / n
    p_at_1 = sum(1 for tr in valid if tr.file_at(1)) / n
    p_at_3 = sum(1 for tr in valid if tr.file_at(3)) / n
    p_at_5 = sum(1 for tr in valid if tr.file_at(5)) / n
    content_at5 = sum(1 for tr in valid if tr.content_at(5)) / n
    req_recall = sum(1 for tr in valid if tr.required_found) / n
    avg_ms = sum(tr.elapsed_ms for tr in valid) / n
    print("=" * 110)
    print()
    print("Aggregate metrics:")
    print(f" MRR (file) : {mrr:.4f} ({mrr*100:.1f}%)")
    print(f" Precision@1 (file) : {p_at_1:.4f} ({p_at_1*100:.1f}%)")
    print(f" Precision@3 (file) : {p_at_3:.4f} ({p_at_3*100:.1f}%)")
    print(f" Precision@5 (file) : {p_at_5:.4f} ({p_at_5*100:.1f}%)")
    print(f" Content recall@5 : {content_at5:.4f} ({content_at5*100:.1f}%)")
    print(f" Required recall : {req_recall:.4f} ({req_recall*100:.1f}%) ← hardest: ALL required fragments in top-10")
    print(f" Avg query latency : {avg_ms:.0f} ms")
    print()

    # ── Failure analysis ──────────────────────────────────────────────────
    failures = [tr for tr in valid if not tr.file_at(5) or not tr.required_found]
    if failures:
        print(f"Improvement targets ({len(failures)} tests below par):")
        for tr in failures:
            issues = []
            if not tr.file_at(5):
                issues.append(f"file not in top-5 (rank={tr.file_rank})")
            if not tr.required_found:
                # Built once per test; `or ""` tolerates hits whose content is
                # null, matching how evaluate() reads the same field.
                hit_text = " ".join((h.get("content") or "") for h in tr.hits).lower()
                missing = [r for r in tr.test.required_content if r.lower() not in hit_text]
                issues.append(f"required content missing: {missing}")
            print(f" [{tr.test.id}] {tr.test.name}: {'; '.join(issues)}")
    else:
        print("All tests passed file@5 and required-content checks.")

    # Exit code: 0 if MRR ≥ 0.5 AND required recall ≥ 0.8, else 1
    if mrr >= 0.5 and req_recall >= 0.8:
        print("\nResult: PASS")
        sys.exit(0)
    else:
        print(f"\nResult: FAIL (MRR={mrr:.3f} threshold=0.5, req_recall={req_recall:.3f} threshold=0.8)")
        sys.exit(1)
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Command-line entry: parse the two options and hand them to run(),
    # which prints the report and sets the process exit code.
    cli = argparse.ArgumentParser(description="Phaser RAG quality evaluation")
    cli.add_argument(
        "--base-url",
        default="http://localhost:18080",
        help="TrueRef server base URL",
    )
    cli.add_argument(
        "--verbose",
        action="store_true",
        help="Print per-test hit details",
    )
    options = cli.parse_args()
    run(options.base_url, options.verbose)