Some checks failed
Build and publish Docker image / Build and push (push) Failing after 1m27s
Java 21 / Spring Boot 3.5.3 multi-module Maven project. Hybrid BM25+HNSW search with RRF, cross-encoder reranker, ONNX Runtime 1.22.0 (CPU + CUDA 12 GPU variants).
612 lines
27 KiB
Python
612 lines
27 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Phaser RAG Quality Evaluation Suite
====================================
Simulates an LLM querying TrueRef for Phaser documentation and guidance.
Tests are designed to be hard and objective: each defines exact expected content
fragments and/or expected source files that MUST appear in the top-k results.

Scoring metrics per test:
  file@1    - expected file appeared as hit #1
  file@3    - expected file appeared in hits 1-3
  file@5    - expected file appeared in hits 1-5
  content@5 - at least one expected content fragment found across the top-5 hits combined
  content@1 - expected content fragment found in hit #1

Overall suite scores:
  MRR     - Mean Reciprocal Rank (file position)
  P@1/3/5 - Precision@k for file hits: share of tests whose expected file is in the top k
  C@5     - Content recall across top-5

Run:
  python3 phaser_rag_eval.py [--base-url http://localhost:18080] [--verbose]
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
import urllib.request
|
|
import urllib.error
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Config
|
|
# ---------------------------------------------------------------------------
|
|
# Repository UUID: sent as "repoId" in every search scope and used to build
# the /api/repos/<id>/versions URL in run().
REPO_ID = "50010965-aa3f-45f4-bb8d-72a0d50bf0db"

# Tag -> version ID. These pinned IDs are fallbacks: at startup run() replaces
# an entry with the server's ID whenever the server lists that tag with status
# "INDEXED". The pinned value survives only if the refresh request fails or the
# tag is not reported as indexed.
VERSIONS = {
    "v4.1.0": "6c6a00f5-0945-4fd7-b62c-c0e69f14effe",
    "v3.88.0": "d032d4d4-e6bc-4c9d-9c3c-8853e4a1cdc9",
    "v3.85.2": "d1cf906e-54b9-416f-bd5b-9432d69d9935",
    "v3.60.0": "95d0a8e2-9071-4986-85d4-59ae97893353",
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test definition
|
|
# ---------------------------------------------------------------------------
|
|
@dataclass
class TestCase:
    """A single retrieval query together with the evidence expected in its hits.

    The first four fields are mandatory; everything else has a default so a
    case only spells out what it actually checks.
    """

    # Short identifier printed in every report line, e.g. "T01".
    id: str
    # Human-readable title (truncated in the summary table).
    name: str
    # Natural-language question sent to the search endpoint as "text".
    query: str
    # Tag used as a key into VERSIONS to pick the version ID to search.
    version: str
    # Optional topic filter; only added to the request payload when truthy.
    topic: Optional[str] = None
    # Substrings of a hit's filePath; the file rank is the first hit whose
    # path contains ANY of them.
    expected_files: list[str] = field(default_factory=list)
    # Fragments matched case-insensitively; each one contributes the rank of
    # the first hit that contains it (any single match satisfies content@k).
    expected_content: list[str] = field(default_factory=list)
    # Stricter check: ALL of these must occur (case-insensitively) somewhere in
    # the combined content of the returned hits.
    required_content: list[str] = field(default_factory=list)
    # Sent to the server as "maxHits".
    max_hits: int = 10
    # Sent to the server as "tokensBudget".
    tokens_budget: int = 6000
    # Intended minimum rerank score for the top hit.
    # NOTE(review): no code visible in this file reads this field yet.
    min_score: Optional[float] = None
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test definitions — 25 hard, objective cases
|
|
# ---------------------------------------------------------------------------
|
|
# Ordered suite of retrieval cases executed by run(). Each case is matched
# against the search hits by evaluate(): expected_files by substring of the
# hit's filePath, expected_content / required_content by case-insensitive
# substring of the hit's content.
TESTS: list[TestCase] = [

    # ── 1. Tween system: basic config properties ──────────────────────────
    TestCase(
        id="T01",
        name="Tween config: yoyo/hold/repeatDelay properties",
        query="What properties can I set in a TweenBuilderConfig to make a tween yoyo with a hold and repeat delay?",
        version="v4.1.0",
        topic="tweens",
        expected_files=["tweens/builders/TweenBuilder.js", "tweens/typedefs"],
        expected_content=["yoyo", "hold", "repeatDelay"],
        required_content=["yoyo", "repeatDelay"],
    ),

    # ── 2. Tween system: onComplete / onUpdate callbacks ──────────────────
    TestCase(
        id="T02",
        name="Tween callbacks: onComplete and onUpdate signatures",
        query="How do I use onComplete and onUpdate callbacks in a Phaser tween? What arguments do they receive?",
        version="v4.1.0",
        topic="tweens",
        expected_files=["tweens/"],
        expected_content=["onComplete", "onUpdate", "onStart"],
        required_content=["onComplete"],
    ),

    # ── 3. Arcade physics: setCollideWorldBounds signature ────────────────
    TestCase(
        id="T03",
        name="Arcade physics: setCollideWorldBounds signature",
        query="What are the parameters of setCollideWorldBounds in Phaser Arcade physics? Can I pass bounceX and bounceY to set bounce on world edges?",
        version="v4.1.0",
        topic="physics",
        expected_files=["physics/arcade/Body.js"],
        expected_content=["setCollideWorldBounds", "bounceX", "bounceY", "onWorldBounds"],
        required_content=["setCollideWorldBounds", "bounceX"],
    ),

    # ── 4. Arcade physics: addCollider vs addOverlap ──────────────────────
    TestCase(
        id="T04",
        name="Arcade physics: addCollider vs addOverlap difference",
        query="What is the difference between addCollider and addOverlap in Phaser's Arcade physics World? How do I add a callback?",
        version="v4.1.0",
        topic="physics",
        expected_files=["physics/arcade/World.js"],
        expected_content=["addCollider", "addOverlap", "collideCallback", "processCallback"],
        required_content=["addCollider", "addOverlap"],
    ),

    # ── 5. Camera: shake parameters ───────────────────────────────────────
    TestCase(
        id="T05",
        name="Camera shake: duration, intensity, force, callback",
        query="How do I make the camera shake in Phaser? What parameters does camera.shake accept?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/Camera.js"],
        expected_content=["shake", "duration", "intensity", "force", "callback"],
        required_content=["shake", "intensity"],
    ),

    # ── 6. Camera: startFollow with lerp ──────────────────────────────────
    TestCase(
        id="T06",
        name="Camera follow: startFollow lerpX lerpY parameters",
        query="How do I make the Phaser camera follow a player with smooth lerp? What are the lerpX and lerpY parameters?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/Camera.js"],
        expected_content=["startFollow", "lerpX", "lerpY", "roundPixels"],
        required_content=["startFollow", "lerpX"],
    ),

    # ── 7. Camera: setDeadzone ────────────────────────────────────────────
    TestCase(
        id="T07",
        name="Camera deadzone: setDeadzone width/height",
        query="How does camera deadzone work in Phaser? How do I create a rectangular deadzone so the camera only moves when the player exits it?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/Camera.js"],
        expected_content=["setDeadzone", "deadzone"],
        required_content=["setDeadzone"],
    ),

    # ── 8. Scene: pass data when starting another scene ───────────────────
    TestCase(
        id="T08",
        name="Scene management: pass data on scene.start",
        query="How do I pass data to another scene when calling scene.start() or scene.launch()? How does the init method receive it?",
        version="v4.1.0",
        topic="scenes",
        expected_files=["scene/"],
        expected_content=["init", "data", "start", "launch"],
        required_content=["init"],
    ),

    # ── 9. Animation system: chaining animations ──────────────────────────
    TestCase(
        id="T09",
        name="Animation chaining: chain() and playAfterRepeat()",
        query="How can I chain multiple animations so one plays after another finishes in Phaser? What is the chain() method?",
        version="v4.1.0",
        topic="animations",
        expected_files=["gameobjects/sprite/Sprite.js", "animations/"],
        expected_content=["chain", "playAfterRepeat", "playAfterDelay"],
        required_content=["chain"],
    ),

    # ── 10. Animation system: events ──────────────────────────────────────
    TestCase(
        id="T10",
        name="Animation events: ANIMATION_COMPLETE, ANIMATION_START",
        query="What events does the Phaser animation system emit? How do I listen for when an animation completes on a specific sprite?",
        version="v4.1.0",
        topic="animations",
        expected_files=["animations/events/"],
        expected_content=["ANIMATION_COMPLETE", "ANIMATION_START", "ANIMATION_STOP"],
        required_content=["ANIMATION_COMPLETE"],
    ),

    # ── 11. Input: pointer events ─────────────────────────────────────────
    TestCase(
        id="T11",
        name="Input: setInteractive + pointerdown/pointerover events",
        query="How do I call setInteractive on a game object and listen for pointerdown and pointerover events in Phaser?",
        version="v4.1.0",
        topic="input",
        expected_files=["input/"],
        expected_content=["pointerdown", "pointerover", "pointerout", "setInteractive"],
        required_content=["setInteractive", "pointerdown"],
    ),

    # ── 12. Input: keyboard cursor keys ───────────────────────────────────
    TestCase(
        id="T12",
        name="Input: createCursorKeys and keyboard key states",
        query="How do I read arrow key input in Phaser? How does createCursorKeys() work and how do I check if a key is down?",
        version="v4.1.0",
        topic="input",
        expected_files=["input/keyboard/"],
        # NOTE(review): fragments are matched as case-insensitive substrings, so
        # short ones such as "up" or "down" match inside unrelated words.
        expected_content=["createCursorKeys", "isDown", "up", "down", "left", "right"],
        required_content=["createCursorKeys"],
    ),

    # ── 13. Loader: atlas and texture keys ────────────────────────────────
    TestCase(
        id="T13",
        name="Loader: load.atlas config object and frame keys",
        query="How do I load a texture atlas in Phaser? What are the arguments to this.load.atlas() and how do I use frame keys?",
        version="v4.1.0",
        topic="loader",
        expected_files=["loader/filetypes/AtlasJSONFile.js", "loader/"],
        expected_content=["atlas", "textureURL", "atlasURL", "frameConfig"],
        required_content=["atlas"],
        # NOTE(review): min_score is not read by evaluate() or run() in this
        # file, so this threshold is currently not enforced.
        min_score=0.7,
    ),

    # ── 14. Tilemaps: setCollisionBetween ─────────────────────────────────
    TestCase(
        id="T14",
        name="Tilemap: setCollisionBetween start/stop parameters",
        query="How do I set collision on a range of tile indices in a Phaser tilemap? What does setCollisionBetween do?",
        version="v4.1.0",
        topic="tilemaps",
        expected_files=["tilemaps/Tilemap.js", "tilemaps/"],
        expected_content=["setCollisionBetween", "start", "stop", "collides", "recalculateFaces"],
        required_content=["setCollisionBetween"],
    ),

    # ── 15. Tilemaps: createFromObjects ───────────────────────────────────
    TestCase(
        id="T15",
        name="Tilemap: createFromObjects from Tiled object layer",
        query="How do I convert Tiled object layer objects into Phaser game objects? How does createFromObjects work?",
        version="v4.1.0",
        topic="tilemaps",
        expected_files=["tilemaps/Tilemap.js"],
        expected_content=["createFromObjects", "objectLayerName"],
        required_content=["createFromObjects"],
    ),

    # ── 16. RenderTexture: beginDraw / endDraw (v3 API) ───────────────────
    TestCase(
        id="T16",
        name="RenderTexture v3: beginDraw / batchDraw / endDraw pattern",
        query="How do I use beginDraw and endDraw on a Phaser RenderTexture for batch drawing? What is the workflow?",
        version="v3.85.2",
        topic="rendering",
        expected_files=["textures/DynamicTexture.js"],
        expected_content=["beginDraw", "endDraw", "batchDraw", "batchDrawFrame"],
        required_content=["beginDraw", "endDraw"],
    ),

    # ── 17. Masking: BitmapMask vs GeometryMask (v3 API) ──────────────────
    TestCase(
        id="T17",
        name="Masking v3: createBitmapMask vs createGeometryMask",
        query="What is the difference between a BitmapMask and a GeometryMask in Phaser? How do I create and apply them?",
        version="v3.85.2",
        topic="rendering",
        expected_files=["gameobjects/components/Mask.js", "display/mask/"],
        expected_content=["createBitmapMask", "createGeometryMask", "setMask", "BitmapMask", "GeometryMask"],
        required_content=["BitmapMask", "GeometryMask"],
    ),

    # ── 18. Groups: getFirstDead / getFirstAlive pool pattern ─────────────
    TestCase(
        id="T18",
        name="Group: object pool with getFirstDead / getFirstAlive",
        query="How do I implement an object pool in Phaser using a Group? What are getFirstDead and getFirstAlive?",
        version="v4.1.0",
        topic="gameobjects",
        expected_files=["gameobjects/group/Group.js"],
        expected_content=["getFirstDead", "getFirstAlive", "createIfNull", "countActive"],
        required_content=["getFirstDead", "getFirstAlive"],
    ),

    # ── 19. Matter.js: fromVertices custom body shape ─────────────────────
    TestCase(
        id="T19",
        name="Matter.js: custom body shape with fromVertices",
        query="How do I create a custom polygon physics body from vertices in Phaser's Matter.js physics?",
        version="v4.1.0",
        topic="physics",
        expected_files=["physics/matter-js/Factory.js", "physics/matter-js/"],
        expected_content=["fromVertices", "vertexSets", "options"],
        required_content=["fromVertices"],
    ),

    # ── 20. Game config: FPS limit / target ───────────────────────────────
    TestCase(
        id="T20",
        name="Game config: fps.target and fps.limit settings",
        query="How do I configure the target frame rate and FPS limit in the Phaser game config? What is the difference between target and limit?",
        version="v4.1.0",
        topic="core",
        expected_files=["core/TimeStep.js", "core/Config.js"],
        expected_content=["targetFps", "fpsLimit", "target", "fps"],
        required_content=["targetFps"],
    ),

    # ── 21. Scale Manager: ScaleModes ─────────────────────────────────────
    TestCase(
        id="T21",
        name="Scale Manager: FIT vs ENVELOP scale modes",
        query="What scale modes are available in Phaser's Scale Manager? How does FIT differ from ENVELOP? How do I make a responsive game?",
        version="v4.1.0",
        topic="scale",
        expected_files=["scale/"],
        expected_content=["FIT", "ENVELOP", "ScaleManager", "autoCenter"],
        required_content=["FIT"],
    ),

    # ── 22. Data Manager: set/get/events ──────────────────────────────────
    TestCase(
        id="T22",
        name="Data Manager: set/get and CHANGE_DATA event",
        query="How does the Phaser Data Manager work? How do I watch for data changes using events on a game object's data?",
        version="v4.1.0",
        topic="data",
        expected_files=["data/DataManager.js", "data/"],
        expected_content=["CHANGE_DATA", "set", "get", "events"],
        required_content=["CHANGE_DATA"],
    ),

    # ── 23. Depth sort: setDepth and displayList ──────────────────────────
    TestCase(
        id="T23",
        name="Depth sorting: setDepth and display list ordering",
        query="How does Phaser handle rendering order (z-order)? How do I use setDepth to control which objects render on top?",
        version="v4.1.0",
        topic="rendering",
        expected_files=["gameobjects/"],
        expected_content=["setDepth", "depth", "displayList"],
        required_content=["setDepth"],
    ),

    # ── 24. Version diff: v3.60 TweenChain (new in 3.60) ──────────────────
    TestCase(
        id="T24",
        name="Version-specific: TweenChain introduced in v3.60",
        query="How do I create a sequence of tweens that play one after another using TweenChain in Phaser 3.60+?",
        version="v3.60.0",
        topic="tweens",
        expected_files=["tweens/"],
        expected_content=["TweenChain", "chain"],
        required_content=["TweenChain"],
    ),

    # ── 25. Hard adversarial: camera.ignore() ─────────────────────────────
    TestCase(
        id="T25",
        name="Camera: ignore() to exclude game objects from a camera",
        query="How do I make a game object invisible to a specific camera in Phaser while remaining visible to others? What is camera.ignore()?",
        version="v4.1.0",
        topic="camera",
        expected_files=["cameras/2d/"],
        expected_content=["ignore", "camera"],
        required_content=["ignore"],
    ),
]
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HTTP helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def post_json(url: str, payload: dict) -> dict:
    """Send *payload* to *url* as a JSON POST body and return the parsed reply.

    The request advertises and accepts ``application/json`` and waits at most
    30 seconds. Nothing is caught here: connection, HTTP-status, decoding and
    JSON parsing errors all propagate to the caller.
    """
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json", "Accept": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        raw = response.read()
    return json.loads(raw.decode())
|
|
|
|
|
|
def get_json(url: str) -> dict | list:
    """Fetch *url* with a GET request and return the parsed JSON document.

    Only an ``Accept: application/json`` header is sent and the call waits at
    most 15 seconds. Errors from the request, decoding or JSON parsing are
    not handled here and propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"Accept": "application/json"})
    with urllib.request.urlopen(request, timeout=15) as response:
        raw = response.read()
    return json.loads(raw.decode())
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Evaluation logic
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class TestResult:
    """Raw hits and timing for one test case, plus the metrics derived from them.

    The first four fields are supplied by the runner. The remaining ones start
    at their defaults and are filled in by ``evaluate()``.
    """

    # Quoted so this class does not require TestCase to be bound at the moment
    # the class body executes; type checkers still resolve it.
    test: "TestCase"
    hits: list[dict]
    elapsed_ms: float
    # Stringified exception when the query failed, otherwise None.
    error: Optional[str] = None

    # --- computed by evaluate() ---
    # 1-based rank of the first hit whose filePath matches an expected file.
    file_rank: Optional[int] = None
    # One 1-based rank per expected-content fragment that was found.
    content_ranks: list[int] = field(default_factory=list)
    # True when every required fragment occurs in the combined hit content.
    required_found: bool = False
    # "score" value of the first hit, if the server supplied one.
    top_score: Optional[float] = None

    def file_at(self, k: int) -> bool:
        """Return True if an expected file was found at rank *k* or better."""
        return self.file_rank is not None and self.file_rank <= k

    def content_at(self, k: int) -> bool:
        """Return True if any expected fragment was found at rank *k* or better."""
        return any(r <= k for r in self.content_ranks)

    def mrr(self) -> float:
        """Return the reciprocal of the file rank, or 0.0 when no file matched."""
        if self.file_rank is None:
            return 0.0
        return 1.0 / self.file_rank

    def summary_line(self) -> str:
        """Format this result as one row of the summary table."""
        f1 = "✓" if self.file_at(1) else "·"
        f3 = "✓" if self.file_at(3) else "·"
        f5 = "✓" if self.file_at(5) else "·"
        c5 = "✓" if self.content_at(5) else "·"
        req = "✓" if self.required_found else "✗"
        rank_str = f"rank={self.file_rank}" if self.file_rank is not None else "NOT FOUND"
        # Compare against None rather than relying on truthiness: a score of
        # exactly 0.0 is a real value and must still be shown.
        score_str = f"score={self.top_score:.3f}" if self.top_score is not None else ""
        ms_str = f"{self.elapsed_ms:.0f}ms"
        return (
            f"[{self.test.id}] {self.test.name[:52]:<52} "
            f"f@1={f1} f@3={f3} f@5={f5} c@5={c5} req={req} "
            f"{rank_str:>12} {score_str} {ms_str}"
        )
|
|
|
|
|
|
def evaluate(result: "TestResult", verbose: bool = False) -> None:
    """Derive the ranking metrics of *result* from its raw hits, in place.

    Sets ``top_score``, ``file_rank``, ``content_ranks`` and
    ``required_found`` on *result*. All text matching is plain substring
    matching; content matching is additionally case-insensitive.

    Args:
        result: Object carrying ``test`` (the case definition) and ``hits``
            (list of hit dicts with optional "filePath", "content", "score").
        verbose: When True and there is at least one hit, print the query,
            the expectations and the first five hits.

    Returns:
        None. With no hits the derived fields are left at their defaults.
    """
    test = result.test
    hits = result.hits

    # Reset derived fields so evaluating the same result twice does not
    # accumulate duplicate content ranks.
    result.top_score = None
    result.file_rank = None
    result.content_ranks = []
    result.required_found = False

    if not hits:
        return

    result.top_score = hits[0].get("score")

    # File rank: first hit whose path contains any expected-file substring.
    # "or ''" also covers an explicit null filePath, which .get's default
    # would let through as None and break the substring test.
    for rank, hit in enumerate(hits, start=1):
        path = hit.get("filePath") or ""
        if any(expected in path for expected in test.expected_files):
            result.file_rank = rank
            break

    # Lower-case every hit's content once instead of once per fragment.
    lowered = [(hit.get("content") or "").lower() for hit in hits]

    # Content ranks: for each expected fragment, the first hit containing it.
    for fragment in test.expected_content:
        needle = fragment.lower()
        for rank, text in enumerate(lowered, start=1):
            if needle in text:
                result.content_ranks.append(rank)
                break

    # Required content: every fragment must occur somewhere across all hits.
    all_content = " ".join(lowered)
    result.required_found = all(
        required.lower() in all_content for required in test.required_content
    )

    if verbose:
        print(f"\n{'─'*80}")
        print(f"[{test.id}] {test.name}")
        print(f" Query: {test.query}")
        print(f" Expected files: {test.expected_files}")
        print(f" Expected content: {test.expected_content}")
        print(" Top hits:")
        for rank, hit in enumerate(hits[:5], start=1):
            fp = hit.get("filePath") or "?"
            score = hit.get("score")
            # A missing or null score cannot be formatted with :.3f.
            score_str = f"{score:.3f}" if isinstance(score, (int, float)) else "n/a"
            snip = (hit.get("content") or "")[:100].replace("\n", " ")
            marker = " ← FILE MATCH" if any(ef in fp for ef in test.expected_files) else ""
            print(f" [{rank}] score={score_str} {fp}{marker}")
            print(f" {snip}")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main runner
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _refresh_version_ids(versions_url: str) -> None:
    """Replace pinned VERSIONS entries with the IDs the server lists as INDEXED."""
    try:
        all_versions = get_json(versions_url)
        live_map = {v["tag"]: v["id"] for v in all_versions if v.get("status") == "INDEXED"}
    except Exception as e:
        # Deliberately best-effort: on any failure keep the pinned IDs.
        print(f"WARN: could not refresh version IDs: {e}")
        return
    for tag in VERSIONS.keys() & live_map.keys():
        VERSIONS[tag] = live_map[tag]


def _run_case(tc: "TestCase", search_url: str, version_id: str, verbose: bool) -> "TestResult":
    """Execute one search query and return its evaluated (or errored) result."""
    payload = {
        "text": tc.query,
        "scope": [{"repoId": REPO_ID, "versionId": version_id}],
        "maxHits": tc.max_hits,
        "tokensBudget": tc.tokens_budget,
    }
    if tc.topic:
        payload["topic"] = tc.topic

    # perf_counter is monotonic, so the latency cannot be skewed by wall-clock
    # adjustments the way time.time() can.
    t0 = time.perf_counter()
    try:
        resp = post_json(search_url, payload)
        elapsed = (time.perf_counter() - t0) * 1000
        # "or []" also normalises an explicit null "hits" value.
        hits = resp.get("hits") or []
        tr = TestResult(test=tc, hits=hits, elapsed_ms=elapsed)
        evaluate(tr, verbose=verbose)
    except Exception as e:
        # Any failure (request or evaluation) is recorded on the result so the
        # remaining cases still run.
        elapsed = (time.perf_counter() - t0) * 1000
        tr = TestResult(test=tc, hits=[], elapsed_ms=elapsed, error=str(e))
        print(f"ERROR [{tc.id}]: {e}")
    return tr


def _print_summary_table(results: list) -> None:
    """Print one table row per executed test, errors included."""
    print()
    print("=" * 110)
    print(f"{'TEST ID + NAME':<56} {'f@1':>4} {'f@3':>4} {'f@5':>4} {'c@5':>4} {'req':>4} {'file rank':>12} {'score':>10} {'ms':>6}")
    print("=" * 110)
    for tr in results:
        if tr.error is not None:
            print(f"[{tr.test.id}] {'ERROR: ' + tr.test.name[:45]:<52} ERROR: {tr.error[:40]}")
        else:
            print(tr.summary_line())


def _print_failure_analysis(valid: list) -> None:
    """List every test that missed file@5 or the required-content check."""
    failures = [tr for tr in valid if not tr.file_at(5) or not tr.required_found]
    if not failures:
        print("All tests passed file@5 and required-content checks.")
        return

    print(f"Improvement targets ({len(failures)} tests below par):")
    for tr in failures:
        issues = []
        if not tr.file_at(5):
            issues.append(f"file not in top-5 (rank={tr.file_rank})")
        if not tr.required_found:
            # Built once per test; "or ''" tolerates hits whose content is
            # null, matching how evaluate() reads the same field.
            haystack = " ".join((h.get("content") or "") for h in tr.hits).lower()
            missing = [r for r in tr.test.required_content if r.lower() not in haystack]
            issues.append(f"required content missing: {missing}")
        print(f" [{tr.test.id}] {tr.test.name}: {'; '.join(issues)}")


def run(base_url: str, verbose: bool) -> None:
    """Run every test in TESTS against the server and print the full report.

    Args:
        base_url: Server root URL; a trailing slash is ignored.
        verbose: Forwarded to evaluate() to print per-test hit details.

    Raises:
        SystemExit: Always. Status 0 when MRR >= 0.5 and required recall
            >= 0.8; status 1 otherwise, including when no test produced a
            valid (non-error) result.
    """
    base_url = base_url.rstrip("/")
    search_url = f"{base_url}/api/search"
    versions_url = f"{base_url}/api/repos/{REPO_ID}/versions"

    print("TrueRef Phaser RAG Evaluation Suite")
    print(f"Server : {base_url}")
    print(f"Tests : {len(TESTS)}")
    print()

    # Resolve version IDs from server (in case they differ from the pinned ones)
    _refresh_version_ids(versions_url)

    results: list = []
    for tc in TESTS:
        version_id = VERSIONS.get(tc.version)
        if not version_id:
            print(f"SKIP [{tc.id}]: version {tc.version} not available")
            continue
        results.append(_run_case(tc, search_url, version_id, verbose))

    _print_summary_table(results)

    # ── Aggregate metrics (errored tests are excluded) ────────────────────
    valid = [tr for tr in results if tr.error is None]
    n = len(valid)
    if n == 0:
        print("\nNo valid results.")
        # Neither pass threshold can be met without results, so report
        # failure instead of falling through to a zero exit status.
        sys.exit(1)

    mrr = sum(tr.mrr() for tr in valid) / n
    p_at_1 = sum(1 for tr in valid if tr.file_at(1)) / n
    p_at_3 = sum(1 for tr in valid if tr.file_at(3)) / n
    p_at_5 = sum(1 for tr in valid if tr.file_at(5)) / n
    content_at5 = sum(1 for tr in valid if tr.content_at(5)) / n
    req_recall = sum(1 for tr in valid if tr.required_found) / n
    avg_ms = sum(tr.elapsed_ms for tr in valid) / n

    print("=" * 110)
    print()
    print("Aggregate metrics:")
    print(f" MRR (file) : {mrr:.4f} ({mrr*100:.1f}%)")
    print(f" Precision@1 (file) : {p_at_1:.4f} ({p_at_1*100:.1f}%)")
    print(f" Precision@3 (file) : {p_at_3:.4f} ({p_at_3*100:.1f}%)")
    print(f" Precision@5 (file) : {p_at_5:.4f} ({p_at_5*100:.1f}%)")
    print(f" Content recall@5 : {content_at5:.4f} ({content_at5*100:.1f}%)")
    print(f" Required recall : {req_recall:.4f} ({req_recall*100:.1f}%) ← hardest: ALL required fragments in top-10")
    print(f" Avg query latency : {avg_ms:.0f} ms")
    print()

    # ── Failure analysis ──────────────────────────────────────────────────
    _print_failure_analysis(valid)

    # Exit code: 0 if MRR ≥ 0.5 AND required recall ≥ 0.8, else 1
    if mrr >= 0.5 and req_recall >= 0.8:
        print("\nResult: PASS")
        sys.exit(0)
    else:
        print(f"\nResult: FAIL (MRR={mrr:.3f} threshold=0.5, req_recall={req_recall:.3f} threshold=0.8)")
        sys.exit(1)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Entry point
|
|
# ---------------------------------------------------------------------------
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the two options and hand them to run(),
    # which prints the report and terminates the process itself.
    cli = argparse.ArgumentParser(description="Phaser RAG quality evaluation")
    cli.add_argument(
        "--base-url",
        default="http://localhost:18080",
        help="TrueRef server base URL",
    )
    cli.add_argument(
        "--verbose",
        action="store_true",
        help="Print per-test hit details",
    )
    options = cli.parse_args()
    run(options.base_url, options.verbose)
|