fix(SCOPONE-0012): complete iteration 2 - speed up benchmark timer

This commit is contained in:
Giancarmine Salucci
2026-04-11 19:52:44 +02:00
parent 38f675eda5
commit 3d76fb544f
2 changed files with 1312 additions and 576 deletions

View File

@@ -4,11 +4,10 @@ import {
AI_BENCHMARK_FIXTURES,
AIBenchmarkCriticalConcept,
AIBenchmarkExpectedMove,
AIBenchmarkFixture,
isCriticalAIBenchmarkFixture,
} from './ai-benchmark-fixtures';
import { CardTracker } from './card-tracker';
import { GameState, PlayerIndex } from './types';
import { Difficulty, GameState, PlayerIndex } from './types';
function formatDurationMs(durationMs: number): string {
if (durationMs < 1000) {
@@ -18,6 +17,10 @@ function formatDurationMs(durationMs: number): string {
return `${(durationMs / 1000).toFixed(2)} s`;
}
/**
 * Renders a fraction as a percentage string with one decimal place.
 *
 * @param value - Ratio to format, where `1` corresponds to `100.0%`.
 * @returns The scaled value followed by a `%` sign.
 */
function formatPercentage(value: number): string {
  const scaled = value * 100;
  return scaled.toFixed(1) + '%';
}
/**
 * Writes a benchmark progress line to the console, prefixed with the
 * `[ai-benchmark]` tag.
 *
 * @param message - Text to print after the tag.
 */
function logBenchmarkProgress(message: string): void {
  const line = '[ai-benchmark] ' + message;
  console.log(line);
}
@@ -36,18 +39,23 @@ interface FixedFixtureResult {
referenceSimulatedMs: number;
}
/** Identifiers of the self-play suites; these are the keys of SELF_PLAY_SUITES. */
type SelfPlaySuiteId = 'mirror-parity' | 'beginner-dominance';
interface SelfPlayMatchResult {
suite: SelfPlaySuiteId;
seed: number;
dealer: PlayerIndex;
masterTeam: 0 | 1;
trackedTeam: 0 | 1;
trackedTeamDifficulty: Difficulty;
opponentDifficulty: Difficulty;
winner: 0 | 1 | null;
masterResult: 'win' | 'loss' | 'draw';
trackedResult: 'win' | 'loss' | 'draw';
rounds: number;
truncated: boolean;
totalPoints: [number, number];
masterDecisionCount: number;
masterAverageSimulatedDecisionMs: number;
masterMaxSimulatedDecisionMs: number;
trackedDecisionCount: number;
trackedAverageSimulatedDecisionMs: number;
trackedMaxSimulatedDecisionMs: number;
}
interface TimingSummary {
@@ -64,23 +72,25 @@ interface GateCountSummary {
passed: boolean;
}
interface SelfPlayGateSummary {
interface WinRateGateSummary {
matches: number;
requiredMatches: number;
wins: number;
requiredWins: number;
losses: number;
maxLosses: number;
draws: number;
winRate: number;
targetWinRate: number | null;
tolerance: number | null;
minWinRate: number | null;
maxWinRate: number | null;
matchCountPassed: boolean;
winGatePassed: boolean;
lossGatePassed: boolean;
winRatePassed: boolean;
passed: boolean;
}
interface SelfPlaySeedSeatResult {
masterTeam: 0 | 1;
masterResult: 'win' | 'loss' | 'draw';
trackedTeam: 0 | 1;
trackedResult: 'win' | 'loss' | 'draw';
winner: 0 | 1 | null;
rounds: number;
truncated: boolean;
@@ -97,50 +107,82 @@ interface SelfPlaySeedAggregateResult {
seatResults: SelfPlaySeedSeatResult[];
}
/**
 * Report entry for one self-play suite, as assembled by buildSelfPlaySuiteSummary
 * from the raw match results and simulated timings of a suite run.
 */
interface SelfPlaySuiteSummary {
// Suite identity and configured difficulties, copied from the suite config.
suite: SelfPlaySuiteId;
label: string;
trackedTeamDifficulty: Difficulty;
opponentDifficulty: Difficulty;
// Number of match results actually recorded versus the count the suite config requires.
matches: number;
requiredMatches: number;
// Number of configured self-play seeds (length of SELF_PLAY_MATCH_SEEDS).
seedCount: number;
// Seat-balance flag reported for the suite; populated by buildSelfPlaySuiteSummary.
seatBalanced: boolean;
// Outcome counts from the tracked team's point of view.
wins: number;
losses: number;
draws: number;
// wins / matches and losses / matches; both are 0 when no matches were recorded.
winRate: number;
lossRate: number;
// Per-seed aggregates and the seeds flagged as dual-loss, both from summarizeSelfPlayBySeed.
perSeed: SelfPlaySeedAggregateResult[];
dualLossSeeds: number[];
// Copy of KNOWN_REGRESSION_WATCHLIST.
regressionWatchlist: number[];
// Produced by summarizeSelfPlayBySeed; presumably the watchlist seeds that are also dual-loss seeds (TODO confirm).
regressionWatchlistDualLossIntersection: number[];
simulatedTiming: {
// Total simulated time accumulated over every decision in the suite.
suiteSimulatedMs: number;
// Timing summary restricted to decisions made by the tracked team.
trackedTeamDecisions: TimingSummary;
};
// Raw per-match results, in the order they were played.
results: SelfPlayMatchResult[];
}
export interface AIBenchmarkSummary {
benchmark: 'ai-quality';
qualityGate: {
iteration: 5;
iteration: 6;
passed: boolean;
fixedFixtures: GateCountSummary;
criticalConcepts: GateCountSummary;
selfPlay: SelfPlayGateSummary;
mirrorParity: WinRateGateSummary;
beginnerDominance: WinRateGateSummary;
};
fixtureCount: number;
criticalFixtureCount: number;
fixtureTotals: {
fixtures: number;
criticalFixtures: number;
};
fixedSuite: {
fixedFixtureAgreements: number;
expectedPasses: number;
criticalPasses: number;
fixedFixtureAgreementFailures: string[];
criticalPassFailures: string[];
simulatedTiming: {
productionSuiteSimulatedMs: number;
referenceSuiteSimulatedMs: number;
productionMasterDecisions: TimingSummary;
referenceMasterDecisions: TimingSummary;
};
results: FixedFixtureResult[];
};
selfPlay: {
matches: number;
wins: number;
losses: number;
draws: number;
winRate: number;
lossRate: number;
perSeed: SelfPlaySeedAggregateResult[];
dualLossSeeds: number[];
regressionWatchlist: number[];
regressionWatchlistDualLossIntersection: number[];
results: SelfPlayMatchResult[];
selfPlaySuites: {
totalMatches: number;
mirrorParity: SelfPlaySuiteSummary;
beginnerDominance: SelfPlaySuiteSummary;
};
timing: {
productionMasterSimulatedDecisions: TimingSummary;
fixedFixtureProductionMasterDecisions: TimingSummary;
fixedFixtureReferenceMasterDecisions: TimingSummary;
mirrorTrackedTeamSimulatedDecisions: TimingSummary;
beginnerTrackedTeamSimulatedDecisions: TimingSummary;
allTrackedProductionSimulatedDecisions: TimingSummary;
};
referenceProfile: Required<AISearchProfileOverride>;
}
const ITERATION_5_GATE = {
fixedFixtureAgreementTarget: 13,
criticalConceptTarget: 6,
selfPlayMatchTarget: 48,
selfPlayWinTarget: 30,
selfPlayMaxLosses: 12,
const ITERATION_6_GATE = {
mirrorMatchTarget: 500,
beginnerMatchTarget: 500,
mirrorTargetWinRate: 0.5,
mirrorWinRateTolerance: 0.05,
beginnerMinWinRate: 0.7,
} as const;
const KNOWN_REGRESSION_WATCHLIST = [1000, 1002, 1004, 1006, 1012, 1013, 1014] as const;
@@ -153,29 +195,73 @@ const REFERENCE_PROFILE: Required<AISearchProfileOverride> = {
batchSize: 2,
};
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 24 }, (_, index) => 1000 + index);
const SELF_PLAY_SEAT_SWAPS = [0, 1] as const;
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 250 }, (_, index) => 1000 + index);
const MAX_SELF_PLAY_ROUNDS = 20;
function assertIteration5BenchmarkContract(): void {
/** Static configuration of one self-play suite (one entry of SELF_PLAY_SUITES). */
interface SelfPlaySuiteConfig {
// Suite identifier; copied into every match result and into the suite summary.
id: SelfPlaySuiteId;
// Human-readable name used in progress logs and in the suite summary.
label: string;
// Suite-specific value mixed into seedFromParts(...) for deals and per-move RNG.
suiteSeedKey: number;
// Number of matches the suite is expected to play (seeds x seat swaps).
requiredMatches: number;
// Difficulties reported for the tracked team and its opponent.
trackedTeamDifficulty: Difficulty;
opponentDifficulty: Difficulty;
// Returns the difficulty of team 0 and team 1 (indexed by team) for the given tracked team.
getTeamDifficulties(trackedTeam: 0 | 1): readonly [Difficulty, Difficulty];
}
/**
 * Configuration of every self-play suite, keyed by suite id.
 *
 * Both suites play each seed once per seat swap, so their required match
 * count is the number of seeds multiplied by the number of seat swaps.
 */
const SELF_PLAY_SUITES: Record<SelfPlaySuiteId, SelfPlaySuiteConfig> = {
  'mirror-parity': {
    id: 'mirror-parity',
    label: 'Master mirror parity',
    suiteSeedKey: 0x4d31,
    requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
    trackedTeamDifficulty: 'master',
    opponentDifficulty: 'master',
    // Both teams play at master level regardless of which one is tracked.
    getTeamDifficulties() {
      return ['master', 'master'];
    },
  },
  'beginner-dominance': {
    id: 'beginner-dominance',
    label: 'Master versus beginner dominance',
    suiteSeedKey: 0x4236,
    requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
    trackedTeamDifficulty: 'master',
    opponentDifficulty: 'beginner',
    // The tracked team always plays master; the other team plays beginner.
    getTeamDifficulties(trackedTeam) {
      if (trackedTeam === 0) {
        return ['master', 'beginner'];
      }
      return ['beginner', 'master'];
    },
  },
};
function assertIteration6BenchmarkContract(): void {
const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
const selfPlayMatchCount = SELF_PLAY_MATCH_SEEDS.length * 2;
const expectedSeatBalancedMatches = SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length;
if (AI_BENCHMARK_FIXTURES.length !== ITERATION_5_GATE.fixedFixtureAgreementTarget) {
if (AI_BENCHMARK_FIXTURES.length === 0) {
throw new Error('Iteration 6 benchmark requires at least one fixed fixture.');
}
if (criticalFixtureCount === 0) {
throw new Error('Iteration 6 benchmark requires at least one critical concept fixture.');
}
if (expectedSeatBalancedMatches !== ITERATION_6_GATE.mirrorMatchTarget) {
throw new Error(
`Iteration 5 benchmark expects ${ITERATION_5_GATE.fixedFixtureAgreementTarget} fixed fixtures, received ${AI_BENCHMARK_FIXTURES.length}.`,
`Iteration 6 benchmark expects ${ITERATION_6_GATE.mirrorMatchTarget} mirror matches, received ${expectedSeatBalancedMatches}.`,
);
}
if (criticalFixtureCount !== ITERATION_5_GATE.criticalConceptTarget) {
if (expectedSeatBalancedMatches !== ITERATION_6_GATE.beginnerMatchTarget) {
throw new Error(
`Iteration 5 benchmark expects ${ITERATION_5_GATE.criticalConceptTarget} critical concept fixtures, received ${criticalFixtureCount}.`,
`Iteration 6 benchmark expects ${ITERATION_6_GATE.beginnerMatchTarget} beginner-dominance matches, received ${expectedSeatBalancedMatches}.`,
);
}
if (selfPlayMatchCount !== ITERATION_5_GATE.selfPlayMatchTarget) {
throw new Error(
`Iteration 5 benchmark expects ${ITERATION_5_GATE.selfPlayMatchTarget} self-play matches, received ${selfPlayMatchCount}.`,
);
for (const suite of Object.values(SELF_PLAY_SUITES)) {
if (suite.requiredMatches !== expectedSeatBalancedMatches) {
throw new Error(
`Iteration 6 benchmark expects ${expectedSeatBalancedMatches} matches for ${suite.id}, received ${suite.requiredMatches}.`,
);
}
}
}
@@ -220,6 +306,10 @@ function moveKey(move: AIMove): string {
return `${move.card.id}|${move.capture.map(card => card.id).sort().join(',')}`;
}
/**
 * Returns the index of the opposing team.
 *
 * @param team - Team index, `0` or `1`.
 * @returns `1` when given `0`, otherwise `0`.
 */
function otherTeam(team: 0 | 1): 0 | 1 {
  if (team === 0) {
    return 1;
  }
  return 0;
}
function createTrackerForState(state: GameState): CardTracker {
const tracker = new CardTracker();
for (const player of state.players) {
@@ -239,10 +329,16 @@ function matchesExpectedMove(move: AIMove, expected: AIBenchmarkExpectedMove): b
return actualCapture === expectedCapture;
}
async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[]; wallClockMs: number; productionTimings: number[] }> {
const startedAt = performance.now();
async function runFixedFixtureSuite(): Promise<{
results: FixedFixtureResult[];
productionSuiteSimulatedMs: number;
referenceSuiteSimulatedMs: number;
productionTimings: number[];
referenceTimings: number[];
}> {
const results: FixedFixtureResult[] = [];
const productionTimings: number[] = [];
const referenceTimings: number[] = [];
logBenchmarkProgress(`Starting fixed fixture suite (${AI_BENCHMARK_FIXTURES.length} positions).`);
@@ -286,6 +382,7 @@ async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[];
const referenceSimulatedMs = referenceTimingSource.getElapsedMs();
productionTimings.push(productionSimulatedMs);
referenceTimings.push(referenceSimulatedMs);
const conceptGatePass = isCriticalAIBenchmarkFixture(fixture)
? matchesExpectedMove(productionMove, fixture.expectedMove)
@@ -314,11 +411,17 @@ async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[];
return {
results,
wallClockMs: performance.now() - startedAt,
productionSuiteSimulatedMs: sumTimings(productionTimings),
referenceSuiteSimulatedMs: sumTimings(referenceTimings),
productionTimings,
referenceTimings,
};
}
/**
 * Adds up a list of timing samples.
 *
 * @param samples - Durations to accumulate, in order.
 * @returns The left-to-right sum of all samples, or `0` for an empty list.
 */
function sumTimings(samples: number[]): number {
  let total = 0;
  samples.forEach(sample => {
    total += sample;
  });
  return total;
}
function summarizeTimings(samples: number[]): TimingSummary {
if (samples.length === 0) {
return {
@@ -360,13 +463,13 @@ function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
};
existing.matches++;
if (result.masterResult === 'win') existing.wins++;
else if (result.masterResult === 'loss') existing.losses++;
if (result.trackedResult === 'win') existing.wins++;
else if (result.trackedResult === 'loss') existing.losses++;
else existing.draws++;
existing.seatResults.push({
masterTeam: result.masterTeam,
masterResult: result.masterResult,
trackedTeam: result.trackedTeam,
trackedResult: result.trackedResult,
winner: result.winner,
rounds: result.rounds,
truncated: result.truncated,
@@ -380,7 +483,7 @@ function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
.map(aggregate => ({
...aggregate,
dualLoss: aggregate.losses >= 2,
seatResults: [...aggregate.seatResults].sort((left, right) => left.masterTeam - right.masterTeam),
seatResults: [...aggregate.seatResults].sort((left, right) => left.trackedTeam - right.trackedTeam),
}))
.sort((left, right) => left.seed - right.seed);
const dualLossSeeds = perSeed.filter(aggregate => aggregate.dualLoss).map(aggregate => aggregate.seed);
@@ -393,12 +496,18 @@ function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
};
}
async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{ result: SelfPlayMatchResult; timings: number[] }> {
async function simulateSelfPlayMatch(
suite: SelfPlaySuiteConfig,
seed: number,
trackedTeam: 0 | 1,
): Promise<{ result: SelfPlayMatchResult; trackedTimings: number[]; simulatedMatchMs: number }> {
const initialDealer = (seed % 4) as PlayerIndex;
let state = createInitialState(initialDealer, createMulberry32(seedFromParts(seed, 1, 0)));
const teamDifficulties = suite.getTeamDifficulties(trackedTeam);
let state = createInitialState(initialDealer, createMulberry32(seedFromParts(suite.suiteSeedKey, seed, 1, 0)));
const matchStartingPlayer = state.matchStartingPlayer;
const tracker = new CardTracker();
const masterTimings: number[] = [];
const trackedTimings: number[] = [];
let simulatedMatchMs = 0;
let rounds = 1;
let truncated = false;
@@ -407,19 +516,18 @@ async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{
while (rounds <= MAX_SELF_PLAY_ROUNDS) {
while (!state.roundOver) {
const playerIdx = state.currentPlayer;
const difficulty = teamOf(playerIdx) === masterTeam ? 'master' : 'advanced';
const actingTeam = teamOf(playerIdx);
const difficulty = teamDifficulties[actingTeam];
const timingSource = createSimulatedBenchmarkTimingSource();
const options = difficulty === 'master'
? {
rng: createMulberry32(seedFromParts(seed, rounds, turnCount, playerIdx)),
timingSource,
}
: { timingSource };
const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, options);
const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, {
rng: createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, turnCount, playerIdx)),
timingSource,
});
const simulatedMs = timingSource.getElapsedMs();
simulatedMatchMs += simulatedMs;
if (difficulty === 'master') {
masterTimings.push(simulatedMs);
if (actingTeam === trackedTeam) {
trackedTimings.push(simulatedMs);
}
const { nextState, capture } = applyMove(
@@ -441,16 +549,17 @@ async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{
break;
}
rounds++;
if (rounds > MAX_SELF_PLAY_ROUNDS) {
if (rounds === MAX_SELF_PLAY_ROUNDS) {
truncated = true;
break;
}
rounds++;
const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
const nextDealer = nextPlayer(state.dealer);
tracker.reset();
state = createInitialState(nextDealer, createMulberry32(seedFromParts(seed, rounds, 0)));
state = createInitialState(nextDealer, createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, 0)));
state.matchStartingPlayer = matchStartingPlayer;
state.teamScores[0].totalPoints = totals[0];
state.teamScores[1].totalPoints = totals[1];
@@ -458,47 +567,54 @@ async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{
}
const outcome = getMatchOutcome(state.teamScores);
const winner = truncated ? outcome.winner : outcome.winner;
const masterResult = winner === null ? 'draw' : winner === masterTeam ? 'win' : 'loss';
const timingSummary = summarizeTimings(masterTimings);
const winner = outcome.winner;
const timingSummary = summarizeTimings(trackedTimings);
const opposingTeam = otherTeam(trackedTeam);
const trackedResult = winner === null ? 'draw' : winner === trackedTeam ? 'win' : 'loss';
return {
result: {
suite: suite.id,
seed,
dealer: initialDealer,
masterTeam,
trackedTeam,
trackedTeamDifficulty: teamDifficulties[trackedTeam],
opponentDifficulty: teamDifficulties[opposingTeam],
winner,
masterResult,
trackedResult,
rounds,
truncated,
totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
masterDecisionCount: timingSummary.count,
masterAverageSimulatedDecisionMs: timingSummary.averageMs,
masterMaxSimulatedDecisionMs: timingSummary.maxMs,
trackedDecisionCount: timingSummary.count,
trackedAverageSimulatedDecisionMs: timingSummary.averageMs,
trackedMaxSimulatedDecisionMs: timingSummary.maxMs,
},
timings: masterTimings,
trackedTimings,
simulatedMatchMs,
};
}
async function runSelfPlaySuite(): Promise<{ results: SelfPlayMatchResult[]; wallClockMs: number; productionTimings: number[] }> {
const startedAt = performance.now();
async function runSelfPlaySuite(
suite: SelfPlaySuiteConfig,
): Promise<{ results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] }> {
const results: SelfPlayMatchResult[] = [];
const productionTimings: number[] = [];
const totalMatches = SELF_PLAY_MATCH_SEEDS.length * 2;
const trackedTeamTimings: number[] = [];
let suiteSimulatedMs = 0;
let completedMatches = 0;
logBenchmarkProgress(`Starting self-play suite (${totalMatches} seeded matches with seat swaps).`);
logBenchmarkProgress(`Starting ${suite.label} suite (${suite.requiredMatches} seeded matches with seat swaps).`);
for (const seed of SELF_PLAY_MATCH_SEEDS) {
for (const masterTeam of [0, 1] as const) {
const { result, timings } = await simulateSelfPlayMatch(seed, masterTeam);
for (const trackedTeam of SELF_PLAY_SEAT_SWAPS) {
const { result, trackedTimings, simulatedMatchMs } = await simulateSelfPlayMatch(suite, seed, trackedTeam);
results.push(result);
productionTimings.push(...timings);
trackedTeamTimings.push(...trackedTimings);
suiteSimulatedMs += simulatedMatchMs;
completedMatches++;
if (completedMatches === 1 || completedMatches % 4 === 0 || completedMatches === totalMatches) {
if (completedMatches === 1 || completedMatches % 25 === 0 || completedMatches === suite.requiredMatches) {
logBenchmarkProgress(
`Self-play ${completedMatches}/${totalMatches}: seed ${seed}, master team ${masterTeam}, result ${result.masterResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.masterMaxSimulatedDecisionMs)}.`,
`${suite.label} ${completedMatches}/${suite.requiredMatches}: seed ${seed}, tracked team ${trackedTeam}, result ${result.trackedResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.trackedMaxSimulatedDecisionMs)}.`,
);
}
}
@@ -506,38 +622,157 @@ async function runSelfPlaySuite(): Promise<{ results: SelfPlayMatchResult[]; wal
return {
results,
wallClockMs: performance.now() - startedAt,
productionTimings,
suiteSimulatedMs,
trackedTeamTimings,
};
}
/**
 * Builds the report entry for one self-play suite from its raw run output.
 *
 * Outcome counts are taken from each match's `trackedResult`. Win and loss
 * rates are fractions of the recorded matches and fall back to `0` when the
 * run produced no matches. Per-seed aggregates, dual-loss seeds and the
 * watchlist intersection are delegated to `summarizeSelfPlayBySeed`.
 *
 * `seatBalanced` is derived from the results instead of being asserted: it is
 * `true` only when every seed was played the same number of times with the
 * tracked team as team 0 and as team 1 (vacuously `true` for an empty run).
 *
 * @param suite - Configuration of the suite that produced `run`.
 * @param run - Match results, total simulated time and tracked-team decision
 *   timings returned by the suite run.
 * @returns The summary object for the suite; `run.results` is referenced, not copied.
 */
function buildSelfPlaySuiteSummary(
  suite: SelfPlaySuiteConfig,
  run: { results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] },
): SelfPlaySuiteSummary {
  const tally = { win: 0, loss: 0, draw: 0 };
  // Net seat count per seed: +1 per match tracked as team 0, -1 per match tracked as team 1.
  const netSeatsBySeed = new Map<number, number>();
  for (const result of run.results) {
    tally[result.trackedResult]++;
    const delta = result.trackedTeam === 0 ? 1 : -1;
    netSeatsBySeed.set(result.seed, (netSeatsBySeed.get(result.seed) ?? 0) + delta);
  }
  const seatBalanced = Array.from(netSeatsBySeed.values()).every(net => net === 0);

  const matches = run.results.length;
  const { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection } = summarizeSelfPlayBySeed(run.results);

  return {
    suite: suite.id,
    label: suite.label,
    trackedTeamDifficulty: suite.trackedTeamDifficulty,
    opponentDifficulty: suite.opponentDifficulty,
    matches,
    requiredMatches: suite.requiredMatches,
    seedCount: SELF_PLAY_MATCH_SEEDS.length,
    seatBalanced,
    wins: tally.win,
    losses: tally.loss,
    draws: tally.draw,
    // Guard against division by zero when the run recorded no matches.
    winRate: matches === 0 ? 0 : tally.win / matches,
    lossRate: matches === 0 ? 0 : tally.loss / matches,
    perSeed,
    dualLossSeeds,
    regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
    regressionWatchlistDualLossIntersection,
    simulatedTiming: {
      suiteSimulatedMs: run.suiteSimulatedMs,
      trackedTeamDecisions: summarizeTimings(run.trackedTeamTimings),
    },
    results: run.results,
  };
}
/**
 * Evaluates the mirror-parity gate for a suite summary.
 *
 * The gate passes when the suite played exactly the required number of
 * matches and the tracked team's win rate lies within the configured
 * tolerance around the target win rate (bounds inclusive).
 *
 * @param summary - Summary of the mirror-parity suite.
 * @returns The gate record with its thresholds and individual check results.
 */
function createMirrorParityGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const {
    mirrorMatchTarget: requiredMatches,
    mirrorTargetWinRate: targetWinRate,
    mirrorWinRateTolerance: tolerance,
  } = ITERATION_6_GATE;
  const { matches, wins, losses, draws, winRate } = summary;

  const minWinRate = targetWinRate - tolerance;
  const maxWinRate = targetWinRate + tolerance;
  const matchCountPassed = matches === requiredMatches;
  const winRatePassed = minWinRate <= winRate && winRate <= maxWinRate;

  return {
    matches,
    requiredMatches,
    wins,
    losses,
    draws,
    winRate,
    targetWinRate,
    tolerance,
    minWinRate,
    maxWinRate,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}
/**
 * Evaluates the beginner-dominance gate for a suite summary.
 *
 * The gate passes when the suite played exactly the required number of
 * matches and the tracked team's win rate reaches the configured minimum.
 * This gate has no target, tolerance or upper bound, so those fields are `null`.
 *
 * @param summary - Summary of the beginner-dominance suite.
 * @returns The gate record with its thresholds and individual check results.
 */
function createBeginnerDominanceGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const { beginnerMatchTarget: requiredMatches, beginnerMinWinRate: minWinRate } = ITERATION_6_GATE;
  const { matches, wins, losses, draws, winRate } = summary;

  const matchCountPassed = matches === requiredMatches;
  const winRatePassed = winRate >= minWinRate;

  return {
    matches,
    requiredMatches,
    wins,
    losses,
    draws,
    winRate,
    targetWinRate: null,
    tolerance: null,
    minWinRate,
    maxWinRate: null,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}
/**
 * Formats per-seed win/loss/draw aggregates as a single line.
 *
 * @param perSeed - Aggregates to render, in output order.
 * @returns Entries of the form `seed:{wins}W-{losses}L-{draws}D` joined by
 *   ` | `, or an empty string when there are no aggregates.
 */
function formatPerSeedAggregates(perSeed: SelfPlaySeedAggregateResult[]): string {
  const entries: string[] = [];
  for (const aggregate of perSeed) {
    const record = `${aggregate.wins}W-${aggregate.losses}L-${aggregate.draws}D`;
    entries.push(`${aggregate.seed}:${record}`);
  }
  return entries.join(' | ');
}
function printReadableSummary(summary: AIBenchmarkSummary): void {
const mirror = summary.selfPlaySuites.mirrorParity;
const beginner = summary.selfPlaySuites.beginnerDominance;
console.log('AI quality benchmark');
console.log(`Iteration 5 quality gate: ${summary.qualityGate.passed ? 'PASS' : 'FAIL'}`);
console.log(`Fixed-fixture gate: ${summary.qualityGate.fixedFixtures.actual}/${summary.qualityGate.fixedFixtures.total} agreements (target ${summary.qualityGate.fixedFixtures.required}/${summary.qualityGate.fixedFixtures.total}).`);
console.log(`Critical concept gate: ${summary.qualityGate.criticalConcepts.actual}/${summary.qualityGate.criticalConcepts.total} passes (target ${summary.qualityGate.criticalConcepts.required}/${summary.qualityGate.criticalConcepts.total}).`);
console.log(`Self-play gate: ${summary.qualityGate.selfPlay.matches}/${summary.qualityGate.selfPlay.requiredMatches} matches, ${summary.qualityGate.selfPlay.wins}/${summary.qualityGate.selfPlay.matches} wins (target ${summary.qualityGate.selfPlay.requiredWins}), ${summary.qualityGate.selfPlay.losses}/${summary.qualityGate.selfPlay.matches} losses (max ${summary.qualityGate.selfPlay.maxLosses}), ${summary.qualityGate.selfPlay.draws} draws.`);
console.log(`Iteration 6 quality gate: ${summary.qualityGate.passed ? 'PASS' : 'FAIL'}`);
console.log(`Fixture totals: ${summary.fixtureTotals.fixtures} total, ${summary.fixtureTotals.criticalFixtures} critical.`);
console.log(`Fixed-fixture gate: ${summary.qualityGate.fixedFixtures.actual}/${summary.qualityGate.fixedFixtures.total} agreements.`);
console.log(`Critical concept gate: ${summary.qualityGate.criticalConcepts.actual}/${summary.qualityGate.criticalConcepts.total} passes.`);
console.log(
`Mirror parity gate: ${summary.qualityGate.mirrorParity.matches}/${summary.qualityGate.mirrorParity.requiredMatches} matches, ${formatPercentage(summary.qualityGate.mirrorParity.winRate)} tracked-team win rate (target ${formatPercentage(summary.qualityGate.mirrorParity.targetWinRate ?? 0)} +/- ${formatPercentage(summary.qualityGate.mirrorParity.tolerance ?? 0)}).`,
);
console.log(
`Beginner dominance gate: ${summary.qualityGate.beginnerDominance.matches}/${summary.qualityGate.beginnerDominance.requiredMatches} matches, ${formatPercentage(summary.qualityGate.beginnerDominance.winRate)} master win rate (target >= ${formatPercentage(summary.qualityGate.beginnerDominance.minWinRate ?? 0)}).`,
);
if (summary.fixedSuite.fixedFixtureAgreementFailures.length > 0) {
console.log(`Fixed-fixture agreement failures: ${summary.fixedSuite.fixedFixtureAgreementFailures.join(', ')}`);
}
if (summary.fixedSuite.criticalPassFailures.length > 0) {
console.log(`Critical concept failures: ${summary.fixedSuite.criticalPassFailures.join(', ')}`);
}
console.log(`Per-seed outcomes: ${summary.selfPlay.perSeed.map(seed => `${seed.seed}:${seed.wins}W-${seed.losses}L-${seed.draws}D`).join(' | ')}`);
console.log(`Dual-loss seeds: ${summary.selfPlay.dualLossSeeds.length > 0 ? summary.selfPlay.dualLossSeeds.join(', ') : 'none'}`);
console.log(`Regression watchlist intersection: ${summary.selfPlay.regressionWatchlistDualLossIntersection.length > 0 ? summary.selfPlay.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${summary.selfPlay.regressionWatchlist.join(', ')})`);
console.log(`Master simulated timing: avg ${summary.timing.productionMasterSimulatedDecisions.averageMs.toFixed(1)} ms, p95 ${summary.timing.productionMasterSimulatedDecisions.p95Ms.toFixed(1)} ms, max ${summary.timing.productionMasterSimulatedDecisions.maxMs.toFixed(1)} ms.`);
console.log(`Mirror per-seed aggregates: ${formatPerSeedAggregates(mirror.perSeed)}`);
console.log(`Mirror dual-loss seeds: ${mirror.dualLossSeeds.length > 0 ? mirror.dualLossSeeds.join(', ') : 'none'}`);
console.log(
`Mirror regression watchlist intersection: ${mirror.regressionWatchlistDualLossIntersection.length > 0 ? mirror.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${mirror.regressionWatchlist.join(', ')})`,
);
console.log(`Beginner per-seed aggregates: ${formatPerSeedAggregates(beginner.perSeed)}`);
console.log(`Beginner dual-loss seeds: ${beginner.dualLossSeeds.length > 0 ? beginner.dualLossSeeds.join(', ') : 'none'}`);
console.log(
`Beginner regression watchlist intersection: ${beginner.regressionWatchlistDualLossIntersection.length > 0 ? beginner.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${beginner.regressionWatchlist.join(', ')})`,
);
console.log(
`Fixed suite simulated duration: production ${formatDurationMs(summary.fixedSuite.simulatedTiming.productionSuiteSimulatedMs)}, reference ${formatDurationMs(summary.fixedSuite.simulatedTiming.referenceSuiteSimulatedMs)}.`,
);
console.log(`Mirror suite simulated duration: ${formatDurationMs(mirror.simulatedTiming.suiteSimulatedMs)}.`);
console.log(`Beginner suite simulated duration: ${formatDurationMs(beginner.simulatedTiming.suiteSimulatedMs)}.`);
console.log(
`Simulated timing: fixed production avg ${summary.timing.fixedFixtureProductionMasterDecisions.averageMs.toFixed(1)} ms, fixed reference avg ${summary.timing.fixedFixtureReferenceMasterDecisions.averageMs.toFixed(1)} ms, mirror tracked avg ${summary.timing.mirrorTrackedTeamSimulatedDecisions.averageMs.toFixed(1)} ms, beginner tracked avg ${summary.timing.beginnerTrackedTeamSimulatedDecisions.averageMs.toFixed(1)} ms, aggregate avg ${summary.timing.allTrackedProductionSimulatedDecisions.averageMs.toFixed(1)} ms.`,
);
console.log('BENCHMARK_SUMMARY');
console.log(JSON.stringify(summary, null, 2));
}
export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
assertIteration5BenchmarkContract();
logBenchmarkProgress('Benchmark started. Running fixed fixtures first, then self-play.');
assertIteration6BenchmarkContract();
logBenchmarkProgress('Benchmark started. Running fixed fixtures, mirror parity, then beginner dominance.');
const fixedSuite = await runFixedFixtureSuite();
logBenchmarkProgress(`Fixed fixture suite complete in ${formatDurationMs(fixedSuite.wallClockMs)} wall-clock.`);
const selfPlay = await runSelfPlaySuite();
logBenchmarkProgress(`Self-play suite complete in ${formatDurationMs(selfPlay.wallClockMs)} wall-clock.`);
logBenchmarkProgress(
`Fixed fixture suite complete with production ${formatDurationMs(fixedSuite.productionSuiteSimulatedMs)} simulated and reference ${formatDurationMs(fixedSuite.referenceSuiteSimulatedMs)} simulated.`,
);
const mirrorRun = await runSelfPlaySuite(SELF_PLAY_SUITES['mirror-parity']);
logBenchmarkProgress(`Mirror parity suite complete in ${formatDurationMs(mirrorRun.suiteSimulatedMs)} simulated.`);
const beginnerRun = await runSelfPlaySuite(SELF_PLAY_SUITES['beginner-dominance']);
logBenchmarkProgress(`Beginner dominance suite complete in ${formatDurationMs(beginnerRun.suiteSimulatedMs)} simulated.`);
const mirrorSummary = buildSelfPlaySuiteSummary(SELF_PLAY_SUITES['mirror-parity'], mirrorRun);
const beginnerSummary = buildSelfPlaySuiteSummary(SELF_PLAY_SUITES['beginner-dominance'], beginnerRun);
const mirrorParityGate = createMirrorParityGate(mirrorSummary);
const beginnerDominanceGate = createBeginnerDominanceGate(beginnerSummary);
const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
const fixedFixtureAgreements = fixedSuite.results.filter(result => result.matchesReference).length;
const expectedPasses = fixedSuite.results.filter(result => result.expectedPass).length;
@@ -548,76 +783,73 @@ export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
const criticalPassFailures = fixedSuite.results
.filter(result => result.conceptGatePass === false)
.map(result => result.fixtureId);
const wins = selfPlay.results.filter(result => result.masterResult === 'win').length;
const losses = selfPlay.results.filter(result => result.masterResult === 'loss').length;
const draws = selfPlay.results.filter(result => result.masterResult === 'draw').length;
const { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection } = summarizeSelfPlayBySeed(selfPlay.results);
const productionMasterSimulatedDecisions = summarizeTimings([
const fixedFixtureProductionMasterDecisions = summarizeTimings(fixedSuite.productionTimings);
const fixedFixtureReferenceMasterDecisions = summarizeTimings(fixedSuite.referenceTimings);
const mirrorTrackedTeamSimulatedDecisions = mirrorSummary.simulatedTiming.trackedTeamDecisions;
const beginnerTrackedTeamSimulatedDecisions = beginnerSummary.simulatedTiming.trackedTeamDecisions;
const allTrackedProductionSimulatedDecisions = summarizeTimings([
...fixedSuite.productionTimings,
...selfPlay.productionTimings,
...mirrorRun.trackedTeamTimings,
...beginnerRun.trackedTeamTimings,
]);
const fixedFixtureGate: GateCountSummary = {
actual: fixedFixtureAgreements,
required: ITERATION_5_GATE.fixedFixtureAgreementTarget,
required: AI_BENCHMARK_FIXTURES.length,
total: AI_BENCHMARK_FIXTURES.length,
passed: fixedFixtureAgreements === ITERATION_5_GATE.fixedFixtureAgreementTarget,
passed: fixedFixtureAgreements === AI_BENCHMARK_FIXTURES.length,
};
const criticalConceptGate: GateCountSummary = {
actual: criticalPasses,
required: ITERATION_5_GATE.criticalConceptTarget,
required: criticalFixtureCount,
total: criticalFixtureCount,
passed: criticalPasses === ITERATION_5_GATE.criticalConceptTarget,
};
const selfPlayGate: SelfPlayGateSummary = {
matches: selfPlay.results.length,
requiredMatches: ITERATION_5_GATE.selfPlayMatchTarget,
wins,
requiredWins: ITERATION_5_GATE.selfPlayWinTarget,
losses,
maxLosses: ITERATION_5_GATE.selfPlayMaxLosses,
draws,
matchCountPassed: selfPlay.results.length === ITERATION_5_GATE.selfPlayMatchTarget,
winGatePassed: wins >= ITERATION_5_GATE.selfPlayWinTarget,
lossGatePassed: losses <= ITERATION_5_GATE.selfPlayMaxLosses,
passed: selfPlay.results.length === ITERATION_5_GATE.selfPlayMatchTarget
&& wins >= ITERATION_5_GATE.selfPlayWinTarget
&& losses <= ITERATION_5_GATE.selfPlayMaxLosses,
passed: criticalPasses === criticalFixtureCount,
};
return {
benchmark: 'ai-quality',
qualityGate: {
iteration: 5,
passed: fixedFixtureGate.passed && criticalConceptGate.passed && selfPlayGate.passed,
iteration: 6,
passed: fixedFixtureGate.passed
&& criticalConceptGate.passed
&& mirrorParityGate.passed
&& beginnerDominanceGate.passed,
fixedFixtures: fixedFixtureGate,
criticalConcepts: criticalConceptGate,
selfPlay: selfPlayGate,
mirrorParity: mirrorParityGate,
beginnerDominance: beginnerDominanceGate,
},
fixtureCount: AI_BENCHMARK_FIXTURES.length,
criticalFixtureCount,
fixtureTotals: {
fixtures: AI_BENCHMARK_FIXTURES.length,
criticalFixtures: criticalFixtureCount,
},
fixedSuite: {
fixedFixtureAgreements,
expectedPasses,
criticalPasses,
fixedFixtureAgreementFailures,
criticalPassFailures,
simulatedTiming: {
productionSuiteSimulatedMs: fixedSuite.productionSuiteSimulatedMs,
referenceSuiteSimulatedMs: fixedSuite.referenceSuiteSimulatedMs,
productionMasterDecisions: fixedFixtureProductionMasterDecisions,
referenceMasterDecisions: fixedFixtureReferenceMasterDecisions,
},
results: fixedSuite.results,
},
selfPlay: {
matches: selfPlay.results.length,
wins,
losses,
draws,
winRate: selfPlay.results.length === 0 ? 0 : wins / selfPlay.results.length,
lossRate: selfPlay.results.length === 0 ? 0 : losses / selfPlay.results.length,
perSeed,
dualLossSeeds,
regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
regressionWatchlistDualLossIntersection,
results: selfPlay.results,
selfPlaySuites: {
totalMatches: mirrorSummary.matches + beginnerSummary.matches,
mirrorParity: mirrorSummary,
beginnerDominance: beginnerSummary,
},
timing: {
productionMasterSimulatedDecisions,
fixedFixtureProductionMasterDecisions,
fixedFixtureReferenceMasterDecisions,
mirrorTrackedTeamSimulatedDecisions,
beginnerTrackedTeamSimulatedDecisions,
allTrackedProductionSimulatedDecisions,
},
referenceProfile: REFERENCE_PROFILE,
};
@@ -625,7 +857,7 @@ export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
async function runBenchmarkCli(): Promise<void> {
const summary = await runAIBenchmark();
logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 5 gate results.');
logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 6 gate results.');
printReadableSummary(summary);
}

File diff suppressed because it is too large Load Diff