import { applyMove, cloneState, createInitialState, getMatchOutcome, nextPlayer, teamOf } from './engine';
import { AITimingSource, AIMove, AISearchProfileOverride, chooseMove } from './ai';
import { chooseMove as chooseMoveOld } from './ai-legacy';
import { CardInferenceEngine } from './card-inference';
import {
  AI_BENCHMARK_FIXTURES,
  AIBenchmarkCriticalConcept,
  AIBenchmarkExpectedMove,
  isCriticalAIBenchmarkFixture,
} from './ai-benchmark-fixtures';
import { CardTracker } from './card-tracker';
import { Difficulty, GameState, PlayerIndex } from './types';

// ---------------------------------------------------------------------------
// Formatting and logging helpers
// ---------------------------------------------------------------------------

/**
 * Renders a duration for log output: whole milliseconds below one second,
 * seconds with two decimals otherwise.
 */
function formatDurationMs(durationMs: number): string {
  if (durationMs < 1000) {
    return `${durationMs.toFixed(0)} ms`;
  }
  return `${(durationMs / 1000).toFixed(2)} s`;
}

/** Renders a ratio (e.g. 0.5) as a percentage with one decimal (e.g. "50.0%"). */
function formatPercentage(value: number): string {
  return `${(value * 100).toFixed(1)}%`;
}

/** Writes a progress line to stdout, prefixed with the benchmark tag. */
function logBenchmarkProgress(message: string): void {
  console.log(`[ai-benchmark] ${message}`);
}

// ---------------------------------------------------------------------------
// Result and summary shapes
// ---------------------------------------------------------------------------

/** Outcome of evaluating one fixed fixture with the production and reference profiles. */
interface FixedFixtureResult {
  fixtureId: string;
  name: string;
  tags: string[];
  criticalConcept: AIBenchmarkCriticalConcept | null;
  /** `moveKey` of the move chosen with the default (production) profile. */
  productionMove: string;
  /** `moveKey` of the move chosen with `REFERENCE_PROFILE`. */
  referenceMove: string;
  matchesReference: boolean;
  /** Whether the production move matches the fixture's expected move. */
  expectedPass: boolean;
  /** Same check as `expectedPass`, but only for critical fixtures; `null` otherwise. */
  conceptGatePass: boolean | null;
  productionSimulatedMs: number;
  referenceSimulatedMs: number;
}

type SelfPlaySuiteId = 'mirror-parity' | 'beginner-dominance';

/** Outcome of one seeded self-play match from the tracked team's point of view. */
interface SelfPlayMatchResult {
  suite: SelfPlaySuiteId;
  seed: number;
  dealer: PlayerIndex;
  trackedTeam: 0 | 1;
  trackedTeamDifficulty: Difficulty;
  opponentDifficulty: Difficulty;
  winner: 0 | 1 | null;
  trackedResult: 'win' | 'loss' | 'draw';
  rounds: number;
  /** True when the match was stopped at `MAX_SELF_PLAY_ROUNDS` while still undecided. */
  truncated: boolean;
  totalPoints: [number, number];
  trackedDecisionCount: number;
  trackedAverageSimulatedDecisionMs: number;
  trackedMaxSimulatedDecisionMs: number;
}

interface TimingSummary {
  count: number;
  averageMs: number;
  p95Ms: number;
  maxMs: number;
}

interface GateCountSummary {
  actual: number;
  required: number;
  total: number;
  passed: boolean;
}

interface WinRateGateSummary {
  matches: number;
  requiredMatches: number;
  wins: number;
  losses: number;
  draws: number;
  winRate: number;
  targetWinRate: number | null;
  tolerance: number | null;
  minWinRate: number | null;
  maxWinRate: number | null;
  matchCountPassed: boolean;
  winRatePassed: boolean;
  passed: boolean;
}

interface SelfPlaySeedSeatResult {
  trackedTeam: 0 | 1;
  trackedResult: 'win' | 'loss' | 'draw';
  winner: 0 | 1 | null;
  rounds: number;
  truncated: boolean;
  totalPoints: [number, number];
}

/** Per-seed roll-up of all seat-swapped matches played on that seed. */
interface SelfPlaySeedAggregateResult {
  seed: number;
  matches: number;
  wins: number;
  losses: number;
  draws: number;
  /** True when the tracked team lost at least two matches on this seed. */
  dualLoss: boolean;
  seatResults: SelfPlaySeedSeatResult[];
}

interface SelfPlaySuiteSummary {
  suite: SelfPlaySuiteId;
  label: string;
  trackedTeamDifficulty: Difficulty;
  opponentDifficulty: Difficulty;
  matches: number;
  requiredMatches: number;
  seedCount: number;
  seatBalanced: boolean;
  wins: number;
  losses: number;
  draws: number;
  winRate: number;
  lossRate: number;
  perSeed: SelfPlaySeedAggregateResult[];
  dualLossSeeds: number[];
  regressionWatchlist: number[];
  regressionWatchlistDualLossIntersection: number[];
  simulatedTiming: {
    suiteSimulatedMs: number;
    trackedTeamDecisions: TimingSummary;
  };
  results: SelfPlayMatchResult[];
}

/** Full machine-readable benchmark report returned by `runAIBenchmark`. */
export interface AIBenchmarkSummary {
  benchmark: 'ai-quality';
  qualityGate: {
    iteration: 6;
    passed: boolean;
    fixedFixtures: GateCountSummary;
    criticalConcepts: GateCountSummary;
    mirrorParity: WinRateGateSummary;
    beginnerDominance: WinRateGateSummary;
  };
  fixtureCount: number;
  criticalFixtureCount: number;
  fixtureTotals: {
    fixtures: number;
    criticalFixtures: number;
  };
  fixedSuite: {
    fixedFixtureAgreements: number;
    expectedPasses: number;
    criticalPasses: number;
    fixedFixtureAgreementFailures: string[];
    criticalPassFailures: string[];
    simulatedTiming: {
      productionSuiteSimulatedMs: number;
      referenceSuiteSimulatedMs: number;
      productionMasterDecisions: TimingSummary;
      referenceMasterDecisions: TimingSummary;
    };
    results: FixedFixtureResult[];
  };
  selfPlaySuites: {
    totalMatches: number;
    mirrorParity: SelfPlaySuiteSummary;
    beginnerDominance: SelfPlaySuiteSummary;
  };
  timing: {
    fixedFixtureProductionMasterDecisions: TimingSummary;
    fixedFixtureReferenceMasterDecisions: TimingSummary;
    mirrorTrackedTeamSimulatedDecisions: TimingSummary;
    beginnerTrackedTeamSimulatedDecisions: TimingSummary;
    allTrackedProductionSimulatedDecisions: TimingSummary;
  };
  referenceProfile: Required<AISearchProfileOverride>;
}

// ---------------------------------------------------------------------------
// Benchmark configuration
// ---------------------------------------------------------------------------

/** Thresholds applied by the iteration 6 quality gate. */
const ITERATION_6_GATE = {
  mirrorMatchTarget: 500,
  beginnerMatchTarget: 500,
  mirrorTargetWinRate: 0.5,
  mirrorWinRateTolerance: 0.05,
  beginnerMinWinRate: 0.7,
} as const;

/** Seeds that are intersected with the dual-loss seeds when summarising a self-play suite. */
const KNOWN_REGRESSION_WATCHLIST = [1000, 1002, 1004, 1006, 1012, 1013, 1014] as const;

// Typed as Set<number> so arbitrary seeds can be looked up; without the explicit
// type argument the set would only accept the literal watchlist values.
const KNOWN_REGRESSION_WATCHLIST_SET = new Set<number>(KNOWN_REGRESSION_WATCHLIST);

/** Search profile override used for the "reference" decision in the fixed fixture suite. */
const REFERENCE_PROFILE: Required<AISearchProfileOverride> = {
  timeBudgetMs: 9000,
  sampleCount: 12,
  maxDepth: 7,
  batchSize: 2,
};

/** Each seed is played once with team 0 tracked and once with team 1 tracked. */
const SELF_PLAY_SEAT_SWAPS = [0, 1] as const;

/** Seeds 1000..1249. */
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 250 }, (_, index) => 1000 + index);

/** Hard cap on rounds per simulated match; undecided matches are cut off here. */
const MAX_SELF_PLAY_ROUNDS = 20;

/** Seeds 2000..2099. */
const HEAD_TO_HEAD_SEEDS = Array.from({ length: 100 }, (_, i) => 2000 + i);
const HEAD_TO_HEAD_SEAT_SWAPS = [0, 1] as const;
const HEAD_TO_HEAD_MASTER_TARGET_WIN_RATE = 0.60;
const HEAD_TO_HEAD_ADVANCED_TARGET_WIN_RATE = 0.55;

interface SelfPlaySuiteConfig {
  id: SelfPlaySuiteId;
  label: string;
  /** Mixed into every RNG seed so suites do not share random streams. */
  suiteSeedKey: number;
  requiredMatches: number;
  trackedTeamDifficulty: Difficulty;
  opponentDifficulty: Difficulty;
  /** Returns the difficulty for team 0 and team 1, given which team is tracked. */
  getTeamDifficulties(trackedTeam: 0 | 1): readonly [Difficulty, Difficulty];
}

const SELF_PLAY_SUITES: Record<SelfPlaySuiteId, SelfPlaySuiteConfig> = {
  'mirror-parity': {
    id: 'mirror-parity',
    label: 'Master mirror parity',
    suiteSeedKey: 0x4d31,
    requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
    trackedTeamDifficulty: 'master',
    opponentDifficulty: 'master',
    getTeamDifficulties: () => ['master', 'master'],
  },
  'beginner-dominance': {
    id: 'beginner-dominance',
    label: 'Master versus beginner dominance',
    suiteSeedKey: 0x4236,
    requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
    trackedTeamDifficulty: 'master',
    opponentDifficulty: 'beginner',
    getTeamDifficulties: trackedTeam => (trackedTeam === 0 ? ['master', 'beginner'] : ['beginner', 'master']),
  },
};

/**
 * Sanity-checks the static benchmark configuration before any work is done.
 *
 * @throws Error when there are no fixtures, no critical fixtures, or when the
 *   seed/seat-swap counts do not multiply out to the match targets in
 *   `ITERATION_6_GATE` and in each suite config.
 */
function assertIteration6BenchmarkContract(): void {
  const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
  const expectedSeatBalancedMatches = SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length;

  if (AI_BENCHMARK_FIXTURES.length === 0) {
    throw new Error('Iteration 6 benchmark requires at least one fixed fixture.');
  }

  if (criticalFixtureCount === 0) {
    throw new Error('Iteration 6 benchmark requires at least one critical concept fixture.');
  }

  if (expectedSeatBalancedMatches !== ITERATION_6_GATE.mirrorMatchTarget) {
    throw new Error(
      `Iteration 6 benchmark expects ${ITERATION_6_GATE.mirrorMatchTarget} mirror matches, received ${expectedSeatBalancedMatches}.`,
    );
  }

  if (expectedSeatBalancedMatches !== ITERATION_6_GATE.beginnerMatchTarget) {
    throw new Error(
      `Iteration 6 benchmark expects ${ITERATION_6_GATE.beginnerMatchTarget} beginner-dominance matches, received ${expectedSeatBalancedMatches}.`,
    );
  }

  for (const suite of Object.values(SELF_PLAY_SUITES)) {
    if (suite.requiredMatches !== expectedSeatBalancedMatches) {
      throw new Error(
        `Iteration 6 benchmark expects ${expectedSeatBalancedMatches} matches for ${suite.id}, received ${suite.requiredMatches}.`,
      );
    }
  }
}

// ---------------------------------------------------------------------------
// Deterministic timing and randomness
// ---------------------------------------------------------------------------

interface SimulatedBenchmarkTimingSource extends AITimingSource {
  /** Total simulated time advanced since the source was created. */
  getElapsedMs(): number;
}

/**
 * Creates a timing source whose clock only moves when `advance` is called,
 * so measured decision times do not depend on wall-clock time.
 *
 * @param startMs Initial clock value; `getElapsedMs` is measured relative to it.
 */
function createSimulatedBenchmarkTimingSource(startMs = 0): SimulatedBenchmarkTimingSource {
  let currentMs = startMs;

  return {
    isSimulated: true,
    now: () => currentMs,
    advance: (elapsedMs: number) => {
      currentMs += elapsedMs;
      return currentMs;
    },
    getElapsedMs: () => currentMs - startMs,
  };
}

/**
 * Folds the given numbers into one unsigned 32-bit seed.
 *
 * Starts from the 32-bit FNV offset basis and, for each part, XORs the part
 * (coerced to uint32) and then multiplies by the 32-bit FNV prime.
 */
function seedFromParts(...parts: number[]): number {
  let hash = 2166136261;
  for (const part of parts) {
    hash ^= part >>> 0;
    hash = Math.imul(hash, 16777619);
  }
  return hash >>> 0;
}

/**
 * Creates a seeded pseudo-random generator.
 *
 * @param seed Coerced to an unsigned 32-bit integer.
 * @returns A function that yields a number in [0, 1) on each call; the same
 *   seed always produces the same sequence.
 */
function createMulberry32(seed: number): () => number {
  let state = seed >>> 0;

  return () => {
    state = (state + 0x6d2b79f5) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };
}

// ---------------------------------------------------------------------------
// Move helpers
// ---------------------------------------------------------------------------

/**
 * Builds a string key for a move: the played card id, then the sorted ids of
 * the captured cards. Capture order does not affect the key.
 */
function moveKey(move: AIMove): string {
  return `${move.card.id}|${move.capture.map(card => card.id).sort().join(',')}`;
}

function otherTeam(team: 0 | 1): 0 | 1 {
  return team === 0 ? 1 : 0;
}

/** Builds a fresh tracker and feeds it every card found in any player's pile. */
function createTrackerForState(state: GameState): CardTracker {
  const tracker = new CardTracker();

  for (const player of state.players) {
    for (const card of player.pile) {
      tracker.trackPlay(card);
    }
  }

  return tracker;
}

/**
 * Checks a chosen move against a fixture expectation.
 *
 * The played card must match. The capture set is compared (order-insensitively)
 * only when the expectation specifies `captureIds`.
 */
function matchesExpectedMove(move: AIMove, expected: AIBenchmarkExpectedMove): boolean {
  if (move.card.id !== expected.cardId) return false;
  if (!expected.captureIds) return true;

  const actualCapture = move.capture.map(card => card.id).sort().join(',');
  const expectedCapture = [...expected.captureIds].sort().join(',');
  return actualCapture === expectedCapture;
}

// ---------------------------------------------------------------------------
// Fixed fixture suite
// ---------------------------------------------------------------------------

/**
 * Evaluates every fixture twice at 'master' difficulty: once with the default
 * profile ("production") and once with `REFERENCE_PROFILE` ("reference").
 *
 * Each decision runs on its own cloned state, tracker, seeded RNG and simulated
 * timing source, so the two decisions cannot influence each other.
 */
async function runFixedFixtureSuite(): Promise<{
  results: FixedFixtureResult[];
  productionSuiteSimulatedMs: number;
  referenceSuiteSimulatedMs: number;
  productionTimings: number[];
  referenceTimings: number[];
}> {
  const results: FixedFixtureResult[] = [];
  const productionTimings: number[] = [];
  const referenceTimings: number[] = [];

  logBenchmarkProgress(`Starting fixed fixture suite (${AI_BENCHMARK_FIXTURES.length} positions).`);

  for (let index = 0; index < AI_BENCHMARK_FIXTURES.length; index++) {
    const fixture = AI_BENCHMARK_FIXTURES[index];
    const productionState = cloneState(fixture.state);
    const referenceState = cloneState(fixture.state);
    const productionTracker = createTrackerForState(productionState);
    const referenceTracker = createTrackerForState(referenceState);
    // The last seed part (0 / 1) gives the two decisions different RNG streams.
    const productionSeed = seedFromParts(0x0f1e2d3c, index, 0);
    const referenceSeed = seedFromParts(0x0f1e2d3c, index, 1);
    const productionTimingSource = createSimulatedBenchmarkTimingSource();
    const referenceTimingSource = createSimulatedBenchmarkTimingSource();

    const productionMove = await chooseMove(
      productionState,
      productionState.currentPlayer,
      'master',
      productionTracker,
      undefined,
      {
        rng: createMulberry32(productionSeed),
        timingSource: productionTimingSource,
      },
    );
    const productionSimulatedMs = productionTimingSource.getElapsedMs();

    const referenceMove = await chooseMove(
      referenceState,
      referenceState.currentPlayer,
      'master',
      referenceTracker,
      undefined,
      {
        rng: createMulberry32(referenceSeed),
        profileOverride: REFERENCE_PROFILE,
        timingSource: referenceTimingSource,
      },
    );
    const referenceSimulatedMs = referenceTimingSource.getElapsedMs();

    productionTimings.push(productionSimulatedMs);
    referenceTimings.push(referenceSimulatedMs);

    // Derive the keys and the expectation check once; they feed both the result
    // record and the progress line below.
    const productionMoveKey = moveKey(productionMove);
    const referenceMoveKey = moveKey(referenceMove);
    const matchesReference = productionMoveKey === referenceMoveKey;
    const expectedPass = matchesExpectedMove(productionMove, fixture.expectedMove);
    const conceptGatePass = isCriticalAIBenchmarkFixture(fixture) ? expectedPass : null;

    results.push({
      fixtureId: fixture.id,
      name: fixture.name,
      tags: [...fixture.tags],
      criticalConcept: fixture.criticalConcept,
      productionMove: productionMoveKey,
      referenceMove: referenceMoveKey,
      matchesReference,
      expectedPass,
      conceptGatePass,
      productionSimulatedMs,
      referenceSimulatedMs,
    });

    const progressLabel = `${index + 1}/${AI_BENCHMARK_FIXTURES.length}`;
    const matchLabel = matchesReference ? 'agreement' : 'divergence';
    logBenchmarkProgress(
      `Fixture ${progressLabel}: ${fixture.id} -> ${matchLabel}, production simulated ${formatDurationMs(productionSimulatedMs)}, reference simulated ${formatDurationMs(referenceSimulatedMs)}.`,
    );
  }

  return {
    results,
    productionSuiteSimulatedMs: sumTimings(productionTimings),
    referenceSuiteSimulatedMs: sumTimings(referenceTimings),
    productionTimings,
    referenceTimings,
  };
}

// ---------------------------------------------------------------------------
// Timing and per-seed aggregation
// ---------------------------------------------------------------------------

function sumTimings(samples: number[]): number {
  return samples.reduce((total, sample) => total + sample, 0);
}

/**
 * Computes count, mean, 95th percentile and maximum of the samples.
 * Returns all zeros for an empty input. The input array is not mutated.
 */
function summarizeTimings(samples: number[]): TimingSummary {
  if (samples.length === 0) {
    return {
      count: 0,
      averageMs: 0,
      p95Ms: 0,
      maxMs: 0,
    };
  }

  const sorted = [...samples].sort((left, right) => left - right);
  const sum = sorted.reduce((accumulator, value) => accumulator + value, 0);
  // Nearest-rank percentile, clamped into the valid index range.
  const p95Index = Math.min(sorted.length - 1, Math.max(0, Math.ceil(sorted.length * 0.95) - 1));

  return {
    count: sorted.length,
    averageMs: sum / sorted.length,
    p95Ms: sorted[p95Index],
    maxMs: sorted[sorted.length - 1],
  };
}

/**
 * Groups match results by seed and reports which seeds the tracked team lost
 * at least twice ("dual loss"), plus which of those are on the regression
 * watchlist. Output is sorted by seed; seat results are sorted by tracked team.
 */
function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
  perSeed: SelfPlaySeedAggregateResult[];
  dualLossSeeds: number[];
  regressionWatchlistDualLossIntersection: number[];
} {
  const aggregates = new Map<number, SelfPlaySeedAggregateResult>();

  for (const result of results) {
    const existing: SelfPlaySeedAggregateResult = aggregates.get(result.seed) ?? {
      seed: result.seed,
      matches: 0,
      wins: 0,
      losses: 0,
      draws: 0,
      dualLoss: false,
      seatResults: [],
    };

    existing.matches++;
    if (result.trackedResult === 'win') existing.wins++;
    else if (result.trackedResult === 'loss') existing.losses++;
    else existing.draws++;

    existing.seatResults.push({
      trackedTeam: result.trackedTeam,
      trackedResult: result.trackedResult,
      winner: result.winner,
      rounds: result.rounds,
      truncated: result.truncated,
      totalPoints: result.totalPoints,
    });

    aggregates.set(result.seed, existing);
  }

  const perSeed = [...aggregates.values()]
    .map(aggregate => ({
      ...aggregate,
      dualLoss: aggregate.losses >= 2,
      seatResults: [...aggregate.seatResults].sort((left, right) => left.trackedTeam - right.trackedTeam),
    }))
    .sort((left, right) => left.seed - right.seed);

  const dualLossSeeds = perSeed.filter(aggregate => aggregate.dualLoss).map(aggregate => aggregate.seed);
  const regressionWatchlistDualLossIntersection = dualLossSeeds.filter(seed => KNOWN_REGRESSION_WATCHLIST_SET.has(seed));

  return {
    perSeed,
    dualLossSeeds,
    regressionWatchlistDualLossIntersection,
  };
}

// ---------------------------------------------------------------------------
// Self-play suites
// ---------------------------------------------------------------------------

/**
 * Plays one full seeded match between the two teams of a suite.
 *
 * @param suite Suite configuration (seed key and per-team difficulties).
 * @param seed Match seed; also selects the initial dealer via `seed % 4`.
 * @param trackedTeam Team whose decisions are timed and whose result is reported.
 * @returns The match result, the tracked team's per-decision simulated timings,
 *   and the total simulated time spent on decisions by both teams.
 */
async function simulateSelfPlayMatch(
  suite: SelfPlaySuiteConfig,
  seed: number,
  trackedTeam: 0 | 1,
): Promise<{ result: SelfPlayMatchResult; trackedTimings: number[]; simulatedMatchMs: number }> {
  const initialDealer = (seed % 4) as PlayerIndex;
  const teamDifficulties = suite.getTeamDifficulties(trackedTeam);
  let state = createInitialState(initialDealer, createMulberry32(seedFromParts(suite.suiteSeedKey, seed, 1, 0)));
  const matchStartingPlayer = state.matchStartingPlayer;
  const tracker = new CardTracker();
  const trackedTimings: number[] = [];
  let simulatedMatchMs = 0;
  let rounds = 1;
  let truncated = false;
  // Not reset between rounds: it keeps per-decision RNG seeds unique within the match.
  let turnCount = 0;

  while (rounds <= MAX_SELF_PLAY_ROUNDS) {
    while (!state.roundOver) {
      const playerIdx = state.currentPlayer;
      const actingTeam = teamOf(playerIdx);
      const difficulty = teamDifficulties[actingTeam];
      const timingSource = createSimulatedBenchmarkTimingSource();
      const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, {
        rng: createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, turnCount, playerIdx)),
        timingSource,
      });
      const simulatedMs = timingSource.getElapsedMs();
      simulatedMatchMs += simulatedMs;

      if (actingTeam === trackedTeam) {
        trackedTimings.push(simulatedMs);
      }

      const { nextState, capture } = applyMove(
        state,
        playerIdx,
        move.card,
        move.capture.length > 0 ? move.capture : undefined,
      );

      tracker.trackPlay(move.card);
      if (capture) {
        tracker.trackCapture(capture.captured);
      }

      state = nextState;
      turnCount++;
    }

    const outcome = getMatchOutcome(state.teamScores);
    if (!outcome.continueMatch) {
      break;
    }

    if (rounds === MAX_SELF_PLAY_ROUNDS) {
      truncated = true;
      break;
    }

    // Start the next round: fresh deal and tracker, carrying over the running
    // totals and the match starting player.
    rounds++;
    const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
    const nextDealer = nextPlayer(state.dealer);
    tracker.reset();
    state = createInitialState(nextDealer, createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, 0)));
    state.matchStartingPlayer = matchStartingPlayer;
    state.teamScores[0].totalPoints = totals[0];
    state.teamScores[1].totalPoints = totals[1];
    state.roundNumber = rounds;
  }

  const outcome = getMatchOutcome(state.teamScores);
  const winner = outcome.winner;
  const timingSummary = summarizeTimings(trackedTimings);
  const opposingTeam = otherTeam(trackedTeam);
  const trackedResult = winner === null ? 'draw' : winner === trackedTeam ? 'win' : 'loss';

  return {
    result: {
      suite: suite.id,
      seed,
      dealer: initialDealer,
      trackedTeam,
      trackedTeamDifficulty: teamDifficulties[trackedTeam],
      opponentDifficulty: teamDifficulties[opposingTeam],
      winner,
      trackedResult,
      rounds,
      truncated,
      totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
      trackedDecisionCount: timingSummary.count,
      trackedAverageSimulatedDecisionMs: timingSummary.averageMs,
      trackedMaxSimulatedDecisionMs: timingSummary.maxMs,
    },
    trackedTimings,
    simulatedMatchMs,
  };
}

/**
 * Runs every seed of a suite with both seat assignments, sequentially, and
 * logs progress on the first match, every 25th match and the last match.
 */
async function runSelfPlaySuite(
  suite: SelfPlaySuiteConfig,
): Promise<{ results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] }> {
  const results: SelfPlayMatchResult[] = [];
  const trackedTeamTimings: number[] = [];
  let suiteSimulatedMs = 0;
  let completedMatches = 0;

  logBenchmarkProgress(`Starting ${suite.label} suite (${suite.requiredMatches} seeded matches with seat swaps).`);

  for (const seed of SELF_PLAY_MATCH_SEEDS) {
    for (const trackedTeam of SELF_PLAY_SEAT_SWAPS) {
      const { result, trackedTimings, simulatedMatchMs } = await simulateSelfPlayMatch(suite, seed, trackedTeam);
      results.push(result);
      trackedTeamTimings.push(...trackedTimings);
      suiteSimulatedMs += simulatedMatchMs;
      completedMatches++;

      if (completedMatches === 1 || completedMatches % 25 === 0 || completedMatches === suite.requiredMatches) {
        logBenchmarkProgress(
          `${suite.label} ${completedMatches}/${suite.requiredMatches}: seed ${seed}, tracked team ${trackedTeam}, result ${result.trackedResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.trackedMaxSimulatedDecisionMs)}.`,
        );
      }
    }
  }

  return {
    results,
    suiteSimulatedMs,
    trackedTeamTimings,
  };
}

/** Rolls a suite run up into counts, rates, per-seed aggregates and timing statistics. */
function buildSelfPlaySuiteSummary(
  suite: SelfPlaySuiteConfig,
  run: { results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] },
): SelfPlaySuiteSummary {
  const wins = run.results.filter(result => result.trackedResult === 'win').length;
  const losses = run.results.filter(result => result.trackedResult === 'loss').length;
  const draws = run.results.filter(result => result.trackedResult === 'draw').length;
  const { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection } = summarizeSelfPlayBySeed(run.results);

  return {
    suite: suite.id,
    label: suite.label,
    trackedTeamDifficulty: suite.trackedTeamDifficulty,
    opponentDifficulty: suite.opponentDifficulty,
    matches: run.results.length,
    requiredMatches: suite.requiredMatches,
    seedCount: SELF_PLAY_MATCH_SEEDS.length,
    seatBalanced: true,
    wins,
    losses,
    draws,
    // Draws count in the denominator, so winRate + lossRate can be below 1.
    winRate: run.results.length === 0 ? 0 : wins / run.results.length,
    lossRate: run.results.length === 0 ? 0 : losses / run.results.length,
    perSeed,
    dualLossSeeds,
    regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
    regressionWatchlistDualLossIntersection,
    simulatedTiming: {
      suiteSimulatedMs: run.suiteSimulatedMs,
      trackedTeamDecisions: summarizeTimings(run.trackedTeamTimings),
    },
    results: run.results,
  };
}

// ---------------------------------------------------------------------------
// Quality gates
// ---------------------------------------------------------------------------

/**
 * Mirror gate: passes when the exact match target was played and the tracked
 * team's win rate lies within target +/- tolerance (inclusive).
 */
function createMirrorParityGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const minWinRate = ITERATION_6_GATE.mirrorTargetWinRate - ITERATION_6_GATE.mirrorWinRateTolerance;
  const maxWinRate = ITERATION_6_GATE.mirrorTargetWinRate + ITERATION_6_GATE.mirrorWinRateTolerance;
  const matchCountPassed = summary.matches === ITERATION_6_GATE.mirrorMatchTarget;
  const winRatePassed = summary.winRate >= minWinRate && summary.winRate <= maxWinRate;

  return {
    matches: summary.matches,
    requiredMatches: ITERATION_6_GATE.mirrorMatchTarget,
    wins: summary.wins,
    losses: summary.losses,
    draws: summary.draws,
    winRate: summary.winRate,
    targetWinRate: ITERATION_6_GATE.mirrorTargetWinRate,
    tolerance: ITERATION_6_GATE.mirrorWinRateTolerance,
    minWinRate,
    maxWinRate,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}

/**
 * Beginner gate: passes when the exact match target was played and the tracked
 * team's win rate is at least the configured minimum. There is no upper bound.
 */
function createBeginnerDominanceGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const matchCountPassed = summary.matches === ITERATION_6_GATE.beginnerMatchTarget;
  const winRatePassed = summary.winRate >= ITERATION_6_GATE.beginnerMinWinRate;

  return {
    matches: summary.matches,
    requiredMatches: ITERATION_6_GATE.beginnerMatchTarget,
    wins: summary.wins,
    losses: summary.losses,
    draws: summary.draws,
    winRate: summary.winRate,
    targetWinRate: null,
    tolerance: null,
    minWinRate: ITERATION_6_GATE.beginnerMinWinRate,
    maxWinRate: null,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}

// ---------------------------------------------------------------------------
// Reporting
// ---------------------------------------------------------------------------

/** Formats per-seed results as "seed:xW-yL-zD" entries joined by " | ". */
function formatPerSeedAggregates(perSeed: SelfPlaySeedAggregateResult[]): string {
  return perSeed.map(seed => `${seed.seed}:${seed.wins}W-${seed.losses}L-${seed.draws}D`).join(' | ');
}

/**
 * Prints the human-readable report, then a `BENCHMARK_SUMMARY` marker line
 * followed by the full summary as pretty-printed JSON.
 */
function printReadableSummary(summary: AIBenchmarkSummary): void {
  const mirror = summary.selfPlaySuites.mirrorParity;
  const beginner = summary.selfPlaySuites.beginnerDominance;

  console.log('AI quality benchmark');
  console.log(`Iteration 6 quality gate: ${summary.qualityGate.passed ? 'PASS' : 'FAIL'}`);
  console.log(`Fixture totals: ${summary.fixtureTotals.fixtures} total, ${summary.fixtureTotals.criticalFixtures} critical.`);
  console.log(`Fixed-fixture gate: ${summary.qualityGate.fixedFixtures.actual}/${summary.qualityGate.fixedFixtures.total} agreements.`);
  console.log(`Critical concept gate: ${summary.qualityGate.criticalConcepts.actual}/${summary.qualityGate.criticalConcepts.total} passes.`);
  console.log(
    `Mirror parity gate: ${summary.qualityGate.mirrorParity.matches}/${summary.qualityGate.mirrorParity.requiredMatches} matches, ${formatPercentage(summary.qualityGate.mirrorParity.winRate)} tracked-team win rate (target ${formatPercentage(summary.qualityGate.mirrorParity.targetWinRate ?? 0)} +/- ${formatPercentage(summary.qualityGate.mirrorParity.tolerance ?? 0)}).`,
  );
  console.log(
    `Beginner dominance gate: ${summary.qualityGate.beginnerDominance.matches}/${summary.qualityGate.beginnerDominance.requiredMatches} matches, ${formatPercentage(summary.qualityGate.beginnerDominance.winRate)} master win rate (target >= ${formatPercentage(summary.qualityGate.beginnerDominance.minWinRate ?? 0)}).`,
  );

  if (summary.fixedSuite.fixedFixtureAgreementFailures.length > 0) {
    console.log(`Fixed-fixture agreement failures: ${summary.fixedSuite.fixedFixtureAgreementFailures.join(', ')}`);
  }

  if (summary.fixedSuite.criticalPassFailures.length > 0) {
    console.log(`Critical concept failures: ${summary.fixedSuite.criticalPassFailures.join(', ')}`);
  }

  console.log(`Mirror per-seed aggregates: ${formatPerSeedAggregates(mirror.perSeed)}`);
  console.log(`Mirror dual-loss seeds: ${mirror.dualLossSeeds.length > 0 ? mirror.dualLossSeeds.join(', ') : 'none'}`);
  console.log(
    `Mirror regression watchlist intersection: ${mirror.regressionWatchlistDualLossIntersection.length > 0 ? mirror.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${mirror.regressionWatchlist.join(', ')})`,
  );
  console.log(`Beginner per-seed aggregates: ${formatPerSeedAggregates(beginner.perSeed)}`);
  console.log(`Beginner dual-loss seeds: ${beginner.dualLossSeeds.length > 0 ? beginner.dualLossSeeds.join(', ') : 'none'}`);
  console.log(
    `Beginner regression watchlist intersection: ${beginner.regressionWatchlistDualLossIntersection.length > 0 ? beginner.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${beginner.regressionWatchlist.join(', ')})`,
  );
  console.log(
    `Fixed suite simulated duration: production ${formatDurationMs(summary.fixedSuite.simulatedTiming.productionSuiteSimulatedMs)}, reference ${formatDurationMs(summary.fixedSuite.simulatedTiming.referenceSuiteSimulatedMs)}.`,
  );
  console.log(`Mirror suite simulated duration: ${formatDurationMs(mirror.simulatedTiming.suiteSimulatedMs)}.`);
  console.log(`Beginner suite simulated duration: ${formatDurationMs(beginner.simulatedTiming.suiteSimulatedMs)}.`);
  console.log(
    `Simulated timing: fixed production avg ${summary.timing.fixedFixtureProductionMasterDecisions.averageMs.toFixed(1)} ms, fixed reference avg ${summary.timing.fixedFixtureReferenceMasterDecisions.averageMs.toFixed(1)} ms, mirror tracked avg ${summary.timing.mirrorTrackedTeamSimulatedDecisions.averageMs.toFixed(1)} ms, beginner tracked avg ${summary.timing.beginnerTrackedTeamSimulatedDecisions.averageMs.toFixed(1)} ms, aggregate avg ${summary.timing.allTrackedProductionSimulatedDecisions.averageMs.toFixed(1)} ms.`,
  );
  console.log('BENCHMARK_SUMMARY');
  console.log(JSON.stringify(summary, null, 2));
}

// ---------------------------------------------------------------------------
// Quality benchmark entry point
// ---------------------------------------------------------------------------

/**
 * Runs the full quality benchmark: fixed fixtures, then the mirror-parity
 * suite, then the beginner-dominance suite, and evaluates the iteration 6 gate.
 *
 * @returns The complete summary; `qualityGate.passed` is true only when all
 *   four sub-gates pass.
 * @throws Error when the static benchmark configuration is inconsistent.
 */
export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
  assertIteration6BenchmarkContract();
  logBenchmarkProgress('Benchmark started. Running fixed fixtures, mirror parity, then beginner dominance.');

  const fixedSuite = await runFixedFixtureSuite();
  logBenchmarkProgress(
    `Fixed fixture suite complete with production ${formatDurationMs(fixedSuite.productionSuiteSimulatedMs)} simulated and reference ${formatDurationMs(fixedSuite.referenceSuiteSimulatedMs)} simulated.`,
  );

  const mirrorRun = await runSelfPlaySuite(SELF_PLAY_SUITES['mirror-parity']);
  logBenchmarkProgress(`Mirror parity suite complete in ${formatDurationMs(mirrorRun.suiteSimulatedMs)} simulated.`);

  const beginnerRun = await runSelfPlaySuite(SELF_PLAY_SUITES['beginner-dominance']);
  logBenchmarkProgress(`Beginner dominance suite complete in ${formatDurationMs(beginnerRun.suiteSimulatedMs)} simulated.`);

  const mirrorSummary = buildSelfPlaySuiteSummary(SELF_PLAY_SUITES['mirror-parity'], mirrorRun);
  const beginnerSummary = buildSelfPlaySuiteSummary(SELF_PLAY_SUITES['beginner-dominance'], beginnerRun);
  const mirrorParityGate = createMirrorParityGate(mirrorSummary);
  const beginnerDominanceGate = createBeginnerDominanceGate(beginnerSummary);

  const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
  const fixedFixtureAgreements = fixedSuite.results.filter(result => result.matchesReference).length;
  const expectedPasses = fixedSuite.results.filter(result => result.expectedPass).length;
  const criticalPasses = fixedSuite.results.filter(result => result.conceptGatePass === true).length;
  const fixedFixtureAgreementFailures = fixedSuite.results
    .filter(result => !result.matchesReference)
    .map(result => result.fixtureId);
  // Strict `=== false` skips non-critical fixtures, whose conceptGatePass is null.
  const criticalPassFailures = fixedSuite.results
    .filter(result => result.conceptGatePass === false)
    .map(result => result.fixtureId);

  const fixedFixtureProductionMasterDecisions = summarizeTimings(fixedSuite.productionTimings);
  const fixedFixtureReferenceMasterDecisions = summarizeTimings(fixedSuite.referenceTimings);
  const mirrorTrackedTeamSimulatedDecisions = mirrorSummary.simulatedTiming.trackedTeamDecisions;
  const beginnerTrackedTeamSimulatedDecisions = beginnerSummary.simulatedTiming.trackedTeamDecisions;
  const allTrackedProductionSimulatedDecisions = summarizeTimings([
    ...fixedSuite.productionTimings,
    ...mirrorRun.trackedTeamTimings,
    ...beginnerRun.trackedTeamTimings,
  ]);

  const fixedFixtureGate: GateCountSummary = {
    actual: fixedFixtureAgreements,
    required: AI_BENCHMARK_FIXTURES.length,
    total: AI_BENCHMARK_FIXTURES.length,
    passed: fixedFixtureAgreements === AI_BENCHMARK_FIXTURES.length,
  };
  const criticalConceptGate: GateCountSummary = {
    actual: criticalPasses,
    required: criticalFixtureCount,
    total: criticalFixtureCount,
    passed: criticalPasses === criticalFixtureCount,
  };

  return {
    benchmark: 'ai-quality',
    qualityGate: {
      iteration: 6,
      passed:
        fixedFixtureGate.passed &&
        criticalConceptGate.passed &&
        mirrorParityGate.passed &&
        beginnerDominanceGate.passed,
      fixedFixtures: fixedFixtureGate,
      criticalConcepts: criticalConceptGate,
      mirrorParity: mirrorParityGate,
      beginnerDominance: beginnerDominanceGate,
    },
    fixtureCount: AI_BENCHMARK_FIXTURES.length,
    criticalFixtureCount,
    fixtureTotals: {
      fixtures: AI_BENCHMARK_FIXTURES.length,
      criticalFixtures: criticalFixtureCount,
    },
    fixedSuite: {
      fixedFixtureAgreements,
      expectedPasses,
      criticalPasses,
      fixedFixtureAgreementFailures,
      criticalPassFailures,
      simulatedTiming: {
        productionSuiteSimulatedMs: fixedSuite.productionSuiteSimulatedMs,
        referenceSuiteSimulatedMs: fixedSuite.referenceSuiteSimulatedMs,
        productionMasterDecisions: fixedFixtureProductionMasterDecisions,
        referenceMasterDecisions: fixedFixtureReferenceMasterDecisions,
      },
      results: fixedSuite.results,
    },
    selfPlaySuites: {
      totalMatches: mirrorSummary.matches + beginnerSummary.matches,
      mirrorParity: mirrorSummary,
      beginnerDominance: beginnerSummary,
    },
    timing: {
      fixedFixtureProductionMasterDecisions,
      fixedFixtureReferenceMasterDecisions,
      mirrorTrackedTeamSimulatedDecisions,
      beginnerTrackedTeamSimulatedDecisions,
      allTrackedProductionSimulatedDecisions,
    },
    referenceProfile: REFERENCE_PROFILE,
  };
}

// ---------------------------------------------------------------------------
// Head-to-head benchmark (new AI versus legacy AI)
// ---------------------------------------------------------------------------

interface HeadToHeadMatchResult {
  suite: 'head-to-head-master' | 'head-to-head-advanced';
  seed: number;
  dealer: PlayerIndex;
  newAITeam: 0 | 1;
  newAIDifficulty: Difficulty;
  winner: 0 | 1 | null;
  newAIResult: 'win' | 'loss' | 'draw';
  rounds: number;
  totalPoints: [number, number];
}

interface HeadToHeadSuiteSummary {
  suite: 'head-to-head-master' | 'head-to-head-advanced';
  newAIDifficulty: Difficulty;
  matches: number;
  wins: number;
  losses: number;
  draws: number;
  winRate: number;
  targetWinRate: number;
  passed: boolean;
  results: HeadToHeadMatchResult[];
}

const HEAD_TO_HEAD_SUITE_SEED_KEYS: Record<'head-to-head-master' | 'head-to-head-advanced', number> = {
  'head-to-head-master': 0x4d42,
  'head-to-head-advanced': 0x4142,
};

/**
 * Plays one seeded match in which one team uses the current `chooseMove` and
 * the other uses the legacy `chooseMove`, both at the same difficulty.
 *
 * @param suite Which head-to-head suite this match belongs to (selects the seed key).
 * @param difficulty Difficulty passed to both AIs.
 * @param seed Match seed; also selects the initial dealer via `seed % 4`.
 * @param newAITeam Team that plays with the current AI.
 */
async function simulateHeadToHeadMatch(
  suite: 'head-to-head-master' | 'head-to-head-advanced',
  difficulty: Difficulty,
  seed: number,
  newAITeam: 0 | 1,
): Promise<HeadToHeadMatchResult> {
  const suiteSeedKey = HEAD_TO_HEAD_SUITE_SEED_KEYS[suite];
  const initialDealer = (seed % 4) as PlayerIndex;
  let state = createInitialState(initialDealer, createMulberry32(seedFromParts(suiteSeedKey, seed, 1, 0)));
  const matchStartingPlayer = state.matchStartingPlayer;
  const tracker = new CardTracker();
  const inference = new CardInferenceEngine(tracker);
  let rounds = 1;
  let truncated = false;
  // Not reset between rounds: it keeps per-decision RNG seeds unique within the match.
  let turnCount = 0;

  while (rounds <= MAX_SELF_PLAY_ROUNDS) {
    while (!state.roundOver) {
      const playerIdx = state.currentPlayer;
      const actingTeam = teamOf(playerIdx);
      const isNewAI = actingTeam === newAITeam;
      const timingSource = createSimulatedBenchmarkTimingSource();
      const rng = createMulberry32(seedFromParts(suiteSeedKey, seed, rounds, turnCount, playerIdx));

      let move: AIMove;
      if (isNewAI) {
        // Only the current AI is given the inference engine.
        move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, {
          rng,
          timingSource,
          inference,
        });
      } else {
        move = await chooseMoveOld(state, playerIdx, difficulty, tracker, undefined, {
          rng,
          timingSource,
        });
      }

      // Snapshot the table before the move is applied; the inference engine
      // receives it alongside the move.
      const tableBeforeMove = [...state.table];
      const { nextState, capture } = applyMove(
        state,
        playerIdx,
        move.card,
        move.capture.length > 0 ? move.capture : undefined,
      );

      tracker.trackPlay(move.card);
      if (capture) tracker.trackCapture(capture.captured);
      inference.onMove(playerIdx, move, tableBeforeMove);

      state = nextState;
      turnCount++;
    }

    const outcome = getMatchOutcome(state.teamScores);
    if (!outcome.continueMatch) {
      break;
    }

    if (rounds === MAX_SELF_PLAY_ROUNDS) {
      truncated = true;
      break;
    }

    // Start the next round: fresh deal, tracker and inference state, carrying
    // over the running totals and the match starting player.
    rounds++;
    const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
    const nextDealer = nextPlayer(state.dealer);
    tracker.reset();
    inference.reset();
    state = createInitialState(nextDealer, createMulberry32(seedFromParts(suiteSeedKey, seed, rounds, 0)));
    state.matchStartingPlayer = matchStartingPlayer;
    state.teamScores[0].totalPoints = totals[0];
    state.teamScores[1].totalPoints = totals[1];
    state.roundNumber = rounds;
  }

  const outcome = getMatchOutcome(state.teamScores);
  const winner = outcome.winner;
  const newAIResult = winner === null ? 'draw' : winner === newAITeam ? 'win' : 'loss';

  // tracked internally; not surfaced in the result interface
  void truncated;

  return {
    suite,
    seed,
    dealer: initialDealer,
    newAITeam,
    newAIDifficulty: difficulty,
    winner,
    newAIResult,
    rounds,
    totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
  };
}

/**
 * Runs the master and advanced head-to-head suites (every seed with both seat
 * assignments) and reports the new AI's win rate against its target.
 *
 * @returns One summary per suite, in the order master, advanced.
 */
export async function runHeadToHeadBenchmark(): Promise<HeadToHeadSuiteSummary[]> {
  const configs: Array<{
    suite: 'head-to-head-master' | 'head-to-head-advanced';
    difficulty: Difficulty;
    targetWinRate: number;
  }> = [
    { suite: 'head-to-head-master', difficulty: 'master', targetWinRate: HEAD_TO_HEAD_MASTER_TARGET_WIN_RATE },
    { suite: 'head-to-head-advanced', difficulty: 'advanced', targetWinRate: HEAD_TO_HEAD_ADVANCED_TARGET_WIN_RATE },
  ];

  const summaries: HeadToHeadSuiteSummary[] = [];

  for (const { suite, difficulty, targetWinRate } of configs) {
    const results: HeadToHeadMatchResult[] = [];
    const totalMatches = HEAD_TO_HEAD_SEEDS.length * HEAD_TO_HEAD_SEAT_SWAPS.length;
    let completedMatches = 0;

    logBenchmarkProgress(`Starting ${suite} (${totalMatches} matches: ${HEAD_TO_HEAD_SEEDS.length} seeds × ${HEAD_TO_HEAD_SEAT_SWAPS.length} seat swaps).`);

    for (const seed of HEAD_TO_HEAD_SEEDS) {
      for (const newAITeam of HEAD_TO_HEAD_SEAT_SWAPS) {
        const result = await simulateHeadToHeadMatch(suite, difficulty, seed, newAITeam);
        results.push(result);
        completedMatches++;

        if (completedMatches === 1 || completedMatches % 25 === 0 || completedMatches === totalMatches) {
          logBenchmarkProgress(
            `${suite} ${completedMatches}/${totalMatches}: seed ${seed}, newAITeam ${newAITeam}, result ${result.newAIResult}, rounds ${result.rounds}.`,
          );
        }
      }
    }

    const wins = results.filter(r => r.newAIResult === 'win').length;
    const losses = results.filter(r => r.newAIResult === 'loss').length;
    const draws = results.filter(r => r.newAIResult === 'draw').length;
    // Draws count in the denominator.
    const winRate = results.length === 0 ? 0 : wins / results.length;

    summaries.push({
      suite,
      newAIDifficulty: difficulty,
      matches: results.length,
      wins,
      losses,
      draws,
      winRate,
      targetWinRate,
      passed: winRate >= targetWinRate,
      results,
    });
  }

  return summaries;
}

// ---------------------------------------------------------------------------
// CLI entry point
// ---------------------------------------------------------------------------

/** Runs the quality benchmark, prints its report, then runs and prints the head-to-head suites. */
async function runBenchmarkCli(): Promise<void> {
  const summary = await runAIBenchmark();
  logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 6 gate results.');
  printReadableSummary(summary);

  logBenchmarkProgress('Starting HEAD_TO_HEAD benchmark (new AI vs legacy AI)...');
  const h2hSuites = await runHeadToHeadBenchmark();

  for (const h2h of h2hSuites) {
    console.log(`\nHEAD_TO_HEAD: ${h2h.suite} (${h2h.matches} games)`);
    console.log(`New AI wins: ${h2h.wins} (${formatPercentage(h2h.winRate)})`);
    console.log(`Legacy AI wins: ${h2h.losses} (${formatPercentage(h2h.matches === 0 ? 0 : h2h.losses / h2h.matches)})`);
    console.log(`Ties: ${h2h.draws}`);
    console.log(`Target win rate: ${formatPercentage(h2h.targetWinRate)} — ${h2h.passed ? 'PASS' : 'FAIL'}`);
  }
}

// Start the CLI automatically whenever no global `window` exists
// (i.e. outside a browser-like environment).
if (typeof window === 'undefined') {
  void runBenchmarkCli();
}