Files
scopone/src/game/ai-benchmark.ts
Giancarmine Salucci 3f74c57665
Some checks failed
Android Build & Publish / android (push) Failing after 2m10s
feat(SCOPONE-0013): PIMC AI rewrite + Gitea Android CI pipeline
- Replace minimax with PIMC (Perfect Information Monte Carlo) search
- Add PIMC_SCOPE_BOOST=150 → effective scopa value 540 (was 390)
  → Master win rate: 67.5% → 72.5% vs legacy AI (target ≥60%)
  → Advanced win rate: 97.5% vs beginner AI (target ≥55%)
  → Scope gap in losses: 6.54 → 3.00 scopa/match
- Add card inference engine for probabilistic hand tracking
- Add ai-strategy, ai-legacy evaluation bridge
- Add .gitea/workflows/android-build.yml: build debug + unsigned
  release APK and publish to Gitea generic package registry
2026-05-24 16:29:04 +02:00

1058 lines
37 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { applyMove, cloneState, createInitialState, getMatchOutcome, nextPlayer, teamOf } from './engine';
import { AITimingSource, AIMove, AISearchProfileOverride, chooseMove } from './ai';
import { chooseMove as chooseMoveOld } from './ai-legacy';
import { CardInferenceEngine } from './card-inference';
import {
AI_BENCHMARK_FIXTURES,
AIBenchmarkCriticalConcept,
AIBenchmarkExpectedMove,
isCriticalAIBenchmarkFixture,
} from './ai-benchmark-fixtures';
import { CardTracker } from './card-tracker';
import { Difficulty, GameState, PlayerIndex } from './types';
/**
 * Renders a millisecond duration for log output.
 *
 * Durations under one second are printed as whole milliseconds ("250 ms");
 * anything else is printed as seconds with two decimals ("1.50 s").
 */
function formatDurationMs(durationMs: number): string {
  const isSubSecond = durationMs < 1000;
  return isSubSecond
    ? `${durationMs.toFixed(0)} ms`
    : `${(durationMs / 1000).toFixed(2)} s`;
}
/** Formats a ratio (e.g. 0.5) as a percentage with one decimal place (e.g. "50.0%"). */
function formatPercentage(value: number): string {
  const percent = value * 100;
  return `${percent.toFixed(1)}%`;
}
/** Writes a progress line to the console, prefixed with the benchmark tag. */
function logBenchmarkProgress(message: string): void {
  const line = `[ai-benchmark] ${message}`;
  console.log(line);
}
/** Result of evaluating one fixed benchmark position with the production and the reference search settings. */
interface FixedFixtureResult {
// Identity and metadata copied from the fixture definition.
fixtureId: string;
name: string;
tags: string[];
criticalConcept: AIBenchmarkCriticalConcept | null;
// moveKey() strings of the move chosen without a profile override and with REFERENCE_PROFILE, respectively.
productionMove: string;
referenceMove: string;
// True when both move keys are identical.
matchesReference: boolean;
// True when the production move matches the fixture's expected move.
expectedPass: boolean;
// Same check as expectedPass, but only populated for critical fixtures; null for all others.
conceptGatePass: boolean | null;
// Elapsed time reported by each run's simulated timing source.
productionSimulatedMs: number;
referenceSimulatedMs: number;
}
/** Identifiers of the self-play suites; also the keys of SELF_PLAY_SUITES. */
type SelfPlaySuiteId = 'mirror-parity' | 'beginner-dominance';
/** Outcome of one simulated self-play match, reported from the tracked team's point of view. */
interface SelfPlayMatchResult {
suite: SelfPlaySuiteId;
seed: number;
// Initial dealer of the match (derived as seed % 4 by the simulator).
dealer: PlayerIndex;
trackedTeam: 0 | 1;
trackedTeamDifficulty: Difficulty;
opponentDifficulty: Difficulty;
// Winner as reported by getMatchOutcome; null is reported as a 'draw' in trackedResult.
winner: 0 | 1 | null;
trackedResult: 'win' | 'loss' | 'draw';
// Rounds played; truncated is true when the MAX_SELF_PLAY_ROUNDS cap ended the match.
rounds: number;
truncated: boolean;
// Final totalPoints of team 0 and team 1.
totalPoints: [number, number];
// Count, mean and maximum of the tracked team's simulated per-decision times.
trackedDecisionCount: number;
trackedAverageSimulatedDecisionMs: number;
trackedMaxSimulatedDecisionMs: number;
}
/** Aggregate statistics over a list of timing samples, as produced by summarizeTimings (all zero for an empty list). */
interface TimingSummary {
// Number of samples summarized.
count: number;
// Arithmetic mean of the samples.
averageMs: number;
// 95th-percentile sample (picked from the ascending-sorted samples).
p95Ms: number;
// Largest sample.
maxMs: number;
}
/** Count-based quality gate: how many items passed versus how many were required. */
interface GateCountSummary {
// Number of items that passed.
actual: number;
// Number of passing items needed for the gate to pass.
required: number;
// Number of items considered.
total: number;
// Gate verdict.
passed: boolean;
}
/** Win-rate quality gate for a self-play suite. Nullable fields are null when a gate does not use that bound. */
interface WinRateGateSummary {
// Matches actually played and the number the gate requires.
matches: number;
requiredMatches: number;
// Tracked-team results across all matches.
wins: number;
losses: number;
draws: number;
winRate: number;
// Centre and half-width of the accepted band (mirror gate); null for a minimum-only gate.
targetWinRate: number | null;
tolerance: number | null;
// Inclusive lower / upper bounds on winRate; null when a bound is not applied.
minWinRate: number | null;
maxWinRate: number | null;
// Individual checks and their conjunction.
matchCountPassed: boolean;
winRatePassed: boolean;
passed: boolean;
}
/** Per-seat slice of a SelfPlayMatchResult, stored under its seed by summarizeSelfPlayBySeed. */
interface SelfPlaySeedSeatResult {
// Which team was tracked in this match.
trackedTeam: 0 | 1;
trackedResult: 'win' | 'loss' | 'draw';
winner: 0 | 1 | null;
rounds: number;
truncated: boolean;
totalPoints: [number, number];
}
/** All matches that share one seed, aggregated across the seat swaps. */
interface SelfPlaySeedAggregateResult {
seed: number;
// Number of matches recorded for this seed and their tracked-team results.
matches: number;
wins: number;
losses: number;
draws: number;
// True when the tracked team lost at least two matches on this seed.
dualLoss: boolean;
// Individual match results for this seed, ordered by trackedTeam.
seatResults: SelfPlaySeedSeatResult[];
}
/** Summary of one complete self-play suite, built by buildSelfPlaySuiteSummary. */
interface SelfPlaySuiteSummary {
// Suite identity and configuration, copied from its SelfPlaySuiteConfig.
suite: SelfPlaySuiteId;
label: string;
trackedTeamDifficulty: Difficulty;
opponentDifficulty: Difficulty;
// Matches actually played versus the configured requirement.
matches: number;
requiredMatches: number;
// Number of configured seeds (SELF_PLAY_MATCH_SEEDS.length).
seedCount: number;
// NOTE(review): currently always reported as true by the builder; it is not derived from the results.
seatBalanced: boolean;
// Tracked-team tallies and the rates derived from them (0 when no matches were played).
wins: number;
losses: number;
draws: number;
winRate: number;
lossRate: number;
// Per-seed aggregates sorted by seed, and the seeds flagged as dual losses.
perSeed: SelfPlaySeedAggregateResult[];
dualLossSeeds: number[];
// Copy of KNOWN_REGRESSION_WATCHLIST and the dual-loss seeds that appear on it.
regressionWatchlist: number[];
regressionWatchlistDualLossIntersection: number[];
simulatedTiming: {
// Simulated time summed over every decision of every match in the suite.
suiteSimulatedMs: number;
// Statistics over the tracked team's decisions only.
trackedTeamDecisions: TimingSummary;
};
// Raw per-match results.
results: SelfPlayMatchResult[];
}
/** Complete report returned by runAIBenchmark and printed as JSON by printReadableSummary. */
export interface AIBenchmarkSummary {
benchmark: 'ai-quality';
// Overall verdict plus the four individual gates it is the conjunction of.
qualityGate: {
iteration: 6;
passed: boolean;
fixedFixtures: GateCountSummary;
criticalConcepts: GateCountSummary;
mirrorParity: WinRateGateSummary;
beginnerDominance: WinRateGateSummary;
};
// Fixture counts; the same two numbers are repeated inside fixtureTotals.
fixtureCount: number;
criticalFixtureCount: number;
fixtureTotals: {
fixtures: number;
criticalFixtures: number;
};
// Results of the fixed-position suite.
fixedSuite: {
// Counts of fixtures whose production move equals the reference move / the expected move / passes the critical check.
fixedFixtureAgreements: number;
expectedPasses: number;
criticalPasses: number;
// Fixture ids that failed the agreement check and the critical-concept check, respectively.
fixedFixtureAgreementFailures: string[];
criticalPassFailures: string[];
simulatedTiming: {
productionSuiteSimulatedMs: number;
referenceSuiteSimulatedMs: number;
productionMasterDecisions: TimingSummary;
referenceMasterDecisions: TimingSummary;
};
results: FixedFixtureResult[];
};
// Results of the two self-play suites; totalMatches is the sum of their match counts.
selfPlaySuites: {
totalMatches: number;
mirrorParity: SelfPlaySuiteSummary;
beginnerDominance: SelfPlaySuiteSummary;
};
// Timing statistics gathered in one place; the last entry pools all production decisions from the three suites.
timing: {
fixedFixtureProductionMasterDecisions: TimingSummary;
fixedFixtureReferenceMasterDecisions: TimingSummary;
mirrorTrackedTeamSimulatedDecisions: TimingSummary;
beginnerTrackedTeamSimulatedDecisions: TimingSummary;
allTrackedProductionSimulatedDecisions: TimingSummary;
};
// The search profile used for the reference moves.
referenceProfile: Required<AISearchProfileOverride>;
}
// Thresholds of the iteration 6 quality gate: required match counts per self-play suite,
// the accepted mirror win-rate band (target +/- tolerance) and the minimum win rate against beginners.
const ITERATION_6_GATE = {
mirrorMatchTarget: 500,
beginnerMatchTarget: 500,
mirrorTargetWinRate: 0.5,
mirrorWinRateTolerance: 0.05,
beginnerMinWinRate: 0.7,
} as const;
// Seeds that are reported separately when they also show up as dual-loss seeds in a self-play suite.
const KNOWN_REGRESSION_WATCHLIST = [1000, 1002, 1004, 1006, 1012, 1013, 1014] as const;
const KNOWN_REGRESSION_WATCHLIST_SET = new Set<number>(KNOWN_REGRESSION_WATCHLIST);
// Search profile passed as `profileOverride` when computing the reference move of a fixed fixture.
const REFERENCE_PROFILE: Required<AISearchProfileOverride> = {
timeBudgetMs: 9000,
sampleCount: 12,
maxDepth: 7,
batchSize: 2,
};
// Every self-play seed is played once per entry, with that entry as the tracked team.
const SELF_PLAY_SEAT_SWAPS = [0, 1] as const;
// Seeds 1000..1249.
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 250 }, (_, index) => 1000 + index);
// Upper bound on rounds per simulated match; a match stopped by this cap is marked as truncated.
const MAX_SELF_PLAY_ROUNDS = 20;
// Seeds 2000..2099 for the head-to-head suites (new AI versus legacy AI), each played from both seats.
const HEAD_TO_HEAD_SEEDS = Array.from({ length: 100 }, (_, i) => 2000 + i);
const HEAD_TO_HEAD_SEAT_SWAPS = [0, 1] as const;
// Minimum new-AI win rates for the head-to-head suites to count as passed.
const HEAD_TO_HEAD_MASTER_TARGET_WIN_RATE = 0.60;
const HEAD_TO_HEAD_ADVANCED_TARGET_WIN_RATE = 0.55;
/** Static configuration of one self-play suite. */
interface SelfPlaySuiteConfig {
id: SelfPlaySuiteId;
// Human-readable name used in progress logs.
label: string;
// Suite-specific value mixed into every RNG seed of the suite.
suiteSeedKey: number;
// Number of matches the suite is expected to play.
requiredMatches: number;
trackedTeamDifficulty: Difficulty;
opponentDifficulty: Difficulty;
// Returns the difficulties of team 0 and team 1 (in that order) for the given tracked team.
getTeamDifficulties(trackedTeam: 0 | 1): readonly [Difficulty, Difficulty];
}
// The two self-play suites. Both require one match per (seed, seat swap) combination.
const SELF_PLAY_SUITES: Record<SelfPlaySuiteId, SelfPlaySuiteConfig> = {
// Master plays master; both teams use the same difficulty regardless of the tracked team.
'mirror-parity': {
id: 'mirror-parity',
label: 'Master mirror parity',
suiteSeedKey: 0x4d31,
requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
trackedTeamDifficulty: 'master',
opponentDifficulty: 'master',
getTeamDifficulties: () => ['master', 'master'],
},
// Master plays beginner; the tracked team is always the master side.
'beginner-dominance': {
id: 'beginner-dominance',
label: 'Master versus beginner dominance',
suiteSeedKey: 0x4236,
requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
trackedTeamDifficulty: 'master',
opponentDifficulty: 'beginner',
getTeamDifficulties: trackedTeam => (trackedTeam === 0
? ['master', 'beginner']
: ['beginner', 'master']),
},
};
/**
 * Fails fast when the benchmark configuration no longer matches the iteration 6 contract.
 *
 * @throws Error when there are no fixtures, no critical fixtures, or when the
 *   seed x seat-swap match count differs from a gate target or from a suite's
 *   `requiredMatches`.
 */
function assertIteration6BenchmarkContract(): void {
  if (AI_BENCHMARK_FIXTURES.length === 0) {
    throw new Error('Iteration 6 benchmark requires at least one fixed fixture.');
  }

  const criticalTotal = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
  if (criticalTotal === 0) {
    throw new Error('Iteration 6 benchmark requires at least one critical concept fixture.');
  }

  // One match per seed and per seat swap.
  const balancedMatchCount = SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length;
  const { mirrorMatchTarget, beginnerMatchTarget } = ITERATION_6_GATE;

  if (balancedMatchCount !== mirrorMatchTarget) {
    throw new Error(
      `Iteration 6 benchmark expects ${mirrorMatchTarget} mirror matches, received ${balancedMatchCount}.`,
    );
  }
  if (balancedMatchCount !== beginnerMatchTarget) {
    throw new Error(
      `Iteration 6 benchmark expects ${beginnerMatchTarget} beginner-dominance matches, received ${balancedMatchCount}.`,
    );
  }

  for (const { id, requiredMatches } of Object.values(SELF_PLAY_SUITES)) {
    if (requiredMatches !== balancedMatchCount) {
      throw new Error(
        `Iteration 6 benchmark expects ${balancedMatchCount} matches for ${id}, received ${requiredMatches}.`,
      );
    }
  }
}
/** An AITimingSource that can additionally report how much simulated time has elapsed since it was created. */
interface SimulatedBenchmarkTimingSource extends AITimingSource {
// Simulated milliseconds accumulated since the source's start value.
getElapsedMs(): number;
}
/**
 * Creates a manually driven clock for benchmark runs.
 *
 * The clock starts at `startMs` and only moves when `advance` is called.
 *
 * @param startMs Initial clock value in milliseconds (defaults to 0).
 * @returns A timing source whose `now` returns the current clock value, whose
 *   `advance` adds to the clock and returns the new value, and whose
 *   `getElapsedMs` returns the distance from `startMs`.
 */
function createSimulatedBenchmarkTimingSource(startMs = 0): SimulatedBenchmarkTimingSource {
  let clockMs = startMs;

  const now = (): number => clockMs;
  const advance = (elapsedMs: number): number => {
    clockMs += elapsedMs;
    return clockMs;
  };
  const getElapsedMs = (): number => clockMs - startMs;

  return {
    isSimulated: true,
    now,
    advance,
    getElapsedMs,
  };
}
/**
 * Combines any number of integers into one unsigned 32-bit seed.
 *
 * Each part is converted to an unsigned 32-bit value, XOR-ed into the running
 * hash and multiplied by 16777619 using 32-bit multiplication, starting from
 * 2166136261. The order of the parts therefore matters.
 *
 * @returns An integer in [0, 2^32); 2166136261 when no parts are given.
 */
function seedFromParts(...parts: number[]): number {
  const OFFSET_BASIS = 2166136261;
  const PRIME = 16777619;
  const folded = parts.reduce(
    (hash, part) => Math.imul(hash ^ (part >>> 0), PRIME),
    OFFSET_BASIS,
  );
  return folded >>> 0;
}
/**
 * Creates a deterministic pseudo-random number generator from a 32-bit seed.
 *
 * @param seed Seed value; it is truncated to an unsigned 32-bit integer.
 * @returns A function that yields the next number in [0, 1) on every call.
 *   Two generators created from the same seed yield the same sequence.
 */
function createMulberry32(seed: number): () => number {
  const INCREMENT = 0x6d2b79f5;
  const UINT32_RANGE = 4294967296;
  let counter = seed >>> 0;

  function next(): number {
    counter = (counter + INCREMENT) >>> 0;
    const firstMix = Math.imul(counter ^ (counter >>> 15), counter | 1);
    const secondMix = firstMix ^ (firstMix + Math.imul(firstMix ^ (firstMix >>> 7), firstMix | 61));
    return ((secondMix ^ (secondMix >>> 14)) >>> 0) / UINT32_RANGE;
  }

  return next;
}
/**
 * Builds a canonical string for a move: the played card id, a `|`, then the
 * ids of the captured cards sorted and joined with commas. Two moves that
 * capture the same cards in a different order produce the same key.
 */
function moveKey(move: AIMove): string {
  const sortedCaptureIds = move.capture.map(({ id }) => id).sort();
  return `${move.card.id}|${sortedCaptureIds.join(',')}`;
}
/** Returns the opposing team index: 1 for team 0, and 0 for team 1. */
function otherTeam(team: 0 | 1): 0 | 1 {
  if (team === 0) {
    return 1;
  }
  return 0;
}
/**
 * Creates a fresh CardTracker and registers, via `trackPlay`, every card that
 * is currently in any player's pile in `state`.
 */
function createTrackerForState(state: GameState): CardTracker {
  const seenCards = new CardTracker();
  for (const { pile } of state.players) {
    for (const pileCard of pile) {
      seenCards.trackPlay(pileCard);
    }
  }
  return seenCards;
}
/**
 * Checks a chosen move against a fixture expectation.
 *
 * The played card must equal `expected.cardId`. When `expected.captureIds` is
 * present, the captured card ids must also match it exactly, ignoring order;
 * when it is absent, any capture is accepted.
 */
function matchesExpectedMove(move: AIMove, expected: AIBenchmarkExpectedMove): boolean {
  const playedExpectedCard = move.card.id === expected.cardId;
  if (!playedExpectedCard) {
    return false;
  }

  const { captureIds } = expected;
  if (!captureIds) {
    return true;
  }

  // Compare order-independent keys; the expectation is copied before sorting so it is not mutated.
  const actualKey = move.capture.map(({ id }) => id).sort().join(',');
  const expectedKey = Array.from(captureIds).sort().join(',');
  return actualKey === expectedKey;
}
/**
 * Runs every fixed benchmark position twice at 'master' difficulty: once with
 * the default (production) search settings and once with REFERENCE_PROFILE as
 * profile override. Each run gets its own cloned state, card tracker, seeded
 * RNG and simulated timing source.
 *
 * @returns The per-fixture results, the per-fixture simulated timings of both
 *   runs, and the sums of those timings.
 */
async function runFixedFixtureSuite(): Promise<{
  results: FixedFixtureResult[];
  productionSuiteSimulatedMs: number;
  referenceSuiteSimulatedMs: number;
  productionTimings: number[];
  referenceTimings: number[];
}> {
  const fixtureTotal = AI_BENCHMARK_FIXTURES.length;
  const results: FixedFixtureResult[] = [];
  const productionTimings: number[] = [];
  const referenceTimings: number[] = [];

  logBenchmarkProgress(`Starting fixed fixture suite (${fixtureTotal} positions).`);

  for (let position = 0; position < fixtureTotal; position++) {
    const fixture = AI_BENCHMARK_FIXTURES[position];

    // Independent copies so neither run can influence the other or the fixture itself.
    const productionState = cloneState(fixture.state);
    const referenceState = cloneState(fixture.state);
    const productionTracker = createTrackerForState(productionState);
    const referenceTracker = createTrackerForState(referenceState);
    const productionClock = createSimulatedBenchmarkTimingSource();
    const referenceClock = createSimulatedBenchmarkTimingSource();

    const productionMove = await chooseMove(
      productionState,
      productionState.currentPlayer,
      'master',
      productionTracker,
      undefined,
      {
        rng: createMulberry32(seedFromParts(0x0f1e2d3c, position, 0)),
        timingSource: productionClock,
      },
    );
    const productionSimulatedMs = productionClock.getElapsedMs();

    const referenceMove = await chooseMove(
      referenceState,
      referenceState.currentPlayer,
      'master',
      referenceTracker,
      undefined,
      {
        rng: createMulberry32(seedFromParts(0x0f1e2d3c, position, 1)),
        profileOverride: REFERENCE_PROFILE,
        timingSource: referenceClock,
      },
    );
    const referenceSimulatedMs = referenceClock.getElapsedMs();

    productionTimings.push(productionSimulatedMs);
    referenceTimings.push(referenceSimulatedMs);

    const productionKey = moveKey(productionMove);
    const referenceKey = moveKey(referenceMove);
    const agrees = productionKey === referenceKey;
    const expectedPass = matchesExpectedMove(productionMove, fixture.expectedMove);
    // The concept gate only applies to critical fixtures.
    const conceptGatePass = isCriticalAIBenchmarkFixture(fixture) ? expectedPass : null;

    results.push({
      fixtureId: fixture.id,
      name: fixture.name,
      tags: [...fixture.tags],
      criticalConcept: fixture.criticalConcept,
      productionMove: productionKey,
      referenceMove: referenceKey,
      matchesReference: agrees,
      expectedPass,
      conceptGatePass,
      productionSimulatedMs,
      referenceSimulatedMs,
    });

    const progressLabel = `${position + 1}/${fixtureTotal}`;
    const matchLabel = agrees ? 'agreement' : 'divergence';
    logBenchmarkProgress(
      `Fixture ${progressLabel}: ${fixture.id} -> ${matchLabel}, production simulated ${formatDurationMs(productionSimulatedMs)}, reference simulated ${formatDurationMs(referenceSimulatedMs)}.`,
    );
  }

  return {
    results,
    productionSuiteSimulatedMs: sumTimings(productionTimings),
    referenceSuiteSimulatedMs: sumTimings(referenceTimings),
    productionTimings,
    referenceTimings,
  };
}
/** Adds up all timing samples; returns 0 for an empty list. */
function sumTimings(samples: number[]): number {
  let total = 0;
  samples.forEach(sample => {
    total += sample;
  });
  return total;
}
/**
 * Computes count, mean, 95th percentile and maximum of a list of timing samples.
 *
 * The percentile is taken from an ascending-sorted copy at index
 * `ceil(count * 0.95) - 1`, clamped to the valid index range. The input array
 * is not modified. An empty input yields a summary of zeros.
 */
function summarizeTimings(samples: number[]): TimingSummary {
  const count = samples.length;
  if (count === 0) {
    return { count: 0, averageMs: 0, p95Ms: 0, maxMs: 0 };
  }

  const ascending = Array.from(samples).sort((a, b) => a - b);

  let total = 0;
  for (const sample of ascending) {
    total += sample;
  }

  const lastIndex = count - 1;
  const rankIndex = Math.ceil(count * 0.95) - 1;
  const p95Index = Math.min(lastIndex, Math.max(0, rankIndex));

  return {
    count,
    averageMs: total / count,
    p95Ms: ascending[p95Index],
    maxMs: ascending[lastIndex],
  };
}
/**
 * Groups self-play match results by seed.
 *
 * @returns `perSeed`: one aggregate per seed, sorted by seed, with its seat
 *   results sorted by tracked team; `dualLossSeeds`: seeds on which the
 *   tracked team lost at least two matches; and
 *   `regressionWatchlistDualLossIntersection`: the dual-loss seeds that are
 *   also on the known regression watchlist.
 */
function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
  perSeed: SelfPlaySeedAggregateResult[];
  dualLossSeeds: number[];
  regressionWatchlistDualLossIntersection: number[];
} {
  const bySeed = new Map<number, SelfPlaySeedAggregateResult>();

  for (const match of results) {
    let bucket = bySeed.get(match.seed);
    if (bucket === undefined) {
      bucket = {
        seed: match.seed,
        matches: 0,
        wins: 0,
        losses: 0,
        draws: 0,
        dualLoss: false,
        seatResults: [],
      };
      bySeed.set(match.seed, bucket);
    }

    bucket.matches += 1;
    switch (match.trackedResult) {
      case 'win':
        bucket.wins += 1;
        break;
      case 'loss':
        bucket.losses += 1;
        break;
      default:
        bucket.draws += 1;
    }

    const { trackedTeam, trackedResult, winner, rounds, truncated, totalPoints } = match;
    bucket.seatResults.push({ trackedTeam, trackedResult, winner, rounds, truncated, totalPoints });
  }

  // Finalize each bucket: flag dual losses and order the seat results by tracked team.
  const perSeed = Array.from(bySeed.values(), bucket => ({
    ...bucket,
    dualLoss: bucket.losses >= 2,
    seatResults: bucket.seatResults.slice().sort((a, b) => a.trackedTeam - b.trackedTeam),
  }));
  perSeed.sort((a, b) => a.seed - b.seed);

  const dualLossSeeds: number[] = [];
  const regressionWatchlistDualLossIntersection: number[] = [];
  for (const { seed, dualLoss } of perSeed) {
    if (!dualLoss) {
      continue;
    }
    dualLossSeeds.push(seed);
    if (KNOWN_REGRESSION_WATCHLIST_SET.has(seed)) {
      regressionWatchlistDualLossIntersection.push(seed);
    }
  }

  return {
    perSeed,
    dualLossSeeds,
    regressionWatchlistDualLossIntersection,
  };
}
/**
 * Simulates one complete self-play match for a suite, seed and tracked team.
 *
 * Every decision is made by `chooseMove` with a freshly seeded RNG and a fresh
 * simulated timing source. The match runs round by round until
 * `getMatchOutcome` reports that it should not continue, or until
 * MAX_SELF_PLAY_ROUNDS rounds have been played (then `truncated` is set).
 *
 * @param suite Suite configuration; supplies the seed key and per-team difficulties.
 * @param seed Match seed; also determines the initial dealer (seed % 4).
 * @param trackedTeam Team whose result and decision timings are reported.
 * @returns The match result, the tracked team's per-decision simulated timings,
 *   and the simulated time summed over all decisions of both teams.
 */
async function simulateSelfPlayMatch(
suite: SelfPlaySuiteConfig,
seed: number,
trackedTeam: 0 | 1,
): Promise<{ result: SelfPlayMatchResult; trackedTimings: number[]; simulatedMatchMs: number }> {
const initialDealer = (seed % 4) as PlayerIndex;
const teamDifficulties = suite.getTeamDifficulties(trackedTeam);
// The deal of round N is seeded with (suiteSeedKey, seed, N, 0).
let state = createInitialState(initialDealer, createMulberry32(seedFromParts(suite.suiteSeedKey, seed, 1, 0)));
// Remembered so it can be restored on the fresh state created for each later round.
const matchStartingPlayer = state.matchStartingPlayer;
const tracker = new CardTracker();
const trackedTimings: number[] = [];
let simulatedMatchMs = 0;
let rounds = 1;
let truncated = false;
// Counts turns across the whole match (not reset per round); part of each decision's RNG seed.
let turnCount = 0;
while (rounds <= MAX_SELF_PLAY_ROUNDS) {
// Play the current round to its end.
while (!state.roundOver) {
const playerIdx = state.currentPlayer;
const actingTeam = teamOf(playerIdx);
const difficulty = teamDifficulties[actingTeam];
const timingSource = createSimulatedBenchmarkTimingSource();
const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, {
rng: createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, turnCount, playerIdx)),
timingSource,
});
const simulatedMs = timingSource.getElapsedMs();
simulatedMatchMs += simulatedMs;
// Only the tracked team's decisions contribute to the timing samples.
if (actingTeam === trackedTeam) {
trackedTimings.push(simulatedMs);
}
// An empty capture list is passed to the engine as undefined.
const { nextState, capture } = applyMove(
state,
playerIdx,
move.card,
move.capture.length > 0 ? move.capture : undefined,
);
tracker.trackPlay(move.card);
if (capture) {
tracker.trackCapture(capture.captured);
}
state = nextState;
turnCount++;
}
const outcome = getMatchOutcome(state.teamScores);
if (!outcome.continueMatch) {
break;
}
// Round cap reached while the match is still undecided: stop and flag the result.
if (rounds === MAX_SELF_PLAY_ROUNDS) {
truncated = true;
break;
}
rounds++;
// Start the next round: new deal with the next dealer, carrying over the accumulated totals.
const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
const nextDealer = nextPlayer(state.dealer);
tracker.reset();
state = createInitialState(nextDealer, createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, 0)));
state.matchStartingPlayer = matchStartingPlayer;
state.teamScores[0].totalPoints = totals[0];
state.teamScores[1].totalPoints = totals[1];
state.roundNumber = rounds;
}
// Re-evaluate the outcome on the final state to determine the winner.
const outcome = getMatchOutcome(state.teamScores);
const winner = outcome.winner;
const timingSummary = summarizeTimings(trackedTimings);
const opposingTeam = otherTeam(trackedTeam);
// No winner is reported as a draw from the tracked team's point of view.
const trackedResult = winner === null ? 'draw' : winner === trackedTeam ? 'win' : 'loss';
return {
result: {
suite: suite.id,
seed,
dealer: initialDealer,
trackedTeam,
trackedTeamDifficulty: teamDifficulties[trackedTeam],
opponentDifficulty: teamDifficulties[opposingTeam],
winner,
trackedResult,
rounds,
truncated,
totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
trackedDecisionCount: timingSummary.count,
trackedAverageSimulatedDecisionMs: timingSummary.averageMs,
trackedMaxSimulatedDecisionMs: timingSummary.maxMs,
},
trackedTimings,
simulatedMatchMs,
};
}
/**
 * Plays every (seed, tracked team) combination of a self-play suite, one match
 * after another, and logs progress after the first match, after every 25th
 * match and after the last required match.
 *
 * @returns All match results in play order, the simulated time summed over
 *   all matches, and the tracked team's per-decision timings of all matches.
 */
async function runSelfPlaySuite(
  suite: SelfPlaySuiteConfig,
): Promise<{ results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] }> {
  const { label, requiredMatches } = suite;
  const results: SelfPlayMatchResult[] = [];
  const trackedTeamTimings: number[] = [];
  let suiteSimulatedMs = 0;

  logBenchmarkProgress(`Starting ${label} suite (${requiredMatches} seeded matches with seat swaps).`);

  for (const seed of SELF_PLAY_MATCH_SEEDS) {
    for (const trackedTeam of SELF_PLAY_SEAT_SWAPS) {
      const played = await simulateSelfPlayMatch(suite, seed, trackedTeam);
      const { result } = played;

      results.push(result);
      for (const timing of played.trackedTimings) {
        trackedTeamTimings.push(timing);
      }
      suiteSimulatedMs += played.simulatedMatchMs;

      const completedMatches = results.length;
      const isMilestone = completedMatches === 1
        || completedMatches % 25 === 0
        || completedMatches === requiredMatches;
      if (isMilestone) {
        logBenchmarkProgress(
          `${label} ${completedMatches}/${requiredMatches}: seed ${seed}, tracked team ${trackedTeam}, result ${result.trackedResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.trackedMaxSimulatedDecisionMs)}.`,
        );
      }
    }
  }

  return {
    results,
    suiteSimulatedMs,
    trackedTeamTimings,
  };
}
/**
 * Turns the raw output of a self-play suite run into its summary: result
 * tallies, win and loss rates (0 when no matches were played), per-seed
 * aggregates, watchlist information and timing statistics.
 */
function buildSelfPlaySuiteSummary(
  suite: SelfPlaySuiteConfig,
  run: { results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] },
): SelfPlaySuiteSummary {
  const { results } = run;
  const matchCount = results.length;

  // Tally the tracked-team outcomes in a single pass.
  const tally = { win: 0, loss: 0, draw: 0 };
  for (const { trackedResult } of results) {
    tally[trackedResult] += 1;
  }

  const seedBreakdown = summarizeSelfPlayBySeed(results);
  const rateOf = (count: number): number => (matchCount === 0 ? 0 : count / matchCount);

  // Key order is kept stable because the summary is serialized to JSON.
  return {
    suite: suite.id,
    label: suite.label,
    trackedTeamDifficulty: suite.trackedTeamDifficulty,
    opponentDifficulty: suite.opponentDifficulty,
    matches: matchCount,
    requiredMatches: suite.requiredMatches,
    seedCount: SELF_PLAY_MATCH_SEEDS.length,
    seatBalanced: true,
    wins: tally.win,
    losses: tally.loss,
    draws: tally.draw,
    winRate: rateOf(tally.win),
    lossRate: rateOf(tally.loss),
    perSeed: seedBreakdown.perSeed,
    dualLossSeeds: seedBreakdown.dualLossSeeds,
    regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
    regressionWatchlistDualLossIntersection: seedBreakdown.regressionWatchlistDualLossIntersection,
    simulatedTiming: {
      suiteSimulatedMs: run.suiteSimulatedMs,
      trackedTeamDecisions: summarizeTimings(run.trackedTeamTimings),
    },
    results,
  };
}
/**
 * Evaluates the mirror-parity gate: the suite must have played exactly the
 * required number of matches, and the tracked team's win rate must lie within
 * the target win rate plus or minus the tolerance (bounds inclusive).
 */
function createMirrorParityGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const {
    mirrorMatchTarget: requiredMatches,
    mirrorTargetWinRate: targetWinRate,
    mirrorWinRateTolerance: tolerance,
  } = ITERATION_6_GATE;
  const { matches, wins, losses, draws, winRate } = summary;

  const minWinRate = targetWinRate - tolerance;
  const maxWinRate = targetWinRate + tolerance;
  const matchCountPassed = matches === requiredMatches;
  const winRatePassed = winRate >= minWinRate && winRate <= maxWinRate;

  return {
    matches,
    requiredMatches,
    wins,
    losses,
    draws,
    winRate,
    targetWinRate,
    tolerance,
    minWinRate,
    maxWinRate,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}
/**
 * Evaluates the beginner-dominance gate: the suite must have played exactly
 * the required number of matches, and the tracked team's win rate must be at
 * least the configured minimum. Target, tolerance and maximum are not used by
 * this gate and are reported as null.
 */
function createBeginnerDominanceGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const { beginnerMatchTarget: requiredMatches, beginnerMinWinRate: minWinRate } = ITERATION_6_GATE;
  const { matches, wins, losses, draws, winRate } = summary;

  const matchCountPassed = matches === requiredMatches;
  const winRatePassed = winRate >= minWinRate;

  return {
    matches,
    requiredMatches,
    wins,
    losses,
    draws,
    winRate,
    targetWinRate: null,
    tolerance: null,
    minWinRate,
    maxWinRate: null,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}
/**
 * Formats per-seed aggregates as `seed:winsW-lossesL-drawsD` entries joined
 * with " | ". Returns an empty string for an empty list.
 */
function formatPerSeedAggregates(perSeed: SelfPlaySeedAggregateResult[]): string {
  const cells: string[] = [];
  for (const { seed, wins, losses, draws } of perSeed) {
    cells.push(`${seed}:${wins}W-${losses}L-${draws}D`);
  }
  return cells.join(' | ');
}
/**
 * Prints the benchmark summary to the console: a human-readable digest of the
 * gates, failures, per-seed aggregates and timings, followed by a
 * `BENCHMARK_SUMMARY` marker line and the full summary as indented JSON.
 */
function printReadableSummary(summary: AIBenchmarkSummary): void {
  const { qualityGate: gate, fixedSuite, fixtureTotals, timing } = summary;
  const { mirrorParity: mirror, beginnerDominance: beginner } = summary.selfPlaySuites;
  const mirrorGate = gate.mirrorParity;
  const beginnerGate = gate.beginnerDominance;

  // Small formatting helpers shared by the lines below.
  const listOrNone = (seeds: number[]): string => (seeds.length > 0 ? seeds.join(', ') : 'none');
  const averageOf = (stats: TimingSummary): string => stats.averageMs.toFixed(1);

  console.log('AI quality benchmark');
  console.log(`Iteration 6 quality gate: ${gate.passed ? 'PASS' : 'FAIL'}`);
  console.log(`Fixture totals: ${fixtureTotals.fixtures} total, ${fixtureTotals.criticalFixtures} critical.`);
  console.log(`Fixed-fixture gate: ${gate.fixedFixtures.actual}/${gate.fixedFixtures.total} agreements.`);
  console.log(`Critical concept gate: ${gate.criticalConcepts.actual}/${gate.criticalConcepts.total} passes.`);
  console.log(
    `Mirror parity gate: ${mirrorGate.matches}/${mirrorGate.requiredMatches} matches, ${formatPercentage(mirrorGate.winRate)} tracked-team win rate (target ${formatPercentage(mirrorGate.targetWinRate ?? 0)} +/- ${formatPercentage(mirrorGate.tolerance ?? 0)}).`,
  );
  console.log(
    `Beginner dominance gate: ${beginnerGate.matches}/${beginnerGate.requiredMatches} matches, ${formatPercentage(beginnerGate.winRate)} master win rate (target >= ${formatPercentage(beginnerGate.minWinRate ?? 0)}).`,
  );

  // Failure lists are only printed when there is something to report.
  const agreementFailures = fixedSuite.fixedFixtureAgreementFailures;
  if (agreementFailures.length > 0) {
    console.log(`Fixed-fixture agreement failures: ${agreementFailures.join(', ')}`);
  }
  const conceptFailures = fixedSuite.criticalPassFailures;
  if (conceptFailures.length > 0) {
    console.log(`Critical concept failures: ${conceptFailures.join(', ')}`);
  }

  console.log(`Mirror per-seed aggregates: ${formatPerSeedAggregates(mirror.perSeed)}`);
  console.log(`Mirror dual-loss seeds: ${listOrNone(mirror.dualLossSeeds)}`);
  console.log(
    `Mirror regression watchlist intersection: ${listOrNone(mirror.regressionWatchlistDualLossIntersection)} (watchlist ${mirror.regressionWatchlist.join(', ')})`,
  );
  console.log(`Beginner per-seed aggregates: ${formatPerSeedAggregates(beginner.perSeed)}`);
  console.log(`Beginner dual-loss seeds: ${listOrNone(beginner.dualLossSeeds)}`);
  console.log(
    `Beginner regression watchlist intersection: ${listOrNone(beginner.regressionWatchlistDualLossIntersection)} (watchlist ${beginner.regressionWatchlist.join(', ')})`,
  );

  console.log(
    `Fixed suite simulated duration: production ${formatDurationMs(fixedSuite.simulatedTiming.productionSuiteSimulatedMs)}, reference ${formatDurationMs(fixedSuite.simulatedTiming.referenceSuiteSimulatedMs)}.`,
  );
  console.log(`Mirror suite simulated duration: ${formatDurationMs(mirror.simulatedTiming.suiteSimulatedMs)}.`);
  console.log(`Beginner suite simulated duration: ${formatDurationMs(beginner.simulatedTiming.suiteSimulatedMs)}.`);
  console.log(
    `Simulated timing: fixed production avg ${averageOf(timing.fixedFixtureProductionMasterDecisions)} ms, fixed reference avg ${averageOf(timing.fixedFixtureReferenceMasterDecisions)} ms, mirror tracked avg ${averageOf(timing.mirrorTrackedTeamSimulatedDecisions)} ms, beginner tracked avg ${averageOf(timing.beginnerTrackedTeamSimulatedDecisions)} ms, aggregate avg ${averageOf(timing.allTrackedProductionSimulatedDecisions)} ms.`,
  );

  console.log('BENCHMARK_SUMMARY');
  console.log(JSON.stringify(summary, null, 2));
}
/**
 * Runs the complete AI quality benchmark and assembles its summary.
 *
 * Order of execution: configuration contract check, fixed fixture suite,
 * mirror-parity self-play suite, beginner-dominance self-play suite. The
 * overall quality gate passes only when all four individual gates pass.
 *
 * @returns The benchmark summary, including the gates, per-suite results and timing statistics.
 * @throws Error when the configuration contract check fails.
 */
export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
assertIteration6BenchmarkContract();
logBenchmarkProgress('Benchmark started. Running fixed fixtures, mirror parity, then beginner dominance.');
const fixedSuite = await runFixedFixtureSuite();
logBenchmarkProgress(
`Fixed fixture suite complete with production ${formatDurationMs(fixedSuite.productionSuiteSimulatedMs)} simulated and reference ${formatDurationMs(fixedSuite.referenceSuiteSimulatedMs)} simulated.`,
);
const mirrorRun = await runSelfPlaySuite(SELF_PLAY_SUITES['mirror-parity']);
logBenchmarkProgress(`Mirror parity suite complete in ${formatDurationMs(mirrorRun.suiteSimulatedMs)} simulated.`);
const beginnerRun = await runSelfPlaySuite(SELF_PLAY_SUITES['beginner-dominance']);
logBenchmarkProgress(`Beginner dominance suite complete in ${formatDurationMs(beginnerRun.suiteSimulatedMs)} simulated.`);
// Summaries and win-rate gates of the two self-play suites.
const mirrorSummary = buildSelfPlaySuiteSummary(SELF_PLAY_SUITES['mirror-parity'], mirrorRun);
const beginnerSummary = buildSelfPlaySuiteSummary(SELF_PLAY_SUITES['beginner-dominance'], beginnerRun);
const mirrorParityGate = createMirrorParityGate(mirrorSummary);
const beginnerDominanceGate = createBeginnerDominanceGate(beginnerSummary);
// Fixed-suite tallies. conceptGatePass is null for non-critical fixtures, so the
// strict comparisons below count critical fixtures only.
const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
const fixedFixtureAgreements = fixedSuite.results.filter(result => result.matchesReference).length;
const expectedPasses = fixedSuite.results.filter(result => result.expectedPass).length;
const criticalPasses = fixedSuite.results.filter(result => result.conceptGatePass === true).length;
const fixedFixtureAgreementFailures = fixedSuite.results
.filter(result => !result.matchesReference)
.map(result => result.fixtureId);
const criticalPassFailures = fixedSuite.results
.filter(result => result.conceptGatePass === false)
.map(result => result.fixtureId);
// Timing statistics per suite, plus one pooled over all production decisions.
const fixedFixtureProductionMasterDecisions = summarizeTimings(fixedSuite.productionTimings);
const fixedFixtureReferenceMasterDecisions = summarizeTimings(fixedSuite.referenceTimings);
const mirrorTrackedTeamSimulatedDecisions = mirrorSummary.simulatedTiming.trackedTeamDecisions;
const beginnerTrackedTeamSimulatedDecisions = beginnerSummary.simulatedTiming.trackedTeamDecisions;
const allTrackedProductionSimulatedDecisions = summarizeTimings([
...fixedSuite.productionTimings,
...mirrorRun.trackedTeamTimings,
...beginnerRun.trackedTeamTimings,
]);
// Both count gates require every item to pass.
const fixedFixtureGate: GateCountSummary = {
actual: fixedFixtureAgreements,
required: AI_BENCHMARK_FIXTURES.length,
total: AI_BENCHMARK_FIXTURES.length,
passed: fixedFixtureAgreements === AI_BENCHMARK_FIXTURES.length,
};
const criticalConceptGate: GateCountSummary = {
actual: criticalPasses,
required: criticalFixtureCount,
total: criticalFixtureCount,
passed: criticalPasses === criticalFixtureCount,
};
return {
benchmark: 'ai-quality',
qualityGate: {
iteration: 6,
passed: fixedFixtureGate.passed
&& criticalConceptGate.passed
&& mirrorParityGate.passed
&& beginnerDominanceGate.passed,
fixedFixtures: fixedFixtureGate,
criticalConcepts: criticalConceptGate,
mirrorParity: mirrorParityGate,
beginnerDominance: beginnerDominanceGate,
},
fixtureCount: AI_BENCHMARK_FIXTURES.length,
criticalFixtureCount,
fixtureTotals: {
fixtures: AI_BENCHMARK_FIXTURES.length,
criticalFixtures: criticalFixtureCount,
},
fixedSuite: {
fixedFixtureAgreements,
expectedPasses,
criticalPasses,
fixedFixtureAgreementFailures,
criticalPassFailures,
simulatedTiming: {
productionSuiteSimulatedMs: fixedSuite.productionSuiteSimulatedMs,
referenceSuiteSimulatedMs: fixedSuite.referenceSuiteSimulatedMs,
productionMasterDecisions: fixedFixtureProductionMasterDecisions,
referenceMasterDecisions: fixedFixtureReferenceMasterDecisions,
},
results: fixedSuite.results,
},
selfPlaySuites: {
totalMatches: mirrorSummary.matches + beginnerSummary.matches,
mirrorParity: mirrorSummary,
beginnerDominance: beginnerSummary,
},
timing: {
fixedFixtureProductionMasterDecisions,
fixedFixtureReferenceMasterDecisions,
mirrorTrackedTeamSimulatedDecisions,
beginnerTrackedTeamSimulatedDecisions,
allTrackedProductionSimulatedDecisions,
},
referenceProfile: REFERENCE_PROFILE,
};
}
/** Outcome of one head-to-head match between the new AI and the legacy AI, from the new AI's point of view. */
interface HeadToHeadMatchResult {
suite: 'head-to-head-master' | 'head-to-head-advanced';
seed: number;
// Initial dealer of the match (derived as seed % 4 by the simulator).
dealer: PlayerIndex;
// Team played by the new AI; the other team is played by the legacy AI.
newAITeam: 0 | 1;
// Difficulty passed to both AIs.
newAIDifficulty: Difficulty;
// Winner as reported by getMatchOutcome; null is reported as a 'draw' in newAIResult.
winner: 0 | 1 | null;
newAIResult: 'win' | 'loss' | 'draw';
rounds: number;
// Final totalPoints of team 0 and team 1.
totalPoints: [number, number];
}
/** Summary of one head-to-head suite (new AI versus legacy AI at one difficulty). */
interface HeadToHeadSuiteSummary {
suite: 'head-to-head-master' | 'head-to-head-advanced';
newAIDifficulty: Difficulty;
// Matches played and the new AI's results across them.
matches: number;
wins: number;
losses: number;
draws: number;
// wins / matches (0 when no matches were played).
winRate: number;
// The suite passes when winRate is at least targetWinRate.
targetWinRate: number;
passed: boolean;
// Raw per-match results.
results: HeadToHeadMatchResult[];
}
// Suite-specific values mixed into every RNG seed of a head-to-head suite,
// so the two suites use different deals and decision randomness for the same match seed.
const HEAD_TO_HEAD_SUITE_SEED_KEYS: Record<'head-to-head-master' | 'head-to-head-advanced', number> = {
'head-to-head-master': 0x4d42,
'head-to-head-advanced': 0x4142,
};
/**
 * Simulates one match in which `newAITeam` is played by the current AI
 * (`chooseMove`, with card inference) and the other team by the legacy AI
 * (`chooseMoveOld`). Both AIs receive the same `difficulty` and share one card
 * tracker.
 *
 * The match runs round by round until `getMatchOutcome` reports that it should
 * not continue, or until MAX_SELF_PLAY_ROUNDS rounds have been played. Whether
 * the round cap ended the match is not part of the result; the winner is taken
 * from the final scores in either case.
 *
 * @param suite Head-to-head suite id; selects the seed key.
 * @param difficulty Difficulty passed to both AIs.
 * @param seed Match seed; also determines the initial dealer (seed % 4).
 * @param newAITeam Team played by the current AI.
 * @returns The match result from the current AI's point of view.
 */
async function simulateHeadToHeadMatch(
  suite: 'head-to-head-master' | 'head-to-head-advanced',
  difficulty: Difficulty,
  seed: number,
  newAITeam: 0 | 1,
): Promise<HeadToHeadMatchResult> {
  const suiteSeedKey = HEAD_TO_HEAD_SUITE_SEED_KEYS[suite];
  const initialDealer = (seed % 4) as PlayerIndex;
  // The deal of round N is seeded with (suiteSeedKey, seed, N, 0).
  let state = createInitialState(initialDealer, createMulberry32(seedFromParts(suiteSeedKey, seed, 1, 0)));
  // Remembered so it can be restored on the fresh state created for each later round.
  const matchStartingPlayer = state.matchStartingPlayer;
  const tracker = new CardTracker();
  const inference = new CardInferenceEngine(tracker);
  let rounds = 1;
  // Counts turns across the whole match (not reset per round); part of each decision's RNG seed.
  let turnCount = 0;

  while (rounds <= MAX_SELF_PLAY_ROUNDS) {
    // Play the current round to its end.
    while (!state.roundOver) {
      const playerIdx = state.currentPlayer;
      const actingTeam = teamOf(playerIdx);
      const isNewAI = actingTeam === newAITeam;
      const timingSource = createSimulatedBenchmarkTimingSource();
      const rng = createMulberry32(seedFromParts(suiteSeedKey, seed, rounds, turnCount, playerIdx));

      let move: AIMove;
      if (isNewAI) {
        move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, {
          rng, timingSource, inference,
        });
      } else {
        move = await chooseMoveOld(state, playerIdx, difficulty, tracker, undefined, {
          rng, timingSource,
        });
      }

      // Snapshot the table before the move is applied; the inference engine needs the pre-move table.
      const tableBeforeMove = [...state.table];
      // An empty capture list is passed to the engine as undefined.
      const { nextState, capture } = applyMove(
        state,
        playerIdx,
        move.card,
        move.capture.length > 0 ? move.capture : undefined,
      );
      tracker.trackPlay(move.card);
      if (capture) tracker.trackCapture(capture.captured);
      inference.onMove(playerIdx, move, tableBeforeMove);
      state = nextState;
      turnCount++;
    }

    const roundOutcome = getMatchOutcome(state.teamScores);
    if (!roundOutcome.continueMatch) {
      break;
    }
    // Round cap reached while the match is still undecided: stop here.
    if (rounds === MAX_SELF_PLAY_ROUNDS) {
      break;
    }

    rounds++;
    // Start the next round: new deal with the next dealer, carrying over the accumulated totals.
    const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
    const nextDealer = nextPlayer(state.dealer);
    tracker.reset();
    inference.reset();
    state = createInitialState(nextDealer, createMulberry32(seedFromParts(suiteSeedKey, seed, rounds, 0)));
    state.matchStartingPlayer = matchStartingPlayer;
    state.teamScores[0].totalPoints = totals[0];
    state.teamScores[1].totalPoints = totals[1];
    state.roundNumber = rounds;
  }

  // Re-evaluate the outcome on the final state to determine the winner.
  const outcome = getMatchOutcome(state.teamScores);
  const winner = outcome.winner;
  // No winner is reported as a draw from the current AI's point of view.
  const newAIResult = winner === null ? 'draw' : winner === newAITeam ? 'win' : 'loss';

  return {
    suite,
    seed,
    dealer: initialDealer,
    newAITeam,
    newAIDifficulty: difficulty,
    winner,
    newAIResult,
    rounds,
    totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
  };
}
/**
 * Runs the head-to-head suites (new AI versus legacy AI) for the 'master' and
 * 'advanced' difficulties. Each suite plays every head-to-head seed once per
 * seat swap and passes when the new AI's win rate reaches the suite's target.
 *
 * @returns One summary per suite, in the order master, advanced.
 */
export async function runHeadToHeadBenchmark(): Promise<HeadToHeadSuiteSummary[]> {
  const configs: Array<{
    suite: 'head-to-head-master' | 'head-to-head-advanced';
    difficulty: Difficulty;
    targetWinRate: number;
  }> = [
    { suite: 'head-to-head-master', difficulty: 'master', targetWinRate: HEAD_TO_HEAD_MASTER_TARGET_WIN_RATE },
    { suite: 'head-to-head-advanced', difficulty: 'advanced', targetWinRate: HEAD_TO_HEAD_ADVANCED_TARGET_WIN_RATE },
  ];
  const seedCount = HEAD_TO_HEAD_SEEDS.length;
  const seatSwapCount = HEAD_TO_HEAD_SEAT_SWAPS.length;
  const totalMatches = seedCount * seatSwapCount;
  const summaries: HeadToHeadSuiteSummary[] = [];

  for (const { suite, difficulty, targetWinRate } of configs) {
    const results: HeadToHeadMatchResult[] = [];
    // Tallied while playing instead of filtering the results afterwards.
    const tally = { win: 0, loss: 0, draw: 0 };

    logBenchmarkProgress(`Starting ${suite} (${totalMatches} matches: ${seedCount} seeds × ${seatSwapCount} seat swaps).`);

    for (const seed of HEAD_TO_HEAD_SEEDS) {
      for (const newAITeam of HEAD_TO_HEAD_SEAT_SWAPS) {
        const result = await simulateHeadToHeadMatch(suite, difficulty, seed, newAITeam);
        results.push(result);
        tally[result.newAIResult] += 1;

        const completedMatches = results.length;
        const isMilestone = completedMatches === 1
          || completedMatches % 25 === 0
          || completedMatches === totalMatches;
        if (isMilestone) {
          logBenchmarkProgress(
            `${suite} ${completedMatches}/${totalMatches}: seed ${seed}, newAITeam ${newAITeam}, result ${result.newAIResult}, rounds ${result.rounds}.`,
          );
        }
      }
    }

    const matches = results.length;
    const winRate = matches === 0 ? 0 : tally.win / matches;
    summaries.push({
      suite,
      newAIDifficulty: difficulty,
      matches,
      wins: tally.win,
      losses: tally.loss,
      draws: tally.draw,
      winRate,
      targetWinRate,
      passed: winRate >= targetWinRate,
      results,
    });
  }

  return summaries;
}
/**
 * Command-line entry point: runs the AI quality benchmark and prints its
 * summary, then runs the head-to-head suites and prints one short report per
 * suite (wins, losses, ties and the target win rate with its PASS/FAIL verdict).
 */
async function runBenchmarkCli(): Promise<void> {
  const summary = await runAIBenchmark();
  logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 6 gate results.');
  printReadableSummary(summary);

  logBenchmarkProgress('Starting HEAD_TO_HEAD benchmark (new AI vs legacy AI)...');
  const h2hSuites = await runHeadToHeadBenchmark();
  for (const h2h of h2hSuites) {
    const legacyWinRate = h2h.matches === 0 ? 0 : h2h.losses / h2h.matches;
    const verdict = h2h.passed ? 'PASS' : 'FAIL';
    console.log(`\nHEAD_TO_HEAD: ${h2h.suite} (${h2h.matches} games)`);
    console.log(`New AI wins: ${h2h.wins} (${formatPercentage(h2h.winRate)})`);
    console.log(`Legacy AI wins: ${h2h.losses} (${formatPercentage(legacyWinRate)})`);
    console.log(`Ties: ${h2h.draws}`);
    // Separate the percentage from the verdict; without it the line read e.g. "60.0%PASS".
    console.log(`Target win rate: ${formatPercentage(h2h.targetWinRate)} - ${verdict}`);
  }
}
// Auto-run the benchmark CLI whenever this module is loaded in an environment without a
// global `window`. This happens on any such load, including a plain import of this module.
// NOTE(review): the returned promise is deliberately not awaited (`void`), and no rejection
// handler is attached here; a failure surfaces as an unhandled promise rejection.
if (typeof window === 'undefined') {
void runBenchmarkCli();
}