fix(SCOPONE-0012): complete iteration 2 - speed up benchmark timer
This commit is contained in:
@@ -4,11 +4,10 @@ import {
|
||||
AI_BENCHMARK_FIXTURES,
|
||||
AIBenchmarkCriticalConcept,
|
||||
AIBenchmarkExpectedMove,
|
||||
AIBenchmarkFixture,
|
||||
isCriticalAIBenchmarkFixture,
|
||||
} from './ai-benchmark-fixtures';
|
||||
import { CardTracker } from './card-tracker';
|
||||
import { GameState, PlayerIndex } from './types';
|
||||
import { Difficulty, GameState, PlayerIndex } from './types';
|
||||
|
||||
function formatDurationMs(durationMs: number): string {
|
||||
if (durationMs < 1000) {
|
||||
@@ -18,6 +17,10 @@ function formatDurationMs(durationMs: number): string {
|
||||
return `${(durationMs / 1000).toFixed(2)} s`;
|
||||
}
|
||||
|
||||
/**
 * Formats a ratio as a percentage string with one decimal place.
 *
 * @param value - Ratio to format; it is multiplied by 100 before rounding
 *   (for example `0.5` yields `"50.0%"`).
 * @returns The scaled value rendered with `toFixed(1)` followed by `%`.
 */
function formatPercentage(value: number): string {
  const scaled = value * 100;
  return `${scaled.toFixed(1)}%`;
}
|
||||
|
||||
/**
 * Writes a progress line to the console, prefixed with the `[ai-benchmark]` tag.
 *
 * @param message - Text appended after the tag.
 */
function logBenchmarkProgress(message: string): void {
  console.log(`[ai-benchmark] ${message}`);
}
|
||||
@@ -36,18 +39,23 @@ interface FixedFixtureResult {
|
||||
referenceSimulatedMs: number;
|
||||
}
|
||||
|
||||
type SelfPlaySuiteId = 'mirror-parity' | 'beginner-dominance';
|
||||
|
||||
interface SelfPlayMatchResult {
|
||||
suite: SelfPlaySuiteId;
|
||||
seed: number;
|
||||
dealer: PlayerIndex;
|
||||
masterTeam: 0 | 1;
|
||||
trackedTeam: 0 | 1;
|
||||
trackedTeamDifficulty: Difficulty;
|
||||
opponentDifficulty: Difficulty;
|
||||
winner: 0 | 1 | null;
|
||||
masterResult: 'win' | 'loss' | 'draw';
|
||||
trackedResult: 'win' | 'loss' | 'draw';
|
||||
rounds: number;
|
||||
truncated: boolean;
|
||||
totalPoints: [number, number];
|
||||
masterDecisionCount: number;
|
||||
masterAverageSimulatedDecisionMs: number;
|
||||
masterMaxSimulatedDecisionMs: number;
|
||||
trackedDecisionCount: number;
|
||||
trackedAverageSimulatedDecisionMs: number;
|
||||
trackedMaxSimulatedDecisionMs: number;
|
||||
}
|
||||
|
||||
interface TimingSummary {
|
||||
@@ -64,23 +72,25 @@ interface GateCountSummary {
|
||||
passed: boolean;
|
||||
}
|
||||
|
||||
interface SelfPlayGateSummary {
|
||||
interface WinRateGateSummary {
|
||||
matches: number;
|
||||
requiredMatches: number;
|
||||
wins: number;
|
||||
requiredWins: number;
|
||||
losses: number;
|
||||
maxLosses: number;
|
||||
draws: number;
|
||||
winRate: number;
|
||||
targetWinRate: number | null;
|
||||
tolerance: number | null;
|
||||
minWinRate: number | null;
|
||||
maxWinRate: number | null;
|
||||
matchCountPassed: boolean;
|
||||
winGatePassed: boolean;
|
||||
lossGatePassed: boolean;
|
||||
winRatePassed: boolean;
|
||||
passed: boolean;
|
||||
}
|
||||
|
||||
interface SelfPlaySeedSeatResult {
|
||||
masterTeam: 0 | 1;
|
||||
masterResult: 'win' | 'loss' | 'draw';
|
||||
trackedTeam: 0 | 1;
|
||||
trackedResult: 'win' | 'loss' | 'draw';
|
||||
winner: 0 | 1 | null;
|
||||
rounds: number;
|
||||
truncated: boolean;
|
||||
@@ -97,50 +107,82 @@ interface SelfPlaySeedAggregateResult {
|
||||
seatResults: SelfPlaySeedSeatResult[];
|
||||
}
|
||||
|
||||
/**
 * Aggregated outcome of one self-play suite, as assembled by
 * `buildSelfPlaySuiteSummary`.
 */
interface SelfPlaySuiteSummary {
  /** Identifier of the suite this summary belongs to. */
  suite: SelfPlaySuiteId;
  /** Human-readable suite name used in log and summary lines. */
  label: string;
  /** Difficulty configured for the tracked team. */
  trackedTeamDifficulty: Difficulty;
  /** Difficulty configured for the opposing team. */
  opponentDifficulty: Difficulty;
  /** Number of match results actually recorded. */
  matches: number;
  /** Number of matches the suite configuration requires. */
  requiredMatches: number;
  /** Number of seeds in the shared self-play seed list. */
  seedCount: number;
  /** Whether the suite is reported as seat balanced. */
  seatBalanced: boolean;
  /** Matches whose tracked-team result was a win. */
  wins: number;
  /** Matches whose tracked-team result was a loss. */
  losses: number;
  /** Matches whose tracked-team result was a draw. */
  draws: number;
  /** `wins / matches`, or 0 when no matches were recorded. */
  winRate: number;
  /** `losses / matches`, or 0 when no matches were recorded. */
  lossRate: number;
  /** Per-seed aggregation of the match results. */
  perSeed: SelfPlaySeedAggregateResult[];
  /** Seeds flagged as dual losses by the per-seed aggregation. */
  dualLossSeeds: number[];
  /** Copy of the known regression watchlist seeds. */
  regressionWatchlist: number[];
  /** Watchlist seeds that also appear among the dual-loss seeds. */
  regressionWatchlistDualLossIntersection: number[];
  /** Simulated (not wall-clock) timing figures for the suite. */
  simulatedTiming: {
    /** Sum of simulated decision time across every match in the suite. */
    suiteSimulatedMs: number;
    /** Timing statistics over the tracked team's decisions only. */
    trackedTeamDecisions: TimingSummary;
  };
  /** Raw per-match results the aggregate figures were derived from. */
  results: SelfPlayMatchResult[];
}
|
||||
|
||||
export interface AIBenchmarkSummary {
|
||||
benchmark: 'ai-quality';
|
||||
qualityGate: {
|
||||
iteration: 5;
|
||||
iteration: 6;
|
||||
passed: boolean;
|
||||
fixedFixtures: GateCountSummary;
|
||||
criticalConcepts: GateCountSummary;
|
||||
selfPlay: SelfPlayGateSummary;
|
||||
mirrorParity: WinRateGateSummary;
|
||||
beginnerDominance: WinRateGateSummary;
|
||||
};
|
||||
fixtureCount: number;
|
||||
criticalFixtureCount: number;
|
||||
fixtureTotals: {
|
||||
fixtures: number;
|
||||
criticalFixtures: number;
|
||||
};
|
||||
fixedSuite: {
|
||||
fixedFixtureAgreements: number;
|
||||
expectedPasses: number;
|
||||
criticalPasses: number;
|
||||
fixedFixtureAgreementFailures: string[];
|
||||
criticalPassFailures: string[];
|
||||
simulatedTiming: {
|
||||
productionSuiteSimulatedMs: number;
|
||||
referenceSuiteSimulatedMs: number;
|
||||
productionMasterDecisions: TimingSummary;
|
||||
referenceMasterDecisions: TimingSummary;
|
||||
};
|
||||
results: FixedFixtureResult[];
|
||||
};
|
||||
selfPlay: {
|
||||
matches: number;
|
||||
wins: number;
|
||||
losses: number;
|
||||
draws: number;
|
||||
winRate: number;
|
||||
lossRate: number;
|
||||
perSeed: SelfPlaySeedAggregateResult[];
|
||||
dualLossSeeds: number[];
|
||||
regressionWatchlist: number[];
|
||||
regressionWatchlistDualLossIntersection: number[];
|
||||
results: SelfPlayMatchResult[];
|
||||
selfPlaySuites: {
|
||||
totalMatches: number;
|
||||
mirrorParity: SelfPlaySuiteSummary;
|
||||
beginnerDominance: SelfPlaySuiteSummary;
|
||||
};
|
||||
timing: {
|
||||
productionMasterSimulatedDecisions: TimingSummary;
|
||||
fixedFixtureProductionMasterDecisions: TimingSummary;
|
||||
fixedFixtureReferenceMasterDecisions: TimingSummary;
|
||||
mirrorTrackedTeamSimulatedDecisions: TimingSummary;
|
||||
beginnerTrackedTeamSimulatedDecisions: TimingSummary;
|
||||
allTrackedProductionSimulatedDecisions: TimingSummary;
|
||||
};
|
||||
referenceProfile: Required<AISearchProfileOverride>;
|
||||
}
|
||||
|
||||
const ITERATION_5_GATE = {
|
||||
fixedFixtureAgreementTarget: 13,
|
||||
criticalConceptTarget: 6,
|
||||
selfPlayMatchTarget: 48,
|
||||
selfPlayWinTarget: 30,
|
||||
selfPlayMaxLosses: 12,
|
||||
/**
 * Pass/fail thresholds for the iteration 6 self-play quality gates.
 */
const ITERATION_6_GATE = {
  /** Exact number of mirror-parity matches the gate requires. */
  mirrorMatchTarget: 500,
  /** Exact number of beginner-dominance matches the gate requires. */
  beginnerMatchTarget: 500,
  /** Centre of the accepted mirror-parity win-rate band. */
  mirrorTargetWinRate: 0.5,
  /** Half-width of the mirror-parity band; both band edges are inclusive. */
  mirrorWinRateTolerance: 0.05,
  /** Inclusive minimum tracked-team win rate for the beginner-dominance gate. */
  beginnerMinWinRate: 0.7,
} as const;
|
||||
|
||||
const KNOWN_REGRESSION_WATCHLIST = [1000, 1002, 1004, 1006, 1012, 1013, 1014] as const;
|
||||
@@ -153,29 +195,73 @@ const REFERENCE_PROFILE: Required<AISearchProfileOverride> = {
|
||||
batchSize: 2,
|
||||
};
|
||||
|
||||
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 24 }, (_, index) => 1000 + index);
|
||||
const SELF_PLAY_SEAT_SWAPS = [0, 1] as const;
|
||||
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 250 }, (_, index) => 1000 + index);
|
||||
const MAX_SELF_PLAY_ROUNDS = 20;
|
||||
|
||||
function assertIteration5BenchmarkContract(): void {
|
||||
/**
 * Static configuration of one self-play suite.
 */
interface SelfPlaySuiteConfig {
  /** Suite identifier, copied into every match result and summary. */
  id: SelfPlaySuiteId;
  /** Human-readable name used in progress and summary output. */
  label: string;
  /** Value mixed into every seed derivation for this suite's matches. */
  suiteSeedKey: number;
  /** Number of matches the suite is expected to play. */
  requiredMatches: number;
  /** Difficulty assigned to the tracked team. */
  trackedTeamDifficulty: Difficulty;
  /** Difficulty assigned to the team opposing the tracked team. */
  opponentDifficulty: Difficulty;
  /**
   * Returns the difficulty of each team as a pair indexed by team number
   * (element 0 for team 0, element 1 for team 1).
   *
   * @param trackedTeam - Team whose results are tracked for the match.
   */
  getTeamDifficulties(trackedTeam: 0 | 1): readonly [Difficulty, Difficulty];
}
|
||||
|
||||
/**
 * Configuration for every self-play suite, keyed by suite id.
 *
 * Both suites play each seed once per seat swap, so their required match
 * count is `seeds * seat swaps`.
 */
const SELF_PLAY_SUITES: Record<SelfPlaySuiteId, SelfPlaySuiteConfig> = {
  'mirror-parity': {
    id: 'mirror-parity',
    label: 'Master mirror parity',
    suiteSeedKey: 0x4d31,
    requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
    trackedTeamDifficulty: 'master',
    opponentDifficulty: 'master',
    // Both teams play at master level regardless of which one is tracked.
    getTeamDifficulties: () => ['master', 'master'],
  },
  'beginner-dominance': {
    id: 'beginner-dominance',
    label: 'Master versus beginner dominance',
    suiteSeedKey: 0x4236,
    requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
    trackedTeamDifficulty: 'master',
    opponentDifficulty: 'beginner',
    // The tracked team is always the master side; the other team is beginner.
    getTeamDifficulties: trackedTeam => (trackedTeam === 0
      ? ['master', 'beginner']
      : ['beginner', 'master']),
  },
};
|
||||
|
||||
function assertIteration6BenchmarkContract(): void {
|
||||
const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
|
||||
const selfPlayMatchCount = SELF_PLAY_MATCH_SEEDS.length * 2;
|
||||
const expectedSeatBalancedMatches = SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length;
|
||||
|
||||
if (AI_BENCHMARK_FIXTURES.length !== ITERATION_5_GATE.fixedFixtureAgreementTarget) {
|
||||
if (AI_BENCHMARK_FIXTURES.length === 0) {
|
||||
throw new Error('Iteration 6 benchmark requires at least one fixed fixture.');
|
||||
}
|
||||
|
||||
if (criticalFixtureCount === 0) {
|
||||
throw new Error('Iteration 6 benchmark requires at least one critical concept fixture.');
|
||||
}
|
||||
|
||||
if (expectedSeatBalancedMatches !== ITERATION_6_GATE.mirrorMatchTarget) {
|
||||
throw new Error(
|
||||
`Iteration 5 benchmark expects ${ITERATION_5_GATE.fixedFixtureAgreementTarget} fixed fixtures, received ${AI_BENCHMARK_FIXTURES.length}.`,
|
||||
`Iteration 6 benchmark expects ${ITERATION_6_GATE.mirrorMatchTarget} mirror matches, received ${expectedSeatBalancedMatches}.`,
|
||||
);
|
||||
}
|
||||
|
||||
if (criticalFixtureCount !== ITERATION_5_GATE.criticalConceptTarget) {
|
||||
if (expectedSeatBalancedMatches !== ITERATION_6_GATE.beginnerMatchTarget) {
|
||||
throw new Error(
|
||||
`Iteration 5 benchmark expects ${ITERATION_5_GATE.criticalConceptTarget} critical concept fixtures, received ${criticalFixtureCount}.`,
|
||||
`Iteration 6 benchmark expects ${ITERATION_6_GATE.beginnerMatchTarget} beginner-dominance matches, received ${expectedSeatBalancedMatches}.`,
|
||||
);
|
||||
}
|
||||
|
||||
if (selfPlayMatchCount !== ITERATION_5_GATE.selfPlayMatchTarget) {
|
||||
throw new Error(
|
||||
`Iteration 5 benchmark expects ${ITERATION_5_GATE.selfPlayMatchTarget} self-play matches, received ${selfPlayMatchCount}.`,
|
||||
);
|
||||
for (const suite of Object.values(SELF_PLAY_SUITES)) {
|
||||
if (suite.requiredMatches !== expectedSeatBalancedMatches) {
|
||||
throw new Error(
|
||||
`Iteration 6 benchmark expects ${expectedSeatBalancedMatches} matches for ${suite.id}, received ${suite.requiredMatches}.`,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -220,6 +306,10 @@ function moveKey(move: AIMove): string {
|
||||
return `${move.card.id}|${move.capture.map(card => card.id).sort().join(',')}`;
|
||||
}
|
||||
|
||||
/**
 * Returns the opposing team index.
 *
 * @param team - Team index, either 0 or 1.
 * @returns 1 when `team` is 0, otherwise 0.
 */
function otherTeam(team: 0 | 1): 0 | 1 {
  if (team === 0) {
    return 1;
  }
  return 0;
}
|
||||
|
||||
function createTrackerForState(state: GameState): CardTracker {
|
||||
const tracker = new CardTracker();
|
||||
for (const player of state.players) {
|
||||
@@ -239,10 +329,16 @@ function matchesExpectedMove(move: AIMove, expected: AIBenchmarkExpectedMove): b
|
||||
return actualCapture === expectedCapture;
|
||||
}
|
||||
|
||||
async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[]; wallClockMs: number; productionTimings: number[] }> {
|
||||
const startedAt = performance.now();
|
||||
async function runFixedFixtureSuite(): Promise<{
|
||||
results: FixedFixtureResult[];
|
||||
productionSuiteSimulatedMs: number;
|
||||
referenceSuiteSimulatedMs: number;
|
||||
productionTimings: number[];
|
||||
referenceTimings: number[];
|
||||
}> {
|
||||
const results: FixedFixtureResult[] = [];
|
||||
const productionTimings: number[] = [];
|
||||
const referenceTimings: number[] = [];
|
||||
|
||||
logBenchmarkProgress(`Starting fixed fixture suite (${AI_BENCHMARK_FIXTURES.length} positions).`);
|
||||
|
||||
@@ -286,6 +382,7 @@ async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[];
|
||||
const referenceSimulatedMs = referenceTimingSource.getElapsedMs();
|
||||
|
||||
productionTimings.push(productionSimulatedMs);
|
||||
referenceTimings.push(referenceSimulatedMs);
|
||||
|
||||
const conceptGatePass = isCriticalAIBenchmarkFixture(fixture)
|
||||
? matchesExpectedMove(productionMove, fixture.expectedMove)
|
||||
@@ -314,11 +411,17 @@ async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[];
|
||||
|
||||
return {
|
||||
results,
|
||||
wallClockMs: performance.now() - startedAt,
|
||||
productionSuiteSimulatedMs: sumTimings(productionTimings),
|
||||
referenceSuiteSimulatedMs: sumTimings(referenceTimings),
|
||||
productionTimings,
|
||||
referenceTimings,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Adds up a list of timing samples.
 *
 * @param samples - Samples to total; the array is only read, never mutated.
 * @returns The sum of all samples, or 0 for an empty list.
 */
function sumTimings(samples: readonly number[]): number {
  let total = 0;
  for (const sample of samples) {
    total += sample;
  }
  return total;
}
|
||||
|
||||
function summarizeTimings(samples: number[]): TimingSummary {
|
||||
if (samples.length === 0) {
|
||||
return {
|
||||
@@ -360,13 +463,13 @@ function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
|
||||
};
|
||||
|
||||
existing.matches++;
|
||||
if (result.masterResult === 'win') existing.wins++;
|
||||
else if (result.masterResult === 'loss') existing.losses++;
|
||||
if (result.trackedResult === 'win') existing.wins++;
|
||||
else if (result.trackedResult === 'loss') existing.losses++;
|
||||
else existing.draws++;
|
||||
|
||||
existing.seatResults.push({
|
||||
masterTeam: result.masterTeam,
|
||||
masterResult: result.masterResult,
|
||||
trackedTeam: result.trackedTeam,
|
||||
trackedResult: result.trackedResult,
|
||||
winner: result.winner,
|
||||
rounds: result.rounds,
|
||||
truncated: result.truncated,
|
||||
@@ -380,7 +483,7 @@ function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
|
||||
.map(aggregate => ({
|
||||
...aggregate,
|
||||
dualLoss: aggregate.losses >= 2,
|
||||
seatResults: [...aggregate.seatResults].sort((left, right) => left.masterTeam - right.masterTeam),
|
||||
seatResults: [...aggregate.seatResults].sort((left, right) => left.trackedTeam - right.trackedTeam),
|
||||
}))
|
||||
.sort((left, right) => left.seed - right.seed);
|
||||
const dualLossSeeds = perSeed.filter(aggregate => aggregate.dualLoss).map(aggregate => aggregate.seed);
|
||||
@@ -393,12 +496,18 @@ function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
|
||||
};
|
||||
}
|
||||
|
||||
async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{ result: SelfPlayMatchResult; timings: number[] }> {
|
||||
async function simulateSelfPlayMatch(
|
||||
suite: SelfPlaySuiteConfig,
|
||||
seed: number,
|
||||
trackedTeam: 0 | 1,
|
||||
): Promise<{ result: SelfPlayMatchResult; trackedTimings: number[]; simulatedMatchMs: number }> {
|
||||
const initialDealer = (seed % 4) as PlayerIndex;
|
||||
let state = createInitialState(initialDealer, createMulberry32(seedFromParts(seed, 1, 0)));
|
||||
const teamDifficulties = suite.getTeamDifficulties(trackedTeam);
|
||||
let state = createInitialState(initialDealer, createMulberry32(seedFromParts(suite.suiteSeedKey, seed, 1, 0)));
|
||||
const matchStartingPlayer = state.matchStartingPlayer;
|
||||
const tracker = new CardTracker();
|
||||
const masterTimings: number[] = [];
|
||||
const trackedTimings: number[] = [];
|
||||
let simulatedMatchMs = 0;
|
||||
|
||||
let rounds = 1;
|
||||
let truncated = false;
|
||||
@@ -407,19 +516,18 @@ async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{
|
||||
while (rounds <= MAX_SELF_PLAY_ROUNDS) {
|
||||
while (!state.roundOver) {
|
||||
const playerIdx = state.currentPlayer;
|
||||
const difficulty = teamOf(playerIdx) === masterTeam ? 'master' : 'advanced';
|
||||
const actingTeam = teamOf(playerIdx);
|
||||
const difficulty = teamDifficulties[actingTeam];
|
||||
const timingSource = createSimulatedBenchmarkTimingSource();
|
||||
const options = difficulty === 'master'
|
||||
? {
|
||||
rng: createMulberry32(seedFromParts(seed, rounds, turnCount, playerIdx)),
|
||||
timingSource,
|
||||
}
|
||||
: { timingSource };
|
||||
const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, options);
|
||||
const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, {
|
||||
rng: createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, turnCount, playerIdx)),
|
||||
timingSource,
|
||||
});
|
||||
const simulatedMs = timingSource.getElapsedMs();
|
||||
simulatedMatchMs += simulatedMs;
|
||||
|
||||
if (difficulty === 'master') {
|
||||
masterTimings.push(simulatedMs);
|
||||
if (actingTeam === trackedTeam) {
|
||||
trackedTimings.push(simulatedMs);
|
||||
}
|
||||
|
||||
const { nextState, capture } = applyMove(
|
||||
@@ -441,16 +549,17 @@ async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{
|
||||
break;
|
||||
}
|
||||
|
||||
rounds++;
|
||||
if (rounds > MAX_SELF_PLAY_ROUNDS) {
|
||||
if (rounds === MAX_SELF_PLAY_ROUNDS) {
|
||||
truncated = true;
|
||||
break;
|
||||
}
|
||||
|
||||
rounds++;
|
||||
|
||||
const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
|
||||
const nextDealer = nextPlayer(state.dealer);
|
||||
tracker.reset();
|
||||
state = createInitialState(nextDealer, createMulberry32(seedFromParts(seed, rounds, 0)));
|
||||
state = createInitialState(nextDealer, createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, 0)));
|
||||
state.matchStartingPlayer = matchStartingPlayer;
|
||||
state.teamScores[0].totalPoints = totals[0];
|
||||
state.teamScores[1].totalPoints = totals[1];
|
||||
@@ -458,47 +567,54 @@ async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{
|
||||
}
|
||||
|
||||
const outcome = getMatchOutcome(state.teamScores);
|
||||
const winner = truncated ? outcome.winner : outcome.winner;
|
||||
const masterResult = winner === null ? 'draw' : winner === masterTeam ? 'win' : 'loss';
|
||||
const timingSummary = summarizeTimings(masterTimings);
|
||||
const winner = outcome.winner;
|
||||
const timingSummary = summarizeTimings(trackedTimings);
|
||||
const opposingTeam = otherTeam(trackedTeam);
|
||||
const trackedResult = winner === null ? 'draw' : winner === trackedTeam ? 'win' : 'loss';
|
||||
|
||||
return {
|
||||
result: {
|
||||
suite: suite.id,
|
||||
seed,
|
||||
dealer: initialDealer,
|
||||
masterTeam,
|
||||
trackedTeam,
|
||||
trackedTeamDifficulty: teamDifficulties[trackedTeam],
|
||||
opponentDifficulty: teamDifficulties[opposingTeam],
|
||||
winner,
|
||||
masterResult,
|
||||
trackedResult,
|
||||
rounds,
|
||||
truncated,
|
||||
totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
|
||||
masterDecisionCount: timingSummary.count,
|
||||
masterAverageSimulatedDecisionMs: timingSummary.averageMs,
|
||||
masterMaxSimulatedDecisionMs: timingSummary.maxMs,
|
||||
trackedDecisionCount: timingSummary.count,
|
||||
trackedAverageSimulatedDecisionMs: timingSummary.averageMs,
|
||||
trackedMaxSimulatedDecisionMs: timingSummary.maxMs,
|
||||
},
|
||||
timings: masterTimings,
|
||||
trackedTimings,
|
||||
simulatedMatchMs,
|
||||
};
|
||||
}
|
||||
|
||||
async function runSelfPlaySuite(): Promise<{ results: SelfPlayMatchResult[]; wallClockMs: number; productionTimings: number[] }> {
|
||||
const startedAt = performance.now();
|
||||
async function runSelfPlaySuite(
|
||||
suite: SelfPlaySuiteConfig,
|
||||
): Promise<{ results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] }> {
|
||||
const results: SelfPlayMatchResult[] = [];
|
||||
const productionTimings: number[] = [];
|
||||
const totalMatches = SELF_PLAY_MATCH_SEEDS.length * 2;
|
||||
const trackedTeamTimings: number[] = [];
|
||||
let suiteSimulatedMs = 0;
|
||||
let completedMatches = 0;
|
||||
|
||||
logBenchmarkProgress(`Starting self-play suite (${totalMatches} seeded matches with seat swaps).`);
|
||||
logBenchmarkProgress(`Starting ${suite.label} suite (${suite.requiredMatches} seeded matches with seat swaps).`);
|
||||
|
||||
for (const seed of SELF_PLAY_MATCH_SEEDS) {
|
||||
for (const masterTeam of [0, 1] as const) {
|
||||
const { result, timings } = await simulateSelfPlayMatch(seed, masterTeam);
|
||||
for (const trackedTeam of SELF_PLAY_SEAT_SWAPS) {
|
||||
const { result, trackedTimings, simulatedMatchMs } = await simulateSelfPlayMatch(suite, seed, trackedTeam);
|
||||
results.push(result);
|
||||
productionTimings.push(...timings);
|
||||
trackedTeamTimings.push(...trackedTimings);
|
||||
suiteSimulatedMs += simulatedMatchMs;
|
||||
completedMatches++;
|
||||
|
||||
if (completedMatches === 1 || completedMatches % 4 === 0 || completedMatches === totalMatches) {
|
||||
if (completedMatches === 1 || completedMatches % 25 === 0 || completedMatches === suite.requiredMatches) {
|
||||
logBenchmarkProgress(
|
||||
`Self-play ${completedMatches}/${totalMatches}: seed ${seed}, master team ${masterTeam}, result ${result.masterResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.masterMaxSimulatedDecisionMs)}.`,
|
||||
`${suite.label} ${completedMatches}/${suite.requiredMatches}: seed ${seed}, tracked team ${trackedTeam}, result ${result.trackedResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.trackedMaxSimulatedDecisionMs)}.`,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -506,38 +622,157 @@ async function runSelfPlaySuite(): Promise<{ results: SelfPlayMatchResult[]; wal
|
||||
|
||||
return {
|
||||
results,
|
||||
wallClockMs: performance.now() - startedAt,
|
||||
productionTimings,
|
||||
suiteSimulatedMs,
|
||||
trackedTeamTimings,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds the aggregate summary for one self-play suite run.
 *
 * @param suite - Configuration of the suite that was run.
 * @param run - Raw run output: per-match results, total simulated time and
 *   the tracked team's per-decision simulated timings.
 * @returns Totals, rates, per-seed aggregates and timing statistics for the suite.
 */
function buildSelfPlaySuiteSummary(
  suite: SelfPlaySuiteConfig,
  run: { results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] },
): SelfPlaySuiteSummary {
  // Tally tracked-team outcomes in a single pass over the results.
  const tally = { win: 0, loss: 0, draw: 0 };
  for (const result of run.results) {
    tally[result.trackedResult]++;
  }

  const { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection } = summarizeSelfPlayBySeed(run.results);
  const matchCount = run.results.length;

  return {
    suite: suite.id,
    label: suite.label,
    trackedTeamDifficulty: suite.trackedTeamDifficulty,
    opponentDifficulty: suite.opponentDifficulty,
    matches: matchCount,
    requiredMatches: suite.requiredMatches,
    seedCount: SELF_PLAY_MATCH_SEEDS.length,
    // NOTE(review): reported as a constant and not derived from the results — confirm this is intended.
    seatBalanced: true,
    wins: tally.win,
    losses: tally.loss,
    draws: tally.draw,
    // Guard against division by zero when no matches were recorded.
    winRate: matchCount === 0 ? 0 : tally.win / matchCount,
    lossRate: matchCount === 0 ? 0 : tally.loss / matchCount,
    perSeed,
    dualLossSeeds,
    // Copy so the summary does not share the readonly constant tuple.
    regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
    regressionWatchlistDualLossIntersection,
    simulatedTiming: {
      suiteSimulatedMs: run.suiteSimulatedMs,
      trackedTeamDecisions: summarizeTimings(run.trackedTeamTimings),
    },
    results: run.results,
  };
}
|
||||
|
||||
/**
 * Evaluates the mirror-parity gate for a suite summary.
 *
 * The gate passes when the suite played exactly the required number of
 * matches and the tracked-team win rate lies inside the inclusive band
 * `target - tolerance .. target + tolerance`.
 *
 * @param summary - Summary of the mirror-parity suite.
 * @returns The gate figures together with the individual and overall pass flags.
 */
function createMirrorParityGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const {
    mirrorMatchTarget: requiredMatches,
    mirrorTargetWinRate: targetWinRate,
    mirrorWinRateTolerance: tolerance,
  } = ITERATION_6_GATE;

  const minWinRate = targetWinRate - tolerance;
  const maxWinRate = targetWinRate + tolerance;
  const matchCountPassed = summary.matches === requiredMatches;
  const winRatePassed = summary.winRate >= minWinRate && summary.winRate <= maxWinRate;

  return {
    matches: summary.matches,
    requiredMatches,
    wins: summary.wins,
    losses: summary.losses,
    draws: summary.draws,
    winRate: summary.winRate,
    targetWinRate,
    tolerance,
    minWinRate,
    maxWinRate,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}
|
||||
|
||||
/**
 * Evaluates the beginner-dominance gate for a suite summary.
 *
 * The gate passes when the suite played exactly the required number of
 * matches and the tracked-team win rate is at least the configured minimum.
 * This gate has no target, tolerance or upper bound, so those fields are `null`.
 *
 * @param summary - Summary of the beginner-dominance suite.
 * @returns The gate figures together with the individual and overall pass flags.
 */
function createBeginnerDominanceGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const {
    beginnerMatchTarget: requiredMatches,
    beginnerMinWinRate: minWinRate,
  } = ITERATION_6_GATE;

  const matchCountPassed = summary.matches === requiredMatches;
  const winRatePassed = summary.winRate >= minWinRate;

  return {
    matches: summary.matches,
    requiredMatches,
    wins: summary.wins,
    losses: summary.losses,
    draws: summary.draws,
    winRate: summary.winRate,
    targetWinRate: null,
    tolerance: null,
    minWinRate,
    maxWinRate: null,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}
|
||||
|
||||
/**
 * Renders per-seed aggregates as a single line for the readable summary.
 *
 * Each seed becomes `<seed>:<wins>W-<losses>L-<draws>D`; entries are joined
 * with ` | `.
 *
 * @param perSeed - Per-seed aggregates to render; the array is only read.
 * @returns The joined line, or an empty string when there are no aggregates.
 */
function formatPerSeedAggregates(perSeed: readonly SelfPlaySeedAggregateResult[]): string {
  const entries: string[] = [];
  for (const aggregate of perSeed) {
    entries.push(`${aggregate.seed}:${aggregate.wins}W-${aggregate.losses}L-${aggregate.draws}D`);
  }
  return entries.join(' | ');
}
|
||||
|
||||
function printReadableSummary(summary: AIBenchmarkSummary): void {
|
||||
const mirror = summary.selfPlaySuites.mirrorParity;
|
||||
const beginner = summary.selfPlaySuites.beginnerDominance;
|
||||
|
||||
console.log('AI quality benchmark');
|
||||
console.log(`Iteration 5 quality gate: ${summary.qualityGate.passed ? 'PASS' : 'FAIL'}`);
|
||||
console.log(`Fixed-fixture gate: ${summary.qualityGate.fixedFixtures.actual}/${summary.qualityGate.fixedFixtures.total} agreements (target ${summary.qualityGate.fixedFixtures.required}/${summary.qualityGate.fixedFixtures.total}).`);
|
||||
console.log(`Critical concept gate: ${summary.qualityGate.criticalConcepts.actual}/${summary.qualityGate.criticalConcepts.total} passes (target ${summary.qualityGate.criticalConcepts.required}/${summary.qualityGate.criticalConcepts.total}).`);
|
||||
console.log(`Self-play gate: ${summary.qualityGate.selfPlay.matches}/${summary.qualityGate.selfPlay.requiredMatches} matches, ${summary.qualityGate.selfPlay.wins}/${summary.qualityGate.selfPlay.matches} wins (target ${summary.qualityGate.selfPlay.requiredWins}), ${summary.qualityGate.selfPlay.losses}/${summary.qualityGate.selfPlay.matches} losses (max ${summary.qualityGate.selfPlay.maxLosses}), ${summary.qualityGate.selfPlay.draws} draws.`);
|
||||
console.log(`Iteration 6 quality gate: ${summary.qualityGate.passed ? 'PASS' : 'FAIL'}`);
|
||||
console.log(`Fixture totals: ${summary.fixtureTotals.fixtures} total, ${summary.fixtureTotals.criticalFixtures} critical.`);
|
||||
console.log(`Fixed-fixture gate: ${summary.qualityGate.fixedFixtures.actual}/${summary.qualityGate.fixedFixtures.total} agreements.`);
|
||||
console.log(`Critical concept gate: ${summary.qualityGate.criticalConcepts.actual}/${summary.qualityGate.criticalConcepts.total} passes.`);
|
||||
console.log(
|
||||
`Mirror parity gate: ${summary.qualityGate.mirrorParity.matches}/${summary.qualityGate.mirrorParity.requiredMatches} matches, ${formatPercentage(summary.qualityGate.mirrorParity.winRate)} tracked-team win rate (target ${formatPercentage(summary.qualityGate.mirrorParity.targetWinRate ?? 0)} +/- ${formatPercentage(summary.qualityGate.mirrorParity.tolerance ?? 0)}).`,
|
||||
);
|
||||
console.log(
|
||||
`Beginner dominance gate: ${summary.qualityGate.beginnerDominance.matches}/${summary.qualityGate.beginnerDominance.requiredMatches} matches, ${formatPercentage(summary.qualityGate.beginnerDominance.winRate)} master win rate (target >= ${formatPercentage(summary.qualityGate.beginnerDominance.minWinRate ?? 0)}).`,
|
||||
);
|
||||
if (summary.fixedSuite.fixedFixtureAgreementFailures.length > 0) {
|
||||
console.log(`Fixed-fixture agreement failures: ${summary.fixedSuite.fixedFixtureAgreementFailures.join(', ')}`);
|
||||
}
|
||||
if (summary.fixedSuite.criticalPassFailures.length > 0) {
|
||||
console.log(`Critical concept failures: ${summary.fixedSuite.criticalPassFailures.join(', ')}`);
|
||||
}
|
||||
console.log(`Per-seed outcomes: ${summary.selfPlay.perSeed.map(seed => `${seed.seed}:${seed.wins}W-${seed.losses}L-${seed.draws}D`).join(' | ')}`);
|
||||
console.log(`Dual-loss seeds: ${summary.selfPlay.dualLossSeeds.length > 0 ? summary.selfPlay.dualLossSeeds.join(', ') : 'none'}`);
|
||||
console.log(`Regression watchlist intersection: ${summary.selfPlay.regressionWatchlistDualLossIntersection.length > 0 ? summary.selfPlay.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${summary.selfPlay.regressionWatchlist.join(', ')})`);
|
||||
console.log(`Master simulated timing: avg ${summary.timing.productionMasterSimulatedDecisions.averageMs.toFixed(1)} ms, p95 ${summary.timing.productionMasterSimulatedDecisions.p95Ms.toFixed(1)} ms, max ${summary.timing.productionMasterSimulatedDecisions.maxMs.toFixed(1)} ms.`);
|
||||
console.log(`Mirror per-seed aggregates: ${formatPerSeedAggregates(mirror.perSeed)}`);
|
||||
console.log(`Mirror dual-loss seeds: ${mirror.dualLossSeeds.length > 0 ? mirror.dualLossSeeds.join(', ') : 'none'}`);
|
||||
console.log(
|
||||
`Mirror regression watchlist intersection: ${mirror.regressionWatchlistDualLossIntersection.length > 0 ? mirror.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${mirror.regressionWatchlist.join(', ')})`,
|
||||
);
|
||||
console.log(`Beginner per-seed aggregates: ${formatPerSeedAggregates(beginner.perSeed)}`);
|
||||
console.log(`Beginner dual-loss seeds: ${beginner.dualLossSeeds.length > 0 ? beginner.dualLossSeeds.join(', ') : 'none'}`);
|
||||
console.log(
|
||||
`Beginner regression watchlist intersection: ${beginner.regressionWatchlistDualLossIntersection.length > 0 ? beginner.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${beginner.regressionWatchlist.join(', ')})`,
|
||||
);
|
||||
console.log(
|
||||
`Fixed suite simulated duration: production ${formatDurationMs(summary.fixedSuite.simulatedTiming.productionSuiteSimulatedMs)}, reference ${formatDurationMs(summary.fixedSuite.simulatedTiming.referenceSuiteSimulatedMs)}.`,
|
||||
);
|
||||
console.log(`Mirror suite simulated duration: ${formatDurationMs(mirror.simulatedTiming.suiteSimulatedMs)}.`);
|
||||
console.log(`Beginner suite simulated duration: ${formatDurationMs(beginner.simulatedTiming.suiteSimulatedMs)}.`);
|
||||
console.log(
|
||||
`Simulated timing: fixed production avg ${summary.timing.fixedFixtureProductionMasterDecisions.averageMs.toFixed(1)} ms, fixed reference avg ${summary.timing.fixedFixtureReferenceMasterDecisions.averageMs.toFixed(1)} ms, mirror tracked avg ${summary.timing.mirrorTrackedTeamSimulatedDecisions.averageMs.toFixed(1)} ms, beginner tracked avg ${summary.timing.beginnerTrackedTeamSimulatedDecisions.averageMs.toFixed(1)} ms, aggregate avg ${summary.timing.allTrackedProductionSimulatedDecisions.averageMs.toFixed(1)} ms.`,
|
||||
);
|
||||
console.log('BENCHMARK_SUMMARY');
|
||||
console.log(JSON.stringify(summary, null, 2));
|
||||
}
|
||||
|
||||
export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
|
||||
assertIteration5BenchmarkContract();
|
||||
logBenchmarkProgress('Benchmark started. Running fixed fixtures first, then self-play.');
|
||||
assertIteration6BenchmarkContract();
|
||||
logBenchmarkProgress('Benchmark started. Running fixed fixtures, mirror parity, then beginner dominance.');
|
||||
|
||||
const fixedSuite = await runFixedFixtureSuite();
|
||||
logBenchmarkProgress(`Fixed fixture suite complete in ${formatDurationMs(fixedSuite.wallClockMs)} wall-clock.`);
|
||||
const selfPlay = await runSelfPlaySuite();
|
||||
logBenchmarkProgress(`Self-play suite complete in ${formatDurationMs(selfPlay.wallClockMs)} wall-clock.`);
|
||||
logBenchmarkProgress(
|
||||
`Fixed fixture suite complete with production ${formatDurationMs(fixedSuite.productionSuiteSimulatedMs)} simulated and reference ${formatDurationMs(fixedSuite.referenceSuiteSimulatedMs)} simulated.`,
|
||||
);
|
||||
|
||||
const mirrorRun = await runSelfPlaySuite(SELF_PLAY_SUITES['mirror-parity']);
|
||||
logBenchmarkProgress(`Mirror parity suite complete in ${formatDurationMs(mirrorRun.suiteSimulatedMs)} simulated.`);
|
||||
|
||||
const beginnerRun = await runSelfPlaySuite(SELF_PLAY_SUITES['beginner-dominance']);
|
||||
logBenchmarkProgress(`Beginner dominance suite complete in ${formatDurationMs(beginnerRun.suiteSimulatedMs)} simulated.`);
|
||||
|
||||
const mirrorSummary = buildSelfPlaySuiteSummary(SELF_PLAY_SUITES['mirror-parity'], mirrorRun);
|
||||
const beginnerSummary = buildSelfPlaySuiteSummary(SELF_PLAY_SUITES['beginner-dominance'], beginnerRun);
|
||||
const mirrorParityGate = createMirrorParityGate(mirrorSummary);
|
||||
const beginnerDominanceGate = createBeginnerDominanceGate(beginnerSummary);
|
||||
|
||||
const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
|
||||
const fixedFixtureAgreements = fixedSuite.results.filter(result => result.matchesReference).length;
|
||||
const expectedPasses = fixedSuite.results.filter(result => result.expectedPass).length;
|
||||
@@ -548,76 +783,73 @@ export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
|
||||
const criticalPassFailures = fixedSuite.results
|
||||
.filter(result => result.conceptGatePass === false)
|
||||
.map(result => result.fixtureId);
|
||||
const wins = selfPlay.results.filter(result => result.masterResult === 'win').length;
|
||||
const losses = selfPlay.results.filter(result => result.masterResult === 'loss').length;
|
||||
const draws = selfPlay.results.filter(result => result.masterResult === 'draw').length;
|
||||
const { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection } = summarizeSelfPlayBySeed(selfPlay.results);
|
||||
const productionMasterSimulatedDecisions = summarizeTimings([
|
||||
const fixedFixtureProductionMasterDecisions = summarizeTimings(fixedSuite.productionTimings);
|
||||
const fixedFixtureReferenceMasterDecisions = summarizeTimings(fixedSuite.referenceTimings);
|
||||
const mirrorTrackedTeamSimulatedDecisions = mirrorSummary.simulatedTiming.trackedTeamDecisions;
|
||||
const beginnerTrackedTeamSimulatedDecisions = beginnerSummary.simulatedTiming.trackedTeamDecisions;
|
||||
const allTrackedProductionSimulatedDecisions = summarizeTimings([
|
||||
...fixedSuite.productionTimings,
|
||||
...selfPlay.productionTimings,
|
||||
...mirrorRun.trackedTeamTimings,
|
||||
...beginnerRun.trackedTeamTimings,
|
||||
]);
|
||||
|
||||
const fixedFixtureGate: GateCountSummary = {
|
||||
actual: fixedFixtureAgreements,
|
||||
required: ITERATION_5_GATE.fixedFixtureAgreementTarget,
|
||||
required: AI_BENCHMARK_FIXTURES.length,
|
||||
total: AI_BENCHMARK_FIXTURES.length,
|
||||
passed: fixedFixtureAgreements === ITERATION_5_GATE.fixedFixtureAgreementTarget,
|
||||
passed: fixedFixtureAgreements === AI_BENCHMARK_FIXTURES.length,
|
||||
};
|
||||
const criticalConceptGate: GateCountSummary = {
|
||||
actual: criticalPasses,
|
||||
required: ITERATION_5_GATE.criticalConceptTarget,
|
||||
required: criticalFixtureCount,
|
||||
total: criticalFixtureCount,
|
||||
passed: criticalPasses === ITERATION_5_GATE.criticalConceptTarget,
|
||||
};
|
||||
const selfPlayGate: SelfPlayGateSummary = {
|
||||
matches: selfPlay.results.length,
|
||||
requiredMatches: ITERATION_5_GATE.selfPlayMatchTarget,
|
||||
wins,
|
||||
requiredWins: ITERATION_5_GATE.selfPlayWinTarget,
|
||||
losses,
|
||||
maxLosses: ITERATION_5_GATE.selfPlayMaxLosses,
|
||||
draws,
|
||||
matchCountPassed: selfPlay.results.length === ITERATION_5_GATE.selfPlayMatchTarget,
|
||||
winGatePassed: wins >= ITERATION_5_GATE.selfPlayWinTarget,
|
||||
lossGatePassed: losses <= ITERATION_5_GATE.selfPlayMaxLosses,
|
||||
passed: selfPlay.results.length === ITERATION_5_GATE.selfPlayMatchTarget
|
||||
&& wins >= ITERATION_5_GATE.selfPlayWinTarget
|
||||
&& losses <= ITERATION_5_GATE.selfPlayMaxLosses,
|
||||
passed: criticalPasses === criticalFixtureCount,
|
||||
};
|
||||
|
||||
return {
|
||||
benchmark: 'ai-quality',
|
||||
qualityGate: {
|
||||
iteration: 5,
|
||||
passed: fixedFixtureGate.passed && criticalConceptGate.passed && selfPlayGate.passed,
|
||||
iteration: 6,
|
||||
passed: fixedFixtureGate.passed
|
||||
&& criticalConceptGate.passed
|
||||
&& mirrorParityGate.passed
|
||||
&& beginnerDominanceGate.passed,
|
||||
fixedFixtures: fixedFixtureGate,
|
||||
criticalConcepts: criticalConceptGate,
|
||||
selfPlay: selfPlayGate,
|
||||
mirrorParity: mirrorParityGate,
|
||||
beginnerDominance: beginnerDominanceGate,
|
||||
},
|
||||
fixtureCount: AI_BENCHMARK_FIXTURES.length,
|
||||
criticalFixtureCount,
|
||||
fixtureTotals: {
|
||||
fixtures: AI_BENCHMARK_FIXTURES.length,
|
||||
criticalFixtures: criticalFixtureCount,
|
||||
},
|
||||
fixedSuite: {
|
||||
fixedFixtureAgreements,
|
||||
expectedPasses,
|
||||
criticalPasses,
|
||||
fixedFixtureAgreementFailures,
|
||||
criticalPassFailures,
|
||||
simulatedTiming: {
|
||||
productionSuiteSimulatedMs: fixedSuite.productionSuiteSimulatedMs,
|
||||
referenceSuiteSimulatedMs: fixedSuite.referenceSuiteSimulatedMs,
|
||||
productionMasterDecisions: fixedFixtureProductionMasterDecisions,
|
||||
referenceMasterDecisions: fixedFixtureReferenceMasterDecisions,
|
||||
},
|
||||
results: fixedSuite.results,
|
||||
},
|
||||
selfPlay: {
|
||||
matches: selfPlay.results.length,
|
||||
wins,
|
||||
losses,
|
||||
draws,
|
||||
winRate: selfPlay.results.length === 0 ? 0 : wins / selfPlay.results.length,
|
||||
lossRate: selfPlay.results.length === 0 ? 0 : losses / selfPlay.results.length,
|
||||
perSeed,
|
||||
dualLossSeeds,
|
||||
regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
|
||||
regressionWatchlistDualLossIntersection,
|
||||
results: selfPlay.results,
|
||||
selfPlaySuites: {
|
||||
totalMatches: mirrorSummary.matches + beginnerSummary.matches,
|
||||
mirrorParity: mirrorSummary,
|
||||
beginnerDominance: beginnerSummary,
|
||||
},
|
||||
timing: {
|
||||
productionMasterSimulatedDecisions,
|
||||
fixedFixtureProductionMasterDecisions,
|
||||
fixedFixtureReferenceMasterDecisions,
|
||||
mirrorTrackedTeamSimulatedDecisions,
|
||||
beginnerTrackedTeamSimulatedDecisions,
|
||||
allTrackedProductionSimulatedDecisions,
|
||||
},
|
||||
referenceProfile: REFERENCE_PROFILE,
|
||||
};
|
||||
@@ -625,7 +857,7 @@ export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
|
||||
|
||||
async function runBenchmarkCli(): Promise<void> {
|
||||
const summary = await runAIBenchmark();
|
||||
logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 5 gate results.');
|
||||
logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 6 gate results.');
|
||||
printReadableSummary(summary);
|
||||
}
|
||||
|
||||
|
||||
1364
src/game/ai.ts
1364
src/game/ai.ts
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user