fix(SCOPONE-0012): complete iteration 2 - speed up benchmark timer
This commit is contained in:
@@ -4,11 +4,10 @@ import {
|
|||||||
AI_BENCHMARK_FIXTURES,
|
AI_BENCHMARK_FIXTURES,
|
||||||
AIBenchmarkCriticalConcept,
|
AIBenchmarkCriticalConcept,
|
||||||
AIBenchmarkExpectedMove,
|
AIBenchmarkExpectedMove,
|
||||||
AIBenchmarkFixture,
|
|
||||||
isCriticalAIBenchmarkFixture,
|
isCriticalAIBenchmarkFixture,
|
||||||
} from './ai-benchmark-fixtures';
|
} from './ai-benchmark-fixtures';
|
||||||
import { CardTracker } from './card-tracker';
|
import { CardTracker } from './card-tracker';
|
||||||
import { GameState, PlayerIndex } from './types';
|
import { Difficulty, GameState, PlayerIndex } from './types';
|
||||||
|
|
||||||
function formatDurationMs(durationMs: number): string {
|
function formatDurationMs(durationMs: number): string {
|
||||||
if (durationMs < 1000) {
|
if (durationMs < 1000) {
|
||||||
@@ -18,6 +17,10 @@ function formatDurationMs(durationMs: number): string {
|
|||||||
return `${(durationMs / 1000).toFixed(2)} s`;
|
return `${(durationMs / 1000).toFixed(2)} s`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Formats a fractional ratio as a percentage string with one decimal place.
 *
 * @param value - Ratio to format; it is multiplied by 100 before formatting.
 * @returns The scaled value with one fraction digit followed by `%`.
 */
function formatPercentage(value: number): string {
  return `${(value * 100).toFixed(1)}%`;
}
|
||||||
|
|
||||||
function logBenchmarkProgress(message: string): void {
|
function logBenchmarkProgress(message: string): void {
|
||||||
console.log(`[ai-benchmark] ${message}`);
|
console.log(`[ai-benchmark] ${message}`);
|
||||||
}
|
}
|
||||||
@@ -36,18 +39,23 @@ interface FixedFixtureResult {
|
|||||||
referenceSimulatedMs: number;
|
referenceSimulatedMs: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Identifier of a self-play suite; these literals are the keys of `SELF_PLAY_SUITES`. */
type SelfPlaySuiteId = 'mirror-parity' | 'beginner-dominance';
|
||||||
|
|
||||||
interface SelfPlayMatchResult {
|
interface SelfPlayMatchResult {
|
||||||
|
suite: SelfPlaySuiteId;
|
||||||
seed: number;
|
seed: number;
|
||||||
dealer: PlayerIndex;
|
dealer: PlayerIndex;
|
||||||
masterTeam: 0 | 1;
|
trackedTeam: 0 | 1;
|
||||||
|
trackedTeamDifficulty: Difficulty;
|
||||||
|
opponentDifficulty: Difficulty;
|
||||||
winner: 0 | 1 | null;
|
winner: 0 | 1 | null;
|
||||||
masterResult: 'win' | 'loss' | 'draw';
|
trackedResult: 'win' | 'loss' | 'draw';
|
||||||
rounds: number;
|
rounds: number;
|
||||||
truncated: boolean;
|
truncated: boolean;
|
||||||
totalPoints: [number, number];
|
totalPoints: [number, number];
|
||||||
masterDecisionCount: number;
|
trackedDecisionCount: number;
|
||||||
masterAverageSimulatedDecisionMs: number;
|
trackedAverageSimulatedDecisionMs: number;
|
||||||
masterMaxSimulatedDecisionMs: number;
|
trackedMaxSimulatedDecisionMs: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface TimingSummary {
|
interface TimingSummary {
|
||||||
@@ -64,23 +72,25 @@ interface GateCountSummary {
|
|||||||
passed: boolean;
|
passed: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface SelfPlayGateSummary {
|
interface WinRateGateSummary {
|
||||||
matches: number;
|
matches: number;
|
||||||
requiredMatches: number;
|
requiredMatches: number;
|
||||||
wins: number;
|
wins: number;
|
||||||
requiredWins: number;
|
|
||||||
losses: number;
|
losses: number;
|
||||||
maxLosses: number;
|
|
||||||
draws: number;
|
draws: number;
|
||||||
|
winRate: number;
|
||||||
|
targetWinRate: number | null;
|
||||||
|
tolerance: number | null;
|
||||||
|
minWinRate: number | null;
|
||||||
|
maxWinRate: number | null;
|
||||||
matchCountPassed: boolean;
|
matchCountPassed: boolean;
|
||||||
winGatePassed: boolean;
|
winRatePassed: boolean;
|
||||||
lossGatePassed: boolean;
|
|
||||||
passed: boolean;
|
passed: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface SelfPlaySeedSeatResult {
|
interface SelfPlaySeedSeatResult {
|
||||||
masterTeam: 0 | 1;
|
trackedTeam: 0 | 1;
|
||||||
masterResult: 'win' | 'loss' | 'draw';
|
trackedResult: 'win' | 'loss' | 'draw';
|
||||||
winner: 0 | 1 | null;
|
winner: 0 | 1 | null;
|
||||||
rounds: number;
|
rounds: number;
|
||||||
truncated: boolean;
|
truncated: boolean;
|
||||||
@@ -97,27 +107,15 @@ interface SelfPlaySeedAggregateResult {
|
|||||||
seatResults: SelfPlaySeedSeatResult[];
|
seatResults: SelfPlaySeedSeatResult[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface AIBenchmarkSummary {
|
interface SelfPlaySuiteSummary {
|
||||||
benchmark: 'ai-quality';
|
suite: SelfPlaySuiteId;
|
||||||
qualityGate: {
|
label: string;
|
||||||
iteration: 5;
|
trackedTeamDifficulty: Difficulty;
|
||||||
passed: boolean;
|
opponentDifficulty: Difficulty;
|
||||||
fixedFixtures: GateCountSummary;
|
|
||||||
criticalConcepts: GateCountSummary;
|
|
||||||
selfPlay: SelfPlayGateSummary;
|
|
||||||
};
|
|
||||||
fixtureCount: number;
|
|
||||||
criticalFixtureCount: number;
|
|
||||||
fixedSuite: {
|
|
||||||
fixedFixtureAgreements: number;
|
|
||||||
expectedPasses: number;
|
|
||||||
criticalPasses: number;
|
|
||||||
fixedFixtureAgreementFailures: string[];
|
|
||||||
criticalPassFailures: string[];
|
|
||||||
results: FixedFixtureResult[];
|
|
||||||
};
|
|
||||||
selfPlay: {
|
|
||||||
matches: number;
|
matches: number;
|
||||||
|
requiredMatches: number;
|
||||||
|
seedCount: number;
|
||||||
|
seatBalanced: boolean;
|
||||||
wins: number;
|
wins: number;
|
||||||
losses: number;
|
losses: number;
|
||||||
draws: number;
|
draws: number;
|
||||||
@@ -127,20 +125,64 @@ export interface AIBenchmarkSummary {
|
|||||||
dualLossSeeds: number[];
|
dualLossSeeds: number[];
|
||||||
regressionWatchlist: number[];
|
regressionWatchlist: number[];
|
||||||
regressionWatchlistDualLossIntersection: number[];
|
regressionWatchlistDualLossIntersection: number[];
|
||||||
|
simulatedTiming: {
|
||||||
|
suiteSimulatedMs: number;
|
||||||
|
trackedTeamDecisions: TimingSummary;
|
||||||
|
};
|
||||||
results: SelfPlayMatchResult[];
|
results: SelfPlayMatchResult[];
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface AIBenchmarkSummary {
|
||||||
|
benchmark: 'ai-quality';
|
||||||
|
qualityGate: {
|
||||||
|
iteration: 6;
|
||||||
|
passed: boolean;
|
||||||
|
fixedFixtures: GateCountSummary;
|
||||||
|
criticalConcepts: GateCountSummary;
|
||||||
|
mirrorParity: WinRateGateSummary;
|
||||||
|
beginnerDominance: WinRateGateSummary;
|
||||||
|
};
|
||||||
|
fixtureCount: number;
|
||||||
|
criticalFixtureCount: number;
|
||||||
|
fixtureTotals: {
|
||||||
|
fixtures: number;
|
||||||
|
criticalFixtures: number;
|
||||||
|
};
|
||||||
|
fixedSuite: {
|
||||||
|
fixedFixtureAgreements: number;
|
||||||
|
expectedPasses: number;
|
||||||
|
criticalPasses: number;
|
||||||
|
fixedFixtureAgreementFailures: string[];
|
||||||
|
criticalPassFailures: string[];
|
||||||
|
simulatedTiming: {
|
||||||
|
productionSuiteSimulatedMs: number;
|
||||||
|
referenceSuiteSimulatedMs: number;
|
||||||
|
productionMasterDecisions: TimingSummary;
|
||||||
|
referenceMasterDecisions: TimingSummary;
|
||||||
|
};
|
||||||
|
results: FixedFixtureResult[];
|
||||||
|
};
|
||||||
|
selfPlaySuites: {
|
||||||
|
totalMatches: number;
|
||||||
|
mirrorParity: SelfPlaySuiteSummary;
|
||||||
|
beginnerDominance: SelfPlaySuiteSummary;
|
||||||
};
|
};
|
||||||
timing: {
|
timing: {
|
||||||
productionMasterSimulatedDecisions: TimingSummary;
|
fixedFixtureProductionMasterDecisions: TimingSummary;
|
||||||
|
fixedFixtureReferenceMasterDecisions: TimingSummary;
|
||||||
|
mirrorTrackedTeamSimulatedDecisions: TimingSummary;
|
||||||
|
beginnerTrackedTeamSimulatedDecisions: TimingSummary;
|
||||||
|
allTrackedProductionSimulatedDecisions: TimingSummary;
|
||||||
};
|
};
|
||||||
referenceProfile: Required<AISearchProfileOverride>;
|
referenceProfile: Required<AISearchProfileOverride>;
|
||||||
}
|
}
|
||||||
|
|
||||||
const ITERATION_5_GATE = {
|
const ITERATION_6_GATE = {
|
||||||
fixedFixtureAgreementTarget: 13,
|
mirrorMatchTarget: 500,
|
||||||
criticalConceptTarget: 6,
|
beginnerMatchTarget: 500,
|
||||||
selfPlayMatchTarget: 48,
|
mirrorTargetWinRate: 0.5,
|
||||||
selfPlayWinTarget: 30,
|
mirrorWinRateTolerance: 0.05,
|
||||||
selfPlayMaxLosses: 12,
|
beginnerMinWinRate: 0.7,
|
||||||
} as const;
|
} as const;
|
||||||
|
|
||||||
const KNOWN_REGRESSION_WATCHLIST = [1000, 1002, 1004, 1006, 1012, 1013, 1014] as const;
|
const KNOWN_REGRESSION_WATCHLIST = [1000, 1002, 1004, 1006, 1012, 1013, 1014] as const;
|
||||||
@@ -153,31 +195,75 @@ const REFERENCE_PROFILE: Required<AISearchProfileOverride> = {
|
|||||||
batchSize: 2,
|
batchSize: 2,
|
||||||
};
|
};
|
||||||
|
|
||||||
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 24 }, (_, index) => 1000 + index);
|
const SELF_PLAY_SEAT_SWAPS = [0, 1] as const;
|
||||||
|
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 250 }, (_, index) => 1000 + index);
|
||||||
const MAX_SELF_PLAY_ROUNDS = 20;
|
const MAX_SELF_PLAY_ROUNDS = 20;
|
||||||
|
|
||||||
function assertIteration5BenchmarkContract(): void {
|
/** Static configuration of one self-play benchmark suite. */
interface SelfPlaySuiteConfig {
  /** Suite identifier, copied into every match result and into the suite summary. */
  id: SelfPlaySuiteId;
  /** Human-readable name used in progress log lines and in the suite summary. */
  label: string;
  /** Passed as the first part to `seedFromParts` when seeding deals and move RNGs for this suite. */
  suiteSeedKey: number;
  /** Number of matches the suite is expected to play (checked by the benchmark contract assertion). */
  requiredMatches: number;
  /** Difficulty reported for the tracked team in the suite summary. */
  trackedTeamDifficulty: Difficulty;
  /** Difficulty reported for the opposing team in the suite summary. */
  opponentDifficulty: Difficulty;
  /**
   * Returns the difficulty of each team for a match.
   *
   * @param trackedTeam - Team whose results and timings are tracked.
   * @returns A pair indexed by team number (index 0 for team 0, index 1 for team 1).
   */
  getTeamDifficulties(trackedTeam: 0 | 1): readonly [Difficulty, Difficulty];
}
|
||||||
|
|
||||||
|
/**
 * Configuration of every self-play suite, keyed by suite id.
 *
 * Both suites play each seed in `SELF_PLAY_MATCH_SEEDS` once per entry of
 * `SELF_PLAY_SEAT_SWAPS`, so `requiredMatches` is the product of the two lengths.
 */
const SELF_PLAY_SUITES: Record<SelfPlaySuiteId, SelfPlaySuiteConfig> = {
  'mirror-parity': {
    id: 'mirror-parity',
    label: 'Master mirror parity',
    suiteSeedKey: 0x4d31,
    requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
    trackedTeamDifficulty: 'master',
    opponentDifficulty: 'master',
    // Both teams play at the same difficulty, so the tracked team does not matter here.
    getTeamDifficulties: () => ['master', 'master'],
  },
  'beginner-dominance': {
    id: 'beginner-dominance',
    label: 'Master versus beginner dominance',
    suiteSeedKey: 0x4236,
    requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
    trackedTeamDifficulty: 'master',
    opponentDifficulty: 'beginner',
    // The tracked team always plays as 'master'; the other team plays as 'beginner'.
    getTeamDifficulties: trackedTeam => (trackedTeam === 0
      ? ['master', 'beginner']
      : ['beginner', 'master']),
  },
};
|
||||||
|
|
||||||
|
function assertIteration6BenchmarkContract(): void {
|
||||||
const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
|
const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
|
||||||
const selfPlayMatchCount = SELF_PLAY_MATCH_SEEDS.length * 2;
|
const expectedSeatBalancedMatches = SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length;
|
||||||
|
|
||||||
if (AI_BENCHMARK_FIXTURES.length !== ITERATION_5_GATE.fixedFixtureAgreementTarget) {
|
if (AI_BENCHMARK_FIXTURES.length === 0) {
|
||||||
|
throw new Error('Iteration 6 benchmark requires at least one fixed fixture.');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (criticalFixtureCount === 0) {
|
||||||
|
throw new Error('Iteration 6 benchmark requires at least one critical concept fixture.');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (expectedSeatBalancedMatches !== ITERATION_6_GATE.mirrorMatchTarget) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Iteration 5 benchmark expects ${ITERATION_5_GATE.fixedFixtureAgreementTarget} fixed fixtures, received ${AI_BENCHMARK_FIXTURES.length}.`,
|
`Iteration 6 benchmark expects ${ITERATION_6_GATE.mirrorMatchTarget} mirror matches, received ${expectedSeatBalancedMatches}.`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (criticalFixtureCount !== ITERATION_5_GATE.criticalConceptTarget) {
|
if (expectedSeatBalancedMatches !== ITERATION_6_GATE.beginnerMatchTarget) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Iteration 5 benchmark expects ${ITERATION_5_GATE.criticalConceptTarget} critical concept fixtures, received ${criticalFixtureCount}.`,
|
`Iteration 6 benchmark expects ${ITERATION_6_GATE.beginnerMatchTarget} beginner-dominance matches, received ${expectedSeatBalancedMatches}.`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (selfPlayMatchCount !== ITERATION_5_GATE.selfPlayMatchTarget) {
|
for (const suite of Object.values(SELF_PLAY_SUITES)) {
|
||||||
|
if (suite.requiredMatches !== expectedSeatBalancedMatches) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Iteration 5 benchmark expects ${ITERATION_5_GATE.selfPlayMatchTarget} self-play matches, received ${selfPlayMatchCount}.`,
|
`Iteration 6 benchmark expects ${expectedSeatBalancedMatches} matches for ${suite.id}, received ${suite.requiredMatches}.`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
interface SimulatedBenchmarkTimingSource extends AITimingSource {
|
interface SimulatedBenchmarkTimingSource extends AITimingSource {
|
||||||
getElapsedMs(): number;
|
getElapsedMs(): number;
|
||||||
@@ -220,6 +306,10 @@ function moveKey(move: AIMove): string {
|
|||||||
return `${move.card.id}|${move.capture.map(card => card.id).sort().join(',')}`;
|
return `${move.card.id}|${move.capture.map(card => card.id).sort().join(',')}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the opposing team index.
 *
 * @param team - Team index, 0 or 1.
 * @returns 1 when `team` is 0, otherwise 0.
 */
function otherTeam(team: 0 | 1): 0 | 1 {
  return team === 0 ? 1 : 0;
}
|
||||||
|
|
||||||
function createTrackerForState(state: GameState): CardTracker {
|
function createTrackerForState(state: GameState): CardTracker {
|
||||||
const tracker = new CardTracker();
|
const tracker = new CardTracker();
|
||||||
for (const player of state.players) {
|
for (const player of state.players) {
|
||||||
@@ -239,10 +329,16 @@ function matchesExpectedMove(move: AIMove, expected: AIBenchmarkExpectedMove): b
|
|||||||
return actualCapture === expectedCapture;
|
return actualCapture === expectedCapture;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[]; wallClockMs: number; productionTimings: number[] }> {
|
async function runFixedFixtureSuite(): Promise<{
|
||||||
const startedAt = performance.now();
|
results: FixedFixtureResult[];
|
||||||
|
productionSuiteSimulatedMs: number;
|
||||||
|
referenceSuiteSimulatedMs: number;
|
||||||
|
productionTimings: number[];
|
||||||
|
referenceTimings: number[];
|
||||||
|
}> {
|
||||||
const results: FixedFixtureResult[] = [];
|
const results: FixedFixtureResult[] = [];
|
||||||
const productionTimings: number[] = [];
|
const productionTimings: number[] = [];
|
||||||
|
const referenceTimings: number[] = [];
|
||||||
|
|
||||||
logBenchmarkProgress(`Starting fixed fixture suite (${AI_BENCHMARK_FIXTURES.length} positions).`);
|
logBenchmarkProgress(`Starting fixed fixture suite (${AI_BENCHMARK_FIXTURES.length} positions).`);
|
||||||
|
|
||||||
@@ -286,6 +382,7 @@ async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[];
|
|||||||
const referenceSimulatedMs = referenceTimingSource.getElapsedMs();
|
const referenceSimulatedMs = referenceTimingSource.getElapsedMs();
|
||||||
|
|
||||||
productionTimings.push(productionSimulatedMs);
|
productionTimings.push(productionSimulatedMs);
|
||||||
|
referenceTimings.push(referenceSimulatedMs);
|
||||||
|
|
||||||
const conceptGatePass = isCriticalAIBenchmarkFixture(fixture)
|
const conceptGatePass = isCriticalAIBenchmarkFixture(fixture)
|
||||||
? matchesExpectedMove(productionMove, fixture.expectedMove)
|
? matchesExpectedMove(productionMove, fixture.expectedMove)
|
||||||
@@ -314,11 +411,17 @@ async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[];
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
results,
|
results,
|
||||||
wallClockMs: performance.now() - startedAt,
|
productionSuiteSimulatedMs: sumTimings(productionTimings),
|
||||||
|
referenceSuiteSimulatedMs: sumTimings(referenceTimings),
|
||||||
productionTimings,
|
productionTimings,
|
||||||
|
referenceTimings,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Adds up a list of timing samples.
 *
 * @param samples - Samples to add; the array is only read, so readonly arrays are accepted.
 * @returns The sum of all samples, or 0 for an empty list.
 */
function sumTimings(samples: readonly number[]): number {
  return samples.reduce((total, sample) => total + sample, 0);
}
|
||||||
|
|
||||||
function summarizeTimings(samples: number[]): TimingSummary {
|
function summarizeTimings(samples: number[]): TimingSummary {
|
||||||
if (samples.length === 0) {
|
if (samples.length === 0) {
|
||||||
return {
|
return {
|
||||||
@@ -360,13 +463,13 @@ function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
|
|||||||
};
|
};
|
||||||
|
|
||||||
existing.matches++;
|
existing.matches++;
|
||||||
if (result.masterResult === 'win') existing.wins++;
|
if (result.trackedResult === 'win') existing.wins++;
|
||||||
else if (result.masterResult === 'loss') existing.losses++;
|
else if (result.trackedResult === 'loss') existing.losses++;
|
||||||
else existing.draws++;
|
else existing.draws++;
|
||||||
|
|
||||||
existing.seatResults.push({
|
existing.seatResults.push({
|
||||||
masterTeam: result.masterTeam,
|
trackedTeam: result.trackedTeam,
|
||||||
masterResult: result.masterResult,
|
trackedResult: result.trackedResult,
|
||||||
winner: result.winner,
|
winner: result.winner,
|
||||||
rounds: result.rounds,
|
rounds: result.rounds,
|
||||||
truncated: result.truncated,
|
truncated: result.truncated,
|
||||||
@@ -380,7 +483,7 @@ function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
|
|||||||
.map(aggregate => ({
|
.map(aggregate => ({
|
||||||
...aggregate,
|
...aggregate,
|
||||||
dualLoss: aggregate.losses >= 2,
|
dualLoss: aggregate.losses >= 2,
|
||||||
seatResults: [...aggregate.seatResults].sort((left, right) => left.masterTeam - right.masterTeam),
|
seatResults: [...aggregate.seatResults].sort((left, right) => left.trackedTeam - right.trackedTeam),
|
||||||
}))
|
}))
|
||||||
.sort((left, right) => left.seed - right.seed);
|
.sort((left, right) => left.seed - right.seed);
|
||||||
const dualLossSeeds = perSeed.filter(aggregate => aggregate.dualLoss).map(aggregate => aggregate.seed);
|
const dualLossSeeds = perSeed.filter(aggregate => aggregate.dualLoss).map(aggregate => aggregate.seed);
|
||||||
@@ -393,12 +496,18 @@ function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{ result: SelfPlayMatchResult; timings: number[] }> {
|
async function simulateSelfPlayMatch(
|
||||||
|
suite: SelfPlaySuiteConfig,
|
||||||
|
seed: number,
|
||||||
|
trackedTeam: 0 | 1,
|
||||||
|
): Promise<{ result: SelfPlayMatchResult; trackedTimings: number[]; simulatedMatchMs: number }> {
|
||||||
const initialDealer = (seed % 4) as PlayerIndex;
|
const initialDealer = (seed % 4) as PlayerIndex;
|
||||||
let state = createInitialState(initialDealer, createMulberry32(seedFromParts(seed, 1, 0)));
|
const teamDifficulties = suite.getTeamDifficulties(trackedTeam);
|
||||||
|
let state = createInitialState(initialDealer, createMulberry32(seedFromParts(suite.suiteSeedKey, seed, 1, 0)));
|
||||||
const matchStartingPlayer = state.matchStartingPlayer;
|
const matchStartingPlayer = state.matchStartingPlayer;
|
||||||
const tracker = new CardTracker();
|
const tracker = new CardTracker();
|
||||||
const masterTimings: number[] = [];
|
const trackedTimings: number[] = [];
|
||||||
|
let simulatedMatchMs = 0;
|
||||||
|
|
||||||
let rounds = 1;
|
let rounds = 1;
|
||||||
let truncated = false;
|
let truncated = false;
|
||||||
@@ -407,19 +516,18 @@ async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{
|
|||||||
while (rounds <= MAX_SELF_PLAY_ROUNDS) {
|
while (rounds <= MAX_SELF_PLAY_ROUNDS) {
|
||||||
while (!state.roundOver) {
|
while (!state.roundOver) {
|
||||||
const playerIdx = state.currentPlayer;
|
const playerIdx = state.currentPlayer;
|
||||||
const difficulty = teamOf(playerIdx) === masterTeam ? 'master' : 'advanced';
|
const actingTeam = teamOf(playerIdx);
|
||||||
|
const difficulty = teamDifficulties[actingTeam];
|
||||||
const timingSource = createSimulatedBenchmarkTimingSource();
|
const timingSource = createSimulatedBenchmarkTimingSource();
|
||||||
const options = difficulty === 'master'
|
const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, {
|
||||||
? {
|
rng: createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, turnCount, playerIdx)),
|
||||||
rng: createMulberry32(seedFromParts(seed, rounds, turnCount, playerIdx)),
|
|
||||||
timingSource,
|
timingSource,
|
||||||
}
|
});
|
||||||
: { timingSource };
|
|
||||||
const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, options);
|
|
||||||
const simulatedMs = timingSource.getElapsedMs();
|
const simulatedMs = timingSource.getElapsedMs();
|
||||||
|
simulatedMatchMs += simulatedMs;
|
||||||
|
|
||||||
if (difficulty === 'master') {
|
if (actingTeam === trackedTeam) {
|
||||||
masterTimings.push(simulatedMs);
|
trackedTimings.push(simulatedMs);
|
||||||
}
|
}
|
||||||
|
|
||||||
const { nextState, capture } = applyMove(
|
const { nextState, capture } = applyMove(
|
||||||
@@ -441,16 +549,17 @@ async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
rounds++;
|
if (rounds === MAX_SELF_PLAY_ROUNDS) {
|
||||||
if (rounds > MAX_SELF_PLAY_ROUNDS) {
|
|
||||||
truncated = true;
|
truncated = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
rounds++;
|
||||||
|
|
||||||
const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
|
const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
|
||||||
const nextDealer = nextPlayer(state.dealer);
|
const nextDealer = nextPlayer(state.dealer);
|
||||||
tracker.reset();
|
tracker.reset();
|
||||||
state = createInitialState(nextDealer, createMulberry32(seedFromParts(seed, rounds, 0)));
|
state = createInitialState(nextDealer, createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, 0)));
|
||||||
state.matchStartingPlayer = matchStartingPlayer;
|
state.matchStartingPlayer = matchStartingPlayer;
|
||||||
state.teamScores[0].totalPoints = totals[0];
|
state.teamScores[0].totalPoints = totals[0];
|
||||||
state.teamScores[1].totalPoints = totals[1];
|
state.teamScores[1].totalPoints = totals[1];
|
||||||
@@ -458,47 +567,54 @@ async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{
|
|||||||
}
|
}
|
||||||
|
|
||||||
const outcome = getMatchOutcome(state.teamScores);
|
const outcome = getMatchOutcome(state.teamScores);
|
||||||
const winner = truncated ? outcome.winner : outcome.winner;
|
const winner = outcome.winner;
|
||||||
const masterResult = winner === null ? 'draw' : winner === masterTeam ? 'win' : 'loss';
|
const timingSummary = summarizeTimings(trackedTimings);
|
||||||
const timingSummary = summarizeTimings(masterTimings);
|
const opposingTeam = otherTeam(trackedTeam);
|
||||||
|
const trackedResult = winner === null ? 'draw' : winner === trackedTeam ? 'win' : 'loss';
|
||||||
|
|
||||||
return {
|
return {
|
||||||
result: {
|
result: {
|
||||||
|
suite: suite.id,
|
||||||
seed,
|
seed,
|
||||||
dealer: initialDealer,
|
dealer: initialDealer,
|
||||||
masterTeam,
|
trackedTeam,
|
||||||
|
trackedTeamDifficulty: teamDifficulties[trackedTeam],
|
||||||
|
opponentDifficulty: teamDifficulties[opposingTeam],
|
||||||
winner,
|
winner,
|
||||||
masterResult,
|
trackedResult,
|
||||||
rounds,
|
rounds,
|
||||||
truncated,
|
truncated,
|
||||||
totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
|
totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
|
||||||
masterDecisionCount: timingSummary.count,
|
trackedDecisionCount: timingSummary.count,
|
||||||
masterAverageSimulatedDecisionMs: timingSummary.averageMs,
|
trackedAverageSimulatedDecisionMs: timingSummary.averageMs,
|
||||||
masterMaxSimulatedDecisionMs: timingSummary.maxMs,
|
trackedMaxSimulatedDecisionMs: timingSummary.maxMs,
|
||||||
},
|
},
|
||||||
timings: masterTimings,
|
trackedTimings,
|
||||||
|
simulatedMatchMs,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
async function runSelfPlaySuite(): Promise<{ results: SelfPlayMatchResult[]; wallClockMs: number; productionTimings: number[] }> {
|
async function runSelfPlaySuite(
|
||||||
const startedAt = performance.now();
|
suite: SelfPlaySuiteConfig,
|
||||||
|
): Promise<{ results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] }> {
|
||||||
const results: SelfPlayMatchResult[] = [];
|
const results: SelfPlayMatchResult[] = [];
|
||||||
const productionTimings: number[] = [];
|
const trackedTeamTimings: number[] = [];
|
||||||
const totalMatches = SELF_PLAY_MATCH_SEEDS.length * 2;
|
let suiteSimulatedMs = 0;
|
||||||
let completedMatches = 0;
|
let completedMatches = 0;
|
||||||
|
|
||||||
logBenchmarkProgress(`Starting self-play suite (${totalMatches} seeded matches with seat swaps).`);
|
logBenchmarkProgress(`Starting ${suite.label} suite (${suite.requiredMatches} seeded matches with seat swaps).`);
|
||||||
|
|
||||||
for (const seed of SELF_PLAY_MATCH_SEEDS) {
|
for (const seed of SELF_PLAY_MATCH_SEEDS) {
|
||||||
for (const masterTeam of [0, 1] as const) {
|
for (const trackedTeam of SELF_PLAY_SEAT_SWAPS) {
|
||||||
const { result, timings } = await simulateSelfPlayMatch(seed, masterTeam);
|
const { result, trackedTimings, simulatedMatchMs } = await simulateSelfPlayMatch(suite, seed, trackedTeam);
|
||||||
results.push(result);
|
results.push(result);
|
||||||
productionTimings.push(...timings);
|
trackedTeamTimings.push(...trackedTimings);
|
||||||
|
suiteSimulatedMs += simulatedMatchMs;
|
||||||
completedMatches++;
|
completedMatches++;
|
||||||
|
|
||||||
if (completedMatches === 1 || completedMatches % 4 === 0 || completedMatches === totalMatches) {
|
if (completedMatches === 1 || completedMatches % 25 === 0 || completedMatches === suite.requiredMatches) {
|
||||||
logBenchmarkProgress(
|
logBenchmarkProgress(
|
||||||
`Self-play ${completedMatches}/${totalMatches}: seed ${seed}, master team ${masterTeam}, result ${result.masterResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.masterMaxSimulatedDecisionMs)}.`,
|
`${suite.label} ${completedMatches}/${suite.requiredMatches}: seed ${seed}, tracked team ${trackedTeam}, result ${result.trackedResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.trackedMaxSimulatedDecisionMs)}.`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -506,38 +622,157 @@ async function runSelfPlaySuite(): Promise<{ results: SelfPlayMatchResult[]; wal
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
results,
|
results,
|
||||||
wallClockMs: performance.now() - startedAt,
|
suiteSimulatedMs,
|
||||||
productionTimings,
|
trackedTeamTimings,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Aggregates the raw output of one self-play suite run into its report summary.
 *
 * @param suite - Configuration of the suite that produced `run`.
 * @param run - Per-match results, the summed simulated time of the run, and the
 *   simulated decision timings collected for the tracked team.
 * @returns Outcome counts and rates, per-seed aggregates, watchlist data,
 *   simulated timing, and the raw match results.
 */
function buildSelfPlaySuiteSummary(
  suite: SelfPlaySuiteConfig,
  run: { results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] },
): SelfPlaySuiteSummary {
  const matches = run.results.length;

  // Tally outcomes in one pass; `trackedResult` is always 'win', 'loss' or 'draw'.
  let wins = 0;
  let losses = 0;
  let draws = 0;
  for (const result of run.results) {
    if (result.trackedResult === 'win') wins++;
    else if (result.trackedResult === 'loss') losses++;
    else draws++;
  }

  const { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection } = summarizeSelfPlayBySeed(run.results);

  // Derive seat balance from the data instead of asserting it: every configured
  // seed must be present and must have exactly one match per tracked-team seat.
  const seatBalanced =
    perSeed.length === SELF_PLAY_MATCH_SEEDS.length &&
    perSeed.every(aggregate =>
      SELF_PLAY_SEAT_SWAPS.every(
        team => aggregate.seatResults.filter(seat => seat.trackedTeam === team).length === 1,
      ),
    );

  return {
    suite: suite.id,
    label: suite.label,
    trackedTeamDifficulty: suite.trackedTeamDifficulty,
    opponentDifficulty: suite.opponentDifficulty,
    matches,
    requiredMatches: suite.requiredMatches,
    seedCount: SELF_PLAY_MATCH_SEEDS.length,
    seatBalanced,
    wins,
    losses,
    draws,
    // Rates are reported as 0 when no match was played, avoiding a division by zero.
    winRate: matches === 0 ? 0 : wins / matches,
    lossRate: matches === 0 ? 0 : losses / matches,
    perSeed,
    dualLossSeeds,
    regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
    regressionWatchlistDualLossIntersection,
    simulatedTiming: {
      suiteSimulatedMs: run.suiteSimulatedMs,
      trackedTeamDecisions: summarizeTimings(run.trackedTeamTimings),
    },
    results: run.results,
  };
}
|
||||||
|
|
||||||
|
/**
 * Evaluates the mirror-parity gate for a suite summary.
 *
 * The gate passes when the suite played exactly `ITERATION_6_GATE.mirrorMatchTarget`
 * matches and the tracked-team win rate lies within the target win rate plus or
 * minus the tolerance (both bounds inclusive).
 *
 * @param summary - Summary of the mirror-parity suite.
 * @returns The gate figures and pass/fail flags.
 */
function createMirrorParityGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const { mirrorMatchTarget, mirrorTargetWinRate, mirrorWinRateTolerance } = ITERATION_6_GATE;
  const minWinRate = mirrorTargetWinRate - mirrorWinRateTolerance;
  const maxWinRate = mirrorTargetWinRate + mirrorWinRateTolerance;
  const matchCountPassed = summary.matches === mirrorMatchTarget;
  const winRatePassed = summary.winRate >= minWinRate && summary.winRate <= maxWinRate;

  return {
    matches: summary.matches,
    requiredMatches: mirrorMatchTarget,
    wins: summary.wins,
    losses: summary.losses,
    draws: summary.draws,
    winRate: summary.winRate,
    targetWinRate: mirrorTargetWinRate,
    tolerance: mirrorWinRateTolerance,
    minWinRate,
    maxWinRate,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}
|
||||||
|
|
||||||
|
/**
 * Evaluates the beginner-dominance gate for a suite summary.
 *
 * The gate passes when the suite played exactly `ITERATION_6_GATE.beginnerMatchTarget`
 * matches and the tracked-team win rate is at least `ITERATION_6_GATE.beginnerMinWinRate`.
 * This gate has no target, tolerance or upper bound, so those fields are `null`.
 *
 * @param summary - Summary of the beginner-dominance suite.
 * @returns The gate figures and pass/fail flags.
 */
function createBeginnerDominanceGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const { beginnerMatchTarget, beginnerMinWinRate } = ITERATION_6_GATE;
  const matchCountPassed = summary.matches === beginnerMatchTarget;
  const winRatePassed = summary.winRate >= beginnerMinWinRate;

  return {
    matches: summary.matches,
    requiredMatches: beginnerMatchTarget,
    wins: summary.wins,
    losses: summary.losses,
    draws: summary.draws,
    winRate: summary.winRate,
    targetWinRate: null,
    tolerance: null,
    minWinRate: beginnerMinWinRate,
    maxWinRate: null,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}
|
||||||
|
|
||||||
|
/**
 * Renders per-seed win/loss/draw tallies as one line of text.
 *
 * @param perSeed - Per-seed aggregates, rendered in the order given.
 * @returns Entries of the form `<seed>:<wins>W-<losses>L-<draws>D` joined by ` | `;
 *   an empty string when `perSeed` is empty.
 */
function formatPerSeedAggregates(perSeed: readonly SelfPlaySeedAggregateResult[]): string {
  return perSeed
    .map(aggregate => `${aggregate.seed}:${aggregate.wins}W-${aggregate.losses}L-${aggregate.draws}D`)
    .join(' | ');
}
|
||||||
|
|
||||||
function printReadableSummary(summary: AIBenchmarkSummary): void {
|
function printReadableSummary(summary: AIBenchmarkSummary): void {
|
||||||
|
const mirror = summary.selfPlaySuites.mirrorParity;
|
||||||
|
const beginner = summary.selfPlaySuites.beginnerDominance;
|
||||||
|
|
||||||
console.log('AI quality benchmark');
|
console.log('AI quality benchmark');
|
||||||
console.log(`Iteration 5 quality gate: ${summary.qualityGate.passed ? 'PASS' : 'FAIL'}`);
|
console.log(`Iteration 6 quality gate: ${summary.qualityGate.passed ? 'PASS' : 'FAIL'}`);
|
||||||
console.log(`Fixed-fixture gate: ${summary.qualityGate.fixedFixtures.actual}/${summary.qualityGate.fixedFixtures.total} agreements (target ${summary.qualityGate.fixedFixtures.required}/${summary.qualityGate.fixedFixtures.total}).`);
|
console.log(`Fixture totals: ${summary.fixtureTotals.fixtures} total, ${summary.fixtureTotals.criticalFixtures} critical.`);
|
||||||
console.log(`Critical concept gate: ${summary.qualityGate.criticalConcepts.actual}/${summary.qualityGate.criticalConcepts.total} passes (target ${summary.qualityGate.criticalConcepts.required}/${summary.qualityGate.criticalConcepts.total}).`);
|
console.log(`Fixed-fixture gate: ${summary.qualityGate.fixedFixtures.actual}/${summary.qualityGate.fixedFixtures.total} agreements.`);
|
||||||
console.log(`Self-play gate: ${summary.qualityGate.selfPlay.matches}/${summary.qualityGate.selfPlay.requiredMatches} matches, ${summary.qualityGate.selfPlay.wins}/${summary.qualityGate.selfPlay.matches} wins (target ${summary.qualityGate.selfPlay.requiredWins}), ${summary.qualityGate.selfPlay.losses}/${summary.qualityGate.selfPlay.matches} losses (max ${summary.qualityGate.selfPlay.maxLosses}), ${summary.qualityGate.selfPlay.draws} draws.`);
|
console.log(`Critical concept gate: ${summary.qualityGate.criticalConcepts.actual}/${summary.qualityGate.criticalConcepts.total} passes.`);
|
||||||
|
console.log(
|
||||||
|
`Mirror parity gate: ${summary.qualityGate.mirrorParity.matches}/${summary.qualityGate.mirrorParity.requiredMatches} matches, ${formatPercentage(summary.qualityGate.mirrorParity.winRate)} tracked-team win rate (target ${formatPercentage(summary.qualityGate.mirrorParity.targetWinRate ?? 0)} +/- ${formatPercentage(summary.qualityGate.mirrorParity.tolerance ?? 0)}).`,
|
||||||
|
);
|
||||||
|
console.log(
|
||||||
|
`Beginner dominance gate: ${summary.qualityGate.beginnerDominance.matches}/${summary.qualityGate.beginnerDominance.requiredMatches} matches, ${formatPercentage(summary.qualityGate.beginnerDominance.winRate)} master win rate (target >= ${formatPercentage(summary.qualityGate.beginnerDominance.minWinRate ?? 0)}).`,
|
||||||
|
);
|
||||||
if (summary.fixedSuite.fixedFixtureAgreementFailures.length > 0) {
|
if (summary.fixedSuite.fixedFixtureAgreementFailures.length > 0) {
|
||||||
console.log(`Fixed-fixture agreement failures: ${summary.fixedSuite.fixedFixtureAgreementFailures.join(', ')}`);
|
console.log(`Fixed-fixture agreement failures: ${summary.fixedSuite.fixedFixtureAgreementFailures.join(', ')}`);
|
||||||
}
|
}
|
||||||
if (summary.fixedSuite.criticalPassFailures.length > 0) {
|
if (summary.fixedSuite.criticalPassFailures.length > 0) {
|
||||||
console.log(`Critical concept failures: ${summary.fixedSuite.criticalPassFailures.join(', ')}`);
|
console.log(`Critical concept failures: ${summary.fixedSuite.criticalPassFailures.join(', ')}`);
|
||||||
}
|
}
|
||||||
console.log(`Per-seed outcomes: ${summary.selfPlay.perSeed.map(seed => `${seed.seed}:${seed.wins}W-${seed.losses}L-${seed.draws}D`).join(' | ')}`);
|
console.log(`Mirror per-seed aggregates: ${formatPerSeedAggregates(mirror.perSeed)}`);
|
||||||
console.log(`Dual-loss seeds: ${summary.selfPlay.dualLossSeeds.length > 0 ? summary.selfPlay.dualLossSeeds.join(', ') : 'none'}`);
|
console.log(`Mirror dual-loss seeds: ${mirror.dualLossSeeds.length > 0 ? mirror.dualLossSeeds.join(', ') : 'none'}`);
|
||||||
console.log(`Regression watchlist intersection: ${summary.selfPlay.regressionWatchlistDualLossIntersection.length > 0 ? summary.selfPlay.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${summary.selfPlay.regressionWatchlist.join(', ')})`);
|
console.log(
|
||||||
console.log(`Master simulated timing: avg ${summary.timing.productionMasterSimulatedDecisions.averageMs.toFixed(1)} ms, p95 ${summary.timing.productionMasterSimulatedDecisions.p95Ms.toFixed(1)} ms, max ${summary.timing.productionMasterSimulatedDecisions.maxMs.toFixed(1)} ms.`);
|
`Mirror regression watchlist intersection: ${mirror.regressionWatchlistDualLossIntersection.length > 0 ? mirror.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${mirror.regressionWatchlist.join(', ')})`,
|
||||||
|
);
|
||||||
|
console.log(`Beginner per-seed aggregates: ${formatPerSeedAggregates(beginner.perSeed)}`);
|
||||||
|
console.log(`Beginner dual-loss seeds: ${beginner.dualLossSeeds.length > 0 ? beginner.dualLossSeeds.join(', ') : 'none'}`);
|
||||||
|
console.log(
|
||||||
|
`Beginner regression watchlist intersection: ${beginner.regressionWatchlistDualLossIntersection.length > 0 ? beginner.regressionWatchlistDualLossIntersection.join(', ') : 'none'} (watchlist ${beginner.regressionWatchlist.join(', ')})`,
|
||||||
|
);
|
||||||
|
console.log(
|
||||||
|
`Fixed suite simulated duration: production ${formatDurationMs(summary.fixedSuite.simulatedTiming.productionSuiteSimulatedMs)}, reference ${formatDurationMs(summary.fixedSuite.simulatedTiming.referenceSuiteSimulatedMs)}.`,
|
||||||
|
);
|
||||||
|
console.log(`Mirror suite simulated duration: ${formatDurationMs(mirror.simulatedTiming.suiteSimulatedMs)}.`);
|
||||||
|
console.log(`Beginner suite simulated duration: ${formatDurationMs(beginner.simulatedTiming.suiteSimulatedMs)}.`);
|
||||||
|
console.log(
|
||||||
|
`Simulated timing: fixed production avg ${summary.timing.fixedFixtureProductionMasterDecisions.averageMs.toFixed(1)} ms, fixed reference avg ${summary.timing.fixedFixtureReferenceMasterDecisions.averageMs.toFixed(1)} ms, mirror tracked avg ${summary.timing.mirrorTrackedTeamSimulatedDecisions.averageMs.toFixed(1)} ms, beginner tracked avg ${summary.timing.beginnerTrackedTeamSimulatedDecisions.averageMs.toFixed(1)} ms, aggregate avg ${summary.timing.allTrackedProductionSimulatedDecisions.averageMs.toFixed(1)} ms.`,
|
||||||
|
);
|
||||||
console.log('BENCHMARK_SUMMARY');
|
console.log('BENCHMARK_SUMMARY');
|
||||||
console.log(JSON.stringify(summary, null, 2));
|
console.log(JSON.stringify(summary, null, 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
|
export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
|
||||||
assertIteration5BenchmarkContract();
|
assertIteration6BenchmarkContract();
|
||||||
logBenchmarkProgress('Benchmark started. Running fixed fixtures first, then self-play.');
|
logBenchmarkProgress('Benchmark started. Running fixed fixtures, mirror parity, then beginner dominance.');
|
||||||
|
|
||||||
const fixedSuite = await runFixedFixtureSuite();
|
const fixedSuite = await runFixedFixtureSuite();
|
||||||
logBenchmarkProgress(`Fixed fixture suite complete in ${formatDurationMs(fixedSuite.wallClockMs)} wall-clock.`);
|
logBenchmarkProgress(
|
||||||
const selfPlay = await runSelfPlaySuite();
|
`Fixed fixture suite complete with production ${formatDurationMs(fixedSuite.productionSuiteSimulatedMs)} simulated and reference ${formatDurationMs(fixedSuite.referenceSuiteSimulatedMs)} simulated.`,
|
||||||
logBenchmarkProgress(`Self-play suite complete in ${formatDurationMs(selfPlay.wallClockMs)} wall-clock.`);
|
);
|
||||||
|
|
||||||
|
const mirrorRun = await runSelfPlaySuite(SELF_PLAY_SUITES['mirror-parity']);
|
||||||
|
logBenchmarkProgress(`Mirror parity suite complete in ${formatDurationMs(mirrorRun.suiteSimulatedMs)} simulated.`);
|
||||||
|
|
||||||
|
const beginnerRun = await runSelfPlaySuite(SELF_PLAY_SUITES['beginner-dominance']);
|
||||||
|
logBenchmarkProgress(`Beginner dominance suite complete in ${formatDurationMs(beginnerRun.suiteSimulatedMs)} simulated.`);
|
||||||
|
|
||||||
|
const mirrorSummary = buildSelfPlaySuiteSummary(SELF_PLAY_SUITES['mirror-parity'], mirrorRun);
|
||||||
|
const beginnerSummary = buildSelfPlaySuiteSummary(SELF_PLAY_SUITES['beginner-dominance'], beginnerRun);
|
||||||
|
const mirrorParityGate = createMirrorParityGate(mirrorSummary);
|
||||||
|
const beginnerDominanceGate = createBeginnerDominanceGate(beginnerSummary);
|
||||||
|
|
||||||
const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
|
const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
|
||||||
const fixedFixtureAgreements = fixedSuite.results.filter(result => result.matchesReference).length;
|
const fixedFixtureAgreements = fixedSuite.results.filter(result => result.matchesReference).length;
|
||||||
const expectedPasses = fixedSuite.results.filter(result => result.expectedPass).length;
|
const expectedPasses = fixedSuite.results.filter(result => result.expectedPass).length;
|
||||||
@@ -548,76 +783,73 @@ export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
|
|||||||
const criticalPassFailures = fixedSuite.results
|
const criticalPassFailures = fixedSuite.results
|
||||||
.filter(result => result.conceptGatePass === false)
|
.filter(result => result.conceptGatePass === false)
|
||||||
.map(result => result.fixtureId);
|
.map(result => result.fixtureId);
|
||||||
const wins = selfPlay.results.filter(result => result.masterResult === 'win').length;
|
const fixedFixtureProductionMasterDecisions = summarizeTimings(fixedSuite.productionTimings);
|
||||||
const losses = selfPlay.results.filter(result => result.masterResult === 'loss').length;
|
const fixedFixtureReferenceMasterDecisions = summarizeTimings(fixedSuite.referenceTimings);
|
||||||
const draws = selfPlay.results.filter(result => result.masterResult === 'draw').length;
|
const mirrorTrackedTeamSimulatedDecisions = mirrorSummary.simulatedTiming.trackedTeamDecisions;
|
||||||
const { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection } = summarizeSelfPlayBySeed(selfPlay.results);
|
const beginnerTrackedTeamSimulatedDecisions = beginnerSummary.simulatedTiming.trackedTeamDecisions;
|
||||||
const productionMasterSimulatedDecisions = summarizeTimings([
|
const allTrackedProductionSimulatedDecisions = summarizeTimings([
|
||||||
...fixedSuite.productionTimings,
|
...fixedSuite.productionTimings,
|
||||||
...selfPlay.productionTimings,
|
...mirrorRun.trackedTeamTimings,
|
||||||
|
...beginnerRun.trackedTeamTimings,
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const fixedFixtureGate: GateCountSummary = {
|
const fixedFixtureGate: GateCountSummary = {
|
||||||
actual: fixedFixtureAgreements,
|
actual: fixedFixtureAgreements,
|
||||||
required: ITERATION_5_GATE.fixedFixtureAgreementTarget,
|
required: AI_BENCHMARK_FIXTURES.length,
|
||||||
total: AI_BENCHMARK_FIXTURES.length,
|
total: AI_BENCHMARK_FIXTURES.length,
|
||||||
passed: fixedFixtureAgreements === ITERATION_5_GATE.fixedFixtureAgreementTarget,
|
passed: fixedFixtureAgreements === AI_BENCHMARK_FIXTURES.length,
|
||||||
};
|
};
|
||||||
const criticalConceptGate: GateCountSummary = {
|
const criticalConceptGate: GateCountSummary = {
|
||||||
actual: criticalPasses,
|
actual: criticalPasses,
|
||||||
required: ITERATION_5_GATE.criticalConceptTarget,
|
required: criticalFixtureCount,
|
||||||
total: criticalFixtureCount,
|
total: criticalFixtureCount,
|
||||||
passed: criticalPasses === ITERATION_5_GATE.criticalConceptTarget,
|
passed: criticalPasses === criticalFixtureCount,
|
||||||
};
|
|
||||||
const selfPlayGate: SelfPlayGateSummary = {
|
|
||||||
matches: selfPlay.results.length,
|
|
||||||
requiredMatches: ITERATION_5_GATE.selfPlayMatchTarget,
|
|
||||||
wins,
|
|
||||||
requiredWins: ITERATION_5_GATE.selfPlayWinTarget,
|
|
||||||
losses,
|
|
||||||
maxLosses: ITERATION_5_GATE.selfPlayMaxLosses,
|
|
||||||
draws,
|
|
||||||
matchCountPassed: selfPlay.results.length === ITERATION_5_GATE.selfPlayMatchTarget,
|
|
||||||
winGatePassed: wins >= ITERATION_5_GATE.selfPlayWinTarget,
|
|
||||||
lossGatePassed: losses <= ITERATION_5_GATE.selfPlayMaxLosses,
|
|
||||||
passed: selfPlay.results.length === ITERATION_5_GATE.selfPlayMatchTarget
|
|
||||||
&& wins >= ITERATION_5_GATE.selfPlayWinTarget
|
|
||||||
&& losses <= ITERATION_5_GATE.selfPlayMaxLosses,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
return {
|
return {
|
||||||
benchmark: 'ai-quality',
|
benchmark: 'ai-quality',
|
||||||
qualityGate: {
|
qualityGate: {
|
||||||
iteration: 5,
|
iteration: 6,
|
||||||
passed: fixedFixtureGate.passed && criticalConceptGate.passed && selfPlayGate.passed,
|
passed: fixedFixtureGate.passed
|
||||||
|
&& criticalConceptGate.passed
|
||||||
|
&& mirrorParityGate.passed
|
||||||
|
&& beginnerDominanceGate.passed,
|
||||||
fixedFixtures: fixedFixtureGate,
|
fixedFixtures: fixedFixtureGate,
|
||||||
criticalConcepts: criticalConceptGate,
|
criticalConcepts: criticalConceptGate,
|
||||||
selfPlay: selfPlayGate,
|
mirrorParity: mirrorParityGate,
|
||||||
|
beginnerDominance: beginnerDominanceGate,
|
||||||
},
|
},
|
||||||
fixtureCount: AI_BENCHMARK_FIXTURES.length,
|
fixtureCount: AI_BENCHMARK_FIXTURES.length,
|
||||||
criticalFixtureCount,
|
criticalFixtureCount,
|
||||||
|
fixtureTotals: {
|
||||||
|
fixtures: AI_BENCHMARK_FIXTURES.length,
|
||||||
|
criticalFixtures: criticalFixtureCount,
|
||||||
|
},
|
||||||
fixedSuite: {
|
fixedSuite: {
|
||||||
fixedFixtureAgreements,
|
fixedFixtureAgreements,
|
||||||
expectedPasses,
|
expectedPasses,
|
||||||
criticalPasses,
|
criticalPasses,
|
||||||
fixedFixtureAgreementFailures,
|
fixedFixtureAgreementFailures,
|
||||||
criticalPassFailures,
|
criticalPassFailures,
|
||||||
|
simulatedTiming: {
|
||||||
|
productionSuiteSimulatedMs: fixedSuite.productionSuiteSimulatedMs,
|
||||||
|
referenceSuiteSimulatedMs: fixedSuite.referenceSuiteSimulatedMs,
|
||||||
|
productionMasterDecisions: fixedFixtureProductionMasterDecisions,
|
||||||
|
referenceMasterDecisions: fixedFixtureReferenceMasterDecisions,
|
||||||
|
},
|
||||||
results: fixedSuite.results,
|
results: fixedSuite.results,
|
||||||
},
|
},
|
||||||
selfPlay: {
|
selfPlaySuites: {
|
||||||
matches: selfPlay.results.length,
|
totalMatches: mirrorSummary.matches + beginnerSummary.matches,
|
||||||
wins,
|
mirrorParity: mirrorSummary,
|
||||||
losses,
|
beginnerDominance: beginnerSummary,
|
||||||
draws,
|
|
||||||
winRate: selfPlay.results.length === 0 ? 0 : wins / selfPlay.results.length,
|
|
||||||
lossRate: selfPlay.results.length === 0 ? 0 : losses / selfPlay.results.length,
|
|
||||||
perSeed,
|
|
||||||
dualLossSeeds,
|
|
||||||
regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
|
|
||||||
regressionWatchlistDualLossIntersection,
|
|
||||||
results: selfPlay.results,
|
|
||||||
},
|
},
|
||||||
timing: {
|
timing: {
|
||||||
productionMasterSimulatedDecisions,
|
fixedFixtureProductionMasterDecisions,
|
||||||
|
fixedFixtureReferenceMasterDecisions,
|
||||||
|
mirrorTrackedTeamSimulatedDecisions,
|
||||||
|
beginnerTrackedTeamSimulatedDecisions,
|
||||||
|
allTrackedProductionSimulatedDecisions,
|
||||||
},
|
},
|
||||||
referenceProfile: REFERENCE_PROFILE,
|
referenceProfile: REFERENCE_PROFILE,
|
||||||
};
|
};
|
||||||
@@ -625,7 +857,7 @@ export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
|
|||||||
|
|
||||||
async function runBenchmarkCli(): Promise<void> {
|
async function runBenchmarkCli(): Promise<void> {
|
||||||
const summary = await runAIBenchmark();
|
const summary = await runAIBenchmark();
|
||||||
logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 5 gate results.');
|
logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 6 gate results.');
|
||||||
printReadableSummary(summary);
|
printReadableSummary(summary);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
1322
src/game/ai.ts
1322
src/game/ai.ts
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user