feat(SCOPONE-0009): improve AI, dealer, apparigliare and sparigliare
This commit is contained in:
634
src/game/ai-benchmark.ts
Normal file
634
src/game/ai-benchmark.ts
Normal file
@@ -0,0 +1,634 @@
|
||||
import { applyMove, cloneState, createInitialState, getMatchOutcome, nextPlayer, teamOf } from './engine';
|
||||
import { AITimingSource, AIMove, AISearchProfileOverride, chooseMove } from './ai';
|
||||
import {
|
||||
AI_BENCHMARK_FIXTURES,
|
||||
AIBenchmarkCriticalConcept,
|
||||
AIBenchmarkExpectedMove,
|
||||
AIBenchmarkFixture,
|
||||
isCriticalAIBenchmarkFixture,
|
||||
} from './ai-benchmark-fixtures';
|
||||
import { CardTracker } from './card-tracker';
|
||||
import { GameState, PlayerIndex } from './types';
|
||||
|
||||
/**
 * Renders a duration for log output.
 *
 * Durations that round to fewer than 1000 whole milliseconds are printed as
 * integer milliseconds (`"42 ms"`); everything else is printed as seconds with
 * two decimals (`"1.50 s"`).
 *
 * @param durationMs - Duration in milliseconds.
 * @returns The formatted duration string.
 */
function formatDurationMs(durationMs: number): string {
  // Compare against 999.5 rather than 1000: `toFixed(0)` rounds, so values in
  // [999.5, 1000) would otherwise be rendered as the misleading "1000 ms".
  if (durationMs < 999.5) {
    return `${durationMs.toFixed(0)} ms`;
  }

  return `${(durationMs / 1000).toFixed(2)} s`;
}
|
||||
|
||||
/**
 * Emits one benchmark progress line through `console.log`, prefixed with
 * `[ai-benchmark] `.
 *
 * @param message - Text to append after the prefix.
 */
function logBenchmarkProgress(message: string): void {
  const line = `[ai-benchmark] ${message}`;
  console.log(line);
}
|
||||
|
||||
/**
 * Result of evaluating one fixed benchmark position twice: once with the
 * production search settings and once with the reference profile.
 */
interface FixedFixtureResult {
  /** `id` of the evaluated fixture. */
  fixtureId: string;
  /** `name` of the evaluated fixture. */
  name: string;
  /** Copy of the fixture's tags. */
  tags: string[];
  /** Value of the fixture's `criticalConcept` field (may be `null`). */
  criticalConcept: AIBenchmarkCriticalConcept | null;
  /** `moveKey` of the move chosen without a profile override. */
  productionMove: string;
  /** `moveKey` of the move chosen with the reference profile override. */
  referenceMove: string;
  /** Whether the production and reference move keys are equal. */
  matchesReference: boolean;
  /** Whether the production move matches the fixture's expected move. */
  expectedPass: boolean;
  /** Expected-move check for critical fixtures; `null` for non-critical ones. */
  conceptGatePass: boolean | null;
  /** Simulated milliseconds elapsed while choosing the production move. */
  productionSimulatedMs: number;
  /** Simulated milliseconds elapsed while choosing the reference move. */
  referenceSimulatedMs: number;
}
|
||||
|
||||
/** Outcome of one simulated self-play match for a given seed and seating. */
interface SelfPlayMatchResult {
  /** Seed the match was generated from. */
  seed: number;
  /** Dealer of the first round. */
  dealer: PlayerIndex;
  /** Team that played at `'master'` difficulty. */
  masterTeam: 0 | 1;
  /** Winning team, or `null` when the outcome reports no winner. */
  winner: 0 | 1 | null;
  /** Outcome from the master team's point of view. */
  masterResult: 'win' | 'loss' | 'draw';
  /** Round count reported by the match simulation. */
  rounds: number;
  /** `true` when the match was stopped by the round cap instead of finishing. */
  truncated: boolean;
  /** Final total points as `[team 0, team 1]`. */
  totalPoints: [number, number];
  /** Number of decisions taken by master-difficulty players. */
  masterDecisionCount: number;
  /** Mean simulated time per master decision, in milliseconds. */
  masterAverageSimulatedDecisionMs: number;
  /** Largest simulated time of any master decision, in milliseconds. */
  masterMaxSimulatedDecisionMs: number;
}
|
||||
|
||||
/** Aggregate statistics over a list of timing samples, in milliseconds. */
interface TimingSummary {
  /** Number of samples summarized. */
  count: number;
  /** Arithmetic mean of the samples. */
  averageMs: number;
  /** 95th-percentile sample. */
  p95Ms: number;
  /** Largest sample. */
  maxMs: number;
}
|
||||
|
||||
/** Count-based quality gate: how many items passed versus how many were needed. */
interface GateCountSummary {
  /** Number of items that passed. */
  actual: number;
  /** Number of passes the gate is checked against. */
  required: number;
  /** Number of items evaluated. */
  total: number;
  /** Whether the gate is satisfied. */
  passed: boolean;
}
|
||||
|
||||
/** Self-play quality gate: match count, win floor and loss ceiling. */
interface SelfPlayGateSummary {
  /** Matches actually played. */
  matches: number;
  /** Matches the gate expects to have been played. */
  requiredMatches: number;
  /** Matches won by the master team. */
  wins: number;
  /** Minimum wins needed to pass. */
  requiredWins: number;
  /** Matches lost by the master team. */
  losses: number;
  /** Maximum losses allowed to pass. */
  maxLosses: number;
  /** Matches that ended without a winner. */
  draws: number;
  /** `matches` equals `requiredMatches`. */
  matchCountPassed: boolean;
  /** `wins` is at least `requiredWins`. */
  winGatePassed: boolean;
  /** `losses` is at most `maxLosses`. */
  lossGatePassed: boolean;
  /** All three sub-gates passed. */
  passed: boolean;
}
|
||||
|
||||
/** Per-seating slice of a self-play match, stored inside the per-seed aggregate. */
interface SelfPlaySeedSeatResult {
  /** Team that played at master difficulty in this seating. */
  masterTeam: 0 | 1;
  /** Outcome from the master team's point of view. */
  masterResult: 'win' | 'loss' | 'draw';
  /** Winning team, or `null` when there was none. */
  winner: 0 | 1 | null;
  /** Round count reported for the match. */
  rounds: number;
  /** Whether the match was cut off by the round cap. */
  truncated: boolean;
  /** Final total points as `[team 0, team 1]`. */
  totalPoints: [number, number];
}
|
||||
|
||||
/** All self-play matches that share one seed, rolled up into counts. */
interface SelfPlaySeedAggregateResult {
  /** Seed shared by the aggregated matches. */
  seed: number;
  /** Number of matches aggregated for this seed. */
  matches: number;
  /** Matches the master team won. */
  wins: number;
  /** Matches the master team lost. */
  losses: number;
  /** Matches without a winner. */
  draws: number;
  /** `true` when the master team lost at least two matches on this seed. */
  dualLoss: boolean;
  /** One entry per aggregated match. */
  seatResults: SelfPlaySeedSeatResult[];
}
|
||||
|
||||
/** Complete, JSON-serializable report produced by `runAIBenchmark`. */
export interface AIBenchmarkSummary {
  /** Fixed tag identifying the report type. */
  benchmark: 'ai-quality';
  /** Pass/fail verdicts for the iteration 5 quality gate. */
  qualityGate: {
    /** Gate iteration this report was evaluated against. */
    iteration: 5;
    /** `true` only when all three sub-gates passed. */
    passed: boolean;
    /** Production/reference agreement gate over the fixed fixtures. */
    fixedFixtures: GateCountSummary;
    /** Expected-move gate over the critical fixtures. */
    criticalConcepts: GateCountSummary;
    /** Win/loss gate over the self-play matches. */
    selfPlay: SelfPlayGateSummary;
  };
  /** Number of fixed fixtures in the suite. */
  fixtureCount: number;
  /** Number of fixtures classified as critical. */
  criticalFixtureCount: number;
  /** Detailed results of the fixed fixture suite. */
  fixedSuite: {
    /** Fixtures where production and reference chose the same move. */
    fixedFixtureAgreements: number;
    /** Fixtures where the production move matched the expected move. */
    expectedPasses: number;
    /** Critical fixtures where the production move matched the expected move. */
    criticalPasses: number;
    /** Ids of fixtures where production and reference disagreed. */
    fixedFixtureAgreementFailures: string[];
    /** Ids of critical fixtures that missed the expected move. */
    criticalPassFailures: string[];
    /** Per-fixture details. */
    results: FixedFixtureResult[];
  };
  /** Detailed results of the self-play suite. */
  selfPlay: {
    /** Matches played. */
    matches: number;
    /** Matches won by the master team. */
    wins: number;
    /** Matches lost by the master team. */
    losses: number;
    /** Matches without a winner. */
    draws: number;
    /** `wins / matches`, or 0 when no match was played. */
    winRate: number;
    /** `losses / matches`, or 0 when no match was played. */
    lossRate: number;
    /** Results grouped by seed. */
    perSeed: SelfPlaySeedAggregateResult[];
    /** Seeds on which the master team lost at least two matches. */
    dualLossSeeds: number[];
    /** Copy of the known regression watchlist seeds. */
    regressionWatchlist: number[];
    /** Dual-loss seeds that are also on the watchlist. */
    regressionWatchlistDualLossIntersection: number[];
    /** Per-match details. */
    results: SelfPlayMatchResult[];
  };
  /** Simulated decision timing for master-difficulty moves. */
  timing: {
    /** Timing over fixed-fixture production moves plus self-play master moves. */
    productionMasterSimulatedDecisions: TimingSummary;
  };
  /** Search profile used for the reference moves. */
  referenceProfile: Required<AISearchProfileOverride>;
}
|
||||
|
||||
/**
 * Thresholds of the iteration 5 quality gate.
 *
 * The first three values double as a suite-size contract: the benchmark
 * refuses to run unless the fixture list, the critical fixture count and the
 * planned self-play match count equal them exactly.
 */
const ITERATION_5_GATE = {
  /** Fixed fixtures on which production and reference moves must agree. */
  fixedFixtureAgreementTarget: 13,
  /** Critical fixtures on which the production move must match the expected move. */
  criticalConceptTarget: 6,
  /** Self-play matches that must be played. */
  selfPlayMatchTarget: 48,
  /** Minimum self-play wins for the master team. */
  selfPlayWinTarget: 30,
  /** Maximum self-play losses for the master team. */
  selfPlayMaxLosses: 12,
} as const;
|
||||
|
||||
/**
 * Self-play seeds that are reported separately: any of them on which the
 * master team loses both seatings is listed in the summary.
 */
const KNOWN_REGRESSION_WATCHLIST = [1000, 1002, 1004, 1006, 1012, 1013, 1014] as const;

/** Set view of the watchlist for membership checks. */
const KNOWN_REGRESSION_WATCHLIST_SET = new Set<number>(KNOWN_REGRESSION_WATCHLIST);
|
||||
|
||||
/**
 * Search-profile override handed to `chooseMove` when computing the reference
 * move for each fixed fixture; also echoed in the benchmark summary.
 */
const REFERENCE_PROFILE: Required<AISearchProfileOverride> = {
  timeBudgetMs: 9000,
  sampleCount: 12,
  maxDepth: 7,
  batchSize: 2,
};
|
||||
|
||||
/** The 24 consecutive self-play seeds 1000..1023; each is played once per seating. */
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 24 }, (_unused, offset) => offset + 1000);

/** Maximum number of rounds simulated per self-play match before it is cut off. */
const MAX_SELF_PLAY_ROUNDS = 20;
|
||||
|
||||
/**
 * Verifies that the benchmark inputs have exactly the sizes the iteration 5
 * gate was calibrated for.
 *
 * Checks are evaluated in order (fixed fixtures, critical fixtures, self-play
 * matches) and the first mismatch is reported.
 *
 * @throws Error when any count differs from its `ITERATION_5_GATE` target.
 */
function assertIteration5BenchmarkContract(): void {
  const expectations = [
    {
      label: 'fixed fixtures',
      expected: ITERATION_5_GATE.fixedFixtureAgreementTarget as number,
      received: AI_BENCHMARK_FIXTURES.length,
    },
    {
      label: 'critical concept fixtures',
      expected: ITERATION_5_GATE.criticalConceptTarget as number,
      received: AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length,
    },
    {
      // Every seed is played twice, once per seating.
      label: 'self-play matches',
      expected: ITERATION_5_GATE.selfPlayMatchTarget as number,
      received: SELF_PLAY_MATCH_SEEDS.length * 2,
    },
  ];

  for (const { label, expected, received } of expectations) {
    if (received !== expected) {
      throw new Error(`Iteration 5 benchmark expects ${expected} ${label}, received ${received}.`);
    }
  }
}
|
||||
|
||||
/**
 * Timing source used by the benchmark: an `AITimingSource` that can also
 * report how much simulated time has elapsed since it was created.
 */
interface SimulatedBenchmarkTimingSource extends AITimingSource {
  /** Simulated milliseconds elapsed since the source's start value. */
  getElapsedMs(): number;
}
|
||||
|
||||
/**
 * Creates an in-memory clock that only moves when `advance` is called.
 *
 * @param startMs - Initial clock value in milliseconds (defaults to 0).
 * @returns A timing source flagged `isSimulated` whose `now()` returns the
 *   current clock value, whose `advance(elapsedMs)` adds to the clock and
 *   returns the new value, and whose `getElapsedMs()` returns the distance
 *   from `startMs`.
 */
function createSimulatedBenchmarkTimingSource(startMs = 0): SimulatedBenchmarkTimingSource {
  let clockMs = startMs;

  const advance = (elapsedMs: number): number => {
    clockMs += elapsedMs;
    return clockMs;
  };

  return {
    isSimulated: true,
    now: () => clockMs,
    advance,
    getElapsedMs: () => clockMs - startMs,
  };
}
|
||||
|
||||
/**
 * Folds a list of numbers into one unsigned 32-bit seed.
 *
 * Uses FNV-1a style mixing (xor, then multiply by the 32-bit FNV prime,
 * starting from the 32-bit FNV offset basis), applied to each part as a whole
 * unsigned 32-bit word.
 *
 * @param parts - Values to combine; each is coerced with `>>> 0`.
 * @returns The combined hash in the range [0, 2^32).
 */
function seedFromParts(...parts: number[]): number {
  const offsetBasis = 2166136261;
  const prime = 16777619;

  const folded = parts.reduce(
    (hash, part) => Math.imul(hash ^ (part >>> 0), prime),
    offsetBasis,
  );

  return folded >>> 0;
}
|
||||
|
||||
/**
 * Creates a deterministic pseudo-random generator (mulberry32).
 *
 * @param seed - Initial state; coerced to an unsigned 32-bit integer.
 * @returns A function that yields the next value in [0, 1) on each call.
 */
function createMulberry32(seed: number): () => number {
  const increment = 0x6d2b79f5;
  const uint32Range = 4294967296;
  let counter = seed >>> 0;

  function next(): number {
    counter = (counter + increment) >>> 0;
    const first = Math.imul(counter ^ (counter >>> 15), counter | 1);
    const second = first ^ (first + Math.imul(first ^ (first >>> 7), first | 61));
    return ((second ^ (second >>> 14)) >>> 0) / uint32Range;
  }

  return next;
}
|
||||
|
||||
/**
 * Builds a canonical string for a move so two moves can be compared by value.
 *
 * @param move - Move to encode.
 * @returns `"<card id>|<captured ids>"`, with captured ids sorted (default
 *   sort order) and comma-separated so capture order does not matter.
 */
function moveKey(move: AIMove): string {
  const capturedIds = move.capture.map(({ id }) => id);
  capturedIds.sort();
  return `${move.card.id}|${capturedIds.join(',')}`;
}
|
||||
|
||||
/**
 * Builds a fresh `CardTracker` for a mid-game position by registering, via
 * `trackPlay`, every card found in any player's `pile`.
 *
 * NOTE(review): only `pile` cards are registered here, and always through
 * `trackPlay` — confirm this matches how the tracker is fed during live play.
 *
 * @param state - Position to derive the tracker from; not modified here.
 * @returns The populated tracker.
 */
function createTrackerForState(state: GameState): CardTracker {
  const tracker = new CardTracker();

  for (const { pile } of state.players) {
    for (const pileCard of pile) {
      tracker.trackPlay(pileCard);
    }
  }

  return tracker;
}
|
||||
|
||||
/**
 * Checks a chosen move against a fixture's expected move.
 *
 * The played card must match. The capture is only compared when the
 * expectation lists `captureIds`; the comparison ignores order.
 *
 * @param move - Move chosen by the AI.
 * @param expected - Expected card id and, optionally, expected capture ids.
 * @returns `true` when the move satisfies the expectation.
 */
function matchesExpectedMove(move: AIMove, expected: AIBenchmarkExpectedMove): boolean {
  const { cardId, captureIds } = expected;

  if (move.card.id !== cardId) {
    return false;
  }

  // No capture constraint: matching the card alone is enough.
  if (!captureIds) {
    return true;
  }

  const wanted = [...captureIds].sort().join(',');
  const played = move.capture.map(({ id }) => id).sort().join(',');
  return played === wanted;
}
|
||||
|
||||
/**
 * Evaluates every fixed benchmark position twice at `'master'` difficulty:
 * once with the default (production) settings and once with
 * `REFERENCE_PROFILE`, each on its own cloned state, tracker, seeded RNG and
 * simulated clock.
 *
 * Fixtures are processed sequentially and a progress line is logged per
 * fixture.
 *
 * @returns Per-fixture results, the suite's wall-clock duration in
 *   milliseconds, and the simulated duration of each production decision.
 */
async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[]; wallClockMs: number; productionTimings: number[] }> {
  const suiteStart = performance.now();
  const fixtureTotal = AI_BENCHMARK_FIXTURES.length;
  const results: FixedFixtureResult[] = [];
  const productionTimings: number[] = [];

  logBenchmarkProgress(`Starting fixed fixture suite (${fixtureTotal} positions).`);

  for (const [index, fixture] of AI_BENCHMARK_FIXTURES.entries()) {
    // Separate clones so neither search can observe the other's mutations.
    const productionState = cloneState(fixture.state);
    const referenceState = cloneState(fixture.state);
    const productionTracker = createTrackerForState(productionState);
    const referenceTracker = createTrackerForState(referenceState);

    const productionClock = createSimulatedBenchmarkTimingSource();
    const referenceClock = createSimulatedBenchmarkTimingSource();

    // The trailing 0/1 keeps the two RNG streams distinct for the same fixture.
    const productionMove = await chooseMove(
      productionState,
      productionState.currentPlayer,
      'master',
      productionTracker,
      undefined,
      {
        rng: createMulberry32(seedFromParts(0x0f1e2d3c, index, 0)),
        timingSource: productionClock,
      },
    );
    const productionSimulatedMs = productionClock.getElapsedMs();

    const referenceMove = await chooseMove(
      referenceState,
      referenceState.currentPlayer,
      'master',
      referenceTracker,
      undefined,
      {
        rng: createMulberry32(seedFromParts(0x0f1e2d3c, index, 1)),
        profileOverride: REFERENCE_PROFILE,
        timingSource: referenceClock,
      },
    );
    const referenceSimulatedMs = referenceClock.getElapsedMs();

    productionTimings.push(productionSimulatedMs);

    const productionKey = moveKey(productionMove);
    const referenceKey = moveKey(referenceMove);
    const matchesReference = productionKey === referenceKey;
    const expectedPass = matchesExpectedMove(productionMove, fixture.expectedMove);
    // Only critical fixtures take part in the concept gate.
    const conceptGatePass = isCriticalAIBenchmarkFixture(fixture) ? expectedPass : null;

    results.push({
      fixtureId: fixture.id,
      name: fixture.name,
      tags: [...fixture.tags],
      criticalConcept: fixture.criticalConcept,
      productionMove: productionKey,
      referenceMove: referenceKey,
      matchesReference,
      expectedPass,
      conceptGatePass,
      productionSimulatedMs,
      referenceSimulatedMs,
    });

    const verdict = matchesReference ? 'agreement' : 'divergence';
    logBenchmarkProgress(
      `Fixture ${index + 1}/${fixtureTotal}: ${fixture.id} -> ${verdict}, production simulated ${formatDurationMs(productionSimulatedMs)}, reference simulated ${formatDurationMs(referenceSimulatedMs)}.`,
    );
  }

  return {
    results,
    wallClockMs: performance.now() - suiteStart,
    productionTimings,
  };
}
|
||||
|
||||
/**
 * Computes count, mean, 95th percentile and maximum of a list of durations.
 *
 * The percentile uses the nearest-rank method on an ascending copy of the
 * samples; the input array is left untouched.
 *
 * @param samples - Durations in milliseconds.
 * @returns The summary, or all zeros when `samples` is empty.
 */
function summarizeTimings(samples: number[]): TimingSummary {
  const count = samples.length;
  if (count === 0) {
    return { count: 0, averageMs: 0, p95Ms: 0, maxMs: 0 };
  }

  const ascending = samples.slice().sort((a, b) => a - b);

  let total = 0;
  for (const sample of ascending) {
    total += sample;
  }

  // Nearest-rank position, clamped into the valid index range.
  const nearestRank = Math.ceil(count * 0.95) - 1;
  const p95Index = Math.min(count - 1, Math.max(0, nearestRank));

  return {
    count,
    averageMs: total / count,
    p95Ms: ascending[p95Index],
    maxMs: ascending[count - 1],
  };
}
|
||||
|
||||
/**
 * Groups self-play match results by seed.
 *
 * @param results - Individual match results, in any order.
 * @returns `perSeed`: one aggregate per seed, sorted by seed, with its seat
 *   results sorted by `masterTeam`; `dualLossSeeds`: seeds on which the master
 *   team lost at least two matches; `regressionWatchlistDualLossIntersection`:
 *   the dual-loss seeds that are also on the known regression watchlist.
 */
function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
  perSeed: SelfPlaySeedAggregateResult[];
  dualLossSeeds: number[];
  regressionWatchlistDualLossIntersection: number[];
} {
  const bySeed = new Map<number, SelfPlaySeedAggregateResult>();

  for (const match of results) {
    let bucket = bySeed.get(match.seed);
    if (bucket === undefined) {
      bucket = {
        seed: match.seed,
        matches: 0,
        wins: 0,
        losses: 0,
        draws: 0,
        dualLoss: false,
        seatResults: [],
      };
      bySeed.set(match.seed, bucket);
    }

    bucket.matches += 1;
    switch (match.masterResult) {
      case 'win':
        bucket.wins += 1;
        break;
      case 'loss':
        bucket.losses += 1;
        break;
      default:
        bucket.draws += 1;
    }

    const { masterTeam, masterResult, winner, rounds, truncated, totalPoints } = match;
    bucket.seatResults.push({ masterTeam, masterResult, winner, rounds, truncated, totalPoints });
  }

  // Finalize copies: `dualLoss` is derived only once all matches are counted.
  const perSeed = Array.from(bySeed.values(), bucket => ({
    ...bucket,
    dualLoss: bucket.losses >= 2,
    seatResults: bucket.seatResults.slice().sort((a, b) => a.masterTeam - b.masterTeam),
  }));
  perSeed.sort((a, b) => a.seed - b.seed);

  const dualLossSeeds: number[] = [];
  for (const entry of perSeed) {
    if (entry.dualLoss) {
      dualLossSeeds.push(entry.seed);
    }
  }

  const regressionWatchlistDualLossIntersection = dualLossSeeds.filter(seed =>
    KNOWN_REGRESSION_WATCHLIST_SET.has(seed),
  );

  return { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection };
}
|
||||
|
||||
/**
 * Plays one full match between a `'master'` team and an `'advanced'` team and
 * records the simulated decision time of every master move.
 *
 * Each round is dealt from an RNG derived from `(seed, round, 0)`; the first
 * dealer is `seed % 4` and the dealer advances with `nextPlayer` every round.
 * Total points are carried over between rounds. The match stops when
 * `getMatchOutcome` reports that it should not continue, or after
 * `MAX_SELF_PLAY_ROUNDS` rounds, in which case the result is flagged
 * `truncated`.
 *
 * @param seed - Seed that determines the deals and the master team's RNG.
 * @param masterTeam - Team (0 or 1) that plays at `'master'` difficulty.
 * @returns The match result (whose `rounds` is the number of rounds actually
 *   played) and the list of simulated master decision times in milliseconds.
 */
async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{ result: SelfPlayMatchResult; timings: number[] }> {
  const initialDealer = (seed % 4) as PlayerIndex;
  let state = createInitialState(initialDealer, createMulberry32(seedFromParts(seed, 1, 0)));
  const matchStartingPlayer = state.matchStartingPlayer;
  const tracker = new CardTracker();
  const masterTimings: number[] = [];

  let rounds = 1;
  let truncated = false;
  // Counts turns across the whole match (not reset per round) so every master
  // decision gets a distinct RNG seed.
  let turnCount = 0;

  while (rounds <= MAX_SELF_PLAY_ROUNDS) {
    while (!state.roundOver) {
      const playerIdx = state.currentPlayer;
      const difficulty = teamOf(playerIdx) === masterTeam ? 'master' : 'advanced';
      const timingSource = createSimulatedBenchmarkTimingSource();
      // NOTE(review): only master decisions receive a seeded RNG. If the
      // 'advanced' policy draws randomness from another source, matches are
      // not fully reproducible from `seed` — confirm against chooseMove.
      const options = difficulty === 'master'
        ? {
            rng: createMulberry32(seedFromParts(seed, rounds, turnCount, playerIdx)),
            timingSource,
          }
        : { timingSource };
      const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, options);
      const simulatedMs = timingSource.getElapsedMs();

      if (difficulty === 'master') {
        masterTimings.push(simulatedMs);
      }

      const { nextState, capture } = applyMove(
        state,
        playerIdx,
        move.card,
        move.capture.length > 0 ? move.capture : undefined,
      );
      tracker.trackPlay(move.card);
      if (capture) {
        tracker.trackCapture(capture.captured);
      }
      state = nextState;
      turnCount++;
    }

    if (!getMatchOutcome(state.teamScores).continueMatch) {
      break;
    }

    // Check the cap BEFORE bumping the counter so that `rounds` keeps meaning
    // "rounds played" when the match is cut off (previously it ended up one
    // past the cap for truncated matches).
    if (rounds >= MAX_SELF_PLAY_ROUNDS) {
      truncated = true;
      break;
    }
    rounds++;

    // Deal the next round, carrying the running totals and match metadata over.
    const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
    const nextDealer = nextPlayer(state.dealer);
    tracker.reset();
    state = createInitialState(nextDealer, createMulberry32(seedFromParts(seed, rounds, 0)));
    state.matchStartingPlayer = matchStartingPlayer;
    state.teamScores[0].totalPoints = totals[0];
    state.teamScores[1].totalPoints = totals[1];
    state.roundNumber = rounds;
  }

  // Truncated matches get no special handling: whatever getMatchOutcome
  // reports for the current totals is used as-is.
  // NOTE(review): confirm this is the intended verdict for truncated matches.
  const { winner } = getMatchOutcome(state.teamScores);
  const masterResult = winner === null ? 'draw' : winner === masterTeam ? 'win' : 'loss';
  const timingSummary = summarizeTimings(masterTimings);

  return {
    result: {
      seed,
      dealer: initialDealer,
      masterTeam,
      winner,
      masterResult,
      rounds,
      truncated,
      totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
      masterDecisionCount: timingSummary.count,
      masterAverageSimulatedDecisionMs: timingSummary.averageMs,
      masterMaxSimulatedDecisionMs: timingSummary.maxMs,
    },
    timings: masterTimings,
  };
}
|
||||
|
||||
/**
 * Plays every self-play seed twice — once with each team at master
 * difficulty — sequentially, logging progress after the first match, every
 * fourth match and the last match.
 *
 * @returns All match results in play order, the suite's wall-clock duration in
 *   milliseconds, and every simulated master decision time.
 */
async function runSelfPlaySuite(): Promise<{ results: SelfPlayMatchResult[]; wallClockMs: number; productionTimings: number[] }> {
  const suiteStart = performance.now();

  // Seat swap: each seed appears once per master team, team 0 first.
  const seatings = SELF_PLAY_MATCH_SEEDS.flatMap(seed => [
    { seed, masterTeam: 0 as const },
    { seed, masterTeam: 1 as const },
  ]);
  const totalMatches = seatings.length;

  const results: SelfPlayMatchResult[] = [];
  const productionTimings: number[] = [];

  logBenchmarkProgress(`Starting self-play suite (${totalMatches} seeded matches with seat swaps).`);

  for (const [position, { seed, masterTeam }] of seatings.entries()) {
    const { result, timings } = await simulateSelfPlayMatch(seed, masterTeam);
    results.push(result);
    productionTimings.push(...timings);

    const completedMatches = position + 1;
    const shouldReport =
      completedMatches === 1 || completedMatches % 4 === 0 || completedMatches === totalMatches;
    if (shouldReport) {
      logBenchmarkProgress(
        `Self-play ${completedMatches}/${totalMatches}: seed ${seed}, master team ${masterTeam}, result ${result.masterResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.masterMaxSimulatedDecisionMs)}.`,
      );
    }
  }

  return {
    results,
    wallClockMs: performance.now() - suiteStart,
    productionTimings,
  };
}
|
||||
|
||||
/**
 * Prints the human-readable benchmark report followed by the
 * `BENCHMARK_SUMMARY` marker line and the summary as pretty-printed JSON.
 * Every line is emitted with its own `console.log` call.
 *
 * @param summary - Summary to print.
 */
function printReadableSummary(summary: AIBenchmarkSummary): void {
  const { qualityGate, fixedSuite, selfPlay } = summary;
  const { fixedFixtures, criticalConcepts, selfPlay: selfPlayGate } = qualityGate;
  const timing = summary.timing.productionMasterSimulatedDecisions;
  const listOrNone = (seeds: number[]): string => (seeds.length > 0 ? seeds.join(', ') : 'none');

  const lines: string[] = [
    'AI quality benchmark',
    `Iteration 5 quality gate: ${qualityGate.passed ? 'PASS' : 'FAIL'}`,
    `Fixed-fixture gate: ${fixedFixtures.actual}/${fixedFixtures.total} agreements (target ${fixedFixtures.required}/${fixedFixtures.total}).`,
    `Critical concept gate: ${criticalConcepts.actual}/${criticalConcepts.total} passes (target ${criticalConcepts.required}/${criticalConcepts.total}).`,
    `Self-play gate: ${selfPlayGate.matches}/${selfPlayGate.requiredMatches} matches, ${selfPlayGate.wins}/${selfPlayGate.matches} wins (target ${selfPlayGate.requiredWins}), ${selfPlayGate.losses}/${selfPlayGate.matches} losses (max ${selfPlayGate.maxLosses}), ${selfPlayGate.draws} draws.`,
  ];

  // Failure lists are only shown when there is something to list.
  if (fixedSuite.fixedFixtureAgreementFailures.length > 0) {
    lines.push(`Fixed-fixture agreement failures: ${fixedSuite.fixedFixtureAgreementFailures.join(', ')}`);
  }
  if (fixedSuite.criticalPassFailures.length > 0) {
    lines.push(`Critical concept failures: ${fixedSuite.criticalPassFailures.join(', ')}`);
  }

  const perSeedText = selfPlay.perSeed
    .map(entry => `${entry.seed}:${entry.wins}W-${entry.losses}L-${entry.draws}D`)
    .join(' | ');

  lines.push(
    `Per-seed outcomes: ${perSeedText}`,
    `Dual-loss seeds: ${listOrNone(selfPlay.dualLossSeeds)}`,
    `Regression watchlist intersection: ${listOrNone(selfPlay.regressionWatchlistDualLossIntersection)} (watchlist ${selfPlay.regressionWatchlist.join(', ')})`,
    `Master simulated timing: avg ${timing.averageMs.toFixed(1)} ms, p95 ${timing.p95Ms.toFixed(1)} ms, max ${timing.maxMs.toFixed(1)} ms.`,
    'BENCHMARK_SUMMARY',
    JSON.stringify(summary, null, 2),
  );

  for (const line of lines) {
    console.log(line);
  }
}
|
||||
|
||||
/**
 * Runs the complete AI quality benchmark: validates the suite sizes, runs the
 * fixed fixture suite, then the self-play suite, and evaluates the iteration 5
 * quality gate on the combined results.
 *
 * @returns The full benchmark summary, including per-fixture and per-match
 *   details and the gate verdicts.
 * @throws Error when the suite sizes do not match the iteration 5 contract.
 */
export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
  assertIteration5BenchmarkContract();
  logBenchmarkProgress('Benchmark started. Running fixed fixtures first, then self-play.');

  const fixedSuite = await runFixedFixtureSuite();
  logBenchmarkProgress(`Fixed fixture suite complete in ${formatDurationMs(fixedSuite.wallClockMs)} wall-clock.`);

  const selfPlay = await runSelfPlaySuite();
  logBenchmarkProgress(`Self-play suite complete in ${formatDurationMs(selfPlay.wallClockMs)} wall-clock.`);

  const fixtureCount = AI_BENCHMARK_FIXTURES.length;
  const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;

  // Tally the fixed fixture suite in a single pass.
  let fixedFixtureAgreements = 0;
  let expectedPasses = 0;
  let criticalPasses = 0;
  const fixedFixtureAgreementFailures: string[] = [];
  const criticalPassFailures: string[] = [];
  for (const fixtureResult of fixedSuite.results) {
    if (fixtureResult.matchesReference) {
      fixedFixtureAgreements++;
    } else {
      fixedFixtureAgreementFailures.push(fixtureResult.fixtureId);
    }

    if (fixtureResult.expectedPass) {
      expectedPasses++;
    }

    // `null` means the fixture is not critical and counts toward neither list.
    if (fixtureResult.conceptGatePass === true) {
      criticalPasses++;
    } else if (fixtureResult.conceptGatePass === false) {
      criticalPassFailures.push(fixtureResult.fixtureId);
    }
  }

  // Tally the self-play outcomes from the master team's point of view.
  const outcomeTally = { win: 0, loss: 0, draw: 0 };
  for (const match of selfPlay.results) {
    outcomeTally[match.masterResult]++;
  }
  const { win: wins, loss: losses, draw: draws } = outcomeTally;
  const matchCount = selfPlay.results.length;

  const { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection } = summarizeSelfPlayBySeed(selfPlay.results);

  const productionMasterSimulatedDecisions = summarizeTimings(
    fixedSuite.productionTimings.concat(selfPlay.productionTimings),
  );

  const fixedFixtureGate: GateCountSummary = {
    actual: fixedFixtureAgreements,
    required: ITERATION_5_GATE.fixedFixtureAgreementTarget,
    total: fixtureCount,
    passed: fixedFixtureAgreements === ITERATION_5_GATE.fixedFixtureAgreementTarget,
  };

  const criticalConceptGate: GateCountSummary = {
    actual: criticalPasses,
    required: ITERATION_5_GATE.criticalConceptTarget,
    total: criticalFixtureCount,
    passed: criticalPasses === ITERATION_5_GATE.criticalConceptTarget,
  };

  const matchCountPassed = matchCount === ITERATION_5_GATE.selfPlayMatchTarget;
  const winGatePassed = wins >= ITERATION_5_GATE.selfPlayWinTarget;
  const lossGatePassed = losses <= ITERATION_5_GATE.selfPlayMaxLosses;
  const selfPlayGate: SelfPlayGateSummary = {
    matches: matchCount,
    requiredMatches: ITERATION_5_GATE.selfPlayMatchTarget,
    wins,
    requiredWins: ITERATION_5_GATE.selfPlayWinTarget,
    losses,
    maxLosses: ITERATION_5_GATE.selfPlayMaxLosses,
    draws,
    matchCountPassed,
    winGatePassed,
    lossGatePassed,
    passed: matchCountPassed && winGatePassed && lossGatePassed,
  };

  return {
    benchmark: 'ai-quality',
    qualityGate: {
      iteration: 5,
      passed: fixedFixtureGate.passed && criticalConceptGate.passed && selfPlayGate.passed,
      fixedFixtures: fixedFixtureGate,
      criticalConcepts: criticalConceptGate,
      selfPlay: selfPlayGate,
    },
    fixtureCount,
    criticalFixtureCount,
    fixedSuite: {
      fixedFixtureAgreements,
      expectedPasses,
      criticalPasses,
      fixedFixtureAgreementFailures,
      criticalPassFailures,
      results: fixedSuite.results,
    },
    selfPlay: {
      matches: matchCount,
      wins,
      losses,
      draws,
      winRate: matchCount === 0 ? 0 : wins / matchCount,
      lossRate: matchCount === 0 ? 0 : losses / matchCount,
      perSeed,
      dualLossSeeds,
      regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
      regressionWatchlistDualLossIntersection,
      results: selfPlay.results,
    },
    timing: {
      productionMasterSimulatedDecisions,
    },
    referenceProfile: REFERENCE_PROFILE,
  };
}
|
||||
|
||||
/**
 * Command-line entry point: runs the benchmark, logs a completion line, then
 * prints the readable report and JSON summary.
 */
async function runBenchmarkCli(): Promise<void> {
  const report = await runAIBenchmark();

  logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 5 gate results.');
  printReadableSummary(report);
}
|
||||
|
||||
// Auto-run the benchmark when the module is loaded outside a browser (no
// global `window`). The promise is fire-and-forget, so a rejection is handled
// here explicitly: the failure is logged and, where a Node-style `process`
// global exists, the exit code is set to 1.
if (typeof window === 'undefined') {
  void runBenchmarkCli().catch((error: unknown) => {
    console.error('[ai-benchmark] Benchmark run failed.', error);
    // Looked up through globalThis so this compiles without Node typings.
    const nodeProcess = (globalThis as { process?: { exitCode?: number } }).process;
    if (nodeProcess) {
      nodeProcess.exitCode = 1;
    }
  });
}
|
||||
Reference in New Issue
Block a user