Some checks failed
Android Build & Publish / android (push) Failing after 2m10s
- Replace minimax with PIMC (Perfect Information Monte Carlo) search - Add PIMC_SCOPE_BOOST=150 → effective scopa value 540 (was 390) → Master win rate: 67.5% → 72.5% vs legacy AI (target ≥60%) → Advanced win rate: 97.5% vs beginner AI (target ≥55%) → Scope gap in losses: 6.54 → 3.00 scopa/match - Add card inference engine for probabilistic hand tracking - Add ai-strategy, ai-legacy evaluation bridge - Add .gitea/workflows/android-build.yml: build debug + unsigned release APK and publish to Gitea generic package registry
1058 lines
37 KiB
TypeScript
1058 lines
37 KiB
TypeScript
import { applyMove, cloneState, createInitialState, getMatchOutcome, nextPlayer, teamOf } from './engine';
|
||
import { AITimingSource, AIMove, AISearchProfileOverride, chooseMove } from './ai';
|
||
import { chooseMove as chooseMoveOld } from './ai-legacy';
|
||
import { CardInferenceEngine } from './card-inference';
|
||
import {
|
||
AI_BENCHMARK_FIXTURES,
|
||
AIBenchmarkCriticalConcept,
|
||
AIBenchmarkExpectedMove,
|
||
isCriticalAIBenchmarkFixture,
|
||
} from './ai-benchmark-fixtures';
|
||
import { CardTracker } from './card-tracker';
|
||
import { Difficulty, GameState, PlayerIndex } from './types';
|
||
|
||
/**
 * Formats a millisecond duration for benchmark log output.
 *
 * Durations that round to fewer than 1000 whole milliseconds are rendered as
 * `"N ms"`; everything else is rendered in seconds with two decimals.
 *
 * @param durationMs Duration in milliseconds.
 * @returns The human-readable duration string.
 */
function formatDurationMs(durationMs: number): string {
  // Compare the rounded value against the threshold so that inputs such as
  // 999.6 are reported as "1.00 s" instead of the misleading "1000 ms".
  const wholeMs = durationMs.toFixed(0);
  if (Number(wholeMs) < 1000) {
    return `${wholeMs} ms`;
  }

  return `${(durationMs / 1000).toFixed(2)} s`;
}
|
||
|
||
/**
 * Renders a ratio as a percentage with one decimal place.
 *
 * @param value Ratio to format (for example `0.5` for fifty percent).
 * @returns The value multiplied by 100, fixed to one decimal, followed by `%`.
 */
function formatPercentage(value: number): string {
  const percent = value * 100;
  return `${percent.toFixed(1)}%`;
}
|
||
|
||
/**
 * Writes a progress line to stdout, prefixed with `[ai-benchmark]`.
 *
 * @param message Text to print after the prefix.
 */
function logBenchmarkProgress(message: string): void {
  console.log(`[ai-benchmark] ${message}`);
}
|
||
|
||
/**
 * Outcome of evaluating one fixed benchmark fixture twice: once with the
 * default ("production") search settings and once with `REFERENCE_PROFILE`.
 */
interface FixedFixtureResult {
  /** Copied from the fixture's `id`. */
  fixtureId: string;
  /** Copied from the fixture's `name`. */
  name: string;
  /** Copy of the fixture's tags. */
  tags: string[];
  /** Copied from the fixture's `criticalConcept`; may be null. */
  criticalConcept: AIBenchmarkCriticalConcept | null;
  /** `moveKey` of the move chosen without a profile override. */
  productionMove: string;
  /** `moveKey` of the move chosen with `REFERENCE_PROFILE`. */
  referenceMove: string;
  /** True when `productionMove` and `referenceMove` are the same key. */
  matchesReference: boolean;
  /** True when the production move matches the fixture's expected move. */
  expectedPass: boolean;
  /** Expected-move check for critical fixtures; null for non-critical fixtures. */
  conceptGatePass: boolean | null;
  /** Simulated time consumed by the production decision, in milliseconds. */
  productionSimulatedMs: number;
  /** Simulated time consumed by the reference decision, in milliseconds. */
  referenceSimulatedMs: number;
}
|
||
|
||
/** Identifiers of the self-play suites; these are the keys of `SELF_PLAY_SUITES`. */
type SelfPlaySuiteId = 'mirror-parity' | 'beginner-dominance';
|
||
|
||
/** Result of one simulated self-play match, reported from the tracked team's point of view. */
interface SelfPlayMatchResult {
  /** Suite the match belongs to. */
  suite: SelfPlaySuiteId;
  /** Match seed used to derive every RNG stream of the match. */
  seed: number;
  /** Dealer of the first round (`seed % 4`). */
  dealer: PlayerIndex;
  /** Team whose results and decision timings are recorded. */
  trackedTeam: 0 | 1;
  /** Difficulty played by the tracked team. */
  trackedTeamDifficulty: Difficulty;
  /** Difficulty played by the other team. */
  opponentDifficulty: Difficulty;
  /** Winner reported by `getMatchOutcome`, or null when there is none. */
  winner: 0 | 1 | null;
  /** `'draw'` when `winner` is null, otherwise win/loss relative to `trackedTeam`. */
  trackedResult: 'win' | 'loss' | 'draw';
  /** Number of rounds played. */
  rounds: number;
  /** True when the match was stopped at `MAX_SELF_PLAY_ROUNDS` while still undecided. */
  truncated: boolean;
  /** Final total points, indexed by team. */
  totalPoints: [number, number];
  /** Number of decisions taken by the tracked team. */
  trackedDecisionCount: number;
  /** Mean simulated time per tracked-team decision, in milliseconds. */
  trackedAverageSimulatedDecisionMs: number;
  /** Largest simulated time of any tracked-team decision, in milliseconds. */
  trackedMaxSimulatedDecisionMs: number;
}
|
||
|
||
/** Aggregate statistics over a list of per-decision durations (milliseconds). */
interface TimingSummary {
  /** Number of samples summarized. */
  count: number;
  /** Arithmetic mean of the samples. */
  averageMs: number;
  /** 95th-percentile sample. */
  p95Ms: number;
  /** Largest sample. */
  maxMs: number;
}
|
||
|
||
/** Result of a gate that compares a count of successes with a required count. */
interface GateCountSummary {
  /** Number of successes observed. */
  actual: number;
  /** Number of successes needed to pass. */
  required: number;
  /** Number of items evaluated. */
  total: number;
  /** Whether the gate passed. */
  passed: boolean;
}
|
||
|
||
/**
 * Result of a win-rate gate over a self-play suite.
 *
 * Fields that do not apply to a particular gate are null (for example the
 * beginner-dominance gate has no target, tolerance, or upper bound).
 */
interface WinRateGateSummary {
  /** Matches actually played. */
  matches: number;
  /** Matches the gate requires. */
  requiredMatches: number;
  wins: number;
  losses: number;
  draws: number;
  /** Tracked-team win rate of the suite. */
  winRate: number;
  /** Centre of the accepted band, when the gate has one. */
  targetWinRate: number | null;
  /** Half-width of the accepted band around `targetWinRate`, when the gate has one. */
  tolerance: number | null;
  /** Inclusive lower bound on `winRate`, when the gate has one. */
  minWinRate: number | null;
  /** Inclusive upper bound on `winRate`, when the gate has one. */
  maxWinRate: number | null;
  /** True when `matches` equals the required match count. */
  matchCountPassed: boolean;
  /** True when `winRate` satisfies the gate's bounds. */
  winRatePassed: boolean;
  /** True when both `matchCountPassed` and `winRatePassed` hold. */
  passed: boolean;
}
|
||
|
||
/** Outcome of one seat assignment of a seed, as stored in the per-seed aggregate. */
interface SelfPlaySeedSeatResult {
  /** Team that was tracked in this match. */
  trackedTeam: 0 | 1;
  /** Result relative to the tracked team. */
  trackedResult: 'win' | 'loss' | 'draw';
  /** Winning team, or null when there was none. */
  winner: 0 | 1 | null;
  /** Number of rounds played. */
  rounds: number;
  /** Whether the match was cut off at the round limit. */
  truncated: boolean;
  /** Final total points, indexed by team. */
  totalPoints: [number, number];
}
|
||
|
||
/** Aggregated results of every match played with the same seed. */
interface SelfPlaySeedAggregateResult {
  /** Seed the matches share. */
  seed: number;
  /** Number of matches aggregated for this seed. */
  matches: number;
  wins: number;
  losses: number;
  draws: number;
  /** True when the tracked team lost at least two matches of this seed. */
  dualLoss: boolean;
  /** Per-match details, ordered by tracked team. */
  seatResults: SelfPlaySeedSeatResult[];
}
|
||
|
||
/** Full report for one self-play suite. */
interface SelfPlaySuiteSummary {
  /** Suite identifier. */
  suite: SelfPlaySuiteId;
  /** Human-readable suite name. */
  label: string;
  /** Difficulty configured for the tracked team. */
  trackedTeamDifficulty: Difficulty;
  /** Difficulty configured for the opposing team. */
  opponentDifficulty: Difficulty;
  /** Matches actually played. */
  matches: number;
  /** Matches the suite configuration requires. */
  requiredMatches: number;
  /** Number of configured match seeds. */
  seedCount: number;
  /** Whether the suite was played with seat swaps for every seed. */
  seatBalanced: boolean;
  wins: number;
  losses: number;
  draws: number;
  /** `wins / matches` (0 when no match was played). */
  winRate: number;
  /** `losses / matches` (0 when no match was played). */
  lossRate: number;
  /** Per-seed aggregates, ordered by seed. */
  perSeed: SelfPlaySeedAggregateResult[];
  /** Seeds whose aggregate is flagged `dualLoss`. */
  dualLossSeeds: number[];
  /** Copy of `KNOWN_REGRESSION_WATCHLIST`. */
  regressionWatchlist: number[];
  /** Dual-loss seeds that are also on the regression watchlist. */
  regressionWatchlistDualLossIntersection: number[];
  simulatedTiming: {
    /** Sum of the simulated time of every decision in the suite, in milliseconds. */
    suiteSimulatedMs: number;
    /** Statistics over the tracked team's decision timings. */
    trackedTeamDecisions: TimingSummary;
  };
  /** Raw per-match results. */
  results: SelfPlayMatchResult[];
}
|
||
|
||
/** Complete report produced by `runAIBenchmark`. */
export interface AIBenchmarkSummary {
  benchmark: 'ai-quality';
  /** Pass/fail verdicts of the iteration 6 quality gate and its four sub-gates. */
  qualityGate: {
    iteration: 6;
    /** True only when every sub-gate passed. */
    passed: boolean;
    /** Production/reference agreement over all fixed fixtures. */
    fixedFixtures: GateCountSummary;
    /** Expected-move checks over the critical fixtures. */
    criticalConcepts: GateCountSummary;
    mirrorParity: WinRateGateSummary;
    beginnerDominance: WinRateGateSummary;
  };
  /** Number of fixed fixtures (same value as `fixtureTotals.fixtures`). */
  fixtureCount: number;
  /** Number of critical fixtures (same value as `fixtureTotals.criticalFixtures`). */
  criticalFixtureCount: number;
  fixtureTotals: {
    fixtures: number;
    criticalFixtures: number;
  };
  fixedSuite: {
    /** Fixtures where production and reference chose the same move. */
    fixedFixtureAgreements: number;
    /** Fixtures where the production move matched the expected move. */
    expectedPasses: number;
    /** Critical fixtures whose concept gate passed. */
    criticalPasses: number;
    /** Ids of fixtures where production and reference disagreed. */
    fixedFixtureAgreementFailures: string[];
    /** Ids of critical fixtures whose concept gate failed. */
    criticalPassFailures: string[];
    simulatedTiming: {
      productionSuiteSimulatedMs: number;
      referenceSuiteSimulatedMs: number;
      productionMasterDecisions: TimingSummary;
      referenceMasterDecisions: TimingSummary;
    };
    /** Raw per-fixture results. */
    results: FixedFixtureResult[];
  };
  selfPlaySuites: {
    /** Matches played across both self-play suites. */
    totalMatches: number;
    mirrorParity: SelfPlaySuiteSummary;
    beginnerDominance: SelfPlaySuiteSummary;
  };
  /** Decision-timing statistics gathered from every suite. */
  timing: {
    fixedFixtureProductionMasterDecisions: TimingSummary;
    fixedFixtureReferenceMasterDecisions: TimingSummary;
    mirrorTrackedTeamSimulatedDecisions: TimingSummary;
    beginnerTrackedTeamSimulatedDecisions: TimingSummary;
    /** Fixed-fixture production timings plus both suites' tracked-team timings. */
    allTrackedProductionSimulatedDecisions: TimingSummary;
  };
  /** Search profile used for the reference decisions. */
  referenceProfile: Required<AISearchProfileOverride>;
}
|
||
|
||
/** Thresholds of the iteration 6 quality gate. */
const ITERATION_6_GATE = {
  /** Exact number of mirror-parity matches required. */
  mirrorMatchTarget: 500,
  /** Exact number of beginner-dominance matches required. */
  beginnerMatchTarget: 500,
  /** Centre of the accepted mirror-parity win-rate band. */
  mirrorTargetWinRate: 0.5,
  /** Half-width of the accepted mirror-parity win-rate band. */
  mirrorWinRateTolerance: 0.05,
  /** Minimum tracked-team win rate in the beginner-dominance suite. */
  beginnerMinWinRate: 0.7,
} as const;
|
||
|
||
/** Match seeds that are reported separately when they show up among the dual-loss seeds. */
const KNOWN_REGRESSION_WATCHLIST = [1000, 1002, 1004, 1006, 1012, 1013, 1014] as const;

/** Set view of `KNOWN_REGRESSION_WATCHLIST` for membership checks. */
const KNOWN_REGRESSION_WATCHLIST_SET = new Set<number>(KNOWN_REGRESSION_WATCHLIST);
|
||
|
||
/** Search profile override passed to `chooseMove` for the reference decisions of the fixed fixture suite. */
const REFERENCE_PROFILE: Required<AISearchProfileOverride> = {
  timeBudgetMs: 9000,
  sampleCount: 12,
  maxDepth: 7,
  batchSize: 2,
};
|
||
|
||
/** Tracked-team assignments played for every seed (each seed is played once per entry). */
const SELF_PLAY_SEAT_SWAPS = [0, 1] as const;

/** Match seeds of the self-play suites: 250 consecutive integers starting at 1000. */
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 250 }, (_, index) => 1000 + index);

/** Round limit after which an undecided self-play match is marked as truncated. */
const MAX_SELF_PLAY_ROUNDS = 20;
|
||
|
||
// NOTE(review): the head-to-head constants below are consumed outside the part
// of the file reviewed here; the descriptions only restate their values.

/** Head-to-head match seeds: 100 consecutive integers starting at 2000. */
const HEAD_TO_HEAD_SEEDS = Array.from({ length: 100 }, (_, i) => 2000 + i);

/** Seat assignments used for head-to-head matches. */
const HEAD_TO_HEAD_SEAT_SWAPS = [0, 1] as const;

/** Target win rate for the master head-to-head comparison. */
const HEAD_TO_HEAD_MASTER_TARGET_WIN_RATE = 0.60;

/** Target win rate for the advanced head-to-head comparison. */
const HEAD_TO_HEAD_ADVANCED_TARGET_WIN_RATE = 0.55;
|
||
|
||
/** Static configuration of one self-play suite. */
interface SelfPlaySuiteConfig {
  /** Suite identifier. */
  id: SelfPlaySuiteId;
  /** Human-readable name used in progress logs and the summary. */
  label: string;
  /** Constant mixed into every RNG seed derived for matches of this suite. */
  suiteSeedKey: number;
  /** Number of matches the suite is expected to play. */
  requiredMatches: number;
  /** Difficulty reported for the tracked team. */
  trackedTeamDifficulty: Difficulty;
  /** Difficulty reported for the opposing team. */
  opponentDifficulty: Difficulty;
  /**
   * Returns the difficulty played by each team, indexed by team number, for
   * the given tracked-team assignment.
   */
  getTeamDifficulties(trackedTeam: 0 | 1): readonly [Difficulty, Difficulty];
}
|
||
|
||
/** Configuration of every self-play suite, keyed by suite id. */
const SELF_PLAY_SUITES: Record<SelfPlaySuiteId, SelfPlaySuiteConfig> = {
  // Both teams play at master level regardless of which team is tracked.
  'mirror-parity': {
    id: 'mirror-parity',
    label: 'Master mirror parity',
    suiteSeedKey: 0x4d31,
    requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
    trackedTeamDifficulty: 'master',
    opponentDifficulty: 'master',
    getTeamDifficulties: () => ['master', 'master'],
  },
  // The tracked team plays master; the other team plays beginner.
  'beginner-dominance': {
    id: 'beginner-dominance',
    label: 'Master versus beginner dominance',
    suiteSeedKey: 0x4236,
    requiredMatches: SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length,
    trackedTeamDifficulty: 'master',
    opponentDifficulty: 'beginner',
    getTeamDifficulties: trackedTeam => (trackedTeam === 0
      ? ['master', 'beginner']
      : ['beginner', 'master']),
  },
};
|
||
|
||
/**
 * Verifies that the benchmark configuration is consistent with the iteration 6
 * gate before any match is played.
 *
 * @throws Error when there are no fixtures, no critical fixtures, or when the
 *   seed/seat-swap match count differs from a gate target or from a suite's
 *   `requiredMatches`.
 */
function assertIteration6BenchmarkContract(): void {
  const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
  // Every seed is played once per seat assignment.
  const expectedSeatBalancedMatches = SELF_PLAY_MATCH_SEEDS.length * SELF_PLAY_SEAT_SWAPS.length;

  if (AI_BENCHMARK_FIXTURES.length === 0) {
    throw new Error('Iteration 6 benchmark requires at least one fixed fixture.');
  }

  if (criticalFixtureCount === 0) {
    throw new Error('Iteration 6 benchmark requires at least one critical concept fixture.');
  }

  if (expectedSeatBalancedMatches !== ITERATION_6_GATE.mirrorMatchTarget) {
    throw new Error(
      `Iteration 6 benchmark expects ${ITERATION_6_GATE.mirrorMatchTarget} mirror matches, received ${expectedSeatBalancedMatches}.`,
    );
  }

  if (expectedSeatBalancedMatches !== ITERATION_6_GATE.beginnerMatchTarget) {
    throw new Error(
      `Iteration 6 benchmark expects ${ITERATION_6_GATE.beginnerMatchTarget} beginner-dominance matches, received ${expectedSeatBalancedMatches}.`,
    );
  }

  for (const suite of Object.values(SELF_PLAY_SUITES)) {
    if (suite.requiredMatches !== expectedSeatBalancedMatches) {
      throw new Error(
        `Iteration 6 benchmark expects ${expectedSeatBalancedMatches} matches for ${suite.id}, received ${suite.requiredMatches}.`,
      );
    }
  }
}
|
||
|
||
/** An `AITimingSource` that can also report how far its clock has moved since creation. */
interface SimulatedBenchmarkTimingSource extends AITimingSource {
  /** Returns the milliseconds elapsed on the simulated clock since it was created. */
  getElapsedMs(): number;
}
|
||
|
||
/**
 * Creates a simulated clock that only moves when `advance` is called.
 *
 * @param startMs Initial clock reading in milliseconds (defaults to 0).
 * @returns A timing source flagged `isSimulated`, whose `now` returns the
 *   current reading, whose `advance` adds to the reading and returns the new
 *   value, and whose `getElapsedMs` returns the reading minus `startMs`.
 */
function createSimulatedBenchmarkTimingSource(startMs = 0): SimulatedBenchmarkTimingSource {
  let clockMs = startMs;

  const advance = (elapsedMs: number): number => {
    clockMs += elapsedMs;
    return clockMs;
  };

  return {
    isSimulated: true,
    now: () => clockMs,
    advance,
    getElapsedMs: () => clockMs - startMs,
  };
}
|
||
|
||
/**
 * Combines any number of integers into one unsigned 32-bit seed.
 *
 * Each part is coerced to an unsigned 32-bit integer, XOR-ed into the running
 * hash, and the hash is then multiplied by 16777619 with 32-bit wrap-around.
 * The hash starts at 2166136261, which is also the result for no parts.
 *
 * @param parts Values to mix, in order; order affects the result.
 * @returns An integer in the range [0, 2^32).
 */
function seedFromParts(...parts: number[]): number {
  const mixed = parts.reduce(
    (hash, part) => Math.imul(hash ^ (part >>> 0), 16777619),
    2166136261,
  );
  return mixed >>> 0;
}
|
||
|
||
/**
 * Creates a deterministic pseudo-random generator from a 32-bit seed.
 *
 * @param seed Seed value; it is coerced to an unsigned 32-bit integer.
 * @returns A function that yields the next number in [0, 1) on every call.
 *   Two generators created from the same seed yield the same sequence.
 */
function createMulberry32(seed: number): () => number {
  let state = seed >>> 0;

  return () => {
    // Advance the 32-bit state, then scramble it; the exact arithmetic must
    // stay as is, otherwise every seeded benchmark result changes.
    state = (state + 0x6d2b79f5) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    // Convert the final unsigned 32-bit value to [0, 1).
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };
}
|
||
|
||
/**
 * Builds a string key that identifies a move independently of capture order.
 *
 * @param move Move to encode.
 * @returns The played card id, a `|`, then the captured card ids sorted and
 *   joined with commas (empty after the `|` when nothing is captured).
 */
function moveKey(move: AIMove): string {
  // `map` returns a fresh array, so sorting does not reorder `move.capture`.
  const capturedIds = move.capture.map(card => card.id).sort();
  return `${move.card.id}|${capturedIds.join(',')}`;
}
|
||
|
||
/**
 * Returns the opposing team.
 *
 * @param team Team index, 0 or 1.
 * @returns 1 for team 0 and 0 for team 1.
 */
function otherTeam(team: 0 | 1): 0 | 1 {
  if (team === 0) {
    return 1;
  }
  return 0;
}
|
||
|
||
/**
 * Creates a `CardTracker` primed with a position's already-collected cards.
 *
 * @param state Game state to read; it is not modified here.
 * @returns A new tracker on which `trackPlay` has been called once for every
 *   card found in any player's `pile`.
 */
function createTrackerForState(state: GameState): CardTracker {
  const tracker = new CardTracker();

  for (const { pile } of state.players) {
    for (const card of pile) {
      tracker.trackPlay(card);
    }
  }

  return tracker;
}
|
||
|
||
/**
 * Checks whether a chosen move satisfies a fixture's expectation.
 *
 * @param move Move chosen by the AI.
 * @param expected Expected card id and, optionally, the expected capture ids.
 * @returns False when the played card differs. Otherwise true when
 *   `expected.captureIds` is absent, or when the captured card ids equal the
 *   expected ids irrespective of order.
 */
function matchesExpectedMove(move: AIMove, expected: AIBenchmarkExpectedMove): boolean {
  if (move.card.id !== expected.cardId) {
    return false;
  }

  // Without expected capture ids, only the played card is constrained.
  if (!expected.captureIds) {
    return true;
  }

  const actualKey = move.capture.map(card => card.id).sort().join(',');
  const expectedKey = [...expected.captureIds].sort().join(',');
  return actualKey === expectedKey;
}
|
||
|
||
/**
 * Evaluates every fixed fixture twice at master difficulty: once with the
 * default search settings ("production") and once with `REFERENCE_PROFILE`
 * ("reference"), each on its own cloned state, tracker, RNG stream, and
 * simulated clock.
 *
 * @returns Per-fixture results, the per-decision simulated timings of both
 *   runs, and the sum of each timing list.
 */
async function runFixedFixtureSuite(): Promise<{
  results: FixedFixtureResult[];
  productionSuiteSimulatedMs: number;
  referenceSuiteSimulatedMs: number;
  productionTimings: number[];
  referenceTimings: number[];
}> {
  const results: FixedFixtureResult[] = [];
  const productionTimings: number[] = [];
  const referenceTimings: number[] = [];
  const fixtureTotal = AI_BENCHMARK_FIXTURES.length;

  logBenchmarkProgress(`Starting fixed fixture suite (${AI_BENCHMARK_FIXTURES.length} positions).`);

  for (const [index, fixture] of AI_BENCHMARK_FIXTURES.entries()) {
    // Independent copies so neither run can observe the other's mutations.
    const productionState = cloneState(fixture.state);
    const referenceState = cloneState(fixture.state);
    const productionTracker = createTrackerForState(productionState);
    const referenceTracker = createTrackerForState(referenceState);

    // The trailing 0/1 gives production and reference distinct RNG streams.
    const productionSeed = seedFromParts(0x0f1e2d3c, index, 0);
    const referenceSeed = seedFromParts(0x0f1e2d3c, index, 1);
    const productionTimingSource = createSimulatedBenchmarkTimingSource();
    const referenceTimingSource = createSimulatedBenchmarkTimingSource();

    const productionMove = await chooseMove(
      productionState,
      productionState.currentPlayer,
      'master',
      productionTracker,
      undefined,
      {
        rng: createMulberry32(productionSeed),
        timingSource: productionTimingSource,
      },
    );
    const productionSimulatedMs = productionTimingSource.getElapsedMs();

    const referenceMove = await chooseMove(
      referenceState,
      referenceState.currentPlayer,
      'master',
      referenceTracker,
      undefined,
      {
        rng: createMulberry32(referenceSeed),
        profileOverride: REFERENCE_PROFILE,
        timingSource: referenceTimingSource,
      },
    );
    const referenceSimulatedMs = referenceTimingSource.getElapsedMs();

    productionTimings.push(productionSimulatedMs);
    referenceTimings.push(referenceSimulatedMs);

    // Derive each comparison once and reuse it for the result and the log line.
    const productionKey = moveKey(productionMove);
    const referenceKey = moveKey(referenceMove);
    const matchesReference = productionKey === referenceKey;
    const expectedPass = matchesExpectedMove(productionMove, fixture.expectedMove);
    // Only critical fixtures take part in the concept gate.
    const conceptGatePass = isCriticalAIBenchmarkFixture(fixture) ? expectedPass : null;

    results.push({
      fixtureId: fixture.id,
      name: fixture.name,
      tags: [...fixture.tags],
      criticalConcept: fixture.criticalConcept,
      productionMove: productionKey,
      referenceMove: referenceKey,
      matchesReference,
      expectedPass,
      conceptGatePass,
      productionSimulatedMs,
      referenceSimulatedMs,
    });

    const progressLabel = `${index + 1}/${fixtureTotal}`;
    const matchLabel = matchesReference ? 'agreement' : 'divergence';
    logBenchmarkProgress(
      `Fixture ${progressLabel}: ${fixture.id} -> ${matchLabel}, production simulated ${formatDurationMs(productionSimulatedMs)}, reference simulated ${formatDurationMs(referenceSimulatedMs)}.`,
    );
  }

  return {
    results,
    productionSuiteSimulatedMs: sumTimings(productionTimings),
    referenceSuiteSimulatedMs: sumTimings(referenceTimings),
    productionTimings,
    referenceTimings,
  };
}
|
||
|
||
/**
 * Adds up a list of durations.
 *
 * @param samples Durations to add.
 * @returns The sum of all samples, or 0 for an empty list.
 */
function sumTimings(samples: number[]): number {
  let total = 0;
  for (const sample of samples) {
    total += sample;
  }
  return total;
}
|
||
|
||
/**
 * Computes count, mean, 95th percentile, and maximum of a list of durations.
 *
 * The percentile uses the nearest-rank method: the value at 1-based rank
 * `ceil(0.95 * count)` of the ascending samples.
 *
 * @param samples Durations in milliseconds; the array is not modified.
 * @returns The summary; all fields are 0 when `samples` is empty.
 */
function summarizeTimings(samples: number[]): TimingSummary {
  if (samples.length === 0) {
    return {
      count: 0,
      averageMs: 0,
      p95Ms: 0,
      maxMs: 0,
    };
  }

  const ascending = [...samples].sort((left, right) => left - right);
  const count = ascending.length;
  const total = ascending.reduce((accumulator, value) => accumulator + value, 0);
  // Compute the rank as (count * 95) / 100 so that binary rounding of 0.95
  // can never push an exact rank up by one.
  const p95Rank = Math.ceil((count * 95) / 100);
  const p95Index = Math.min(count - 1, Math.max(0, p95Rank - 1));

  return {
    count,
    averageMs: total / count,
    p95Ms: ascending[p95Index],
    maxMs: ascending[count - 1],
  };
}
|
||
|
||
/**
 * Groups self-play results by seed.
 *
 * @param results Match results of one suite.
 * @returns `perSeed`: one aggregate per seed, ordered by seed, with its seat
 *   results ordered by tracked team and `dualLoss` set when the seed has at
 *   least two losses. `dualLossSeeds`: the seeds flagged `dualLoss`.
 *   `regressionWatchlistDualLossIntersection`: the dual-loss seeds that are
 *   also in `KNOWN_REGRESSION_WATCHLIST_SET`.
 */
function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
  perSeed: SelfPlaySeedAggregateResult[];
  dualLossSeeds: number[];
  regressionWatchlistDualLossIntersection: number[];
} {
  const bySeed = new Map<number, SelfPlaySeedAggregateResult>();

  for (const result of results) {
    let aggregate = bySeed.get(result.seed);
    if (aggregate === undefined) {
      aggregate = {
        seed: result.seed,
        matches: 0,
        wins: 0,
        losses: 0,
        draws: 0,
        dualLoss: false,
        seatResults: [],
      };
      bySeed.set(result.seed, aggregate);
    }

    aggregate.matches++;
    switch (result.trackedResult) {
      case 'win':
        aggregate.wins++;
        break;
      case 'loss':
        aggregate.losses++;
        break;
      default:
        aggregate.draws++;
        break;
    }

    aggregate.seatResults.push({
      trackedTeam: result.trackedTeam,
      trackedResult: result.trackedResult,
      winner: result.winner,
      rounds: result.rounds,
      truncated: result.truncated,
      totalPoints: result.totalPoints,
    });
  }

  const perSeed = [...bySeed.values()]
    .map(aggregate => ({
      ...aggregate,
      // Computed once all of the seed's matches have been counted.
      dualLoss: aggregate.losses >= 2,
      seatResults: [...aggregate.seatResults].sort((left, right) => left.trackedTeam - right.trackedTeam),
    }))
    .sort((left, right) => left.seed - right.seed);

  const dualLossSeeds = perSeed
    .filter(aggregate => aggregate.dualLoss)
    .map(aggregate => aggregate.seed);
  const regressionWatchlistDualLossIntersection = dualLossSeeds
    .filter(seed => KNOWN_REGRESSION_WATCHLIST_SET.has(seed));

  return {
    perSeed,
    dualLossSeeds,
    regressionWatchlistDualLossIntersection,
  };
}
|
||
|
||
/**
 * Plays one complete self-play match with seeded randomness and simulated
 * clocks.
 *
 * Rounds are played until `getMatchOutcome` reports that the match should not
 * continue, or until `MAX_SELF_PLAY_ROUNDS` rounds have been played, in which
 * case the result is marked `truncated`. Total points are carried over from
 * round to round and the dealer advances with `nextPlayer`.
 *
 * NOTE(review): none of the RNG seeds derived here depend on `trackedTeam`.
 * For a suite whose `getTeamDifficulties` ignores the seat (mirror-parity),
 * both seat swaps of a seed therefore appear to replay the same game with only
 * the tracked side flipped, which would pin the mirror win rate near 50% —
 * confirm against `chooseMove` whether any other input varies.
 *
 * @param suite Suite configuration (seed key and team difficulties).
 * @param seed Match seed; also selects the first dealer via `seed % 4`.
 * @param trackedTeam Team whose result and decision timings are recorded.
 * @returns The match result, the tracked team's per-decision simulated
 *   timings, and the simulated time summed over every decision of the match.
 */
async function simulateSelfPlayMatch(
  suite: SelfPlaySuiteConfig,
  seed: number,
  trackedTeam: 0 | 1,
): Promise<{ result: SelfPlayMatchResult; trackedTimings: number[]; simulatedMatchMs: number }> {
  const initialDealer = (seed % 4) as PlayerIndex;
  const teamDifficulties = suite.getTeamDifficulties(trackedTeam);
  // RNG used to set up a round; it depends only on suite, seed, and round number.
  const createDealRng = (round: number): (() => number) =>
    createMulberry32(seedFromParts(suite.suiteSeedKey, seed, round, 0));

  let state = createInitialState(initialDealer, createDealRng(1));
  const matchStartingPlayer = state.matchStartingPlayer;
  const tracker = new CardTracker();
  const trackedTimings: number[] = [];
  let simulatedMatchMs = 0;

  let rounds = 1;
  let truncated = false;
  // Counts turns across the whole match; it is not reset between rounds.
  let turnCount = 0;

  while (rounds <= MAX_SELF_PLAY_ROUNDS) {
    while (!state.roundOver) {
      const playerIdx = state.currentPlayer;
      const actingTeam = teamOf(playerIdx);
      const difficulty = teamDifficulties[actingTeam];
      // A fresh clock per decision, so its elapsed time is that decision's cost.
      const timingSource = createSimulatedBenchmarkTimingSource();
      const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, {
        rng: createMulberry32(seedFromParts(suite.suiteSeedKey, seed, rounds, turnCount, playerIdx)),
        timingSource,
      });
      const simulatedMs = timingSource.getElapsedMs();
      simulatedMatchMs += simulatedMs;

      if (actingTeam === trackedTeam) {
        trackedTimings.push(simulatedMs);
      }

      const { nextState, capture } = applyMove(
        state,
        playerIdx,
        move.card,
        move.capture.length > 0 ? move.capture : undefined,
      );
      tracker.trackPlay(move.card);
      if (capture) {
        tracker.trackCapture(capture.captured);
      }
      state = nextState;
      turnCount++;
    }

    if (!getMatchOutcome(state.teamScores).continueMatch) {
      break;
    }

    // Still undecided at the round limit: stop and flag the match.
    if (rounds === MAX_SELF_PLAY_ROUNDS) {
      truncated = true;
      break;
    }

    rounds++;

    // Start the next round, carrying the match-level values over.
    const carriedTotals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
    const nextDealer = nextPlayer(state.dealer);
    tracker.reset();
    state = createInitialState(nextDealer, createDealRng(rounds));
    state.matchStartingPlayer = matchStartingPlayer;
    state.teamScores[0].totalPoints = carriedTotals[0];
    state.teamScores[1].totalPoints = carriedTotals[1];
    state.roundNumber = rounds;
  }

  const { winner } = getMatchOutcome(state.teamScores);
  const timingSummary = summarizeTimings(trackedTimings);
  const opposingTeam = otherTeam(trackedTeam);

  let trackedResult: 'win' | 'loss' | 'draw';
  if (winner === null) {
    trackedResult = 'draw';
  } else if (winner === trackedTeam) {
    trackedResult = 'win';
  } else {
    trackedResult = 'loss';
  }

  return {
    result: {
      suite: suite.id,
      seed,
      dealer: initialDealer,
      trackedTeam,
      trackedTeamDifficulty: teamDifficulties[trackedTeam],
      opponentDifficulty: teamDifficulties[opposingTeam],
      winner,
      trackedResult,
      rounds,
      truncated,
      totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
      trackedDecisionCount: timingSummary.count,
      trackedAverageSimulatedDecisionMs: timingSummary.averageMs,
      trackedMaxSimulatedDecisionMs: timingSummary.maxMs,
    },
    trackedTimings,
    simulatedMatchMs,
  };
}
|
||
|
||
/**
 * Plays every seed of `SELF_PLAY_MATCH_SEEDS` once per entry of
 * `SELF_PLAY_SEAT_SWAPS` for the given suite, one match after another.
 *
 * A progress line is logged after the first match, after every 25th match,
 * and after the match that reaches `suite.requiredMatches`.
 *
 * @param suite Suite to run.
 * @returns All match results, the summed simulated time of the suite, and the
 *   tracked team's decision timings concatenated across matches.
 */
async function runSelfPlaySuite(
  suite: SelfPlaySuiteConfig,
): Promise<{ results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] }> {
  const results: SelfPlayMatchResult[] = [];
  const trackedTeamTimings: number[] = [];
  let suiteSimulatedMs = 0;
  let completedMatches = 0;

  logBenchmarkProgress(`Starting ${suite.label} suite (${suite.requiredMatches} seeded matches with seat swaps).`);

  for (const seed of SELF_PLAY_MATCH_SEEDS) {
    for (const trackedTeam of SELF_PLAY_SEAT_SWAPS) {
      const match = await simulateSelfPlayMatch(suite, seed, trackedTeam);
      const { result } = match;

      results.push(result);
      trackedTeamTimings.push(...match.trackedTimings);
      suiteSimulatedMs += match.simulatedMatchMs;
      completedMatches++;

      const shouldReport = completedMatches === 1
        || completedMatches % 25 === 0
        || completedMatches === suite.requiredMatches;
      if (shouldReport) {
        logBenchmarkProgress(
          `${suite.label} ${completedMatches}/${suite.requiredMatches}: seed ${seed}, tracked team ${trackedTeam}, result ${result.trackedResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.trackedMaxSimulatedDecisionMs)}.`,
        );
      }
    }
  }

  return {
    results,
    suiteSimulatedMs,
    trackedTeamTimings,
  };
}
|
||
|
||
/**
 * Builds the report of a self-play suite from its raw run data.
 *
 * @param suite Configuration of the suite that was run.
 * @param run Results, summed simulated time, and tracked-team timings as
 *   returned by `runSelfPlaySuite`.
 * @returns The suite summary. Win and loss rates are 0 when no match was
 *   played. `seatBalanced` is true when every seed present in the results has
 *   exactly one match per entry of `SELF_PLAY_SEAT_SWAPS`.
 */
function buildSelfPlaySuiteSummary(
  suite: SelfPlaySuiteConfig,
  run: { results: SelfPlayMatchResult[]; suiteSimulatedMs: number; trackedTeamTimings: number[] },
): SelfPlaySuiteSummary {
  // Count the three outcomes in one pass.
  const tally = { win: 0, loss: 0, draw: 0 };
  for (const result of run.results) {
    tally[result.trackedResult]++;
  }

  const matchCount = run.results.length;
  const { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection } = summarizeSelfPlayBySeed(run.results);

  // Derived from the data instead of assumed, so an unbalanced run is visible
  // in the report.
  const seatBalanced = perSeed.every(aggregate =>
    SELF_PLAY_SEAT_SWAPS.every(seat =>
      aggregate.seatResults.filter(entry => entry.trackedTeam === seat).length === 1));

  return {
    suite: suite.id,
    label: suite.label,
    trackedTeamDifficulty: suite.trackedTeamDifficulty,
    opponentDifficulty: suite.opponentDifficulty,
    matches: matchCount,
    requiredMatches: suite.requiredMatches,
    seedCount: SELF_PLAY_MATCH_SEEDS.length,
    seatBalanced,
    wins: tally.win,
    losses: tally.loss,
    draws: tally.draw,
    winRate: matchCount === 0 ? 0 : tally.win / matchCount,
    lossRate: matchCount === 0 ? 0 : tally.loss / matchCount,
    perSeed,
    dualLossSeeds,
    regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
    regressionWatchlistDualLossIntersection,
    simulatedTiming: {
      suiteSimulatedMs: run.suiteSimulatedMs,
      trackedTeamDecisions: summarizeTimings(run.trackedTeamTimings),
    },
    results: run.results,
  };
}
|
||
|
||
/**
 * Evaluates the mirror-parity gate.
 *
 * The gate passes when the suite played exactly
 * `ITERATION_6_GATE.mirrorMatchTarget` matches and its win rate lies within
 * `mirrorTargetWinRate ± mirrorWinRateTolerance` (bounds inclusive).
 *
 * @param summary Summary of the mirror-parity suite.
 * @returns The gate result, including the computed bounds.
 */
function createMirrorParityGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const {
    mirrorMatchTarget,
    mirrorTargetWinRate: targetWinRate,
    mirrorWinRateTolerance: tolerance,
  } = ITERATION_6_GATE;
  const minWinRate = targetWinRate - tolerance;
  const maxWinRate = targetWinRate + tolerance;
  const matchCountPassed = summary.matches === mirrorMatchTarget;
  const winRatePassed = summary.winRate >= minWinRate && summary.winRate <= maxWinRate;

  return {
    matches: summary.matches,
    requiredMatches: mirrorMatchTarget,
    wins: summary.wins,
    losses: summary.losses,
    draws: summary.draws,
    winRate: summary.winRate,
    targetWinRate,
    tolerance,
    minWinRate,
    maxWinRate,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}
|
||
|
||
/**
 * Evaluates the beginner-dominance gate.
 *
 * The gate passes when the suite played exactly
 * `ITERATION_6_GATE.beginnerMatchTarget` matches and its win rate is at least
 * `ITERATION_6_GATE.beginnerMinWinRate`.
 *
 * @param summary Summary of the beginner-dominance suite.
 * @returns The gate result; `targetWinRate`, `tolerance`, and `maxWinRate` are
 *   null because this gate only has a lower bound.
 */
function createBeginnerDominanceGate(summary: SelfPlaySuiteSummary): WinRateGateSummary {
  const { beginnerMatchTarget, beginnerMinWinRate } = ITERATION_6_GATE;
  const matchCountPassed = summary.matches === beginnerMatchTarget;
  const winRatePassed = summary.winRate >= beginnerMinWinRate;

  return {
    matches: summary.matches,
    requiredMatches: beginnerMatchTarget,
    wins: summary.wins,
    losses: summary.losses,
    draws: summary.draws,
    winRate: summary.winRate,
    targetWinRate: null,
    tolerance: null,
    minWinRate: beginnerMinWinRate,
    maxWinRate: null,
    matchCountPassed,
    winRatePassed,
    passed: matchCountPassed && winRatePassed,
  };
}
|
||
|
||
/**
 * Renders per-seed aggregates as a single line.
 *
 * @param perSeed Aggregates to render, in output order.
 * @returns Entries of the form `seed:{wins}W-{losses}L-{draws}D`, joined with
 *   ` | `; an empty string when there are no aggregates.
 */
function formatPerSeedAggregates(perSeed: SelfPlaySeedAggregateResult[]): string {
  const entries: string[] = [];
  for (const aggregate of perSeed) {
    entries.push(`${aggregate.seed}:${aggregate.wins}W-${aggregate.losses}L-${aggregate.draws}D`);
  }
  return entries.join(' | ');
}
|
||
|
||
/**
 * Prints a human-readable report of the benchmark to stdout, followed by a
 * `BENCHMARK_SUMMARY` marker line and the full summary as indented JSON.
 *
 * @param summary Benchmark summary to print.
 */
function printReadableSummary(summary: AIBenchmarkSummary): void {
  const { qualityGate, fixedSuite, timing } = summary;
  const mirror = summary.selfPlaySuites.mirrorParity;
  const beginner = summary.selfPlaySuites.beginnerDominance;
  // Seed lists are printed comma-separated, or as "none" when empty.
  const listOrNone = (seeds: number[]): string => (seeds.length > 0 ? seeds.join(', ') : 'none');

  console.log('AI quality benchmark');
  console.log(`Iteration 6 quality gate: ${qualityGate.passed ? 'PASS' : 'FAIL'}`);
  console.log(`Fixture totals: ${summary.fixtureTotals.fixtures} total, ${summary.fixtureTotals.criticalFixtures} critical.`);
  console.log(`Fixed-fixture gate: ${qualityGate.fixedFixtures.actual}/${qualityGate.fixedFixtures.total} agreements.`);
  console.log(`Critical concept gate: ${qualityGate.criticalConcepts.actual}/${qualityGate.criticalConcepts.total} passes.`);
  console.log(
    `Mirror parity gate: ${qualityGate.mirrorParity.matches}/${qualityGate.mirrorParity.requiredMatches} matches, ${formatPercentage(qualityGate.mirrorParity.winRate)} tracked-team win rate (target ${formatPercentage(qualityGate.mirrorParity.targetWinRate ?? 0)} +/- ${formatPercentage(qualityGate.mirrorParity.tolerance ?? 0)}).`,
  );
  console.log(
    `Beginner dominance gate: ${qualityGate.beginnerDominance.matches}/${qualityGate.beginnerDominance.requiredMatches} matches, ${formatPercentage(qualityGate.beginnerDominance.winRate)} master win rate (target >= ${formatPercentage(qualityGate.beginnerDominance.minWinRate ?? 0)}).`,
  );

  // Failure lists are only printed when there is something to report.
  if (fixedSuite.fixedFixtureAgreementFailures.length > 0) {
    console.log(`Fixed-fixture agreement failures: ${fixedSuite.fixedFixtureAgreementFailures.join(', ')}`);
  }
  if (fixedSuite.criticalPassFailures.length > 0) {
    console.log(`Critical concept failures: ${fixedSuite.criticalPassFailures.join(', ')}`);
  }

  console.log(`Mirror per-seed aggregates: ${formatPerSeedAggregates(mirror.perSeed)}`);
  console.log(`Mirror dual-loss seeds: ${listOrNone(mirror.dualLossSeeds)}`);
  console.log(
    `Mirror regression watchlist intersection: ${listOrNone(mirror.regressionWatchlistDualLossIntersection)} (watchlist ${mirror.regressionWatchlist.join(', ')})`,
  );
  console.log(`Beginner per-seed aggregates: ${formatPerSeedAggregates(beginner.perSeed)}`);
  console.log(`Beginner dual-loss seeds: ${listOrNone(beginner.dualLossSeeds)}`);
  console.log(
    `Beginner regression watchlist intersection: ${listOrNone(beginner.regressionWatchlistDualLossIntersection)} (watchlist ${beginner.regressionWatchlist.join(', ')})`,
  );

  console.log(
    `Fixed suite simulated duration: production ${formatDurationMs(fixedSuite.simulatedTiming.productionSuiteSimulatedMs)}, reference ${formatDurationMs(fixedSuite.simulatedTiming.referenceSuiteSimulatedMs)}.`,
  );
  console.log(`Mirror suite simulated duration: ${formatDurationMs(mirror.simulatedTiming.suiteSimulatedMs)}.`);
  console.log(`Beginner suite simulated duration: ${formatDurationMs(beginner.simulatedTiming.suiteSimulatedMs)}.`);
  console.log(
    `Simulated timing: fixed production avg ${timing.fixedFixtureProductionMasterDecisions.averageMs.toFixed(1)} ms, fixed reference avg ${timing.fixedFixtureReferenceMasterDecisions.averageMs.toFixed(1)} ms, mirror tracked avg ${timing.mirrorTrackedTeamSimulatedDecisions.averageMs.toFixed(1)} ms, beginner tracked avg ${timing.beginnerTrackedTeamSimulatedDecisions.averageMs.toFixed(1)} ms, aggregate avg ${timing.allTrackedProductionSimulatedDecisions.averageMs.toFixed(1)} ms.`,
  );

  console.log('BENCHMARK_SUMMARY');
  console.log(JSON.stringify(summary, null, 2));
}
|
||
|
||
/**
 * Runs the complete AI quality benchmark: the fixed fixture suite, then the
 * mirror-parity suite, then the beginner-dominance suite, and evaluates the
 * iteration 6 quality gate over their results.
 *
 * The overall gate passes only when production and reference agree on every
 * fixture, every critical fixture passes its concept check, and both win-rate
 * gates pass.
 *
 * @returns The full benchmark summary.
 * @throws Error when `assertIteration6BenchmarkContract` rejects the
 *   benchmark configuration.
 */
export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
  assertIteration6BenchmarkContract();
  logBenchmarkProgress('Benchmark started. Running fixed fixtures, mirror parity, then beginner dominance.');

  const mirrorSuite = SELF_PLAY_SUITES['mirror-parity'];
  const beginnerSuite = SELF_PLAY_SUITES['beginner-dominance'];

  const fixedSuite = await runFixedFixtureSuite();
  logBenchmarkProgress(
    `Fixed fixture suite complete with production ${formatDurationMs(fixedSuite.productionSuiteSimulatedMs)} simulated and reference ${formatDurationMs(fixedSuite.referenceSuiteSimulatedMs)} simulated.`,
  );

  const mirrorRun = await runSelfPlaySuite(mirrorSuite);
  logBenchmarkProgress(`Mirror parity suite complete in ${formatDurationMs(mirrorRun.suiteSimulatedMs)} simulated.`);

  const beginnerRun = await runSelfPlaySuite(beginnerSuite);
  logBenchmarkProgress(`Beginner dominance suite complete in ${formatDurationMs(beginnerRun.suiteSimulatedMs)} simulated.`);

  const mirrorSummary = buildSelfPlaySuiteSummary(mirrorSuite, mirrorRun);
  const beginnerSummary = buildSelfPlaySuiteSummary(beginnerSuite, beginnerRun);
  const mirrorParityGate = createMirrorParityGate(mirrorSummary);
  const beginnerDominanceGate = createBeginnerDominanceGate(beginnerSummary);

  const fixtureCount = AI_BENCHMARK_FIXTURES.length;
  const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
  const fixtureResults = fixedSuite.results;

  const fixedFixtureAgreements = fixtureResults.filter(result => result.matchesReference).length;
  const expectedPasses = fixtureResults.filter(result => result.expectedPass).length;
  // `conceptGatePass` is null for non-critical fixtures, so compare explicitly.
  const criticalPasses = fixtureResults.filter(result => result.conceptGatePass === true).length;
  const fixedFixtureAgreementFailures = fixtureResults
    .filter(result => !result.matchesReference)
    .map(result => result.fixtureId);
  const criticalPassFailures = fixtureResults
    .filter(result => result.conceptGatePass === false)
    .map(result => result.fixtureId);

  const fixedFixtureProductionMasterDecisions = summarizeTimings(fixedSuite.productionTimings);
  const fixedFixtureReferenceMasterDecisions = summarizeTimings(fixedSuite.referenceTimings);
  const mirrorTrackedTeamSimulatedDecisions = mirrorSummary.simulatedTiming.trackedTeamDecisions;
  const beginnerTrackedTeamSimulatedDecisions = beginnerSummary.simulatedTiming.trackedTeamDecisions;
  const allTrackedProductionSimulatedDecisions = summarizeTimings([
    ...fixedSuite.productionTimings,
    ...mirrorRun.trackedTeamTimings,
    ...beginnerRun.trackedTeamTimings,
  ]);

  // Both count gates require every evaluated item to succeed.
  const fixedFixtureGate: GateCountSummary = {
    actual: fixedFixtureAgreements,
    required: fixtureCount,
    total: fixtureCount,
    passed: fixedFixtureAgreements === fixtureCount,
  };
  const criticalConceptGate: GateCountSummary = {
    actual: criticalPasses,
    required: criticalFixtureCount,
    total: criticalFixtureCount,
    passed: criticalPasses === criticalFixtureCount,
  };

  const allGatesPassed = fixedFixtureGate.passed
    && criticalConceptGate.passed
    && mirrorParityGate.passed
    && beginnerDominanceGate.passed;

  return {
    benchmark: 'ai-quality',
    qualityGate: {
      iteration: 6,
      passed: allGatesPassed,
      fixedFixtures: fixedFixtureGate,
      criticalConcepts: criticalConceptGate,
      mirrorParity: mirrorParityGate,
      beginnerDominance: beginnerDominanceGate,
    },
    fixtureCount,
    criticalFixtureCount,
    fixtureTotals: {
      fixtures: fixtureCount,
      criticalFixtures: criticalFixtureCount,
    },
    fixedSuite: {
      fixedFixtureAgreements,
      expectedPasses,
      criticalPasses,
      fixedFixtureAgreementFailures,
      criticalPassFailures,
      simulatedTiming: {
        productionSuiteSimulatedMs: fixedSuite.productionSuiteSimulatedMs,
        referenceSuiteSimulatedMs: fixedSuite.referenceSuiteSimulatedMs,
        productionMasterDecisions: fixedFixtureProductionMasterDecisions,
        referenceMasterDecisions: fixedFixtureReferenceMasterDecisions,
      },
      results: fixtureResults,
    },
    selfPlaySuites: {
      totalMatches: mirrorSummary.matches + beginnerSummary.matches,
      mirrorParity: mirrorSummary,
      beginnerDominance: beginnerSummary,
    },
    timing: {
      fixedFixtureProductionMasterDecisions,
      fixedFixtureReferenceMasterDecisions,
      mirrorTrackedTeamSimulatedDecisions,
      beginnerTrackedTeamSimulatedDecisions,
      allTrackedProductionSimulatedDecisions,
    },
    // Return a copy so callers cannot mutate the module-level profile.
    referenceProfile: { ...REFERENCE_PROFILE },
  };
}
|
||
|
||
/**
 * Outcome of a single head-to-head match between the current AI (`./ai`)
 * and the legacy AI (`./ai-legacy`), as produced by `simulateHeadToHeadMatch`.
 */
interface HeadToHeadMatchResult {
  /** Suite this match belongs to. */
  suite: 'head-to-head-master' | 'head-to-head-advanced';

  /** Match seed; deals and per-turn RNG streams are derived from it. */
  seed: number;

  /** Dealer of the first round. */
  dealer: PlayerIndex;

  /** Team whose players were driven by the current AI. */
  newAITeam: 0 | 1;

  /** Difficulty both AIs were asked to play at. */
  newAIDifficulty: Difficulty;

  /** Winning team as reported by `getMatchOutcome`, or `null` when there is none. */
  winner: 0 | 1 | null;

  /** Result from the current AI's point of view (`'draw'` when `winner` is `null`). */
  newAIResult: 'win' | 'loss' | 'draw';

  /** Number of the last round that was played (1-based). */
  rounds: number;

  /** Final total points as `[team 0, team 1]`. */
  totalPoints: [number, number];
}
|
||
|
||
/**
 * Aggregated result of one head-to-head suite, as built by `runHeadToHeadBenchmark`.
 */
interface HeadToHeadSuiteSummary {
  /** Suite identifier. */
  suite: 'head-to-head-master' | 'head-to-head-advanced';

  /** Difficulty the suite was played at. */
  newAIDifficulty: Difficulty;

  /** Number of matches played (length of `results`). */
  matches: number;

  /** Matches whose `newAIResult` is `'win'`. */
  wins: number;

  /** Matches whose `newAIResult` is `'loss'`. */
  losses: number;

  /** Matches whose `newAIResult` is `'draw'`. */
  draws: number;

  /** `wins / matches`, or 0 when no match was played. */
  winRate: number;

  /** Minimum win rate the suite must reach. */
  targetWinRate: number;

  /** True when `winRate >= targetWinRate`. */
  passed: boolean;

  /** Per-match results, in the order the matches were played. */
  results: HeadToHeadMatchResult[];
}
|
||
|
||
/**
 * Per-suite constant mixed into every seed derived for a head-to-head match,
 * so the two suites do not share deal or RNG streams for the same match seed.
 * NOTE(review): the hex values read as ASCII "MB" / "AB" — presumably mnemonic; confirm.
 */
const HEAD_TO_HEAD_SUITE_SEED_KEYS: Record<'head-to-head-master' | 'head-to-head-advanced', number> = {
  'head-to-head-master': 0x4D42,
  'head-to-head-advanced': 0x4142,
};
|
||
|
||
/**
 * Plays one full match between the current AI (`chooseMove` from `./ai`) and
 * the legacy AI (`chooseMove` from `./ai-legacy`), both at `difficulty`.
 *
 * Players on `newAITeam` are driven by the current AI, all others by the
 * legacy AI. Every deal and every per-turn RNG is derived from the suite seed
 * key, `seed`, the round number and (for turns) the turn counter and player
 * index.
 *
 * A match stops as soon as `getMatchOutcome` reports it should not continue,
 * or after `MAX_SELF_PLAY_ROUNDS` rounds; in both cases the result is taken
 * from `getMatchOutcome` on the scores at that point.
 *
 * @param suite - Suite the match belongs to; selects the suite seed key.
 * @param difficulty - Difficulty passed to both AIs.
 * @param seed - Match seed; `seed % 4` also picks the first dealer.
 * @param newAITeam - Team played by the current AI.
 * @returns The match result from the current AI's point of view.
 */
async function simulateHeadToHeadMatch(
  suite: 'head-to-head-master' | 'head-to-head-advanced',
  difficulty: Difficulty,
  seed: number,
  newAITeam: 0 | 1,
): Promise<HeadToHeadMatchResult> {
  const suiteSeedKey = HEAD_TO_HEAD_SUITE_SEED_KEYS[suite];
  const initialDealer = (seed % 4) as PlayerIndex;
  let state = createInitialState(initialDealer, createMulberry32(seedFromParts(suiteSeedKey, seed, 1, 0)));
  const matchStartingPlayer = state.matchStartingPlayer;

  // Both AIs read the same tracker; only the current AI receives the inference engine.
  const tracker = new CardTracker();
  const inference = new CardInferenceEngine(tracker);

  let rounds = 1;
  // Counts turns across the whole match (not reset between rounds) and feeds the per-turn seed.
  let turnCount = 0;

  while (rounds <= MAX_SELF_PLAY_ROUNDS) {
    while (!state.roundOver) {
      const playerIdx = state.currentPlayer;
      const isNewAI = teamOf(playerIdx) === newAITeam;
      const timingSource = createSimulatedBenchmarkTimingSource();
      const rng = createMulberry32(seedFromParts(suiteSeedKey, seed, rounds, turnCount, playerIdx));

      const move: AIMove = isNewAI
        ? await chooseMove(state, playerIdx, difficulty, tracker, undefined, { rng, timingSource, inference })
        : await chooseMoveOld(state, playerIdx, difficulty, tracker, undefined, { rng, timingSource });

      // Snapshot the table before the move is applied; the inference engine is given the pre-move table.
      const tableBeforeMove = [...state.table];
      const { nextState, capture } = applyMove(
        state,
        playerIdx,
        move.card,
        move.capture.length > 0 ? move.capture : undefined,
      );

      tracker.trackPlay(move.card);
      if (capture) tracker.trackCapture(capture.captured);
      inference.onMove(playerIdx, move, tableBeforeMove);

      state = nextState;
      turnCount++;
    }

    // Stop when the match is decided, or when the round cap is reached without a decision.
    if (!getMatchOutcome(state.teamScores).continueMatch || rounds === MAX_SELF_PLAY_ROUNDS) {
      break;
    }

    // Set up the next round: rotate the dealer, clear per-round tracking and
    // carry the running totals and match-level fields over to the fresh state.
    rounds++;
    const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
    const nextDealer = nextPlayer(state.dealer);
    tracker.reset();
    inference.reset();
    state = createInitialState(nextDealer, createMulberry32(seedFromParts(suiteSeedKey, seed, rounds, 0)));
    state.matchStartingPlayer = matchStartingPlayer;
    state.teamScores[0].totalPoints = totals[0];
    state.teamScores[1].totalPoints = totals[1];
    state.roundNumber = rounds;
  }

  const { winner } = getMatchOutcome(state.teamScores);
  const newAIResult = winner === null ? 'draw' : winner === newAITeam ? 'win' : 'loss';

  return {
    suite,
    seed,
    dealer: initialDealer,
    newAITeam,
    newAIDifficulty: difficulty,
    winner,
    newAIResult,
    rounds,
    totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
  };
}
|
||
|
||
/**
 * Runs the head-to-head suites (master, then advanced) of the current AI
 * against the legacy AI.
 *
 * Each suite plays one match per (seed, seat swap) pair from
 * `HEAD_TO_HEAD_SEEDS` × `HEAD_TO_HEAD_SEAT_SWAPS`, sequentially, and logs
 * progress on the first match, every 25th match and the last match.
 *
 * @returns One summary per suite, in the order the suites were run.
 */
export async function runHeadToHeadBenchmark(): Promise<HeadToHeadSuiteSummary[]> {
  const suitePlans: Array<{
    suite: 'head-to-head-master' | 'head-to-head-advanced';
    difficulty: Difficulty;
    targetWinRate: number;
  }> = [
    { suite: 'head-to-head-master', difficulty: 'master', targetWinRate: HEAD_TO_HEAD_MASTER_TARGET_WIN_RATE },
    { suite: 'head-to-head-advanced', difficulty: 'advanced', targetWinRate: HEAD_TO_HEAD_ADVANCED_TARGET_WIN_RATE },
  ];

  const summaries: HeadToHeadSuiteSummary[] = [];

  for (const plan of suitePlans) {
    const { suite, difficulty, targetWinRate } = plan;
    const seedCount = HEAD_TO_HEAD_SEEDS.length;
    const swapCount = HEAD_TO_HEAD_SEAT_SWAPS.length;
    const totalMatches = seedCount * swapCount;
    const results: HeadToHeadMatchResult[] = [];

    logBenchmarkProgress(`Starting ${suite} (${totalMatches} matches: ${seedCount} seeds × ${swapCount} seat swaps).`);

    for (const seed of HEAD_TO_HEAD_SEEDS) {
      for (const newAITeam of HEAD_TO_HEAD_SEAT_SWAPS) {
        const result = await simulateHeadToHeadMatch(suite, difficulty, seed, newAITeam);
        results.push(result);

        // The number of stored results doubles as the completed-match counter.
        const done = results.length;
        const shouldReport = done === 1 || done % 25 === 0 || done === totalMatches;
        if (shouldReport) {
          logBenchmarkProgress(
            `${suite} ${done}/${totalMatches}: seed ${seed}, newAITeam ${newAITeam}, result ${result.newAIResult}, rounds ${result.rounds}.`,
          );
        }
      }
    }

    // Single pass over the results instead of one filter per outcome.
    const tally = { win: 0, loss: 0, draw: 0 };
    for (const { newAIResult } of results) {
      tally[newAIResult] += 1;
    }

    const matches = results.length;
    const winRate = matches > 0 ? tally.win / matches : 0;

    summaries.push({
      suite,
      newAIDifficulty: difficulty,
      matches,
      wins: tally.win,
      losses: tally.loss,
      draws: tally.draw,
      winRate,
      targetWinRate,
      passed: winRate >= targetWinRate,
      results,
    });
  }

  return summaries;
}
|
||
|
||
/**
 * Command-line entry point: runs the AI quality benchmark and prints its
 * summary, then runs the head-to-head suites and prints a short report
 * (wins, losses, ties and pass/fail against the target) for each.
 */
async function runBenchmarkCli(): Promise<void> {
  const qualitySummary = await runAIBenchmark();
  logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 6 gate results.');
  printReadableSummary(qualitySummary);

  logBenchmarkProgress('Starting HEAD_TO_HEAD benchmark (new AI vs legacy AI)...');
  const headToHeadSuites = await runHeadToHeadBenchmark();

  for (const { suite, matches, wins, losses, draws, winRate, targetWinRate, passed } of headToHeadSuites) {
    // Guard the loss-rate division for a suite that played no matches.
    const lossRate = matches === 0 ? 0 : losses / matches;
    const verdict = passed ? 'PASS' : 'FAIL';

    console.log(`\nHEAD_TO_HEAD: ${suite} (${matches} games)`);
    console.log(`New AI wins: ${wins} (${formatPercentage(winRate)})`);
    console.log(`Legacy AI wins: ${losses} (${formatPercentage(lossRate)})`);
    console.log(`Ties: ${draws}`);
    console.log(`Target win rate: ${formatPercentage(targetWinRate)} — ${verdict}`);
  }
}
|
||
|
||
// Start the benchmark automatically when the module is loaded in an
// environment that has no `window` global. The returned promise is
// deliberately not awaited.
const windowIsMissing = typeof window === 'undefined';
if (windowIsMissing) {
  void runBenchmarkCli();
}