feat(SCOPONE-0009) improve ai, dealer, apparigliare e sparigliare

This commit is contained in:
Giancarmine Salucci
2026-04-09 22:30:27 +02:00
parent d0a44d295a
commit 77ab1f43a6
8 changed files with 3787 additions and 510 deletions

View File

@@ -0,0 +1,543 @@
import { buildDeck, findCaptures, getOpeningPlayerForDealer } from './engine';
import { Card, GameState, Player, PlayerIndex, TeamScore } from './types';
/**
 * The move a benchmark fixture expects from the player to act.
 */
export interface AIBenchmarkExpectedMove {
/** Id of the card expected to be played; fixture validation requires it to be in the current player's hand. */
cardId: string;
/**
 * Ids of the table cards expected to be captured (order is irrelevant; ids are sorted before comparison).
 * When omitted, fixture validation skips the capture legality check.
 */
captureIds?: string[];
}
/**
 * Labels for the concepts a fixture can be tagged with via `criticalConcept`.
 * A fixture carrying one of these labels is treated as "critical"
 * (see `isCriticalAIBenchmarkFixture`).
 */
export type AIBenchmarkCriticalConcept =
| 'full-table-scopa'
| 'partner-scopa-setup'
| 'settebello-capture'
| 'anti-scopa-defense'
| 'dealer-rank-residue-preservation'
| 'exact-endgame-resolution';
/**
 * A fully built benchmark position: descriptive metadata, a complete game state
 * and the move expected from `state.currentPlayer`.
 */
export interface AIBenchmarkFixture {
/** Stable identifier of the fixture. */
id: string;
/** Human-readable title. */
name: string;
/** Explanation of what the position is meant to exercise. */
description: string;
/** Free-form labels attached to the fixture. */
tags: string[];
/** Critical concept label, or `null` when the fixture is not a critical one. */
criticalConcept: AIBenchmarkCriticalConcept | null;
/** Game state built from the raw fixture description. */
state: GameState;
/** Move the benchmark expects for this position. */
expectedMove: AIBenchmarkExpectedMove;
}
/**
 * Compact, id-based description of a fixture. `buildFixture` expands it into
 * a full `AIBenchmarkFixture`, filling unspecified hands and piles from the
 * cards that are not explicitly listed.
 */
interface RawFixture {
id: string;
name: string;
description: string;
tags: string[];
/** Optional critical concept; becomes `null` on the built fixture when omitted. */
criticalConcept?: AIBenchmarkCriticalConcept;
dealer: PlayerIndex;
/** Player whose move is being benchmarked. */
currentPlayer: PlayerIndex;
/** Required hand size per player; explicit hands must match it exactly. */
handSizes: [number, number, number, number];
/** Explicit hand card ids per player; `undefined` entries are filled from the remaining deck. */
hands: [string[] | undefined, string[] | undefined, string[] | undefined, string[] | undefined];
/** Card ids lying on the table. */
table: string[];
/** Explicit pile card ids per player. Either this or `pileCardCounts` must be provided. */
piles?: [string[], string[], string[], string[]];
/** Pile sizes per player, used to fill piles from the remaining deck when `piles` is omitted. */
pileCardCounts?: [number, number, number, number];
/** Per-player `scope` values; defaults to all zeros. */
scopes?: [number, number, number, number];
/** Per-team `totalPoints`; defaults to 0 for both teams. */
totalPoints?: [number, number];
/** Round number stored on the state; defaults to 1. */
roundNumber?: number;
/** Team stored as the last capturing team; defaults to `null`. */
lastCaptureTeam?: 0 | 1 | null;
expectedMove: AIBenchmarkExpectedMove;
}
/** Display names assigned to players 0..3 when fixture players are built. */
const PLAYER_NAMES = ['Tu', 'AI Ovest', 'Compagno', 'AI Est'] as const;
/** Lookup from card id to the card object produced by `buildDeck()`. */
const CARD_BY_ID = new Map(buildDeck().map(card => [card.id, card]));
/**
 * Reusable set of four explicit piles (four card ids per player, 16 in total)
 * shared by several fixtures through their `piles` field.
 */
const PILES_TEMPLATE_A: [string[], string[], string[], string[]] = [
['denara_1', 'coppe_7', 'spade_6', 'bastoni_8'],
['denara_3', 'coppe_1', 'spade_2', 'bastoni_5'],
['denara_6', 'coppe_4', 'spade_1', 'bastoni_7'],
['denara_10', 'coppe_6', 'spade_4', 'bastoni_2'],
];
/**
 * Alternative reusable set of four explicit piles (four card ids per player,
 * 16 in total) for fixtures whose hand/table cards would collide with template A.
 */
const PILES_TEMPLATE_B: [string[], string[], string[], string[]] = [
['denara_2', 'coppe_8', 'spade_5', 'bastoni_9'],
['denara_5', 'coppe_1', 'spade_2', 'bastoni_6'],
['denara_6', 'coppe_4', 'spade_1', 'bastoni_7'],
['denara_10', 'coppe_6', 'spade_9', 'bastoni_2'],
];
/**
 * Raw, id-based descriptions of every benchmark position.
 * Each entry is expanded and validated by `buildFixture`; hands listed as
 * `undefined` (and piles given only as `pileCardCounts`) are filled from the
 * cards not mentioned explicitly, so every entry must account for all 40 cards.
 * Entries with a `criticalConcept` are the critical fixtures.
 */
const RAW_FIXTURES: RawFixture[] = [
{
id: 'settebello-direct-capture',
name: 'Settebello Direct Capture',
description: 'The root player should take the settebello immediately when a direct seven match is available.',
tags: ['critical-settebello-capture', 'denari-race'],
criticalConcept: 'settebello-capture',
dealer: 3,
currentPlayer: 0,
handSizes: [5, 5, 5, 5],
hands: [[
'spade_7',
'denara_8',
'bastoni_6',
'coppe_9',
'denara_4',
], undefined, undefined, undefined],
table: ['denara_7', 'coppe_2', 'bastoni_4', 'spade_9'],
piles: PILES_TEMPLATE_A,
totalPoints: [6, 7],
expectedMove: {
cardId: 'spade_7',
captureIds: ['denara_7'],
},
},
{
id: 'anti-scopa-safe-dump',
name: 'Anti-Scopa Safe Dump',
description: 'The root player should prefer the safe high dump instead of taking a flashy but dangerous capture.',
tags: ['critical-anti-scopa', 'table-control'],
criticalConcept: 'anti-scopa-defense',
dealer: 0,
currentPlayer: 1,
handSizes: [5, 5, 5, 5],
hands: [undefined, [
'bastoni_9',
'coppe_3',
'spade_10',
'denara_5',
'coppe_8',
], undefined, undefined],
table: ['bastoni_1', 'coppe_5', 'denara_7', 'spade_8'],
piles: PILES_TEMPLATE_A,
totalPoints: [8, 8],
expectedMove: {
cardId: 'spade_10',
},
},
{
id: 'dealer-rank-residue-preserve-pair',
name: 'Dealer Rank Residue Preserve Pair',
description: 'The dealer should keep the double-nine structure intact and release the harmless low card.',
tags: ['critical-dealer-rank-residue', 'dealer-side-control'],
criticalConcept: 'dealer-rank-residue-preservation',
dealer: 3,
currentPlayer: 3,
handSizes: [5, 5, 5, 5],
hands: [undefined, undefined, undefined, [
'spade_3',
'denara_9',
'coppe_9',
'bastoni_10',
'denara_5',
]],
table: ['denara_2', 'coppe_5', 'bastoni_4', 'spade_8'],
piles: PILES_TEMPLATE_A,
scopes: [0, 1, 0, 1],
totalPoints: [9, 7],
expectedMove: {
cardId: 'spade_3',
},
},
{
id: 'exact-endgame-resolution',
name: 'Exact Endgame Resolution',
description: 'With one card per player and a winning capture on the table, the search should resolve the hand exactly.',
tags: ['critical-exact-endgame', 'endgame'],
criticalConcept: 'exact-endgame-resolution',
dealer: 1,
currentPlayer: 2,
handSizes: [1, 1, 1, 1],
hands: [undefined, undefined, ['spade_6'], undefined],
table: ['coppe_2', 'bastoni_4'],
pileCardCounts: [9, 8, 9, 8],
scopes: [1, 0, 1, 0],
totalPoints: [10, 9],
roundNumber: 4,
lastCaptureTeam: 0,
expectedMove: {
cardId: 'spade_6',
captureIds: ['coppe_2', 'bastoni_4'],
},
},
{
id: 'full-table-scopa',
name: 'Full Table Scopa',
description: 'A full-table sweep should be preferred when it is available and it is not the final play of the round.',
tags: ['critical-full-table-scopa', 'scopa-window'],
criticalConcept: 'full-table-scopa',
dealer: 2,
currentPlayer: 0,
handSizes: [5, 5, 5, 5],
hands: [[
'spade_10',
'denara_8',
'coppe_9',
'bastoni_3',
'denara_4',
], undefined, undefined, undefined],
table: ['bastoni_1', 'coppe_2', 'denara_3', 'spade_4'],
piles: PILES_TEMPLATE_B,
expectedMove: {
cardId: 'spade_10',
captureIds: ['bastoni_1', 'coppe_2', 'denara_3', 'spade_4'],
},
},
{
id: 'partner-scopa-setup',
name: 'Partner Scopa Setup',
description: 'When there is no safe immediate sweep, the root player should prefer the quiet partner invitation that preserves table pressure for the partner line instead of cashing a smaller material capture.',
tags: ['critical-partner-setup', 'partner-window', 'table-control'],
criticalConcept: 'partner-scopa-setup',
dealer: 0,
currentPlayer: 1,
handSizes: [5, 5, 5, 5],
hands: [
undefined,
['coppe_10', 'spade_6', 'bastoni_3', 'denara_5', 'coppe_2'],
['spade_10', 'denara_6', 'bastoni_4', 'coppe_9', 'denara_8'],
undefined,
],
table: ['denara_1', 'coppe_4', 'bastoni_7', 'spade_8'],
pileCardCounts: [4, 4, 4, 4],
scopes: [0, 1, 0, 1],
totalPoints: [8, 9],
expectedMove: {
cardId: 'coppe_10',
},
},
{
id: 'denari-race-conversion',
name: 'Denari Race Conversion',
description: 'When denari control is in play, the benchmark should reward the denari-preserving nine capture.',
tags: ['denari-race'],
dealer: 1,
currentPlayer: 2,
handSizes: [5, 5, 5, 5],
hands: [undefined, undefined, [
'denara_9',
'spade_10',
'coppe_8',
'bastoni_6',
'denara_5',
], undefined],
table: ['denara_4', 'coppe_2', 'spade_5', 'bastoni_3'],
piles: PILES_TEMPLATE_A,
totalPoints: [7, 8],
expectedMove: {
cardId: 'denara_9',
},
},
{
id: 'primiera-seven-pressure',
name: 'Primiera Seven Pressure',
description: 'A seven that improves primiera pressure should beat quieter material moves.',
tags: ['primiera-pressure'],
dealer: 0,
currentPlayer: 1,
handSizes: [5, 5, 5, 5],
hands: [undefined, [
'denara_7',
'coppe_10',
'spade_6',
'bastoni_5',
'coppe_9',
], undefined, undefined],
table: ['denara_1', 'coppe_3', 'bastoni_4', 'spade_8'],
piles: PILES_TEMPLATE_B,
expectedMove: {
cardId: 'denara_7',
},
},
{
id: 'safe-low-dump',
name: 'Safe Low Dump',
description: 'The search should prefer the lone safe release over cards that either capture or create leverage for the next player.',
tags: ['table-control'],
dealer: 2,
currentPlayer: 0,
handSizes: [5, 5, 5, 5],
hands: [[
'coppe_10',
'spade_8',
'bastoni_6',
'denara_4',
'coppe_3',
], undefined, undefined, undefined],
table: ['denara_2', 'spade_9', 'bastoni_4', 'coppe_5'],
piles: PILES_TEMPLATE_A,
expectedMove: {
cardId: 'coppe_3',
},
},
{
id: 'late-denari-shield',
name: 'Late Denari Shield',
description: 'The denari nine should still be preferred late when it blocks the opponent from flipping the denari race.',
tags: ['denari-race', 'late-round'],
dealer: 1,
currentPlayer: 3,
handSizes: [5, 5, 5, 5],
hands: [undefined, undefined, undefined, [
'denara_9',
'coppe_7',
'spade_10',
'bastoni_8',
'denara_4',
]],
table: ['denara_1', 'coppe_2', 'bastoni_5', 'spade_6'],
piles: PILES_TEMPLATE_B,
scopes: [1, 0, 0, 0],
totalPoints: [9, 9],
expectedMove: {
cardId: 'denara_9',
},
},
{
id: 'only-safe-release',
name: 'Only Safe Release',
description: 'Only the deuce avoids either an immediate capture or a tactical concession.',
tags: ['anti-concession'],
dealer: 3,
currentPlayer: 1,
handSizes: [5, 5, 5, 5],
hands: [undefined, [
'bastoni_10',
'coppe_8',
'denara_9',
'spade_3',
'coppe_2',
], undefined, undefined],
table: ['denara_4', 'spade_5', 'bastoni_1', 'coppe_3'],
piles: PILES_TEMPLATE_A,
expectedMove: {
cardId: 'coppe_2',
},
},
{
id: 'table-clear-material-sweep',
name: 'Table Clear Material Sweep',
description: 'A full-table material sweep with the ten should win over lower-value tactical grabs.',
tags: ['scopa-window', 'material-swing'],
dealer: 0,
currentPlayer: 2,
handSizes: [5, 5, 5, 5],
hands: [undefined, undefined, [
'spade_7',
'denara_8',
'coppe_9',
'bastoni_10',
'denara_3',
], undefined],
table: ['denara_4', 'coppe_3', 'bastoni_1', 'spade_2'],
piles: [
['denara_1', 'coppe_4', 'spade_5', 'bastoni_6'],
['denara_2', 'coppe_5', 'spade_6', 'bastoni_7'],
['denara_5', 'coppe_6', 'spade_8', 'bastoni_9'],
['denara_6', 'coppe_7', 'spade_9', 'bastoni_2'],
],
expectedMove: {
cardId: 'bastoni_10',
captureIds: ['denara_4', 'coppe_3', 'bastoni_1', 'spade_2'],
},
},
{
id: 'direct-eight-conversion',
name: 'Direct Eight Conversion',
description: 'The direct eight capture should be preferred when it removes the strongest immediate counter-card from the table.',
tags: ['material-swing'],
dealer: 2,
currentPlayer: 0,
handSizes: [5, 5, 5, 5],
hands: [[
'coppe_10',
'spade_5',
'denara_8',
'bastoni_9',
'coppe_3',
], undefined, undefined, undefined],
table: ['denara_2', 'coppe_8', 'bastoni_4', 'spade_9'],
piles: PILES_TEMPLATE_A,
expectedMove: {
cardId: 'denara_8',
captureIds: ['coppe_8'],
},
},
];
/**
 * Produces a shallow copy of a card so fixture state never shares card
 * objects with the deck lookup or with other fixtures.
 */
function cloneCard(card: Card): Card {
  const copy = Object.assign({}, card);
  return copy;
}
/**
 * Resolves a card id to a fresh copy of the matching deck card.
 *
 * @param id Card id as used in the raw fixtures.
 * @returns A shallow copy of the card registered under `id`.
 * @throws Error when no card with that id exists in the deck lookup.
 */
function cardFromId(id: string): Card {
  const template = CARD_BY_ID.get(id);
  if (template) {
    return cloneCard(template);
  }
  throw new Error(`Unknown card id in benchmark fixture: ${id}`);
}
/**
 * Resolves a list of card ids to card copies, preserving order.
 *
 * @throws Error (from `cardFromId`) on the first unknown id.
 */
function cardsFromIds(ids: string[]): Card[] {
  const cards: Card[] = [];
  for (const id of ids) {
    cards.push(cardFromId(id));
  }
  return cards;
}
/**
 * Creates a team score with every round-level counter zeroed.
 *
 * @param totalPoints Match total to start from (defaults to 0).
 */
function createTeamScore(totalPoints = 0): TeamScore {
  const blankScore: TeamScore = {
    cards: 0,
    scope: 0,
    denari: 0,
    settebello: false,
    primiera: 0,
    roundPoints: 0,
    totalPoints,
  };
  return blankScore;
}
/**
 * Builds the four players of a fixture.
 *
 * Hand and pile cards are copied, player 0 is flagged as the human seat and
 * names come from `PLAYER_NAMES`.
 *
 * @param hands Hand cards for players 0..3.
 * @param piles Pile cards for players 0..3.
 * @param scopes `scope` value for players 0..3.
 */
function buildPlayers(
  hands: [Card[], Card[], Card[], Card[]],
  piles: [Card[], Card[], Card[], Card[]],
  scopes: [number, number, number, number],
): [Player, Player, Player, Player] {
  const seatPlayer = (seat: number) => ({
    index: seat as PlayerIndex,
    hand: hands[seat].map(cloneCard),
    pile: piles[seat].map(cloneCard),
    scope: scopes[seat],
    isHuman: seat === 0,
    name: PLAYER_NAMES[seat],
  });
  return [seatPlayer(0), seatPlayer(1), seatPlayer(2), seatPlayer(3)] as [Player, Player, Player, Player];
}
/** Concatenates several id lists into one new list, preserving order. */
function flattenIds(groups: string[][]): string[] {
  const flattened: string[] = [];
  for (const group of groups) {
    for (const id of group) {
      flattened.push(id);
    }
  }
  return flattened;
}
/**
 * Expands a raw fixture description into a complete, validated fixture.
 *
 * Steps, in order:
 *  1. reserve every explicitly listed card (hands, then table, then piles),
 *     rejecting duplicates;
 *  2. deal the remaining deck cards, in deck order, first to the hands that
 *     were left `undefined` (players 0..3) and then to the piles when only
 *     `pileCardCounts` was given;
 *  3. require that no card is left over;
 *  4. assemble the game state and run `validateFixtureState` on it.
 *
 * @throws Error on duplicate cards, hand-size mismatches, missing pile
 *   information, too few or too many cards, or any validation failure.
 */
function buildFixture(raw: RawFixture): AIBenchmarkFixture {
  const reservedIds = new Set<string>();
  const reserve = (zone: 'hand' | 'table' | 'pile', id: string): void => {
    if (reservedIds.has(id)) {
      throw new Error(`Duplicate ${zone} card ${id} in fixture ${raw.id}`);
    }
    reservedIds.add(id);
  };

  for (const hand of raw.hands) {
    if (Array.isArray(hand)) {
      for (const id of hand) reserve('hand', id);
    }
  }
  for (const id of raw.table) reserve('table', id);
  if (raw.piles) {
    for (const pile of raw.piles) {
      for (const id of pile) reserve('pile', id);
    }
  }

  // Cards nobody claimed explicitly, still in deck order.
  const undealtIds: string[] = [];
  for (const deckCard of buildDeck()) {
    if (!reservedIds.has(deckCard.id)) undealtIds.push(deckCard.id);
  }

  const hands: string[][] = [];
  for (let seat = 0; seat < raw.hands.length; seat++) {
    const explicitHand = raw.hands[seat];
    const requiredSize = raw.handSizes[seat];
    if (explicitHand) {
      if (explicitHand.length !== requiredSize) {
        throw new Error(`Fixture ${raw.id} hand size mismatch for player ${seat}`);
      }
      hands.push([...explicitHand]);
      continue;
    }
    const dealt = undealtIds.splice(0, requiredSize);
    if (dealt.length !== requiredSize) {
      throw new Error(`Fixture ${raw.id} does not have enough cards to fill player ${seat} hand`);
    }
    hands.push(dealt);
  }

  let piles: string[][];
  if (raw.piles) {
    piles = raw.piles.map(pile => [...pile]);
  } else {
    if (!raw.pileCardCounts) {
      throw new Error(`Fixture ${raw.id} is missing piles or pileCardCounts`);
    }
    piles = [];
    for (const count of raw.pileCardCounts) {
      const dealt = undealtIds.splice(0, count);
      if (dealt.length !== count) {
        throw new Error(`Fixture ${raw.id} does not have enough cards to fill pile count ${count}`);
      }
      piles.push(dealt);
    }
  }

  if (undealtIds.length !== 0) {
    throw new Error(`Fixture ${raw.id} does not account for all 40 cards`);
  }

  const state: GameState = {
    players: buildPlayers(
      hands.map(ids => cardsFromIds(ids)) as [Card[], Card[], Card[], Card[]],
      piles.map(ids => cardsFromIds(ids)) as [Card[], Card[], Card[], Card[]],
      raw.scopes ?? [0, 0, 0, 0],
    ),
    table: cardsFromIds(raw.table),
    matchStartingPlayer: getOpeningPlayerForDealer(raw.dealer),
    dealer: raw.dealer,
    currentPlayer: raw.currentPlayer,
    roundOver: false,
    gameOver: false,
    teamScores: [
      createTeamScore(raw.totalPoints?.[0] ?? 0),
      createTeamScore(raw.totalPoints?.[1] ?? 0),
    ],
    lastCapturTeam: raw.lastCaptureTeam ?? null,
    roundNumber: raw.roundNumber ?? 1,
  };
  validateFixtureState(raw, state);

  const { id, name, description, expectedMove } = raw;
  return {
    id,
    name,
    description,
    tags: [...raw.tags],
    criticalConcept: raw.criticalConcept ?? null,
    state,
    expectedMove,
  };
}
/**
 * Sanity-checks a built fixture state.
 *
 * Verifies that hands, piles and table together hold exactly 40 distinct
 * cards, that the expected card is in the current player's hand and, when
 * `captureIds` is given, that the expected capture is one of the captures
 * `findCaptures` reports for that card on the table.
 *
 * @throws Error describing the first violated rule.
 */
function validateFixtureState(raw: RawFixture, state: GameState): void {
  const seenIds = new Set<string>();
  const register = (card: { id: string }): void => {
    if (seenIds.has(card.id)) {
      throw new Error(`Fixture ${raw.id} duplicates ${card.id}`);
    }
    seenIds.add(card.id);
  };

  for (const { hand, pile } of state.players) {
    hand.forEach(card => register(card));
    pile.forEach(card => register(card));
  }
  state.table.forEach(card => register(card));

  if (seenIds.size !== 40) {
    throw new Error(`Fixture ${raw.id} must contain exactly 40 unique cards, found ${seenIds.size}`);
  }

  const { cardId, captureIds } = raw.expectedMove;
  const rootHand = state.players[state.currentPlayer].hand;
  if (rootHand.every(card => card.id !== cardId)) {
    throw new Error(`Fixture ${raw.id} expected move card ${cardId} is not in the root hand`);
  }

  if (!captureIds) {
    return;
  }
  // Captures are compared as sorted, comma-joined id lists so order does not matter.
  const played = cardFromId(cardId);
  const expectedCaptureKey = [...captureIds].sort().join(',');
  const isLegal = findCaptures(played, state.table).some(
    capture => capture.map(card => card.id).sort().join(',') === expectedCaptureKey,
  );
  if (!isLegal) {
    throw new Error(`Fixture ${raw.id} expected capture ${expectedCaptureKey} is not legal`);
  }
}
/** Tells whether a fixture carries a critical concept (its `criticalConcept` is not `null`). */
export function isCriticalAIBenchmarkFixture(fixture: AIBenchmarkFixture): boolean {
  const { criticalConcept } = fixture;
  return !(criticalConcept === null);
}
/**
 * Every benchmark fixture, built from `RAW_FIXTURES` when this module is evaluated.
 * `buildFixture` throws on an inconsistent raw fixture, so a bad entry fails at import time.
 */
export const AI_BENCHMARK_FIXTURES: AIBenchmarkFixture[] = RAW_FIXTURES.map(buildFixture);

634
src/game/ai-benchmark.ts Normal file
View File

@@ -0,0 +1,634 @@
import { applyMove, cloneState, createInitialState, getMatchOutcome, nextPlayer, teamOf } from './engine';
import { AITimingSource, AIMove, AISearchProfileOverride, chooseMove } from './ai';
import {
AI_BENCHMARK_FIXTURES,
AIBenchmarkCriticalConcept,
AIBenchmarkExpectedMove,
AIBenchmarkFixture,
isCriticalAIBenchmarkFixture,
} from './ai-benchmark-fixtures';
import { CardTracker } from './card-tracker';
import { GameState, PlayerIndex } from './types';
/**
 * Formats a duration for log output: whole milliseconds below one second,
 * seconds with two decimals otherwise.
 */
function formatDurationMs(durationMs: number): string {
  return durationMs < 1000
    ? `${durationMs.toFixed(0)} ms`
    : `${(durationMs / 1000).toFixed(2)} s`;
}
/** Writes a progress line to the console, prefixed with `[ai-benchmark]`. */
function logBenchmarkProgress(message: string): void {
  const line = '[ai-benchmark] ' + message;
  console.log(line);
}
/** Outcome of running one fixed fixture through the production and reference searches. */
interface FixedFixtureResult {
fixtureId: string;
name: string;
tags: string[];
criticalConcept: AIBenchmarkCriticalConcept | null;
/** Move chosen without a profile override, encoded as `cardId|sortedCaptureIds`. */
productionMove: string;
/** Move chosen with the reference profile override, in the same encoding. */
referenceMove: string;
/** True when the two encoded moves are identical. */
matchesReference: boolean;
/** True when the production move matches the fixture's expected move. */
expectedPass: boolean;
/** Same check as `expectedPass` for critical fixtures; `null` for non-critical ones. */
conceptGatePass: boolean | null;
/** Elapsed time reported by the simulated timing source for the production decision. */
productionSimulatedMs: number;
/** Elapsed time reported by the simulated timing source for the reference decision. */
referenceSimulatedMs: number;
}
/** Outcome of one simulated self-play match. */
interface SelfPlayMatchResult {
/** Seed the match was generated from. */
seed: number;
/** Dealer of the first round (`seed % 4`). */
dealer: PlayerIndex;
/** Team whose players used the 'master' difficulty. */
masterTeam: 0 | 1;
/** Winning team as reported by `getMatchOutcome`, or `null`. */
winner: 0 | 1 | null;
/** Result from the master team's point of view; 'draw' when `winner` is `null`. */
masterResult: 'win' | 'loss' | 'draw';
/** Round counter at the end of the match. */
rounds: number;
/** True when the match was stopped by the round cap instead of finishing. */
truncated: boolean;
/** Final `totalPoints` of team 0 and team 1. */
totalPoints: [number, number];
/** Number of decisions taken with the 'master' difficulty. */
masterDecisionCount: number;
/** Average simulated time of those decisions, in milliseconds. */
masterAverageSimulatedDecisionMs: number;
/** Longest simulated time among those decisions, in milliseconds. */
masterMaxSimulatedDecisionMs: number;
}
/** Aggregate statistics over a list of timing samples (all zero when there are no samples). */
interface TimingSummary {
/** Number of samples. */
count: number;
/** Arithmetic mean of the samples. */
averageMs: number;
/** 95th-percentile sample (nearest-rank on the sorted samples). */
p95Ms: number;
/** Largest sample. */
maxMs: number;
}
/** Result of a count-based quality gate. */
interface GateCountSummary {
/** Count actually achieved. */
actual: number;
/** Count the gate requires. */
required: number;
/** Number of items the gate was evaluated over. */
total: number;
/** Whether the gate passed. */
passed: boolean;
}
/**
 * Result of the self-play quality gate.
 * NOTE(review): the code that fills this in lies mostly outside the visible
 * part of the file; field meanings below are inferred from names and from the
 * readable summary output — confirm against `runAIBenchmark`.
 */
interface SelfPlayGateSummary {
/** Number of self-play matches played. */
matches: number;
/** Number of matches the gate expects. */
requiredMatches: number;
wins: number;
/** Win target printed as "target" in the readable summary. */
requiredWins: number;
losses: number;
/** Loss ceiling printed as "max" in the readable summary. */
maxLosses: number;
draws: number;
matchCountPassed: boolean;
winGatePassed: boolean;
lossGatePassed: boolean;
passed: boolean;
}
/** One match of a seed, reduced to the fields kept in the per-seed aggregate. */
interface SelfPlaySeedSeatResult {
masterTeam: 0 | 1;
masterResult: 'win' | 'loss' | 'draw';
winner: 0 | 1 | null;
rounds: number;
truncated: boolean;
totalPoints: [number, number];
}
/** Self-play results grouped by seed. */
interface SelfPlaySeedAggregateResult {
seed: number;
/** Number of matches recorded for this seed. */
matches: number;
wins: number;
losses: number;
draws: number;
/** True when the master team lost at least two matches of this seed. */
dualLoss: boolean;
/** Individual match results, sorted by `masterTeam`. */
seatResults: SelfPlaySeedSeatResult[];
}
/**
 * Complete benchmark report returned by `runAIBenchmark` and printed (readable
 * lines plus JSON) by `printReadableSummary`.
 */
export interface AIBenchmarkSummary {
benchmark: 'ai-quality';
/** Pass/fail verdicts of the iteration-5 quality gates. */
qualityGate: {
iteration: 5;
passed: boolean;
fixedFixtures: GateCountSummary;
criticalConcepts: GateCountSummary;
selfPlay: SelfPlayGateSummary;
};
fixtureCount: number;
criticalFixtureCount: number;
/** Details of the fixed-fixture suite. */
fixedSuite: {
/** Fixtures where production and reference moves agree. */
fixedFixtureAgreements: number;
/** Fixtures where the production move matches the expected move. */
expectedPasses: number;
/** Critical fixtures whose concept gate passed. */
criticalPasses: number;
/** Ids of fixtures where production and reference moves differ. */
fixedFixtureAgreementFailures: string[];
/** Ids of critical fixtures whose concept gate failed. */
criticalPassFailures: string[];
results: FixedFixtureResult[];
};
/** Details of the self-play suite. */
selfPlay: {
matches: number;
wins: number;
losses: number;
draws: number;
winRate: number;
lossRate: number;
perSeed: SelfPlaySeedAggregateResult[];
/** Seeds where the master team lost at least two matches. */
dualLossSeeds: number[];
regressionWatchlist: number[];
/** Dual-loss seeds that are also on the regression watchlist. */
regressionWatchlistDualLossIntersection: number[];
results: SelfPlayMatchResult[];
};
timing: {
/** Timing statistics over simulated master decisions. */
productionMasterSimulatedDecisions: TimingSummary;
};
/** Search profile used for the reference moves. */
referenceProfile: Required<AISearchProfileOverride>;
}
/**
 * Thresholds of the iteration-5 quality gate.
 * The first three values are also checked against the actual fixture and
 * self-play counts by `assertIteration5BenchmarkContract`.
 */
const ITERATION_5_GATE = {
fixedFixtureAgreementTarget: 13,
criticalConceptTarget: 6,
selfPlayMatchTarget: 48,
selfPlayWinTarget: 30,
selfPlayMaxLosses: 12,
} as const;
/** Self-play seeds that are reported separately when they show up among the dual-loss seeds. */
const KNOWN_REGRESSION_WATCHLIST = [1000, 1002, 1004, 1006, 1012, 1013, 1014] as const;
/** Set view of the watchlist for membership checks. */
const KNOWN_REGRESSION_WATCHLIST_SET = new Set<number>(KNOWN_REGRESSION_WATCHLIST);
/** Search profile passed as `profileOverride` when computing the reference move of a fixture. */
const REFERENCE_PROFILE: Required<AISearchProfileOverride> = {
timeBudgetMs: 9000,
sampleCount: 12,
maxDepth: 7,
batchSize: 2,
};
/** Seeds 1000..1023; the self-play suite plays each seed twice, once per master team. */
const SELF_PLAY_MATCH_SEEDS = Array.from({ length: 24 }, (_, index) => 1000 + index);
/** Upper bound on the number of rounds played in one self-play match before it is truncated. */
const MAX_SELF_PLAY_ROUNDS = 20;
/**
 * Guards against the benchmark inputs drifting away from the gate thresholds:
 * the number of fixtures, critical fixtures and self-play matches must equal
 * the corresponding `ITERATION_5_GATE` targets.
 *
 * @throws Error naming the first count that does not match.
 */
function assertIteration5BenchmarkContract(): void {
  const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;
  const selfPlayMatchCount = SELF_PLAY_MATCH_SEEDS.length * 2;

  const expectations: Array<{ expected: number; received: number; noun: string }> = [
    {
      expected: ITERATION_5_GATE.fixedFixtureAgreementTarget,
      received: AI_BENCHMARK_FIXTURES.length,
      noun: 'fixed fixtures',
    },
    {
      expected: ITERATION_5_GATE.criticalConceptTarget,
      received: criticalFixtureCount,
      noun: 'critical concept fixtures',
    },
    {
      expected: ITERATION_5_GATE.selfPlayMatchTarget,
      received: selfPlayMatchCount,
      noun: 'self-play matches',
    },
  ];

  for (const { expected, received, noun } of expectations) {
    if (received !== expected) {
      throw new Error(`Iteration 5 benchmark expects ${expected} ${noun}, received ${received}.`);
    }
  }
}
/** Timing source for the benchmark that can also report how much simulated time has passed. */
interface SimulatedBenchmarkTimingSource extends AITimingSource {
/** Simulated milliseconds elapsed since the source was created. */
getElapsedMs(): number;
}
/**
 * Creates a manually driven clock: time only moves when `advance` is called.
 *
 * @param startMs Initial clock value in milliseconds (defaults to 0).
 * @returns A timing source whose `getElapsedMs` reports the total advanced time.
 */
function createSimulatedBenchmarkTimingSource(startMs = 0): SimulatedBenchmarkTimingSource {
  let clockMs = startMs;

  const advance = (elapsedMs: number): number => {
    clockMs += elapsedMs;
    return clockMs;
  };

  return {
    isSimulated: true,
    now: () => clockMs,
    advance,
    getElapsedMs: () => clockMs - startMs,
  };
}
/**
 * Mixes several integers into one unsigned 32-bit seed: each part is XOR-ed
 * into the running hash, which is then multiplied by 16777619 (32-bit wrap).
 * The hash starts at 2166136261, which is also the result for no parts.
 */
function seedFromParts(...parts: number[]): number {
  const mixed = parts.reduce(
    (hash, part) => Math.imul(hash ^ (part >>> 0), 16777619),
    2166136261,
  );
  return mixed >>> 0;
}
/**
 * Creates a deterministic pseudo-random generator from a 32-bit seed.
 *
 * @param seed Seed value; only its low 32 bits are used.
 * @returns A function yielding numbers in [0, 1); the same seed always
 *   produces the same sequence.
 */
function createMulberry32(seed: number): () => number {
  let counter = seed >>> 0;
  return function next(): number {
    counter = (counter + 0x6d2b79f5) >>> 0;
    const stageOne = Math.imul(counter ^ (counter >>> 15), counter | 1);
    const stageTwo = stageOne ^ (stageOne + Math.imul(stageOne ^ (stageOne >>> 7), stageOne | 61));
    return ((stageTwo ^ (stageTwo >>> 14)) >>> 0) / 4294967296;
  };
}
/**
 * Encodes a move as `cardId|captureIds`, with the capture ids sorted and
 * comma-joined so that capture order does not affect the key.
 */
function moveKey(move: AIMove): string {
  const capturedIds = move.capture.map(card => card.id);
  capturedIds.sort();
  return move.card.id + '|' + capturedIds.join(',');
}
/**
 * Creates a card tracker primed with a fixture state: every card already in
 * any player's pile is reported to the tracker through `trackPlay`.
 */
function createTrackerForState(state: GameState): CardTracker {
  const tracker = new CardTracker();
  state.players.forEach(player => {
    player.pile.forEach(card => {
      tracker.trackPlay(card);
    });
  });
  return tracker;
}
/**
 * Checks a chosen move against a fixture expectation.
 *
 * The played card must match. The capture is compared (order-insensitively)
 * only when the expectation lists `captureIds`.
 */
function matchesExpectedMove(move: AIMove, expected: AIBenchmarkExpectedMove): boolean {
  const { cardId, captureIds } = expected;
  if (move.card.id !== cardId) {
    return false;
  }
  if (!captureIds) {
    return true;
  }
  const toKey = (ids: string[]): string => [...ids].sort().join(',');
  return toKey(move.capture.map(card => card.id)) === toKey(captureIds);
}
/**
 * Runs every fixed fixture twice with the 'master' difficulty: once without a
 * profile override (production) and once with `REFERENCE_PROFILE` (reference).
 *
 * Each decision gets its own cloned state, tracker, seeded RNG and simulated
 * timing source. The production decision is always computed before the
 * reference one.
 *
 * @returns Per-fixture results, wall-clock duration of the whole suite and the
 *   simulated duration of each production decision.
 */
async function runFixedFixtureSuite(): Promise<{ results: FixedFixtureResult[]; wallClockMs: number; productionTimings: number[] }> {
  const suiteStartedAt = performance.now();
  const fixtureTotal = AI_BENCHMARK_FIXTURES.length;
  const results: FixedFixtureResult[] = [];
  const productionTimings: number[] = [];
  logBenchmarkProgress(`Starting fixed fixture suite (${fixtureTotal} positions).`);

  for (const [index, fixture] of AI_BENCHMARK_FIXTURES.entries()) {
    const productionState = cloneState(fixture.state);
    const referenceState = cloneState(fixture.state);
    const productionTracker = createTrackerForState(productionState);
    const referenceTracker = createTrackerForState(referenceState);
    const productionClock = createSimulatedBenchmarkTimingSource();
    const referenceClock = createSimulatedBenchmarkTimingSource();

    // Production search: default profile, RNG stream 0 for this fixture.
    const productionMove = await chooseMove(
      productionState,
      productionState.currentPlayer,
      'master',
      productionTracker,
      undefined,
      {
        rng: createMulberry32(seedFromParts(0x0f1e2d3c, index, 0)),
        timingSource: productionClock,
      },
    );
    const productionSimulatedMs = productionClock.getElapsedMs();

    // Reference search: overridden profile, RNG stream 1 for this fixture.
    const referenceMove = await chooseMove(
      referenceState,
      referenceState.currentPlayer,
      'master',
      referenceTracker,
      undefined,
      {
        rng: createMulberry32(seedFromParts(0x0f1e2d3c, index, 1)),
        profileOverride: REFERENCE_PROFILE,
        timingSource: referenceClock,
      },
    );
    const referenceSimulatedMs = referenceClock.getElapsedMs();
    productionTimings.push(productionSimulatedMs);

    const productionKey = moveKey(productionMove);
    const referenceKey = moveKey(referenceMove);
    const matchesReference = productionKey === referenceKey;
    const expectedPass = matchesExpectedMove(productionMove, fixture.expectedMove);

    results.push({
      fixtureId: fixture.id,
      name: fixture.name,
      tags: [...fixture.tags],
      criticalConcept: fixture.criticalConcept,
      productionMove: productionKey,
      referenceMove: referenceKey,
      matchesReference,
      expectedPass,
      conceptGatePass: isCriticalAIBenchmarkFixture(fixture) ? expectedPass : null,
      productionSimulatedMs,
      referenceSimulatedMs,
    });

    const progressLabel = `${index + 1}/${fixtureTotal}`;
    const matchLabel = matchesReference ? 'agreement' : 'divergence';
    logBenchmarkProgress(
      `Fixture ${progressLabel}: ${fixture.id} -> ${matchLabel}, production simulated ${formatDurationMs(productionSimulatedMs)}, reference simulated ${formatDurationMs(referenceSimulatedMs)}.`,
    );
  }

  const wallClockMs = performance.now() - suiteStartedAt;
  return { results, wallClockMs, productionTimings };
}
/**
 * Summarizes timing samples.
 *
 * @param samples Durations in milliseconds; the array is not modified.
 * @returns Count, mean, nearest-rank 95th percentile and maximum; all zeros
 *   for an empty input.
 */
function summarizeTimings(samples: number[]): TimingSummary {
  const count = samples.length;
  if (count === 0) {
    return { count: 0, averageMs: 0, p95Ms: 0, maxMs: 0 };
  }

  const ascending = samples.slice().sort((a, b) => a - b);
  let total = 0;
  for (const value of ascending) {
    total += value;
  }

  // Nearest-rank percentile, clamped to a valid index.
  const rank = Math.ceil(count * 0.95) - 1;
  const p95Index = Math.min(count - 1, Math.max(0, rank));

  return {
    count,
    averageMs: total / count,
    p95Ms: ascending[p95Index],
    maxMs: ascending[count - 1],
  };
}
/**
 * Groups self-play results by seed.
 *
 * @returns `perSeed` aggregates sorted by seed (seat results sorted by master
 *   team), the seeds with at least two master losses, and the subset of those
 *   seeds that is on the regression watchlist.
 */
function summarizeSelfPlayBySeed(results: SelfPlayMatchResult[]): {
  perSeed: SelfPlaySeedAggregateResult[];
  dualLossSeeds: number[];
  regressionWatchlistDualLossIntersection: number[];
} {
  const bySeed = new Map<number, SelfPlaySeedAggregateResult>();

  for (const match of results) {
    let bucket = bySeed.get(match.seed);
    if (bucket === undefined) {
      bucket = {
        seed: match.seed,
        matches: 0,
        wins: 0,
        losses: 0,
        draws: 0,
        dualLoss: false,
        seatResults: [],
      };
      bySeed.set(match.seed, bucket);
    }

    bucket.matches += 1;
    switch (match.masterResult) {
      case 'win':
        bucket.wins += 1;
        break;
      case 'loss':
        bucket.losses += 1;
        break;
      default:
        bucket.draws += 1;
    }

    const { masterTeam, masterResult, winner, rounds, truncated, totalPoints } = match;
    bucket.seatResults.push({ masterTeam, masterResult, winner, rounds, truncated, totalPoints });
  }

  const perSeed = Array.from(bySeed.values(), bucket => ({
    ...bucket,
    dualLoss: bucket.losses >= 2,
    seatResults: bucket.seatResults.slice().sort((a, b) => a.masterTeam - b.masterTeam),
  }));
  perSeed.sort((a, b) => a.seed - b.seed);

  const dualLossSeeds: number[] = [];
  for (const aggregate of perSeed) {
    if (aggregate.dualLoss) dualLossSeeds.push(aggregate.seed);
  }
  const regressionWatchlistDualLossIntersection = dualLossSeeds.filter(seed =>
    KNOWN_REGRESSION_WATCHLIST_SET.has(seed),
  );

  return { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection };
}
/**
 * Plays one complete seeded match between a 'master' team and an 'advanced' team.
 *
 * The first dealer is `seed % 4`; every round is dealt from an RNG derived
 * from the seed and the round number. Players on `masterTeam` choose with the
 * 'master' difficulty and a per-decision seeded RNG, all other players with
 * 'advanced'. Every decision is measured with its own simulated timing source,
 * and only master decisions are recorded. Between rounds the card tracker is
 * reset, the dealer advances and the match totals are carried over.
 *
 * A match that still has to continue after `MAX_SELF_PLAY_ROUNDS` rounds is
 * stopped and flagged as `truncated`; `rounds` then reports the number of
 * rounds actually played (the cap), not the round that was never started.
 *
 * @param seed Match seed.
 * @param masterTeam Team (0 or 1) that plays with the 'master' difficulty.
 * @returns The match result and the simulated duration of each master decision.
 */
async function simulateSelfPlayMatch(seed: number, masterTeam: 0 | 1): Promise<{ result: SelfPlayMatchResult; timings: number[] }> {
  const initialDealer = (seed % 4) as PlayerIndex;
  let state = createInitialState(initialDealer, createMulberry32(seedFromParts(seed, 1, 0)));
  const matchStartingPlayer = state.matchStartingPlayer;
  const tracker = new CardTracker();
  const masterTimings: number[] = [];
  let rounds = 1;
  let truncated = false;
  // Counts decisions across the whole match; it only feeds the master RNG seed.
  let turnCount = 0;

  while (rounds <= MAX_SELF_PLAY_ROUNDS) {
    while (!state.roundOver) {
      const playerIdx = state.currentPlayer;
      const difficulty = teamOf(playerIdx) === masterTeam ? 'master' : 'advanced';
      const timingSource = createSimulatedBenchmarkTimingSource();
      const options = difficulty === 'master'
        ? {
            rng: createMulberry32(seedFromParts(seed, rounds, turnCount, playerIdx)),
            timingSource,
          }
        : { timingSource };
      const move = await chooseMove(state, playerIdx, difficulty, tracker, undefined, options);
      const simulatedMs = timingSource.getElapsedMs();
      if (difficulty === 'master') {
        masterTimings.push(simulatedMs);
      }
      const { nextState, capture } = applyMove(
        state,
        playerIdx,
        move.card,
        move.capture.length > 0 ? move.capture : undefined,
      );
      tracker.trackPlay(move.card);
      if (capture) {
        tracker.trackCapture(capture.captured);
      }
      state = nextState;
      turnCount++;
    }

    if (!getMatchOutcome(state.teamScores).continueMatch) {
      break;
    }
    // Check the cap BEFORE advancing the counter so that a truncated match
    // reports the rounds it really played instead of MAX_SELF_PLAY_ROUNDS + 1.
    if (rounds >= MAX_SELF_PLAY_ROUNDS) {
      truncated = true;
      break;
    }
    rounds++;

    const totals: [number, number] = [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints];
    const nextDealer = nextPlayer(state.dealer);
    tracker.reset();
    state = createInitialState(nextDealer, createMulberry32(seedFromParts(seed, rounds, 0)));
    // A fresh round state knows nothing about the match so far; restore it.
    state.matchStartingPlayer = matchStartingPlayer;
    state.teamScores[0].totalPoints = totals[0];
    state.teamScores[1].totalPoints = totals[1];
    state.roundNumber = rounds;
  }

  const outcome = getMatchOutcome(state.teamScores);
  const winner = outcome.winner;
  const masterResult = winner === null ? 'draw' : winner === masterTeam ? 'win' : 'loss';
  const timingSummary = summarizeTimings(masterTimings);
  return {
    result: {
      seed,
      dealer: initialDealer,
      masterTeam,
      winner,
      masterResult,
      rounds,
      truncated,
      totalPoints: [state.teamScores[0].totalPoints, state.teamScores[1].totalPoints],
      masterDecisionCount: timingSummary.count,
      masterAverageSimulatedDecisionMs: timingSummary.averageMs,
      masterMaxSimulatedDecisionMs: timingSummary.maxMs,
    },
    timings: masterTimings,
  };
}
/**
 * Plays every self-play seed twice, once with each team as the master team,
 * strictly one match after another.
 *
 * Progress is logged after the first match, after every fourth match and
 * after the last one.
 *
 * @returns All match results in play order, the wall-clock duration of the
 *   suite and the simulated duration of every master decision.
 */
async function runSelfPlaySuite(): Promise<{ results: SelfPlayMatchResult[]; wallClockMs: number; productionTimings: number[] }> {
  const suiteStartedAt = performance.now();
  const totalMatches = SELF_PLAY_MATCH_SEEDS.length * 2;
  const masterSeats: ReadonlyArray<0 | 1> = [0, 1];
  const results: SelfPlayMatchResult[] = [];
  const productionTimings: number[] = [];
  logBenchmarkProgress(`Starting self-play suite (${totalMatches} seeded matches with seat swaps).`);

  for (const seed of SELF_PLAY_MATCH_SEEDS) {
    for (const masterTeam of masterSeats) {
      const played = await simulateSelfPlayMatch(seed, masterTeam);
      const result = played.result;
      results.push(result);
      for (const timing of played.timings) {
        productionTimings.push(timing);
      }

      const completedMatches = results.length;
      const isMilestone =
        completedMatches === 1 || completedMatches % 4 === 0 || completedMatches === totalMatches;
      if (!isMilestone) {
        continue;
      }
      logBenchmarkProgress(
        `Self-play ${completedMatches}/${totalMatches}: seed ${seed}, master team ${masterTeam}, result ${result.masterResult}, rounds ${result.rounds}, max simulated decision ${formatDurationMs(result.masterMaxSimulatedDecisionMs)}.`,
      );
    }
  }

  const wallClockMs = performance.now() - suiteStartedAt;
  return { results, wallClockMs, productionTimings };
}
/**
 * Prints the benchmark report to the console: a set of human-readable lines
 * followed by the marker line `BENCHMARK_SUMMARY` and the full summary as
 * pretty-printed JSON.
 */
function printReadableSummary(summary: AIBenchmarkSummary): void {
  const { qualityGate, fixedSuite, selfPlay } = summary;
  const fixedGate = qualityGate.fixedFixtures;
  const conceptGate = qualityGate.criticalConcepts;
  const selfPlayGate = qualityGate.selfPlay;
  const timing = summary.timing.productionMasterSimulatedDecisions;
  const listOrNone = (values: number[]): string => (values.length > 0 ? values.join(', ') : 'none');

  console.log('AI quality benchmark');
  console.log(`Iteration 5 quality gate: ${qualityGate.passed ? 'PASS' : 'FAIL'}`);
  console.log(`Fixed-fixture gate: ${fixedGate.actual}/${fixedGate.total} agreements (target ${fixedGate.required}/${fixedGate.total}).`);
  console.log(`Critical concept gate: ${conceptGate.actual}/${conceptGate.total} passes (target ${conceptGate.required}/${conceptGate.total}).`);
  console.log(`Self-play gate: ${selfPlayGate.matches}/${selfPlayGate.requiredMatches} matches, ${selfPlayGate.wins}/${selfPlayGate.matches} wins (target ${selfPlayGate.requiredWins}), ${selfPlayGate.losses}/${selfPlayGate.matches} losses (max ${selfPlayGate.maxLosses}), ${selfPlayGate.draws} draws.`);

  // Failure lists are only printed when there is something to report.
  if (fixedSuite.fixedFixtureAgreementFailures.length > 0) {
    console.log(`Fixed-fixture agreement failures: ${fixedSuite.fixedFixtureAgreementFailures.join(', ')}`);
  }
  if (fixedSuite.criticalPassFailures.length > 0) {
    console.log(`Critical concept failures: ${fixedSuite.criticalPassFailures.join(', ')}`);
  }

  const perSeedLine = selfPlay.perSeed
    .map(entry => `${entry.seed}:${entry.wins}W-${entry.losses}L-${entry.draws}D`)
    .join(' | ');
  console.log(`Per-seed outcomes: ${perSeedLine}`);
  console.log(`Dual-loss seeds: ${listOrNone(selfPlay.dualLossSeeds)}`);
  console.log(`Regression watchlist intersection: ${listOrNone(selfPlay.regressionWatchlistDualLossIntersection)} (watchlist ${selfPlay.regressionWatchlist.join(', ')})`);
  console.log(`Master simulated timing: avg ${timing.averageMs.toFixed(1)} ms, p95 ${timing.p95Ms.toFixed(1)} ms, max ${timing.maxMs.toFixed(1)} ms.`);
  console.log('BENCHMARK_SUMMARY');
  console.log(JSON.stringify(summary, null, 2));
}
/**
 * Runs the whole AI quality benchmark and assembles its summary.
 *
 * Steps, in order: assert the iteration 5 benchmark contract, run the fixed
 * fixture suite, run the self-play suite, then fold both result sets into
 * the three quality gates (fixed fixtures, critical concepts, self-play).
 *
 * @returns The aggregated summary; `qualityGate.passed` is true only when
 *   all three individual gates pass.
 */
export async function runAIBenchmark(): Promise<AIBenchmarkSummary> {
  assertIteration5BenchmarkContract();
  logBenchmarkProgress('Benchmark started. Running fixed fixtures first, then self-play.');

  // The two suites are awaited one after the other, never concurrently.
  const fixedSuite = await runFixedFixtureSuite();
  logBenchmarkProgress(`Fixed fixture suite complete in ${formatDurationMs(fixedSuite.wallClockMs)} wall-clock.`);
  const selfPlay = await runSelfPlaySuite();
  logBenchmarkProgress(`Self-play suite complete in ${formatDurationMs(selfPlay.wallClockMs)} wall-clock.`);

  const fixtureResults = fixedSuite.results;
  const matchResults = selfPlay.results;
  const matchCount = matchResults.length;
  type FixtureId = (typeof fixtureResults)[number]['fixtureId'];

  const criticalFixtureCount = AI_BENCHMARK_FIXTURES.filter(isCriticalAIBenchmarkFixture).length;

  // Single pass over the fixed-fixture results: counts plus failing fixture ids.
  let fixedFixtureAgreements = 0;
  let expectedPasses = 0;
  let criticalPasses = 0;
  const fixedFixtureAgreementFailures: FixtureId[] = [];
  const criticalPassFailures: FixtureId[] = [];
  for (const fixtureResult of fixtureResults) {
    if (fixtureResult.matchesReference) {
      fixedFixtureAgreements += 1;
    } else {
      fixedFixtureAgreementFailures.push(fixtureResult.fixtureId);
    }
    if (fixtureResult.expectedPass) {
      expectedPasses += 1;
    }
    // Only an explicit true/false counts towards the concept gate; any other
    // value is neither a pass nor a failure.
    if (fixtureResult.conceptGatePass === true) {
      criticalPasses += 1;
    } else if (fixtureResult.conceptGatePass === false) {
      criticalPassFailures.push(fixtureResult.fixtureId);
    }
  }

  // Single pass over the self-play matches, bucketed by the master's outcome.
  let wins = 0;
  let losses = 0;
  let draws = 0;
  for (const { masterResult } of matchResults) {
    switch (masterResult) {
      case 'win':
        wins += 1;
        break;
      case 'loss':
        losses += 1;
        break;
      case 'draw':
        draws += 1;
        break;
      default:
        break;
    }
  }

  const { perSeed, dualLossSeeds, regressionWatchlistDualLossIntersection } = summarizeSelfPlayBySeed(matchResults);

  // Timing statistics cover decisions from both suites together.
  const combinedProductionTimings = [
    ...fixedSuite.productionTimings,
    ...selfPlay.productionTimings,
  ];
  const productionMasterSimulatedDecisions = summarizeTimings(combinedProductionTimings);

  // The count gates require an exact match with the target, not "at least".
  const fixedFixtureGate: GateCountSummary = {
    actual: fixedFixtureAgreements,
    required: ITERATION_5_GATE.fixedFixtureAgreementTarget,
    total: AI_BENCHMARK_FIXTURES.length,
    passed: fixedFixtureAgreements === ITERATION_5_GATE.fixedFixtureAgreementTarget,
  };
  const criticalConceptGate: GateCountSummary = {
    actual: criticalPasses,
    required: ITERATION_5_GATE.criticalConceptTarget,
    total: criticalFixtureCount,
    passed: criticalPasses === ITERATION_5_GATE.criticalConceptTarget,
  };

  const matchCountPassed = matchCount === ITERATION_5_GATE.selfPlayMatchTarget;
  const winGatePassed = wins >= ITERATION_5_GATE.selfPlayWinTarget;
  const lossGatePassed = losses <= ITERATION_5_GATE.selfPlayMaxLosses;
  const selfPlayGate: SelfPlayGateSummary = {
    matches: matchCount,
    requiredMatches: ITERATION_5_GATE.selfPlayMatchTarget,
    wins,
    requiredWins: ITERATION_5_GATE.selfPlayWinTarget,
    losses,
    maxLosses: ITERATION_5_GATE.selfPlayMaxLosses,
    draws,
    matchCountPassed,
    winGatePassed,
    lossGatePassed,
    passed: matchCountPassed && winGatePassed && lossGatePassed,
  };

  // Key order below is kept stable because the summary is serialised as JSON.
  return {
    benchmark: 'ai-quality',
    qualityGate: {
      iteration: 5,
      passed: fixedFixtureGate.passed && criticalConceptGate.passed && selfPlayGate.passed,
      fixedFixtures: fixedFixtureGate,
      criticalConcepts: criticalConceptGate,
      selfPlay: selfPlayGate,
    },
    fixtureCount: AI_BENCHMARK_FIXTURES.length,
    criticalFixtureCount,
    fixedSuite: {
      fixedFixtureAgreements,
      expectedPasses,
      criticalPasses,
      fixedFixtureAgreementFailures,
      criticalPassFailures,
      results: fixtureResults,
    },
    selfPlay: {
      matches: matchCount,
      wins,
      losses,
      draws,
      // Guard against division by zero when no matches were played.
      winRate: matchCount === 0 ? 0 : wins / matchCount,
      lossRate: matchCount === 0 ? 0 : losses / matchCount,
      perSeed,
      dualLossSeeds,
      regressionWatchlist: [...KNOWN_REGRESSION_WATCHLIST],
      regressionWatchlistDualLossIntersection,
      results: matchResults,
    },
    timing: {
      productionMasterSimulatedDecisions,
    },
    referenceProfile: REFERENCE_PROFILE,
  };
}
/**
 * Command-line entry point: runs the full benchmark, logs a completion
 * notice, then prints the resulting summary.
 */
async function runBenchmarkCli(): Promise<void> {
  const benchmarkSummary = await runAIBenchmark();

  logBenchmarkProgress('Benchmark complete. Emitting summary with iteration 5 gate results.');

  printReadableSummary(benchmarkSummary);
}
// Auto-run the benchmark CLI when this module is loaded in an environment
// without a `window` global. The returned promise is deliberately discarded
// with `void`; no rejection handler is attached here.
if (typeof window === 'undefined') {
void runBenchmarkCli();
}

File diff suppressed because it is too large Load Diff

View File

@@ -4,19 +4,19 @@ export interface CardTrackerSnapshot {
playedCardIds: string[];
}
export interface CardTrackerValueParityResidue {
export interface CardTrackerValueRankResidue {
value: number;
knownCount: number;
unseenCount: number;
hasOddUnseenResidue: boolean;
hasEvenUnseenResidue: boolean;
hasSingletonUnseenRankResidue: boolean;
hasPairedUnseenRankResidue: boolean;
}
interface VisibleValueResidueKnowledge {
unseenCards: Card[];
unseenCountBySuit: Record<Suit, number>;
unseenCountByValue: number[];
valueParityResidues: CardTrackerValueParityResidue[];
valueRankResidues: CardTrackerValueRankResidue[];
}
function normalizeSnapshot(snapshot: CardTrackerSnapshot): CardTrackerSnapshot {
@@ -116,15 +116,15 @@ export class CardTracker {
}
}
const valueParityResidues: CardTrackerValueParityResidue[] = [];
const valueRankResidues: CardTrackerValueRankResidue[] = [];
for (let value = 1; value <= 10; value++) {
const unseenCount = unseenCountByValue[value];
valueParityResidues.push({
valueRankResidues.push({
value,
knownCount: knownCountByValue[value],
unseenCount,
hasOddUnseenResidue: unseenCount % 2 === 1,
hasEvenUnseenResidue: unseenCount % 2 === 0,
hasSingletonUnseenRankResidue: unseenCount % 2 === 1,
hasPairedUnseenRankResidue: unseenCount >= 2 && unseenCount % 2 === 0,
});
}
@@ -132,7 +132,7 @@ export class CardTracker {
unseenCards,
unseenCountBySuit,
unseenCountByValue,
valueParityResidues,
valueRankResidues,
};
}
@@ -151,24 +151,24 @@ export class CardTracker {
/** Count how many unseen cards share a value */
countRemainingValue(value: number, myHand: Card[], table: Card[]): number {
return this.getValueParityResidue(value, myHand, table).unseenCount;
return this.getValueRankResidue(value, myHand, table).unseenCount;
}
/** Get visible known-count, unseen-count, and parity residue for a single value */
getValueParityResidue(value: number, myHand: Card[], table: Card[]): CardTrackerValueParityResidue {
const valueParityResidues = this.buildVisibleValueResidueKnowledge(myHand, table).valueParityResidues;
return valueParityResidues[value - 1] ?? {
/** Get visible known-count, unseen-count, and same-rank residue for a single value */
getValueRankResidue(value: number, myHand: Card[], table: Card[]): CardTrackerValueRankResidue {
const valueRankResidues = this.buildVisibleValueResidueKnowledge(myHand, table).valueRankResidues;
return valueRankResidues[value - 1] ?? {
value,
knownCount: 0,
unseenCount: 0,
hasOddUnseenResidue: false,
hasEvenUnseenResidue: true,
hasSingletonUnseenRankResidue: false,
hasPairedUnseenRankResidue: false,
};
}
/** Get visible known-count, unseen-count, and parity residue for all card values */
getValueParityResidueSummary(myHand: Card[], table: Card[]): CardTrackerValueParityResidue[] {
return this.buildVisibleValueResidueKnowledge(myHand, table).valueParityResidues;
/** Get visible known-count, unseen-count, and same-rank residue for all card values */
getValueRankResidueSummary(myHand: Card[], table: Card[]): CardTrackerValueRankResidue[] {
return this.buildVisibleValueResidueKnowledge(myHand, table).valueRankResidues;
}
/** Probability that a hidden hand contains at least one card with the requested value */

View File

@@ -3,6 +3,8 @@ import {
TeamScore, ScoreBreakdown, PRIMIERA_VALUES, Capture, DealerRelativeRole
} from './types';
export type RandomSource = () => number;
// ---------------------------------------------------------------------------
// Deck
// ---------------------------------------------------------------------------
@@ -17,10 +19,10 @@ export function buildDeck(): Card[] {
return deck;
}
export function shuffle<T>(arr: T[]): T[] {
export function shuffle<T>(arr: T[], rng: RandomSource = Math.random): T[] {
const a = [...arr];
for (let i = a.length - 1; i > 0; i--) {
const j = Math.floor(Math.random() * (i + 1));
const j = Math.floor(rng() * (i + 1));
[a[i], a[j]] = [a[j], a[i]];
}
return a;
@@ -105,8 +107,8 @@ export function getDealerRelativeRole(
return 'dealer';
}
export function createInitialState(dealer: PlayerIndex = 3): GameState {
const deck = shuffle(buildDeck());
export function createInitialState(dealer: PlayerIndex = 3, rng: RandomSource = Math.random): GameState {
const deck = shuffle(buildDeck(), rng);
const startingPlayer = getOpeningPlayerForDealer(dealer);
const players: [Player, Player, Player, Player] = [