fix: harden queue lifecycle and publish gate

- Preserve phase results on partial retry and keep interrupted phase
  context after restart.
- Avoid webhook bookkeeping crashes when retention deletes stale jobs.
- Add deeper unit, integration, and e2e coverage around queue seams.
- Require verify job to pass before publish runs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-16 18:39:19 +02:00
parent 679053b27d
commit a9429e2118
16 changed files with 1867 additions and 87 deletions

View File

@@ -74,6 +74,7 @@ export class JobQueue<TData extends JobData = JobData> {
private readonly handlers = new Map<string, PhaseHandler<TData>>();
private readonly serializer = new SseSerializer();
private readonly controllers = new Map<string, AbortController>();
private readonly pendingWebhookDispatches = new Set<Promise<void>>();
private readonly webhookDispatcher?: WebhookDispatcher<TData>;
private readonly retentionScheduler?: RetentionScheduler<TData>;
private wakeupTimer: NodeJS.Timeout | null = null;
@@ -177,6 +178,7 @@ export class JobQueue<TData extends JobData = JobData> {
const retried = this.storage.resetForRetry(
id,
phases,
options.fromStart === false ? existing.phaseResults : {},
existing.maxAttempts,
options.scheduledAt instanceof Date ? options.scheduledAt.toISOString() : options.scheduledAt ?? null,
);
@@ -194,22 +196,14 @@ export class JobQueue<TData extends JobData = JobData> {
return job;
}
const phases = clonePhases(job.phases);
const activePhaseIndex = phases.findIndex((phase) => phase.status === 'active');
if (activePhaseIndex >= 0) {
phases[activePhaseIndex] = {
...phases[activePhaseIndex],
status: 'cancelled',
completedAt: now(),
};
}
const phases = this.markUnfinishedPhasesCancelled(clonePhases(job.phases));
const controller = this.controllers.get(id);
controller?.abort(new CancellationError());
const cancelled = this.storage.cancelJob(id, phases);
this.events.emit('job:cancelled', cancelled);
void this.dispatchWebhook('job:cancelled', cancelled);
this.queueWebhookDispatch('job:cancelled', cancelled);
this.requestPump();
return cancelled;
@@ -378,9 +372,33 @@ export class JobQueue<TData extends JobData = JobData> {
this.wakeupTimer = null;
}
await this.workerPool.drain(timeoutMs);
this.events.removeAllListeners();
this.storage.close();
let drainError: unknown;
try {
await this.workerPool.drain(timeoutMs);
} catch (error) {
drainError = error;
for (const controller of this.controllers.values()) {
if (!controller.signal.aborted) {
controller.abort(new CancellationError('Job queue shut down before active jobs finished'));
}
}
try {
await this.workerPool.drain(250);
} catch {
// Best effort: cleanup still needs to run after timeout.
}
} finally {
await this.drainWebhookDispatches();
this.events.removeAllListeners();
this.storage.close();
}
if (drainError) {
throw drainError;
}
}
private async pump(): Promise<void> {
@@ -508,19 +526,18 @@ export class JobQueue<TData extends JobData = JobData> {
this.events.emit('job:phase:completed', updated, completedPhase);
return updated;
},
onCancelled: async (phaseName, phases) => {
const activePhases = clonePhases(phases);
if (phaseName) {
const active = activePhases.find((phase) => phase.name === phaseName);
if (active) {
active.status = 'cancelled';
active.completedAt = now();
}
onCancelled: async (_phaseName, phases) => {
const current = this.requireJob(jobId);
if (current.status === 'cancelled') {
return;
}
const cancelled = this.storage.cancelJob(jobId, activePhases);
const cancelled = this.storage.cancelJob(
jobId,
this.markUnfinishedPhasesCancelled(clonePhases(phases)),
);
this.events.emit('job:cancelled', cancelled);
void this.dispatchWebhook('job:cancelled', cancelled);
this.queueWebhookDispatch('job:cancelled', cancelled);
},
});
@@ -533,14 +550,17 @@ export class JobQueue<TData extends JobData = JobData> {
const completed = this.storage.completeJob(jobId, result.phases, result.phaseResults);
this.events.emit('job:completed', completed);
void this.dispatchWebhook('job:completed', completed);
this.queueWebhookDispatch('job:completed', completed);
} catch (error) {
if (error instanceof CancellationError) {
const current = this.requireJob(jobId);
if (current.status !== 'cancelled') {
const cancelled = this.storage.cancelJob(jobId, clonePhases(current.phases));
const cancelled = this.storage.cancelJob(
jobId,
this.markUnfinishedPhasesCancelled(clonePhases(current.phases)),
);
this.events.emit('job:cancelled', cancelled);
void this.dispatchWebhook('job:cancelled', cancelled);
this.queueWebhookDispatch('job:cancelled', cancelled);
}
return;
@@ -596,7 +616,7 @@ export class JobQueue<TData extends JobData = JobData> {
};
this.events.emit('job:retrying', pending, retry);
void this.dispatchWebhook('job:retrying', pending);
this.queueWebhookDispatch('job:retrying', pending);
this.requestPump();
return;
}
@@ -608,7 +628,7 @@ export class JobQueue<TData extends JobData = JobData> {
const failed = this.storage.failJob(jobId, phases, failure);
this.events.emit('job:failed', failed, failure);
void this.dispatchWebhook('job:failed', failed);
this.queueWebhookDispatch('job:failed', failed);
}
private async markStaleJobs(cutoffIso: string): Promise<JobRecord<TData>[]> {
@@ -620,7 +640,7 @@ export class JobQueue<TData extends JobData = JobData> {
}
this.events.emit('job:stale', job);
void this.dispatchWebhook('job:stale', job);
this.queueWebhookDispatch('job:stale', job);
}
return staleJobs;
@@ -648,8 +668,10 @@ export class JobQueue<TData extends JobData = JobData> {
try {
const result = await this.webhookDispatcher.dispatch(event, job);
this.storage.markWebhookSent(job.id);
const refreshed = this.requireJob(job.id);
this.events.emit('job:webhook:delivered', refreshed, result);
const refreshed = this.getJob(job.id);
if (refreshed) {
this.events.emit('job:webhook:delivered', refreshed, result);
}
} catch (error) {
const dispatchError =
error instanceof Error && 'dispatchError' in error
@@ -666,11 +688,48 @@ export class JobQueue<TData extends JobData = JobData> {
finalAttempt: 1,
};
const refreshed = this.requireJob(job.id);
this.events.emit('job:webhook:failed', refreshed, dispatchError);
const refreshed = this.getJob(job.id);
if (refreshed) {
this.events.emit('job:webhook:failed', refreshed, dispatchError);
}
}
}
private queueWebhookDispatch(event: WebhookEventName, job: JobRecord<TData>): void {
if (!this.webhookDispatcher?.supports(event, job)) {
return;
}
const pending = this.dispatchWebhook(event, job).finally(() => {
this.pendingWebhookDispatches.delete(pending);
});
this.pendingWebhookDispatches.add(pending);
}
private async drainWebhookDispatches(): Promise<void> {
while (this.pendingWebhookDispatches.size > 0) {
await Promise.allSettled([...this.pendingWebhookDispatches]);
}
}
private markUnfinishedPhasesCancelled(phases: JobPhaseState[]): JobPhaseState[] {
const cancelledAt = now();
return phases.map((phase) => {
if (phase.status === 'completed' || phase.status === 'failed' || phase.status === 'cancelled') {
return phase;
}
return {
...phase,
status: 'cancelled',
completedAt: phase.completedAt ?? cancelledAt,
error: null,
};
});
}
private pushJobEvent(
controller: ReadableStreamDefaultController<Uint8Array>,
encoder: { encode: (input?: string) => Uint8Array },