🪵 refactor: Preserve Job Error State for Late Stream Subscribers (#11372)
Some checks failed
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile, librechat-dev, node) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile.multi, librechat-dev-api, api-build) (push) Waiting to run
Sync Locize Translations & Create Translation PR / Sync Translation Keys with Locize (push) Waiting to run
Sync Locize Translations & Create Translation PR / Create Translation PR on Version Published (push) Blocked by required conditions
Publish `@librechat/data-schemas` to NPM / build-and-publish (push) Has been cancelled

* 🪵 refactor: Preserve job error state for late stream subscribers

* 🔧 fix: Enhance error handling for late subscribers in GenerationJobManager

- Implemented a cleanup strategy for error jobs to prevent immediate deletion, allowing late clients to receive error messages.
- Updated job status handling to prioritize error notifications over completion events.
- Added integration tests to verify error preservation and proper notification to late subscribers, including scenarios with Redis support.
This commit is contained in:
Danny Avila 2026-01-15 23:02:03 -05:00 committed by GitHub
parent 81f4af55b5
commit c378e777ef
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 335 additions and 11 deletions

View file

@ -33,6 +33,7 @@ export interface GenerationJobManagerOptions {
* @property readyPromise - Resolves immediately (legacy, kept for API compatibility)
* @property resolveReady - Function to resolve readyPromise
* @property finalEvent - Cached final event for late subscribers
* @property errorEvent - Cached error event for late subscribers (errors before client connects)
* @property syncSent - Whether sync event was sent (reset when all subscribers leave)
* @property earlyEventBuffer - Buffer for events emitted before first subscriber connects
* @property hasSubscriber - Whether at least one subscriber has connected
@ -47,6 +48,7 @@ interface RuntimeJobState {
readyPromise: Promise<void>;
resolveReady: () => void;
finalEvent?: t.ServerSentEvent;
errorEvent?: string;
syncSent: boolean;
earlyEventBuffer: t.ServerSentEvent[];
hasSubscriber: boolean;
@ -421,6 +423,7 @@ class GenerationJobManagerClass {
earlyEventBuffer: [],
hasSubscriber: false,
finalEvent,
errorEvent: jobData.error,
};
this.runtimeState.set(streamId, runtime);
@ -510,6 +513,8 @@ class GenerationJobManagerClass {
/**
* Mark job as complete.
* If cleanupOnComplete is true (default), immediately cleans up job resources.
* Exception: Jobs with errors are NOT immediately deleted to allow late-connecting
* clients to receive the error (race condition where error occurs before client connects).
* Note: eventTransport is NOT cleaned up here to allow the final event to be
* fully transmitted. It will be cleaned up when subscribers disconnect or
* by the periodic cleanup job.
@ -527,7 +532,29 @@ class GenerationJobManagerClass {
this.jobStore.clearContentState(streamId);
this.runStepBuffers?.delete(streamId);
// Immediate cleanup if configured (default: true)
// For error jobs, DON'T delete immediately - keep around so late-connecting
// clients can receive the error. This handles the race condition where error
// occurs before client connects to SSE stream.
//
// Cleanup strategy: Error jobs are cleaned up by periodic cleanup (every 60s)
// via jobStore.cleanup() which checks for jobs with status 'error' and
// completedAt set. The TTL is configurable via jobStore options (default: 0,
// meaning cleanup on next interval). This gives clients ~60s to connect and
// receive the error before the job is removed.
if (error) {
await this.jobStore.updateJob(streamId, {
status: 'error',
completedAt: Date.now(),
error,
});
// Keep runtime state so subscribe() can access errorEvent
logger.debug(
`[GenerationJobManager] Job completed with error (keeping for late subscribers): ${streamId}`,
);
return;
}
// Immediate cleanup if configured (default: true) - only for successful completions
if (this._cleanupOnComplete) {
this.runtimeState.delete(streamId);
// Don't cleanup eventTransport here - let the done event fully transmit first.
@ -536,9 +563,8 @@ class GenerationJobManagerClass {
} else {
// Only update status if keeping the job around
await this.jobStore.updateJob(streamId, {
status: error ? 'error' : 'complete',
status: 'complete',
completedAt: Date.now(),
error,
});
}
@ -678,14 +704,22 @@ class GenerationJobManagerClass {
const jobData = await this.jobStore.getJob(streamId);
// If job already complete, send final event
// If job already complete/error, send final event or error
// Error status takes precedence to ensure errors aren't misreported as successes
setImmediate(() => {
if (
runtime.finalEvent &&
jobData &&
['complete', 'error', 'aborted'].includes(jobData.status)
) {
onDone?.(runtime.finalEvent);
if (jobData && ['complete', 'error', 'aborted'].includes(jobData.status)) {
// Check for error status FIRST and prioritize error handling
if (jobData.status === 'error' && (runtime.errorEvent || jobData.error)) {
const errorToSend = runtime.errorEvent ?? jobData.error;
if (errorToSend) {
logger.debug(
`[GenerationJobManager] Sending stored error to late subscriber: ${streamId}`,
);
onError?.(errorToSend);
}
} else if (runtime.finalEvent) {
onDone?.(runtime.finalEvent);
}
}
});
@ -986,8 +1020,18 @@ class GenerationJobManagerClass {
/**
* Emit an error event.
* Stores the error for late-connecting subscribers (race condition where error
* occurs before client connects to SSE stream).
*/
emitError(streamId: string, error: string): void {
const runtime = this.runtimeState.get(streamId);
if (runtime) {
runtime.errorEvent = error;
}
// Persist error to job store for cross-replica consistency
this.jobStore.updateJob(streamId, { error }).catch((err) => {
logger.error(`[GenerationJobManager] Failed to persist error:`, err);
});
this.eventTransport.emitError(streamId, error);
}