mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-01-22 18:26:12 +01:00
🪵 refactor: Preserve Job Error State for Late Stream Subscribers (#11372)
Some checks failed
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile, librechat-dev, node) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile.multi, librechat-dev-api, api-build) (push) Waiting to run
Sync Locize Translations & Create Translation PR / Sync Translation Keys with Locize (push) Waiting to run
Sync Locize Translations & Create Translation PR / Create Translation PR on Version Published (push) Blocked by required conditions
Publish `@librechat/data-schemas` to NPM / build-and-publish (push) Has been cancelled
Some checks failed
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile, librechat-dev, node) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile.multi, librechat-dev-api, api-build) (push) Waiting to run
Sync Locize Translations & Create Translation PR / Sync Translation Keys with Locize (push) Waiting to run
Sync Locize Translations & Create Translation PR / Create Translation PR on Version Published (push) Blocked by required conditions
Publish `@librechat/data-schemas` to NPM / build-and-publish (push) Has been cancelled
* 🪵 refactor: Preserve job error state for late stream subscribers
* 🔧 fix: Enhance error handling for late subscribers in GenerationJobManager
  - Implemented a cleanup strategy for error jobs to prevent immediate deletion, allowing late clients to receive error messages.
  - Updated job status handling to prioritize error notifications over completion events.
  - Added integration tests to verify error preservation and proper notification to late subscribers, including scenarios with Redis support.
This commit is contained in:
parent
81f4af55b5
commit
c378e777ef
3 changed files with 335 additions and 11 deletions
|
|
@ -33,6 +33,7 @@ export interface GenerationJobManagerOptions {
|
|||
* @property readyPromise - Resolves immediately (legacy, kept for API compatibility)
|
||||
* @property resolveReady - Function to resolve readyPromise
|
||||
* @property finalEvent - Cached final event for late subscribers
|
||||
* @property errorEvent - Cached error event for late subscribers (errors before client connects)
|
||||
* @property syncSent - Whether sync event was sent (reset when all subscribers leave)
|
||||
* @property earlyEventBuffer - Buffer for events emitted before first subscriber connects
|
||||
* @property hasSubscriber - Whether at least one subscriber has connected
|
||||
|
|
@ -47,6 +48,7 @@ interface RuntimeJobState {
|
|||
readyPromise: Promise<void>;
|
||||
resolveReady: () => void;
|
||||
finalEvent?: t.ServerSentEvent;
|
||||
errorEvent?: string;
|
||||
syncSent: boolean;
|
||||
earlyEventBuffer: t.ServerSentEvent[];
|
||||
hasSubscriber: boolean;
|
||||
|
|
@ -421,6 +423,7 @@ class GenerationJobManagerClass {
|
|||
earlyEventBuffer: [],
|
||||
hasSubscriber: false,
|
||||
finalEvent,
|
||||
errorEvent: jobData.error,
|
||||
};
|
||||
|
||||
this.runtimeState.set(streamId, runtime);
|
||||
|
|
@ -510,6 +513,8 @@ class GenerationJobManagerClass {
|
|||
/**
|
||||
* Mark job as complete.
|
||||
* If cleanupOnComplete is true (default), immediately cleans up job resources.
|
||||
* Exception: Jobs with errors are NOT immediately deleted to allow late-connecting
|
||||
* clients to receive the error (race condition where error occurs before client connects).
|
||||
* Note: eventTransport is NOT cleaned up here to allow the final event to be
|
||||
* fully transmitted. It will be cleaned up when subscribers disconnect or
|
||||
* by the periodic cleanup job.
|
||||
|
|
@ -527,7 +532,29 @@ class GenerationJobManagerClass {
|
|||
this.jobStore.clearContentState(streamId);
|
||||
this.runStepBuffers?.delete(streamId);
|
||||
|
||||
// Immediate cleanup if configured (default: true)
|
||||
// For error jobs, DON'T delete immediately - keep around so late-connecting
|
||||
// clients can receive the error. This handles the race condition where error
|
||||
// occurs before client connects to SSE stream.
|
||||
//
|
||||
// Cleanup strategy: Error jobs are cleaned up by periodic cleanup (every 60s)
|
||||
// via jobStore.cleanup() which checks for jobs with status 'error' and
|
||||
// completedAt set. The TTL is configurable via jobStore options (default: 0,
|
||||
// meaning cleanup on next interval). This gives clients ~60s to connect and
|
||||
// receive the error before the job is removed.
|
||||
if (error) {
|
||||
await this.jobStore.updateJob(streamId, {
|
||||
status: 'error',
|
||||
completedAt: Date.now(),
|
||||
error,
|
||||
});
|
||||
// Keep runtime state so subscribe() can access errorEvent
|
||||
logger.debug(
|
||||
`[GenerationJobManager] Job completed with error (keeping for late subscribers): ${streamId}`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Immediate cleanup if configured (default: true) - only for successful completions
|
||||
if (this._cleanupOnComplete) {
|
||||
this.runtimeState.delete(streamId);
|
||||
// Don't cleanup eventTransport here - let the done event fully transmit first.
|
||||
|
|
@ -536,9 +563,8 @@ class GenerationJobManagerClass {
|
|||
} else {
|
||||
// Only update status if keeping the job around
|
||||
await this.jobStore.updateJob(streamId, {
|
||||
status: error ? 'error' : 'complete',
|
||||
status: 'complete',
|
||||
completedAt: Date.now(),
|
||||
error,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -678,14 +704,22 @@ class GenerationJobManagerClass {
|
|||
|
||||
const jobData = await this.jobStore.getJob(streamId);
|
||||
|
||||
// If job already complete, send final event
|
||||
// If job already complete/error, send final event or error
|
||||
// Error status takes precedence to ensure errors aren't misreported as successes
|
||||
setImmediate(() => {
|
||||
if (
|
||||
runtime.finalEvent &&
|
||||
jobData &&
|
||||
['complete', 'error', 'aborted'].includes(jobData.status)
|
||||
) {
|
||||
onDone?.(runtime.finalEvent);
|
||||
if (jobData && ['complete', 'error', 'aborted'].includes(jobData.status)) {
|
||||
// Check for error status FIRST and prioritize error handling
|
||||
if (jobData.status === 'error' && (runtime.errorEvent || jobData.error)) {
|
||||
const errorToSend = runtime.errorEvent ?? jobData.error;
|
||||
if (errorToSend) {
|
||||
logger.debug(
|
||||
`[GenerationJobManager] Sending stored error to late subscriber: ${streamId}`,
|
||||
);
|
||||
onError?.(errorToSend);
|
||||
}
|
||||
} else if (runtime.finalEvent) {
|
||||
onDone?.(runtime.finalEvent);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
|
|
@ -986,8 +1020,18 @@ class GenerationJobManagerClass {
|
|||
|
||||
/**
 * Emit an error event for a stream and remember it for late subscribers.
 *
 * A client may connect to the SSE stream only after the error has already
 * been emitted. To cover that case the error is:
 *   1. cached on the in-memory runtime state, when one exists for `streamId`;
 *   2. written to the job store (not awaited) for cross-replica consistency;
 *   3. forwarded to the event transport.
 *
 * @param streamId - Identifier of the stream the error belongs to
 * @param error - Error message to cache, persist, and emit
 */
emitError(streamId: string, error: string): void {
  const state = this.runtimeState.get(streamId);
  if (state) {
    // Cache on the runtime state as `errorEvent`
    state.errorEvent = error;
  }

  // Fire-and-forget persistence: a failed write is logged and never blocks
  // or prevents the emit below.
  const persisted = this.jobStore.updateJob(streamId, { error });
  persisted.catch((persistErr) => {
    logger.error(`[GenerationJobManager] Failed to persist error:`, persistErr);
  });

  this.eventTransport.emitError(streamId, error);
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue