mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-01-22 18:26:12 +01:00
🪐 feat: Cross-replica support in GenerationJobManager for Redis mode (#11233)
* feat: Implement cross-replica support in GenerationJobManager for Redis mode - Enhanced GenerationJobManager to support cross-replica scenarios by lazily creating runtime states from Redis when jobs exist but are not present in local memory. - Added functionality to persist `syncSent` and `finalEvent` states to Redis for consistency across replicas. - Implemented abort signal handling to allow replicas to receive and respond to abort requests from other instances, ensuring proper job termination. - Updated tests to validate cross-replica behavior, including job retrieval, subscription, and abort signal propagation. This update improves the robustness and reliability of job management in distributed environments. * fix: Enhance error handling and implement abort signal for cross-replica jobs in GenerationJobManager - Added error handling for Redis job updates in GenerationJobManager to log failures when persisting `syncSent` and `finalEvent` states. - Implemented a listener for cross-replica abort signals, ensuring that lazily-initialized jobs can respond to abort requests from other replicas. - Introduced a new integration test to validate the handling of abort signals for lazily-initialized jobs across replicas. These changes improve the reliability and robustness of job management in distributed environments.
This commit is contained in:
parent
b5aa38ff33
commit
a7645f4705
5 changed files with 826 additions and 7 deletions
|
|
@ -227,13 +227,17 @@ class GenerationJobManagerClass {
|
|||
/**
|
||||
* Set up all-subscribers-left callback.
|
||||
* When all SSE clients disconnect, this:
|
||||
* 1. Resets syncSent so reconnecting clients get sync event
|
||||
* 1. Resets syncSent so reconnecting clients get sync event (persisted to Redis)
|
||||
* 2. Calls any registered allSubscribersLeft handlers (e.g., to save partial responses)
|
||||
*/
|
||||
this.eventTransport.onAllSubscribersLeft(streamId, () => {
|
||||
const currentRuntime = this.runtimeState.get(streamId);
|
||||
if (currentRuntime) {
|
||||
currentRuntime.syncSent = false;
|
||||
// Persist syncSent=false to Redis for cross-replica consistency
|
||||
this.jobStore.updateJob(streamId, { syncSent: false }).catch((err) => {
|
||||
logger.error(`[GenerationJobManager] Failed to persist syncSent=false:`, err);
|
||||
});
|
||||
// Call registered handlers (from job.emitter.on('allSubscribersLeft', ...))
|
||||
if (currentRuntime.allSubscribersLeftHandlers) {
|
||||
this.jobStore
|
||||
|
|
@ -258,6 +262,21 @@ class GenerationJobManagerClass {
|
|||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* Set up cross-replica abort listener (Redis mode only).
|
||||
* When abort is triggered on ANY replica, this replica receives the signal
|
||||
* and aborts its local AbortController (if it's the one running generation).
|
||||
*/
|
||||
if (this.eventTransport.onAbort) {
|
||||
this.eventTransport.onAbort(streamId, () => {
|
||||
const currentRuntime = this.runtimeState.get(streamId);
|
||||
if (currentRuntime && !currentRuntime.abortController.signal.aborted) {
|
||||
logger.debug(`[GenerationJobManager] Received cross-replica abort for ${streamId}`);
|
||||
currentRuntime.abortController.abort();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
logger.debug(`[GenerationJobManager] Created job: ${streamId}`);
|
||||
|
||||
// Return facade for backwards compatibility
|
||||
|
|
@ -343,15 +362,133 @@ class GenerationJobManagerClass {
|
|||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get or create runtime state for a job.
|
||||
*
|
||||
* This enables cross-replica support in Redis mode:
|
||||
* - If runtime exists locally (same replica), return it
|
||||
* - If job exists in Redis but not locally (cross-replica), create minimal runtime
|
||||
*
|
||||
* The lazily-created runtime state is sufficient for:
|
||||
* - Subscribing to events (via Redis pub/sub)
|
||||
* - Getting resume state
|
||||
* - Handling reconnections
|
||||
* - Receiving cross-replica abort signals (via Redis pub/sub)
|
||||
*
|
||||
* @param streamId - The stream identifier
|
||||
* @returns Runtime state or null if job doesn't exist anywhere
|
||||
*/
|
||||
private async getOrCreateRuntimeState(streamId: string): Promise<RuntimeJobState | null> {
|
||||
const existingRuntime = this.runtimeState.get(streamId);
|
||||
if (existingRuntime) {
|
||||
return existingRuntime;
|
||||
}
|
||||
|
||||
// Job doesn't exist locally - check Redis
|
||||
const jobData = await this.jobStore.getJob(streamId);
|
||||
if (!jobData) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Cross-replica scenario: job exists in Redis but not locally
|
||||
// Create minimal runtime state for handling reconnection/subscription
|
||||
logger.debug(`[GenerationJobManager] Creating cross-replica runtime for ${streamId}`);
|
||||
|
||||
let resolveReady: () => void;
|
||||
const readyPromise = new Promise<void>((resolve) => {
|
||||
resolveReady = resolve;
|
||||
});
|
||||
|
||||
// For jobs created on other replicas, readyPromise should be pre-resolved
|
||||
// since generation has already started
|
||||
resolveReady!();
|
||||
|
||||
// Parse finalEvent from Redis if available
|
||||
let finalEvent: t.ServerSentEvent | undefined;
|
||||
if (jobData.finalEvent) {
|
||||
try {
|
||||
finalEvent = JSON.parse(jobData.finalEvent) as t.ServerSentEvent;
|
||||
} catch {
|
||||
// Ignore parse errors
|
||||
}
|
||||
}
|
||||
|
||||
const runtime: RuntimeJobState = {
|
||||
abortController: new AbortController(),
|
||||
readyPromise,
|
||||
resolveReady: resolveReady!,
|
||||
syncSent: jobData.syncSent ?? false,
|
||||
earlyEventBuffer: [],
|
||||
hasSubscriber: false,
|
||||
finalEvent,
|
||||
};
|
||||
|
||||
this.runtimeState.set(streamId, runtime);
|
||||
|
||||
// Set up all-subscribers-left callback for this replica
|
||||
this.eventTransport.onAllSubscribersLeft(streamId, () => {
|
||||
const currentRuntime = this.runtimeState.get(streamId);
|
||||
if (currentRuntime) {
|
||||
currentRuntime.syncSent = false;
|
||||
// Persist syncSent=false to Redis
|
||||
this.jobStore.updateJob(streamId, { syncSent: false }).catch((err) => {
|
||||
logger.error(`[GenerationJobManager] Failed to persist syncSent=false:`, err);
|
||||
});
|
||||
// Call registered handlers
|
||||
if (currentRuntime.allSubscribersLeftHandlers) {
|
||||
this.jobStore
|
||||
.getContentParts(streamId)
|
||||
.then((result) => {
|
||||
const parts = result?.content ?? [];
|
||||
for (const handler of currentRuntime.allSubscribersLeftHandlers ?? []) {
|
||||
try {
|
||||
handler(parts);
|
||||
} catch (err) {
|
||||
logger.error(`[GenerationJobManager] Error in allSubscribersLeft handler:`, err);
|
||||
}
|
||||
}
|
||||
})
|
||||
.catch((err) => {
|
||||
logger.error(
|
||||
`[GenerationJobManager] Failed to get content parts for allSubscribersLeft handlers:`,
|
||||
err,
|
||||
);
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Set up cross-replica abort listener (Redis mode only)
|
||||
// This ensures lazily-initialized jobs can receive abort signals
|
||||
if (this.eventTransport.onAbort) {
|
||||
this.eventTransport.onAbort(streamId, () => {
|
||||
const currentRuntime = this.runtimeState.get(streamId);
|
||||
if (currentRuntime && !currentRuntime.abortController.signal.aborted) {
|
||||
logger.debug(
|
||||
`[GenerationJobManager] Received cross-replica abort for lazily-init job ${streamId}`,
|
||||
);
|
||||
currentRuntime.abortController.abort();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return runtime;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a job by streamId.
|
||||
*/
|
||||
async getJob(streamId: string): Promise<t.GenerationJob | undefined> {
|
||||
const jobData = await this.jobStore.getJob(streamId);
|
||||
const runtime = this.runtimeState.get(streamId);
|
||||
if (!jobData || !runtime) {
|
||||
if (!jobData) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const runtime = await this.getOrCreateRuntimeState(streamId);
|
||||
if (!runtime) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
return this.buildJobFacade(streamId, jobData, runtime);
|
||||
}
|
||||
|
||||
|
|
@ -411,6 +548,10 @@ class GenerationJobManagerClass {
|
|||
/**
|
||||
* Abort a job (user-initiated).
|
||||
* Returns all data needed for token spending and message saving.
|
||||
*
|
||||
* Cross-replica support (Redis mode):
|
||||
* - Emits abort signal via Redis pub/sub
|
||||
* - The replica running generation receives signal and aborts its AbortController
|
||||
*/
|
||||
async abortJob(streamId: string): Promise<AbortResult> {
|
||||
const jobData = await this.jobStore.getJob(streamId);
|
||||
|
|
@ -421,6 +562,13 @@ class GenerationJobManagerClass {
|
|||
return { success: false, jobData: null, content: [], finalEvent: null };
|
||||
}
|
||||
|
||||
// Emit abort signal for cross-replica support (Redis mode)
|
||||
// This ensures the generating replica receives the abort signal
|
||||
if (this.eventTransport.emitAbort) {
|
||||
this.eventTransport.emitAbort(streamId);
|
||||
}
|
||||
|
||||
// Also abort local controller if we have it (same-replica abort)
|
||||
if (runtime) {
|
||||
runtime.abortController.abort();
|
||||
}
|
||||
|
|
@ -506,6 +654,10 @@ class GenerationJobManagerClass {
|
|||
* - Resolves readyPromise (legacy, for API compatibility)
|
||||
* - Replays any buffered early events (e.g., 'created' event)
|
||||
*
|
||||
* Supports cross-replica reconnection in Redis mode:
|
||||
* - If job exists in Redis but not locally, creates minimal runtime state
|
||||
* - Events are delivered via Redis pub/sub, not in-memory EventEmitter
|
||||
*
|
||||
* @param streamId - The stream to subscribe to
|
||||
* @param onChunk - Handler for chunk events (streamed tokens, run steps, etc.)
|
||||
* @param onDone - Handler for completion event (includes final message)
|
||||
|
|
@ -518,7 +670,8 @@ class GenerationJobManagerClass {
|
|||
onDone?: t.DoneHandler,
|
||||
onError?: t.ErrorHandler,
|
||||
): Promise<{ unsubscribe: t.UnsubscribeFn } | null> {
|
||||
const runtime = this.runtimeState.get(streamId);
|
||||
// Use lazy initialization to support cross-replica subscriptions
|
||||
const runtime = await this.getOrCreateRuntimeState(streamId);
|
||||
if (!runtime) {
|
||||
return null;
|
||||
}
|
||||
|
|
@ -788,29 +941,46 @@ class GenerationJobManagerClass {
|
|||
|
||||
/**
|
||||
* Mark that sync has been sent.
|
||||
* Persists to Redis for cross-replica consistency.
|
||||
*/
|
||||
markSyncSent(streamId: string): void {
|
||||
const runtime = this.runtimeState.get(streamId);
|
||||
if (runtime) {
|
||||
runtime.syncSent = true;
|
||||
}
|
||||
// Persist to Redis for cross-replica consistency
|
||||
this.jobStore.updateJob(streamId, { syncSent: true }).catch((err) => {
|
||||
logger.error(`[GenerationJobManager] Failed to persist syncSent flag:`, err);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if sync has been sent.
|
||||
* Checks local runtime first, then falls back to Redis for cross-replica scenarios.
|
||||
*/
|
||||
wasSyncSent(streamId: string): boolean {
|
||||
return this.runtimeState.get(streamId)?.syncSent ?? false;
|
||||
async wasSyncSent(streamId: string): Promise<boolean> {
|
||||
const localSyncSent = this.runtimeState.get(streamId)?.syncSent;
|
||||
if (localSyncSent !== undefined) {
|
||||
return localSyncSent;
|
||||
}
|
||||
// Cross-replica: check Redis
|
||||
const jobData = await this.jobStore.getJob(streamId);
|
||||
return jobData?.syncSent ?? false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit a done event.
|
||||
* Persists finalEvent to Redis for cross-replica access.
|
||||
*/
|
||||
emitDone(streamId: string, event: t.ServerSentEvent): void {
|
||||
const runtime = this.runtimeState.get(streamId);
|
||||
if (runtime) {
|
||||
runtime.finalEvent = event;
|
||||
}
|
||||
// Persist finalEvent to Redis for cross-replica consistency
|
||||
this.jobStore.updateJob(streamId, { finalEvent: JSON.stringify(event) }).catch((err) => {
|
||||
logger.error(`[GenerationJobManager] Failed to persist finalEvent:`, err);
|
||||
});
|
||||
this.eventTransport.emitDone(streamId, event);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue