🪐 feat: Cross-replica support in GenerationJobManager for Redis mode (#11233)
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run

* feat: Implement cross-replica support in GenerationJobManager for Redis mode

- Enhanced GenerationJobManager to support cross-replica scenarios by lazily creating runtime states from Redis when jobs exist but are not present in local memory.
- Added functionality to persist `syncSent` and `finalEvent` states to Redis for consistency across replicas.
- Implemented abort signal handling to allow replicas to receive and respond to abort requests from other instances, ensuring proper job termination.
- Updated tests to validate cross-replica behavior, including job retrieval, subscription, and abort signal propagation.

This update improves the robustness and reliability of job management in distributed environments.

* fix: Enhance error handling and implement abort signal for cross-replica jobs in GenerationJobManager

- Added error handling for Redis job updates in GenerationJobManager to log failures when persisting `syncSent` and `finalEvent` states.
- Implemented a listener for cross-replica abort signals, ensuring that lazily-initialized jobs can respond to abort requests from other replicas.
- Introduced a new integration test to validate the handling of abort signals for lazily-initialized jobs across replicas.

These changes improve the reliability and robustness of job management in distributed environments.
This commit is contained in:
Danny Avila 2026-01-06 11:39:24 -05:00 committed by GitHub
parent b5aa38ff33
commit a7645f4705
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 826 additions and 7 deletions

View file

@ -227,13 +227,17 @@ class GenerationJobManagerClass {
/**
* Set up all-subscribers-left callback.
* When all SSE clients disconnect, this:
* 1. Resets syncSent so reconnecting clients get sync event
* 1. Resets syncSent so reconnecting clients get sync event (persisted to Redis)
* 2. Calls any registered allSubscribersLeft handlers (e.g., to save partial responses)
*/
this.eventTransport.onAllSubscribersLeft(streamId, () => {
const currentRuntime = this.runtimeState.get(streamId);
if (currentRuntime) {
currentRuntime.syncSent = false;
// Persist syncSent=false to Redis for cross-replica consistency
this.jobStore.updateJob(streamId, { syncSent: false }).catch((err) => {
logger.error(`[GenerationJobManager] Failed to persist syncSent=false:`, err);
});
// Call registered handlers (from job.emitter.on('allSubscribersLeft', ...))
if (currentRuntime.allSubscribersLeftHandlers) {
this.jobStore
@ -258,6 +262,21 @@ class GenerationJobManagerClass {
}
});
/**
* Set up cross-replica abort listener (Redis mode only).
* When abort is triggered on ANY replica, this replica receives the signal
* and aborts its local AbortController (if it's the one running generation).
*/
if (this.eventTransport.onAbort) {
this.eventTransport.onAbort(streamId, () => {
const currentRuntime = this.runtimeState.get(streamId);
if (currentRuntime && !currentRuntime.abortController.signal.aborted) {
logger.debug(`[GenerationJobManager] Received cross-replica abort for ${streamId}`);
currentRuntime.abortController.abort();
}
});
}
logger.debug(`[GenerationJobManager] Created job: ${streamId}`);
// Return facade for backwards compatibility
@ -343,15 +362,133 @@ class GenerationJobManagerClass {
};
}
/**
* Get or create runtime state for a job.
*
* This enables cross-replica support in Redis mode:
* - If runtime exists locally (same replica), return it
* - If job exists in Redis but not locally (cross-replica), create minimal runtime
*
* The lazily-created runtime state is sufficient for:
* - Subscribing to events (via Redis pub/sub)
* - Getting resume state
* - Handling reconnections
* - Receiving cross-replica abort signals (via Redis pub/sub)
*
* @param streamId - The stream identifier
* @returns Runtime state or null if job doesn't exist anywhere
*/
private async getOrCreateRuntimeState(streamId: string): Promise<RuntimeJobState | null> {
const existingRuntime = this.runtimeState.get(streamId);
if (existingRuntime) {
return existingRuntime;
}
// Job doesn't exist locally - check Redis
const jobData = await this.jobStore.getJob(streamId);
if (!jobData) {
return null;
}
// Cross-replica scenario: job exists in Redis but not locally
// Create minimal runtime state for handling reconnection/subscription
logger.debug(`[GenerationJobManager] Creating cross-replica runtime for ${streamId}`);
let resolveReady: () => void;
const readyPromise = new Promise<void>((resolve) => {
resolveReady = resolve;
});
// For jobs created on other replicas, readyPromise should be pre-resolved
// since generation has already started
resolveReady!();
// Parse finalEvent from Redis if available
let finalEvent: t.ServerSentEvent | undefined;
if (jobData.finalEvent) {
try {
finalEvent = JSON.parse(jobData.finalEvent) as t.ServerSentEvent;
} catch {
// Ignore parse errors
}
}
const runtime: RuntimeJobState = {
abortController: new AbortController(),
readyPromise,
resolveReady: resolveReady!,
syncSent: jobData.syncSent ?? false,
earlyEventBuffer: [],
hasSubscriber: false,
finalEvent,
};
this.runtimeState.set(streamId, runtime);
// Set up all-subscribers-left callback for this replica
this.eventTransport.onAllSubscribersLeft(streamId, () => {
const currentRuntime = this.runtimeState.get(streamId);
if (currentRuntime) {
currentRuntime.syncSent = false;
// Persist syncSent=false to Redis
this.jobStore.updateJob(streamId, { syncSent: false }).catch((err) => {
logger.error(`[GenerationJobManager] Failed to persist syncSent=false:`, err);
});
// Call registered handlers
if (currentRuntime.allSubscribersLeftHandlers) {
this.jobStore
.getContentParts(streamId)
.then((result) => {
const parts = result?.content ?? [];
for (const handler of currentRuntime.allSubscribersLeftHandlers ?? []) {
try {
handler(parts);
} catch (err) {
logger.error(`[GenerationJobManager] Error in allSubscribersLeft handler:`, err);
}
}
})
.catch((err) => {
logger.error(
`[GenerationJobManager] Failed to get content parts for allSubscribersLeft handlers:`,
err,
);
});
}
}
});
// Set up cross-replica abort listener (Redis mode only)
// This ensures lazily-initialized jobs can receive abort signals
if (this.eventTransport.onAbort) {
this.eventTransport.onAbort(streamId, () => {
const currentRuntime = this.runtimeState.get(streamId);
if (currentRuntime && !currentRuntime.abortController.signal.aborted) {
logger.debug(
`[GenerationJobManager] Received cross-replica abort for lazily-init job ${streamId}`,
);
currentRuntime.abortController.abort();
}
});
}
return runtime;
}
/**
* Get a job by streamId.
*/
async getJob(streamId: string): Promise<t.GenerationJob | undefined> {
const jobData = await this.jobStore.getJob(streamId);
const runtime = this.runtimeState.get(streamId);
if (!jobData || !runtime) {
if (!jobData) {
return undefined;
}
const runtime = await this.getOrCreateRuntimeState(streamId);
if (!runtime) {
return undefined;
}
return this.buildJobFacade(streamId, jobData, runtime);
}
@ -411,6 +548,10 @@ class GenerationJobManagerClass {
/**
* Abort a job (user-initiated).
* Returns all data needed for token spending and message saving.
*
* Cross-replica support (Redis mode):
* - Emits abort signal via Redis pub/sub
* - The replica running generation receives signal and aborts its AbortController
*/
async abortJob(streamId: string): Promise<AbortResult> {
const jobData = await this.jobStore.getJob(streamId);
@ -421,6 +562,13 @@ class GenerationJobManagerClass {
return { success: false, jobData: null, content: [], finalEvent: null };
}
// Emit abort signal for cross-replica support (Redis mode)
// This ensures the generating replica receives the abort signal
if (this.eventTransport.emitAbort) {
this.eventTransport.emitAbort(streamId);
}
// Also abort local controller if we have it (same-replica abort)
if (runtime) {
runtime.abortController.abort();
}
@ -506,6 +654,10 @@ class GenerationJobManagerClass {
* - Resolves readyPromise (legacy, for API compatibility)
* - Replays any buffered early events (e.g., 'created' event)
*
* Supports cross-replica reconnection in Redis mode:
* - If job exists in Redis but not locally, creates minimal runtime state
* - Events are delivered via Redis pub/sub, not in-memory EventEmitter
*
* @param streamId - The stream to subscribe to
* @param onChunk - Handler for chunk events (streamed tokens, run steps, etc.)
* @param onDone - Handler for completion event (includes final message)
@ -518,7 +670,8 @@ class GenerationJobManagerClass {
onDone?: t.DoneHandler,
onError?: t.ErrorHandler,
): Promise<{ unsubscribe: t.UnsubscribeFn } | null> {
const runtime = this.runtimeState.get(streamId);
// Use lazy initialization to support cross-replica subscriptions
const runtime = await this.getOrCreateRuntimeState(streamId);
if (!runtime) {
return null;
}
@ -788,29 +941,46 @@ class GenerationJobManagerClass {
/**
* Mark that sync has been sent.
* Persists to Redis for cross-replica consistency.
*/
markSyncSent(streamId: string): void {
const runtime = this.runtimeState.get(streamId);
if (runtime) {
runtime.syncSent = true;
}
// Persist to Redis for cross-replica consistency
this.jobStore.updateJob(streamId, { syncSent: true }).catch((err) => {
logger.error(`[GenerationJobManager] Failed to persist syncSent flag:`, err);
});
}
/**
* Check if sync has been sent.
* Checks local runtime first, then falls back to Redis for cross-replica scenarios.
*/
wasSyncSent(streamId: string): boolean {
return this.runtimeState.get(streamId)?.syncSent ?? false;
async wasSyncSent(streamId: string): Promise<boolean> {
const localSyncSent = this.runtimeState.get(streamId)?.syncSent;
if (localSyncSent !== undefined) {
return localSyncSent;
}
// Cross-replica: check Redis
const jobData = await this.jobStore.getJob(streamId);
return jobData?.syncSent ?? false;
}
/**
* Emit a done event.
* Persists finalEvent to Redis for cross-replica access.
*/
emitDone(streamId: string, event: t.ServerSentEvent): void {
const runtime = this.runtimeState.get(streamId);
if (runtime) {
runtime.finalEvent = event;
}
// Persist finalEvent to Redis for cross-replica consistency
this.jobStore.updateJob(streamId, { finalEvent: JSON.stringify(event) }).catch((err) => {
logger.error(`[GenerationJobManager] Failed to persist finalEvent:`, err);
});
this.eventTransport.emitDone(streamId, event);
}