mirror of https://github.com/danny-avila/LibreChat.git
synced 2026-02-11 12:04:24 +01:00
🌊 fix: Prevent Truncations When Redis Resumable Streams Are Enabled (#11710)
* fix: prevent truncated responses when Redis resumable streams are enabled

  A race condition in RedisEventTransport.subscribe() caused early events (seq 0+) to be lost. The Redis SUBSCRIBE command was fired as fire-and-forget, but GenerationJobManager immediately set hasSubscriber=true, disabling the earlyEventBuffer. Events published during the gap between subscribe() returning and the Redis subscription actually taking effect were neither buffered nor received; Pub/Sub silently dropped them. This manifested as "timeout waiting for seq 0, force-flushing N messages" warnings followed by truncated or missing response text in the UI.

  The fix:
  - IEventTransport.subscribe() now returns an optional `ready` promise that resolves once the transport can actually receive messages
  - RedisEventTransport returns the Redis SUBSCRIBE acknowledgment as the `ready` promise instead of firing it as fire-and-forget
  - GenerationJobManager.subscribe() awaits `ready` before setting hasSubscriber=true, keeping the earlyEventBuffer active during the subscription window so no events are lost
  - GenerationJobManager.emitChunk() early-returns after buffering when no subscriber is connected, avoiding wasteful Redis PUBLISHes that nobody would receive

  Adds 5 regression tests covering the race condition for both in-memory and Redis transports, verifying that events emitted before subscribe are buffered and replayed, that the `ready` promise contract holds for both transport implementations, and that no events are lost across the subscribe boundary.

* refactor: Update import paths in GenerationJobManager integration tests

  - Refactored import statements in the GenerationJobManager integration test file to use absolute paths instead of relative paths, improving readability and maintainability.
  - Removed redundant imports and ensured consistent usage of the updated import structure across the test cases.

* chore: Remove redundant await from GenerationJobManager initialization in tests

  - Updated multiple test cases to call GenerationJobManager.initialize() without awaiting.
  - Ensured consistent initialization across the CollectedUsage and AbortJob test suites.

* refactor: Enhance GenerationJobManager integration tests and RedisEventTransport cleanup

  - Updated GenerationJobManager integration tests to use dynamic Redis clients and removed unnecessary awaits from initialization calls, improving test performance.
  - Refactored RedisEventTransport's destroy method to safely disconnect the subscriber, improving resource management and preventing errors during cleanup.

* feat: Enhance GenerationJobManager and RedisEventTransport for improved event handling

  - Added a resetSequence method to IEventTransport and implemented it in RedisEventTransport to manage publish sequence counters.
  - Updated GenerationJobManager to use the new resetSequence method, ensuring proper event handling during stream operations.
  - Introduced integration tests for GenerationJobManager to validate cross-replica event publishing and subscriber readiness in Redis.

* test: Add integration tests for GenerationJobManager sequence reset and error recovery with Redis

  - Introduced tests validating GenerationJobManager behavior during sequence resets, ensuring no stale events are received after a reset.
  - Added tests confirming that the sequence is not reset when a second subscriber joins mid-stream, maintaining event integrity.
  - Implemented a test for resubscription after a Redis subscribe failure, verifying that events can still be received post-error.

* fix: Update GenerationJobManager and RedisEventTransport for improved event synchronization

  - Replaced the resetSequence call with syncReorderBuffer in GenerationJobManager to handle cross-replica events without resetting the publisher sequence.
  - Added a syncReorderBuffer method to RedisEventTransport to advance the subscriber reorder buffer safely, ensuring no data loss during subscriber transitions.
  - Introduced an integration test validating that local subscribers joining do not cause data loss for cross-replica subscribers.
  - Updated existing tests to reflect the revised event handling logic.

* fix: Clear flushTimeout in RedisEventTransport to prevent potential memory leaks

  - Added logic to clear the reorder buffer's flushTimeout when resetting sequence counters, ensuring proper resource management during state transitions in RedisEventTransport.
parent 9054ca9c15
commit e646a3615e

6 changed files with 782 additions and 188 deletions
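The core of the fix is a small readiness handshake: the transport reports when its subscription is actually live, and the manager keeps buffering until then. The sketch below distills that contract into a self-contained example; the `ready` shape matches the interfaces in this commit, but `StreamRuntime`, `attach`, and the wiring are illustrative names, not code from the repository.

```ts
interface EventSubscription {
  unsubscribe: () => void;
  /** Resolves once the transport can actually receive messages */
  ready?: Promise<void>;
}

interface EventTransport {
  subscribe(streamId: string, onChunk: (event: unknown) => void): EventSubscription;
  emitChunk(streamId: string, event: unknown): void | Promise<void>;
}

/** Illustrative consumer mirroring the manager's buffer-until-ready logic */
class StreamRuntime {
  hasSubscriber = false;
  earlyEventBuffer: unknown[] = [];

  async attach(
    transport: EventTransport,
    streamId: string,
    onChunk: (event: unknown) => void,
  ): Promise<EventSubscription> {
    const subscription = transport.subscribe(streamId, onChunk);
    // The bug: flipping hasSubscriber before this await disables buffering
    // while the Redis SUBSCRIBE is still in flight, so seq 0+ can vanish.
    if (subscription.ready) {
      await subscription.ready;
    }
    this.hasSubscriber = true;
    // Replay anything emitted before the subscriber was actually connected.
    for (const event of this.earlyEventBuffer) {
      onChunk(event);
    }
    this.earlyEventBuffer = [];
    return subscription;
  }

  emitChunk(transport: EventTransport, streamId: string, event: unknown): void {
    if (!this.hasSubscriber) {
      // No one is listening yet: buffer instead of publishing into the void.
      this.earlyEventBuffer.push(event);
      return;
    }
    void transport.emitChunk(streamId, event);
  }
}
```

An in-memory transport can omit `ready` entirely; the `if` guard keeps the await zero-cost in that case.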
```diff
@@ -745,7 +745,6 @@ class GenerationJobManagerClass {
     const subscription = this.eventTransport.subscribe(streamId, {
       onChunk: (event) => {
         const e = event as t.ServerSentEvent;
         // Filter out internal events
         if (!(e as Record<string, unknown>)._internal) {
           onChunk(e);
         }
@@ -754,14 +753,15 @@ class GenerationJobManagerClass {
       onError,
     });
 
     // Check if this is the first subscriber
+    if (subscription.ready) {
+      await subscription.ready;
+    }
+
     const isFirst = this.eventTransport.isFirstSubscriber(streamId);
 
     // First subscriber: replay buffered events and mark as connected
     if (!runtime.hasSubscriber) {
       runtime.hasSubscriber = true;
 
       // Replay any events that were emitted before subscriber connected
       if (runtime.earlyEventBuffer.length > 0) {
         logger.debug(
           `[GenerationJobManager] Replaying ${runtime.earlyEventBuffer.length} buffered events for ${streamId}`,
@@ -771,6 +771,8 @@ class GenerationJobManagerClass {
         }
         runtime.earlyEventBuffer = [];
       }
 
+      this.eventTransport.syncReorderBuffer?.(streamId);
     }
 
     if (isFirst) {
@@ -823,12 +825,13 @@ class GenerationJobManagerClass {
       }
     }
 
     // Buffer early events if no subscriber yet (replay when first subscriber connects)
     if (!runtime.hasSubscriber) {
       runtime.earlyEventBuffer.push(event);
+      if (!this._isRedis) {
+        return;
+      }
     }
 
     // Await the transport emit - critical for Redis mode to maintain event order
     await this.eventTransport.emitChunk(streamId, event);
   }
```
File diff suppressed because it is too large
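The suppressed file is most likely the GenerationJobManager integration test suite described in the commit message. A hedged sketch of the central regression case, assuming Jest and the public API named in the commit message and hunks (`initialize`, `createJob`, `emitChunk`, `subscribe`); the exact handler shape and assertions are guesses:

```ts
test('events emitted before the first subscriber are buffered and replayed', async () => {
  GenerationJobManager.initialize();

  const streamId = `race-regression-${Date.now()}`;
  await GenerationJobManager.createJob(streamId, 'user-1');

  // No subscriber yet: both chunks should land in earlyEventBuffer.
  await GenerationJobManager.emitChunk(streamId, { text: 'hello' });
  await GenerationJobManager.emitChunk(streamId, { text: ' world' });

  const received: unknown[] = [];
  await GenerationJobManager.subscribe(streamId, {
    onChunk: (event) => received.push(event),
  });

  // subscribe() awaits the transport's ready promise internally, then replays.
  expect(received).toHaveLength(2);
});
```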
```diff
@@ -146,7 +146,7 @@ describe('CollectedUsage - GenerationJobManager', () => {
       cleanupOnComplete: false,
     });
 
-    await GenerationJobManager.initialize();
+    GenerationJobManager.initialize();
 
     const streamId = `manager-test-${Date.now()}`;
     await GenerationJobManager.createJob(streamId, 'user-1');
@@ -179,7 +179,7 @@ describe('CollectedUsage - GenerationJobManager', () => {
       cleanupOnComplete: false,
     });
 
-    await GenerationJobManager.initialize();
+    GenerationJobManager.initialize();
 
     const streamId = `no-usage-test-${Date.now()}`;
     await GenerationJobManager.createJob(streamId, 'user-1');
@@ -202,7 +202,7 @@ describe('CollectedUsage - GenerationJobManager', () => {
       isRedis: false,
     });
 
-    await GenerationJobManager.initialize();
+    GenerationJobManager.initialize();
 
     const collectedUsage: UsageMetadata[] = [
       { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
@@ -235,7 +235,7 @@ describe('AbortJob - Text and CollectedUsage', () => {
       cleanupOnComplete: false,
     });
 
-    await GenerationJobManager.initialize();
+    GenerationJobManager.initialize();
 
     const streamId = `text-extract-${Date.now()}`;
     await GenerationJobManager.createJob(streamId, 'user-1');
@@ -267,7 +267,7 @@ describe('AbortJob - Text and CollectedUsage', () => {
       cleanupOnComplete: false,
     });
 
-    await GenerationJobManager.initialize();
+    GenerationJobManager.initialize();
 
     const streamId = `empty-text-${Date.now()}`;
     await GenerationJobManager.createJob(streamId, 'user-1');
@@ -291,7 +291,7 @@ describe('AbortJob - Text and CollectedUsage', () => {
       cleanupOnComplete: false,
     });
 
-    await GenerationJobManager.initialize();
+    GenerationJobManager.initialize();
 
     const streamId = `full-abort-${Date.now()}`;
     await GenerationJobManager.createJob(streamId, 'user-1');
@@ -328,7 +328,7 @@ describe('AbortJob - Text and CollectedUsage', () => {
       isRedis: false,
     });
 
-    await GenerationJobManager.initialize();
+    GenerationJobManager.initialize();
 
     const abortResult = await GenerationJobManager.abortJob('non-existent-job');
 
@@ -365,7 +365,7 @@ describe('Real-world Scenarios', () => {
       cleanupOnComplete: false,
     });
 
-    await GenerationJobManager.initialize();
+    GenerationJobManager.initialize();
 
     const streamId = `parallel-abort-${Date.now()}`;
     await GenerationJobManager.createJob(streamId, 'user-1');
@@ -419,7 +419,7 @@ describe('Real-world Scenarios', () => {
       cleanupOnComplete: false,
     });
 
-    await GenerationJobManager.initialize();
+    GenerationJobManager.initialize();
 
     const streamId = `cache-abort-${Date.now()}`;
     await GenerationJobManager.createJob(streamId, 'user-1');
@@ -459,7 +459,7 @@ describe('Real-world Scenarios', () => {
       cleanupOnComplete: false,
     });
 
-    await GenerationJobManager.initialize();
+    GenerationJobManager.initialize();
 
     const streamId = `sequential-abort-${Date.now()}`;
     await GenerationJobManager.createJob(streamId, 'user-1');
```
```diff
@@ -32,7 +32,7 @@ export class InMemoryEventTransport implements IEventTransport {
       onDone?: (event: unknown) => void;
       onError?: (error: string) => void;
     },
-  ): { unsubscribe: () => void } {
+  ): { unsubscribe: () => void; ready?: Promise<void> } {
     const state = this.getOrCreateStream(streamId);
 
     const chunkHandler = (event: unknown) => handlers.onChunk(event);
```
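Since an EventEmitter listener is active the moment `on()` returns, the in-memory transport can satisfy the widened contract without a real `ready` promise. A minimal standalone sketch of such a transport (not the repository's implementation):

```ts
import { EventEmitter } from 'node:events';

class TinyInMemoryTransport {
  private emitter = new EventEmitter();

  subscribe(
    streamId: string,
    handlers: { onChunk: (event: unknown) => void },
  ): { unsubscribe: () => void; ready?: Promise<void> } {
    const handler = (event: unknown) => handlers.onChunk(event);
    this.emitter.on(`chunk:${streamId}`, handler);
    // `ready` is omitted: the listener above is already live, so a caller's
    // `if (subscription.ready) await subscription.ready;` guard is a no-op.
    return {
      unsubscribe: () => this.emitter.off(`chunk:${streamId}`, handler),
    };
  }

  emitChunk(streamId: string, event: unknown): void {
    this.emitter.emit(`chunk:${streamId}`, event);
  }
}
```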
```diff
@@ -92,8 +92,8 @@ export class RedisEventTransport implements IEventTransport {
   private subscriber: Redis | Cluster;
   /** Track subscribers per stream */
   private streams = new Map<string, StreamSubscribers>();
-  /** Track which channels we're subscribed to */
-  private subscribedChannels = new Set<string>();
+  /** Track channel subscription state: resolved promise = active, pending = in-flight */
+  private channelSubscriptions = new Map<string, Promise<void>>();
   /** Counter for generating unique subscriber IDs */
   private subscriberIdCounter = 0;
   /** Sequence counters per stream for publishing (ensures ordered delivery in cluster mode) */
@@ -122,9 +122,32 @@ export class RedisEventTransport implements IEventTransport {
     return current;
   }
 
-  /** Reset sequence counter for a stream */
-  private resetSequence(streamId: string): void {
+  /** Reset publish sequence counter and subscriber reorder state for a stream (full cleanup only) */
+  resetSequence(streamId: string): void {
     this.sequenceCounters.delete(streamId);
+    const state = this.streams.get(streamId);
+    if (state) {
+      if (state.reorderBuffer.flushTimeout) {
+        clearTimeout(state.reorderBuffer.flushTimeout);
+        state.reorderBuffer.flushTimeout = null;
+      }
+      state.reorderBuffer.nextSeq = 0;
+      state.reorderBuffer.pending.clear();
+    }
   }
 
+  /** Advance subscriber reorder buffer to current publisher sequence without resetting publisher (cross-replica safe) */
+  syncReorderBuffer(streamId: string): void {
+    const currentSeq = this.sequenceCounters.get(streamId) ?? 0;
+    const state = this.streams.get(streamId);
+    if (state) {
+      if (state.reorderBuffer.flushTimeout) {
+        clearTimeout(state.reorderBuffer.flushTimeout);
+        state.reorderBuffer.flushTimeout = null;
+      }
+      state.reorderBuffer.nextSeq = currentSeq;
+      state.reorderBuffer.pending.clear();
+    }
+  }
 
   /**
```
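The split between these two methods is the crux of the cross-replica fix: `resetSequence` zeroes both the publisher counter and the reorder state (safe only at full stream teardown), while `syncReorderBuffer` fast-forwards just the local reorder buffer, so a subscriber joining mid-stream neither waits on nor replays sequence numbers that predate it. A toy model of the reorder buffer, with invented numbers, to make that concrete:

```ts
type ReorderBuffer = { nextSeq: number; pending: Map<number, unknown> };

// Fast-forward only the subscriber side (what syncReorderBuffer does):
// skip sequence numbers consumed before this subscriber attached.
function syncToPublisher(buf: ReorderBuffer, publisherSeq: number): void {
  buf.nextSeq = publisherSeq;
  buf.pending.clear();
}

// Suppose this replica has already published seq 0..41 for the stream.
const publisherSeq = 42;
const buf: ReorderBuffer = { nextSeq: 0, pending: new Map() };

// A local subscriber joins mid-stream. Without syncing, it would wait for
// seq 0 (long gone) and eventually force-flush; after syncing it accepts
// seq 42 immediately.
syncToPublisher(buf, publisherSeq);
console.log(buf.nextSeq); // 42

// Resetting the publisher counter here instead would restart publishing at
// seq 0, which a cross-replica subscriber (still expecting 42) would treat
// as stale or out-of-order: exactly the data loss the split avoids.
```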
```diff
@@ -331,7 +354,7 @@ export class RedisEventTransport implements IEventTransport {
       onDone?: (event: unknown) => void;
       onError?: (error: string) => void;
     },
-  ): { unsubscribe: () => void } {
+  ): { unsubscribe: () => void; ready?: Promise<void> } {
     const channel = CHANNELS.events(streamId);
     const subscriberId = `sub_${++this.subscriberIdCounter}`;
@@ -354,16 +377,23 @@ export class RedisEventTransport implements IEventTransport {
     streamState.count++;
     streamState.handlers.set(subscriberId, handlers);
 
     // Subscribe to Redis channel if this is first subscriber
-    if (!this.subscribedChannels.has(channel)) {
-      this.subscribedChannels.add(channel);
-      this.subscriber.subscribe(channel).catch((err) => {
-        logger.error(`[RedisEventTransport] Failed to subscribe to ${channel}:`, err);
-      });
+    let readyPromise = this.channelSubscriptions.get(channel);
+
+    if (!readyPromise) {
+      readyPromise = this.subscriber
+        .subscribe(channel)
+        .then(() => {
+          logger.debug(`[RedisEventTransport] Subscription active for channel ${channel}`);
+        })
+        .catch((err) => {
+          this.channelSubscriptions.delete(channel);
+          logger.error(`[RedisEventTransport] Failed to subscribe to ${channel}:`, err);
+        });
+      this.channelSubscriptions.set(channel, readyPromise);
     }
 
     // Return unsubscribe function
     return {
+      ready: readyPromise,
       unsubscribe: () => {
         const state = this.streams.get(streamId);
         if (!state) {
```
```diff
@@ -385,7 +415,7 @@ export class RedisEventTransport implements IEventTransport {
         this.subscriber.unsubscribe(channel).catch((err) => {
           logger.error(`[RedisEventTransport] Failed to unsubscribe from ${channel}:`, err);
         });
-        this.subscribedChannels.delete(channel);
+        this.channelSubscriptions.delete(channel);
 
         // Call all-subscribers-left callbacks
         for (const callback of state.allSubscribersLeftCallbacks) {
@@ -532,12 +562,15 @@ export class RedisEventTransport implements IEventTransport {
 
     state.abortCallbacks.push(callback);
 
     // Subscribe to Redis channel if not already subscribed
-    if (!this.subscribedChannels.has(channel)) {
-      this.subscribedChannels.add(channel);
-      this.subscriber.subscribe(channel).catch((err) => {
-        logger.error(`[RedisEventTransport] Failed to subscribe to ${channel}:`, err);
-      });
+    if (!this.channelSubscriptions.has(channel)) {
+      const ready = this.subscriber
+        .subscribe(channel)
+        .then(() => {})
+        .catch((err) => {
+          this.channelSubscriptions.delete(channel);
+          logger.error(`[RedisEventTransport] Failed to subscribe to ${channel}:`, err);
+        });
+      this.channelSubscriptions.set(channel, ready);
     }
@@ -571,12 +604,11 @@ export class RedisEventTransport implements IEventTransport {
     // Reset sequence counter for this stream
     this.resetSequence(streamId);
 
     // Unsubscribe from Redis channel
-    if (this.subscribedChannels.has(channel)) {
+    if (this.channelSubscriptions.has(channel)) {
       this.subscriber.unsubscribe(channel).catch((err) => {
         logger.error(`[RedisEventTransport] Failed to cleanup ${channel}:`, err);
       });
-      this.subscribedChannels.delete(channel);
+      this.channelSubscriptions.delete(channel);
     }
 
     this.streams.delete(streamId);
@@ -595,18 +627,20 @@ export class RedisEventTransport implements IEventTransport {
       state.reorderBuffer.pending.clear();
     }
 
     // Unsubscribe from all channels
-    for (const channel of this.subscribedChannels) {
-      this.subscriber.unsubscribe(channel).catch(() => {
-        // Ignore errors during shutdown
-      });
+    for (const channel of this.channelSubscriptions.keys()) {
+      this.subscriber.unsubscribe(channel).catch(() => {});
     }
 
-    this.subscribedChannels.clear();
+    this.channelSubscriptions.clear();
     this.streams.clear();
     this.sequenceCounters.clear();
 
     // Note: Don't close Redis connections - they may be shared
+    try {
+      this.subscriber.disconnect();
+    } catch {
+      /* ignore */
+    }
 
     logger.info('[RedisEventTransport] Destroyed');
   }
 }
```
```diff
@@ -286,7 +286,7 @@ export interface IJobStore {
  * Implementations can use EventEmitter, Redis Pub/Sub, etc.
  */
 export interface IEventTransport {
-  /** Subscribe to events for a stream */
+  /** Subscribe to events for a stream. `ready` resolves once the transport can receive messages. */
   subscribe(
     streamId: string,
     handlers: {
@@ -294,7 +294,7 @@ export interface IEventTransport {
       onDone?: (event: unknown) => void;
       onError?: (error: string) => void;
     },
-  ): { unsubscribe: () => void };
+  ): { unsubscribe: () => void; ready?: Promise<void> };
 
   /** Publish a chunk event - returns Promise in Redis mode for ordered delivery */
   emitChunk(streamId: string, event: unknown): void | Promise<void>;
@@ -329,6 +329,12 @@ export interface IEventTransport {
   /** Listen for all subscribers leaving */
   onAllSubscribersLeft(streamId: string, callback: () => void): void;
 
+  /** Reset publish sequence counter for a stream (used during full stream cleanup) */
+  resetSequence?(streamId: string): void;
+
+  /** Advance subscriber reorder buffer to match publisher sequence (cross-replica safe: doesn't reset publisher counter) */
+  syncReorderBuffer?(streamId: string): void;
+
   /** Cleanup transport resources for a specific stream */
   cleanup(streamId: string): void;
```
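Because both new members are optional, existing transports keep type-checking unchanged and callers go through optional chaining, as the GenerationJobManager hunk above does with `this.eventTransport.syncReorderBuffer?.(streamId)`. A tiny illustration of a call site under that assumption (the helper name is hypothetical; `IEventTransport` is the interface from this diff):

```ts
// Hypothetical helper: safe on any IEventTransport implementation.
function onFirstSubscriberAttached(transport: IEventTransport, streamId: string): void {
  // In-memory transport: member undefined, the call is a no-op.
  // Redis transport: fast-forwards the reorder buffer to the publisher sequence.
  transport.syncReorderBuffer?.(streamId);
}
```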