LibreChat/packages/api/src/mcp/oauth/OAuthReconnectionManager.ts

193 lines
6.6 KiB
TypeScript
Raw Normal View History

import { logger } from '@librechat/data-schemas';
import type { TokenMethods, IUser } from '@librechat/data-schemas';
import type { MCPOAuthTokens } from './types';
import { OAuthReconnectionTracker } from './OAuthReconnectionTracker';
import { FlowStateManager } from '~/flow/manager';
import { MCPManager } from '~/mcp/MCPManager';
import { MCPServersRegistry } from '~/mcp/registry/MCPServersRegistry';
/** Fallback connection timeout for a reconnection attempt when the server config provides no `initTimeout`. */
const DEFAULT_CONNECTION_TIMEOUT_MS = 10_000; // ms
/**
 * Delay between successive server reconnection attempts. Attempts are staggered
 * rather than fired all at once to reduce the risk of connection storms.
 */
const RECONNECT_STAGGER_MS = 500; // ms between each server reconnection
/**
 * Attempts to re-establish a user's OAuth-backed MCP server connections using
 * tokens that are already stored, without starting a new OAuth flow.
 *
 * In-progress and failed attempts are recorded in an `OAuthReconnectionTracker`
 * so that the same (user, server) pair is not retried while an attempt is
 * running or after it has failed.
 */
export class OAuthReconnectionManager {
  private static instance: OAuthReconnectionManager | null = null;

  protected readonly flowManager: FlowStateManager<MCPOAuthTokens | null>;
  protected readonly tokenMethods: TokenMethods;
  /** `null` when `MCPManager.getInstance()` threw at construction time. */
  private readonly mcpManager: MCPManager | null;
  private readonly reconnectionsTracker: OAuthReconnectionTracker;

  /**
   * Returns the shared instance.
   *
   * @throws Error if `createInstance` has not been called yet.
   */
  public static getInstance(): OAuthReconnectionManager {
    if (!OAuthReconnectionManager.instance) {
      throw new Error('OAuthReconnectionManager not initialized');
    }
    return OAuthReconnectionManager.instance;
  }

  /**
   * Creates and registers the shared instance.
   *
   * @param flowManager - Flow state manager forwarded to connection attempts.
   * @param tokenMethods - Token persistence methods used to look up stored OAuth tokens.
   * @param reconnections - Optional tracker to use instead of a freshly created one.
   * @returns The newly created manager.
   * @throws Error if an instance has already been created.
   */
  public static async createInstance(
    flowManager: FlowStateManager<MCPOAuthTokens | null>,
    tokenMethods: TokenMethods,
    reconnections?: OAuthReconnectionTracker,
  ): Promise<OAuthReconnectionManager> {
    if (OAuthReconnectionManager.instance != null) {
      throw new Error('OAuthReconnectionManager already initialized');
    }
    const manager = new OAuthReconnectionManager(flowManager, tokenMethods, reconnections);
    OAuthReconnectionManager.instance = manager;
    return manager;
  }

  /**
   * @param flowManager - Flow state manager forwarded to connection attempts.
   * @param tokenMethods - Token persistence methods used to look up stored OAuth tokens.
   * @param reconnections - Optional tracker; a new `OAuthReconnectionTracker` is created when omitted.
   */
  public constructor(
    flowManager: FlowStateManager<MCPOAuthTokens | null>,
    tokenMethods: TokenMethods,
    reconnections?: OAuthReconnectionTracker,
  ) {
    this.flowManager = flowManager;
    this.tokenMethods = tokenMethods;
    this.reconnectionsTracker = reconnections ?? new OAuthReconnectionTracker();

    // MCPManager may not be initialized; in that case every reconnection entry point becomes a no-op.
    try {
      this.mcpManager = MCPManager.getInstance();
    } catch {
      this.mcpManager = null;
    }
  }

  /**
   * Reports whether a reconnection attempt for the given user/server is still in progress.
   */
  public isReconnecting(userId: string, serverName: string): boolean {
    // Clean up if timed out, then return whether still reconnecting
    this.reconnectionsTracker.cleanupIfTimedOut(userId, serverName);
    return this.reconnectionsTracker.isStillReconnecting(userId, serverName);
  }

  /**
   * Starts background reconnection attempts for every OAuth server the user is
   * eligible to reconnect to. Resolves once the attempts have been scheduled,
   * not once they have completed.
   */
  public async reconnectServers(userId: string) {
    // Check if MCPManager is available
    if (this.mcpManager == null) {
      logger.warn(
        '[OAuthReconnectionManager] MCPManager not available, skipping OAuth MCP server reconnection',
      );
      return;
    }

    // 1. derive the servers to reconnect
    const serversToReconnect: string[] = [];
    for (const serverName of await MCPServersRegistry.getInstance().getOAuthServers()) {
      const canReconnect = await this.canReconnect(userId, serverName);
      if (canReconnect) {
        serversToReconnect.push(serverName);
      }
    }

    // 2. mark the servers as reconnecting
    for (const serverName of serversToReconnect) {
      this.reconnectionsTracker.setActive(userId, serverName);
    }

    // 3. attempt to reconnect the servers with staggered delays to avoid connection storms
    serversToReconnect.forEach((serverName, index) => {
      if (index === 0) {
        // the first server starts immediately; tryReconnect handles its own errors
        void this.tryReconnect(userId, serverName);
        return;
      }
      setTimeout(() => void this.tryReconnect(userId, serverName), index * RECONNECT_STAGGER_MS);
    });
  }

  /**
   * Removes both the failed and the active tracking entries for the given user/server.
   */
  public clearReconnection(userId: string, serverName: string) {
    this.reconnectionsTracker.removeFailed(userId, serverName);
    this.reconnectionsTracker.removeActive(userId, serverName);
  }

  /**
   * Performs a single reconnection attempt and updates the tracker with the outcome.
   *
   * This method is invoked fire-and-forget, so it must not reject: every
   * failure (including the config lookup and the post-failure disconnect) is
   * logged and recorded as a failed reconnection instead of being thrown.
   */
  private async tryReconnect(userId: string, serverName: string) {
    const mcpManager = this.mcpManager;
    if (mcpManager == null) {
      return;
    }

    const logPrefix = `[tryReconnectOAuthMCPServer][User: ${userId}][${serverName}]`;
    logger.info(`${logPrefix} Attempting reconnection`);

    const cleanupOnFailedReconnect = async () => {
      this.reconnectionsTracker.setFailed(userId, serverName);
      this.reconnectionsTracker.removeActive(userId, serverName);
      try {
        // awaited so that a failing disconnect cannot surface as an unhandled rejection
        await mcpManager.disconnectUserConnection(userId, serverName);
      } catch (error) {
        logger.warn(`${logPrefix} Failed to disconnect after failed reconnect: ${error}`);
      }
    };

    try {
      // looked up inside the try so that a registry failure is recorded as a failed reconnection
      const config = await MCPServersRegistry.getInstance().getServerConfig(serverName, userId);

      // attempt to get connection (this will use existing tokens and refresh if needed)
      const connection = await mcpManager.getUserConnection({
        serverName,
        user: { id: userId } as IUser,
        flowManager: this.flowManager,
        tokenMethods: this.tokenMethods,
        // don't force new connection, let it reuse existing or create new as needed
        forceNew: false,
        // set a reasonable timeout for reconnection attempts
        connectionTimeout: config?.initTimeout ?? DEFAULT_CONNECTION_TIMEOUT_MS,
        // don't trigger OAuth flow during reconnection
        returnOnOAuth: true,
      });

      if (connection && (await connection.isConnected())) {
        logger.info(`${logPrefix} Successfully reconnected`);
        this.clearReconnection(userId, serverName);
      } else {
        logger.warn(`${logPrefix} Failed to reconnect`);
        await connection?.disconnect();
        await cleanupOnFailedReconnect();
      }
    } catch (error) {
      logger.warn(`${logPrefix} Failed to reconnect: ${error}`);
      await cleanupOnFailedReconnect();
    }
  }

  /**
   * Decides whether a reconnection attempt should be made for the given user/server.
   *
   * @returns `false` when MCPManager is unavailable, the pair is tracked as
   * failed or active, a live connection already exists, no stored token is
   * found, or the stored token has expired; `true` otherwise.
   */
  private async canReconnect(userId: string, serverName: string) {
    if (this.mcpManager == null) {
      return false;
    }

    // if the server has failed reconnection, don't attempt to reconnect
    if (this.reconnectionsTracker.isFailed(userId, serverName)) {
      return false;
    }

    // if a reconnection is already tracked as active, don't start another one
    if (this.reconnectionsTracker.isActive(userId, serverName)) {
      return false;
    }

    // if the server is already connected, don't attempt to reconnect
    const existingConnections = this.mcpManager.getUserConnections(userId);
    if (existingConnections?.has(serverName)) {
      const isConnected = await existingConnections.get(serverName)?.isConnected();
      if (isConnected) {
        return false;
      }
    }

    // if the server has no tokens for the user, don't attempt to reconnect
    const accessToken = await this.tokenMethods.findToken({
      userId,
      type: 'mcp_oauth',
      identifier: `mcp:${serverName}`,
    });
    if (accessToken == null) {
      return false;
    }

    // if the token has expired, don't attempt to reconnect
    const now = new Date();
    if (accessToken.expiresAt && accessToken.expiresAt < now) {
      return false;
    }

    // …otherwise, we're good to go with the reconnect attempt
    return true;
  }
}