LibreChat/packages/api/src/mcp/oauth/OAuthReconnectionManager.ts

223 lines
7.3 KiB
TypeScript
Raw Normal View History

import { logger } from '@librechat/data-schemas';
import type { TokenMethods, IUser } from '@librechat/data-schemas';
import type { MCPOAuthTokens } from './types';
import { OAuthReconnectionTracker } from './OAuthReconnectionTracker';
import { FlowStateManager } from '~/flow/manager';
import { MCPManager } from '~/mcp/MCPManager';
import { MCPServersRegistry } from '~/mcp/registry/MCPServersRegistry';
/** Fallback connection timeout (ms) for a reconnection attempt when the server config has no `initTimeout`. */
const DEFAULT_CONNECTION_TIMEOUT_MS = 10_000;
/** Delay (ms) inserted between successive server reconnection attempts to avoid connection storms. */
const RECONNECT_STAGGER_MS = 500;
/**
 * Coordinates reconnection attempts for OAuth-backed MCP servers on behalf of a user.
 *
 * A shared instance is registered with {@link OAuthReconnectionManager.createInstance} and
 * retrieved with {@link OAuthReconnectionManager.getInstance}. Per-user/per-server
 * reconnection state (active / failed) is delegated to an {@link OAuthReconnectionTracker}.
 */
export class OAuthReconnectionManager {
  private static instance: OAuthReconnectionManager | null = null;

  protected readonly flowManager: FlowStateManager<MCPOAuthTokens | null>;
  protected readonly tokenMethods: TokenMethods;

  /** `null` when `MCPManager.getInstance()` threw at construction time; every operation is then a no-op. */
  private readonly mcpManager: MCPManager | null;
  private readonly reconnectionsTracker: OAuthReconnectionTracker;

  /**
   * Returns the shared instance.
   * @throws Error if {@link OAuthReconnectionManager.createInstance} has not been called yet.
   */
  public static getInstance(): OAuthReconnectionManager {
    if (!OAuthReconnectionManager.instance) {
      throw new Error('OAuthReconnectionManager not initialized');
    }
    return OAuthReconnectionManager.instance;
  }

  /**
   * Creates and registers the shared instance.
   * @param flowManager - Flow state manager forwarded to `MCPManager.getUserConnection`.
   * @param tokenMethods - Token persistence methods used to look up stored OAuth tokens.
   * @param reconnections - Optional tracker to use instead of a freshly constructed one.
   * @throws Error if a shared instance already exists.
   */
  public static async createInstance(
    flowManager: FlowStateManager<MCPOAuthTokens | null>,
    tokenMethods: TokenMethods,
    reconnections?: OAuthReconnectionTracker,
  ): Promise<OAuthReconnectionManager> {
    if (OAuthReconnectionManager.instance != null) {
      throw new Error('OAuthReconnectionManager already initialized');
    }
    const manager = new OAuthReconnectionManager(flowManager, tokenMethods, reconnections);
    OAuthReconnectionManager.instance = manager;
    return manager;
  }

  /**
   * @param flowManager - Flow state manager forwarded to `MCPManager.getUserConnection`.
   * @param tokenMethods - Token persistence methods used to look up stored OAuth tokens.
   * @param reconnections - Optional tracker; a new `OAuthReconnectionTracker` is created when omitted.
   */
  public constructor(
    flowManager: FlowStateManager<MCPOAuthTokens | null>,
    tokenMethods: TokenMethods,
    reconnections?: OAuthReconnectionTracker,
  ) {
    this.flowManager = flowManager;
    this.tokenMethods = tokenMethods;
    this.reconnectionsTracker = reconnections ?? new OAuthReconnectionTracker();

    // MCPManager.getInstance() may throw; in that case run in a degraded mode
    // where reconnection requests are skipped instead of failing construction.
    try {
      this.mcpManager = MCPManager.getInstance();
    } catch {
      this.mcpManager = null;
    }
  }

  /**
   * Reports whether a reconnection for the given user/server is still in progress.
   * Timed-out entries are cleaned up in the tracker before the state is read.
   */
  public isReconnecting(userId: string, serverName: string): boolean {
    // Clean up if timed out, then return whether still reconnecting
    this.reconnectionsTracker.cleanupIfTimedOut(userId, serverName);
    return this.reconnectionsTracker.isStillReconnecting(userId, serverName);
  }

  /**
   * Starts reconnection attempts for every OAuth server the user is eligible to reconnect to.
   *
   * Attempts are fire-and-forget: this method resolves once they have been scheduled,
   * not when they complete. The first attempt starts immediately and each following one
   * is delayed by a further `RECONNECT_STAGGER_MS`.
   */
  public async reconnectServers(userId: string): Promise<void> {
    // Check if MCPManager is available
    if (this.mcpManager == null) {
      logger.warn(
        '[OAuthReconnectionManager] MCPManager not available, skipping OAuth MCP server reconnection',
      );
      return;
    }

    // 1. derive the servers to reconnect
    const serversToReconnect: string[] = [];
    for (const serverName of await MCPServersRegistry.getInstance().getOAuthServers()) {
      if (await this.canReconnect(userId, serverName)) {
        serversToReconnect.push(serverName);
      }
    }

    // 2. mark the servers as reconnecting
    for (const serverName of serversToReconnect) {
      this.reconnectionsTracker.setActive(userId, serverName);
    }

    // 3. attempt to reconnect the servers with staggered delays to avoid connection storms
    serversToReconnect.forEach((serverName, index) => {
      if (index === 0) {
        void this.tryReconnect(userId, serverName);
        return;
      }
      setTimeout(() => void this.tryReconnect(userId, serverName), index * RECONNECT_STAGGER_MS);
    });
  }

  /**
   * Attempts to reconnect a single OAuth MCP server.
   * @returns true if reconnection succeeded, false otherwise.
   */
  public async reconnectServer(userId: string, serverName: string): Promise<boolean> {
    if (this.mcpManager == null) {
      return false;
    }

    this.reconnectionsTracker.setActive(userId, serverName);
    try {
      await this.tryReconnect(userId, serverName);
      // tryReconnect records failures in the tracker rather than throwing
      return !this.reconnectionsTracker.isFailed(userId, serverName);
    } catch (error) {
      // Still report failure to the caller, but do not hide the cause.
      logger.warn(
        `[OAuthReconnectionManager][User: ${userId}][${serverName}] Unexpected error during reconnection: ${error}`,
      );
      return false;
    }
  }

  /** Removes both the failed and the active reconnection markers for the user/server pair. */
  public clearReconnection(userId: string, serverName: string): void {
    this.reconnectionsTracker.removeFailed(userId, serverName);
    this.reconnectionsTracker.removeActive(userId, serverName);
  }

  /**
   * Performs one reconnection attempt and records the outcome in the tracker.
   *
   * On success the reconnection markers are cleared. On any failure (config lookup,
   * connection error, or a connection that reports not connected) the server is marked
   * failed, its active marker is removed, and the user connection is disconnected.
   * Failures are logged, not rethrown, so the method is safe to call fire-and-forget.
   */
  private async tryReconnect(userId: string, serverName: string): Promise<void> {
    const mcpManager = this.mcpManager;
    if (mcpManager == null) {
      return;
    }

    const logPrefix = `[tryReconnectOAuthMCPServer][User: ${userId}][${serverName}]`;
    logger.info(`${logPrefix} Attempting reconnection`);

    const cleanupOnFailedReconnect = (): void => {
      this.reconnectionsTracker.setFailed(userId, serverName);
      this.reconnectionsTracker.removeActive(userId, serverName);
      // NOTE(review): the return type of disconnectUserConnection is not visible here.
      // Promise.resolve() covers both a sync and a promise-returning implementation, so
      // a rejected disconnect is logged instead of becoming an unhandled rejection.
      void Promise.resolve(mcpManager.disconnectUserConnection(userId, serverName)).catch(
        (error: unknown) => {
          logger.warn(`${logPrefix} Failed to disconnect after failed reconnection: ${error}`);
        },
      );
    };

    try {
      // Looked up inside the try so a registry failure is handled like any other
      // failed attempt, rather than leaving the server marked as active.
      const config = await MCPServersRegistry.getInstance().getServerConfig(serverName, userId);

      // attempt to get connection (this will use existing tokens and refresh if needed)
      const connection = await mcpManager.getUserConnection({
        serverName,
        user: { id: userId } as IUser,
        flowManager: this.flowManager,
        tokenMethods: this.tokenMethods,
        // don't force new connection, let it reuse existing or create new as needed
        forceNew: false,
        // set a reasonable timeout for reconnection attempts
        connectionTimeout: config?.initTimeout ?? DEFAULT_CONNECTION_TIMEOUT_MS,
        // don't trigger OAuth flow during reconnection
        returnOnOAuth: true,
      });

      if (connection && (await connection.isConnected())) {
        logger.info(`${logPrefix} Successfully reconnected`);
        this.clearReconnection(userId, serverName);
      } else {
        logger.warn(`${logPrefix} Failed to reconnect`);
        await connection?.disconnect();
        cleanupOnFailedReconnect();
      }
    } catch (error) {
      logger.warn(`${logPrefix} Failed to reconnect: ${error}`);
      cleanupOnFailedReconnect();
    }
  }

  /** Returns the statistics reported by the underlying reconnection tracker. */
  public getTrackerStats() {
    return this.reconnectionsTracker.getStats();
  }

  /**
   * Decides whether a reconnection attempt should be made for the user/server pair.
   *
   * Returns false when MCPManager is unavailable, the server is already marked failed or
   * active, or an existing connection reports connected. Otherwise returns true when a
   * non-expired access token or a refresh token is stored for the server.
   */
  private async canReconnect(userId: string, serverName: string): Promise<boolean> {
    if (this.mcpManager == null) {
      return false;
    }

    // if the server has failed reconnection, don't attempt to reconnect
    if (this.reconnectionsTracker.isFailed(userId, serverName)) {
      return false;
    }

    // if a reconnection is already in flight, don't start another one
    if (this.reconnectionsTracker.isActive(userId, serverName)) {
      return false;
    }

    // if the server is already connected, don't attempt to reconnect
    const existingConnection = this.mcpManager.getUserConnections(userId)?.get(serverName);
    if (existingConnection && (await existingConnection.isConnected())) {
      return false;
    }

    // if the server has a valid (non-expired) access token, allow reconnect
    const accessToken = await this.tokenMethods.findToken({
      userId,
      type: 'mcp_oauth',
      identifier: `mcp:${serverName}`,
    });
    if (accessToken != null) {
      const now = new Date();
      // a token without an expiry is treated as still valid
      if (!accessToken.expiresAt || accessToken.expiresAt >= now) {
        return true;
      }
    }

    // if the access token is expired or TTL-deleted, fall back to refresh token
    const refreshToken = await this.tokenMethods.findToken({
      userId,
      type: 'mcp_oauth',
      identifier: `mcp:${serverName}:refresh`,
    });
    return refreshToken != null;
  }
}