🔄 refactor: MCP Server Init and Stale Cache Handling (#10984)

* 🔧 refactor: Update MCP connection handling to improve performance and testing * refactor: Replace getAll() with getLoaded() in MCP.js to prevent unnecessary connection creation for user-context servers. * test: Adjust MCP.spec.js to mock getLoaded() instead of getAll() for consistency with the new implementation. * feat: Enhance MCPServersInitializer to reset initialization flag for better handling of process restarts and stale data. * test: Add integration tests to verify re-initialization behavior and ensure stale data is cleared when necessary. * 🔧 refactor: Enhance cached endpoints config handling for GPT plugins * refactor: Update MCPServersInitializer tests to use new server management methods * refactor: Replace direct Redis server manipulation with registry.addServer and registry.getServerConfig for better abstraction and consistency. * test: Adjust integration tests to verify server initialization and stale data handling using the updated methods. * 🔧 refactor: Increase retry limits and delay for MCP server creation * Updated MAX_CREATE_RETRIES from 3 to 5 to allow for more attempts during server creation. * Increased RETRY_BASE_DELAY_MS from 10 to 25 milliseconds to provide a longer wait time between retries, improving stability in server initialization. * refactor: Update MCPServersInitializer tests to utilize new registry methods * refactor: Replace direct access to sharedAppServers with registry.getServerConfig for improved abstraction. * test: Adjust tests to verify server initialization and stale data handling using the updated registry methods, ensuring consistency and clarity in the test structure.
2026-01-31 14:55:19 +01:00 · 2025-12-15 16:46:56 -05:00 · 2025-12-15 16:46:56 -05:00 · dcd9273700
commit dcd9273700
parent 02fc4647e1
7 changed files with 198 additions and 12 deletions
--- a/api/server/services/Config/getEndpointsConfig.js
+++ b/api/server/services/Config/getEndpointsConfig.js
@ -19,7 +19,11 @@ async function getEndpointsConfig(req) {
  const cache = getLogStores(CacheKeys.CONFIG_STORE);
  const cachedEndpointsConfig = await cache.get(CacheKeys.ENDPOINT_CONFIG);
  if (cachedEndpointsConfig) {
-    return cachedEndpointsConfig;
+    if (cachedEndpointsConfig.gptPlugins) {
+      await cache.delete(CacheKeys.ENDPOINT_CONFIG);
+    } else {
+      return cachedEndpointsConfig;
+    }
  }

  const appConfig = req.config ?? (await getAppConfig({ role: req.user?.role }));
--- a/api/server/services/MCP.js
+++ b/api/server/services/MCP.js
@ -448,7 +448,10 @@ async function getMCPSetupData(userId) {
  /** @type {Map<string, import('@librechat/api').MCPConnection>} */
  let appConnections = new Map();
  try {
-    appConnections = (await mcpManager.appConnections?.getAll()) || new Map();
+    // Use getLoaded() instead of getAll() to avoid forcing connection creation
+    // getAll() creates connections for all servers, which is problematic for servers
+    // that require user context (e.g., those with {{LIBRECHAT_USER_ID}} placeholders)
+    appConnections = (await mcpManager.appConnections?.getLoaded()) || new Map();
  } catch (error) {
    logger.error(`[MCP][User: ${userId}] Error getting app connections:`, error);
  }
--- a/api/server/services/MCP.spec.js
+++ b/api/server/services/MCP.spec.js
@ -128,7 +128,7 @@ describe('tests for the new helper functions used by the MCP connection status e

    beforeEach(() => {
      mockGetMCPManager.mockReturnValue({
-        appConnections: { getAll: jest.fn(() => new Map()) },
+        appConnections: { getLoaded: jest.fn(() => new Map()) },
        getUserConnections: jest.fn(() => new Map()),
      });
      mockRegistryInstance.getOAuthServers.mockResolvedValue(new Set());
@ -143,7 +143,7 @@ describe('tests for the new helper functions used by the MCP connection status e
      const mockOAuthServers = new Set(['server2']);

      const mockMCPManager = {
-        appConnections: { getAll: jest.fn(() => Promise.resolve(mockAppConnections)) },
+        appConnections: { getLoaded: jest.fn(() => Promise.resolve(mockAppConnections)) },
        getUserConnections: jest.fn(() => mockUserConnections),
      };
      mockGetMCPManager.mockReturnValue(mockMCPManager);
@ -153,7 +153,7 @@ describe('tests for the new helper functions used by the MCP connection status e

      expect(mockRegistryInstance.getAllServerConfigs).toHaveBeenCalledWith(mockUserId);
      expect(mockGetMCPManager).toHaveBeenCalledWith(mockUserId);
-      expect(mockMCPManager.appConnections.getAll).toHaveBeenCalled();
+      expect(mockMCPManager.appConnections.getLoaded).toHaveBeenCalled();
      expect(mockMCPManager.getUserConnections).toHaveBeenCalledWith(mockUserId);
      expect(mockRegistryInstance.getOAuthServers).toHaveBeenCalledWith(mockUserId);

@ -174,7 +174,7 @@ describe('tests for the new helper functions used by the MCP connection status e
      mockRegistryInstance.getAllServerConfigs.mockResolvedValue(mockConfig);

      const mockMCPManager = {
-        appConnections: { getAll: jest.fn(() => Promise.resolve(null)) },
+        appConnections: { getLoaded: jest.fn(() => Promise.resolve(null)) },
        getUserConnections: jest.fn(() => null),
      };
      mockGetMCPManager.mockReturnValue(mockMCPManager);
--- a/packages/api/src/mcp/registry/MCPServersInitializer.ts
+++ b/packages/api/src/mcp/registry/MCPServersInitializer.ts
@ -30,11 +30,24 @@ export class MCPServersInitializer {
   * - Followers wait and poll `statusCache` until the leader finishes, ensuring only one node
   *   performs the expensive initialization operations.
   */
+  private static hasInitializedThisProcess = false;
+
+  /** Reset the process-level initialization flag. Only used for testing. */
+  public static resetProcessFlag(): void {
+    MCPServersInitializer.hasInitializedThisProcess = false;
+  }
+
  public static async initialize(rawConfigs: t.MCPServers): Promise<void> {
-    if (await statusCache.isInitialized()) return;
+    // On first call in this process, always reset and re-initialize
+    // This ensures we don't use stale Redis data from previous runs
+    const isFirstCallThisProcess = !MCPServersInitializer.hasInitializedThisProcess;
+    // Set flag immediately so recursive calls (from followers) use Redis cache for coordination
+    MCPServersInitializer.hasInitializedThisProcess = true;
+
+    if (!isFirstCallThisProcess && (await statusCache.isInitialized())) return;

    if (await isLeader()) {
-      // Leader performs initialization
+      // Leader performs initialization - always reset on first call
      await statusCache.reset();
      await MCPServersRegistry.getInstance().reset();
      const serverNames = Object.keys(rawConfigs);
--- a/packages/api/src/mcp/registry/tests/MCPServersInitializer.cache_integration.spec.ts
+++ b/packages/api/src/mcp/registry/tests/MCPServersInitializer.cache_integration.spec.ts
@ -201,6 +201,11 @@ describe('MCPServersInitializer Redis Integration Tests', () => {

    // Mock MCPConnectionFactory
    jest.spyOn(MCPConnectionFactory, 'create').mockResolvedValue(mockConnection);
+
+    // Reset caches and process flag before each test
+    await registryStatusCache.reset();
+    await registry.reset();
+    MCPServersInitializer.resetProcessFlag();
  });

  afterEach(async () => {
@ -261,7 +266,7 @@ describe('MCPServersInitializer Redis Integration Tests', () => {
      await MCPServersInitializer.initialize(testConfigs);

      // Verify inspect was not called again
-      expect(MCPServerInspector.inspect).not.toHaveBeenCalled();
+      expect((MCPServerInspector.inspect as jest.Mock).mock.calls.length).toBe(0);
    });

    it('should initialize all servers to cache repository', async () => {
@ -309,4 +314,118 @@ describe('MCPServersInitializer Redis Integration Tests', () => {
      expect(await registryStatusCache.isInitialized()).toBe(true);
    });
  });
+
+  describe('horizontal scaling / app restart behavior', () => {
+    it('should re-initialize on first call even if Redis says initialized (simulating app restart)', async () => {
+      // First: run full initialization
+      await MCPServersInitializer.initialize(testConfigs);
+      expect(await registryStatusCache.isInitialized()).toBe(true);
+
+      // Add a stale server directly to Redis to simulate stale data
+      await registry.addServer(
+        'stale_server',
+        {
+          type: 'stdio',
+          command: 'node',
+          args: ['stale.js'],
+        },
+        'CACHE',
+      );
+      expect(await registry.getServerConfig('stale_server')).toBeDefined();
+
+      // Simulate app restart by resetting the process flag (but NOT Redis)
+      MCPServersInitializer.resetProcessFlag();
+
+      // Clear mocks to track new calls
+      jest.clearAllMocks();
+
+      // Re-initialize - should still run initialization because process flag was reset
+      await MCPServersInitializer.initialize(testConfigs);
+
+      // Stale server should be gone because registry.reset() was called
+      expect(await registry.getServerConfig('stale_server')).toBeUndefined();
+
+      // Real servers should be present
+      expect(await registry.getServerConfig('file_tools_server')).toBeDefined();
+      expect(await registry.getServerConfig('disabled_server')).toBeDefined();
+
+      // Inspector should have been called (proving re-initialization happened)
+      expect((MCPServerInspector.inspect as jest.Mock).mock.calls.length).toBeGreaterThan(0);
+    });
+
+    it('should skip re-initialization on subsequent calls within same process', async () => {
+      // First initialization
+      await MCPServersInitializer.initialize(testConfigs);
+      expect(await registryStatusCache.isInitialized()).toBe(true);
+
+      // Clear mocks
+      jest.clearAllMocks();
+
+      // Second call in same process should skip
+      await MCPServersInitializer.initialize(testConfigs);
+
+      // Inspector should NOT have been called
+      expect((MCPServerInspector.inspect as jest.Mock).mock.calls.length).toBe(0);
+    });
+
+    it('should clear stale data from Redis when a new instance becomes leader', async () => {
+      // Initial setup with testConfigs
+      await MCPServersInitializer.initialize(testConfigs);
+
+      // Add stale data that shouldn't exist after next initialization
+      await registry.addServer(
+        'should_be_removed',
+        {
+          type: 'stdio',
+          command: 'node',
+          args: ['old.js'],
+        },
+        'CACHE',
+      );
+
+      // Verify stale data exists
+      expect(await registry.getServerConfig('should_be_removed')).toBeDefined();
+
+      // Simulate new process starting (reset process flag)
+      MCPServersInitializer.resetProcessFlag();
+
+      // Initialize with different configs (fewer servers)
+      const reducedConfigs: t.MCPServers = {
+        file_tools_server: testConfigs.file_tools_server,
+      };
+
+      await MCPServersInitializer.initialize(reducedConfigs);
+
+      // Stale server from previous config should be gone
+      expect(await registry.getServerConfig('should_be_removed')).toBeUndefined();
+      // Server not in new configs should be gone
+      expect(await registry.getServerConfig('disabled_server')).toBeUndefined();
+      // Only server in new configs should exist
+      expect(await registry.getServerConfig('file_tools_server')).toBeDefined();
+    });
+
+    it('should work correctly when multiple instances share Redis (leader handles init)', async () => {
+      // First instance initializes (we are the leader)
+      await MCPServersInitializer.initialize(testConfigs);
+
+      // Verify initialized state is in Redis
+      expect(await registryStatusCache.isInitialized()).toBe(true);
+
+      // Verify servers are in Redis
+      const fileToolsServer = await registry.getServerConfig('file_tools_server');
+      expect(fileToolsServer).toBeDefined();
+      expect(fileToolsServer?.tools).toBe('file_read, file_write');
+
+      // Simulate second instance starting (reset process flag but keep Redis data)
+      MCPServersInitializer.resetProcessFlag();
+      jest.clearAllMocks();
+
+      // Second instance initializes - should still process because isFirstCallThisProcess
+      await MCPServersInitializer.initialize(testConfigs);
+
+      // Redis should still have correct data
+      expect(await registryStatusCache.isInitialized()).toBe(true);
+      expect(await registry.getServerConfig('file_tools_server')).toBeDefined();
+    });
+  });
 });
--- a/packages/api/src/mcp/registry/tests/MCPServersInitializer.test.ts
+++ b/packages/api/src/mcp/registry/tests/MCPServersInitializer.test.ts
@ -179,9 +179,10 @@ describe('MCPServersInitializer', () => {
      } as unknown as t.ParsedServerConfig;
    });

-    // Reset caches before each test
+    // Reset caches and process flag before each test
    await registryStatusCache.reset();
    await registry.reset();
+    MCPServersInitializer.resetProcessFlag();
    jest.clearAllMocks();
  });

@ -309,5 +310,51 @@ describe('MCPServersInitializer', () => {

      expect(await registryStatusCache.isInitialized()).toBe(true);
    });
+
+    it('should re-initialize on first call even if Redis cache says initialized (simulating app restart)', async () => {
+      // First initialization - populates caches
+      await MCPServersInitializer.initialize(testConfigs);
+      expect(await registryStatusCache.isInitialized()).toBe(true);
+      expect(await registry.getServerConfig('file_tools_server')).toBeDefined();
+
+      // Simulate stale data: add an extra server that shouldn't be there
+      await registry.addServer('stale_server', testConfigs.file_tools_server, 'CACHE');
+      expect(await registry.getServerConfig('stale_server')).toBeDefined();
+
+      jest.clearAllMocks();
+
+      // Simulate app restart by resetting the process flag
+      // In real scenario, this happens automatically when process restarts
+      MCPServersInitializer.resetProcessFlag();
+
+      // Re-initialize - should reset caches even though Redis says initialized
+      await MCPServersInitializer.initialize(testConfigs);
+
+      // Verify stale server was removed (cache was reset)
+      expect(await registry.getServerConfig('stale_server')).toBeUndefined();
+
+      // Verify new servers are present
+      expect(await registry.getServerConfig('file_tools_server')).toBeDefined();
+      expect(await registry.getServerConfig('oauth_server')).toBeDefined();
+
+      // Verify inspector was called again (re-initialization happened)
+      expect(mockInspect).toHaveBeenCalled();
+    });
+
+    it('should not re-initialize on subsequent calls within same process', async () => {
+      // First initialization (5 servers in testConfigs)
+      await MCPServersInitializer.initialize(testConfigs);
+      expect(mockInspect).toHaveBeenCalledTimes(5);
+
+      jest.clearAllMocks();
+
+      // Second call - should skip because process flag is set and Redis says initialized
+      await MCPServersInitializer.initialize(testConfigs);
+      expect(mockInspect).not.toHaveBeenCalled();
+
+      // Third call - still skips
+      await MCPServersInitializer.initialize(testConfigs);
+      expect(mockInspect).not.toHaveBeenCalled();
+    });
  });
 });
--- a/packages/data-schemas/src/methods/mcpServer.ts
+++ b/packages/data-schemas/src/methods/mcpServer.ts
@ -5,8 +5,8 @@ import logger from '~/config/winston';
 import { nanoid } from 'nanoid';

 const NORMALIZED_LIMIT_DEFAULT = 20;
-const MAX_CREATE_RETRIES = 3;
-const RETRY_BASE_DELAY_MS = 10;
+const MAX_CREATE_RETRIES = 5;
+const RETRY_BASE_DELAY_MS = 25;

 /**
 * Helper to check if an error is a MongoDB duplicate key error.