⛈️ fix: MCP Reconnection Storm Prevention with Circuit Breaker, Backoff, and Tool Stubs (#12162)

* fix: MCP reconnection stability - circuit breaker, throttling, and cooldown retry

* Comment and logging cleanup

* fix broken tests
This commit is contained in:
matt burnett 2026-03-10 11:21:36 -07:00 committed by GitHub
parent cfbe812d63
commit ad5c51f62b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 736 additions and 38 deletions

View file

@ -1,6 +1,12 @@
/** Failure bookkeeping stored per user/server pair. */
interface FailedMeta {
  /** Number of failures recorded so far; incremented on every `setFailed` call. */
  attempts: number;
  /** `Date.now()` timestamp (milliseconds) of the most recent recorded failure. */
  lastFailedAt: number;
}

/**
 * Cooldown durations in milliseconds (5, 10, 20 and 30 minutes).
 * Looked up by `attempts - 1`, with the last entry reused once the
 * attempt count runs past the end of the list.
 */
const COOLDOWN_SCHEDULE_MS = [5, 10, 20, 30].map((minutes) => minutes * 60 * 1000);
export class OAuthReconnectionTracker {
/** Map of userId -> Set of serverNames that have failed reconnection */
private failed: Map<string, Set<string>> = new Map();
/**
 * Map of userId -> (serverName -> failure metadata: attempt count and
 * timestamp of the most recent failure).
 * NOTE(review): `failed` above appears to be superseded by this map — confirm before removing it.
 */
private failedMeta: Map<string, Map<string, FailedMeta>> = new Map();
/** Map of userId -> Set of serverNames that are actively reconnecting */
private active: Map<string, Set<string>> = new Map();
/** Map of userId:serverName -> timestamp when reconnection started */
@ -9,7 +15,17 @@ export class OAuthReconnectionTracker {
private readonly RECONNECTION_TIMEOUT_MS = 3 * 60 * 1000; // 3 minutes
/**
 * Reports whether a server is still inside its cooldown window after a
 * recorded reconnection failure.
 *
 * The cooldown length is taken from `COOLDOWN_SCHEDULE_MS` using the number
 * of recorded attempts; once the attempts exceed the schedule length the
 * last (longest) entry is reused.
 *
 * @param userId - User the server connection belongs to.
 * @param serverName - Server to check.
 * @returns `true` while the cooldown has not yet elapsed; `false` when no
 * failure is recorded or the cooldown has expired. The failure record is
 * not removed here, so the attempt count is kept for the next failure.
 */
public isFailed(userId: string, serverName: string): boolean {
  const meta = this.failedMeta.get(userId)?.get(serverName);
  if (!meta) {
    return false;
  }
  // Clamp the index at both ends so an unexpected attempt count can never
  // index outside the schedule.
  const lastIdx = COOLDOWN_SCHEDULE_MS.length - 1;
  const idx = Math.min(Math.max(meta.attempts - 1, 0), lastIdx);
  // `?? 0` keeps the comparison well-typed under noUncheckedIndexedAccess.
  const cooldown = COOLDOWN_SCHEDULE_MS[idx] ?? 0;
  const elapsed = Date.now() - meta.lastFailedAt;
  return elapsed < cooldown;
}
/** Check if server is in the active set (original simple check) */
@ -48,11 +64,15 @@ export class OAuthReconnectionTracker {
}
/**
 * Records a reconnection failure for the given user/server pair.
 *
 * Each call increments the stored attempt count (starting at 1 for the
 * first failure) and stamps the failure time with `Date.now()`.
 *
 * @param userId - User the server connection belongs to.
 * @param serverName - Server whose reconnection failed.
 */
public setFailed(userId: string, serverName: string): void {
  // Get-or-create the per-user map without a non-null assertion.
  let userMap = this.failedMeta.get(userId);
  if (!userMap) {
    userMap = new Map<string, FailedMeta>();
    this.failedMeta.set(userId, userMap);
  }
  const previousAttempts = userMap.get(serverName)?.attempts ?? 0;
  userMap.set(serverName, {
    attempts: previousAttempts + 1,
    lastFailedAt: Date.now(),
  });
}
public setActive(userId: string, serverName: string): void {
@ -68,10 +88,10 @@ export class OAuthReconnectionTracker {
}
/**
 * Clears the recorded failure state for the given user/server pair.
 *
 * Removing the entry also discards its attempt count, so a later failure
 * starts again from the first attempt. The per-user map is dropped once it
 * becomes empty.
 *
 * @param userId - User the server connection belongs to.
 * @param serverName - Server whose failure record should be removed.
 */
public removeFailed(userId: string, serverName: string): void {
  const userMap = this.failedMeta.get(userId);
  if (!userMap) {
    return;
  }
  userMap.delete(serverName);
  if (userMap.size === 0) {
    this.failedMeta.delete(userId);
  }
}
@ -94,7 +114,7 @@ export class OAuthReconnectionTracker {
activeTimestamps: number;
} {
return {
usersWithFailedServers: this.failed.size,
usersWithFailedServers: this.failedMeta.size,
usersWithActiveReconnections: this.active.size,
activeTimestamps: this.activeTimestamps.size,
};