Merge 0dc05d9e77 into 8ed0bcf5ca

2026-04-07 00:15:23 +02:00 · 2026-04-05 02:37:05 +00:00 · 2026-04-05 02:37:05 +00:00 · dbe07c9fb2
commit dbe07c9fb2
parent 8ed0bcf5ca 0dc05d9e77
7 changed files with 633 additions and 0 deletions
--- a/docs/docs/configuration/tools/agent-browser.mdx
+++ b/docs/docs/configuration/tools/agent-browser.mdx
@ -0,0 +1,219 @@
+---
+title: Agent Browser MCP
+description: Browser automation via MCP using Vercel's agent-browser library (Playwright + @ref accessibility snapshots)
+---
+
+import { Steps, Callout, Tabs } from 'nextra/components'
+
+# Agent Browser MCP Server
+
+The agent-browser MCP server provides AI-optimised browser automation for LibreChat agents, powered by [Vercel's `agent-browser` library](https://www.npmjs.com/package/agent-browser) which uses Playwright with accessibility tree snapshots.
+
+## Why agent-browser instead of raw Playwright/Puppeteer?
+
+Raw Playwright and Puppeteer expose CSS selectors and XPath expressions to the model. These are brittle in single-page applications, break when a site redeploys, and require the model to infer element identity from unstructured HTML.
+
+`agent-browser` solves this by producing **accessibility tree snapshots** with stable `@ref` identifiers:
+
+```
+button [@e3] "Sign in"
+input  [@e7] placeholder="Email address"
+```
+
+Every interactive element gets a unique `@e1`, `@e2`, `@e3`… reference that the model can pass directly to `click` or `fill`. This lets the LLM:
+
+- Reference elements precisely without fragile CSS selectors
+- Navigate complex SPAs without XPath hacks
+- Interact reliably with dynamically rendered content
+
+## Tools provided
+
+| Tool | Description |
+|------|-------------|
+| `navigate` | Navigate to a URL; returns the page title |
+| `snapshot` | Get the accessibility tree with `@ref` identifiers for all interactive elements |
+| `click` | Click an element by `@ref` (from snapshot) or CSS selector |
+| `fill` | Clear and type into an input field by `@ref` or CSS selector |
+| `get_text` | Extract text content from an element by CSS selector |
+| `press_key` | Press a keyboard key (Enter, Tab, Escape, ArrowDown, etc.) |
+| `screenshot` | Take a screenshot of the current page (returns base64 PNG) |
+| `get_url` | Get the current browser URL |
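+| `close_browser` | Close the browser session and free all resources |
+| `perplexica_search` | _(Optional)_ Web search via Perplexica; only registered when `PERPLEXICA_URL` is set |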
+
+## Setup
+
+### Prerequisites
+
+- Docker Compose (recommended) **or** Node.js ≥ 20 + Playwright system dependencies
+- LibreChat configured with `mcpServers` in `librechat.yaml`
+
+<Steps>
+
+### Run the MCP server
+
+<Tabs items={['Docker Compose', 'Build from source']}>
+  <Tabs.Tab>
+Add to your `docker-compose.override.yaml`:
+
+```yaml
+services:
+  agent-browser-mcp:
+    build:
+      context: ./packages/mcp-servers/agent-browser
+    environment:
+      - PORT=8932
+      # Optional: path to a specific Chromium binary
+      # - CHROMIUM_PATH=/usr/bin/chromium
+    # Internal Docker network only — do not expose publicly without auth
+    # For local dev, uncomment: ports: ["127.0.0.1:8932:8932"]
+    restart: unless-stopped
+```
+  </Tabs.Tab>
+  <Tabs.Tab>
+```bash
+# Clone LibreChat
+git clone https://github.com/danny-avila/LibreChat
+cd LibreChat/packages/mcp-servers/agent-browser
+
+npm install
+npx playwright install chromium --with-deps
+
+npm run build
+npm start
+```
+
+The server listens on `http://localhost:8932` by default. Set `PORT` to override.
+  </Tabs.Tab>
+</Tabs>
+
+### Configure librechat.yaml
+
+Add the server to your `librechat.yaml`:
+
+```yaml
+# Allow the MCP client to reach the agent-browser server
+mcpSettings:
+  allowedDomains:
+    - http://agent-browser-mcp:8932
+    - http://localhost:8932
+
+mcpServers:
+  agent-browser:
+    type: sse
+    url: http://agent-browser-mcp:8932/sse
+    # For local/non-Docker setups:
+    # url: http://localhost:8932/sse
+```
+
+</Steps>
+
+## Environment variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `PORT` | `8932` | HTTP port the MCP server listens on |
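+| `CHROMIUM_PATH` | _(Playwright managed)_ | Path to a custom Chromium binary |
+| `PERPLEXICA_URL` | _(unset)_ | Base URL of a Perplexica instance; setting it enables the `perplexica_search` tool |
+| `PERPLEXICA_CHAT_PROVIDER` | _(unset)_ | Provider ID sent to Perplexica as the chat model provider |
+| `PERPLEXICA_CHAT_MODEL` | _(unset)_ | Model key sent to Perplexica as the chat model |
+| `PERPLEXICA_EMBED_PROVIDER` | _(unset)_ | Provider ID sent to Perplexica as the embedding model provider |
+| `PERPLEXICA_EMBED_MODEL` | _(unset)_ | Model key sent to Perplexica as the embedding model |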
+
+### Security
+
+<Callout type="warning">
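+  **SSRF protection:** The `navigate` tool validates URLs and rejects requests to private
+  IP ranges (10.x, 192.168.x, 172.16-31.x, 127.x, 169.254.x) and internal hostnames
+  (localhost, .local, .internal). The check inspects only the URL string passed to
+  `navigate`: it does not resolve DNS, and it does not cover redirects or navigation
+  triggered by `click`, `fill`, or `press_key`, so run the server on a network segment
+  that cannot reach sensitive internal services. For internal/homelab use, fork the
+  server and adjust the `isAllowedUrl()` function in `src/server.ts`.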
+</Callout>
+
+## Implementation reference
+
+If you are building your own MCP SSE server or extending this one, the following pattern is critical.
+
+### Critical: Do not add `express.json()` middleware
+
+The MCP `SSEServerTransport.handlePostMessage` reads the raw request stream internally. Adding `express.json()` upstream of the POST `/messages` route causes Express to consume the stream before the SDK can read it, producing **HTTP 400 "stream is not readable"** on every `initialize` call and preventing all tool execution.
+
+```typescript
+import express from "express";
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
+
+// CORRECT: no express.json() anywhere on this app
+const app = express();
+const transports = new Map<string, SSEServerTransport>();
+
+app.get("/sse", async (req, res) => {
+  const transport = new SSEServerTransport("/messages", res);
+  transports.set(transport.sessionId, transport);
+  const server = buildMcpServer(); // creates McpServer with all tools
+  await server.connect(transport);
+  res.on("close", () => transports.delete(transport.sessionId));
+});
+
+app.post("/messages", async (req, res) => {
+  const transport = transports.get(req.query.sessionId as string);
+  if (!transport) {
+    res.status(404).json({ error: "Session not found" });
+    return;
+  }
+  await transport.handlePostMessage(req, res);
+});
+```
+
+### Session management
+
+Each LibreChat client connection creates its own `SSEServerTransport` instance on `GET /sse`. The transport's `sessionId` (a UUID generated by the SDK) is appended to the client's POST `/messages` requests as `?sessionId=…`, routing each message back to the correct server-sent events connection.
+
+### Tool registration pattern
+
+Tools are registered with the `McpServer.tool()` method, using [Zod](https://zod.dev) schemas for parameter validation:
+
+```typescript
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { z } from "zod";
+
+function buildMcpServer(): McpServer {
+  const server = new McpServer({ name: "agent-browser", version: "1.0.0" });
+
+  server.tool(
+    "navigate",
+    "Navigate the browser to a URL. Returns the page title.",
+    {
+      url: z
+        .string()
+        .url()
+        .refine(isAllowedUrl, {
+          message:
+            "URL must use http/https and must not point to private, loopback, or link-local addresses.",
+        })
+        .describe("Full URL including https://"),
+    },
+    async ({ url }) => {
+      // ... call agent-browser BrowserManager with a validated, external URL
+      return { content: [{ type: "text", text: `Navigated to: ${title}` }] };
+    }
+  );
+
+  // Register remaining tools...
+  return server;
+}
+```
+
+## Typical agent workflow
+
+```
+1. navigate   → https://example.com
+2. snapshot   → gets accessibility tree with @e1, @e2, @e3 refs
+3. fill       → @e7 "search query"
+4. press_key  → Enter
+5. snapshot   → inspect updated page
+6. get_text   → .result-list  (extract results)
+```
+
+<Callout type="info">
+  Call `close_browser` when the task is finished to free Playwright resources. A single browser session is shared by every tool call — and every client — connected to the same server process, so leaving it open between tasks is intentional but consumes memory, and concurrent users will see and affect each other's pages.
+</Callout>
+
+## Related
+
+- [MCP Server configuration](https://www.librechat.ai/docs/configuration/librechat_yaml/object_structure/mcp_servers)
+- [Vercel `agent-browser` npm package](https://www.npmjs.com/package/agent-browser)
+- [Model Context Protocol SDK](https://github.com/modelcontextprotocol/typescript-sdk)
--- a/packages/mcp-servers/agent-browser/.env.example
+++ b/packages/mcp-servers/agent-browser/.env.example
@ -0,0 +1,9 @@
+PORT=8932
+CHROMIUM_PATH=/usr/bin/chromium
+
+# Optional: Perplexica web search integration
+# PERPLEXICA_URL=http://perplexica:3001
+# PERPLEXICA_CHAT_PROVIDER=openai
+# PERPLEXICA_CHAT_MODEL=gpt-4
+# PERPLEXICA_EMBED_PROVIDER=ollama-embeddings
+# PERPLEXICA_EMBED_MODEL=nomic-embed-text:latest
--- a/packages/mcp-servers/agent-browser/Dockerfile
+++ b/packages/mcp-servers/agent-browser/Dockerfile
@ -0,0 +1,42 @@
+# --- Build stage: install dependencies and compile TypeScript into dist/ ---
+FROM node:22-slim AS builder
+
+WORKDIR /app
+COPY package.json tsconfig.json ./
+RUN npm install
+COPY src/ src/
+# Compile, then drop devDependencies (typescript, tsx, @types/*) so they are not
+# carried into the runtime image when node_modules is copied below.
+RUN npm run build && npm prune --omit=dev
+
+# --- Runtime stage ---
+FROM node:22-slim
+
+# Install Chromium dependencies for Playwright
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    chromium \
+    fonts-liberation \
+    libasound2 \
+    libatk-bridge2.0-0 \
+    libatk1.0-0 \
+    libcups2 \
+    libdbus-1-3 \
+    libdrm2 \
+    libgbm1 \
+    libgtk-3-0 \
+    libnspr4 \
+    libnss3 \
+    libx11-xcb1 \
+    libxcomposite1 \
+    libxdamage1 \
+    libxrandr2 \
+    xdg-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+# Run the server as an unprivileged system user.
+RUN groupadd -r appuser && useradd -r -g appuser -d /app appuser
+WORKDIR /app
+COPY --from=builder /app/dist dist/
+COPY --from=builder /app/node_modules node_modules/
+COPY package.json ./
+
+# Point the server at the apt-installed Chromium binary.
+ENV CHROMIUM_PATH=/usr/bin/chromium
+USER appuser
+EXPOSE 8932
+
+CMD ["node", "dist/server.js"]
--- a/packages/mcp-servers/agent-browser/README.md
+++ b/packages/mcp-servers/agent-browser/README.md
@ -0,0 +1,42 @@
+# @librechat/mcp-agent-browser
+
+Vercel [agent-browser](https://github.com/vercel-labs/agent-browser) wrapped as an MCP SSE server for LibreChat.
+
+Uses Playwright with AI-optimised accessibility tree `@ref` snapshots — significantly better than raw CSS selectors for LLM-driven browser automation.
+
+## Tools
+
+| Tool | Description |
+| --- | --- |
+| `navigate` | Navigate to a URL (SSRF-protected) |
+| `snapshot` | Get accessibility snapshot with `@ref` identifiers |
+| `click` | Click element by `@ref` or CSS selector |
+| `fill` | Fill form input by `@ref` or CSS selector |
+| `get_text` | Get text content of an element |
+| `press_key` | Press a keyboard key |
+| `screenshot` | Take page screenshot |
+| `get_url` | Get current URL |
+| `close_browser` | Close browser session |
+| `perplexica_search` | *(Optional)* Web search via Perplexica |
+
+## Quick Start
+
+```bash
+docker build -t agent-browser-mcp .
+docker run -p 127.0.0.1:8932:8932 agent-browser-mcp
+```
+
+## LibreChat Configuration
+
+```yaml
+mcpServers:
+  agent-browser:
+    type: sse
+    url: http://agent-browser-mcp:8932/sse
+```
+
+## Security
+
+- **SSRF protection**: The `navigate` tool rejects private IPs (10.x, 192.168.x, 172.16-31.x, 127.x, 169.254.x) and internal hostnames (localhost, .local, .internal).
+- Runs as non-root `appuser` in Docker.
+- No `express.json()` middleware — see source comments for explanation.
--- a/packages/mcp-servers/agent-browser/package.json
+++ b/packages/mcp-servers/agent-browser/package.json
@ -0,0 +1,24 @@
+{
+  "name": "@librechat/mcp-agent-browser",
+  "version": "1.0.0",
+  "description": "Vercel agent-browser MCP SSE server for LibreChat — Playwright-based browser automation with AI-optimised @ref snapshots",
+  "type": "module",
+  "main": "dist/server.js",
+  "scripts": {
+    "build": "tsc",
+    "start": "node dist/server.js",
+    "dev": "tsx src/server.ts"
+  },
+  "dependencies": {
+    "@modelcontextprotocol/sdk": "^1.0.0",
+    "agent-browser": "^0.16.0",
+    "express": "^4.21.0",
+    "zod": "^3.23.0"
+  },
+  "devDependencies": {
+    "@types/express": "^4.17.21",
+    "@types/node": "^22.0.0",
+    "tsx": "^4.19.0",
+    "typescript": "^5.6.0"
+  }
+}
--- a/packages/mcp-servers/agent-browser/src/server.ts
+++ b/packages/mcp-servers/agent-browser/src/server.ts
@ -0,0 +1,283 @@
+import express, { Request, Response } from "express";
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
+import { z } from "zod";
+import { BrowserManager } from "agent-browser/dist/browser.js";
+import { executeCommand } from "agent-browser/dist/actions.js";
+
+// HTTP port for the MCP server. The radix is explicit so the value is always read as decimal.
+const PORT = Number.parseInt(process.env.PORT ?? "8932", 10);
+// Optional path to a Chromium binary; an empty string leaves the choice to the launcher.
+const CHROMIUM_PATH = process.env.CHROMIUM_PATH ?? "";
+
+// Optional Perplexica integration — only enabled if PERPLEXICA_URL is set
+const PERPLEXICA_URL = process.env.PERPLEXICA_URL ?? "";
+const PERPLEXICA_CHAT_PROVIDER = process.env.PERPLEXICA_CHAT_PROVIDER ?? "";
+const PERPLEXICA_CHAT_MODEL = process.env.PERPLEXICA_CHAT_MODEL ?? "";
+const PERPLEXICA_EMBED_PROVIDER = process.env.PERPLEXICA_EMBED_PROVIDER ?? "";
+const PERPLEXICA_EMBED_MODEL = process.env.PERPLEXICA_EMBED_MODEL ?? "";
+
+// Single browser session for the whole process; every MCP connection drives this instance.
+let browser: BrowserManager | null = null;
+// Monotonic counter behind the command ids ("c1", "c2", ...) sent to agent-browser.
+let cmdId = 0;
+const nextId = () => `c${++cmdId}`;
+
+/**
+ * Returns the shared BrowserManager, launching a headless browser first when
+ * none is currently running.
+ *
+ * @returns The module-level BrowserManager instance.
+ * @throws Error when the launch command reports failure.
+ */
+async function getBrowser(): Promise<BrowserManager> {
+  if (browser?.isLaunched()) return browser;
+
+  browser = new BrowserManager();
+  const launchCmd: Record<string, unknown> = {
+    id: nextId(),
+    action: "launch",
+    headless: true,
+    // Only pass an explicit binary when one was configured.
+    ...(CHROMIUM_PATH ? { executablePath: CHROMIUM_PATH } : {}),
+  };
+  const launchResult = await executeCommand(launchCmd as any, browser);
+  if (!launchResult.success) {
+    throw new Error(`Browser launch failed: ${(launchResult as any).error}`);
+  }
+  return browser;
+}
+
+/**
+ * Runs one agent-browser command against the shared browser and unwraps its payload.
+ *
+ * @param command Command fields (for example `action`, `selector`); an `id` is added automatically.
+ * @returns The `data` field of a successful response, cast to `T`.
+ * @throws Error carrying the response's error message, or "Command failed" when none is given.
+ */
+async function cmd<T = unknown>(command: Record<string, unknown>): Promise<T> {
+  const manager = await getBrowser();
+  const request = { id: nextId(), ...command };
+  const result = await executeCommand(request as any, manager);
+  if (result.success) return (result as any).data as T;
+  throw new Error((result as any).error ?? "Command failed");
+}
+
+// --- SSRF Protection ---
+/**
+ * Reports whether a hostname is a well-known loopback or internal name.
+ *
+ * Matching is case-insensitive and ignores one trailing dot, so the
+ * fully-qualified spelling (`localhost.`) is treated like `localhost`.
+ * Any name under `.localhost` is rejected as well, alongside `.local` and
+ * `.internal`.
+ *
+ * @param hostname Host portion of a URL, without port.
+ * @returns `true` when the name must not be navigated to.
+ */
+const isPrivateHostname = (hostname: string): boolean => {
+  const name = hostname.toLowerCase().replace(/\.$/, "");
+  if (name === "localhost" || name === "ip6-localhost" || name === "ip6-loopback") {
+    return true;
+  }
+  return [".localhost", ".local", ".internal"].some((suffix) => name.endsWith(suffix));
+};
+
+/** Tests the first two octets of an IPv4 address against the blocked ranges. */
+const isBlockedIpv4Prefix = (a: number, b: number): boolean =>
+  a === 0 || // 0.0.0.0/8 — "this host"; 0.0.0.0 is commonly routed to loopback
+  a === 10 ||
+  a === 127 ||
+  (a === 169 && b === 254) ||
+  (a === 172 && b >= 16 && b <= 31) ||
+  (a === 192 && b === 168);
+
+/**
+ * Reports whether a hostname is a literal IP address in a loopback, private,
+ * or link-local range.
+ *
+ * `URL.hostname` returns IPv6 literals wrapped in square brackets, so the
+ * brackets are stripped before matching; a bare `::1` is still accepted.
+ * IPv4-mapped IPv6 addresses are checked against the IPv4 ranges in both the
+ * dotted (`::ffff:127.0.0.1`) and hex (`::ffff:7f00:1`) spellings.
+ *
+ * @param hostname Host portion of a URL, without port.
+ * @returns `true` when the address must not be navigated to; `false` for
+ *          public addresses and for anything that is not an IP literal.
+ */
+const isPrivateIp = (hostname: string): boolean => {
+  const host = hostname.toLowerCase().replace(/^\[(.*)\]$/, "$1");
+
+  const dotted = /^(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(host);
+  if (dotted) return isBlockedIpv4Prefix(Number(dotted[1]), Number(dotted[2]));
+
+  // Everything below only applies to IPv6 literals.
+  if (!host.includes(":")) return false;
+  if (host === "::1" || host === "::") return true; // loopback / unspecified
+  if (/^f[cd][0-9a-f]{2}:/.test(host)) return true; // fc00::/7 unique local
+  if (/^fe[89ab][0-9a-f]:/.test(host)) return true; // fe80::/10 link-local
+
+  const mappedDotted = /^::ffff:(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(host);
+  if (mappedDotted) {
+    return isBlockedIpv4Prefix(Number(mappedDotted[1]), Number(mappedDotted[2]));
+  }
+
+  const mappedHex = /^::ffff:([0-9a-f]{1,4}):[0-9a-f]{1,4}$/.exec(host);
+  if (mappedHex) {
+    // The first 16-bit group holds the first two IPv4 octets.
+    const high = Number.parseInt(mappedHex[1], 16);
+    return isBlockedIpv4Prefix(high >> 8, high & 0xff);
+  }
+  return false;
+};
+
+/**
+ * Validation predicate for the `navigate` tool: accepts only http(s) URLs
+ * whose hostname passes both the private-hostname and private-IP checks.
+ *
+ * @param value Candidate URL string.
+ * @returns `true` when navigation is permitted; `false` for unparseable input,
+ *          other protocols, or private/internal hosts.
+ */
+const isAllowedUrl = (value: string): boolean => {
+  let target: URL;
+  try {
+    target = new URL(value);
+  } catch {
+    return false;
+  }
+  const isHttp = target.protocol === "http:" || target.protocol === "https:";
+  if (!isHttp) return false;
+  const host = target.hostname;
+  return !(isPrivateHostname(host) || isPrivateIp(host));
+};
+
+// --- Optional Perplexica search ---
+/**
+ * Sends a single web-search query to Perplexica's `/api/chat` endpoint and
+ * returns the answer text.
+ *
+ * The response body is read as newline-delimited JSON events. For every
+ * `updateBlock` event, the latest `/data` replacement per block id is kept;
+ * the kept values are joined with blank lines. Lines that are not valid JSON
+ * are skipped.
+ *
+ * @param query Search query to send as the user message.
+ * @returns The joined block text, or "No response from Perplexica" when empty.
+ * @throws Error when PERPLEXICA_URL is unset, the HTTP status is not OK, or
+ *         the stream contains an `error` event.
+ */
+async function perplexicaChat(query: string): Promise<string> {
+  if (!PERPLEXICA_URL) throw new Error("Perplexica not configured");
+
+  const messageId = `msg-${Date.now()}`;
+  const chatId = `chat-${Date.now()}`;
+  const payload = {
+    message: { messageId, chatId, role: "user", content: query },
+    chatModel: { providerId: PERPLEXICA_CHAT_PROVIDER, key: PERPLEXICA_CHAT_MODEL },
+    embeddingModel: { providerId: PERPLEXICA_EMBED_PROVIDER, key: PERPLEXICA_EMBED_MODEL },
+    sources: ["web"],
+    optimizationMode: "speed",
+    history: [],
+  };
+
+  const reply = await fetch(`${PERPLEXICA_URL}/api/chat`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(payload),
+  });
+  if (!reply.ok) throw new Error(`Perplexica ${reply.status}: ${await reply.text()}`);
+
+  // Later patches for the same block id overwrite earlier ones.
+  const latestByBlock = new Map<string, string>();
+  const eventLines = (await reply.text())
+    .split("\n")
+    .map((line) => line.trim())
+    .filter((line) => line !== "");
+
+  for (const line of eventLines) {
+    let event: any;
+    try {
+      event = JSON.parse(line);
+    } catch {
+      continue;
+    }
+    if (event.type === "error") throw new Error(event.data ?? "Perplexica error");
+    if (event.type !== "updateBlock" || !Array.isArray(event.patch)) continue;
+    for (const patch of event.patch) {
+      if (patch.op === "replace" && patch.path === "/data") {
+        latestByBlock.set(event.blockId, String(patch.value ?? ""));
+      }
+    }
+  }
+
+  const answer = [...latestByBlock.values()].join("\n\n").trim();
+  return answer || "No response from Perplexica";
+}
+
+/**
+ * Builds a new McpServer with every browser tool registered.
+ *
+ * All tools act on the module-level browser session via `cmd()` and
+ * `getBrowser()`. The `perplexica_search` tool is registered only when
+ * PERPLEXICA_URL is set.
+ *
+ * @returns The configured server, not yet connected to a transport.
+ */
+function buildMcpServer(): McpServer {
+  const server = new McpServer({ name: "agent-browser", version: "1.0.0" });
+
+  // Shared shape for the plain-text results returned by most tools.
+  const textResult = (text: string) => ({ content: [{ type: "text" as const, text }] });
+
+  // Register Perplexica search only if configured
+  if (PERPLEXICA_URL) {
+    server.tool(
+      "perplexica_search",
+      "Search the web using Perplexica AI (gives cited answers).",
+      { query: z.string().describe("Search query") },
+      async ({ query }: { query: string }) => {
+        try {
+          return textResult(await perplexicaChat(query));
+        } catch (e) {
+          // Mark the result as an error so the client does not read it as search output.
+          return { ...textResult(`Perplexica error: ${String(e)}`), isError: true };
+        }
+      }
+    );
+  }
+
+  server.tool(
+    "navigate",
+    "Navigate the browser to a URL. Returns the page title. SSRF-protected: rejects private/internal addresses.",
+    {
+      url: z.string().url().refine(isAllowedUrl, {
+        message: "URL must use http/https and must not point to private or loopback addresses.",
+      }).describe("Full public URL including https://"),
+    },
+    async ({ url }: { url: string }) => {
+      const data = await cmd<{ url: string; title: string }>({ action: "navigate", url });
+      return textResult(`Navigated to: ${data.title} (${data.url})`);
+    }
+  );
+
+  server.tool(
+    "snapshot",
+    "Get an accessibility snapshot of the current page with @ref identifiers. Use refs with click/fill tools.",
+    {},
+    async () => {
+      const data = await cmd<{ snapshot: string; origin?: string }>({
+        action: "snapshot",
+        interactive: true,
+      });
+      return textResult(data.snapshot);
+    }
+  );
+
+  server.tool(
+    "click",
+    "Click an element by @ref (from snapshot) or CSS selector.",
+    { ref: z.string().describe("@ref from snapshot (e.g. '@e1') or CSS selector") },
+    async ({ ref }: { ref: string }) => {
+      await cmd({ action: "click", selector: ref });
+      return textResult(`Clicked ${ref}`);
+    }
+  );
+
+  server.tool(
+    "fill",
+    "Clear a form input and type a new value. Use @ref from snapshot or CSS selector.",
+    {
+      ref: z.string().describe("@ref from snapshot or CSS selector"),
+      value: z.string().describe("Value to enter"),
+    },
+    async ({ ref, value }: { ref: string; value: string }) => {
+      await cmd({ action: "fill", selector: ref, value });
+      return textResult(`Filled ${ref} with "${value}"`);
+    }
+  );
+
+  server.tool(
+    "get_text",
+    "Get the text content of an element by CSS selector.",
+    { selector: z.string().describe("CSS selector") },
+    async ({ selector }: { selector: string }) => {
+      const data = await cmd<{ text: string; origin?: string }>({ action: "gettext", selector });
+      // Cap the payload at 2000 characters.
+      return textResult(data.text.slice(0, 2000));
+    }
+  );
+
+  server.tool(
+    "press_key",
+    "Press a keyboard key globally (e.g. Enter, Tab, Escape, ArrowDown).",
+    { key: z.string().describe("Key name e.g. Enter, Tab, ArrowDown") },
+    async ({ key }: { key: string }) => {
+      const b = await getBrowser();
+      await b.getPage().keyboard.press(key);
+      return textResult(`Pressed ${key}`);
+    }
+  );
+
+  server.tool(
+    "screenshot",
+    "Take a screenshot of the current page. Returns a base64-encoded PNG image.",
+    {},
+    async () => {
+      const b = await getBrowser();
+      // Return the image inline rather than writing to a fixed path on the
+      // server, which the client cannot read and which every session shares.
+      const png = await b.getPage().screenshot({ type: "png" });
+      return {
+        content: [
+          { type: "image" as const, data: png.toString("base64"), mimeType: "image/png" },
+        ],
+      };
+    }
+  );
+
+  server.tool(
+    "get_url",
+    "Get the current browser URL.",
+    {},
+    async () => {
+      const data = await cmd<{ url: string }>({ action: "url" });
+      return textResult(data.url);
+    }
+  );
+
+  server.tool(
+    "close_browser",
+    "Close the browser session and free resources.",
+    {},
+    async () => {
+      if (browser) {
+        const b = browser.getBrowser();
+        // Best-effort close: a failure here still resets the shared handle.
+        if (b) await b.close().catch(() => {});
+        browser = null;
+      }
+      return textResult("Browser closed");
+    }
+  );
+
+  return server;
+}
+
+// CRITICAL: Do NOT add express.json() or any body-parsing middleware here.
+// SSEServerTransport.handlePostMessage() reads the raw request body as a Node.js readable
+// stream. If express.json() pre-consumes the stream, every MCP initialize handshake fails
+// with HTTP 400 "stream is not readable", silently preventing all tool execution.
+const app = express();
+// Live SSE transports keyed by session id; POST /messages looks its target up here.
+const transports: Map<string, SSEServerTransport> = new Map();
+
+// Liveness probe that also lists the tool names this process registers.
+app.get("/health", (_req: Request, res: Response) => {
+  const tools = [
+    "navigate", "snapshot", "click", "fill", "get_text",
+    "press_key", "screenshot", "get_url", "close_browser",
+  ];
+  if (PERPLEXICA_URL) tools.unshift("perplexica_search");
+  res.json({ status: "ok", tools });
+});
+
+// Opens an SSE stream and binds a new McpServer to it.
+app.get("/sse", async (_req: Request, res: Response) => {
+  const transport = new SSEServerTransport("/messages", res);
+  const id = transport.sessionId;
+  transports.set(id, transport);
+  // Register cleanup before connecting so a failed or aborted handshake cannot leak the entry.
+  res.on("close", () => transports.delete(id));
+  try {
+    await buildMcpServer().connect(transport);
+  } catch (err) {
+    // Express 4 does not catch rejections from async handlers, so handle them here.
+    transports.delete(id);
+    console.error("Failed to establish MCP SSE session:", err);
+    if (!res.headersSent) res.status(500);
+    res.end();
+  }
+});
+
+// Routes a client message to the transport that owns its session.
+app.post("/messages", async (req: Request, res: Response) => {
+  const id = req.query.sessionId;
+  // The query value may be missing or an array; only a plain string can name a session.
+  const transport = typeof id === "string" ? transports.get(id) : undefined;
+  if (!transport) {
+    res.status(404).json({ error: "Session not found" });
+    return;
+  }
+  try {
+    await transport.handlePostMessage(req, res);
+  } catch (err) {
+    console.error("Failed to handle MCP message:", err);
+    if (!res.headersSent) res.status(500).json({ error: "Failed to handle message" });
+  }
+});
+
+app.listen(PORT, () => {
+  console.log(`agent-browser MCP server listening on port ${PORT}`);
+  if (PERPLEXICA_URL) console.log(`Perplexica integration enabled: ${PERPLEXICA_URL}`);
+  else console.log("Perplexica integration disabled (set PERPLEXICA_URL to enable)");
+});
--- a/packages/mcp-servers/agent-browser/tsconfig.json
+++ b/packages/mcp-servers/agent-browser/tsconfig.json
@ -0,0 +1,14 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "NodeNext",
+    "moduleResolution": "NodeNext",
+    "outDir": "dist",
+    "rootDir": "src",
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "declaration": true
+  },
+  "include": ["src"]
+}