From 77d203404576ef556a9cf7942051d067ad8e9a89 Mon Sep 17 00:00:00 2001 From: Will Wilson Date: Sun, 8 Mar 2026 21:39:07 +0000 Subject: [PATCH 1/2] docs: add agent-browser MCP server setup guide Documents the agent-browser MCP server which provides Playwright-backed browser automation for LibreChat agents via the Vercel agent-browser library. Key topics covered: - Why @ref accessibility snapshots beat raw CSS selectors for LLM agents - Tool reference table (navigate, snapshot, click, fill, get_text, etc.) - Docker Compose and build-from-source setup - librechat.yaml mcpServers configuration - Critical: why express.json() must NOT be used with MCP SSE transport - Session management and SSEServerTransport routing pattern - Zod-based tool registration pattern Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../configuration/tools/agent-browser.mdx | 205 ++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 docs/docs/configuration/tools/agent-browser.mdx diff --git a/docs/docs/configuration/tools/agent-browser.mdx b/docs/docs/configuration/tools/agent-browser.mdx new file mode 100644 index 0000000000..a09b64576c --- /dev/null +++ b/docs/docs/configuration/tools/agent-browser.mdx @@ -0,0 +1,205 @@ +--- +title: Agent Browser MCP +description: Browser automation via MCP using Vercel's agent-browser library (Playwright + @ref accessibility snapshots) +--- + +import { Steps, Callout, Tabs } from 'nextra/components' + +# Agent Browser MCP Server + +The agent-browser MCP server provides AI-optimised browser automation for LibreChat agents, powered by [Vercel's `agent-browser` library](https://www.npmjs.com/package/agent-browser) which uses Playwright with accessibility tree snapshots. + +## Why agent-browser instead of raw Playwright/Puppeteer? + +Raw Playwright and Puppeteer expose CSS selectors and XPath expressions to the model. 
These are brittle in single-page applications, break when a site redeploys, and require the model to infer element identity from unstructured HTML. + +`agent-browser` solves this by producing **accessibility tree snapshots** with stable `@ref` identifiers: + +``` +button [@e3] "Sign in" +input [@e7] placeholder="Email address" +``` + +Every interactive element gets a unique `@e1`, `@e2`, `@e3`… reference that the model can pass directly to `click` or `fill`. This lets the LLM: + +- Reference elements precisely without fragile CSS selectors +- Navigate complex SPAs without XPath hacks +- Interact reliably with dynamically rendered content + +## Tools provided + +| Tool | Description | +|------|-------------| +| `navigate` | Navigate to a URL; returns the page title | +| `snapshot` | Get the accessibility tree with `@ref` identifiers for all interactive elements | +| `click` | Click an element by `@ref` (from snapshot) or CSS selector | +| `fill` | Clear and type into an input field by `@ref` or CSS selector | +| `get_text` | Extract text content from an element by CSS selector | +| `press_key` | Press a keyboard key (Enter, Tab, Escape, ArrowDown, etc.) 
| +| `screenshot` | Take a screenshot of the current page (returns base64 PNG) | +| `get_url` | Get the current browser URL | +| `close_browser` | Close the browser session and free all resources | + +## Setup + +### Prerequisites + +- Docker Compose (recommended) **or** Node.js ≥ 20 + Playwright system dependencies +- LibreChat configured with `mcpServers` in `librechat.yaml` + + + +### Run the MCP server + + + +Add to your `docker-compose.override.yml`: + +```yaml +services: + agent-browser-mcp: + build: + context: ./packages/mcp-servers/agent-browser + environment: + - PORT=8932 + # Optional: path to a specific Chromium binary + # - CHROMIUM_PATH=/usr/bin/chromium + ports: + - "8932:8932" + restart: unless-stopped +``` + + +```bash +# Clone LibreChat +git clone https://github.com/danny-avila/LibreChat +cd LibreChat/packages/mcp-servers/agent-browser + +npm install +npx playwright install chromium --with-deps + +npm run build +npm start +``` + +The server listens on `http://localhost:8932` by default. Set `PORT` to override. + + + +### Configure librechat.yaml + +Add the server to `mcpServers` in your `librechat.yaml`: + +```yaml +mcpServers: + agent-browser: + type: sse + url: http://agent-browser-mcp:8932/sse + # Adjust the URL for local/non-Docker setups: + # url: http://localhost:8932/sse + autoApprove: + - navigate + - snapshot + - click + - fill + - get_text + - press_key + - screenshot + - get_url + - close_browser +``` + + + +## Environment variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `PORT` | `8932` | HTTP port the MCP server listens on | +| `CHROMIUM_PATH` | _(Playwright managed)_ | Path to a custom Chromium binary | + +## Implementation reference + +If you are building your own MCP SSE server or extending this one, the following pattern is critical. + +### Critical: Do not add `express.json()` middleware + +The MCP `SSEServerTransport.handlePostMessage` reads the raw request stream internally. 
Adding `express.json()` upstream of the POST `/messages` route causes Express to consume the stream before the SDK can read it, producing **HTTP 400 "stream is not readable"** on every `initialize` call and preventing all tool execution. + +```typescript +import express from "express"; +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js"; + +// CORRECT: no express.json() anywhere on this app +const app = express(); +const transports = new Map(); + +app.get("/sse", async (req, res) => { + const transport = new SSEServerTransport("/messages", res); + transports.set(transport.sessionId, transport); + const server = buildMcpServer(); // creates McpServer with all tools + await server.connect(transport); + res.on("close", () => transports.delete(transport.sessionId)); +}); + +app.post("/messages", async (req, res) => { + const transport = transports.get(req.query.sessionId as string); + if (!transport) { + res.status(404).json({ error: "Session not found" }); + return; + } + await transport.handlePostMessage(req, res); +}); +``` + +### Session management + +Each LibreChat client connection creates its own `SSEServerTransport` instance on `GET /sse`. The transport's `sessionId` (a UUID generated by the SDK) is appended to the client's POST `/messages` requests as `?sessionId=…`, routing each message back to the correct server-sent events connection. + +### Tool registration pattern + +Tools are registered using the `McpServer` fluent API with [Zod](https://zod.dev) schemas for parameter validation: + +```typescript +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { z } from "zod"; + +function buildMcpServer(): McpServer { + const server = new McpServer({ name: "agent-browser", version: "1.0.0" }); + + server.tool( + "navigate", + "Navigate the browser to a URL. 
Returns the page title.", + { url: z.string().describe("Full URL including https://") }, + async ({ url }) => { + // ... call agent-browser BrowserManager + return { content: [{ type: "text", text: `Navigated to: ${title}` }] }; + } + ); + + // Register remaining tools... + return server; +} +``` + +## Typical agent workflow + +``` +1. navigate → https://example.com +2. snapshot → gets accessibility tree with @e1, @e2, @e3 refs +3. fill → @e7 "search query" +4. press_key → Enter +5. snapshot → inspect updated page +6. get_text → .result-list (extract results) +``` + + + Call `close_browser` when the task is finished to free Playwright resources. The browser session is shared across tool calls within a single server process, so leaving it open between tasks is intentional but consumes memory. + + +## Related + +- [MCP Server configuration reference](/docs/configuration/librechat_yaml/object_structure/mcp_servers) +- [Vercel `agent-browser` npm package](https://www.npmjs.com/package/agent-browser) +- [Model Context Protocol SDK](https://github.com/modelcontextprotocol/typescript-sdk) From 0dc05d9e7795899619921a9b2e9649ab0b052285 Mon Sep 17 00:00:00 2001 From: Will Wilson Date: Sun, 8 Mar 2026 22:12:11 +0000 Subject: [PATCH 2/2] feat: add agent-browser MCP package with SSRF protection Create packages/mcp-servers/agent-browser/ with: - Generalised server.ts (no homelab-specific config) - SSRF validation on navigate tool - Optional Perplexica integration (env var toggle) - Multi-stage Dockerfile with non-root user - Updated docs: security warnings, correct config schema Address review feedback: - Fix SSRF vulnerability on navigate tool - Remove autoApprove (not in mcpServers schema) - Add mcpSettings.allowedDomains - Fix broken docs links and file extensions - Fix double-pipe table formatting - Add Docker port exposure security guidance Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../configuration/tools/agent-browser.mdx | 50 ++-- 
.../mcp-servers/agent-browser/.env.example | 9 + packages/mcp-servers/agent-browser/Dockerfile | 42 +++ packages/mcp-servers/agent-browser/README.md | 42 +++ .../mcp-servers/agent-browser/package.json | 24 ++ .../mcp-servers/agent-browser/src/server.ts | 283 ++++++++++++++++++ .../mcp-servers/agent-browser/tsconfig.json | 14 + 7 files changed, 446 insertions(+), 18 deletions(-) create mode 100644 packages/mcp-servers/agent-browser/.env.example create mode 100644 packages/mcp-servers/agent-browser/Dockerfile create mode 100644 packages/mcp-servers/agent-browser/README.md create mode 100644 packages/mcp-servers/agent-browser/package.json create mode 100644 packages/mcp-servers/agent-browser/src/server.ts create mode 100644 packages/mcp-servers/agent-browser/tsconfig.json diff --git a/docs/docs/configuration/tools/agent-browser.mdx b/docs/docs/configuration/tools/agent-browser.mdx index a09b64576c..10f0e3701d 100644 --- a/docs/docs/configuration/tools/agent-browser.mdx +++ b/docs/docs/configuration/tools/agent-browser.mdx @@ -53,7 +53,7 @@ Every interactive element gets a unique `@e1`, `@e2`, `@e3`… reference that th -Add to your `docker-compose.override.yml`: +Add to your `docker-compose.override.yaml`: ```yaml services: @@ -64,8 +64,8 @@ services: - PORT=8932 # Optional: path to a specific Chromium binary # - CHROMIUM_PATH=/usr/bin/chromium - ports: - - "8932:8932" + # Internal Docker network only — do not expose publicly without auth + # For local dev, uncomment: ports: ["127.0.0.1:8932:8932"] restart: unless-stopped ``` @@ -88,25 +88,21 @@ The server listens on `http://localhost:8932` by default. 
Set `PORT` to override ### Configure librechat.yaml -Add the server to `mcpServers` in your `librechat.yaml`: +Add the server to your `librechat.yaml`: ```yaml +# Allow the MCP client to reach the agent-browser server +mcpSettings: + allowedDomains: + - http://agent-browser-mcp:8932 + - http://localhost:8932 + mcpServers: agent-browser: type: sse url: http://agent-browser-mcp:8932/sse - # Adjust the URL for local/non-Docker setups: + # For local/non-Docker setups: # url: http://localhost:8932/sse - autoApprove: - - navigate - - snapshot - - click - - fill - - get_text - - press_key - - screenshot - - get_url - - close_browser ``` @@ -118,6 +114,15 @@ mcpServers: | `PORT` | `8932` | HTTP port the MCP server listens on | | `CHROMIUM_PATH` | _(Playwright managed)_ | Path to a custom Chromium binary | +### Security + + + **SSRF protection:** The `navigate` tool validates URLs and rejects requests to private + IP ranges (10.x, 192.168.x, 172.16-31.x, 127.x, 169.254.x) and internal hostnames + (localhost, .local, .internal). For internal/homelab use, fork the server and adjust + the `isAllowedUrl()` function in `src/server.ts`. + + ## Implementation reference If you are building your own MCP SSE server or extending this one, the following pattern is critical. @@ -171,9 +176,18 @@ function buildMcpServer(): McpServer { server.tool( "navigate", "Navigate the browser to a URL. Returns the page title.", - { url: z.string().describe("Full URL including https://") }, + { + url: z + .string() + .url() + .refine(isAllowedUrl, { + message: + "URL must use http/https and must not point to private, loopback, or link-local addresses.", + }) + .describe("Full URL including https://"), + }, async ({ url }) => { - // ... call agent-browser BrowserManager + // ... 
call agent-browser BrowserManager with a validated, external URL return { content: [{ type: "text", text: `Navigated to: ${title}` }] }; } ); @@ -200,6 +214,6 @@ function buildMcpServer(): McpServer { ## Related -- [MCP Server configuration reference](/docs/configuration/librechat_yaml/object_structure/mcp_servers) +- [MCP Server configuration](https://www.librechat.ai/docs/configuration/librechat_yaml/object_structure/mcp_servers) - [Vercel `agent-browser` npm package](https://www.npmjs.com/package/agent-browser) - [Model Context Protocol SDK](https://github.com/modelcontextprotocol/typescript-sdk) diff --git a/packages/mcp-servers/agent-browser/.env.example b/packages/mcp-servers/agent-browser/.env.example new file mode 100644 index 0000000000..9696a76d34 --- /dev/null +++ b/packages/mcp-servers/agent-browser/.env.example @@ -0,0 +1,9 @@ +PORT=8932 +CHROMIUM_PATH=/usr/bin/chromium + +# Optional: Perplexica web search integration +# PERPLEXICA_URL=http://perplexica:3001 +# PERPLEXICA_CHAT_PROVIDER=openai +# PERPLEXICA_CHAT_MODEL=gpt-4 +# PERPLEXICA_EMBED_PROVIDER=ollama-embeddings +# PERPLEXICA_EMBED_MODEL=nomic-embed-text:latest diff --git a/packages/mcp-servers/agent-browser/Dockerfile b/packages/mcp-servers/agent-browser/Dockerfile new file mode 100644 index 0000000000..c373ef1957 --- /dev/null +++ b/packages/mcp-servers/agent-browser/Dockerfile @@ -0,0 +1,42 @@ +FROM node:22-slim AS builder + +WORKDIR /app +COPY package.json tsconfig.json ./ +RUN npm install +COPY src/ src/ +RUN npm run build + +FROM node:22-slim + +# Install Chromium dependencies for Playwright +RUN apt-get update && apt-get install -y --no-install-recommends \ + chromium \ + fonts-liberation \ + libasound2 \ + libatk-bridge2.0-0 \ + libatk1.0-0 \ + libcups2 \ + libdbus-1-3 \ + libdrm2 \ + libgbm1 \ + libgtk-3-0 \ + libnspr4 \ + libnss3 \ + libx11-xcb1 \ + libxcomposite1 \ + libxdamage1 \ + libxrandr2 \ + xdg-utils \ + && rm -rf /var/lib/apt/lists/* + +RUN groupadd -r appuser && useradd -r -g 
appuser -d /app appuser +WORKDIR /app +COPY --from=builder /app/dist dist/ +COPY --from=builder /app/node_modules node_modules/ +COPY package.json ./ + +ENV CHROMIUM_PATH=/usr/bin/chromium +USER appuser +EXPOSE 8932 + +CMD ["node", "dist/server.js"] diff --git a/packages/mcp-servers/agent-browser/README.md b/packages/mcp-servers/agent-browser/README.md new file mode 100644 index 0000000000..57999b61e9 --- /dev/null +++ b/packages/mcp-servers/agent-browser/README.md @@ -0,0 +1,42 @@ +# @librechat/mcp-agent-browser + +Vercel [agent-browser](https://github.com/vercel-labs/agent-browser) wrapped as an MCP SSE server for LibreChat. + +Uses Playwright with AI-optimised accessibility tree `@ref` snapshots — significantly better than raw CSS selectors for LLM-driven browser automation. + +## Tools + +| Tool | Description | +| --- | --- | +| `navigate` | Navigate to a URL (SSRF-protected) | +| `snapshot` | Get accessibility snapshot with `@ref` identifiers | +| `click` | Click element by `@ref` or CSS selector | +| `fill` | Fill form input by `@ref` or CSS selector | +| `get_text` | Get text content of an element | +| `press_key` | Press a keyboard key | +| `screenshot` | Take page screenshot | +| `get_url` | Get current URL | +| `close_browser` | Close browser session | +| `perplexica_search` | *(Optional)* Web search via Perplexica | + +## Quick Start + +```bash +docker build -t agent-browser-mcp . +docker run -p 8932:8932 agent-browser-mcp +``` + +## LibreChat Configuration + +```yaml +mcpServers: + agent-browser: + type: sse + url: http://agent-browser-mcp:8932/sse +``` + +## Security + +- **SSRF protection**: The `navigate` tool rejects private IPs (10.x, 192.168.x, 172.16-31.x, 127.x) and internal hostnames. +- Runs as non-root `appuser` in Docker. +- No `express.json()` middleware — see source comments for explanation. 
diff --git a/packages/mcp-servers/agent-browser/package.json b/packages/mcp-servers/agent-browser/package.json
new file mode 100644
index 0000000000..8152aa9261
--- /dev/null
+++ b/packages/mcp-servers/agent-browser/package.json
@@ -0,0 +1,24 @@
+{
+  "name": "@librechat/mcp-agent-browser",
+  "version": "1.0.0",
+  "description": "Vercel agent-browser MCP SSE server for LibreChat — Playwright-based browser automation with AI-optimised @ref snapshots",
+  "type": "module",
+  "main": "dist/server.js",
+  "scripts": {
+    "build": "tsc",
+    "start": "node dist/server.js",
+    "dev": "tsx src/server.ts"
+  },
+  "dependencies": {
+    "@modelcontextprotocol/sdk": "^1.0.0",
+    "agent-browser": "^0.16.0",
+    "express": "^4.21.0",
+    "zod": "^3.23.0"
+  },
+  "devDependencies": {
+    "@types/express": "^4.17.21",
+    "@types/node": "^22.0.0",
+    "tsx": "^4.19.0",
+    "typescript": "^5.6.0"
+  }
+}
diff --git a/packages/mcp-servers/agent-browser/src/server.ts b/packages/mcp-servers/agent-browser/src/server.ts
new file mode 100644
index 0000000000..64a77ea66e
--- /dev/null
+++ b/packages/mcp-servers/agent-browser/src/server.ts
@@ -0,0 +1,330 @@
+import express, { Request, Response } from "express";
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
+import { z } from "zod";
+import { BrowserManager } from "agent-browser/dist/browser.js";
+import { executeCommand } from "agent-browser/dist/actions.js";
+
+const PORT = parseInt(process.env.PORT ?? "8932", 10);
+const CHROMIUM_PATH = process.env.CHROMIUM_PATH ?? "";
+
+// Optional Perplexica integration — only enabled if PERPLEXICA_URL is set
+const PERPLEXICA_URL = process.env.PERPLEXICA_URL ?? "";
+const PERPLEXICA_CHAT_PROVIDER = process.env.PERPLEXICA_CHAT_PROVIDER ?? "";
+const PERPLEXICA_CHAT_MODEL = process.env.PERPLEXICA_CHAT_MODEL ?? "";
+const PERPLEXICA_EMBED_PROVIDER = process.env.PERPLEXICA_EMBED_PROVIDER ?? "";
+const PERPLEXICA_EMBED_MODEL = process.env.PERPLEXICA_EMBED_MODEL ?? "";
+
+// One browser instance is shared by every MCP session served by this process.
+let browser: BrowserManager | null = null;
+let cmdId = 0;
+const nextId = () => `c${++cmdId}`;
+
+/** Returns the shared BrowserManager, launching a headless browser when none is running. */
+async function getBrowser(): Promise<BrowserManager> {
+  if (!browser?.isLaunched()) {
+    browser = new BrowserManager();
+    const launchCmd: Record<string, unknown> = { id: nextId(), action: "launch", headless: true };
+    if (CHROMIUM_PATH) launchCmd.executablePath = CHROMIUM_PATH;
+    const resp = await executeCommand(launchCmd as any, browser);
+    if (!resp.success) throw new Error(`Browser launch failed: ${(resp as any).error}`);
+  }
+  return browser;
+}
+
+/** Runs one agent-browser command on the shared browser and returns its `data` payload. */
+async function cmd<T = unknown>(command: Record<string, unknown>): Promise<T> {
+  const b = await getBrowser();
+  const resp = await executeCommand({ id: nextId(), ...command } as any, b);
+  if (!resp.success) throw new Error((resp as any).error ?? "Command failed");
+  return (resp as any).data as T;
+}
+
+// --- SSRF Protection ---
+// These checks look only at the literal hostname of the requested URL. They do not resolve
+// DNS or follow redirects, so a public name that points at an internal address is not caught.
+
+/** Lower-cases a hostname and strips IPv6 brackets and a trailing root dot. */
+const normalizeHostname = (hostname: string): string =>
+  hostname.toLowerCase().replace(/^\[|\]$/g, "").replace(/\.$/, "");
+
+const isPrivateHostname = (hostname: string): boolean => {
+  const host = normalizeHostname(hostname);
+  return (
+    host === "localhost" ||
+    host === "ip6-localhost" ||
+    host === "ip6-loopback" ||
+    host.endsWith(".localhost") ||
+    host.endsWith(".local") ||
+    host.endsWith(".internal")
+  );
+};
+
+/** True when the first two IPv4 octets fall in a loopback, private, or link-local range. */
+const isPrivateIpv4 = (a: number, b: number): boolean =>
+  a === 0 || // 0.0.0.0/8 "this network"
+  a === 10 ||
+  a === 127 ||
+  (a === 100 && b >= 64 && b <= 127) || // 100.64.0.0/10 shared address space
+  (a === 169 && b === 254) ||
+  (a === 172 && b >= 16 && b <= 31) ||
+  (a === 192 && b === 168);
+
+const isPrivateIp = (hostname: string): boolean => {
+  // URL.hostname keeps the brackets around IPv6 literals ("[::1]"), so normalise first.
+  const host = normalizeHostname(hostname);
+  if (/^\d{1,3}(\.\d{1,3}){3}$/.test(host)) {
+    const [a, b] = host.split(".").map(Number);
+    return isPrivateIpv4(a, b);
+  }
+  if (!host.includes(":")) return false; // not an IP literal
+  if (host === "::" || host === "::1") return true; // unspecified / loopback
+  if (/^f[cd][0-9a-f]{2}:/.test(host)) return true; // fc00::/7 unique-local
+  if (/^fe[89ab][0-9a-f]:/.test(host)) return true; // fe80::/10 link-local
+  // IPv4-mapped addresses are serialised by URL in hex, e.g. "::ffff:7f00:1".
+  const mapped = /^::ffff:([0-9a-f]{1,4}):[0-9a-f]{1,4}$/.exec(host);
+  if (mapped) {
+    const high = parseInt(mapped[1], 16);
+    return isPrivateIpv4(high >> 8, high & 0xff);
+  }
+  return false;
+};
+
+const isAllowedUrl = (value: string): boolean => {
+  try {
+    const parsed = new URL(value);
+    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") return false;
+    return !isPrivateHostname(parsed.hostname) && !isPrivateIp(parsed.hostname);
+  } catch {
+    return false;
+  }
+};
+
+// --- Optional Perplexica search ---
+async function perplexicaChat(query: string): Promise<string> {
+  if (!PERPLEXICA_URL) throw new Error("Perplexica not configured");
+  const messageId = `msg-${Date.now()}`;
+  const chatId = `chat-${Date.now()}`;
+  const body = {
+    message: { messageId, chatId, role: "user", content: query },
+    chatModel: { providerId: PERPLEXICA_CHAT_PROVIDER, key: PERPLEXICA_CHAT_MODEL },
+    embeddingModel: { providerId: PERPLEXICA_EMBED_PROVIDER, key: PERPLEXICA_EMBED_MODEL },
+    sources: ["web"],
+    optimizationMode: "speed",
+    history: [],
+  };
+
+  const resp = await fetch(`${PERPLEXICA_URL}/api/chat`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(body),
+  });
+  if (!resp.ok) throw new Error(`Perplexica ${resp.status}: ${await resp.text()}`);
+
+  // The response is parsed as newline-delimited JSON; keep the latest "/data" value per block.
+  const rawText = await resp.text();
+  const blockValues: Map<string, string> = new Map();
+  for (const line of rawText.split("\n")) {
+    const trimmed = line.trim();
+    if (!trimmed) continue;
+    let event: any;
+    try { event = JSON.parse(trimmed); } catch { continue; }
+    if (event.type === "error") throw new Error(event.data ?? "Perplexica error");
+    if (event.type === "updateBlock" && Array.isArray(event.patch)) {
+      for (const patch of event.patch) {
+        if (patch.op === "replace" && patch.path === "/data") {
+          blockValues.set(event.blockId, String(patch.value ?? ""));
+        }
+      }
+    }
+  }
+  return Array.from(blockValues.values()).join("\n\n").trim() || "No response from Perplexica";
+}
+
+/** Creates an McpServer with every browser tool (and Perplexica, when configured) registered. */
+function buildMcpServer(): McpServer {
+  const server = new McpServer({ name: "agent-browser", version: "1.0.0" });
+
+  // Register Perplexica search only if configured
+  if (PERPLEXICA_URL) {
+    server.tool(
+      "perplexica_search",
+      "Search the web using Perplexica AI (gives cited answers).",
+      { query: z.string().describe("Search query") },
+      async ({ query }: { query: string }) => {
+        try {
+          const result = await perplexicaChat(query);
+          return { content: [{ type: "text", text: result }] };
+        } catch (e) {
+          return { content: [{ type: "text", text: `Perplexica error: ${String(e)}` }] };
+        }
+      }
+    );
+  }
+
+  server.tool(
+    "navigate",
+    "Navigate the browser to a URL. Returns the page title. SSRF-protected: rejects private/internal addresses.",
+    {
+      url: z.string().url().refine(isAllowedUrl, {
+        message: "URL must use http/https and must not point to private or loopback addresses.",
+      }).describe("Full public URL including https://"),
+    },
+    async ({ url }: { url: string }) => {
+      const data = await cmd<{ url: string; title: string }>({ action: "navigate", url });
+      return { content: [{ type: "text", text: `Navigated to: ${data.title} (${data.url})` }] };
+    }
+  );
+
+  server.tool(
+    "snapshot",
+    "Get an accessibility snapshot of the current page with @ref identifiers. Use refs with click/fill tools.",
+    {},
+    async () => {
+      const data = await cmd<{ snapshot: string; origin?: string }>({
+        action: "snapshot",
+        interactive: true,
+      });
+      return { content: [{ type: "text", text: data.snapshot }] };
+    }
+  );
+
+  server.tool(
+    "click",
+    "Click an element by @ref (from snapshot) or CSS selector.",
+    { ref: z.string().describe("@ref from snapshot (e.g. '@e1') or CSS selector") },
+    async ({ ref }: { ref: string }) => {
+      await cmd({ action: "click", selector: ref });
+      return { content: [{ type: "text", text: `Clicked ${ref}` }] };
+    }
+  );
+
+  server.tool(
+    "fill",
+    "Clear a form input and type a new value. Use @ref from snapshot or CSS selector.",
+    {
+      ref: z.string().describe("@ref from snapshot or CSS selector"),
+      value: z.string().describe("Value to enter"),
+    },
+    async ({ ref, value }: { ref: string; value: string }) => {
+      await cmd({ action: "fill", selector: ref, value });
+      return { content: [{ type: "text", text: `Filled ${ref} with "${value}"` }] };
+    }
+  );
+
+  server.tool(
+    "get_text",
+    "Get the text content of an element by CSS selector.",
+    { selector: z.string().describe("CSS selector") },
+    async ({ selector }: { selector: string }) => {
+      const data = await cmd<{ text: string; origin?: string }>({ action: "gettext", selector });
+      return { content: [{ type: "text", text: data.text.slice(0, 2000) }] };
+    }
+  );
+
+  server.tool(
+    "press_key",
+    "Press a keyboard key globally (e.g. Enter, Tab, Escape, ArrowDown).",
+    { key: z.string().describe("Key name e.g. Enter, Tab, ArrowDown") },
+    async ({ key }: { key: string }) => {
+      const b = await getBrowser();
+      await b.getPage().keyboard.press(key);
+      return { content: [{ type: "text", text: `Pressed ${key}` }] };
+    }
+  );
+
+  server.tool(
+    "screenshot",
+    "Take a screenshot of the current page. Returns the image as a base64-encoded PNG.",
+    {},
+    async () => {
+      const b = await getBrowser();
+      // Return the image inline: a fixed /tmp path is not reachable by the MCP client and
+      // would be overwritten by every other session sharing this process.
+      const png = await b.getPage().screenshot({ type: "png" });
+      return {
+        content: [{ type: "image", data: png.toString("base64"), mimeType: "image/png" }],
+      };
+    }
+  );
+
+  server.tool(
+    "get_url",
+    "Get the current browser URL.",
+    {},
+    async () => {
+      const data = await cmd<{ url: string }>({ action: "url" });
+      return { content: [{ type: "text", text: data.url }] };
+    }
+  );
+
+  server.tool(
+    "close_browser",
+    "Close the browser session and free resources.",
+    {},
+    async () => {
+      if (browser) {
+        const b = browser.getBrowser();
+        if (b) await b.close().catch(() => {});
+        browser = null;
+      }
+      return { content: [{ type: "text", text: "Browser closed" }] };
+    }
+  );
+
+  return server;
+}
+
+// CRITICAL: Do NOT add express.json() or any body-parsing middleware here.
+// SSEServerTransport.handlePostMessage() reads the raw request body as a Node.js readable
+// stream. If express.json() pre-consumes the stream, every MCP initialize handshake fails
+// with HTTP 400 "stream is not readable", silently preventing all tool execution.
+const app = express();
+const transports: Map<string, SSEServerTransport> = new Map();
+
+app.get("/health", (_req: Request, res: Response) => {
+  const tools = [
+    "navigate", "snapshot", "click", "fill", "get_text",
+    "press_key", "screenshot", "get_url", "close_browser",
+  ];
+  if (PERPLEXICA_URL) tools.unshift("perplexica_search");
+  res.json({ status: "ok", tools });
+});
+
+app.get("/sse", async (_req: Request, res: Response) => {
+  const transport = new SSEServerTransport("/messages", res);
+  const id = transport.sessionId;
+  transports.set(id, transport);
+  // Register cleanup before connecting so a failed or aborted handshake cannot leak the entry.
+  res.on("close", () => transports.delete(id));
+  try {
+    await buildMcpServer().connect(transport);
+  } catch (e) {
+    // Express 4 does not catch rejections from async handlers; handle them here.
+    transports.delete(id);
+    console.error(`Failed to start SSE session ${id}:`, e);
+    if (!res.headersSent) res.status(500).end();
+    else res.end();
+  }
+});
+
+app.post("/messages", async (req: Request, res: Response) => {
+  const id = req.query.sessionId;
+  const transport = typeof id === "string" ? transports.get(id) : undefined;
+  if (!transport) {
+    res.status(404).json({ error: "Session not found" });
+    return;
+  }
+  try {
+    await transport.handlePostMessage(req, res);
+  } catch (e) {
+    console.error(`Failed to handle message for session ${id}:`, e);
+    if (!res.headersSent) res.status(500).json({ error: "Internal server error" });
+  }
+});
+
+app.listen(PORT, () => {
+  console.log(`agent-browser MCP server listening on port ${PORT}`);
+  if (PERPLEXICA_URL) console.log(`Perplexica integration enabled: ${PERPLEXICA_URL}`);
+  else console.log("Perplexica integration disabled (set PERPLEXICA_URL to enable)");
+});
diff --git a/packages/mcp-servers/agent-browser/tsconfig.json b/packages/mcp-servers/agent-browser/tsconfig.json
new file mode 100644
index 0000000000..4963332ff7
--- /dev/null
+++ b/packages/mcp-servers/agent-browser/tsconfig.json
@@ -0,0 +1,14 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "NodeNext",
+    "moduleResolution": "NodeNext",
+    "outDir": "dist",
+    "rootDir": "src",
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "declaration": true
+  },
+  "include": ["src"]
+}