Build a Web Scraper MCP Server
Create an MCP server that fetches and parses web pages, extracting structured content for AI assistants. Learn HTML parsing, content extraction, and responsible scraping patterns.
title: "Build a Web Scraper MCP Server"
description: "Create an MCP server that fetches and parses web pages, extracting structured content for AI assistants. Learn HTML parsing, content extraction, and responsible scraping patterns."
order: 8
keywords:
  - mcp web scraper
  - web scraping mcp server
  - html parser mcp tool
  - fetch web page mcp
  - content extraction ai
date: "2026-04-01"
level: "intermediate"
duration: "30 min"
Build an MCP server that lets AI assistants fetch web pages and extract structured content. You will create tools for fetching HTML, extracting article text, and pulling metadata. Covers HTML parsing, content cleaning, and responsible scraping patterns.
What You Will Build
A web scraper MCP server with:
- fetch_page -- Fetch a web page and return cleaned text content
- extract_metadata -- Extract title, description, Open Graph tags
- extract_links -- Get all links from a page with context
Content extraction is the process of taking raw HTML and extracting meaningful text content, stripping ads, navigation, and boilerplate. Good extraction produces clean, readable text that AI assistants can reason about effectively.
Project Setup
Create and configure the project
npx mcp-framework create web-scraper-server
cd web-scraper-server
npm install cheerio
npm install -D @types/cheerio
We use cheerio for HTML parsing -- it provides a jQuery-like API for server-side HTML manipulation. Note: recent versions of cheerio ship their own TypeScript type definitions, so the separate @types/cheerio package is only needed when targeting older releases.
Building the Scraper Utilities
Create src/utils/scraper.ts:
import * as cheerio from "cheerio";
/** Cleaned text content extracted from a single fetched page. */
export interface PageContent {
  // Page title: <title> text, falling back to the first <h1>, else "Untitled".
  title: string;
  // Extracted body text, whitespace-normalized and capped at 10,000 characters.
  text: string;
  // Word count of the full extracted text (computed before the length cap).
  wordCount: number;
  // The URL the content was fetched from.
  url: string;
}

/** Document metadata pulled from the page <head>. */
export interface PageMetadata {
  // <title> text, trimmed.
  title: string;
  // Content of <meta name="description">; empty string when absent.
  description: string;
  // Open Graph tags — undefined when the page does not declare them.
  ogTitle?: string;
  ogDescription?: string;
  ogImage?: string;
  // href of <link rel="canonical">, if present.
  canonical?: string;
  // Content of <meta name="author">, if present.
  author?: string;
}

/** One hyperlink found on a page. */
export interface PageLink {
  // Anchor text, trimmed and capped at 200 characters.
  text: string;
  // Absolute URL (resolved against the page URL).
  href: string;
  // True when the link's hostname differs from the page's hostname.
  isExternal: boolean;
}
/**
 * Fetch the raw HTML of a page with a timeout and a content-type check.
 *
 * @param url - Absolute URL to fetch.
 * @param timeoutMs - Abort the request after this many milliseconds (default: 15000).
 * @returns The response body as text.
 * @throws Error on timeout, a non-2xx status, or a content type that is
 *   neither text/html nor text/plain.
 */
export async function fetchHtml(url: string, timeoutMs = 15000): Promise<string> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(url, {
      headers: {
        // Descriptive User-Agent so site owners can identify the bot.
        "User-Agent": "MCPBot/1.0 (Web Scraper MCP Server)",
        Accept: "text/html,application/xhtml+xml",
      },
      signal: controller.signal,
      redirect: "follow",
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }
    const contentType = response.headers.get("content-type") || "";
    if (!contentType.includes("text/html") && !contentType.includes("text/plain")) {
      throw new Error(`Unsupported content type: ${contentType}`);
    }
    return await response.text();
  } catch (error) {
    // Surface aborts as a clear timeout message instead of the opaque AbortError.
    if (error instanceof Error && error.name === "AbortError") {
      throw new Error(`Request timed out after ${timeoutMs}ms: ${url}`);
    }
    throw error;
  } finally {
    clearTimeout(timeout);
  }
}
/**
 * Extract the readable text content from an HTML document.
 *
 * Strips scripts, styles, navigation chrome, and common ad/sidebar
 * containers, then pulls text from the first semantic content container
 * found (article, main, etc.), falling back to <body>.
 *
 * @param html - Raw HTML of the page.
 * @param url - The URL the HTML was fetched from (echoed into the result).
 * @returns Title, cleaned text (capped at 10,000 chars), word count, and URL.
 */
export function extractContent(html: string, url: string): PageContent {
  const $ = cheerio.load(html);
  // Remove non-content elements before extracting text.
  $("script, style, nav, footer, header, aside, .ad, .sidebar, .menu, .nav").remove();
  const title =
    $("title").text().trim() ||
    $("h1").first().text().trim() ||
    "Untitled";
  // Try to find the main content area; first selector that matches wins.
  const mainSelectors = ["article", "main", '[role="main"]', ".content", ".post-content", "#content"];
  let contentElement = null;
  for (const selector of mainSelectors) {
    const el = $(selector);
    if (el.length > 0) {
      contentElement = el.first();
      break;
    }
  }
  const textSource = contentElement || $("body");
  // Collapse horizontal whitespace first, THEN squeeze runs of blank lines,
  // so paragraph breaks survive. (The original collapsed ALL whitespace to
  // single spaces first, which made the newline cleanup a dead no-op.)
  const text = textSource
    .text()
    .replace(/[^\S\n]+/g, " ")
    .replace(/ ?\n ?/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
  return {
    title,
    text: text.substring(0, 10000), // Limit to prevent token overflow
    // Guard the empty-page case: "".split(/\s+/) yields [""] (length 1).
    wordCount: text.length === 0 ? 0 : text.split(/\s+/).length,
    url,
  };
}
/**
 * Pull standard document metadata out of an HTML page: title, meta
 * description, Open Graph tags, canonical URL, and author.
 *
 * @param html - Raw HTML of the page.
 * @returns Metadata fields; optional fields are undefined when absent.
 */
export function extractMetadata(html: string): PageMetadata {
  const $ = cheerio.load(html);
  // Small helper: read the content attribute of the first matching element.
  const metaContent = (selector: string) => $(selector).attr("content");
  const result: PageMetadata = {
    title: $("title").text().trim(),
    description: metaContent('meta[name="description"]') || "",
    ogTitle: metaContent('meta[property="og:title"]'),
    ogDescription: metaContent('meta[property="og:description"]'),
    ogImage: metaContent('meta[property="og:image"]'),
    canonical: $('link[rel="canonical"]').attr("href"),
    author: metaContent('meta[name="author"]'),
  };
  return result;
}
/**
 * Collect every unique hyperlink in a document, resolved to an absolute
 * URL against the page URL and flagged as internal or external.
 *
 * Skips links with no href, no visible text, fragment-only hrefs ("#..."),
 * and javascript: pseudo-links; invalid URLs are silently ignored.
 *
 * @param html - Raw HTML of the page.
 * @param baseUrl - URL of the page, used to resolve relative hrefs.
 * @returns Deduplicated links with text capped at 200 characters.
 */
export function extractLinks(html: string, baseUrl: string): PageLink[] {
  const $ = cheerio.load(html);
  const results: PageLink[] = [];
  const visited = new Set<string>();
  const baseHost = new URL(baseUrl).hostname;
  $("a[href]").each((_, anchor) => {
    const rawHref = $(anchor).attr("href");
    const label = $(anchor).text().trim();
    const skippable =
      !rawHref || !label || rawHref.startsWith("#") || rawHref.startsWith("javascript:");
    if (skippable) {
      return;
    }
    try {
      const resolved = new URL(rawHref, baseUrl);
      const absolute = resolved.toString();
      if (!visited.has(absolute)) {
        visited.add(absolute);
        results.push({
          text: label.substring(0, 200),
          href: absolute,
          isExternal: resolved.hostname !== baseHost,
        });
      }
    } catch {
      // Skip invalid URLs
    }
  });
  return results;
}
Building the Tools
FetchPageTool
Create src/tools/FetchPageTool.ts:
import { MCPTool } from "mcp-framework";
import { z } from "zod";
import { fetchHtml, extractContent } from "../utils/scraper.js";
// Single source of truth for the tool's input validation. Previously these
// validators were duplicated inside `schema`, which invites drift.
const inputSchema = z.object({
  url: z.string().url(),
  maxLength: z.number().min(100).max(10000).optional(),
});

/**
 * MCP tool: fetch a web page and return its cleaned text content,
 * truncated to maxLength characters (default 5000).
 */
class FetchPageTool extends MCPTool<typeof inputSchema> {
  name = "fetch_page";
  description = "Fetch a web page and return its cleaned text content, stripping HTML tags and boilerplate";
  // Reuse inputSchema's field validators so each rule is defined exactly once.
  schema = {
    url: {
      type: inputSchema.shape.url,
      description: "The URL of the web page to fetch",
    },
    maxLength: {
      type: inputSchema.shape.maxLength,
      description: "Maximum content length in characters (default: 5000)",
    },
  };

  /**
   * Fetch and clean the page; on failure, return a JSON error object
   * rather than throwing, so the assistant sees a structured message.
   */
  async execute(input: z.infer<typeof inputSchema>): Promise<string> {
    try {
      const html = await fetchHtml(input.url);
      const content = extractContent(html, input.url);
      // ?? (not ||): only apply the default when maxLength is absent.
      const maxLen = input.maxLength ?? 5000;
      if (content.text.length > maxLen) {
        content.text = content.text.substring(0, maxLen) + "... [truncated]";
      }
      return JSON.stringify(content, null, 2);
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return JSON.stringify({ error: `Failed to fetch page: ${message}` });
    }
  }
}

export default FetchPageTool;
ExtractMetadataTool
Create src/tools/ExtractMetadataTool.ts:
import { MCPTool } from "mcp-framework";
import { z } from "zod";
import { fetchHtml, extractMetadata } from "../utils/scraper.js";
// Validation schema for this tool's single input.
const inputSchema = z.object({
  url: z.string().url(),
});

/**
 * MCP tool: fetch a page and report its title, meta description, Open
 * Graph tags, canonical URL, and author as JSON.
 */
class ExtractMetadataTool extends MCPTool<typeof inputSchema> {
  name = "extract_metadata";
  description = "Extract metadata from a web page including title, description, and Open Graph tags";
  schema = {
    url: {
      type: z.string().url(),
      description: "The URL to extract metadata from",
    },
  };

  // Returns metadata JSON on success, or { error } JSON on failure.
  async execute(input: z.infer<typeof inputSchema>): Promise<string> {
    try {
      const metadata = extractMetadata(await fetchHtml(input.url));
      return JSON.stringify({ url: input.url, ...metadata }, null, 2);
    } catch (error) {
      return JSON.stringify({
        error: error instanceof Error ? error.message : "Unknown error",
      });
    }
  }
}

export default ExtractMetadataTool;
ExtractLinksTool
Create src/tools/ExtractLinksTool.ts:
import { MCPTool } from "mcp-framework";
import { z } from "zod";
import { fetchHtml, extractLinks } from "../utils/scraper.js";
// Validation schema for this tool's inputs.
const inputSchema = z.object({
  url: z.string().url(),
  externalOnly: z.boolean().optional(),
});

/**
 * MCP tool: list the hyperlinks on a page (capped at 50 in the response),
 * optionally restricted to links that point off-site.
 */
class ExtractLinksTool extends MCPTool<typeof inputSchema> {
  name = "extract_links";
  description = "Extract all links from a web page with their text and whether they are external";
  schema = {
    url: {
      type: z.string().url(),
      description: "The URL to extract links from",
    },
    externalOnly: {
      type: z.boolean().optional(),
      description: "If true, only return external links",
    },
  };

  // Returns { url, linkCount, links } JSON on success, or { error } JSON on failure.
  async execute(input: z.infer<typeof inputSchema>): Promise<string> {
    try {
      const html = await fetchHtml(input.url);
      const allLinks = extractLinks(html, input.url);
      const links = input.externalOnly
        ? allLinks.filter((link) => link.isExternal)
        : allLinks;
      const report = {
        url: input.url,
        linkCount: links.length,
        links: links.slice(0, 50), // cap the payload returned to the assistant
      };
      return JSON.stringify(report, null, 2);
    } catch (error) {
      return JSON.stringify({
        error: error instanceof Error ? error.message : "Unknown error",
      });
    }
  }
}

export default ExtractLinksTool;
Always check a site's robots.txt before scraping. Set a descriptive User-Agent header so site owners can identify your bot. Implement rate limiting to avoid overloading target servers.
Official SDK Version
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { z } from "zod";
import { fetchHtml, extractContent } from "./scraper.js";

const server = new McpServer({ name: "web-scraper", version: "1.0.0" });

// fetch_page: fetch a URL and return its cleaned text content as JSON.
server.tool(
  "fetch_page",
  "Fetch and extract text content from a web page",
  // Bounds match the mcp-framework version of this tool.
  { url: z.string().url(), maxLength: z.number().min(100).max(10000).optional() },
  async ({ url, maxLength }) => {
    try {
      const html = await fetchHtml(url);
      const content = extractContent(html, url);
      // ?? (not ||) so an explicitly supplied value is never replaced.
      content.text = content.text.substring(0, maxLength ?? 5000);
      return {
        content: [{ type: "text" as const, text: JSON.stringify(content, null, 2) }],
      };
    } catch (error) {
      // Report failures through the MCP error channel instead of letting the
      // exception crash the tool call.
      const message = error instanceof Error ? error.message : "Unknown error";
      return {
        content: [{ type: "text" as const, text: `Failed to fetch page: ${message}` }],
        isError: true,
      };
    }
  }
);

const transport = new StdioServerTransport();
await server.connect(transport);
Web pages can be very large. Always truncate content before returning it to the AI assistant. A 10,000 character limit is a good default. Let the user override it with a parameter if needed.
Testing
npm run build
npx @modelcontextprotocol/inspector node dist/index.js
Try fetching a page: call fetch_page with url: "https://example.com".