#!/usr/bin/env npx tsx
/**
 * MCP PDF Processor — Monetized with SettleGrid
 *
 * A complete MCP server that extracts text and structured data from PDFs
 * using pdf-parse and Claude. Fork, add your key, and deploy.
 *
 * Setup:
 *   1. npm install @settlegrid/mcp pdf-parse
 *   2. Set ANTHROPIC_API_KEY and SETTLEGRID_API_KEY in your env
 *   3. Register your tool at settlegrid.ai/dashboard/tools
 *   4. Run: npx tsx mcp-pdf-processor.ts
 *
 * Pricing: 3 cents per text extraction, 6 cents per structured, 4 cents per markdown
 *   - pdf-parse is free (local processing)
 *   - Claude costs ~$0.007 per average document (2K input + 1K output)
 *   - 3 cents text extraction = pure margin (no AI needed)
 *   - 6 cents structured = ~8x margin (Claude parses fields)
 *   - 4 cents markdown = ~5x margin (Claude formats)
 *
 * Revenue: You keep 95-100% (100% on Free tier, 95% on paid tiers)
 */

import { settlegrid } from '@settlegrid/mcp'

// ── SettleGrid Setup ────────────────────────────────────────────────────────

const sg = settlegrid.init({
  toolSlug: 'my-pdf-processor', // Replace with your tool slug
  pricing: {
    defaultCostCents: 3,
    methods: {
      extract_text: { costCents: 3, displayName: 'Extract Text' },
      extract_structured: { costCents: 6, displayName: 'Structured Extraction' },
      pdf_to_markdown: { costCents: 4, displayName: 'PDF to Markdown' },
    },
  },
})

// ── Helpers ─────────────────────────────────────────────────────────────────

/** Accepts only absolute http(s) URLs that contain no whitespace. */
const URL_RE = /^https?:\/\/[^\s/$.?#].[^\s]*$/

/** Largest PDF body, in bytes, that will be parsed (10 MB). */
const MAX_PDF_SIZE = 10 * 1024 * 1024

/** Maximum number of characters of extracted text forwarded to Claude; the rest is dropped. */
const MAX_PROMPT_CHARS = 50_000

/** Narrows an unknown value to a plain (non-null, non-array) object. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/**
 * Reports whether a URL looks like it names a PDF file.
 *
 * Checks the path component so that query strings and fragments
 * (`/report.pdf?download=1`) do not hide the extension, and also keeps the
 * plain "whole string ends with .pdf" check as a fallback.
 */
function hasPdfExtension(url: string): boolean {
  const lowered = url.toLowerCase()
  if (lowered.endsWith('.pdf')) return true
  try {
    return new URL(url).pathname.toLowerCase().endsWith('.pdf')
  } catch {
    // Not parseable as a URL: the plain suffix check above already said no.
    return false
  }
}

/**
 * Throws when the extracted text is empty or whitespace-only.
 *
 * @throws Error if there is nothing to send to the model.
 */
function assertHasText(text: string): void {
  if (text.trim().length === 0) {
    throw new Error('PDF contains no extractable text (may be a scanned image)')
  }
}

/**
 * Downloads a PDF over HTTP(S) and extracts its text with pdf-parse.
 *
 * NOTE(security): `url` is caller-supplied and is fetched from wherever this
 * server runs, so it can reach internal or link-local addresses. Deployments
 * exposed to untrusted callers should add a host allow/deny list here.
 *
 * @param url - Absolute http(s) URL of the PDF.
 * @returns The extracted text and the page count reported by pdf-parse.
 * @throws Error if the URL is invalid, the request fails, the response does
 *   not look like a PDF, or the body exceeds {@link MAX_PDF_SIZE}.
 */
async function fetchPdfText(url: string): Promise<{ text: string; pages: number }> {
  if (!url || !URL_RE.test(url)) {
    throw new Error('A valid HTTP or HTTPS URL to a PDF is required')
  }

  const response = await fetch(url)
  if (!response.ok) {
    throw new Error(`Failed to fetch PDF: ${response.status} ${response.statusText}`)
  }

  // Media types are case-insensitive, so normalise before matching.
  const contentType = (response.headers.get('content-type') ?? '').toLowerCase()
  if (!contentType.includes('pdf') && !hasPdfExtension(url)) {
    throw new Error('URL does not point to a PDF')
  }

  const tooLargeMessage = `PDF exceeds ${MAX_PDF_SIZE / 1024 / 1024} MB limit`

  // Reject early when the server declares an oversized body, before buffering it.
  const declaredLength = response.headers.get('content-length')
  if (declaredLength !== null && Number(declaredLength) > MAX_PDF_SIZE) {
    await response.body?.cancel().catch(() => undefined)
    throw new Error(tooLargeMessage)
  }

  // The header may be absent or wrong, so the real size is still checked.
  const buffer = await response.arrayBuffer()
  if (buffer.byteLength > MAX_PDF_SIZE) {
    throw new Error(tooLargeMessage)
  }

  // Loaded lazily so the dependency is only pulled in when a PDF is actually parsed.
  const pdfParse = (await import('pdf-parse')).default
  const parsed = await pdfParse(Buffer.from(buffer))
  return { text: parsed.text, pages: parsed.numpages }
}

/**
 * Sends a single-turn request to the Anthropic Messages API.
 *
 * @param prompt - User message content.
 * @param systemPrompt - System instruction for the model.
 * @returns The text of the first content block in the response.
 * @throws Error if ANTHROPIC_API_KEY is unset, the API responds with a non-2xx
 *   status, or the first content block is not text.
 */
async function callClaude(prompt: string, systemPrompt: string): Promise<string> {
  const apiKey = process.env.ANTHROPIC_API_KEY
  if (!apiKey) {
    throw new Error('ANTHROPIC_API_KEY environment variable is not set')
  }

  const response = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      model: 'claude-sonnet-4-20250514',
      max_tokens: 4096,
      system: systemPrompt,
      messages: [{ role: 'user', content: prompt }],
    }),
  })
  if (!response.ok) {
    throw new Error(`Claude API returned ${response.status}: ${response.statusText}`)
  }

  // The body is external input: narrow it instead of trusting its shape.
  const data: unknown = await response.json()
  const content = isRecord(data) ? data.content : undefined
  const block: unknown = Array.isArray(content) ? content[0] : undefined
  if (!isRecord(block) || block.type !== 'text' || typeof block.text !== 'string') {
    throw new Error('Unexpected Claude response format')
  }
  return block.text
}

// ── PDF Processing Methods ──────────────────────────────────────────────────

interface ExtractTextArgs {
  url: string
}

/**
 * Extracts the raw text of a PDF along with simple size statistics.
 * Runs locally; no model call is made.
 *
 * @returns The text, page count, whitespace-delimited word count, and
 *   character count (UTF-16 code units).
 */
async function extractText(
  args: ExtractTextArgs
): Promise<{ result: { text: string; pages: number; wordCount: number; charCount: number } }> {
  const { text, pages } = await fetchPdfText(args.url)
  const wordCount = text.split(/\s+/).filter(Boolean).length
  return { result: { text, pages, wordCount, charCount: text.length } }
}

interface ExtractStructuredArgs {
  url: string
  documentType?: string
  fields?: string[]
}

/**
 * Asks Claude to pull named fields out of a PDF's text.
 *
 * Only the first {@link MAX_PROMPT_CHARS} characters of the document are sent.
 * If the model's reply is not a JSON object with a `fields` object, the raw
 * reply is returned under `fields._raw` with `confidence: 'low'`.
 *
 * @throws Error if the PDF has no extractable text, or on fetch/API failure.
 */
async function extractStructured(
  args: ExtractStructuredArgs
): Promise<{ result: { fields: Record<string, unknown>; documentType: string; confidence: string } }> {
  const { text } = await fetchPdfText(args.url)
  assertHasText(text)

  const docType = args.documentType ?? 'unknown'
  const fieldList = args.fields?.length ? args.fields.join(', ') : 'auto-detect relevant fields'

  const raw = await callClaude(
    `Extract structured data from this ${docType} document.\n\nFields: ${fieldList}\n\nDocument:\n${text.slice(0, MAX_PROMPT_CHARS)}`,
    'Return ONLY valid JSON: { "fields": { "fieldName": "value" }, "documentType": "type", "confidence": "high"|"medium"|"low" }. Set missing fields to "NOT_FOUND". No markdown fences.'
  )

  // Best-effort result used whenever the reply cannot be trusted as structured data.
  const fallback = { fields: { _raw: raw }, documentType: docType, confidence: 'low' }

  let parsed: unknown
  try {
    // The model sometimes adds code fences despite the instruction; strip them.
    parsed = JSON.parse(raw.replace(/```json\n?|\n?```/g, '').trim())
  } catch {
    return { result: fallback }
  }

  // Valid JSON is not necessarily the requested shape (could be an array, string, ...).
  if (!isRecord(parsed) || !isRecord(parsed.fields)) {
    return { result: fallback }
  }

  return {
    result: {
      fields: parsed.fields,
      documentType: typeof parsed.documentType === 'string' ? parsed.documentType : docType,
      confidence: typeof parsed.confidence === 'string' ? parsed.confidence : 'low',
    },
  }
}

interface MarkdownArgs {
  url: string
  includeMetadata?: boolean
}

/**
 * Asks Claude to reformat a PDF's text as Markdown.
 *
 * Only the first {@link MAX_PROMPT_CHARS} characters of the document are sent.
 * When `includeMetadata` is set, the model is additionally asked for a YAML
 * front-matter block.
 *
 * @throws Error if the PDF has no extractable text, or on fetch/API failure.
 */
async function pdfToMarkdown(
  args: MarkdownArgs
): Promise<{ result: { markdown: string; pages: number } }> {
  const { text, pages } = await fetchPdfText(args.url)
  assertHasText(text)

  const metaNote = args.includeMetadata
    ? ' Include a YAML front-matter block with title, author, date, and page count if detectable.'
    : ''

  const markdown = await callClaude(
    `Convert this PDF text into well-structured Markdown:\n\n${text.slice(0, MAX_PROMPT_CHARS)}`,
    `Convert raw PDF text into clean Markdown with proper headings, lists, tables, and paragraphs.${metaNote}`
  )
  return { result: { markdown, pages } }
}

// ── Wrap with SettleGrid Billing ─────────────────────────────────────────────

export const billedExtractText = sg.wrap(extractText, { method: 'extract_text' })
export const billedExtractStructured = sg.wrap(extractStructured, { method: 'extract_structured' })
export const billedPdfToMarkdown = sg.wrap(pdfToMarkdown, { method: 'pdf_to_markdown' })

// ── REST Alternative ────────────────────────────────────────────────────────
// import { settlegridMiddleware } from '@settlegrid/mcp/rest'
//
// const withBilling = settlegridMiddleware({
//   toolSlug: 'my-pdf-processor',
//   pricing: {
//     defaultCostCents: 3,
//     methods: {
//       extract_text: { costCents: 3 },
//       extract_structured: { costCents: 6 },
//       pdf_to_markdown: { costCents: 4 },
//     },
//   },
// })
//
// export async function POST(request: Request) {
//   return withBilling(request, async () => {
//     const { url } = await request.json()
//     const result = await extractText({ url })
//     return Response.json(result)
//   }, 'extract_text')
// }