#!/usr/bin/env npx tsx
/**
 * MCP PDF Processor — Monetized with SettleGrid
 *
 * A complete MCP server that extracts text and structured data from PDFs
 * using pdf-parse and Claude. Fork, add your key, and deploy.
 *
 * Setup:
 *   1. npm install @settlegrid/mcp pdf-parse
 *   2. Set ANTHROPIC_API_KEY and SETTLEGRID_API_KEY in your env
 *   3. Register your tool at settlegrid.ai/dashboard/tools
 *   4. Run: npx tsx mcp-pdf-processor.ts
 *
 * Pricing: 3 cents per text extraction, 6 cents per structured extraction, 4 cents per Markdown conversion
 *   - pdf-parse is free (local processing)
 *   - Claude costs ~$0.007 per average document (2K input + 1K output)
 *   - 3 cents text extraction = pure margin (no AI needed)
 *   - 6 cents structured = ~8x margin (Claude parses fields)
 *   - 4 cents markdown = ~5x margin (Claude formats)
 *
 * Revenue: You keep 95-100% (100% on Free tier, 95% on paid tiers)
 */

import { settlegrid } from '@settlegrid/mcp'

// ── SettleGrid Setup ────────────────────────────────────────────────────────

// SettleGrid client used further down to wrap each PDF handler.
// The keys under `pricing.methods` are the same strings passed as `method` to `sg.wrap`,
// and the cent amounts mirror the header comment and the commented REST example at the
// bottom of this file — keep all three in sync when changing a price.
const sg = settlegrid.init({
  toolSlug: 'my-pdf-processor', // Replace with your tool slug
  pricing: {
    // NOTE(review): presumably the price applied to methods not listed below — confirm against the SettleGrid SDK docs.
    defaultCostCents: 3,
    methods: {
      // Text extraction only runs pdf-parse locally (no Claude call).
      extract_text: { costCents: 3, displayName: 'Extract Text' },
      // Structured extraction and Markdown conversion each make one Claude call.
      extract_structured: { costCents: 6, displayName: 'Structured Extraction' },
      pdf_to_markdown: { costCents: 4, displayName: 'PDF to Markdown' },
    },
  },
})

// ── Helpers ─────────────────────────────────────────────────────────────────

/** Loose syntactic gate: an http(s) scheme followed by non-whitespace characters. Not a full URL validation. */
const URL_RE = /^https?:\/\/[^\s/$.?#].[^\s]*$/
/** Largest response body, in bytes, that will be downloaded and handed to pdf-parse (10 * 1024 * 1024). */
const MAX_PDF_SIZE = 10 * 1024 * 1024

/**
 * Downloads the PDF at `url` and extracts its plain text with pdf-parse.
 *
 * The response is accepted as a PDF when either its `Content-Type` header contains
 * "pdf" or the URL *path* (query string and fragment ignored) ends in `.pdf`.
 * The body is streamed and the download is aborted as soon as it exceeds
 * `MAX_PDF_SIZE`, so an oversized or endless response is never fully buffered.
 *
 * @param url Absolute http(s) URL of the document.
 * @returns The extracted text and the page count reported by pdf-parse.
 * @throws Error if the URL is malformed, the HTTP status is not 2xx, the response
 *   does not look like a PDF, or the body is larger than `MAX_PDF_SIZE`.
 *   Errors raised by `fetch` or pdf-parse propagate unchanged.
 */
async function fetchPdfText(url: string): Promise<{ text: string; pages: number }> {
  if (!url || !URL_RE.test(url)) throw new Error('A valid HTTP or HTTPS URL to a PDF is required')

  // The regex is deliberately loose; let the URL parser have the final say and
  // give us a pathname that is free of query string and fragment.
  let pathname: string
  try {
    pathname = new URL(url).pathname
  } catch {
    throw new Error('A valid HTTP or HTTPS URL to a PDF is required')
  }

  // NOTE(security): `url` is caller-supplied and fetch follows redirects by default,
  // so this request can reach loopback/private/metadata addresses (SSRF). When
  // deploying on a network with internal services, add a host allow-list or
  // resolve-and-block private ranges before this call.
  const response = await fetch(url)
  if (!response.ok) throw new Error(`Failed to fetch PDF: ${response.status} ${response.statusText}`)

  const ct = response.headers.get('content-type') ?? ''
  if (!ct.includes('pdf') && !pathname.toLowerCase().endsWith('.pdf')) throw new Error('URL does not point to a PDF')

  const tooLarge = (): Error => new Error(`PDF exceeds ${MAX_PDF_SIZE / 1024 / 1024} MB limit`)

  // Cheap early exit when the server declares the size up front. A missing or
  // non-numeric header yields NaN, which never compares greater than the limit.
  const declaredLength = Number(response.headers.get('content-length') ?? NaN)
  if (declaredLength > MAX_PDF_SIZE) {
    await response.body?.cancel().catch(() => undefined)
    throw tooLarge()
  }

  // Content-Length can be absent or wrong, so enforce the cap while reading.
  const chunks: Uint8Array[] = []
  let received = 0
  if (response.body) {
    const reader = response.body.getReader()
    for (;;) {
      const step = await reader.read()
      if (step.done) break
      received += step.value.byteLength
      if (received > MAX_PDF_SIZE) {
        // Stop the transfer instead of draining the rest of the body.
        await reader.cancel().catch(() => undefined)
        throw tooLarge()
      }
      chunks.push(step.value)
    }
  }

  // Imported lazily so the dependency is only loaded once a download has succeeded.
  const pdfParse = (await import('pdf-parse')).default
  const result = await pdfParse(Buffer.concat(chunks, received))
  return { text: result.text, pages: result.numpages }
}

/**
 * Sends a single-turn request to the Anthropic Messages API and returns the
 * text of the first content block.
 *
 * @param prompt User message content.
 * @param systemPrompt System prompt sent alongside the user message.
 * @returns The `text` of the first content block in the response.
 * @throws Error if `ANTHROPIC_API_KEY` is not set, if the API answers with a
 *   non-2xx status, or if the response body does not start with a text block.
 */
async function callClaude(prompt: string, systemPrompt: string): Promise<string> {
  // Fail fast with a clear message instead of sending the literal header "undefined".
  const apiKey = process.env.ANTHROPIC_API_KEY
  if (!apiKey) throw new Error('ANTHROPIC_API_KEY environment variable is not set')

  const response = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', 'x-api-key': apiKey, 'anthropic-version': '2023-06-01' },
    body: JSON.stringify({
      model: 'claude-sonnet-4-20250514', max_tokens: 4096, system: systemPrompt,
      messages: [{ role: 'user', content: prompt }],
    }),
  })
  if (!response.ok) throw new Error(`Claude API returned ${response.status}: ${response.statusText}`)

  // The payload is external input: narrow it step by step rather than trusting `any`.
  const data: unknown = await response.json()
  const content = typeof data === 'object' && data !== null ? (data as { content?: unknown }).content : undefined
  const block: unknown = Array.isArray(content) ? content[0] : undefined
  if (typeof block !== 'object' || block === null) throw new Error('Unexpected Claude response format')

  const { type, text } = block as { type?: unknown; text?: unknown }
  if (type !== 'text' || typeof text !== 'string') throw new Error('Unexpected Claude response format')
  return text
}

// ── PDF Processing Methods ──────────────────────────────────────────────────

/** Arguments accepted by `extractText`. */
interface ExtractTextArgs {
  /** Location of the PDF; forwarded unchanged to `fetchPdfText`. */
  url: string
}

/**
 * Returns the raw text of a PDF together with simple size statistics.
 *
 * @param args See `ExtractTextArgs`.
 * @returns `text` and `pages` as produced by `fetchPdfText`, `wordCount` as the
 *   number of non-empty whitespace-separated tokens, and `charCount` as `text.length`.
 * @throws Whatever `fetchPdfText` throws.
 */
async function extractText(args: ExtractTextArgs): Promise<{
  result: { text: string; pages: number; wordCount: number; charCount: number }
}> {
  const extracted = await fetchPdfText(args.url)
  const body = extracted.text

  // Splitting on whitespace runs leaves empty tokens at the edges; skip those.
  let wordCount = 0
  for (const token of body.split(/\s+/)) {
    if (token) wordCount += 1
  }

  return {
    result: {
      text: body,
      pages: extracted.pages,
      wordCount,
      charCount: body.length,
    },
  }
}

/** Arguments accepted by `extractStructured`. */
interface ExtractStructuredArgs {
  /** Location of the PDF; forwarded unchanged to `fetchPdfText`. */
  url: string
  /** Free-form hint interpolated into the prompt; defaults to 'unknown'. */
  documentType?: string
  /** Field names to request; when omitted or empty the model is asked to auto-detect. */
  fields?: string[]
}

/**
 * Extracts named fields from a PDF by sending its text (first 50,000 characters)
 * to Claude and parsing the JSON reply.
 *
 * The reply is validated before it is returned: it must be a JSON object with an
 * object-valued `fields` property. Field values are normalised to strings (null
 * becomes "NOT_FOUND", nested values are JSON-encoded). If the reply is not valid
 * JSON or does not have that shape, a best-effort result is returned instead with
 * the raw reply under `fields._raw` and `confidence: 'low'`.
 *
 * @param args See `ExtractStructuredArgs`.
 * @returns The extracted fields, the document type, and a confidence label.
 * @throws Error if the PDF has no extractable text; errors from `fetchPdfText`
 *   and `callClaude` propagate unchanged.
 */
async function extractStructured(args: ExtractStructuredArgs): Promise<{
  result: { fields: Record<string, string>; documentType: string; confidence: string }
}> {
  const { text } = await fetchPdfText(args.url)
  if (text.trim().length === 0) throw new Error('PDF contains no extractable text (may be a scanned image)')
  const docType = args.documentType ?? 'unknown'
  const fieldList = args.fields?.length ? args.fields.join(', ') : 'auto-detect relevant fields'
  const raw = await callClaude(
    `Extract structured data from this ${docType} document.\n\nFields: ${fieldList}\n\nDocument:\n${text.slice(0, 50_000)}`,
    'Return ONLY valid JSON: { "fields": { "fieldName": "value" }, "documentType": "type", "confidence": "high"|"medium"|"low" }. Set missing fields to "NOT_FOUND". No markdown fences.'
  )

  // Deliberate best-effort result: hand the caller the raw reply rather than failing the call.
  const fallback = { result: { fields: { _raw: raw }, documentType: docType, confidence: 'low' } }
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value)

  let parsed: unknown
  try {
    // The model is told not to use fences, but strip them anyway in case it does.
    parsed = JSON.parse(raw.replace(/```json\n?|\n?```/g, '').trim())
  } catch {
    return fallback
  }

  // Valid JSON is not necessarily the requested shape (e.g. `[]`, `null`, a bare string).
  if (!isRecord(parsed) || !isRecord(parsed.fields)) return fallback

  // Honour the declared Record<string, string> even if the model emitted numbers, booleans or nulls.
  const fields: Record<string, string> = {}
  for (const [key, value] of Object.entries(parsed.fields)) {
    if (typeof value === 'string') fields[key] = value
    else if (value == null) fields[key] = 'NOT_FOUND'
    else if (typeof value === 'object') fields[key] = JSON.stringify(value)
    else fields[key] = String(value)
  }

  return {
    result: {
      fields,
      documentType: typeof parsed.documentType === 'string' ? parsed.documentType : docType,
      confidence: typeof parsed.confidence === 'string' ? parsed.confidence : 'low',
    },
  }
}

/** Arguments accepted by `pdfToMarkdown`. */
interface MarkdownArgs {
  /** Location of the PDF; forwarded unchanged to `fetchPdfText`. */
  url: string
  /** When truthy, the system prompt also asks for a YAML front-matter block. */
  includeMetadata?: boolean
}

/**
 * Converts a PDF to Markdown by sending its extracted text (first 50,000
 * characters) to Claude.
 *
 * @param args See `MarkdownArgs`.
 * @returns Claude's reply as `markdown` and the page count from `fetchPdfText`.
 * @throws Error if the PDF has no extractable text; errors from `fetchPdfText`
 *   and `callClaude` propagate unchanged.
 */
async function pdfToMarkdown(args: MarkdownArgs): Promise<{ result: { markdown: string; pages: number } }> {
  const extracted = await fetchPdfText(args.url)
  if (!extracted.text.trim()) {
    throw new Error('PDF contains no extractable text (may be a scanned image)')
  }

  // Build the system prompt first, appending the optional metadata instruction.
  let systemPrompt = 'Convert raw PDF text into clean Markdown with proper headings, lists, tables, and paragraphs.'
  if (args.includeMetadata) {
    systemPrompt += ' Include a YAML front-matter block with title, author, date, and page count if detectable.'
  }

  const userPrompt = 'Convert this PDF text into well-structured Markdown:\n\n' + extracted.text.slice(0, 50_000)
  const markdown = await callClaude(userPrompt, systemPrompt)

  return { result: { markdown, pages: extracted.pages } }
}

// ── Wrap with SettleGrid Billing ─────────────────────────────────────────────

// Public, billed entry points. Each raw handler is passed to `sg.wrap` together with
// the `method` key whose price is configured in `settlegrid.init` above
// (extract_text = 3, extract_structured = 6, pdf_to_markdown = 4 cents).
// NOTE(review): presumably the wrapped function charges the caller around the handler
// invocation and keeps the handler's argument/return shape — confirm against the SettleGrid SDK docs.
export const billedExtractText = sg.wrap(extractText, { method: 'extract_text' })
export const billedExtractStructured = sg.wrap(extractStructured, { method: 'extract_structured' })
export const billedPdfToMarkdown = sg.wrap(pdfToMarkdown, { method: 'pdf_to_markdown' })

// ── REST Alternative ────────────────────────────────────────────────────────
// import { settlegridMiddleware } from '@settlegrid/mcp/rest'
//
// const withBilling = settlegridMiddleware({
//   toolSlug: 'my-pdf-processor',
//   pricing: {
//     defaultCostCents: 3,
//     methods: {
//       extract_text: { costCents: 3 },
//       extract_structured: { costCents: 6 },
//       pdf_to_markdown: { costCents: 4 },
//     },
//   },
// })
//
// export async function POST(request: Request) {
//   return withBilling(request, async () => {
//     const { url } = await request.json()
//     const result = await extractText({ url })
//     return Response.json(result)
//   }, 'extract_text')
// }