import * as pdfjsLib from 'pdfjs-dist/build/pdf.mjs';
import fs from 'fs';

// Default locations; either can be overridden from the command line:
//   node <script> [pdfPath] [outPath]
const DEFAULT_PDF_PATH = 'd:/Users/chenjun/kyj-yanglao-web-new/src/views/elderly/apply/check-in/颐年集团养老服务合同-2026年6月终版docx.pdf';
const DEFAULT_OUT_PATH = 'd:/Users/chenjun/kyj-yanglao-web-new/contract_pages.json';

const pdfPath = process.argv[2] ?? DEFAULT_PDF_PATH;
const outPath = process.argv[3] ?? DEFAULT_OUT_PATH;

// Maximum vertical distance between a text item's baseline and the baseline
// that opened the current row for the item to still belong to that row.
const LINE_TOLERANCE = 5;

/**
 * Groups text items into visual lines and returns the text of each line.
 *
 * Items are ordered top-to-bottom by `transform[5]` (the y translation; PDF
 * y grows upwards, so larger values come first), clustered into rows using a
 * single tolerance, and only then ordered left-to-right by `transform[4]`
 * (the x translation) inside each row.
 *
 * Sorting and clustering use the same criterion on purpose: bucketing y for
 * the sort (e.g. `Math.floor(y / 10)`) while splitting rows on a different
 * threshold lets items of one row straddle a bucket boundary (wrong x order)
 * or lets two rows share a bucket (interleaved, fragmented lines).
 *
 * The input array is not mutated.
 *
 * @param {Array<{str: string, transform: number[]}>} items - Text items of one page.
 * @param {number} [tolerance=LINE_TOLERANCE] - Max baseline distance within a row.
 * @returns {string[]} One string per row, top to bottom.
 */
function groupItemsIntoLines(items, tolerance = LINE_TOLERANCE) {
  const topToBottom = [...items].sort((a, b) => b.transform[5] - a.transform[5]);

  const rows = [];
  let currentRow = null;
  for (const item of topToBottom) {
    const y = item.transform[5];
    // Compare against the baseline that opened the row (not the previous
    // item) so that a chain of small offsets cannot merge distinct rows.
    if (currentRow === null || Math.abs(currentRow.y - y) > tolerance) {
      currentRow = { y, items: [] };
      rows.push(currentRow);
    }
    currentRow.items.push(item);
  }

  return rows.map((row) =>
    row.items
      .sort((a, b) => a.transform[4] - b.transform[4])
      .map((item) => item.str)
      .join(''),
  );
}

/**
 * Reads the PDF at `pdfPath`, extracts the text of every page as
 * newline-separated visual lines, prints each page to stdout and writes the
 * result to `outPath` as a JSON array of `{ page, text }` objects.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const data = new Uint8Array(fs.readFileSync(pdfPath));
  const loadingTask = pdfjsLib.getDocument({ data });
  const pdf = await loadingTask.promise;
  console.log('Pages:', pdf.numPages);

  const pages = [];
  // pdf.js page numbers are 1-based.
  for (let i = 1; i <= pdf.numPages; i++) {
    const page = await pdf.getPage(i);
    const content = await page.getTextContent();
    const text = groupItemsIntoLines(content.items).join('\n');

    pages.push({ page: i, text });
    console.log(`--- PAGE ${i} (chars: ${text.length}) ---`);
    console.log(text);
    console.log();
  }

  fs.writeFileSync(outPath, JSON.stringify(pages, null, 2), 'utf8');
  console.log(`Saved to ${outPath}`);
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});