'use strict';

/**
 * Extracts the text of every page of a PDF with pdf.js, prints it to stdout,
 * and saves the result as a JSON array of `{ page, text }` objects.
 *
 * Usage: node <this-script> [pdfPath] [outPath]
 * Both arguments are optional; when omitted, the default paths below are used.
 */

const pdfjsLib = require('pdfjs-dist/legacy/build/pdf.js');
const fs = require('fs');

// Default locations, used when no command-line override is supplied.
const DEFAULT_PDF_PATH = 'd:/Users/chenjun/kyj-yanglao-web-new/src/views/elderly/apply/check-in/颐年集团养老服务合同-2026年6月终版docx.pdf';
const DEFAULT_OUT_PATH = 'd:/Users/chenjun/kyj-yanglao-web-new/contract_pages.json';

// `||` (not a nullish check) on purpose: an empty-string argument is not a usable path.
const pdfPath = process.argv[2] || DEFAULT_PDF_PATH;
const outPath = process.argv[3] || DEFAULT_OUT_PATH;

// Two consecutive text items whose vertical offsets differ by more than this
// many units are considered to be on different lines.
const LINE_BREAK_TOLERANCE = 3;

/**
 * Groups pdf.js text items into lines of text.
 *
 * Items are concatenated in the order given; a new line is started whenever
 * the sixth transform component (index 5, used here as the item's vertical
 * position) moves by more than `tolerance` relative to the previous item.
 *
 * @param {Array<{str: string, transform: number[]}>} items - Text items from `getTextContent()`.
 * @param {number} [tolerance=LINE_BREAK_TOLERANCE] - Maximum vertical delta within one line.
 * @returns {string[]} The reconstructed lines, in reading order of the input.
 */
function groupItemsIntoLines(items, tolerance = LINE_BREAK_TOLERANCE) {
  const lines = [];
  let lastY = null;
  let currentLine = '';

  for (const item of items) {
    const y = item.transform[5];
    if (lastY !== null && Math.abs(y - lastY) > tolerance) {
      lines.push(currentLine);
      currentLine = '';
    }
    currentLine += item.str;
    lastY = y;
  }

  // Flush the trailing line, unless nothing was accumulated.
  if (currentLine) {
    lines.push(currentLine);
  }
  return lines;
}

/**
 * Loads the PDF at `pdfPath`, extracts the text of each page, logs it, and
 * writes all pages to `outPath` as pretty-printed JSON.
 *
 * @returns {Promise<void>} Resolves once the JSON file has been written.
 * @throws Propagates any file-system or pdf.js error to the caller.
 */
async function main() {
  const data = new Uint8Array(fs.readFileSync(pdfPath));
  const loadingTask = pdfjsLib.getDocument({ data });

  try {
    const pdf = await loadingTask.promise;
    console.log('Pages:', pdf.numPages);

    const pages = [];
    // pdf.js page numbers are 1-based.
    for (let i = 1; i <= pdf.numPages; i++) {
      const page = await pdf.getPage(i);
      const content = await page.getTextContent();
      const text = groupItemsIntoLines(content.items).join('\n');

      pages.push({ page: i, text });
      console.log('--- PAGE', i, '---');
      console.log(text);
      console.log();
    }

    fs.writeFileSync(outPath, JSON.stringify(pages, null, 2), 'utf8');
    console.log('Saved to', outPath);
  } finally {
    // Release the document and its worker whether extraction succeeded or not.
    await loadingTask.destroy();
  }
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});