// extract_pdf.js

  // Third-party PDF parser (pdfjs-dist legacy build) and Node's file-system module.
  const pdfjsLib = require('pdfjs-dist/legacy/build/pdf.js');
  const fs = require('fs');

  // Input PDF and output JSON locations. Either may be overridden on the
  // command line:
  //   node extract_pdf.js [input.pdf] [output.json]
  // When an argument is omitted (or empty) the original hard-coded path is used,
  // so running the script with no arguments behaves exactly as before.
  const pdfPath =
    process.argv[2] ||
    'd:/Users/chenjun/kyj-yanglao-web-new/src/views/elderly/apply/check-in/颐年集团养老服务合同-2026年6月终版docx.pdf';
  const outPath =
    process.argv[3] ||
    'd:/Users/chenjun/kyj-yanglao-web-new/contract_pages.json';
  /**
   * Groups pdf.js text items into lines of text.
   *
   * Items are consumed in the order given. A new line is started whenever an
   * item's vertical position (`transform[5]`) differs from the previous item's
   * by more than `tolerance`. Text of items on the same line is concatenated
   * with no separator.
   *
   * @param {Array<{str: string, transform: number[]}>} items - Text items, e.g.
   *   the `items` array returned by `page.getTextContent()`.
   * @param {number} [tolerance=3] - Maximum vertical difference for two
   *   consecutive items to be treated as the same line.
   * @returns {string[]} The reconstructed lines, in input order.
   */
  function groupItemsIntoLines(items, tolerance = 3) {
    const lines = [];
    let lastY = null;
    let currentLine = '';

    for (const item of items) {
      // Skip entries that carry no text or position. NOTE(review): pdf.js can
      // emit marked-content entries without `str`/`transform` when that option
      // is enabled; it is not enabled here, so this is purely defensive.
      if (typeof item.str !== 'string' || !item.transform) {
        continue;
      }

      const y = item.transform[5];
      if (lastY !== null && Math.abs(y - lastY) > tolerance) {
        lines.push(currentLine);
        currentLine = '';
      }
      currentLine += item.str;
      lastY = y;
    }

    // Flush the trailing line, unless it is empty.
    if (currentLine) {
      lines.push(currentLine);
    }
    return lines;
  }

  /**
   * Extracts the text of every page of a PDF, prints it to stdout, and saves
   * the result as a JSON array of `{ page, text }` objects.
   *
   * @param {string} [inputPath=pdfPath] - Path of the PDF file to read.
   * @param {string} [outputPath=outPath] - Path of the JSON file to write.
   * @returns {Promise<void>} Resolves once the JSON file has been written.
   *   Rejects if the PDF cannot be read or parsed, or the output cannot be
   *   written.
   */
  async function main(inputPath = pdfPath, outputPath = outPath) {
    const data = new Uint8Array(fs.readFileSync(inputPath));
    const loadingTask = pdfjsLib.getDocument({ data });

    try {
      const pdf = await loadingTask.promise;
      console.log('Pages:', pdf.numPages);

      const pages = [];
      // pdf.js page numbers are 1-based.
      for (let i = 1; i <= pdf.numPages; i++) {
        const page = await pdf.getPage(i);
        const content = await page.getTextContent();
        const text = groupItemsIntoLines(content.items).join('\n');

        pages.push({ page: i, text });
        console.log('--- PAGE', i, '---');
        console.log(text);
        console.log();

        // Free per-page resources before moving on to the next page.
        page.cleanup();
      }

      fs.writeFileSync(outputPath, JSON.stringify(pages, null, 2), 'utf8');
      console.log('Saved to', outputPath);
    } finally {
      // Always release the document and its worker resources, even on failure.
      await loadingTask.destroy();
    }
  }
  // Script entry point: run the extraction; on any failure, log the error and
  // exit with a non-zero status so callers can detect it.
  main().catch((err) => {
    console.error(err);
    process.exit(1);
  });