// extract_pdf.mjs (1.6 KB)

  import fs from 'fs';
  import process from 'node:process';

  import * as pdfjsLib from 'pdfjs-dist/build/pdf.mjs';
  // Input PDF and output JSON locations.
  // Both can be overridden on the command line:
  //   node extract_pdf.mjs [pdfPath] [outPath]
  // When an argument is omitted, the original hard-coded path is used, so
  // running the script without arguments behaves exactly as before.
  const pdfPath =
    process.argv[2] ??
    'd:/Users/chenjun/kyj-yanglao-web-new/src/views/elderly/apply/check-in/颐年集团养老服务合同-2026年6月终版docx.pdf';
  const outPath =
    process.argv[3] ?? 'd:/Users/chenjun/kyj-yanglao-web-new/contract_pages.json';
  /**
   * Groups positioned text items into visual lines, top of the page first.
   *
   * Items are ordered by their vertical position (`transform[5]`, descending),
   * split into a new line whenever the vertical gap to the previous item
   * exceeds `tolerance`, and then ordered left-to-right (`transform[4]`) inside
   * each line. Ordering by exact height before clustering avoids the two
   * failure modes of fixed-size bucketing: items of one line straddling a
   * bucket boundary, and two close lines sharing a bucket and being interleaved.
   *
   * @param {Array<{str: string, transform: number[]}>} items - Text items to group.
   * @param {number} [tolerance=5] - Largest vertical gap still treated as the same line.
   * @returns {string[]} One concatenated string per detected line.
   */
  function groupItemsIntoLines(items, tolerance = 5) {
    // Copy before sorting so the caller's array is left untouched.
    const byHeight = [...items].sort((a, b) => b.transform[5] - a.transform[5]);

    const rows = [];
    let lastY = null;
    for (const item of byHeight) {
      const y = item.transform[5];
      if (lastY === null || Math.abs(y - lastY) > tolerance) {
        rows.push([]);
      }
      rows.at(-1).push(item);
      lastY = y;
    }

    return rows.map((row) =>
      row
        .sort((a, b) => a.transform[4] - b.transform[4])
        .map((item) => item.str)
        .join(''),
    );
  }

  /**
   * Extracts the text of a single page as newline-separated lines.
   *
   * @param {object} page - Page object exposing `getTextContent()`.
   * @returns {Promise<string>} The page text, one visual line per text line.
   */
  async function extractPageText(page) {
    const { items } = await page.getTextContent();
    const lines = groupItemsIntoLines(items);
    // A final line with no characters is dropped; empty lines in the middle
    // of the page are kept so vertical gaps stay visible in the output.
    if (lines.at(-1) === '') {
      lines.pop();
    }
    return lines.join('\n');
  }

  /**
   * Reads the PDF at `pdfPath`, extracts the text of every page, prints each
   * page to the console and writes `[{ page, text }, ...]` as JSON to `outPath`.
   *
   * @returns {Promise<void>} Resolves once the JSON file has been written.
   * @throws Propagates any error from reading the PDF, parsing it, or writing the output.
   */
  async function main() {
    const data = new Uint8Array(fs.readFileSync(pdfPath));
    const loadingTask = pdfjsLib.getDocument({ data });
    try {
      const pdf = await loadingTask.promise;
      console.log('Pages:', pdf.numPages);

      const pages = [];
      // Pages are numbered from 1.
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const text = await extractPageText(page);
        pages.push({ page: pageNumber, text });
        console.log(`--- PAGE ${pageNumber} (chars: ${text.length}) ---`);
        console.log(text);
        console.log();
      }

      fs.writeFileSync(outPath, JSON.stringify(pages, null, 2), 'utf8');
      console.log(`Saved to ${outPath}`);
    } finally {
      // Release the document and its worker whether or not extraction succeeded.
      await loadingTask.destroy();
    }
  }
  // Script entry point: run main(); on any failure log the error and exit with status 1.
  main().catch((error) => {
    console.error(error);
    process.exit(1);
  });