// extract_pdf_script.js
// Extracts the text of a PDF page by page with pdf.js and saves it to a UTF-8 text file.

  1. const pdfjsLib = require('pdfjs-dist/legacy/build/pdf.js');
  2. const fs = require('fs');
  3. const pdfPath = 'd:/Users/chenjun/kyj-yanglao-web-new/src/views/elderly/apply/check-in/颐年集团养老服务合同-2026年6月终版docx.pdf';
  4. const outPath = 'd:/Users/chenjun/kyj-yanglao-web-new/pdf_full_text.txt';
  5. async function main() {
  6. const data = new Uint8Array(fs.readFileSync(pdfPath));
  7. const loadingTask = pdfjsLib.getDocument({ data });
  8. const pdf = await loadingTask.promise;
  9. console.log('Total pages:', pdf.numPages);
  10. let allText = '';
  11. for (let i = 1; i <= pdf.numPages; i++) {
  12. const page = await pdf.getPage(i);
  13. const content = await page.getTextContent();
  14. const lines = [];
  15. let lastY = null;
  16. let currentLine = '';
  17. for (const item of content.items) {
  18. const y = Math.round(item.transform[5] * 10) / 10;
  19. if (lastY !== null && Math.abs(y - lastY) > 5) {
  20. if (currentLine.trim()) lines.push(currentLine);
  21. currentLine = '';
  22. }
  23. currentLine += item.str;
  24. lastY = y;
  25. }
  26. if (currentLine.trim()) lines.push(currentLine);
  27. const pageText = lines.join('\n');
  28. allText += `\n===== PAGE ${i} =====\n${pageText}\n`;
  29. console.log(`Page ${i}: ${pageText.length} chars`);
  30. }
  31. fs.writeFileSync(outPath, allText, 'utf8');
  32. console.log('Saved to', outPath);
  33. }
  34. main().catch(err => { console.error('ERROR:', err); process.exit(1); });