| 12345678910111213141516171819202122232425262728293031323334353637383940 |
- const pdfjsLib = require('pdfjs-dist/legacy/build/pdf.js');
- const fs = require('fs');
// Input PDF and output text file locations.
// Both may be overridden on the command line:
//   node <script> [pdfPath] [outPath]
// When an argument is absent (or empty) the original hard-coded location is used,
// so running the script with no arguments behaves exactly as before.
const pdfPath = process.argv[2] || 'd:/Users/chenjun/kyj-yanglao-web-new/src/views/elderly/apply/check-in/颐年集团养老服务合同-2026年6月终版docx.pdf';
const outPath = process.argv[3] || 'd:/Users/chenjun/kyj-yanglao-web-new/pdf_full_text.txt';
/**
 * Group text items into lines of text based on their Y position.
 *
 * Items are consumed in the order given. Each item's Y position is read from
 * `transform[5]` and rounded to one decimal place; a new line is started
 * whenever it differs from the previous item's Y by more than `tolerance`.
 * Item strings on the same line are concatenated without a separator, and
 * lines that contain only whitespace are dropped.
 *
 * @param {Array<{str: string, transform: number[]}>} items - Text items, e.g.
 *   the `items` array of a pdf.js text-content result.
 * @param {number} [tolerance=5] - Maximum Y difference for two consecutive
 *   items to be treated as belonging to the same line.
 * @returns {string[]} The non-blank lines, in input order.
 */
function groupItemsIntoLines(items, tolerance = 5) {
  const lines = [];
  let lastY = null;
  let currentLine = '';
  for (const item of items) {
    const y = Math.round(item.transform[5] * 10) / 10;
    if (lastY !== null && Math.abs(y - lastY) > tolerance) {
      if (currentLine.trim()) lines.push(currentLine);
      currentLine = '';
    }
    currentLine += item.str;
    lastY = y;
  }
  // Flush whatever is left after the last item.
  if (currentLine.trim()) lines.push(currentLine);
  return lines;
}

/**
 * Read the PDF at `pdfPath`, extract its text page by page and write the
 * result to `outPath`.
 *
 * Each page is emitted as a `===== PAGE n =====` header followed by that
 * page's lines. Progress is logged to the console.
 *
 * @returns {Promise<void>} Resolves once the output file has been written.
 * @throws Propagates any error from reading the file, loading or parsing the
 *   PDF, or writing the output.
 */
async function main() {
  const data = new Uint8Array(fs.readFileSync(pdfPath));
  const loadingTask = pdfjsLib.getDocument({ data });
  try {
    const pdf = await loadingTask.promise;
    console.log('Total pages:', pdf.numPages);

    const sections = [];
    // Pages are numbered from 1 and processed sequentially to keep output order.
    for (let i = 1; i <= pdf.numPages; i++) {
      const page = await pdf.getPage(i);
      const content = await page.getTextContent();
      const pageText = groupItemsIntoLines(content.items).join('\n');
      sections.push(`\n===== PAGE ${i} =====\n${pageText}\n`);
      console.log(`Page ${i}: ${pageText.length} chars`);
    }

    fs.writeFileSync(outPath, sections.join(''), 'utf8');
    console.log('Saved to', outPath);
  } finally {
    // Release the loading task (and the document it produced) on both the
    // success and the failure path so nothing is left running.
    await loadingTask.destroy();
  }
}
/**
 * Report a fatal failure on stderr and terminate with exit status 1.
 *
 * @param {unknown} failure - The rejection reason from `main()`.
 */
function reportFatal(failure) {
  console.error('ERROR:', failure);
  process.exit(1);
}

// Script entry point: run the extraction; any rejection is reported and exits non-zero.
main().catch(reportFatal);
|