| 123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
- import * as pdfjsLib from 'pdfjs-dist/build/pdf.mjs';
- import fs from 'fs';
// Input PDF and output JSON locations.
// Both may be overridden on the command line: `node <script> [pdfPath] [outPath]`.
// The fallbacks are the original hard-coded locations, so running with no
// arguments behaves exactly as before.
const pdfPath =
  process.argv[2] ??
  'd:/Users/chenjun/kyj-yanglao-web-new/src/views/elderly/apply/check-in/颐年集团养老服务合同-2026年6月终版docx.pdf';
const outPath =
  process.argv[3] ?? 'd:/Users/chenjun/kyj-yanglao-web-new/contract_pages.json';
/**
 * Groups positioned text items into lines of text.
 *
 * Each item is expected to look like a pdf.js text item: a `str` plus a
 * `transform` array whose elements 4 and 5 are used here as the horizontal
 * and vertical position respectively.
 *
 * Items are ordered by exact vertical position (largest first), split into
 * rows wherever two consecutive items are more than `tolerance` units apart
 * vertically, and each row is then ordered by horizontal position. Sorting on
 * the exact position (instead of on coarse buckets) keeps items of one visual
 * line together and in left-to-right order regardless of where their y value
 * happens to fall.
 *
 * @param {Array<{str: string, transform: number[]}>} items - Text items; the array is not modified.
 * @param {number} [tolerance=5] - Maximum vertical gap between consecutive items of the same line.
 * @returns {string[]} One concatenated string per detected line, top-most (largest y) first.
 */
function groupItemsIntoLines(items, tolerance = 5) {
  // Sort a copy: the caller's array (pdf.js's own `content.items`) stays untouched.
  const byHeight = [...items].sort((a, b) => b.transform[5] - a.transform[5]);

  const rows = [];
  let previousY = null;
  for (const item of byHeight) {
    const y = item.transform[5];
    if (previousY === null || Math.abs(y - previousY) > tolerance) {
      rows.push([]);
    }
    rows[rows.length - 1].push(item);
    previousY = y;
  }

  const lines = rows.map((row) =>
    row
      .sort((a, b) => a.transform[4] - b.transform[4])
      .map((item) => item.str)
      .join(''),
  );

  // Keep the established output format: an empty final line is not emitted.
  if (lines.length > 0 && lines[lines.length - 1] === '') {
    lines.pop();
  }
  return lines;
}

/**
 * Reads the PDF at `pdfPath`, extracts the text of every page line by line,
 * prints each page to the console and writes all pages to `outPath` as a JSON
 * array of `{ page, text }` objects (page numbers start at 1).
 *
 * @returns {Promise<void>} Resolves once the JSON file has been written.
 *   Rejects if reading, parsing or writing fails.
 */
async function main() {
  const data = new Uint8Array(fs.readFileSync(pdfPath));
  const loadingTask = pdfjsLib.getDocument({ data });
  const pages = [];

  try {
    const pdf = await loadingTask.promise;
    console.log('Pages:', pdf.numPages);

    for (let i = 1; i <= pdf.numPages; i++) {
      const page = await pdf.getPage(i);
      const content = await page.getTextContent();
      const text = groupItemsIntoLines(content.items).join('\n');

      pages.push({ page: i, text });
      console.log(`--- PAGE ${i} (chars: ${text.length}) ---`);
      console.log(text);
      console.log();
    }
  } finally {
    // Release the document and its worker even when extraction fails part-way.
    await loadingTask.destroy();
  }

  fs.writeFileSync(outPath, JSON.stringify(pages, null, 2), 'utf8');
  console.log(`Saved to ${outPath}`);
}
/**
 * Rejection handler for the script's entry point: prints the error to stderr
 * and terminates the process with exit status 1.
 */
function reportFailure(error) {
  console.error(error);
  process.exit(1);
}

// Kick off the extraction; any rejection from main() ends the process with a failure status.
main().catch(reportFailure);
|