| 12345678910111213141516171819 |
# Dump the text of every page of a PDF to the console (as a truncated
# preview) and to a JSON file (in full).
#
# Usage: python <this_script> [PDF_PATH [OUT_PATH]]
# With no arguments the hard-coded default locations below are used.
import json
import os
import sys

import pdfplumber

# Maximum number of characters of each page echoed to the console.
# The JSON output always receives the complete page text.
PREVIEW_CHARS = 2000

# Optional command-line overrides; the defaults are the original fixed paths.
pdf_path = (
    sys.argv[1]
    if len(sys.argv) > 1
    else r'd:\Users\chenjun\kyj-yanglao-web-new\src\views\elderly\apply\check-in\颐年集团养老服务合同-2026年6月终版docx.pdf'
)
out_path = (
    sys.argv[2]
    if len(sys.argv) > 2
    else r'd:\Users\chenjun\kyj-yanglao-web-new\contract_pages.json'
)

# One {'page': <1-based number>, 'text': <extracted text>} entry per page.
pages = []

with pdfplumber.open(pdf_path) as pdf:
    for page_no, page in enumerate(pdf.pages, start=1):
        # Fall back to an empty string when extract_text() returns a falsy
        # value, so len() and slicing below are always safe.
        text = page.extract_text() or ''
        pages.append({'page': page_no, 'text': text})

        print(f'=== PAGE {page_no} (chars: {len(text)}) ===')
        print(text[:PREVIEW_CHARS])
        if len(text) > PREVIEW_CHARS:
            print('...(truncated)')
        print()

# ensure_ascii=False keeps non-ASCII text readable in the output file
# instead of \uXXXX escapes; the file itself is written as UTF-8.
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(pages, f, ensure_ascii=False, indent=2)

print(f'\nTotal: {len(pages)} pages. Saved to {out_path}')
|