# extract_pdf.py (783 B)
  1. import pdfplumber
  2. import json, sys, os
  3. pdf_path = r'd:\Users\chenjun\kyj-yanglao-web-new\src\views\elderly\apply\check-in\颐年集团养老服务合同-2026年6月终版docx.pdf'
  4. with pdfplumber.open(pdf_path) as pdf:
  5. pages = []
  6. for i, page in enumerate(pdf.pages):
  7. text = page.extract_text() or ''
  8. pages.append({'page': i+1, 'text': text})
  9. print(f'=== PAGE {i+1} (chars: {len(text)}) ===')
  10. print(text[:2000])
  11. if len(text) > 2000:
  12. print('...(truncated)')
  13. print()
  14. out_path = r'd:\Users\chenjun\kyj-yanglao-web-new\contract_pages.json'
  15. with open(out_path, 'w', encoding='utf-8') as f:
  16. json.dump(pages, f, ensure_ascii=False, indent=2)
  17. print(f'\nTotal: {len(pages)} pages. Saved to {out_path}')