# Source listing: extract_pdf_2.py (686 B)

import json
import os
import sys

import pdfplumber
  3. pdf_path = r"d:\Users\chenjun\kyj-yanglao-web-new\src\views\elderly\apply\check-in\颐年集团养老服务合同-2026年6月终版docx.pdf"
  4. with pdfplumber.open(pdf_path) as pdf:
  5. pages = []
  6. for i, page in enumerate(pdf.pages):
  7. text = page.extract_text() or ""
  8. pages.append({"page": i+1, "text": text})
  9. print("=== PAGE %d (chars: %d) ===" % (i+1, len(text)))
  10. print(text)
  11. print()
  12. out_path = r"d:\Users\chenjun\kyj-yanglao-web-new\contract_pages.json"
  13. with open(out_path, "w", encoding="utf-8") as f:
  14. json.dump(pages, f, ensure_ascii=False, indent=2)
  15. print("Saved to", out_path)