# extract_pdf_py.py (824 B)

  1. import sys
  2. import os
  3. pdf_path = r'd:\Users\chenjun\kyj-yanglao-web-new\src\views\elderly\apply\check-in\颐年集团养老服务合同-2026年6月终版docx.pdf'
  4. out_path = r'd:\Users\chenjun\kyj-yanglao-web-new\pdf_full_text.txt'
  5. try:
  6. import pdfplumber
  7. except ImportError:
  8. import subprocess
  9. subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pdfplumber', '--user'])
  10. import pdfplumber
  11. with pdfplumber.open(pdf_path) as pdf:
  12. print(f'Total pages: {len(pdf.pages)}')
  13. all_text = ''
  14. for i, page in enumerate(pdf.pages):
  15. text = page.extract_text() or ''
  16. all_text += f'\n===== PAGE {i+1} =====\n{text}\n'
  17. print(f'Page {i+1}: {len(text)} chars')
  18. with open(out_path, 'w', encoding='utf-8') as f:
  19. f.write(all_text)
  20. print(f'Saved to {out_path}')