| 123456789101112131415161718192021222324 |
"""Dump the text of every page of a PDF into one UTF-8 text file.

Run as a script: reads ``pdf_path`` with pdfplumber, prints a per-page
character count, and writes all page texts (each preceded by a
``===== PAGE n =====`` banner) to ``out_path``.
"""
import sys
import os

pdf_path = r'd:\Users\chenjun\kyj-yanglao-web-new\src\views\elderly\apply\check-in\颐年集团养老服务合同-2026年6月终版docx.pdf'
out_path = r'd:\Users\chenjun\kyj-yanglao-web-new\pdf_full_text.txt'


def _import_pdfplumber():
    """Return the pdfplumber module, pip-installing it on first use if missing.

    Raises:
        subprocess.CalledProcessError: if the pip install command fails.
        ImportError: if pdfplumber still cannot be imported after installing.
    """
    try:
        import pdfplumber
    except ImportError:
        import importlib
        import subprocess
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pdfplumber', '--user'])
        # The package appeared after interpreter start-up; drop stale finder
        # caches so the retry below can actually see it.
        importlib.invalidate_caches()
        import pdfplumber
    return pdfplumber


def format_pages(page_texts):
    """Join per-page texts into one string with a banner before each page.

    Args:
        page_texts: iterable of strings, one per page, in page order.

    Returns:
        The concatenation of ``'\\n===== PAGE n =====\\n<text>\\n'`` for every
        page, numbered from 1; an empty string when there are no pages.
    """
    # A single join instead of repeated ``+=`` keeps this linear in total size.
    return ''.join(
        f'\n===== PAGE {number} =====\n{text}\n'
        for number, text in enumerate(page_texts, start=1)
    )


def extract_pdf_text(source, destination):
    """Extract the text of every page of ``source`` and save it to ``destination``.

    Prints the total page count, the character count of each page, and the
    output location once the file has been written.

    Args:
        source: path of the PDF file to read.
        destination: path of the UTF-8 text file to create or overwrite.
    """
    pdfplumber = _import_pdfplumber()

    page_texts = []
    with pdfplumber.open(source) as pdf:
        print(f'Total pages: {len(pdf.pages)}')
        for number, page in enumerate(pdf.pages, start=1):
            # extract_text() may yield a falsy value (e.g. None) for a page;
            # normalise that to an empty string so len() and formatting work.
            text = page.extract_text() or ''
            page_texts.append(text)
            print(f'Page {number}: {len(text)} chars')

    with open(destination, 'w', encoding='utf-8') as f:
        f.write(format_pages(page_texts))
    print(f'Saved to {destination}')


if __name__ == '__main__':
    extract_pdf_text(pdf_path, out_path)
|