# xiongxing / kyj-yanglao-web-new
							import pdfplumber
import json, sys, os

# Extract the text of every page of a PDF with pdfplumber, print a preview of
# each page to stdout, and save the full per-page text as a JSON list of
# {'page': <1-based number>, 'text': <str>} objects.
#
# Usage: python <this script> [PDF_PATH [OUT_PATH]]
# Both arguments are optional. When omitted, the defaults below are used, so
# running the script with no arguments behaves exactly as it did before.
DEFAULT_PDF_PATH = r'd:\Users\chenjun\kyj-yanglao-web-new\src\views\elderly\apply\check-in\颐年集团养老服务合同-2026年6月终版docx.pdf'
DEFAULT_OUT_PATH = r'd:\Users\chenjun\kyj-yanglao-web-new\contract_pages.json'

# Maximum number of characters of each page echoed to stdout; the JSON file
# always receives the complete text.
PREVIEW_CHARS = 2000

pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
out_path = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_OUT_PATH

pages = []
with pdfplumber.open(pdf_path) as pdf:
    for page_no, page in enumerate(pdf.pages, start=1):
        # `or ''` normalizes a falsy result (such as None) to an empty string
        # so that len() and slicing below are always safe.
        text = page.extract_text() or ''
        pages.append({'page': page_no, 'text': text})
        print(f'=== PAGE {page_no} (chars: {len(text)}) ===')
        print(text[:PREVIEW_CHARS])
        if len(text) > PREVIEW_CHARS:
            print('...(truncated)')
        print()

# The PDF is no longer needed once every page has been read, so the JSON is
# written after the PDF handle has been closed.
# ensure_ascii=False keeps non-ASCII text readable in the output file.
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(pages, f, ensure_ascii=False, indent=2)
print(f'\nTotal: {len(pages)} pages. Saved to {out_path}')