"""Extract and categorize transactions from La Poste (CCP) PDF statements.

Each PDF is converted to text with the external ``pdftotext`` tool, the lines
that follow an 'Opérations' heading are scanned for rows starting with a
``dd/mm/yyyy`` date, and the resulting transactions can be written to a CSV.
"""

import csv
import glob
import os
import re
import subprocess
import unicodedata
from collections import defaultdict

# Keyword groups are tested in this order; the first group containing a keyword
# found in the description decides the category. 'cotisation' must stay ahead
# of the shorter 'cotis' so that both rules remain reachable.
_CATEGORY_KEYWORDS = (
    ('Transfer', ('virement',)),
    ('Cash Withdrawal', ('retrait',)),
    ('Card Payment', ('carte', 'paiement')),
    ('Bank Fees', ('frais', 'cotisation')),
    ('Deductions', ('cotis',)),
    ('Tax', ('impot',)),
    ('Utilities', ('edf', 'bouygues', 'orange')),
)

_DATE_PREFIX_RE = re.compile(r'\s*\d{2}/\d{2}/\d{4}')
_COLUMN_GAP_RE = re.compile(r'\s{2,}')
_AMOUNT_PREFIX_RE = re.compile(r'[\d.,]+')


def _strip_accents(text):
    """Return *text* with combining diacritics removed ('impôt' -> 'impot')."""
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


def categorize_laposte_transaction(description):
    """Categorize a La Poste transaction from its description text.

    Matching is case-insensitive and accent-insensitive, so 'Impôt' and
    'IMPOT' both map to 'Tax'.

    Args:
        description: Transaction label as found in the statement.

    Returns:
        One of 'Transfer', 'Cash Withdrawal', 'Card Payment', 'Bank Fees',
        'Deductions', 'Tax', 'Utilities', or 'Other' when no keyword matches.
    """
    normalized = _strip_accents(description.lower())
    for category, keywords in _CATEGORY_KEYWORDS:
        if any(keyword in normalized for keyword in keywords):
            return category
    return 'Other'


def _parse_amount(cells):
    """Return the first cell that parses as a number, or 0 when none does."""
    for cell in cells:
        # '¤' is kept alongside '€' because the original code strips both.
        cleaned = cell.replace('¤', '').replace('€', '')
        # Drop every kind of whitespace, including the non-breaking spaces
        # used as thousands separators ("1 234,56").
        cleaned = ''.join(cleaned.split())
        if not _AMOUNT_PREFIX_RE.match(cleaned):
            continue
        try:
            return float(cleaned.replace(',', '.'))
        except ValueError:
            # Starts like a number but is not one (e.g. "12rue"); try the
            # next column.
            continue
    return 0


def parse_laposte_transaction_line(line):
    """Parse one statement text line into ``(date, description, amount)``.

    Columns are assumed to be separated by runs of two or more whitespace
    characters. The line is stripped before splitting so that indented rows
    do not yield an empty first column, which would shift the date into the
    description.

    Args:
        line: A single line of ``pdftotext -layout`` output.

    Returns:
        A ``(date, description, amount)`` tuple, or ``None`` when the line
        does not start with a ``dd/mm/yyyy`` date, has fewer than three
        columns, or carries no positive amount.
    """
    if not _DATE_PREFIX_RE.match(line):
        return None
    cells = [cell.strip() for cell in _COLUMN_GAP_RE.split(line.strip())]
    if len(cells) < 3:
        return None
    amount = _parse_amount(cells[2:])
    if amount <= 0:
        return None
    return cells[0], cells[1], amount


def _extract_transactions(content, source):
    """Collect transaction dicts from the text of one converted statement."""
    transactions = []
    in_transaction_section = False
    for line in content.split('\n'):
        # Rows are only considered after the first 'Opérations' heading.
        if 'Opérations' in line:
            in_transaction_section = True
            continue
        # Skip everything before the table, plus header and footer lines.
        if (not in_transaction_section or 'Date' in line
                or 'Total' in line or 'Page' in line):
            continue
        parsed = parse_laposte_transaction_line(line)
        if parsed is None:
            continue
        date, description, amount = parsed
        transactions.append({
            'Date': date,
            'Description': description,
            'Category': categorize_laposte_transaction(description),
            'Amount': amount,
            'Source': source,
        })
    return transactions


def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Process La Poste account PDF files and extract their transactions.

    Args:
        directory: Directory searched (non-recursively) for ``*.pdf`` files.
        output_csv: When true and at least one transaction was found, write
            ``laposte_all_transactions.csv`` into *output_dir*.
        output_dir: Destination directory for the CSV; created if missing.

    Returns:
        A list of dicts with the keys 'Date', 'Description', 'Category',
        'Amount' and 'Source' (the PDF's base name). Files that ``pdftotext``
        cannot convert are reported on stdout and skipped.
    """
    # Sorted so that the transaction order does not depend on directory
    # listing order.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))

    all_transactions = []
    for pdf_file in pdf_files:
        try:
            # Ask for UTF-8 explicitly and decode it as such, so that the
            # accented 'Opérations' marker is found regardless of locale.
            result = subprocess.run(
                ['pdftotext', '-layout', '-enc', 'UTF-8', pdf_file, '-'],
                capture_output=True,
                encoding='utf-8',
                errors='replace',
                check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue
        all_transactions.extend(
            _extract_transactions(result.stdout, os.path.basename(pdf_file))
        )

    # Output CSV if requested
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'laposte_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- La Poste Account Statements ---")
    print(f"Found {len(pdf_files)} account statement files")
    print(f"Processed {len(all_transactions)} transactions")

    return all_transactions


def _main():
    """Parse command-line arguments and process the selected PDF directory."""
    import argparse

    parser = argparse.ArgumentParser(description='Process La Poste (CCP) account statements')
    parser.add_argument('--pdf-dir', default='../data/pdf/la_poste',
                        help='Directory containing La Poste PDF files')
    parser.add_argument('--output-dir', default='../../output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    # Process all PDF files in the directory
    process_laposte_pdf_files(args.pdf_dir, args.csv, args.output_dir)


if __name__ == "__main__":
    _main()