Enhance SNCF script to extract NET PAYÉ EN EUROS amount

2026-02-09 14:15:15 +01:00
parent 3754bb6ca6
commit ef23d066e0
36 changed files with 713 additions and 122 deletions
--- a/scripts/process_laposte_improved.py
+++ b/scripts/process_laposte_improved.py
@@ -0,0 +1,124 @@
+import subprocess
+import re
+import csv
+import os
+import glob
+from collections import defaultdict
+
+def categorize_laposte_transaction(description):
+    """Return the category of a La Poste transaction description.
+
+    Case- and accent-insensitive ('Impôts' matches 'impot'); the first rule
+    with a keyword found in the text wins, else 'Other' (also for ''/None).
+    """
+    import unicodedata  # local import: only this function needs it
+    rules = (
+        ('Transfer', ('virement',)),
+        ('Cash Withdrawal', ('retrait',)),
+        ('Card Payment', ('carte', 'paiement')),
+        ('Bank Fees', ('frais', 'cotisation')),
+        ('Deductions', ('cotis',)),  # abbreviations only: 'cotisation' is Bank Fees
+        ('Tax', ('impot',)),
+        ('Utilities', ('edf', 'bouygues', 'orange')),
+    )
+    # NFKD splits 'ô' into 'o' + a combining mark, which is then dropped.
+    text = unicodedata.normalize('NFKD', (description or '').lower())
+    text = ''.join(ch for ch in text if not unicodedata.combining(ch))
+    return next((cat for cat, words in rules if any(w in text for w in words)), 'Other')
+
+def _parse_laposte_line(line):
+    """Return (date, description, amount) for a transaction row, else None."""
+    # Strip before splitting: rows in the layout text may be indented, and an
+    # indented row would yield an empty first cell and shift every column.
+    cells = re.split(r'\s{2,}', line.strip())
+    if len(cells) < 3 or not re.match(r'\d{2}/\d{2}/\d{4}', cells[0]):
+        return None
+    for cell in cells[2:]:
+        # Drop currency signs ('¤', '€') and all whitespace, including the
+        # non-breaking spaces used as thousands separators ("1 234,56").
+        text = re.sub(r'[\s¤€]', '', cell).replace(',', '.')
+        if re.match(r'[\d.]', text):
+            try:
+                amount = float(text)
+            except ValueError:  # e.g. a second date column such as 01/02/2024
+                continue
+            # The first numeric cell decides; non-positive amounts are dropped.
+            return (cells[0], cells[1], amount) if amount > 0 else None
+    return None
+
+def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
+    """Extract transactions from every La Poste statement PDF in a directory.
+
+    Each PDF is converted with the external `pdftotext -layout` command; after
+    the first line containing 'Opérations', every row starting with a
+    DD/MM/YYYY date and carrying a positive amount is recorded. Files that fail
+    to convert (or a missing pdftotext) are reported; a summary is printed.
+
+    Args:
+        directory: Folder searched (non-recursively) for ``*.pdf`` files.
+        output_csv: If true and rows were found, write them to
+            ``laposte_all_transactions.csv`` inside ``output_dir``.
+        output_dir: Destination folder for the CSV; created when missing.
+
+    Returns:
+        List of dicts with keys Date, Description, Category, Amount, Source.
+    """
+    # Escaped so [ ] * ? in the directory stay literal; sorted for stable output.
+    pdf_files = sorted(glob.glob(os.path.join(glob.escape(directory), "*.pdf")))
+    all_transactions = []
+
+    for pdf_file in pdf_files:
+        try:
+            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
+                                    capture_output=True, text=True, check=True)
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            print(f"Error processing {pdf_file}: {e}")
+            continue
+        in_transaction_section = False
+        for line in result.stdout.split('\n'):
+            if 'Opérations' in line:
+                in_transaction_section = True
+                continue
+            # No 'Date'/'Total'/'Page' keyword filter: non-row lines are rejected by
+            # the date check, and it dropped real rows like "CARTE Total Station".
+            parsed = _parse_laposte_line(line) if in_transaction_section else None
+            if parsed is None:
+                continue
+            date, description, amount = parsed
+            all_transactions.append({
+                'Date': date,
+                'Description': description,
+                'Category': categorize_laposte_transaction(description),
+                'Amount': amount,
+                'Source': os.path.basename(pdf_file),
+            })
+
+    if output_csv and all_transactions:
+        os.makedirs(output_dir, exist_ok=True)
+        csv_file = os.path.join(output_dir, 'laposte_all_transactions.csv')
+        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
+            writer = csv.DictWriter(
+                csvfile, fieldnames=['Date', 'Description', 'Category', 'Amount', 'Source'])
+            writer.writeheader()
+            writer.writerows(all_transactions)
+        print(f"\nTransaction data saved to {csv_file}")
+
+    print("--- La Poste Account Statements ---")
+    print(f"Found {len(pdf_files)} account statement files")
+    print(f"Processed {len(all_transactions)} transactions")
+    return all_transactions
+
+if __name__ == "__main__":
+    import argparse
+
+    # Command-line entry point: parse the options, then run the extraction.
+    cli = argparse.ArgumentParser(
+        description='Process La Poste (CCP) account statements')
+    cli.add_argument('--pdf-dir', default='../data/pdf/la_poste',
+                     help='Directory containing La Poste PDF files')
+    cli.add_argument('--output-dir', default='../../output/csv',
+                     help='Directory to save CSV output files')
+    cli.add_argument('--csv', action='store_true',
+                     help='Output transaction data to CSV files')
+    options = cli.parse_args()
+    process_laposte_pdf_files(options.pdf_dir, options.csv, options.output_dir)