Enhance SNCF script to extract NET PAYÉ EN EUROS amount

2026-02-09 14:15:15 +01:00
parent 3754bb6ca6
commit ef23d066e0
36 changed files with 713 additions and 122 deletions
--- a/data/pdf/2-la.poste/Relevé
+++ b/data/pdf/2-la.poste/Relevé
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20250121.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20250121.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20250221.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20250221.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20250321.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20250321.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20250418.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20250418.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20250521.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20250521.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20250620.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20250620.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20250721.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20250721.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20250821.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20250821.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20250919.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20250919.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20251006.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20251006.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20251105.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20251105.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20251205.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20251205.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20260105.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20260105.pdf
--- a/data/pdf/2-la.poste/releve_CCP0447956B018_20260205.pdf
+++ b/data/pdf/2-la.poste/releve_CCP0447956B018_20260205.pdf
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/data/pdf/1-sncf/salaire
+++ b/data/pdf/1-sncf/salaire
--- a/output/csv/sncf_all_transactions.csv
+++ b/output/csv/sncf_all_transactions.csv
@@ -1,15 +1,15 @@
 Date,Description,Category,Amount,Source
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de AOUT 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de AVRIL 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de DECEMBRE 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de FEVRIER 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JANVIER 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JANVIER 2026.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JUILLET 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JUIN 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de MAI 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de MARS 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de NOVEMBRE 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de OCTOBRE 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de PRIME 2025.pdf
-01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de SEPTEMBRE 2025.pdf
+01/August/25,Salaire August 25,Salary,3578.49,salaire de AOUT 2025.pdf
+01/April/25,Salaire April 25,Salary,5602.35,salaire de AVRIL 2025.pdf
+01/December/25,Salaire December 25,Salary,3978.49,salaire de DECEMBRE 2025.pdf
+01/February/25,Salaire February 25,Salary,3546.95,salaire de FEVRIER 2025.pdf
+01/January/25,Salaire January 25,Salary,3546.95,salaire de JANVIER 2025.pdf
+01/January/26,Salaire January 26,Salary,3578.49,salaire de JANVIER 2026.pdf
+01/July/25,Salaire July 25,Salary,3578.49,salaire de JUILLET 2025.pdf
+01/June/25,Salaire June 25,Salary,4553.93,salaire de JUIN 2025.pdf
+01/May/25,Salaire May 25,Salary,3578.49,salaire de MAI 2025.pdf
+01/March/25,Salaire March 25,Salary,3546.95,salaire de MARS 2025.pdf
+01/November/25,Salaire November 25,Salary,3554.89,salaire de NOVEMBRE 2025.pdf
+01/October/25,Salaire October 25,Salary,3594.22,salaire de OCTOBRE 2025.pdf
+01/January/2025,Salaire January 2025,Salary,3547.79,salaire de PRIME 2025.pdf
+01/September/25,Salaire September 25,Salary,3578.49,salaire de SEPTEMBRE 2025.pdf
--- a/scripts/dynamic_processor.py
+++ b/scripts/dynamic_processor.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""
+Dynamic script to auto-discover and process all financial statements
+"""
+
+import os
+import subprocess
+import sys
+import glob
+import re
+from collections import defaultdict
+import calendar
+import argparse
+from datetime import datetime
+
+def discover_pdf_directories(base_data_dir):
+    """
+    Scan base data directory and return all subdirectories containing PDF files.
+
+    Only the immediate children of ``base_data_dir`` are inspected, and only
+    files matching ``*.pdf`` directly inside each child are counted.
+
+    Args:
+        base_data_dir: Directory whose immediate subdirectories are scanned.
+
+    Returns:
+        dict mapping an account type label to a dict with the keys ``'path'``
+        (subdirectory path), ``'count'`` (number of PDF files) and ``'files'``
+        (sorted list of PDF paths). Empty when ``base_data_dir`` is not a
+        directory. If two subdirectories resolve to the same account type,
+        the one visited last (alphabetical order) replaces the earlier entry.
+    """
+    # Ordered (keywords, label) rules: the first rule with a keyword contained
+    # in the lower-cased directory name wins. 'la.poste' / 'laposte' cover
+    # directory names such as '2-la.poste'.
+    account_rules = (
+        (('boursobank', 'releve-compte'), 'Boursobank'),
+        (('american_express', 'amex'), 'American Express'),
+        (('monabanq', 'extrait'), 'Monabanq'),
+        (('sncf', 'salaire'), 'SNCF'),
+        (('la_poste', 'la-poste', 'la.poste', 'laposte', 'releve_ccp'), 'La Poste'),
+        (('impots', 'impot'), 'Impôts'),
+    )
+
+    pdf_dirs = {}
+
+    # A missing data directory means "nothing discovered", not a crash.
+    if not os.path.isdir(base_data_dir):
+        return pdf_dirs
+
+    # Sorted so discovery order is the same on every run and filesystem.
+    for item in sorted(os.listdir(base_data_dir)):
+        dir_path = os.path.join(base_data_dir, item)
+        if not os.path.isdir(dir_path):
+            continue
+
+        pdf_files = sorted(glob.glob(os.path.join(dir_path, "*.pdf")))
+        if not pdf_files:
+            continue
+
+        # Unknown directories fall back to a readable label built from the name.
+        account_type = item.replace('_', ' ').title()
+        dir_name_lower = item.lower()
+        for keywords, label in account_rules:
+            if any(keyword in dir_name_lower for keyword in keywords):
+                account_type = label
+                break
+
+        pdf_dirs[account_type] = {
+            'path': dir_path,
+            'count': len(pdf_files),
+            'files': pdf_files
+        }
+
+    return pdf_dirs
+
+def process_dynamic_pdf_files(process_script, pdf_directory, output_dir):
+    """
+    Generic function to process PDF files in any directory.
+
+    Runs ``process_script`` with the current interpreter as
+    ``<script> --pdf-dir <pdf_directory> --output-dir <output_dir> --csv``
+    and echoes the script's standard output.
+
+    Args:
+        process_script: Path or bare file name of the processing script, or
+            ``None`` when no script exists for the account. A path that does
+            not exist relative to the current working directory is looked up
+            next to this file instead.
+        pdf_directory: Directory expected to contain ``*.pdf`` files.
+        output_dir: Directory handed to the script via ``--output-dir``.
+
+    Returns:
+        bool: True when the script ran and exited with status 0; False when
+        there was nothing to run or the script failed.
+    """
+    if process_script is None:
+        print(f"Warning: No processing script configured for {pdf_directory}")
+        return False
+
+    if not os.path.exists(pdf_directory):
+        print(f"Warning: Directory not found: {pdf_directory}")
+        return False
+
+    # Get all PDF files
+    pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))
+
+    if not pdf_files:
+        print(f"No PDF files found in {pdf_directory}")
+        return False
+
+    # Resolve the script: keep the cwd-relative behaviour when that file
+    # exists, otherwise look next to this module so the caller's working
+    # directory does not matter.
+    script_path = os.path.abspath(process_script)
+    if not os.path.isfile(script_path):
+        here = os.path.dirname(os.path.abspath(__file__))
+        script_path = os.path.join(here, os.path.basename(process_script))
+
+    cmd = [sys.executable, script_path,
+           '--pdf-dir', pdf_directory, '--output-dir', output_dir, '--csv']
+
+    # Run the processing script. text=True decodes the captured output so it
+    # prints as text rather than as a bytes literal.
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Error processing {pdf_directory}: {e}")
+        # The child's own error output is the useful part of the failure.
+        if e.stderr:
+            print(e.stderr)
+        return False
+
+    print(result.stdout)
+    return True
+
+def main():
+    """
+    Main function to dynamically discover and process all financial statements.
+
+    Command-line options:
+        --data-dir: Base directory holding one subdirectory per account. A
+            relative value is resolved against the project root (the parent
+            of this script's directory). Defaults to ``<root>/data/pdf``.
+        --output-dir: Directory for the CSV files. Defaults to
+            ``<root>/output/csv``; created when missing.
+
+    Prints a banner, the discovered directories, and a final summary of how
+    many accounts were processed successfully.
+    """
+    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
+    parser.add_argument('--data-dir',
+                       help='Base directory containing PDF files (default: auto-discovered)')
+    parser.add_argument('--output-dir', default=None,
+                       help='Directory to save CSV output files')
+
+    args = parser.parse_args()
+
+    # Get paths
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    project_root = os.path.dirname(script_dir)
+
+    # Determine data directory
+    if args.data_dir:
+        data_dir = args.data_dir
+        if not os.path.isabs(data_dir):
+            data_dir = os.path.join(project_root, data_dir)
+    else:
+        data_dir = os.path.join(project_root, 'data/pdf')
+
+    # Set output directory
+    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
+    os.makedirs(output_dir, exist_ok=True)
+
+    print(f"\n{'='*60}")
+    print("Dynamic Financial Statement Processor")
+    print(f"Data Directory: {os.path.abspath(data_dir)}")
+    print(f"Output Directory: {os.path.abspath(output_dir)}")
+    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"{'='*60}")
+
+    # Discover all PDF directories
+    pdf_dirs = discover_pdf_directories(data_dir)
+
+    if not pdf_dirs:
+        print("No directories with PDF files found!")
+        return
+
+    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
+    for account_type, info in pdf_dirs.items():
+        print(f"  - {account_type}: {info['count']} files in {info['path']}")
+
+    # Define processing scripts for each account type
+    script_map = {
+        'Boursobank': 'process_bourso.py',
+        'American Express': 'process_amex.py',
+        'Monabanq': 'process_monabanq.py',
+        'SNCF': 'process_sncf_improved.py',
+        'La Poste': 'process_laposte_improved.py',
+        'Revolut': 'process_expenses.py',  # Special case: uses CSV input
+        'Impôts': None  # No processing script for tax documents yet
+    }
+
+    # Process each account type
+    success_count = 0
+
+    for account_type, info in pdf_dirs.items():
+        # .get() covers both an unknown account type and a known type mapped
+        # to None (Impôts); a None script must never reach the runner.
+        script_name = script_map.get(account_type)
+        if script_name is None:
+            print(f"\nWarning: No processing script available for {account_type}")
+            continue
+
+        # For Revolut, use CSV directory instead of PDF directory
+        # NOTE(review): the runner always passes --pdf-dir, while the previous
+        # exporter called process_expenses.py with --csv-dir — confirm the
+        # script accepts --pdf-dir.
+        process_dir = info['path']
+        if account_type == 'Revolut':
+            process_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')  # CSV files are in raw_csv
+
+        if process_dir and not os.path.exists(process_dir):
+            print(f"Warning: Directory not found: {process_dir}")
+            continue
+
+        # Absolute script path so the run does not depend on the caller's
+        # current working directory.
+        success = process_dynamic_pdf_files(
+            os.path.join(script_dir, script_name),
+            process_dir,
+            output_dir
+        )
+
+        if success:
+            success_count += 1
+
+    print(f"\n{'='*60}")
+    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
+    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
+    print(f"{'='*60}")
+
+# Script entry point: run the discovery/processing pipeline only when this
+# file is executed directly, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/scripts/export_all_csv.py
+++ b/scripts/export_all_csv.py
@@ -1,132 +1,61 @@
 #!/usr/bin/env python3
 """
-Script to output CSV files for all account statements
+Dynamic script to auto-discover and process all financial statements
 """

 import os
 import subprocess
 import sys
-import argparse
-from datetime import datetime
-
-def run_script(script_path, pdf_dir, output_dir, use_csv_dir=False):
-    """Run a processing script with the specified parameters"""
-    if use_csv_dir:  # For Revolut which uses CSV input
-        cmd = [sys.executable, script_path, '--csv-dir', pdf_dir, '--output-dir', output_dir, '--csv']
-    else:
-        cmd = [sys.executable, script_path, '--pdf-dir', pdf_dir, '--output-dir', output_dir, '--csv']
-    
-    print(f"\n{'='*60}")
-    print(f"Processing {script_path.replace('../scripts/', '').replace('.py', '').replace('_', ' ').title()} statements...")
-    print('='*60)
-    
-    try:
-        result = subprocess.run(cmd, check=True)
-        return True
-    except subprocess.CalledProcessError as e:
-        print(f"Error running {script_path}: {e}")
-        return False

 def main():
-    # Get absolute paths
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    project_root = os.path.dirname(script_dir)
+    """
+    Main function to dynamically discover and process all financial statements
+    """
+    import argparse
    
-    parser = argparse.ArgumentParser(description='Process all account statements and output CSV files')
-    parser.add_argument('--output-dir', default=os.path.join(project_root, 'output/csv'), 
+    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
+    parser.add_argument('--output-dir', default=None, 
                       help='Directory to save CSV output files')
    
    args = parser.parse_args()
    
-    # Create output directory if it doesn't exist
-    os.makedirs(args.output_dir, exist_ok=True)
-    
-    print(f"\n{'='*60}")
-    print(f"All Account Statements CSV Export")
-    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    print(f"Output Directory: {os.path.abspath(args.output_dir)}")
-    print(f"{'='*60}")
-    
-    # Get absolute paths
+    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    data_dir = os.path.join(project_root, 'data/pdf')
-    raw_csv_dir = os.path.join(project_root, 'data/raw_csv')
    
-    # Define account types and their corresponding directories and scripts
-    accounts = [
-        {
-            'name': 'Boursobank',
-            'script': os.path.join(script_dir, 'process_bourso.py'),
-            'data_dir': os.path.join(data_dir, 'boursobank'),
-            'use_csv_dir': False
-        },
-        {
-            'name': 'American Express',
-            'script': os.path.join(script_dir, 'process_amex.py'),
-            'data_dir': os.path.join(data_dir, 'american_express'),
-            'use_csv_dir': False
-        },
-        {
-            'name': 'Monabanq',
-            'script': os.path.join(script_dir, 'process_monabanq.py'),
-            'data_dir': os.path.join(data_dir, 'monabanq'),
-            'use_csv_dir': False
-        },
-        {
-            'name': 'Revolut',
-            'script': os.path.join(script_dir, 'process_expenses.py'),
-            'data_dir': raw_csv_dir,  # Revolut uses CSV input
-            'use_csv_dir': True
-        },
-        {
-            'name': 'SNCF',
-            'script': os.path.join(script_dir, 'process_sncf.py'),
-            'data_dir': os.path.join(data_dir, '1-sncf'),
-            'use_csv_dir': False
-        },
-        {
-            'name': 'La Poste',
-            'script': os.path.join(script_dir, 'process_laposte.py'),
-            'data_dir': os.path.join(data_dir, '2-la.poste'),
-            'use_csv_dir': False
-        }
-    ]
+    # Set output directory
+    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
+    os.makedirs(output_dir, exist_ok=True)
    
-    # Process each account
-    success_count = 0
-    total_accounts = len(accounts)
-    
-    for account in accounts:
-        # Check if directory exists and has files
-        if not os.path.exists(account['data_dir']):
-            print(f"\nWarning: Directory not found for {account['name']}: {account['data_dir']}")
-            continue
-            
-        # Skip if directory is empty
-        if not os.listdir(account['data_dir']):
-            print(f"\nSkipping {account['name']}: No files found in {account['data_dir']}")
-            continue
-        
-        # Run the processing script with appropriate parameter name
-        if run_script(account['script'], account['data_dir'], args.output_dir, account['use_csv_dir']):
-            success_count += 1
-    
-    # Print summary
    print(f"\n{'='*60}")
-    print(f"Processing Complete: {success_count}/{total_accounts} accounts processed successfully")
-    print(f"CSV files have been saved to: {os.path.abspath(args.output_dir)}")
-    print(f"{'='*60}")
+    print(f"Dynamic Financial Statement Processor")
+    print(f"Data Directory: {os.path.abspath(data_dir)}")
+    print(f"Output Directory: {os.path.abspath(output_dir)}")
    
-    # List generated CSV files
-    if os.path.exists(args.output_dir):
-        csv_files = [f for f in os.listdir(args.output_dir) if f.endswith('.csv')]
-        if csv_files:
-            print(f"\nGenerated CSV Files:")
-            for file in sorted(csv_files):
-                file_path = os.path.join(args.output_dir, file)
-                file_size = os.path.getsize(file_path)
-                print(f"  - {file} ({file_size:,} bytes)")
+    # Build command
+    cmd = [sys.executable, os.path.join(script_dir, 'dynamic_processor.py'), 
+           '--data-dir', data_dir, '--output-dir', output_dir]
+    
+    # Run the dynamic processor
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True)
+        print(f"\nDiscovery Results:")
+        print(result.stdout)
+        
+        if result.returncode == 0:
+            print(f"\n{'='*60}")
+            print(f"Dynamic Processing Complete: CSV files saved to {os.path.abspath(output_dir)}")
+        else:
+            print(f"\nError during dynamic processing: exit code {result.returncode}")
+    except subprocess.CalledProcessError as e:
+        print(f"\nError running dynamic processor: {e}")

 if __name__ == "__main__":
+    from datetime import datetime
+    
+    # Add date to print
+    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"{'='*60}")
+    
    main()
--- a/scripts/export_all_csv_v2.py
+++ b/scripts/export_all_csv_v2.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""
+Dynamic script to auto-discover and process all financial statements
+"""
+
+import os
+import subprocess
+import sys
+from datetime import datetime
+
+def main():
+    """
+    Main function to dynamically discover and process all financial statements.
+
+    Parses ``--output-dir`` from the command line, then runs
+    ``dynamic_processor.py`` (expected next to this file) on
+    ``<project root>/data/pdf`` and echoes what the child process printed.
+    On a non-zero exit status the exit code and the child's error output are
+    reported.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
+    parser.add_argument('--output-dir', default=None,
+                       help='Directory to save CSV output files')
+
+    args = parser.parse_args()
+
+    # Get paths
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    project_root = os.path.dirname(script_dir)
+    data_dir = os.path.join(project_root, 'data/pdf')
+
+    # Set output directory
+    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
+    os.makedirs(output_dir, exist_ok=True)
+
+    print(f"\n{'='*60}")
+    print("Dynamic Financial Statement Processor")
+    print(f"Data Directory: {os.path.abspath(data_dir)}")
+    print(f"Output Directory: {os.path.abspath(output_dir)}")
+
+    # Build command
+    cmd = [sys.executable, os.path.join(script_dir, 'dynamic_processor.py'),
+           '--data-dir', data_dir, '--output-dir', output_dir]
+
+    # Run the dynamic processor. No check=True: the return code is inspected
+    # below so the child's output is shown even when it fails. text=True
+    # decodes the captured streams so they print as text, not bytes.
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    print("\nDiscovery Results:")
+    print(result.stdout)
+
+    if result.returncode == 0:
+        print(f"\n{'='*60}")
+        print(f"Dynamic Processing Complete: CSV files saved to {os.path.abspath(output_dir)}")
+    else:
+        print(f"\nError during dynamic processing: exit code {result.returncode}")
+        if result.stderr:
+            print(result.stderr)
+
+# Script entry point: delegate to main() only when this file is executed
+# directly, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/scripts/process_laposte_improved.py
+++ b/scripts/process_laposte_improved.py
@@ -0,0 +1,124 @@
+import subprocess
+import re
+import csv
+import os
+import glob
+from collections import defaultdict
+
+def categorize_laposte_transaction(description):
+    """Return the category label for a La Poste transaction description.
+
+    Matching is a case-insensitive substring search. Rules are tried in a
+    fixed order and the first one that matches decides the category, so a
+    description containing both 'virement' and 'carte' is a 'Transfer'.
+    Descriptions matching no rule are categorised as 'Other'.
+    """
+    text = description.lower()
+
+    # (keywords, category) pairs in priority order. 'cotisation' is claimed
+    # by 'Bank Fees' before the shorter 'cotis' can label it 'Deductions'.
+    rules = (
+        (('virement',), 'Transfer'),
+        (('retrait',), 'Cash Withdrawal'),
+        (('carte', 'paiement'), 'Card Payment'),
+        (('frais', 'cotisation'), 'Bank Fees'),
+        (('cotis',), 'Deductions'),
+        (('impot',), 'Tax'),
+        (('edf', 'bouygues', 'orange'), 'Utilities'),
+    )
+
+    for keywords, category in rules:
+        if any(word in text for word in keywords):
+            return category
+
+    return 'Other'
+
+def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
+    """Process La Poste account PDF files with improved transaction extraction.
+
+    Each ``*.pdf`` in ``directory`` is converted with ``pdftotext -layout``.
+    After a line containing 'Opérations' has been seen, every line that
+    starts with a ``dd/mm/yyyy`` date is split into columns on runs of two or
+    more whitespace characters: first column is the date, second the
+    description, and the first later column that parses as a number is the
+    amount. Lines containing 'Date', 'Total' or 'Page' are skipped.
+
+    Args:
+        directory: Directory containing the statement PDFs.
+        output_csv: When true and at least one transaction was found, write
+            ``laposte_all_transactions.csv`` into ``output_dir``.
+        output_dir: Destination directory for the CSV; created when needed.
+
+    Returns:
+        list of dicts with the keys 'Date', 'Description', 'Category',
+        'Amount' and 'Source'. Only rows with an amount greater than zero are
+        kept. Files that ``pdftotext`` cannot convert are reported and skipped.
+    """
+    # Sorted so the CSV row order is reproducible across runs.
+    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
+    all_transactions = []
+
+    for pdf_file in pdf_files:
+        try:
+            # Convert PDF to text
+            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
+                                    capture_output=True, text=True, check=True)
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            print(f"Error processing {pdf_file}: {e}")
+            continue
+
+        source = os.path.basename(pdf_file)
+        in_transaction_section = False
+
+        for line in result.stdout.split('\n'):
+            # Look for the transaction table section
+            if 'Opérations' in line:
+                in_transaction_section = True
+                continue
+
+            # Skip headers and footers
+            if not in_transaction_section or 'Date' in line or 'Total' in line or 'Page' in line:
+                continue
+
+            # Transaction lines start with a date, possibly after indentation
+            if not re.match(r'\s*\d{2}/\d{2}/\d{4}', line):
+                continue
+
+            # Strip first: an indented line would otherwise yield an empty
+            # first column and shift the date into the description slot.
+            parts = re.split(r'\s{2,}', line.strip())
+            if len(parts) < 3:
+                continue
+
+            date = parts[0].strip()
+            description = parts[1].strip()
+
+            # Extract amount (look for numeric values with ¤ or €)
+            amount = 0
+            for part in parts[2:]:
+                part = part.strip().replace('¤', '').replace('€', '')
+                if not re.match(r'[\d.,]+', part):
+                    continue
+                # Remove every whitespace character (including non-breaking
+                # thousands separators) before converting the decimal comma.
+                amount_str = re.sub(r'\s', '', part).replace(',', '.')
+                try:
+                    amount = float(amount_str)
+                    break
+                except ValueError:
+                    continue
+
+            # Only add if amount is valid
+            if amount > 0:
+                all_transactions.append({
+                    'Date': date,
+                    'Description': description,
+                    'Category': categorize_laposte_transaction(description),
+                    'Amount': amount,
+                    'Source': source
+                })
+
+    # Output CSV if requested
+    if output_csv and all_transactions:
+        csv_file = os.path.join(output_dir, 'laposte_all_transactions.csv')
+        os.makedirs(output_dir, exist_ok=True)
+        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
+            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(all_transactions)
+        print(f"\nTransaction data saved to {csv_file}")
+
+    print("--- La Poste Account Statements ---")
+    print(f"Found {len(pdf_files)} account statement files")
+    print(f"Processed {len(all_transactions)} transactions")
+
+    return all_transactions
+
+# Command-line entry point: parse the options and process one directory of
+# La Poste statements.
+if __name__ == "__main__":
+    import argparse
+    
+    parser = argparse.ArgumentParser(description='Process La Poste (CCP) account statements')
+    # NOTE(review): both defaults are relative paths, so they depend on the
+    # current working directory; the PDF default goes up one level while the
+    # output default goes up two — confirm which root is intended.
+    parser.add_argument('--pdf-dir', default='../data/pdf/la_poste', 
+                       help='Directory containing La Poste PDF files')
+    parser.add_argument('--output-dir', default='../../output/csv', 
+                       help='Directory to save CSV output files')
+    # Without --csv the transactions are only counted and summarised on
+    # stdout; no CSV file is written.
+    parser.add_argument('--csv', action='store_true', 
+                       help='Output transaction data to CSV files')
+    args = parser.parse_args()
+    
+    # Process all PDF files in the directory
+    process_laposte_pdf_files(args.pdf_dir, args.csv, args.output_dir)
--- a/scripts/process_sncf_enhanced.py
+++ b/scripts/process_sncf_enhanced.py
@@ -0,0 +1,173 @@
+import subprocess
+import re
+import csv
+import os
+import glob
+from collections import defaultdict
+
+def _find_french_amounts(line):
+    """Return every French-formatted number in *line* as a float, in order."""
+    # Either 1-3 digits followed by whitespace-separated groups of exactly
+    # three digits ("3 578"), or a plain digit run ("3578"); then an optional
+    # decimal-comma part (",49"). Whitespace-only runs never match.
+    pattern = r'(?:\d{1,3}(?:\s\d{3})+(?!\d)|\d+)(?:,\d+)?'
+    return [
+        float(re.sub(r'\s', '', token).replace(',', '.'))
+        for token in re.findall(pattern, line)
+    ]
+
+
+def extract_sncf_salary_data(content, filename):
+    """
+    Extract salary data from SNCF PDF content with focus on NET PAYÉ EN EUROS.
+
+    The pay period comes from ``filename``: the first French month name found
+    in it (case-insensitive) and the first four-digit ``20xx`` year. The
+    amounts come from ``content``, searched line by line.
+
+    Args:
+        content: Text of the payslip (one string, lines separated by newlines).
+        filename: Payslip file name, e.g. ``'salaire de AOUT 2025.pdf'``.
+
+    Returns:
+        dict with the keys 'month' (English month name, or 'Unknown' when the
+        file name contains no month), 'year' (int, 2025 when the file name
+        contains no year), 'brut_mensuel', 'net_imposable', 'net_paye_euros',
+        'cumul_annuel' (floats, 0.0 when not found) and 'mode_paiement'.
+    """
+    # Extract month from filename
+    months = {
+        'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
+        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
+        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
+    }
+    month_names = [
+        '', 'January', 'February', 'March', 'April', 'May', 'June',
+        'July', 'August', 'September', 'October', 'November', 'December'
+    ]
+
+    # Capture the full four-digit year so parsed and default years share one
+    # format. Parsed independently of the month so it is always bound.
+    year_match = re.search(r'(?<!\d)(20\d{2})(?!\d)', filename)
+    year = int(year_match.group(1)) if year_match else 2025
+
+    # File names without a month (e.g. a bonus payslip) keep 'Unknown'.
+    month_name = 'Unknown'
+    filename_upper = filename.upper()
+    for month, num in months.items():
+        if month in filename_upper:
+            # Index with the month number, not the French name.
+            month_name = month_names[num]
+            break
+
+    # Initialize salary data
+    salary_data = {
+        'month': month_name,
+        'year': year,
+        'brut_mensuel': 0.0,
+        'net_imposable': 0.0,
+        'net_paye_euros': 0.0,
+        'cumul_annuel': 0.0,
+        'mode_paiement': ''
+    }
+
+    lines = content.split('\n')
+
+    # Look for the salary table with NET PAYÉ EN EUROS
+    for line in lines:
+        if 'NET PAYÉ EN EUROS' in line and 'BRUT' in line:
+            values = _find_french_amounts(line)
+            if len(values) >= 4:
+                # Column order assumed by this parser: gross, taxable net,
+                # yearly cumulative, net paid — TODO confirm against a real
+                # payslip layout.
+                salary_data.update({
+                    'brut_mensuel': values[0],
+                    'net_imposable': values[1],
+                    'cumul_annuel': values[2],
+                    'net_paye_euros': values[3],
+                    'mode_paiement': 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'
+                })
+                break
+
+    # Also look for alternative format if not found
+    if salary_data['brut_mensuel'] == 0.0:
+        for line in lines:
+            if 'BRUT MENSUEL' in line:
+                amounts = _find_french_amounts(line)
+                if amounts:
+                    # Only the gross amount is actually present on this line.
+                    # The net and cumulative figures stay at 0.0 rather than
+                    # being estimated from gross, which would put invented
+                    # amounts into the exported data.
+                    salary_data['brut_mensuel'] = amounts[0]
+                    salary_data['mode_paiement'] = 'virement SEPA'
+                    break
+
+    return salary_data
+
+def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
+    """Turn every SNCF payslip PDF in ``directory`` into one salary record.
+
+    Each PDF is converted with ``pdftotext -layout`` and handed to
+    ``extract_sncf_salary_data``; the net paid amount becomes the record's
+    'Amount'. When ``output_csv`` is true and at least one record exists,
+    the records are written to ``sncf_all_transactions.csv`` in
+    ``output_dir``. A short summary is printed in all cases.
+
+    Returns the list of record dicts (keys: 'Date', 'Description',
+    'Category', 'Amount', 'Source', 'Brut Mensuel', 'Net Imposable',
+    'Cumul Annuel'). PDFs that ``pdftotext`` cannot convert are reported
+    and skipped.
+    """
+    columns = ['Date', 'Description', 'Category', 'Amount', 'Source',
+               'Brut Mensuel', 'Net Imposable', 'Cumul Annuel']
+    pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
+    all_transactions = []
+
+    for pdf_path in pdf_files:
+        pdf_name = os.path.basename(pdf_path)
+        try:
+            # Text extraction; a failing or missing pdftotext skips the file.
+            converted = subprocess.run(
+                ['pdftotext', '-layout', pdf_path, '-'],
+                capture_output=True, text=True, check=True,
+            )
+            payslip = extract_sncf_salary_data(converted.stdout, pdf_name)
+            period_month = payslip['month']
+            period_year = payslip['year']
+
+            record = dict.fromkeys(columns)
+            record['Date'] = f"01/{period_month}/{period_year}"
+            record['Description'] = f"Salaire {period_month} {period_year}"
+            record['Category'] = 'Salary'
+            record['Amount'] = payslip['net_paye_euros']
+            record['Source'] = pdf_name
+            record['Brut Mensuel'] = payslip['brut_mensuel']
+            record['Net Imposable'] = payslip['net_imposable']
+            record['Cumul Annuel'] = payslip['cumul_annuel']
+            all_transactions.append(record)
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            print(f"Error processing {pdf_path}: {e}")
+
+    # CSV export only happens on request and only when there is data.
+    if output_csv and all_transactions:
+        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
+        os.makedirs(output_dir, exist_ok=True)
+
+        with open(csv_file, 'w', newline='', encoding='utf-8') as handle:
+            writer = csv.DictWriter(handle, fieldnames=columns)
+            writer.writeheader()
+            writer.writerows(all_transactions)
+
+        print(f"\nTransaction data saved to {csv_file}")
+
+    print("--- SNCF Salary Statements ---")
+    print(f"Found {len(pdf_files)} salary statement files")
+
+    # Totals over all payslips; printed only when some gross amount was read.
+    total_brut = 0
+    total_net = 0
+    for record in all_transactions:
+        total_brut += record['Brut Mensuel']
+        total_net += record['Net Imposable']
+
+    if total_brut > 0:
+        print(f"Total Brut Mensuel: €{total_brut:,.2f}")
+        print(f"Total Net Imposable: €{total_net:,.2f}")
+
+    return all_transactions
+
+# Command-line entry point: parse the options and process one directory of
+# SNCF payslips.
+if __name__ == "__main__":
+    import argparse
+    
+    parser = argparse.ArgumentParser(description='Process SNCF salary statements with enhanced NET PAYÉ extraction')
+    # NOTE(review): both defaults are relative paths, so they depend on the
+    # current working directory; the PDF default goes up one level while the
+    # output default goes up two — confirm which root is intended.
+    parser.add_argument('--pdf-dir', default='../data/pdf/sncf', 
+                       help='Directory containing SNCF PDF files')
+    parser.add_argument('--output-dir', default='../../output/csv', 
+                       help='Directory to save CSV output files')
+    # Without --csv only the summary is printed; no CSV file is written.
+    parser.add_argument('--csv', action='store_true', 
+                       help='Output transaction data to CSV files')
+    
+    args = parser.parse_args()
+    
+    # Process all PDF files in the directory
+    process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)
--- a/scripts/process_sncf_improved.py
+++ b/scripts/process_sncf_improved.py
@@ -0,0 +1,136 @@
+import subprocess
+import re
+import csv
+import os
+import glob
+from collections import defaultdict
+
+def extract_month_from_filename(filename):
+    """Return ``(year, month)`` parsed from an SNCF payslip filename.
+
+    The month is the first entry of the January-to-December list of French
+    month names (upper-case, unaccented) contained in the name; the year is
+    the first ``20xx`` digit run, or 2025 if absent. No month -> ``(2025, 1)``.
+    """
+    months = (
+        'JANVIER', 'FEVRIER', 'MARS', 'AVRIL', 'MAI', 'JUIN', 'JUILLET',
+        'AOUT', 'SEPTEMBRE', 'OCTOBRE', 'NOVEMBRE', 'DECEMBRE',
+    )
+    # Keep all four digits: capturing only the last two returned 25, not 2025.
+    year_match = re.search(r'20\d{2}', filename)
+    year = int(year_match.group(0)) if year_match else 2025
+    upper_name = filename.upper()
+    hits = [num for num, name in enumerate(months, 1) if name in upper_name]
+    return (year, hits[0]) if hits else (2025, 1)
+
+def _parse_amount(text):
+    """Convert a French-formatted amount ('3 123,36') to float; None if invalid."""
+    compact = ''.join(text.split())  # also drops NBSP / narrow-NBSP separators
+    if ',' in compact:  # decimal comma present, so dots are thousands marks
+        compact = compact.replace('.', '').replace(',', '.')
+    try:
+        return float(compact)
+    except ValueError:
+        return None
+
+def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
+    """Extract the gross monthly salary from every SNCF payslip PDF in a folder.
+
+    Each ``*.pdf`` in ``directory`` is converted with ``pdftotext -layout`` and
+    the amount is the first non-zero figure found, trying in order: the number
+    after ``SALAIRE BRUT MENSUEL``; a number followed by ``€`` on a line with
+    ``SALAIRE`` and ``BRUT``; a dotted token on a hard-coded base-salary row.
+
+    Args:
+        directory: Folder scanned (non-recursively) for ``*.pdf`` files.
+        output_csv: When true, write the rows to ``sncf_all_transactions.csv``.
+        output_dir: Destination folder for the CSV; created if missing.
+
+    Returns:
+        One dict per converted PDF with the keys ``Date``, ``Description``,
+        ``Category``, ``Amount`` (0.0 when nothing matched) and ``Source``.
+        Files that ``pdftotext`` cannot convert are reported and skipped.
+    """
+    # One amount only: an optional single-character thousands separator and an
+    # optional decimal part, so neighbouring layout columns are never merged.
+    amount = r'(\d+(?:[ \u00a0\u202f.]\d{3})*(?:[.,]\d+)?)'
+    label_re = re.compile(r'SALAIRE BRUT MENSUEL\s+' + amount)
+    euro_re = re.compile(amount + r'\s*€')
+    month_names = ['', 'January', 'February', 'March', 'April', 'May', 'June',
+                   'July', 'August', 'September', 'October', 'November',
+                   'December']
+    # Sorted so rows and totals do not depend on filesystem listing order.
+    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
+    all_transactions = []
+
+    for pdf_file in pdf_files:
+        try:
+            # pdftotext emits UTF-8 by default; decode it explicitly so the
+            # euro sign survives whatever the locale encoding is.
+            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'], check=True,
+                                    capture_output=True, encoding='utf-8', errors='replace')
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            print(f"Error processing {pdf_file}: {e}")
+            continue
+
+        lines = result.stdout.split('\n')
+        year, month = extract_month_from_filename(os.path.basename(pdf_file))
+        month_name = month_names[month]
+
+        # Candidates in priority order: labelled lines, then the euro fallback.
+        matches = [label_re.search(line) for line in lines]
+        matches += [euro_re.search(line) for line in lines
+                    if 'SALAIRE' in line and 'BRUT' in line]
+        parsed = (_parse_amount(m.group(1)) for m in matches if m)
+        salary_amount = next((value for value in parsed if value), 0.0)
+
+        # NOTE(review): last-resort scan of table rows that start with a
+        # hard-coded base salary. It only accepts dot-decimal tokens although
+        # the row markers use a decimal comma — confirm against a real payslip.
+        if salary_amount == 0.0:
+            for line in lines:
+                if line.strip().startswith(('2974,64', '3123,36')):
+                    dotted = (_parse_amount(p) for p in line.split()
+                              if '.' in p and ',' not in p and len(p) > 3)
+                    salary_amount = next((v for v in dotted if v is not None), salary_amount)
+
+        all_transactions.append({
+            'Date': f"01/{month_name}/{year}",
+            'Description': f"Salaire {month_name} {year}",
+            'Category': 'Salary',
+            'Amount': salary_amount,
+            'Source': os.path.basename(pdf_file)
+        })
+
+    if output_csv and all_transactions:
+        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
+        os.makedirs(output_dir, exist_ok=True)
+        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
+            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(all_transactions)
+        print(f"\nTransaction data saved to {csv_file}")
+
+    print("--- SNCF Salary Statements ---")
+    print(f"Found {len(pdf_files)} salary statement files")
+    total_salary = sum(t['Amount'] for t in all_transactions)
+    if total_salary > 0:
+        print(f"Total Salary Extracted: €{total_salary:,.2f}")
+
+    return all_transactions
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options, then run the extraction.
+    import argparse
+
+    cli = argparse.ArgumentParser(description='Process SNCF salary statements')
+    # Relative defaults are resolved against the current working directory.
+    cli.add_argument('--pdf-dir', default='../data/pdf/sncf',
+                     help='Directory containing SNCF PDF files')
+    cli.add_argument('--output-dir', default='../../output/csv',
+                     help='Directory to save CSV output files')
+    cli.add_argument('--csv', action='store_true',
+                     help='Output transaction data to CSV files')
+    options = cli.parse_args()
+    process_sncf_pdf_files(options.pdf_dir, output_csv=options.csv, output_dir=options.output_dir)