Refactor SNCF processor and add Revolut aggregator

- Fix SNCF NET PAYÉ EN EUROS extraction to correctly parse the MENSUEL line
- Extract month/year from PDF content instead of the filename
- Add new Revolut CSV processor to aggregate account statements
- Organize Revolut data files into data/csv/revolut/
- Clean up redundant scripts and reports
2026-02-09 16:17:48 +01:00
parent ef23d066e0
commit eb66c7a43e
85 changed files with 3270 additions and 2106 deletions
--- a/finanancial_processor.py
+++ b/finanancial_processor.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""
+Main script to process all financial statements, writing one output file per account type.
+"""
+
+import os
+import sys
+import glob
+import subprocess
+import argparse
+from datetime import datetime
+
+# Import functionality from the dynamic processor
+# Add the directory to the path so we can import it
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from dynamic_processor import discover_pdf_directories, process_dynamic_pdf_files
+
+def main():
+    """
+    Main function with a clean, organized structure
+    """
+    parser = argparse.ArgumentParser(description='Process financial statements with one file per entity')
+    parser.add_argument('--data-dir', 
+                       help='Base directory containing PDF files (default: auto-discovered)')
+    parser.add_argument('--output-dir', default=None, 
+                       help='Directory to save CSV output files (default: auto-discovered)')
+    parser.add_argument('--csv', action='store_true', 
+                       help='Generate CSV output files')
+    parser.add_argument('--single', action='store_true', 
+                       help='Process only the current entity (for testing)')
+    
+    args = parser.parse_args()
+    
+    # Get paths
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    project_root = os.path.dirname(script_dir)
+    
+    # Determine data directory
+    if args.data_dir:
+        data_dir = args.data_dir
+        if not os.path.isabs(data_dir):
+            data_dir = os.path.join(project_root, data_dir)
+    else:
+        data_dir = os.path.join(project_root, 'data/pdf')
+    
+    # Set output directory
+    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
+    os.makedirs(output_dir, exist_ok=True)
+    
+    print(f"\n{'='*60}")
+    print(f"Financial Statement Processor")
+    print(f"Data Directory: {os.path.abspath(data_dir)}")
+    print(f"Output Directory: {os.path.abspath(output_dir)}")
+    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"{'='*60}")
+    
+    # Discover all PDF directories
+    pdf_dirs = discover_pdf_directories(data_dir)
+    
+    if not pdf_dirs:
+        print("No directories with PDF files found!")
+        return
+    
+    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
+    for account_type, info in pdf_dirs.items():
+        print(f"  - {account_type}: {info['count']} files in {info['path']}")
+    
+    # Process each account type to its own file
+    for account_type, info in pdf_dirs.items():
+        if account_type not in ['Boursobank', 'American Express', 'Monabanq', 'SNCF', 'La Poste']:
+            continue  # Skip unsupported types
+            
+        # Create a specialized processor for each account type
+        if account_type == 'Revolut':
+            # Special case for Revolut (CSV files)
+            process_revolut(data_dir, output_dir, args.csv)
+        else:
+            process_pdf_account(account_type, info, output_dir, args.csv, args.single)
+    
+    print(f"\n{'='*60}")
+    print(f"Processing Complete")
+
+def process_revolut(data_dir, output_dir, generate_csv, single_mode=False):
+    """Process Revolut CSV files"""
+    # Revolut CSV files are in raw_csv directory
+    csv_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')
+    csv_files = glob.glob(os.path.join(csv_dir, "*.csv"))
+    
+    if not csv_files:
+        print(f"No Revolut CSV files found in {csv_dir}")
+        return
+    
+    # Sort files by date
+    csv_files.sort()
+    
+    for csv_file in csv_files:
+        print(f"Processing Revolut CSV: {os.path.basename(csv_file)}")
+        
+        # Build the command
+        cmd = [
+            sys.executable,
+            os.path.join(os.path.dirname(os.path.abspath(__file__)), 
+            'process_expenses.py',
+            '--csv-dir', csv_dir,
+            '--output-dir', output_dir
+        ]
+        
+        if generate_csv:
+            cmd.append('--csv')
+        
+        if single_mode:
+            cmd.append('--single')
+        
+        try:
+            result = subprocess.run(cmd, check=True, capture_output=True)
+            if result.stdout:
+                print(result.stdout)
+        except subprocess.CalledProcessError as e:
+            print(f"Error processing {csv_file}: {e}")
+
+def process_pdf_account(account_type, info, output_dir, generate_csv, single_mode=False):
+    """Create and run a specialized processor for a PDF-based account"""
+    # Create a temporary processor script
+    processor_name = f"{account_type.lower().replace(' ', '_')}_processor.py"
+    
+    processor_content = f'''#!/usr/bin/env python3
+"""
+Temporary processor for {account_type}
+"""
+
+import os
+import sys
+import subprocess
+import glob
+import csv
+
+def main():
+    import argparse
+    
+    parser = argparse.ArgumentParser(description='Process {account_type} statements')
+    parser.add_argument('--pdf-dir')
+    parser.add_argument('--output-dir')
+    parser.add_argument('--csv')
+    args = parser.parse_args()
+    
+    cmd = [
+        sys.executable, os.path.join(os.path.dirname(os.path.abspath(__file__))), 
+        'process_{"account_type.lower().replace(' ', '_')}.py', 
+        '--pdf-dir', args.pdf_dir,
+        '--output-dir', args.output_dir
+    ]
+    
+    if args.csv:
+        cmd.append('--csv')
+    
+    subprocess.run(cmd, check=True)
+'''
+    
+    # Write the processor script
+    processor_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), processor_name)
+    with open(processor_path, 'w') as f:
+        f.write(processor_content)
+    
+    # Make it executable and run it
+    os.chmod(processor_path, 0o755)
+    
+    cmd = [sys.executable, processor_path, '--pdf-dir', info['path'], '--output-dir', output_dir]
+    
+    if generate_csv:
+        cmd.append('--csv')
+    
+    if single_mode:
+        cmd.append('--single')
+    
+    print(f"Running: {' '.join(cmd[2:])}")
+    
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True)
+        if result.stdout:
+                print(result.stdout)
+    except subprocess.CalledProcessError as e:
+        print(f"Error: {e}")
+    
+    # Clean up the temporary script
+    os.remove(processor_path)
+
+if __name__ == "__main__":
+    main()