Add scripts to export all CSV files and aggregate by month

2026-02-09 10:35:45 +01:00
parent 482a199908
commit 7e58c68197
50 changed files with 864 additions and 1302 deletions
--- a/scripts/aggregate_by_month.py
+++ b/scripts/aggregate_by_month.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""
+Script to aggregate all account statements by month
+"""
+
+import os
+import csv
+import sys
+import argparse
+import re
+from datetime import datetime
+from collections import defaultdict
+import calendar
+
+def parse_date(date_str, source_file):
+    """Return ``(year, month, day)`` for a statement row.
+
+    ``date_str`` is tried as DD/MM/YYYY, then MM/DD/YYYY, then YYYY-MM-DD, so
+    an ambiguous value such as 01/02/2024 always resolves day-first.  When no
+    format matches and ``source_file`` contains "salaire", the French month
+    name (and a 20xx year, if any) found in that name is used with day 1.
+    As a last resort the first day of the current month is returned.
+    """
+    for fmt in ('%d/%m/%Y', '%m/%d/%Y', '%Y-%m-%d'):
+        try:
+            dt = datetime.strptime(date_str.strip(), fmt)
+        except ValueError:
+            continue
+        return (dt.year, dt.month, dt.day)
+
+    today = datetime.now()  # single snapshot so year and month cannot disagree
+    source_lower = (source_file or '').lower()
+    if 'salaire' in source_lower:
+        months = ['janvier', 'fevrier', 'mars', 'avril', 'mai', 'juin',
+                  'juillet', 'aout', 'septembre', 'octobre', 'novembre', 'decembre']
+        for i, month in enumerate(months, 1):
+            if month in source_lower:
+                # Capture all four digits so "2024" yields 2024; keeping only
+                # the last two digits would return the year 24.
+                year_match = re.search(r'20\d{2}', source_lower)
+                year = int(year_match.group(0)) if year_match else today.year
+                return (year, i, 1)
+
+    return (today.year, today.month, 1)
+
+def categorize_institution(source_file):
+    """
+    Map a source filename to the name of the institution that issued it.
+
+    Matching is a case-insensitive substring test; rules are tried in order
+    and the first hit wins.  Unrecognised names yield 'Other'.
+    """
+    name = source_file.lower()
+    rules = (
+        ('Boursobank', ('boursobank', 'releve-compte')),
+        ('American Express', ('american_express', 'amex')),
+        ('Monabanq', ('monabanq', 'extrait de comptes')),
+        ('Revolut', ('revolut',)),
+        ('SNCF', ('sncf', 'salaire')),
+        ('La Poste', ('la_poste', '2-la.poste', 'releve_ccp')),
+    )
+    for institution, markers in rules:
+        if any(marker in name for marker in markers):
+            return institution
+    return 'Other'
+
+def process_csv_file(file_path):
+    """
+    Read one exported statement CSV and return its rows as transaction dicts.
+
+    Rows without a 'Date' value are skipped.  The amount comes from the first
+    non-empty of the 'Amount', 'Debit' and 'Credit' columns; a decimal comma
+    and space-like thousands separators are accepted, and a value that still
+    cannot be parsed is recorded as 0.
+    """
+    transactions = []
+    file_name = os.path.basename(file_path)
+    institution = categorize_institution(file_name)
+
+    # utf-8-sig drops a leading BOM that would otherwise hide the 'Date'
+    # header; newline='' is what the csv module asks for on input files.
+    with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
+        for row in csv.DictReader(f):
+            date_str = (row.get('Date') or '').strip()
+            if not date_str:
+                continue
+            # One source value feeds both parse_date and the record, so the
+            # filename-based fallback also works without a 'Source' column.
+            source = row.get('Source') or file_name
+            year, month, day = parse_date(date_str, source)
+
+            amount_str = row.get('Amount') or row.get('Debit') or row.get('Credit', '0') or ''
+            for separator in (' ', '\u00a0', '\u202f'):
+                amount_str = amount_str.replace(separator, '')
+            try:
+                amount = float(amount_str.replace(',', '.')) if amount_str else 0
+            except ValueError:
+                amount = 0
+            transactions.append({
+                'year': year, 'month': month, 'day': day, 'date_str': date_str,
+                'description': row.get('Description', ''),
+                'category': row.get('Category', 'Other'),
+                'amount': amount, 'institution': institution, 'source': source,
+            })
+
+    return transactions
+
+def main():
+    """
+    Aggregate every statement CSV in --input-dir into monthly and yearly reports.
+
+    Writes monthly_summary.csv, yearly_summary.csv and one
+    transactions_<year>_<month>.csv per month into --output-dir, then prints
+    the list of generated files.  Prints an error and returns early when the
+    input directory is missing or contains no .csv files.
+    """
+    parser = argparse.ArgumentParser(description='Aggregate all account statements by month')
+    parser.add_argument('--input-dir', default='output/csv',
+                        help='Directory containing CSV files to aggregate (default: output/csv)')
+    parser.add_argument('--output-dir', default='output/reports',
+                        help='Directory to save aggregated reports (default: output/reports)')
+    args = parser.parse_args()
+
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    banner = '=' * 60
+    print(f"\n{banner}")
+    print("Monthly Aggregation of All Account Statements")
+    print(f"Input Directory: {os.path.abspath(args.input_dir)}")
+    print(f"Output Directory: {os.path.abspath(args.output_dir)}")
+    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(banner)
+
+    # os.listdir raises for a missing directory; report it like the
+    # "no CSV files" case instead of ending in a traceback.
+    if not os.path.isdir(args.input_dir):
+        print(f"\nError: Input directory not found: {args.input_dir}")
+        return
+
+    # Sorted because os.listdir returns names in arbitrary order; this keeps
+    # the processing log and the row order of the reports reproducible.
+    csv_files = sorted(f for f in os.listdir(args.input_dir) if f.endswith('.csv'))
+    if not csv_files:
+        print(f"\nError: No CSV files found in {args.input_dir}")
+        return
+
+    # Collect the transactions of every CSV file
+    all_transactions = []
+    for csv_file in csv_files:
+        file_path = os.path.join(args.input_dir, csv_file)
+        print(f"\nProcessing: {csv_file}")
+        transactions = process_csv_file(file_path)
+        all_transactions.extend(transactions)
+        print(f"  Found {len(transactions)} transactions")
+
+    # Group transactions by (year, month)
+    monthly_transactions = defaultdict(list)
+    for transaction in all_transactions:
+        key = (transaction['year'], transaction['month'])
+        monthly_transactions[key].append(transaction)
+    month_keys = sorted(monthly_transactions)
+
+    # Create monthly summary report
+    summary_file = os.path.join(args.output_dir, 'monthly_summary.csv')
+    with open(summary_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow([
+            'Year', 'Month', 'Total Income', 'Total Expenses', 'Net Balance',
+            'Transaction Count', 'Institutions'
+        ])
+
+        for year, month in month_keys:
+            transactions = monthly_transactions[(year, month)]
+
+            # NOTE(review): negative = income follows the Revolut export;
+            # confirm the other institutions use the same sign convention.
+            total_income = sum(t['amount'] for t in transactions if t['amount'] < 0)
+            total_expenses = sum(t['amount'] for t in transactions if t['amount'] > 0)
+            net_balance = total_income + total_expenses
+
+            institutions = sorted({t['institution'] for t in transactions})
+
+            writer.writerow([
+                year, calendar.month_name[month], total_income, total_expenses,
+                net_balance, len(transactions), ', '.join(institutions)
+            ])
+
+    # Create one detailed transactions file per month
+    detail_names = []
+    for year, month in month_keys:
+        detail_name = f'transactions_{year}_{calendar.month_name[month].lower()}.csv'
+        detail_names.append(detail_name)
+
+        with open(os.path.join(args.output_dir, detail_name), 'w', newline='', encoding='utf-8') as f:
+            writer = csv.DictWriter(f, fieldnames=[
+                'Date', 'Description', 'Category', 'Amount',
+                'Institution', 'Source'
+            ])
+            writer.writeheader()
+
+            # Within a month, order by day and then by description
+            ordered = sorted(monthly_transactions[(year, month)],
+                             key=lambda x: (x['day'], x['description']))
+            for transaction in ordered:
+                writer.writerow({
+                    'Date': transaction['date_str'],
+                    'Description': transaction['description'],
+                    'Category': transaction['category'],
+                    'Amount': transaction['amount'],
+                    'Institution': transaction['institution'],
+                    'Source': transaction['source']
+                })
+
+    # Accumulate yearly totals; zero amounts land in 'expenses' and add nothing
+    yearly_summary = defaultdict(lambda: {'income': 0, 'expenses': 0, 'count': 0})
+    for transaction in all_transactions:
+        totals = yearly_summary[transaction['year']]
+        totals['count'] += 1
+        bucket = 'income' if transaction['amount'] < 0 else 'expenses'
+        totals[bucket] += transaction['amount']
+
+    # Create yearly summary file
+    yearly_file = os.path.join(args.output_dir, 'yearly_summary.csv')
+    with open(yearly_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow(['Year', 'Total Income', 'Total Expenses', 'Net Balance', 'Transaction Count'])
+
+        for year in sorted(yearly_summary):
+            data = yearly_summary[year]
+            net_balance = data['income'] + data['expenses']
+            writer.writerow([
+                year, data['income'], data['expenses'], net_balance, data['count']
+            ])
+
+    # Print summary statistics
+    print(f"\n{banner}")
+    print("Aggregation Complete")
+    print(f"Total Transactions: {len(all_transactions)}")
+    print(f"Months with Data: {len(monthly_transactions)}")
+    print(banner)
+
+    # List generated files with their sizes
+    generated_files = [
+        os.path.basename(summary_file),
+        os.path.basename(yearly_file)
+    ]
+    generated_files.extend(detail_names)
+
+    print("\nGenerated Files:")
+    for file in generated_files:
+        file_path = os.path.join(args.output_dir, file)
+        if os.path.exists(file_path):
+            file_size = os.path.getsize(file_path)
+            print(f"  - {file} ({file_size:,} bytes)")
+
+if __name__ == "__main__":
+    main()
--- a/scripts/export_all_csv.py
+++ b/scripts/export_all_csv.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+Script to output CSV files for all account statements
+"""
+
+import os
+import subprocess
+import sys
+import argparse
+from datetime import datetime
+
+def run_script(script_path, pdf_dir, output_dir, use_csv_dir=False):
+    """Run a processing script; return True when it exits with status 0.
+
+    ``pdf_dir`` is passed as --csv-dir when ``use_csv_dir`` is true, else --pdf-dir.
+    """
+    input_flag = '--csv-dir' if use_csv_dir else '--pdf-dir'
+    cmd = [sys.executable, script_path, input_flag, pdf_dir, '--output-dir', output_dir, '--csv']
+    # Build the label from the file name alone so absolute script paths read
+    # as e.g. "Process Bourso" rather than echoing the whole directory path.
+    label = os.path.splitext(os.path.basename(script_path))[0].replace('_', ' ').title()
+    print(f"\n{'=' * 60}\nProcessing {label} statements...\n{'=' * 60}")
+    try:
+        subprocess.run(cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Error running {script_path}: {e}")
+        return False
+    return True
+
+def main():
+    """
+    Export every account's statements to CSV by running its processing script.
+
+    Accounts whose data directory is missing or empty are reported and
+    skipped.  Afterwards a success/total summary is printed, followed by the
+    name and size of each .csv file present in the output directory.
+    """
+    # Anchor all paths on this file's location
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    project_root = os.path.dirname(script_dir)
+    data_dir = os.path.join(project_root, 'data/pdf')
+    raw_csv_dir = os.path.join(project_root, 'data/raw_csv')
+
+    parser = argparse.ArgumentParser(description='Process all account statements and output CSV files')
+    parser.add_argument('--output-dir', default=os.path.join(project_root, 'output/csv'),
+                        help='Directory to save CSV output files')
+    output_dir = parser.parse_args().output_dir
+
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    rule = '=' * 60
+    print(f"\n{rule}")
+    print("All Account Statements CSV Export")
+    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"Output Directory: {os.path.abspath(output_dir)}")
+    print(rule)
+
+    # One entry per account: display name, processing script, input
+    # directory, and whether that input is passed as --csv-dir.
+    accounts = [
+        dict(
+            name='Boursobank',
+            script=os.path.join(script_dir, 'process_bourso.py'),
+            data_dir=os.path.join(data_dir, 'boursobank'),
+            use_csv_dir=False,
+        ),
+        dict(
+            name='American Express',
+            script=os.path.join(script_dir, 'process_amex.py'),
+            data_dir=os.path.join(data_dir, 'american_express'),
+            use_csv_dir=False,
+        ),
+        dict(
+            name='Monabanq',
+            script=os.path.join(script_dir, 'process_monabanq.py'),
+            data_dir=os.path.join(data_dir, 'monabanq'),
+            use_csv_dir=False,
+        ),
+        dict(
+            name='Revolut',
+            script=os.path.join(script_dir, 'process_expenses.py'),
+            data_dir=raw_csv_dir,  # Revolut uses CSV input
+            use_csv_dir=True,
+        ),
+        dict(
+            name='SNCF',
+            script=os.path.join(script_dir, 'process_sncf.py'),
+            data_dir=os.path.join(data_dir, '1-sncf'),
+            use_csv_dir=False,
+        ),
+        dict(
+            name='La Poste',
+            script=os.path.join(script_dir, 'process_laposte.py'),
+            data_dir=os.path.join(data_dir, '2-la.poste'),
+            use_csv_dir=False,
+        ),
+    ]
+
+    # Process each account, counting the scripts that finished successfully
+    success_count = 0
+    for account in accounts:
+        name = account['name']
+        source_dir = account['data_dir']
+        if not os.path.exists(source_dir):
+            print(f"\nWarning: Directory not found for {name}: {source_dir}")
+            continue
+        if not os.listdir(source_dir):
+            print(f"\nSkipping {name}: No files found in {source_dir}")
+            continue
+
+        succeeded = run_script(account['script'], source_dir, output_dir, account['use_csv_dir'])
+        success_count += 1 if succeeded else 0
+
+    # Print summary
+    print(f"\n{rule}")
+    print(f"Processing Complete: {success_count}/{len(accounts)} accounts processed successfully")
+    print(f"CSV files have been saved to: {os.path.abspath(output_dir)}")
+    print(rule)
+
+    # List generated CSV files, if any
+    if not os.path.exists(output_dir):
+        return
+    csv_files = sorted(f for f in os.listdir(output_dir) if f.endswith('.csv'))
+    if csv_files:
+        print("\nGenerated CSV Files:")
+    for file in csv_files:
+        file_size = os.path.getsize(os.path.join(output_dir, file))
+        print(f"  - {file} ({file_size:,} bytes)")
+
+if __name__ == "__main__":
+    main()