Add scripts to export all CSV files and aggregate by month

This commit is contained in:
Kevin Bataille
2026-02-09 10:35:45 +01:00
parent 482a199908
commit 7e58c68197
50 changed files with 864 additions and 1302 deletions

258
scripts/aggregate_by_month.py Executable file
View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
Script to aggregate all account statements by month
"""
import os
import csv
import sys
import argparse
import re
from datetime import datetime
from collections import defaultdict
import calendar
def parse_date(date_str, source_file):
    """
    Parse a date string and return it as a normalized (year, month, day) tuple.

    Args:
        date_str: Date text in DD/MM/YYYY, MM/DD/YYYY or YYYY-MM-DD form.
            Surrounding whitespace is ignored; None is treated as empty.
        source_file: Name of the statement the row came from. Only used as a
            fallback when date_str cannot be parsed and the name contains
            'salaire': the French month name and a 20xx year are then read
            from the name itself.

    Returns:
        A (year, month, day) tuple of ints. When nothing can be determined,
        the first day of the current month is returned.
    """
    # Local import: only the filename fallback needs it.
    import unicodedata

    # NOTE(review): formats are tried in order, so an ambiguous value such as
    # 03/04/2024 is always read as DD/MM/YYYY; MM/DD/YYYY only wins when the
    # first field cannot be a month. Confirm this is acceptable for Amex data.
    formats = (
        '%d/%m/%Y',  # DD/MM/YYYY
        '%m/%d/%Y',  # MM/DD/YYYY (Amex format)
        '%Y-%m-%d',  # YYYY-MM-DD (Revolut format)
    )
    cleaned = (date_str or '').strip()
    for fmt in formats:
        try:
            dt = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
        return (dt.year, dt.month, dt.day)

    now = datetime.now()

    # Fold accents (both precomposed and decomposed forms) so that names such
    # as "Février" or "Août" match the unaccented month list below.
    decomposed = unicodedata.normalize('NFKD', (source_file or '').lower())
    source_key = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Try to extract from filename (for SNCF)
    if 'salaire' in source_key:
        months = ['janvier', 'fevrier', 'mars', 'avril', 'mai', 'juin',
                  'juillet', 'aout', 'septembre', 'octobre', 'novembre', 'decembre']
        for month_number, month_name in enumerate(months, 1):
            if month_name in source_key:
                # Use the whole match: capturing only the last two digits
                # would turn "2025" into the year 25.
                year_match = re.search(r'20\d{2}', source_key)
                year = int(year_match.group(0)) if year_match else now.year
                return (year, month_number, 1)

    # Default: first day of the current month
    return (now.year, now.month, 1)
def categorize_institution(source_file):
    """
    Map a statement filename to the institution that issued it.

    The lower-cased filename is checked against a list of known substrings;
    the first institution with a matching substring wins. 'Other' is returned
    when no substring matches.
    """
    name = source_file.lower()

    # Order matters: earlier entries take precedence over later ones.
    markers_by_institution = (
        ('Boursobank', ('boursobank', 'releve-compte')),
        ('American Express', ('american_express', 'amex')),
        ('Monabanq', ('monabanq', 'extrait de comptes')),
        ('Revolut', ('revolut',)),
        ('SNCF', ('sncf', 'salaire')),
        ('La Poste', ('la_poste', '2-la.poste', 'releve_ccp')),
    )
    for institution, markers in markers_by_institution:
        if any(marker in name for marker in markers):
            return institution
    return 'Other'
def process_csv_file(file_path):
    """
    Read one statement CSV and return its rows as transaction dicts.

    Args:
        file_path: Path of the CSV file. It is read through csv.DictReader,
            so the first line must be a header. The columns consulted are
            Date, Amount / Debit / Credit, Description, Category and Source.

    Returns:
        A list of dicts with the keys year, month, day, date_str,
        description, category, amount, institution and source. Rows with an
        empty Date are skipped. Amounts that cannot be parsed are recorded
        as 0 and reported on stderr.
    """
    transactions = []
    file_name = os.path.basename(file_path)
    institution = categorize_institution(file_name)

    # utf-8-sig drops a leading BOM, which would otherwise be glued to the
    # first header name and make every row look like it has no 'Date'.
    # newline='' is what the csv module asks for so quoted newlines survive.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Short rows yield None for missing columns, hence the `or`.
            date_str = (row.get('Date') or '').strip()
            if not date_str:
                continue

            # Use the same source for date parsing and for the stored record.
            source = row.get('Source') or file_name
            year, month, day = parse_date(date_str, source)

            # Get amount (handle different column names).
            # NOTE(review): Debit and Credit values are taken with whatever
            # sign the file carries; confirm the upstream exporters already
            # encode the direction in the sign.
            amount_str = row.get('Amount') or row.get('Debit') or row.get('Credit') or '0'
            # Drop regular and non-breaking spaces used as thousands
            # separators, then accept a decimal comma.
            normalized = ''.join(amount_str.split()).replace(',', '.')
            try:
                amount = float(normalized) if normalized else 0
            except ValueError:
                print(f"  Warning: unparseable amount {amount_str!r} in {file_name}; using 0",
                      file=sys.stderr)
                amount = 0

            transactions.append({
                'year': year,
                'month': month,
                'day': day,
                'date_str': date_str,
                'description': row.get('Description', ''),
                'category': row.get('Category', 'Other'),
                'amount': amount,
                'institution': institution,
                'source': source,
            })
    return transactions
def main():
    """
    Aggregate every CSV statement in the input directory into reports.

    Command-line options:
        --input-dir   directory scanned for *.csv files (default: output/csv)
        --output-dir  directory receiving the reports (default: output/reports)

    Writes monthly_summary.csv, yearly_summary.csv and one
    transactions_<year>_<month>.csv file per month that has data, then
    prints the list of generated files with their sizes.

    Returns:
        0 when the reports were written, 1 when the input directory is
        missing or contains no CSV files.
    """
    parser = argparse.ArgumentParser(description='Aggregate all account statements by month')
    parser.add_argument('--input-dir', default='output/csv',
                        help='Directory containing CSV files to aggregate (default: output/csv)')
    parser.add_argument('--output-dir', default='output/reports',
                        help='Directory to save aggregated reports (default: output/reports)')
    args = parser.parse_args()

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    separator = '=' * 60
    print(f"\n{separator}")
    print("Monthly Aggregation of All Account Statements")
    print(f"Input Directory: {os.path.abspath(args.input_dir)}")
    print(f"Output Directory: {os.path.abspath(args.output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(separator)

    # Report a missing input directory instead of crashing in os.listdir.
    if not os.path.isdir(args.input_dir):
        print(f"\nError: Input directory not found: {args.input_dir}")
        return 1

    # Sorted so files are processed in the same order on every run.
    csv_files = sorted(f for f in os.listdir(args.input_dir) if f.endswith('.csv'))
    if not csv_files:
        print(f"\nError: No CSV files found in {args.input_dir}")
        return 1

    # Collect all transactions
    all_transactions = []
    for csv_file in csv_files:
        file_path = os.path.join(args.input_dir, csv_file)
        print(f"\nProcessing: {csv_file}")
        transactions = process_csv_file(file_path)
        all_transactions.extend(transactions)
        print(f"  Found {len(transactions)} transactions")

    # Group transactions by (year, month)
    monthly_transactions = defaultdict(list)
    for transaction in all_transactions:
        monthly_transactions[(transaction['year'], transaction['month'])].append(transaction)
    months = sorted(monthly_transactions)

    # Monthly summary report
    summary_file = os.path.join(args.output_dir, 'monthly_summary.csv')
    with open(summary_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            'Year', 'Month', 'Total Income', 'Total Expenses', 'Net Balance',
            'Transaction Count', 'Institutions'
        ])
        for year, month in months:
            transactions = monthly_transactions[(year, month)]
            # NOTE(review): negative amounts are counted as income and
            # positive amounts as expenses (the original comment attributes
            # this convention to Revolut) — confirm it holds for every
            # institution being aggregated.
            total_income = sum(t['amount'] for t in transactions if t['amount'] < 0)
            total_expenses = sum(t['amount'] for t in transactions if t['amount'] > 0)
            institutions = sorted({t['institution'] for t in transactions})
            writer.writerow([
                year, calendar.month_name[month], total_income, total_expenses,
                total_income + total_expenses, len(transactions), ', '.join(institutions)
            ])

    # Files produced, in the order they are listed at the end. The yearly
    # summary is written further down but listed second, as before.
    yearly_file = os.path.join(args.output_dir, 'yearly_summary.csv')
    generated_files = [os.path.basename(summary_file), os.path.basename(yearly_file)]

    # One detailed transactions file per month
    for year, month in months:
        detail_name = f'transactions_{year}_{calendar.month_name[month].lower()}.csv'
        generated_files.append(detail_name)
        detail_file = os.path.join(args.output_dir, detail_name)
        with open(detail_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'Date', 'Description', 'Category', 'Amount',
                'Institution', 'Source'
            ])
            writer.writeheader()
            # Sort by day, then description; `or ''` keeps a missing
            # description from breaking the comparison.
            ordered = sorted(monthly_transactions[(year, month)],
                             key=lambda t: (t['day'], t['description'] or ''))
            for transaction in ordered:
                writer.writerow({
                    'Date': transaction['date_str'],
                    'Description': transaction['description'],
                    'Category': transaction['category'],
                    'Amount': transaction['amount'],
                    'Institution': transaction['institution'],
                    'Source': transaction['source']
                })

    # Yearly summary: zero amounts fall into the expenses bucket, where they
    # contribute nothing to the total.
    yearly_summary = defaultdict(lambda: {'income': 0, 'expenses': 0, 'count': 0})
    for transaction in all_transactions:
        bucket = yearly_summary[transaction['year']]
        bucket['count'] += 1
        if transaction['amount'] < 0:
            bucket['income'] += transaction['amount']
        else:
            bucket['expenses'] += transaction['amount']

    with open(yearly_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Year', 'Total Income', 'Total Expenses', 'Net Balance', 'Transaction Count'])
        for year in sorted(yearly_summary):
            data = yearly_summary[year]
            writer.writerow([
                year, data['income'], data['expenses'],
                data['income'] + data['expenses'], data['count']
            ])

    # Print summary statistics
    print(f"\n{separator}")
    print("Aggregation Complete")
    print(f"Total Transactions: {len(all_transactions)}")
    print(f"Months with Data: {len(monthly_transactions)}")
    print(separator)

    print("\nGenerated Files:")
    for file_name in generated_files:
        file_path = os.path.join(args.output_dir, file_name)
        if os.path.exists(file_path):
            file_size = os.path.getsize(file_path)
            print(f"  - {file_name} ({file_size:,} bytes)")
    return 0


if __name__ == "__main__":
    # Propagate failure to the shell so callers can detect it.
    sys.exit(main())

132
scripts/export_all_csv.py Executable file
View File

@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""
Script to output CSV files for all account statements
"""
import os
import subprocess
import sys
import argparse
from datetime import datetime
def run_script(script_path, pdf_dir, output_dir, use_csv_dir=False):
    """
    Run one statement-processing script as a child process.

    Args:
        script_path: Path of the Python script to execute.
        pdf_dir: Input directory handed to the script.
        output_dir: Directory passed to the script as --output-dir.
        use_csv_dir: When True the input directory is passed as --csv-dir
            (for Revolut which uses CSV input) instead of --pdf-dir.

    Returns:
        True when the script exits with status 0, False when it exits with a
        non-zero status or cannot be started.
    """
    input_flag = '--csv-dir' if use_csv_dir else '--pdf-dir'
    cmd = [sys.executable, script_path, input_flag, pdf_dir, '--output-dir', output_dir, '--csv']

    # Build the banner title from the file name alone, so it reads
    # "Process Bourso" whether the path is relative or absolute.
    script_stem = os.path.splitext(os.path.basename(script_path))[0]
    title = script_stem.replace('_', ' ').title()

    print(f"\n{'='*60}")
    print(f"Processing {title} statements...")
    # Flush so the banner is emitted before the child starts writing to the
    # same stream.
    print('='*60, flush=True)

    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Error running {script_path}: {e}")
        return False
    return True
def main():
    """
    Run every per-institution processing script and list the CSV files produced.

    The only command-line option is --output-dir, which defaults to
    <project root>/output/csv, where the project root is the parent of the
    directory containing this script. Institutions whose input directory is
    missing or empty are reported and skipped.
    """
    scripts_path = os.path.dirname(os.path.abspath(__file__))
    root_path = os.path.dirname(scripts_path)

    cli = argparse.ArgumentParser(description='Process all account statements and output CSV files')
    cli.add_argument('--output-dir', default=os.path.join(root_path, 'output/csv'),
                     help='Directory to save CSV output files')
    args = cli.parse_args()

    # Make sure the destination exists before any script writes into it.
    os.makedirs(args.output_dir, exist_ok=True)

    ruler = '=' * 60
    print(f"\n{ruler}")
    print("All Account Statements CSV Export")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Output Directory: {os.path.abspath(args.output_dir)}")
    print(ruler)

    pdf_root = os.path.join(root_path, 'data/pdf')
    raw_csv_root = os.path.join(root_path, 'data/raw_csv')

    # (display name, script file, input directory, input is CSV rather than PDF)
    sources = [
        ('Boursobank', 'process_bourso.py', os.path.join(pdf_root, 'boursobank'), False),
        ('American Express', 'process_amex.py', os.path.join(pdf_root, 'american_express'), False),
        ('Monabanq', 'process_monabanq.py', os.path.join(pdf_root, 'monabanq'), False),
        ('Revolut', 'process_expenses.py', raw_csv_root, True),  # Revolut uses CSV input
        ('SNCF', 'process_sncf.py', os.path.join(pdf_root, '1-sncf'), False),
        ('La Poste', 'process_laposte.py', os.path.join(pdf_root, '2-la.poste'), False),
    ]

    succeeded = 0
    for label, script_name, input_dir, reads_csv in sources:
        if not os.path.exists(input_dir):
            print(f"\nWarning: Directory not found for {label}: {input_dir}")
            continue
        if not os.listdir(input_dir):
            print(f"\nSkipping {label}: No files found in {input_dir}")
            continue
        script = os.path.join(scripts_path, script_name)
        if run_script(script, input_dir, args.output_dir, reads_csv):
            succeeded += 1

    # The denominator counts every configured source, including skipped ones.
    print(f"\n{ruler}")
    print(f"Processing Complete: {succeeded}/{len(sources)} accounts processed successfully")
    print(f"CSV files have been saved to: {os.path.abspath(args.output_dir)}")
    print(ruler)

    # List whatever CSV files now sit in the output directory.
    if not os.path.exists(args.output_dir):
        return
    produced = sorted(entry for entry in os.listdir(args.output_dir) if entry.endswith('.csv'))
    if produced:
        print("\nGenerated CSV Files:")
        for entry in produced:
            size = os.path.getsize(os.path.join(args.output_dir, entry))
            print(f"  - {entry} ({size:,} bytes)")


if __name__ == "__main__":
    main()