"""Parse Boursobank statements (PDF converted to text) and summarise expenses."""

import csv
import glob
import os
import re
import subprocess
import tempfile
from collections import defaultdict

# Column order shared by every CSV this module writes.
_CSV_FIELDNAMES = ['Date', 'Description', 'Category', 'Debit', 'Credit', 'Value Date']

# Ordered (keywords, category) rules: the first rule with any keyword contained
# in the lower-cased description wins, so the order is significant.
_CATEGORY_RULES = (
    (('ech pret',), 'Loan Repayment'),
    (('american express',), 'Credit Card Payment (Amex)'),
    (('orange sa', 'sfr', 'ste reunionnaise du radiotelep'), 'Utilities'),
    (('be rock',), 'Subscription (BE ROCK)'),
    (('paypal',), 'Online Purchases (Paypal)'),
    (('vir virement interne',), 'Internal Transfer'),
    (('retrait dab',), 'Cash Withdrawal'),
)

# One transaction per line: operation date, description, value date, then up
# to two optional amount columns.
_TRANSACTION_REGEX = re.compile(
    r"^ (\d{2}/\d{2}/\d{4})\s+(.*?)\s+(\d{2}/\d{2}/\d{4})\s+([\d,.]+\s*)?([\d,.]+\s*)?$",
    re.MULTILINE,
)


def categorize_bourso_transaction(description):
    """Return the expense category for a transaction description.

    Matching is case-insensitive substring matching against a fixed rule
    list. Descriptions starting with ``carte`` that match no rule are
    ``'Card Payment'``; everything else is ``'Other'``.
    """
    description = description.lower()
    for keywords, category in _CATEGORY_RULES:
        if any(keyword in description for keyword in keywords):
            return category
    if description.startswith('carte'):
        return 'Card Payment'
    return 'Other'


def _parse_amount(raw):
    """Convert an amount string such as ``20,00`` or ``1.234,56`` to a float.

    When both ``,`` and ``.`` occur, the right-most one is taken as the
    decimal mark and the other as a thousands separator. With a single kind
    of separator the comma is read as a decimal mark.

    Raises:
        ValueError: if the cleaned-up text is not a valid number.
    """
    text = raw.strip()
    last_comma = text.rfind(',')
    last_dot = text.rfind('.')
    if last_comma != -1 and last_dot != -1:
        if last_comma > last_dot:
            # e.g. "1.234,56": dots group thousands, comma is the decimal mark.
            text = text.replace('.', '').replace(',', '.')
        else:
            # e.g. "1,234.56": commas group thousands.
            text = text.replace(',', '')
    else:
        text = text.replace(',', '.')
    return float(text)


def _write_transactions_csv(csv_file, rows):
    """Write transaction dicts to *csv_file*, creating its directory if needed."""
    os.makedirs(os.path.dirname(csv_file) or '.', exist_ok=True)
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=_CSV_FIELDNAMES)
        writer.writeheader()
        writer.writerows(rows)


def process_bourso_statement(file_path, output_csv=False, output_dir='../../output/csv'):
    """Parse one statement text file, print an expense summary, return the rows.

    Args:
        file_path: UTF-8 text file containing the statement.
        output_csv: when true, also write ``<file stem>_transactions.csv``
            into *output_dir*.
        output_dir: directory for the CSV file (created if missing).

    Returns:
        A list of dicts with the keys ``Date``, ``Description``, ``Category``,
        ``Debit``, ``Credit`` and ``Value Date``. Rows whose amount cannot be
        parsed are skipped.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    expense_summary = defaultdict(float)
    total_expenses = 0
    transactions_data = []  # every parsed row, also used for the CSV output

    print("--- Matched Transactions ---")
    # NOTE(review): the second amount column (credit_str) is captured but never
    # used; a lone amount always lands in the first group, so it is treated as
    # a debit and 'Credit' is always 0. Confirm against real statements.
    for op_date, description, val_date, debit_str, credit_str in _TRANSACTION_REGEX.findall(content):
        description = description.strip()

        debit = 0
        if debit_str:
            try:
                debit = _parse_amount(debit_str)
            except ValueError:
                continue  # skip rows whose amount is not a valid number

        category = categorize_bourso_transaction(description)
        print(f"Found: {description} -> {category} -> {debit}")  # DEBUG

        transactions_data.append({
            'Date': op_date,
            'Description': description,
            'Category': category,
            'Debit': debit,
            'Credit': 0,
            'Value Date': val_date,
        })

        # Internal transfers move money between own accounts; not an expense.
        if debit > 0 and category != 'Internal Transfer':
            expense_summary[category] += debit
            total_expenses += debit

    if output_csv:
        stem = os.path.splitext(os.path.basename(file_path))[0]
        csv_file = os.path.join(output_dir, stem + '_transactions.csv')
        _write_transactions_csv(csv_file, transactions_data)
        print(f"\nTransaction data saved to {csv_file}")

    # NOTE(review): the period in this heading is hard-coded.
    print("\n--- Boursobank Expense Summary (Dec 2025) - Final ---")
    print(f"Total Expenses Analyzed: €{total_expenses:,.2f}")
    print("\n--- Spending by Category ---")

    if total_expenses > 0:
        by_amount = sorted(expense_summary.items(), key=lambda item: item[1], reverse=True)
        for category, total in by_amount:
            percentage = (total / total_expenses) * 100
            print(f"{category:<25} €{total:9,.2f} ({percentage:5.2f}%)")
    else:
        print("No expenses found.")

    return transactions_data


def process_bourso_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Convert every ``*.pdf`` in *directory* with ``pdftotext`` and process it.

    Args:
        directory: folder that is searched (non-recursively) for PDF files.
        output_csv: when true, write one CSV per PDF plus a consolidated
            ``boursobank_all_transactions.csv`` into *output_dir*.
        output_dir: directory for the CSV files (created if missing).

    Returns:
        The concatenated transaction rows of all PDFs that were processed.
        A PDF whose conversion fails is reported on stdout and skipped.
    """
    # Sorted so the processing order and consolidated CSV are deterministic.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        try:
            # pdftotext emits UTF-8 by default; decode it explicitly rather
            # than relying on the locale's preferred encoding.
            result = subprocess.run(
                ['pdftotext', '-layout', pdf_file, '-'],
                capture_output=True, text=True, encoding='utf-8', check=True,
            )

            # Use a private scratch directory so a user's "<name>.txt" beside
            # the PDF is never overwritten or deleted, and so the scratch file
            # is removed even if processing raises. The basename is kept
            # because the per-file CSV name is derived from it.
            stem = os.path.splitext(os.path.basename(pdf_file))[0]
            with tempfile.TemporaryDirectory() as scratch_dir:
                temp_file = os.path.join(scratch_dir, stem + '.txt')
                with open(temp_file, 'w', encoding='utf-8') as f:
                    f.write(result.stdout)
                transactions = process_bourso_statement(temp_file, output_csv, output_dir)

            all_transactions.extend(transactions)
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'boursobank_all_transactions.csv')
        _write_transactions_csv(csv_file, all_transactions)
        print(f"\nAll transaction data saved to {csv_file}")

    return all_transactions


def main():
    """Command-line entry point: process all PDFs in the chosen directory."""
    import argparse

    parser = argparse.ArgumentParser(description='Process Boursobank statements')
    parser.add_argument('--pdf-dir', default='../data/pdf/boursobank', help='Directory containing Boursobank PDF files')
    parser.add_argument('--output-dir', default='../../output/csv', help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true', help='Output transaction data to CSV files')
    args = parser.parse_args()

    process_bourso_pdf_files(args.pdf_dir, args.csv, args.output_dir)


if __name__ == "__main__":
    main()