Reorganize project structure with separate directories for scripts, data, and output

2026-02-09 10:20:55 +01:00
parent acb1276b38
commit 73ff2b70f7
125 changed files with 1786 additions and 56 deletions
--- a/scripts/process_monabanq.py
+++ b/scripts/process_monabanq.py
@@ -0,0 +1,129 @@
+
+import subprocess
+import re
+import csv
+import os
+from collections import defaultdict
+
+def categorize_monabanq_transaction(description):
+    """Map a Monabanq statement label to a spending category.
+
+    The label is lower-cased and tested against a fixed, ordered list of
+    substrings; the first substring found decides the category.
+
+    Args:
+        description: Transaction label as extracted from the statement.
+
+    Returns:
+        The category name of the first matching rule, or 'Other' when no
+        rule matches.
+    """
+    label = description.lower()
+
+    # Order matters: rules are evaluated top to bottom, first hit wins.
+    rules = (
+        ('ech pret', 'Loan Repayment'),
+        ('f cotis pratiq+', 'Bank Fees'),
+        ('google ireland', 'Google Services'),
+        ('vir mr bataille kevin', 'Internal Transfer'),
+    )
+    return next(
+        (category for needle, category in rules if needle in label),
+        'Other',
+    )
+
+# One statement row: operation date, a second date, the label, then up to
+# two numeric columns. Compiled once instead of on every line.
+_MONABANQ_ROW_RE = re.compile(
+    r'\s*(\d{2}/\d{2}/\d{4})\s+\d{2}/\d{2}/\d{4}\s+(.*?)(?=\s{2,}|$)(\s+[\d,.]+)?(\s+[\d,.]+)?'
+)
+
+
+def _parse_monabanq_amount(raw):
+    """Turn a captured amount column into a number; 0 if absent or unparseable."""
+    if not raw:
+        return 0
+    text = raw.strip()
+    # When a decimal comma comes after the last dot ("1.234,56"), the dots are
+    # thousands separators. Drop them, otherwise the comma-to-dot conversion
+    # yields "1.234.56", float() fails and the amount is silently lost.
+    if text.rfind(',') > text.rfind('.'):
+        text = text.replace('.', '')
+    try:
+        return float(text.replace(',', '.'))
+    except ValueError:
+        return 0
+
+
+def process_monabanq_files(file_list, output_csv=False, output_dir='../../output/csv'):
+    """Extract transactions from Monabanq PDF statements and print a summary.
+
+    Each file is converted to text with the external ``pdftotext -layout``
+    command. Rows are read from the line after the first "SOLDE CREDITEUR AU"
+    / "SOLDE DEBITEUR AU" marker until the first line containing "IBAN :".
+    Files that cannot be converted are reported on stdout and skipped.
+
+    Debits from every category except 'Internal Transfer' are accumulated
+    per category and printed as a summary (the summary header text is fixed
+    to "2025").
+
+    Args:
+        file_list: Iterable of paths to PDF statements.
+        output_csv: When true and at least one transaction was found, write
+            all transactions to ``monabanq_all_transactions.csv``.
+        output_dir: Directory for the CSV file; created if missing.
+
+    Returns:
+        A list of dicts with keys 'Date', 'Description', 'Category',
+        'Debit', 'Credit' and 'Source' (the PDF's base name).
+    """
+    expense_summary = defaultdict(float)
+    total_expenses = 0
+    all_transactions = []
+
+    for file_path in file_list:
+        try:
+            # Use pdftotext to extract text while keeping the column layout
+            result = subprocess.run(
+                ['pdftotext', '-layout', file_path, '-'],
+                capture_output=True, text=True, check=True,
+            )
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            print(f"Error processing {file_path}: {e}")
+            continue
+
+        transaction_started = False
+        for line in result.stdout.split('\n'):
+            if "SOLDE CREDITEUR AU" in line or "SOLDE DEBITEUR AU" in line:
+                transaction_started = True
+                continue
+            if not transaction_started or not line.strip():
+                continue
+            if "IBAN :" in line:
+                # Stops reading this file at the first IBAN line.
+                break
+
+            match = _MONABANQ_ROW_RE.match(line)
+            if not match:
+                continue
+
+            op_date, description, debit_str, credit_str = match.groups()
+            description = description.strip()
+            # NOTE(review): a row with a single amount always fills the first
+            # numeric group, so it is read as a debit even if it sits in the
+            # credit column - confirm against the real pdftotext layout.
+            debit = _parse_monabanq_amount(debit_str)
+            credit = _parse_monabanq_amount(credit_str)
+
+            category = categorize_monabanq_transaction(description)
+
+            # Store transaction for CSV output
+            all_transactions.append({
+                'Date': op_date,
+                'Description': description,
+                'Category': category,
+                'Debit': debit,
+                'Credit': credit,
+                'Source': os.path.basename(file_path)
+            })
+
+            # Internal transfers are recorded but not counted as spending.
+            if debit > 0 and category != 'Internal Transfer':
+                expense_summary[category] += debit
+                total_expenses += debit
+
+    # Output CSV if requested
+    if output_csv and all_transactions:
+        csv_file = os.path.join(output_dir, 'monabanq_all_transactions.csv')
+        os.makedirs(output_dir, exist_ok=True)
+        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
+            fieldnames = ['Date', 'Description', 'Category', 'Debit', 'Credit', 'Source']
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(all_transactions)
+        print(f"\nTransaction data saved to {csv_file}")
+
+    print("--- Monabanq Expense Summary for 2025 ---")
+    print(f"Total Expenses Analyzed: €{total_expenses:,.2f}")
+    print("\n--- Spending by Category ---")
+
+    sorted_expenses = sorted(expense_summary.items(), key=lambda item: item[1], reverse=True)
+
+    if total_expenses > 0:
+        for category, total in sorted_expenses:
+            percentage = (total / total_expenses) * 100
+            print(f"{category:<25} €{total:9,.2f} ({percentage:5.2f}%)")
+    else:
+        print("No expenses found.")
+
+    return all_transactions
+
+if __name__ == "__main__":
+    import argparse
+    import glob
+
+    # Command-line options: where the statements live, where CSV output goes,
+    # and whether CSV output is wanted at all.
+    cli = argparse.ArgumentParser(description='Process Monabanq statements')
+    cli.add_argument('--pdf-dir', default='../data/pdf/monabanq',
+                     help='Directory containing Monabanq PDF files')
+    cli.add_argument('--output-dir', default='../../output/csv',
+                     help='Directory to save CSV output files')
+    cli.add_argument('--csv', action='store_true',
+                     help='Output transaction data to CSV files')
+    options = cli.parse_args()
+
+    # Every PDF directly inside the directory, in plain path order
+    # (chronological only if the file names happen to sort that way).
+    statement_paths = sorted(glob.glob(os.path.join(options.pdf_dir, "*.pdf")))
+
+    process_monabanq_files(statement_paths, options.csv, options.output_dir)