Reorganize project structure with separate directories for scripts, data, and output
This commit is contained in:
129
scripts/process_monabanq.py
Normal file
129
scripts/process_monabanq.py
Normal file
@@ -0,0 +1,129 @@
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
def categorize_monabanq_transaction(description):
    """Map a Monabanq statement label to a spending category.

    The label is lower-cased and tested against a fixed, ordered list of
    substrings; the first substring found decides the category.  A label
    containing none of them is reported as 'Other'.
    """
    # Order matters: the first keyword found in the label wins.
    keyword_categories = (
        ('ech pret', 'Loan Repayment'),
        ('f cotis pratiq+', 'Bank Fees'),
        ('google ireland', 'Google Services'),
        ('vir mr bataille kevin', 'Internal Transfer'),
    )

    lowered = description.lower()
    for keyword, category in keyword_categories:
        if keyword in lowered:
            return category
    return 'Other'
|
||||
|
||||
# One statement row: operation date, value date (not captured), label, then
# up to two optional numeric columns.  Compiled once rather than per line.
_MONABANQ_ROW_RE = re.compile(
    r'\s*(\d{2}/\d{2}/\d{4})\s+\d{2}/\d{2}/\d{4}\s+(.*?)(?=\s{2,}|$)(\s+[\d,.]+)?(\s+[\d,.]+)?'
)


def _extract_monabanq_text(file_path):
    """Return the text of *file_path* via ``pdftotext -layout``, or None on failure.

    A failing or missing ``pdftotext`` is reported on stdout and signalled
    with None so the caller can skip the file.
    """
    try:
        result = subprocess.run(
            ['pdftotext', '-layout', file_path, '-'],
            capture_output=True,
            check=True,
            # Decode explicitly as UTF-8 instead of relying on the locale;
            # undecodable bytes are replaced rather than aborting the run.
            # NOTE(review): assumes pdftotext emits UTF-8 (its documented
            # default) -- confirm no local xpdfrc/-enc override is in use.
            encoding='utf-8',
            errors='replace',
        )
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        print(f"Error processing {file_path}: {e}")
        return None
    return result.stdout


def _parse_monabanq_amount(raw):
    """Convert an amount column captured by the row regex into a number.

    Returns 0 when the column is absent or is not a readable number.
    A comma is accepted as decimal mark.  When both ',' and '.' appear,
    the right-most one is taken as the decimal mark and the other as a
    thousands separator, so "1.234,56" and "1,234.56" both give 1234.56
    (previously such values failed to parse and were recorded as 0).
    """
    if not raw:
        return 0
    text = raw.strip()
    if ',' in text and '.' in text:
        decimal_is_comma = text.rfind(',') > text.rfind('.')
        grouping_mark = '.' if decimal_is_comma else ','
        text = text.replace(grouping_mark, '')
    text = text.replace(',', '.')
    try:
        return float(text)
    except ValueError:
        # Tokens such as "." or "1.2.3" satisfy the regex but are not numbers.
        return 0


def _iter_monabanq_rows(content):
    """Yield (date, label, debit_text, credit_text) for each statement row.

    Rows are only considered after a "SOLDE CREDITEUR AU" / "SOLDE DEBITEUR AU"
    line has been seen, and scanning stops at the first subsequent line
    containing "IBAN :".  Blank lines and lines not matching the row pattern
    are skipped.
    """
    started = False
    for line in content.split('\n'):
        if "SOLDE CREDITEUR AU" in line or "SOLDE DEBITEUR AU" in line:
            started = True
            continue
        if not started or not line.strip():
            continue
        if "IBAN :" in line:
            break
        match = _MONABANQ_ROW_RE.match(line)
        if match:
            yield match.groups()


def process_monabanq_files(file_list, output_csv=False, output_dir='../../output/csv'):
    """Extract transactions from Monabanq PDF statements and print a summary.

    Each PDF is converted to text with ``pdftotext -layout``; files that
    cannot be converted are reported and skipped.  Every recognised row is
    categorised with ``categorize_monabanq_transaction``.  Debits outside the
    'Internal Transfer' category are accumulated per category and printed
    as an expense summary.

    Args:
        file_list: Iterable of PDF file paths.
        output_csv: When true and at least one transaction was found, write
            all transactions to ``monabanq_all_transactions.csv``.
        output_dir: Directory for the CSV file; created if missing.  The
            path is used as given, so a relative path is resolved against
            the current working directory.

    Returns:
        A list of dicts with keys 'Date', 'Description', 'Category',
        'Debit', 'Credit' and 'Source' (the PDF's base name).
    """
    expense_summary = defaultdict(float)
    total_expenses = 0
    all_transactions = []

    for file_path in file_list:
        content = _extract_monabanq_text(file_path)
        if content is None:
            continue

        source_name = os.path.basename(file_path)
        for op_date, description, debit_str, credit_str in _iter_monabanq_rows(content):
            description = description.strip()
            # NOTE(review): the row pattern is not column-aware -- when a row
            # holds a single amount it is always captured as the debit, even
            # if it sits in the credit column.  Verify against real statements.
            debit = _parse_monabanq_amount(debit_str)
            credit = _parse_monabanq_amount(credit_str)
            category = categorize_monabanq_transaction(description)

            # Keep every row for the optional CSV export.
            all_transactions.append({
                'Date': op_date,
                'Description': description,
                'Category': category,
                'Debit': debit,
                'Credit': credit,
                'Source': source_name,
            })

            # Internal transfers are excluded from the expense totals.
            if debit > 0 and category != 'Internal Transfer':
                expense_summary[category] += debit
                total_expenses += debit

    # Output CSV if requested
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'monabanq_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Debit', 'Credit', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- Monabanq Expense Summary for 2025 ---")
    print(f"Total Expenses Analyzed: €{total_expenses:,.2f}")
    print("\n--- Spending by Category ---")

    if total_expenses > 0:
        # Largest categories first.
        sorted_expenses = sorted(expense_summary.items(), key=lambda item: item[1], reverse=True)
        for category, total in sorted_expenses:
            percentage = (total / total_expenses) * 100
            print(f"{category:<25} €{total:9,.2f} ({percentage:5.2f}%)")
    else:
        print("No expenses found.")

    return all_transactions
|
||||
|
||||
if __name__ == "__main__":
    import argparse
    import glob

    # Command-line interface: where the PDFs live, where CSV output goes,
    # and whether CSV output is wanted at all.
    cli = argparse.ArgumentParser(description='Process Monabanq statements')
    cli.add_argument(
        '--pdf-dir',
        default='../data/pdf/monabanq',
        help='Directory containing Monabanq PDF files',
    )
    cli.add_argument(
        '--output-dir',
        default='../../output/csv',
        help='Directory to save CSV output files',
    )
    cli.add_argument(
        '--csv',
        action='store_true',
        help='Output transaction data to CSV files',
    )
    options = cli.parse_args()

    # Collect every *.pdf in the directory, ordered by path name
    # (plain lexicographic sort of the path strings).
    pdf_pattern = os.path.join(options.pdf_dir, "*.pdf")
    statement_paths = sorted(glob.glob(pdf_pattern))

    # Parse all statements and print the summary (plus CSV when requested).
    process_monabanq_files(statement_paths, options.csv, options.output_dir)
|
||||
Reference in New Issue
Block a user