#!/usr/bin/env python3
"""
Enhanced SNCF processor to extract NET PAYÉ EN EUROS amounts

Each payslip PDF is converted to text with ``pdftotext -layout``; the monthly
salary figures are then read from the line that follows the
"NET PAYÉ EN EUROS" table header.
"""

import subprocess
import re
import csv
import os
import glob
import argparse
from collections import defaultdict

# French month names (upper-cased, with and without accents) -> month number.
_FRENCH_MONTHS = {
    'JANVIER': 1, 'FÉVRIER': 2, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
    'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOÛT': 8, 'AOUT': 8,
    'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DÉCEMBRE': 12,
    'DECEMBRE': 12,
}

# English month names indexed by month number (index 0 is unused).
_ENGLISH_MONTH_NAMES = [
    '', 'January', 'February', 'March', 'April', 'May', 'June', 'July',
    'August', 'September', 'October', 'November', 'December',
]

# Used when the payslip header cannot be found or the month is not recognised.
_DEFAULT_MONTH = 1
_DEFAULT_YEAR = 2025

# Header such as "BULLETIN DE PAIE DU MOIS DE Janvier 2026".
_MOIS_RE = re.compile(r'MOIS DE\s+(\w+)\s+(\d{4})', re.IGNORECASE)

# One French-formatted amount: digits, optional groups of three digits each
# preceded by a single whitespace (thousands separator), optional ",decimals".
# Being explicit about the shape keeps "3 245,67" together as ONE amount
# instead of letting it spill across the neighbouring columns.
_AMOUNT = r'(\d+(?:\s\d{3})*(?:,\d+)?)'

# "MENSUEL <brut> <net imposable> <prelevement> <net paye> EUR"
_MENSUEL_RE = re.compile(
    r'MENSUEL\s+' + r'\s+'.join([_AMOUNT] * 4) + r'\s+EUR'
)

_TABLE_HEADER = 'NET PAYÉ EN EUROS'
_PAYMENT_MODE = 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'

_CSV_FIELDNAMES = [
    'Date', 'Description', 'Category', 'Amount', 'Source',
    'Brut Mensuel', 'Net Imposable', 'Cumul Annuel',
]


def _parse_french_amount(text):
    """Convert a French-formatted amount such as ``'3 245,67'`` to a float.

    Every kind of whitespace is removed (``str.split`` also splits on
    non-breaking and narrow non-breaking spaces) and the decimal comma is
    turned into a dot.

    Raises:
        ValueError: if the cleaned text is not a valid number.
    """
    return float(''.join(text.split()).replace(',', '.'))


def extract_sncf_salary_data(content, filename):
    """Extract salary data from SNCF PDF content with focus on NET PAYÉ EN EUROS.

    Args:
        content: Text of one payslip, as produced by ``pdftotext -layout``.
        filename: Name of the source file. Currently unused; kept so existing
            callers keep working.

    Returns:
        A dict with the keys ``month``, ``year``, ``brut_mensuel``,
        ``net_imposable``, ``net_paye_euros``, ``cumul_annuel`` and
        ``mode_paiement``.

        When the salary table is found, ``month`` is the English month name
        and the three amounts are floats parsed from the MENSUEL line.
        When it is not found, ``month`` is ``''``, all amounts are ``0.0``
        and ``mode_paiement`` is ``''``; ``year`` still carries the detected
        (or default) year.
    """
    month_num = _DEFAULT_MONTH
    year = _DEFAULT_YEAR

    header = _MOIS_RE.search(content)
    if header:
        year = int(header.group(2))
        # An unrecognised month word keeps the default month number.
        month_num = _FRENCH_MONTHS.get(header.group(1).upper(), month_num)

    # NOTE(review): with no usable header this yields "January" 2025; the
    # fallback is kept for compatibility - confirm it is really wanted.
    month_name = _ENGLISH_MONTH_NAMES[month_num]

    lines = content.split('\n')
    for index, line in enumerate(lines):
        if _TABLE_HEADER not in line:
            continue

        # The values are expected on the line right after the table header.
        next_line = lines[index + 1] if index + 1 < len(lines) else ''
        match = _MENSUEL_RE.search(next_line)
        if not match:
            continue

        try:
            brut_mensuel, net_imposable, _prelevement, net_paye_euros = (
                _parse_french_amount(group) for group in match.groups()
            )
        except ValueError:
            # Malformed figures: keep looking for another salary table.
            continue

        return {
            'month': month_name,
            'year': year,
            'brut_mensuel': brut_mensuel,
            'net_imposable': net_imposable,
            'net_paye_euros': net_paye_euros,
            'cumul_annuel': 0.0,
            'mode_paiement': _PAYMENT_MODE,
        }

    # No parsable salary table: return an "empty" record for this payslip.
    return {
        'month': '',
        'year': year,
        'brut_mensuel': 0.0,
        'net_imposable': 0.0,
        'net_paye_euros': 0.0,
        'cumul_annuel': 0.0,
        'mode_paiement': '',
    }


def process_sncf_pdf_files(directory, output_csv=False, output_dir='output/csv'):
    """Process SNCF salary PDF files with proper NET PAYÉ extraction.

    Every ``*.pdf`` directly inside ``directory`` is converted with the
    external ``pdftotext`` tool and turned into one transaction record.
    Files that cannot be converted are reported on stdout and skipped.
    A summary is always printed.

    Args:
        directory: Directory containing the payslip PDFs.
        output_csv: When true (and at least one record exists), write the
            records to ``sncf_all_transactions.csv`` inside ``output_dir``.
        output_dir: Destination directory for the CSV; created if missing.

    Returns:
        List of transaction dicts, ordered by PDF path. Records whose salary
        could not be extracted have a zero amount and carry an additional
        ``'Mode Paiement'`` key.
    """
    # Sorted so the returned list and the CSV are reproducible; glob order is
    # otherwise arbitrary.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))

    all_transactions = []

    for pdf_file in pdf_files:
        source = os.path.basename(pdf_file)

        try:
            # Convert PDF to text, preserving the column layout.
            result = subprocess.run(
                ['pdftotext', '-layout', pdf_file, '-'],
                capture_output=True, text=True, check=True,
            )
        except (subprocess.SubprocessError, OSError, ValueError) as e:
            # Tool missing, conversion failure or undecodable output:
            # best effort, report and move on to the next file.
            print(f"Error processing {pdf_file}: {e}")
            continue

        salary_data = extract_sncf_salary_data(result.stdout, source)
        month = salary_data['month']
        year = salary_data['year']

        # NOTE(review): the date embeds the English month name
        # (e.g. "01/January/2026") - confirm consumers expect that format.
        record = {
            'Date': f"01/{month}/{year}",
            'Description': f"Salaire {month} {year}",
            'Category': 'Salary',
            'Amount': salary_data['net_paye_euros'],
            'Source': source,
            'Brut Mensuel': salary_data['brut_mensuel'],
            'Net Imposable': salary_data['net_imposable'],
            'Cumul Annuel': salary_data['cumul_annuel'],
        }

        # Payslips without a usable salary still get an entry (for data
        # integrity); those entries additionally expose the payment mode.
        if not (month and salary_data['net_paye_euros'] > 0):
            record['Mode Paiement'] = salary_data['mode_paiement']

        all_transactions.append(record)

    # Output CSV with enhanced SNCF data
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            # extrasaction='ignore': some records carry 'Mode Paiement',
            # which is not a CSV column; the default ('raise') would abort
            # the export with ValueError.
            writer = csv.DictWriter(
                csvfile, fieldnames=_CSV_FIELDNAMES, extrasaction='ignore'
            )
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")

    # Calculate totals
    total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
    total_net = sum(t['Net Imposable'] for t in all_transactions)

    if total_brut > 0:
        print(f"Total Brut Mensuel: €{total_brut:,.2f}")
        print(f"Total Net Imposable: €{total_net:,.2f}")

    return all_transactions


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Process SNCF salary statements with enhanced NET PAYÉ extraction'
    )
    parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
                        help='Directory containing SNCF PDF files')
    parser.add_argument('--output-dir', default='output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    # Process all PDF files in the directory
    process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)