import subprocess
import re
import csv
import os
import glob
from collections import defaultdict

# French month names as they are searched for in the upper-cased filename,
# mapped to the English month names stored in the output records.
# Order matters: the first name found in the filename wins.
_MONTH_NAMES = {
    'JANVIER': 'January',
    'FEVRIER': 'February',
    'MARS': 'March',
    'AVRIL': 'April',
    'MAI': 'May',
    'JUIN': 'June',
    'JUILLET': 'July',
    'AOUT': 'August',
    'SEPTEMBRE': 'September',
    'OCTOBRE': 'October',
    'NOVEMBRE': 'November',
    'DECEMBRE': 'December',
}

# Values used when the filename carries no recognisable month / 20xx year.
_UNKNOWN_MONTH = 'Unknown'
_DEFAULT_YEAR = 2025

# A French-formatted amount: either digit groups separated by a single
# (possibly non-breaking) space with an optional ",dd" part ("2 500,00"),
# or a plain run of digits with an optional ",dd" part ("2500,00").
# Matching amounts individually keeps neighbouring columns from being
# glued together by the whitespace between them.
_AMOUNT_RE = re.compile(
    r'\d{1,3}(?:[ \u00a0\u202f]\d{3})+(?!\d)(?:,\d+)?'
    r'|\d+(?:,\d+)?'
)

# Drops thousands separators and turns the decimal comma into a dot.
_AMOUNT_TO_FLOAT = str.maketrans(
    {',': '.', ' ': None, '\u00a0': None, '\u202f': None}
)


def _find_amounts(line):
    """Return every French-formatted amount in *line* as a float, in order."""
    return [
        float(raw.translate(_AMOUNT_TO_FLOAT))
        for raw in _AMOUNT_RE.findall(line)
    ]


def _parse_period(filename):
    """Return ``(month_name, year)`` derived from a payslip filename.

    The month is the English name of the first French month name found in
    the upper-cased filename, or ``'Unknown'`` when none is present. The
    year is the first ``20xx`` sequence in the filename as a four-digit
    int, or 2025 when none is present.
    """
    filename_upper = filename.upper()
    month_name = next(
        (english for french, english in _MONTH_NAMES.items()
         if french in filename_upper),
        _UNKNOWN_MONTH,
    )
    year_match = re.search(r'20\d{2}', filename)
    year = int(year_match.group(0)) if year_match else _DEFAULT_YEAR
    return month_name, year


def extract_sncf_salary_data(content, filename):
    """Extract salary data from SNCF PDF content with focus on NET PAYÉ EN EUROS.

    Args:
        content: Text of one payslip (as produced by ``pdftotext -layout``).
        filename: Name of the PDF; the pay period is derived from it.

    Returns:
        A dict with the keys ``month``, ``year``, ``brut_mensuel``,
        ``net_imposable``, ``net_paye_euros``, ``cumul_annuel`` and
        ``mode_paiement``. Amounts stay at ``0.0`` and ``mode_paiement``
        stays empty when nothing could be extracted.
    """
    month_name, year = _parse_period(filename)

    salary_data = {
        'month': month_name,
        'year': year,
        'brut_mensuel': 0.0,
        'net_imposable': 0.0,
        'net_paye_euros': 0.0,
        'cumul_annuel': 0.0,
        'mode_paiement': '',
    }

    lines = content.split('\n')

    # Primary format: one line carrying both labels and at least four amounts.
    for line in lines:
        if 'NET PAYÉ EN EUROS' not in line or 'BRUT' not in line:
            continue
        values = _find_amounts(line)
        if len(values) < 4:
            continue
        # Column order on the line: brut, net imposable, cumul, net payé.
        salary_data.update(
            brut_mensuel=values[0],
            net_imposable=values[1],
            net_paye_euros=values[3],
            cumul_annuel=values[2],
            mode_paiement=(
                'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'
            ),
        )
        break

    # Alternative format, used only when no gross amount was found above.
    if salary_data['brut_mensuel'] == 0.0:
        for line in lines:
            if 'BRUT MENSUEL' not in line:
                continue
            amounts = _find_amounts(line)
            if len(amounts) < 2:
                continue
            brut_mensuel = amounts[0]
            # Only the gross amount is read; the rest are rough estimates.
            net_imposable = brut_mensuel * 0.75
            # NOTE(review): this makes the estimated net paid 25% of gross,
            # which looks unintended — confirm the intended formula.
            net_paye_euros = brut_mensuel - net_imposable
            salary_data.update(
                brut_mensuel=brut_mensuel,
                net_imposable=net_imposable,
                net_paye_euros=net_paye_euros,
                cumul_annuel=brut_mensuel * 12,  # approximate annual total
                mode_paiement='virement SEPA',
            )
            break

    return salary_data


def process_sncf_pdf_files(directory, output_csv=False,
                           output_dir='../../output/csv'):
    """Process SNCF salary PDF files with proper NET PAYÉ extraction.

    Every ``*.pdf`` directly inside *directory* is converted to text with
    the external ``pdftotext`` tool and turned into one transaction record.
    Files whose conversion fails are reported on stdout and skipped.

    Args:
        directory: Directory containing the payslip PDFs.
        output_csv: When true (and at least one record was produced), write
            the records to ``sncf_all_transactions.csv`` in *output_dir*.
        output_dir: Directory for the CSV file; created if missing.

    Returns:
        The list of transaction dicts, one per successfully converted PDF.
    """
    # glob order is arbitrary; sort so the output order is reproducible.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        try:
            # pdftotext emits UTF-8 by default; decode it explicitly so the
            # accented 'NET PAYÉ' label matches regardless of the locale.
            result = subprocess.run(
                ['pdftotext', '-layout', pdf_file, '-'],
                capture_output=True, encoding='utf-8', check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

        source_name = os.path.basename(pdf_file)
        salary_data = extract_sncf_salary_data(result.stdout, source_name)

        all_transactions.append({
            'Date': f"01/{salary_data['month']}/{salary_data['year']}",
            'Description':
                f"Salaire {salary_data['month']} {salary_data['year']}",
            'Category': 'Salary',
            'Amount': salary_data['net_paye_euros'],
            'Source': source_name,
            'Brut Mensuel': salary_data['brut_mensuel'],
            'Net Imposable': salary_data['net_imposable'],
            'Cumul Annuel': salary_data['cumul_annuel'],
        })

    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount',
                          'Source', 'Brut Mensuel', 'Net Imposable',
                          'Cumul Annuel']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")

    total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
    total_net = sum(t['Net Imposable'] for t in all_transactions)
    if total_brut > 0:
        print(f"Total Brut Mensuel: €{total_brut:,.2f}")
        print(f"Total Net Imposable: €{total_net:,.2f}")

    return all_transactions


def main():
    """Parse command-line options and process the payslip directory."""
    import argparse

    parser = argparse.ArgumentParser(
        description='Process SNCF salary statements with enhanced NET PAYÉ '
                    'extraction')
    parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
                        help='Directory containing SNCF PDF files')
    parser.add_argument('--output-dir', default='../../output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)


if __name__ == "__main__":
    main()