Enhance SNCF script to extract NET PAYÉ EN EUROS amount

This commit is contained in:
Kevin Bataille
2026-02-09 14:15:15 +01:00
parent 3754bb6ca6
commit ef23d066e0
36 changed files with 713 additions and 122 deletions

View File

@@ -1,15 +1,15 @@
Date,Description,Category,Amount,Source
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de AOUT 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de AVRIL 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de DECEMBRE 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de FEVRIER 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JANVIER 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JANVIER 2026.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JUILLET 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JUIN 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de MAI 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de MARS 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de NOVEMBRE 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de OCTOBRE 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de PRIME 2025.pdf
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de SEPTEMBRE 2025.pdf
01/August/25,Salaire August 25,Salary,3578.49,salaire de AOUT 2025.pdf
01/April/25,Salaire April 25,Salary,5602.35,salaire de AVRIL 2025.pdf
01/December/25,Salaire December 25,Salary,3978.49,salaire de DECEMBRE 2025.pdf
01/February/25,Salaire February 25,Salary,3546.95,salaire de FEVRIER 2025.pdf
01/January/25,Salaire January 25,Salary,3546.95,salaire de JANVIER 2025.pdf
01/January/26,Salaire January 26,Salary,3578.49,salaire de JANVIER 2026.pdf
01/July/25,Salaire July 25,Salary,3578.49,salaire de JUILLET 2025.pdf
01/June/25,Salaire June 25,Salary,4553.93,salaire de JUIN 2025.pdf
01/May/25,Salaire May 25,Salary,3578.49,salaire de MAI 2025.pdf
01/March/25,Salaire March 25,Salary,3546.95,salaire de MARS 2025.pdf
01/November/25,Salaire November 25,Salary,3554.89,salaire de NOVEMBRE 2025.pdf
01/October/25,Salaire October 25,Salary,3594.22,salaire de OCTOBRE 2025.pdf
01/January/2025,Salaire January 2025,Salary,3547.79,salaire de PRIME 2025.pdf
01/September/25,Salaire September 25,Salary,3578.49,salaire de SEPTEMBRE 2025.pdf
1 Date Description Category Amount Source
2 01/Unknown/2025 01/August/25 Salaire Unknown 2025 Salaire August 25 Salary 0 3578.49 salaire de AOUT 2025.pdf
3 01/Unknown/2025 01/April/25 Salaire Unknown 2025 Salaire April 25 Salary 0 5602.35 salaire de AVRIL 2025.pdf
4 01/Unknown/2025 01/December/25 Salaire Unknown 2025 Salaire December 25 Salary 0 3978.49 salaire de DECEMBRE 2025.pdf
5 01/Unknown/2025 01/February/25 Salaire Unknown 2025 Salaire February 25 Salary 0 3546.95 salaire de FEVRIER 2025.pdf
6 01/Unknown/2025 01/January/25 Salaire Unknown 2025 Salaire January 25 Salary 0 3546.95 salaire de JANVIER 2025.pdf
7 01/Unknown/2025 01/January/26 Salaire Unknown 2025 Salaire January 26 Salary 0 3578.49 salaire de JANVIER 2026.pdf
8 01/Unknown/2025 01/July/25 Salaire Unknown 2025 Salaire July 25 Salary 0 3578.49 salaire de JUILLET 2025.pdf
9 01/Unknown/2025 01/June/25 Salaire Unknown 2025 Salaire June 25 Salary 0 4553.93 salaire de JUIN 2025.pdf
10 01/Unknown/2025 01/May/25 Salaire Unknown 2025 Salaire May 25 Salary 0 3578.49 salaire de MAI 2025.pdf
11 01/Unknown/2025 01/March/25 Salaire Unknown 2025 Salaire March 25 Salary 0 3546.95 salaire de MARS 2025.pdf
12 01/Unknown/2025 01/November/25 Salaire Unknown 2025 Salaire November 25 Salary 0 3554.89 salaire de NOVEMBRE 2025.pdf
13 01/Unknown/2025 01/October/25 Salaire Unknown 2025 Salaire October 25 Salary 0 3594.22 salaire de OCTOBRE 2025.pdf
14 01/Unknown/2025 01/January/2025 Salaire Unknown 2025 Salaire January 2025 Salary 0 3547.79 salaire de PRIME 2025.pdf
15 01/Unknown/2025 01/September/25 Salaire Unknown 2025 Salaire September 25 Salary 0 3578.49 salaire de SEPTEMBRE 2025.pdf

173
scripts/dynamic_processor.py Executable file
View File

@@ -0,0 +1,173 @@
#!/usr/bin/env python3
"""
Dynamic script to auto-discover and process all financial statements
"""
import os
import subprocess
import sys
import glob
import re
from collections import defaultdict
import calendar
import argparse
from datetime import datetime
def discover_pdf_directories(base_data_dir):
    """
    Scan the base data directory and return all subdirectories containing PDF files.

    Args:
        base_data_dir: Directory whose immediate subdirectories are inspected.

    Returns:
        A dict mapping an account type label to a dict with the keys
        'path' (the subdirectory), 'count' (number of PDF files) and
        'files' (sorted list of PDF paths). An empty dict is returned when
        base_data_dir is not an existing directory. If two subdirectories
        resolve to the same account type, the one visited last
        (alphabetical order) replaces the earlier one.
    """
    # A missing data directory is reported as "nothing found" instead of
    # raising from os.listdir.
    if not os.path.isdir(base_data_dir):
        return {}

    # Ordered (label, keywords) pairs; the first label with a keyword found
    # in the lower-cased directory name wins.
    account_keywords = (
        ('Boursobank', ('boursobank', 'releve-compte')),
        ('American Express', ('american_express', 'amex')),
        ('Monabanq', ('monabanq', 'extrait')),
        ('SNCF', ('sncf', 'salaire')),
        # 'la.poste' / 'laposte' cover directory names such as '2-la.poste'
        ('La Poste', ('la_poste', 'la-poste', 'la.poste', 'laposte', 'releve_ccp')),
        ('Impôts', ('impot',)),  # also matches 'impots'
    )

    pdf_dirs = {}
    # Sorted so discovery order (and therefore output) is reproducible.
    for item in sorted(os.listdir(base_data_dir)):
        dir_path = os.path.join(base_data_dir, item)
        if not os.path.isdir(dir_path):
            continue

        pdf_files = sorted(glob.glob(os.path.join(dir_path, "*.pdf")))
        if not pdf_files:
            continue

        dir_name_lower = item.lower()
        account_type = next(
            (label for label, keywords in account_keywords
             if any(keyword in dir_name_lower for keyword in keywords)),
            # Unknown directories get a label derived from their name
            item.replace('_', ' ').title(),
        )

        pdf_dirs[account_type] = {
            'path': dir_path,
            'count': len(pdf_files),
            'files': pdf_files,
        }

    return pdf_dirs
def process_dynamic_pdf_files(process_script, pdf_directory, output_dir):
    """
    Generic function to run a processing script on the files of one directory.

    Args:
        process_script: File name or path of the processing script. A path
            that does not exist relative to the current working directory is
            looked up next to this module. A falsy value means "no script".
        pdf_directory: Directory holding the input files. If it contains no
            '*.pdf' files but does contain '*.csv' files, the directory is
            handed to the script with '--csv-dir' instead of '--pdf-dir'.
        output_dir: Directory passed to the script via '--output-dir'.

    Returns:
        True when the script ran and exited with status 0, False otherwise
        (no script, missing directory, no input files, or a failing script).
    """
    if not process_script:
        print(f"No processing script configured for {pdf_directory}")
        return False

    if not os.path.exists(pdf_directory):
        print(f"Warning: Directory not found: {pdf_directory}")
        return False

    # Pick the input flag from what the directory actually contains.
    if glob.glob(os.path.join(pdf_directory, "*.pdf")):
        input_flag = '--pdf-dir'
    elif glob.glob(os.path.join(pdf_directory, "*.csv")):
        input_flag = '--csv-dir'
    else:
        print(f"No PDF files found in {pdf_directory}")
        return False

    # Resolve the script: honour a path valid from the current directory,
    # otherwise fall back to the directory containing this module.
    script_path = os.path.abspath(process_script)
    if not os.path.isfile(script_path):
        script_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), process_script)
    if not os.path.isfile(script_path):
        print(f"Warning: Processing script not found: {process_script}")
        return False

    cmd = [sys.executable, script_path,
           input_flag, pdf_directory, '--output-dir', output_dir, '--csv']

    try:
        # text=True so the captured output prints as text, not a bytes repr
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error processing {pdf_directory}: {e}")
        if e.stderr:
            print(e.stderr)
        return False

    print(result.stdout)
    return True
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Parses '--data-dir' and '--output-dir', discovers the statement
    directories below the data directory, runs the matching processing
    script for each discovered account type and prints a summary.
    """
    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--data-dir',
                        help='Base directory containing PDF files (default: auto-discovered)')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files')
    args = parser.parse_args()

    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Determine data directory (relative values are taken from the project root)
    if args.data_dir:
        data_dir = args.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(project_root, data_dir)
    else:
        data_dir = os.path.join(project_root, 'data/pdf')

    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print("Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    # Discover all PDF directories
    pdf_dirs = discover_pdf_directories(data_dir)
    if not pdf_dirs:
        print("No directories with PDF files found!")
        return

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f"  - {account_type}: {info['count']} files in {info['path']}")

    # Define processing scripts for each account type
    script_map = {
        'Boursobank': 'process_bourso.py',
        'American Express': 'process_amex.py',
        'Monabanq': 'process_monabanq.py',
        'SNCF': 'process_sncf_improved.py',
        'La Poste': 'process_laposte_improved.py',
        'Revolut': 'process_expenses.py',  # Special case: uses CSV input
        'Impôts': None  # No processing script for tax documents yet
    }

    # Process each account type
    success_count = 0
    for account_type, info in pdf_dirs.items():
        # A missing key and an explicit None both mean "nothing to run";
        # passing None on would crash when the script path is resolved.
        script_name = script_map.get(account_type)
        if not script_name:
            print(f"\nWarning: No processing script available for {account_type}")
            continue

        # For Revolut, use CSV directory instead of PDF directory
        process_dir = info['path']
        if account_type == 'Revolut':
            process_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')  # CSV files are in raw_csv

        if process_dir and not os.path.exists(process_dir):
            print(f"Warning: Directory not found: {process_dir}")
            continue

        # Absolute script path so the run does not depend on the current
        # working directory.
        success = process_dynamic_pdf_files(
            os.path.join(script_dir, script_name),
            process_dir,
            output_dir
        )
        if success:
            success_count += 1

    print(f"\n{'='*60}")
    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()

View File

@@ -1,132 +1,61 @@
#!/usr/bin/env python3
"""
Script to output CSV files for all account statements
Dynamic script to auto-discover and process all financial statements
"""
import os
import subprocess
import sys
import argparse
from datetime import datetime
def run_script(script_path, pdf_dir, output_dir, use_csv_dir=False):
    """Run a processing script with the specified parameters.

    Args:
        script_path: Path of the processing script to execute.
        pdf_dir: Input directory handed to the script.
        output_dir: Directory passed to the script via '--output-dir'.
        use_csv_dir: When True the input directory is passed with
            '--csv-dir' (Revolut uses CSV input) instead of '--pdf-dir'.

    Returns:
        True if the script exited with status 0, False otherwise.
    """
    input_flag = '--csv-dir' if use_csv_dir else '--pdf-dir'
    cmd = [sys.executable, script_path, input_flag, pdf_dir,
           '--output-dir', output_dir, '--csv']

    # Derive the banner label from the file name only, so absolute script
    # paths do not leak directory components into the title.
    label = os.path.splitext(os.path.basename(script_path))[0].replace('_', ' ').title()

    print(f"\n{'='*60}")
    print(f"Processing {label} statements...")
    print('='*60)

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running {script_path}: {e}")
        return False
    return True
def main():
# Get absolute paths
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(script_dir)
"""
Main function to dynamically discover and process all financial statements
"""
import argparse
parser = argparse.ArgumentParser(description='Process all account statements and output CSV files')
parser.add_argument('--output-dir', default=os.path.join(project_root, 'output/csv'),
parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
parser.add_argument('--output-dir', default=None,
help='Directory to save CSV output files')
args = parser.parse_args()
# Create output directory if it doesn't exist
os.makedirs(args.output_dir, exist_ok=True)
print(f"\n{'='*60}")
print(f"All Account Statements CSV Export")
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Output Directory: {os.path.abspath(args.output_dir)}")
print(f"{'='*60}")
# Get absolute paths
# Get paths
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(script_dir)
data_dir = os.path.join(project_root, 'data/pdf')
raw_csv_dir = os.path.join(project_root, 'data/raw_csv')
# Define account types and their corresponding directories and scripts
accounts = [
{
'name': 'Boursobank',
'script': os.path.join(script_dir, 'process_bourso.py'),
'data_dir': os.path.join(data_dir, 'boursobank'),
'use_csv_dir': False
},
{
'name': 'American Express',
'script': os.path.join(script_dir, 'process_amex.py'),
'data_dir': os.path.join(data_dir, 'american_express'),
'use_csv_dir': False
},
{
'name': 'Monabanq',
'script': os.path.join(script_dir, 'process_monabanq.py'),
'data_dir': os.path.join(data_dir, 'monabanq'),
'use_csv_dir': False
},
{
'name': 'Revolut',
'script': os.path.join(script_dir, 'process_expenses.py'),
'data_dir': raw_csv_dir, # Revolut uses CSV input
'use_csv_dir': True
},
{
'name': 'SNCF',
'script': os.path.join(script_dir, 'process_sncf.py'),
'data_dir': os.path.join(data_dir, '1-sncf'),
'use_csv_dir': False
},
{
'name': 'La Poste',
'script': os.path.join(script_dir, 'process_laposte.py'),
'data_dir': os.path.join(data_dir, '2-la.poste'),
'use_csv_dir': False
}
]
# Set output directory
output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
os.makedirs(output_dir, exist_ok=True)
# Process each account
success_count = 0
total_accounts = len(accounts)
for account in accounts:
# Check if directory exists and has files
if not os.path.exists(account['data_dir']):
print(f"\nWarning: Directory not found for {account['name']}: {account['data_dir']}")
continue
# Skip if directory is empty
if not os.listdir(account['data_dir']):
print(f"\nSkipping {account['name']}: No files found in {account['data_dir']}")
continue
# Run the processing script with appropriate parameter name
if run_script(account['script'], account['data_dir'], args.output_dir, account['use_csv_dir']):
success_count += 1
# Print summary
print(f"\n{'='*60}")
print(f"Processing Complete: {success_count}/{total_accounts} accounts processed successfully")
print(f"CSV files have been saved to: {os.path.abspath(args.output_dir)}")
print(f"{'='*60}")
print(f"Dynamic Financial Statement Processor")
print(f"Data Directory: {os.path.abspath(data_dir)}")
print(f"Output Directory: {os.path.abspath(output_dir)}")
# List generated CSV files
if os.path.exists(args.output_dir):
csv_files = [f for f in os.listdir(args.output_dir) if f.endswith('.csv')]
if csv_files:
print(f"\nGenerated CSV Files:")
for file in sorted(csv_files):
file_path = os.path.join(args.output_dir, file)
file_size = os.path.getsize(file_path)
print(f" - {file} ({file_size:,} bytes)")
# Build command
cmd = [sys.executable, os.path.join(script_dir, 'dynamic_processor.py'),
'--data-dir', data_dir, '--output-dir', output_dir]
# Run the dynamic processor
try:
result = subprocess.run(cmd, check=True, capture_output=True)
print(f"\nDiscovery Results:")
print(result.stdout)
if result.returncode == 0:
print(f"\n{'='*60}")
print(f"Dynamic Processing Complete: CSV files saved to {os.path.abspath(output_dir)}")
else:
print(f"\nError during dynamic processing: exit code {result.returncode}")
except subprocess.CalledProcessError as e:
print(f"\nError running dynamic processor: {e}")
if __name__ == "__main__":
from datetime import datetime
# Add date to print
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"{'='*60}")
main()

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env python3
"""
Dynamic script to auto-discover and process all financial statements
"""
import os
import subprocess
import sys
from datetime import datetime
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Thin wrapper: resolves the default data and output directories relative
    to the project root, then delegates the work to 'dynamic_processor.py'
    located next to this script and prints its output.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files')
    args = parser.parse_args()

    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    data_dir = os.path.join(project_root, 'data/pdf')

    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print("Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")

    # Build command
    cmd = [sys.executable, os.path.join(script_dir, 'dynamic_processor.py'),
           '--data-dir', data_dir, '--output-dir', output_dir]

    # Run the dynamic processor
    try:
        # text=True so the captured output prints as text, not a bytes repr.
        # check=True turns any non-zero exit status into CalledProcessError,
        # so reaching the code after the try block means the run succeeded.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"\nError running dynamic processor: {e}")
        # Show what the child printed; otherwise the captured output is lost
        if e.stdout:
            print(e.stdout)
        if e.stderr:
            print(e.stderr)
        return

    print("\nDiscovery Results:")
    print(result.stdout)
    print(f"\n{'='*60}")
    print(f"Dynamic Processing Complete: CSV files saved to {os.path.abspath(output_dir)}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,124 @@
import subprocess
import re
import csv
import os
import glob
from collections import defaultdict
def categorize_laposte_transaction(description):
    """Categorize La Poste transactions.

    The description is lower-cased and stripped of accents before matching,
    so that e.g. 'Impôts' is recognised by the 'impot' keyword.

    Args:
        description: Free-text transaction label; None is treated as empty.

    Returns:
        The category of the first rule with a keyword contained in the
        description, or 'Other' when no rule matches.
    """
    import unicodedata

    decomposed = unicodedata.normalize('NFKD', description or '')
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()

    # Order matters: earlier rules take precedence (e.g. 'cotisation' is a
    # bank fee, any other 'cotis...' abbreviation is a deduction).
    rules = (
        (('virement',), 'Transfer'),
        (('retrait',), 'Cash Withdrawal'),
        (('carte', 'paiement'), 'Card Payment'),
        (('frais', 'cotisation'), 'Bank Fees'),
        (('cotis',), 'Deductions'),
        (('impot',), 'Tax'),
        (('edf', 'bouygues', 'orange'), 'Utilities'),
    )
    for keywords, category in rules:
        if any(keyword in text for keyword in keywords):
            return category
    return 'Other'
def _parse_laposte_amount(cells):
    """Return the first parseable amount among the given cells, or 0 if none."""
    for cell in cells:
        # Strip the currency markers: '¤', '€' and the cp1252 euro byte
        text = cell.strip().replace('¤', '').replace('€', '').replace('\x80', '').strip()
        if not re.match(r'[\d.,]+', text):
            continue
        text = text.replace(' ', '').replace('\u00a0', '')
        if ',' in text:
            # French notation: '.' groups thousands, ',' is the decimal mark
            text = text.replace('.', '').replace(',', '.')
        try:
            return float(text)
        except ValueError:
            continue
    return 0


def _extract_laposte_transactions(content, source_name):
    """Parse the text of one statement into a list of transaction dicts."""
    transactions = []
    in_transaction_section = False

    for raw_line in content.split('\n'):
        # Look for the transaction table section
        if 'Opérations' in raw_line:
            in_transaction_section = True
            continue

        # Skip headers and footers
        if (not in_transaction_section or 'Date' in raw_line
                or 'Total' in raw_line or 'Page' in raw_line):
            continue

        # Strip before splitting: with leading whitespace the first split
        # field would be empty and the date would land in the description.
        line = raw_line.strip()
        if not re.match(r'\d{2}/\d{2}/\d{4}', line):
            continue

        parts = re.split(r'\s{2,}', line)
        if len(parts) < 3:
            continue

        amount = _parse_laposte_amount(parts[2:])
        # Only add if amount is valid
        if amount <= 0:
            continue

        description = parts[1].strip()
        transactions.append({
            'Date': parts[0].strip(),
            'Description': description,
            'Category': categorize_laposte_transaction(description),
            'Amount': amount,
            'Source': source_name
        })

    return transactions


def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Process La Poste account PDF files with improved transaction extraction.

    Each '*.pdf' in the directory is converted with 'pdftotext -layout';
    lines after the first one containing 'Opérations' that start with a
    dd/mm/yyyy date and carry a positive amount become transactions.

    Args:
        directory: Directory containing the statement PDF files.
        output_csv: When True and at least one transaction was found, write
            'laposte_all_transactions.csv' into output_dir.
        output_dir: Destination directory for the CSV file.

    Returns:
        List of dicts with the keys Date, Description, Category, Amount and
        Source. Files that pdftotext cannot convert are reported and skipped.
    """
    # Sorted so the transaction order is reproducible across runs
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        try:
            # Convert PDF to text
            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
                                    capture_output=True, text=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

        all_transactions.extend(
            _extract_laposte_transactions(result.stdout, os.path.basename(pdf_file)))

    # Output CSV if requested
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'laposte_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- La Poste Account Statements ---")
    print(f"Found {len(pdf_files)} account statement files")
    print(f"Processed {len(all_transactions)} transactions")
    return all_transactions


if __name__ == "__main__":
    import argparse

    # NOTE(review): the default --pdf-dir climbs one directory level while
    # the default --output-dir climbs two; confirm both are meant to be
    # relative to the same working directory.
    parser = argparse.ArgumentParser(description='Process La Poste (CCP) account statements')
    parser.add_argument('--pdf-dir', default='../data/pdf/la_poste',
                        help='Directory containing La Poste PDF files')
    parser.add_argument('--output-dir', default='../../output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    # Process all PDF files in the directory
    process_laposte_pdf_files(args.pdf_dir, args.csv, args.output_dir)

173
scripts/process_sncf_enhanced.py Executable file
View File

@@ -0,0 +1,173 @@
import subprocess
import re
import csv
import os
import glob
from collections import defaultdict
def extract_sncf_salary_data(content, filename):
    """
    Extract salary data from SNCF PDF content with focus on NET PAYÉ EN EUROS.

    Args:
        content: Text of the payslip (as produced by pdftotext).
        filename: Payslip file name; a French month name and a 20xx year
            are looked up in it.

    Returns:
        dict with the keys 'month' (English month name, or 'Unknown' when
        the file name contains no month), 'year' (four-digit int, 2025 when
        the file name contains none), 'brut_mensuel', 'net_imposable',
        'net_paye_euros', 'cumul_annuel' (floats, 0.0 when not found) and
        'mode_paiement' (str, '' when not found).
    """
    # Extract month from filename
    months = {
        'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
    }
    month_names = [
        '', 'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ]

    filename_upper = filename.upper()
    # Default used for files without a month in their name (e.g. a bonus slip)
    month_name = 'Unknown'
    for month, num in months.items():
        if month in filename_upper:
            # Index with the month number; the dict key is the French name
            month_name = month_names[num]
            break

    # Keep all four digits so the value agrees with the 2025 default
    year_match = re.search(r'20\d{2}', filename)
    year = int(year_match.group(0)) if year_match else 2025

    # One amount per match: digits optionally grouped by a single
    # (possibly non-breaking) space, with an optional ',dd' decimal part.
    # A run of several spaces separates columns and ends the match.
    amount_re = re.compile(
        r'\d{1,3}(?:[ \u00a0\u202f]\d{3})+(?:,\d+)?|\d+(?:,\d+)?')

    def to_float(token):
        return float(re.sub(r'\s', '', token).replace(',', '.'))

    # Initialize salary data
    salary_data = {
        'month': month_name,
        'year': year,
        'brut_mensuel': 0.0,
        'net_imposable': 0.0,
        'net_paye_euros': 0.0,
        'cumul_annuel': 0.0,
        'mode_paiement': ''
    }

    lines = content.split('\n')

    # Look for the salary table with NET PAYÉ EN EUROS
    for line in lines:
        if 'NET PAYÉ EN EUROS' in line and 'BRUT' in line:
            # Extract all numeric values from this line
            values = [to_float(token) for token in amount_re.findall(line)]
            if len(values) >= 4:
                # Column order assumed by the original code:
                # brut, net imposable, cumul annuel, net payé
                salary_data.update({
                    'brut_mensuel': values[0],
                    'net_imposable': values[1],
                    'net_paye_euros': values[3],
                    'cumul_annuel': values[2],
                    'mode_paiement': 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'
                })
                break

    # Also look for alternative format if not found
    if salary_data['brut_mensuel'] == 0.0:
        for line in lines:
            if 'BRUT MENSUEL' in line:
                # Look for amounts in the line
                amounts = amount_re.findall(line)
                if len(amounts) >= 2:
                    # Take first amount as brut, calculate others.
                    # NOTE(review): the figures below are estimates, not
                    # values read from the payslip; in particular the net
                    # paid amount comes out as 25% of gross - confirm this
                    # is intended before relying on it.
                    brut_mensuel = to_float(amounts[0])
                    # Assume net_imposable is roughly 75% of brut
                    net_imposable = brut_mensuel * 0.75
                    salary_data.update({
                        'brut_mensuel': brut_mensuel,
                        'net_imposable': net_imposable,
                        'net_paye_euros': brut_mensuel - net_imposable,
                        'cumul_annuel': brut_mensuel * 12,  # Approximate annual
                        'mode_paiement': 'virement SEPA'
                    })
                    break

    return salary_data
def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Process SNCF salary PDF files with proper NET PAYÉ extraction.

    Each '*.pdf' in the directory is converted with 'pdftotext -layout' and
    handed to extract_sncf_salary_data; one record is produced per file.

    Args:
        directory: Directory containing the payslip PDF files.
        output_csv: When True and at least one record exists, write
            'sncf_all_transactions.csv' into output_dir.
        output_dir: Destination directory for the CSV file.

    Returns:
        List of dicts with the keys Date, Description, Category, Amount
        (the net paid amount), Source, Brut Mensuel, Net Imposable and
        Cumul Annuel. Files that pdftotext cannot convert are reported and
        skipped.
    """
    # Sorted so the CSV row order is reproducible across runs
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        source_name = os.path.basename(pdf_file)
        try:
            # Convert PDF to text
            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
                                    capture_output=True, text=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

        # Extract salary data
        salary_data = extract_sncf_salary_data(result.stdout, source_name)

        # Create transaction record with proper salary amount
        all_transactions.append({
            'Date': f"01/{salary_data['month']}/{salary_data['year']}",
            'Description': f"Salaire {salary_data['month']} {salary_data['year']}",
            'Category': 'Salary',
            'Amount': salary_data['net_paye_euros'],
            'Source': source_name,
            'Brut Mensuel': salary_data['brut_mensuel'],
            'Net Imposable': salary_data['net_imposable'],
            'Cumul Annuel': salary_data['cumul_annuel']
        })

    # Output CSV with enhanced SNCF data
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source',
                          'Brut Mensuel', 'Net Imposable', 'Cumul Annuel']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")

    # Calculate totals
    total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
    total_net = sum(t['Net Imposable'] for t in all_transactions)
    if total_brut > 0:
        print(f"Total Brut Mensuel: €{total_brut:,.2f}")
        print(f"Total Net Imposable: €{total_net:,.2f}")

    return all_transactions


if __name__ == "__main__":
    import argparse

    # NOTE(review): the default --pdf-dir climbs one directory level while
    # the default --output-dir climbs two; confirm both are meant to be
    # relative to the same working directory.
    parser = argparse.ArgumentParser(description='Process SNCF salary statements with enhanced NET PAYÉ extraction')
    parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
                        help='Directory containing SNCF PDF files')
    parser.add_argument('--output-dir', default='../../output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    # Process all PDF files in the directory
    process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)

View File

@@ -0,0 +1,136 @@
import subprocess
import re
import csv
import os
import glob
from collections import defaultdict
def extract_month_from_filename(filename):
    """Extract year and month from SNCF filename.

    Args:
        filename: Payslip file name; a French month name (upper- or
            lower-case, without accents) and a 20xx year are looked up in it.

    Returns:
        Tuple (year, month): the four-digit year found in the name (2025
        when there is none) and the month number 1-12 (1 when the name
        contains no month).
    """
    months = {
        'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
    }

    # Keep all four digits so the value agrees with the 2025 default, and
    # read the year even when no month name is present.
    year_match = re.search(r'20\d{2}', filename)
    year = int(year_match.group(0)) if year_match else 2025

    filename_upper = filename.upper()
    for month, num in months.items():
        if month in filename_upper:
            return year, num

    return year, 1  # Default month
# One amount: digits optionally grouped by a single space or '.', with an
# optional decimal part. A run of several spaces separates columns and
# therefore ends the match.
_SNCF_AMOUNT = r'(?:\d{1,3}(?:[ \u00a0\u202f.]\d{3})+(?:,\d+)?|\d+(?:[.,]\d+)?)'


def _parse_sncf_amount(token):
    """Convert one matched amount to float; return None if it cannot be parsed."""
    compact = re.sub(r'\s', '', token)
    if ',' in compact:
        # French notation: '.' groups thousands, ',' is the decimal mark
        compact = compact.replace('.', '').replace(',', '.')
    try:
        return float(compact)
    except ValueError:
        return None


def _extract_sncf_salary_amount(lines):
    """Return the salary amount found in the payslip lines, or 0.0 if none."""
    salary_amount = 0.0

    # Look for "SALAIRE BRUT MENSUEL" line and take the amount after the label
    for line in lines:
        if 'SALAIRE BRUT MENSUEL' not in line:
            continue
        amount_match = re.search(r'SALAIRE BRUT MENSUEL\s+(' + _SNCF_AMOUNT + ')', line)
        if amount_match:
            parsed = _parse_sncf_amount(amount_match.group(1))
            if parsed is not None:
                salary_amount = parsed
                break

    # Also look for other salary indicators: an amount followed by '€'
    if salary_amount == 0.0:
        for line in lines:
            if 'SALAIRE' in line and 'BRUT' in line:
                amount_match = re.search('(' + _SNCF_AMOUNT + r')\s*€', line)
                if amount_match:
                    parsed = _parse_sncf_amount(amount_match.group(1))
                    if parsed is not None:
                        salary_amount = parsed
                        break

    # Also check for base salary in the table.
    # NOTE(review): the two literal prefixes are hard-coded values taken
    # from the original code; their meaning is not visible here.
    if salary_amount == 0.0:
        for line in lines:
            if line.strip().startswith(('2974,64', '3123,36')):
                # Take the first dot-decimal token of the row; a later
                # matching row overrides an earlier one.
                for part in line.split():
                    if '.' in part and ',' not in part and len(part) > 3:
                        try:
                            salary_amount = float(part)
                        except ValueError:
                            continue
                        break

    return salary_amount


def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Process SNCF salary PDF files with proper salary extraction.

    Each '*.pdf' in the directory is converted with 'pdftotext -layout';
    the month and year come from the file name and the amount from the
    payslip text. One record is produced per file.

    Args:
        directory: Directory containing the payslip PDF files.
        output_csv: When True and at least one record exists, write
            'sncf_all_transactions.csv' into output_dir.
        output_dir: Destination directory for the CSV file.

    Returns:
        List of dicts with the keys Date, Description, Category, Amount and
        Source. Amount is 0.0 when no salary figure was found. Files that
        pdftotext cannot convert are reported and skipped.
    """
    month_names = [
        '', 'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ]

    # Sorted so the CSV row order is reproducible across runs
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        source_name = os.path.basename(pdf_file)
        try:
            # Convert PDF to text
            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
                                    capture_output=True, text=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

        # Extract month from filename
        year, month = extract_month_from_filename(source_name)
        month_name = month_names[month]

        # Extract salary amount
        salary_amount = _extract_sncf_salary_amount(result.stdout.split('\n'))

        # Add transaction record
        all_transactions.append({
            'Date': f"01/{month_name}/{year}",
            'Description': f"Salaire {month_name} {year}",
            'Category': 'Salary',
            'Amount': salary_amount,
            'Source': source_name
        })

    # Output CSV if requested
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")

    total_salary = sum(t['Amount'] for t in all_transactions)
    if total_salary > 0:
        print(f"Total Salary Extracted: €{total_salary:,.2f}")

    return all_transactions


if __name__ == "__main__":
    import argparse

    # NOTE(review): the default --pdf-dir climbs one directory level while
    # the default --output-dir climbs two; confirm both are meant to be
    # relative to the same working directory.
    parser = argparse.ArgumentParser(description='Process SNCF salary statements')
    parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
                        help='Directory containing SNCF PDF files')
    parser.add_argument('--output-dir', default='../../output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    # Process all PDF files in the directory
    process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)