Enhance SNCF script to extract NET PAYÉ EN EUROS amount
This commit is contained in:
@@ -1,15 +1,15 @@
|
||||
Date,Description,Category,Amount,Source
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de AOUT 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de AVRIL 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de DECEMBRE 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de FEVRIER 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JANVIER 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JANVIER 2026.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JUILLET 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de JUIN 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de MAI 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de MARS 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de NOVEMBRE 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de OCTOBRE 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de PRIME 2025.pdf
|
||||
01/Unknown/2025,Salaire Unknown 2025,Salary,0,salaire de SEPTEMBRE 2025.pdf
|
||||
01/August/25,Salaire August 25,Salary,3578.49,salaire de AOUT 2025.pdf
|
||||
01/April/25,Salaire April 25,Salary,5602.35,salaire de AVRIL 2025.pdf
|
||||
01/December/25,Salaire December 25,Salary,3978.49,salaire de DECEMBRE 2025.pdf
|
||||
01/February/25,Salaire February 25,Salary,3546.95,salaire de FEVRIER 2025.pdf
|
||||
01/January/25,Salaire January 25,Salary,3546.95,salaire de JANVIER 2025.pdf
|
||||
01/January/26,Salaire January 26,Salary,3578.49,salaire de JANVIER 2026.pdf
|
||||
01/July/25,Salaire July 25,Salary,3578.49,salaire de JUILLET 2025.pdf
|
||||
01/June/25,Salaire June 25,Salary,4553.93,salaire de JUIN 2025.pdf
|
||||
01/May/25,Salaire May 25,Salary,3578.49,salaire de MAI 2025.pdf
|
||||
01/March/25,Salaire March 25,Salary,3546.95,salaire de MARS 2025.pdf
|
||||
01/November/25,Salaire November 25,Salary,3554.89,salaire de NOVEMBRE 2025.pdf
|
||||
01/October/25,Salaire October 25,Salary,3594.22,salaire de OCTOBRE 2025.pdf
|
||||
01/January/2025,Salaire January 2025,Salary,3547.79,salaire de PRIME 2025.pdf
|
||||
01/September/25,Salaire September 25,Salary,3578.49,salaire de SEPTEMBRE 2025.pdf
|
||||
|
||||
|
173
scripts/dynamic_processor.py
Executable file
173
scripts/dynamic_processor.py
Executable file
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dynamic script to auto-discover and process all financial statements
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import glob
|
||||
import re
|
||||
from collections import defaultdict
|
||||
import calendar
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
def discover_pdf_directories(base_data_dir):
    """Map account-type labels to subdirectories of *base_data_dir* holding PDFs.

    Returns ``{account_type: {'path': dir, 'count': n, 'files': [paths]}}``.
    Directories without any ``*.pdf`` file are ignored; if two directories
    resolve to the same account type, the later one (listdir order) wins.
    """
    # Substring -> label rules, tried in this order; the first hit wins.
    rules = [
        (('boursobank', 'releve-compte'), 'Boursobank'),
        (('american_express', 'amex'), 'American Express'),
        (('monabanq', 'extrait'), 'Monabanq'),
        (('sncf', 'salaire'), 'SNCF'),
        (('la_poste', 'la-poste', 'releve_ccp'), 'La Poste'),
        (('impots', 'impot'), 'Impôts'),
    ]

    discovered = {}
    for entry in os.listdir(base_data_dir):
        full_path = os.path.join(base_data_dir, entry)
        if not os.path.isdir(full_path):
            continue

        statements = glob.glob(os.path.join(full_path, "*.pdf"))
        if not statements:
            continue

        lowered = entry.lower()
        for needles, label in rules:
            if any(needle in lowered for needle in needles):
                account_type = label
                break
        else:
            # Unknown directory: derive a readable label from its name.
            account_type = entry.replace('_', ' ').title()

        discovered[account_type] = {
            'path': full_path,
            'count': len(statements),
            'files': statements,
        }

    return discovered
|
||||
|
||||
def process_dynamic_pdf_files(process_script, pdf_directory, output_dir):
    """Run *process_script* over every PDF in *pdf_directory*.

    The script is invoked with the current interpreter as
    ``python <script> --pdf-dir <dir> --output-dir <dir> --csv``.

    Returns True on success and False when the directory is missing or
    empty or the child process fails.  (Previously the failure paths
    returned a mix of ``[]``, ``False`` and ``0``; callers only test
    truthiness, so unifying on booleans is backward-compatible.)
    """
    if not os.path.exists(pdf_directory):
        print(f"Warning: Directory not found: {pdf_directory}")
        return False

    pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))
    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return False

    # os.path.join(dirname(p), basename(p)) is just p, so use the absolute
    # script path directly instead of splitting and re-joining it.
    cmd = [sys.executable, os.path.abspath(process_script),
           '--pdf-dir', pdf_directory, '--output-dir', output_dir, '--csv']

    try:
        # text=True so stdout is a str (the old code printed raw bytes).
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(result.stdout)
        # check=True already raised for any non-zero exit status, so the
        # old `return result.returncode == 0` could only ever be True.
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error processing {pdf_directory}: {e}")
        return False
|
||||
|
||||
def main():
    """
    Main function to dynamically discover and process all financial statements
    """
    arg_parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    arg_parser.add_argument('--data-dir',
                            help='Base directory containing PDF files (default: auto-discovered)')
    arg_parser.add_argument('--output-dir', default=None,
                            help='Directory to save CSV output files')
    options = arg_parser.parse_args()

    # Resolve the project layout relative to this script's location.
    here = os.path.dirname(os.path.abspath(__file__))
    root = os.path.dirname(here)

    # Data directory: CLI value (made absolute against the project root
    # when relative) or the default data/pdf tree.
    if options.data_dir:
        data_dir = options.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(root, data_dir)
    else:
        data_dir = os.path.join(root, 'data/pdf')

    output_dir = options.output_dir or os.path.join(root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{banner}")

    # Find every subdirectory that actually contains PDF statements.
    pdf_dirs = discover_pdf_directories(data_dir)
    if not pdf_dirs:
        print("No directories with PDF files found!")
        return

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f" - {account_type}: {info['count']} files in {info['path']}")

    # Which processing script handles each account type.
    script_map = {
        'Boursobank': 'process_bourso.py',
        'American Express': 'process_amex.py',
        'Monabanq': 'process_monabanq.py',
        'SNCF': 'process_sncf_improved.py',
        'La Poste': 'process_laposte_improved.py',
        'Revolut': 'process_expenses.py',  # Special case: uses CSV input
        'Impôts': None  # No processing script for tax documents yet
    }

    success_count = 0
    for account_type, info in pdf_dirs.items():
        if account_type not in script_map:
            print(f"\nWarning: No processing script available for {account_type}")
            continue

        process_dir = info['path']
        if account_type == 'Revolut':
            # Revolut exports live as CSVs next to the PDF tree.
            process_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')  # CSV files are in raw_csv

        if process_dir and not os.path.exists(process_dir):
            print(f"Warning: Directory not found: {process_dir}")
            continue

        if process_dynamic_pdf_files(script_map[account_type], process_dir, output_dir):
            success_count += 1

    print(f"\n{banner}")
    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
    print(f"{banner}")


if __name__ == "__main__":
    main()
|
||||
@@ -1,132 +1,61 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to output CSV files for all account statements
|
||||
Dynamic script to auto-discover and process all financial statements
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
def run_script(script_path, pdf_dir, output_dir, use_csv_dir=False):
    """Invoke one statement-processing script; True on success, False on failure."""
    # Revolut-style processors take their input as --csv-dir, the rest as --pdf-dir.
    input_flag = '--csv-dir' if use_csv_dir else '--pdf-dir'
    cmd = [sys.executable, script_path, input_flag, pdf_dir, '--output-dir', output_dir, '--csv']

    # Derive a human-readable title from the script filename for the banner.
    title = script_path.replace('../scripts/', '').replace('.py', '').replace('_', ' ').title()
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Processing {title} statements...")
    print(rule)

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as exc:
        print(f"Error running {script_path}: {exc}")
        return False
    return True
|
||||
|
||||
def main():
|
||||
# Get absolute paths
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.dirname(script_dir)
|
||||
"""
|
||||
Main function to dynamically discover and process all financial statements
|
||||
"""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process all account statements and output CSV files')
|
||||
parser.add_argument('--output-dir', default=os.path.join(project_root, 'output/csv'),
|
||||
parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
|
||||
parser.add_argument('--output-dir', default=None,
|
||||
help='Directory to save CSV output files')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Create output directory if it doesn't exist
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"All Account Statements CSV Export")
|
||||
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f"Output Directory: {os.path.abspath(args.output_dir)}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
# Get absolute paths
|
||||
# Get paths
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.dirname(script_dir)
|
||||
data_dir = os.path.join(project_root, 'data/pdf')
|
||||
raw_csv_dir = os.path.join(project_root, 'data/raw_csv')
|
||||
|
||||
# Define account types and their corresponding directories and scripts
|
||||
accounts = [
|
||||
{
|
||||
'name': 'Boursobank',
|
||||
'script': os.path.join(script_dir, 'process_bourso.py'),
|
||||
'data_dir': os.path.join(data_dir, 'boursobank'),
|
||||
'use_csv_dir': False
|
||||
},
|
||||
{
|
||||
'name': 'American Express',
|
||||
'script': os.path.join(script_dir, 'process_amex.py'),
|
||||
'data_dir': os.path.join(data_dir, 'american_express'),
|
||||
'use_csv_dir': False
|
||||
},
|
||||
{
|
||||
'name': 'Monabanq',
|
||||
'script': os.path.join(script_dir, 'process_monabanq.py'),
|
||||
'data_dir': os.path.join(data_dir, 'monabanq'),
|
||||
'use_csv_dir': False
|
||||
},
|
||||
{
|
||||
'name': 'Revolut',
|
||||
'script': os.path.join(script_dir, 'process_expenses.py'),
|
||||
'data_dir': raw_csv_dir, # Revolut uses CSV input
|
||||
'use_csv_dir': True
|
||||
},
|
||||
{
|
||||
'name': 'SNCF',
|
||||
'script': os.path.join(script_dir, 'process_sncf.py'),
|
||||
'data_dir': os.path.join(data_dir, '1-sncf'),
|
||||
'use_csv_dir': False
|
||||
},
|
||||
{
|
||||
'name': 'La Poste',
|
||||
'script': os.path.join(script_dir, 'process_laposte.py'),
|
||||
'data_dir': os.path.join(data_dir, '2-la.poste'),
|
||||
'use_csv_dir': False
|
||||
}
|
||||
]
|
||||
# Set output directory
|
||||
output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# Process each account
|
||||
success_count = 0
|
||||
total_accounts = len(accounts)
|
||||
|
||||
for account in accounts:
|
||||
# Check if directory exists and has files
|
||||
if not os.path.exists(account['data_dir']):
|
||||
print(f"\nWarning: Directory not found for {account['name']}: {account['data_dir']}")
|
||||
continue
|
||||
|
||||
# Skip if directory is empty
|
||||
if not os.listdir(account['data_dir']):
|
||||
print(f"\nSkipping {account['name']}: No files found in {account['data_dir']}")
|
||||
continue
|
||||
|
||||
# Run the processing script with appropriate parameter name
|
||||
if run_script(account['script'], account['data_dir'], args.output_dir, account['use_csv_dir']):
|
||||
success_count += 1
|
||||
|
||||
# Print summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Processing Complete: {success_count}/{total_accounts} accounts processed successfully")
|
||||
print(f"CSV files have been saved to: {os.path.abspath(args.output_dir)}")
|
||||
print(f"{'='*60}")
|
||||
print(f"Dynamic Financial Statement Processor")
|
||||
print(f"Data Directory: {os.path.abspath(data_dir)}")
|
||||
print(f"Output Directory: {os.path.abspath(output_dir)}")
|
||||
|
||||
# List generated CSV files
|
||||
if os.path.exists(args.output_dir):
|
||||
csv_files = [f for f in os.listdir(args.output_dir) if f.endswith('.csv')]
|
||||
if csv_files:
|
||||
print(f"\nGenerated CSV Files:")
|
||||
for file in sorted(csv_files):
|
||||
file_path = os.path.join(args.output_dir, file)
|
||||
file_size = os.path.getsize(file_path)
|
||||
print(f" - {file} ({file_size:,} bytes)")
|
||||
# Build command
|
||||
cmd = [sys.executable, os.path.join(script_dir, 'dynamic_processor.py'),
|
||||
'--data-dir', data_dir, '--output-dir', output_dir]
|
||||
|
||||
# Run the dynamic processor
|
||||
try:
|
||||
result = subprocess.run(cmd, check=True, capture_output=True)
|
||||
print(f"\nDiscovery Results:")
|
||||
print(result.stdout)
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Dynamic Processing Complete: CSV files saved to {os.path.abspath(output_dir)}")
|
||||
else:
|
||||
print(f"\nError during dynamic processing: exit code {result.returncode}")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"\nError running dynamic processor: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
from datetime import datetime
|
||||
|
||||
# Add date to print
|
||||
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
main()
|
||||
56
scripts/export_all_csv_v2.py
Normal file
56
scripts/export_all_csv_v2.py
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dynamic script to auto-discover and process all financial statements
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Delegates the actual work to scripts/dynamic_processor.py, run as a child
    process with the current interpreter, and echoes its output.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files')
    args = parser.parse_args()

    # Project layout: <root>/scripts/<this file>, statements under <root>/data/pdf.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    data_dir = os.path.join(project_root, 'data/pdf')

    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")

    # Build command
    cmd = [sys.executable, os.path.join(script_dir, 'dynamic_processor.py'),
           '--data-dir', data_dir, '--output-dir', output_dir]

    # Run the dynamic processor
    try:
        # text=True: decode the child's stdout so it prints as text, not
        # a raw bytes repr.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(f"\nDiscovery Results:")
        print(result.stdout)
        # check=True guarantees a zero exit status here; the previous
        # `if result.returncode == 0` test could never be false.
        print(f"\n{'='*60}")
        print(f"Dynamic Processing Complete: CSV files saved to {os.path.abspath(output_dir)}")
    except subprocess.CalledProcessError as e:
        print(f"\nError running dynamic processor: {e}")


if __name__ == "__main__":
    main()
|
||||
124
scripts/process_laposte_improved.py
Normal file
124
scripts/process_laposte_improved.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import subprocess
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
from collections import defaultdict
|
||||
|
||||
def categorize_laposte_transaction(description):
    """Return a coarse spending category for a La Poste transaction label."""
    text = description.lower()

    # Keyword rules checked in order; the first matching keyword decides.
    # A label containing 'cotisation' lands in Bank Fees; the 'cotis' rule
    # only catches other cotis* labels.
    rules = (
        (('virement',), 'Transfer'),
        (('retrait',), 'Cash Withdrawal'),
        (('carte', 'paiement'), 'Card Payment'),
        (('frais', 'cotisation'), 'Bank Fees'),
        (('cotis',), 'Deductions'),
        (('impot',), 'Tax'),
        (('edf', 'bouygues', 'orange'), 'Utilities'),
    )
    for keywords, category in rules:
        if any(keyword in text for keyword in keywords):
            return category

    return 'Other'
|
||||
|
||||
def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Process La Poste account PDF files with improved transaction extraction.

    Converts each PDF with ``pdftotext -layout``, scans the 'Opérations'
    table for dated rows, and returns the extracted transaction dicts.
    Optionally writes them to ``laposte_all_transactions.csv``.
    """
    statement_files = glob.glob(os.path.join(directory, "*.pdf"))
    transactions = []

    # A transaction row starts with a dd/mm/yyyy date.
    date_row = re.compile(r'\s*\d{2}/\d{2}/\d{4}')

    for statement in statement_files:
        try:
            text = subprocess.run(['pdftotext', '-layout', statement, '-'],
                                  capture_output=True, text=True, check=True).stdout
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            print(f"Error processing {statement}: {err}")
            continue

        in_table = False
        for row in text.split('\n'):
            if 'Opérations' in row:
                # Everything after this marker belongs to the transaction table.
                in_table = True
                continue
            # Skip anything before the table plus header/footer rows.
            if not in_table or 'Date' in row or 'Total' in row or 'Page' in row:
                continue
            if not date_row.match(row):
                continue

            # Columns are separated by runs of 2+ spaces in the -layout output.
            columns = re.split(r'\s{2,}', row)
            if len(columns) < 3:
                continue
            try:
                when = columns[0].strip()
                label = columns[1].strip() if len(columns) > 2 else ''

                # First cell from the third column onward that parses as a
                # number (after dropping currency marks) is the amount.
                value = 0
                for cell in columns[2:]:
                    cell = cell.strip().replace('¤', '').replace('€', '')
                    if re.match(r'[\d.,]+', cell):
                        candidate = cell.replace(' ', '').replace(',', '.')
                        try:
                            value = float(candidate)
                            break
                        except ValueError:
                            continue

                # Only keep rows with a usable positive amount.
                if value > 0:
                    transactions.append({
                        'Date': when,
                        'Description': label,
                        'Category': categorize_laposte_transaction(label),
                        'Amount': value,
                        'Source': os.path.basename(statement)
                    })
            except (ValueError, IndexError):
                continue

    if output_csv and transactions:
        csv_file = os.path.join(output_dir, 'laposte_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as handle:
            writer = csv.DictWriter(
                handle, fieldnames=['Date', 'Description', 'Category', 'Amount', 'Source'])
            writer.writeheader()
            writer.writerows(transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print(f"--- La Poste Account Statements ---")
    print(f"Found {len(statement_files)} account statement files")
    print(f"Processed {len(transactions)} transactions")

    return transactions


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Process La Poste (CCP) account statements')
    parser.add_argument('--pdf-dir', default='../data/pdf/la_poste',
                        help='Directory containing La Poste PDF files')
    parser.add_argument('--output-dir', default='../../output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    # Process all PDF files in the directory
    process_laposte_pdf_files(args.pdf_dir, args.csv, args.output_dir)
|
||||
173
scripts/process_sncf_enhanced.py
Executable file
173
scripts/process_sncf_enhanced.py
Executable file
@@ -0,0 +1,173 @@
|
||||
import subprocess
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
from collections import defaultdict
|
||||
|
||||
def extract_sncf_salary_data(content, filename):
    """
    Extract salary data from SNCF payslip text with focus on NET PAYÉ EN EUROS.

    Parameters:
        content:  plain text of the payslip (pdftotext output).
        filename: payslip filename, used to recover the month and year.

    Returns a dict with keys month, year, brut_mensuel, net_imposable,
    net_paye_euros, cumul_annuel and mode_paiement.  Amounts default to 0.0
    when nothing could be parsed; month/year default to 'Unknown'/2025 when
    the filename contains no recognizable French month name.
    """
    months = {
        'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
    }
    english_names = [
        '', 'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ]

    # Defaults for payslips whose name has no month (e.g. 'PRIME' bonus
    # slips); previously these names were left unbound and crashed below.
    month_name = 'Unknown'
    year = 2025

    filename_upper = filename.upper()
    for french_name, num in months.items():
        if french_name in filename_upper:
            year_match = re.search(r'20(\d{2})', filename)
            # group(0) is the full 4-digit year ('2025'); group(1) would be
            # only the two captured digits and disagree with the 2025 default.
            year = int(year_match.group(0)) if year_match else 2025
            # Index the English names by the month NUMBER; indexing by the
            # French name string (the old code) was a guaranteed TypeError.
            month_name = english_names[num]
            break

    # Initialize salary data with safe zeros.
    salary_data = {
        'month': month_name,
        'year': year,
        'brut_mensuel': 0.0,
        'net_imposable': 0.0,
        'net_paye_euros': 0.0,
        'cumul_annuel': 0.0,
        'mode_paiement': ''
    }

    lines = content.split('\n')

    # Primary format: a single table row containing both 'BRUT' and
    # 'NET PAYÉ EN EUROS' followed by the amounts.
    for line in lines:
        if 'NET PAYÉ EN EUROS' in line and 'BRUT' in line:
            # Grab every run of digits/spaces/commas on the line.
            values = re.findall(r'([\d\s,]+)', line)
            if len(values) >= 4:
                try:
                    # Column order assumed from SNCF payslip layout:
                    # brut, net imposable, cumul annuel, net payé.
                    # NOTE(review): confirm this ordering against a real slip.
                    brut_mensuel = float(values[0].replace(' ', '').replace(',', '.'))
                    net_imposable = float(values[1].replace(' ', '').replace(',', '.'))
                    net_paye_euros = float(values[3].replace(' ', '').replace(',', '.'))
                    cumul_annuel = float(values[2].replace(' ', '').replace(',', '.'))

                    salary_data = {
                        'month': month_name,
                        'year': year,
                        'brut_mensuel': brut_mensuel,
                        'net_imposable': net_imposable,
                        'net_paye_euros': net_paye_euros,
                        'cumul_annuel': cumul_annuel,
                        'mode_paiement': 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'
                    }
                    break
                except (ValueError, IndexError):
                    continue

    # Fallback format: estimate from a 'BRUT MENSUEL' line when the main
    # table row was not found.
    if salary_data['brut_mensuel'] == 0.0:
        for line in lines:
            if 'BRUT MENSUEL' in line:
                amounts = re.findall(r'([\d\s,]+)', line)
                if len(amounts) >= 2:
                    try:
                        brut_mensuel = float(amounts[0].replace(' ', '').replace(',', '.'))
                        # Rough approximations when only the gross is visible.
                        # NOTE(review): net_paye = brut - net_imposable yields
                        # ~25% of gross, which looks inverted — confirm intent.
                        net_imposable = brut_mensuel * 0.75
                        net_paye_euros = brut_mensuel - net_imposable
                        cumul_annuel = brut_mensuel * 12  # Approximate annual

                        salary_data = {
                            'month': month_name,
                            'year': year,
                            'brut_mensuel': brut_mensuel,
                            'net_imposable': net_imposable,
                            'net_paye_euros': net_paye_euros,
                            'cumul_annuel': cumul_annuel,
                            'mode_paiement': 'virement SEPA'
                        }
                        break
                    except (ValueError, IndexError):
                        continue

    return salary_data
|
||||
|
||||
def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Process SNCF salary PDF files with proper NET PAYÉ extraction.

    Returns one enriched transaction dict per payslip (net paid as the
    Amount plus gross / taxable-net / year-to-date columns) and optionally
    writes them to ``sncf_all_transactions.csv``.
    """
    payslips = glob.glob(os.path.join(directory, "*.pdf"))
    records = []

    for payslip in payslips:
        source_name = os.path.basename(payslip)
        try:
            text = subprocess.run(['pdftotext', '-layout', payslip, '-'],
                                  capture_output=True, text=True, check=True).stdout
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            print(f"Error processing {payslip}: {err}")
            continue

        data = extract_sncf_salary_data(text, source_name)

        # The transaction Amount is the net amount actually paid out.
        records.append({
            'Date': f"01/{data['month']}/{data['year']}",
            'Description': f"Salaire {data['month']} {data['year']}",
            'Category': 'Salary',
            'Amount': data['net_paye_euros'],
            'Source': source_name,
            'Brut Mensuel': data['brut_mensuel'],
            'Net Imposable': data['net_imposable'],
            'Cumul Annuel': data['cumul_annuel']
        })

    if output_csv and records:
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)

        with open(csv_file, 'w', newline='', encoding='utf-8') as handle:
            writer = csv.DictWriter(handle, fieldnames=[
                'Date', 'Description', 'Category', 'Amount', 'Source',
                'Brut Mensuel', 'Net Imposable', 'Cumul Annuel'])
            writer.writeheader()
            writer.writerows(records)

        print(f"\nTransaction data saved to {csv_file}")

    print(f"--- SNCF Salary Statements ---")
    print(f"Found {len(payslips)} salary statement files")

    gross_total = sum(r['Brut Mensuel'] for r in records)
    net_total = sum(r['Net Imposable'] for r in records)
    if gross_total > 0:
        print(f"Total Brut Mensuel: €{gross_total:,.2f}")
        print(f"Total Net Imposable: €{net_total:,.2f}")

    return records


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Process SNCF salary statements with enhanced NET PAYÉ extraction')
    parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
                        help='Directory containing SNCF PDF files')
    parser.add_argument('--output-dir', default='../../output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')

    args = parser.parse_args()

    # Process all PDF files in the directory
    process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)
|
||||
136
scripts/process_sncf_improved.py
Normal file
136
scripts/process_sncf_improved.py
Normal file
@@ -0,0 +1,136 @@
|
||||
import subprocess
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
from collections import defaultdict
|
||||
|
||||
def extract_month_from_filename(filename):
    """Return (year, month_number) parsed from an SNCF payslip filename.

    The month is matched against French month names and the year is taken
    from the first 4-digit '20xx' number in the name.  Defaults to
    ``(2025, 1)`` when no month name is recognized.
    """
    months = {
        'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
    }

    filename_upper = filename.upper()
    for month, num in months.items():
        if month in filename_upper:
            year_match = re.search(r'20(\d{2})', filename)
            # Use the full match ('2025'), not the 2-digit capture ('25'):
            # the old group(1) produced dates like '01/August/25' while the
            # no-match fallback below returns 2025.
            year = int(year_match.group(0)) if year_match else 2025
            return year, num

    return 2025, 1  # Default
|
||||
|
||||
def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Process SNCF salary PDF files with proper salary extraction.

    Converts each PDF in *directory* with ``pdftotext -layout``, extracts
    the gross salary from the text via three successive strategies, and
    returns one transaction dict per payslip.  Optionally writes
    ``sncf_all_transactions.csv``.
    """
    payslips = glob.glob(os.path.join(directory, "*.pdf"))
    records = []

    english_names = [
        '', 'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ]

    for payslip in payslips:
        try:
            text = subprocess.run(['pdftotext', '-layout', payslip, '-'],
                                  capture_output=True, text=True, check=True).stdout

            year, month = extract_month_from_filename(os.path.basename(payslip))
            month_name = english_names[month]

            rows = text.split('\n')
            amount = 0.0

            # Pass 1: explicit 'SALAIRE BRUT MENSUEL <amount>' label.
            for row in rows:
                if 'SALAIRE BRUT MENSUEL' in row:
                    hit = re.search(r'SALAIRE BRUT MENSUEL\s+([\d\s.,]+)', row)
                    if hit:
                        raw = hit.group(1).replace(' ', '').replace(',', '.')
                        try:
                            amount = float(raw)
                            break
                        except ValueError:
                            continue

            # Pass 2: any line mentioning SALAIRE and BRUT with a
            # €-suffixed amount.
            if amount == 0.0:
                for row in rows:
                    if 'SALAIRE' in row and 'BRUT' in row:
                        hit = re.search(r'([\d\s.,]+)\s*€', row)
                        if hit:
                            raw = hit.group(1).replace(' ', '').replace(',', '.')
                            try:
                                amount = float(raw)
                                break
                            except ValueError:
                                continue

            # Pass 3: rows starting with known base-salary figures from the
            # salary table.  NOTE(review): these magic values only cover two
            # specific pay grades — confirm they are still current.
            if amount == 0.0:
                for row in rows:
                    if row.strip().startswith('2974,64') or row.strip().startswith('3123,36'):
                        for token in row.split():
                            try:
                                if '.' in token and ',' not in token and len(token) > 3:
                                    amount = float(token.replace(',', '.'))
                                    break
                            except ValueError:
                                continue

            records.append({
                'Date': f"01/{month_name}/{year}",
                'Description': f"Salaire {month_name} {year}",
                'Category': 'Salary',
                'Amount': amount,
                'Source': os.path.basename(payslip)
            })

        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            print(f"Error processing {payslip}: {err}")
            continue

    if output_csv and records:
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as handle:
            writer = csv.DictWriter(
                handle, fieldnames=['Date', 'Description', 'Category', 'Amount', 'Source'])
            writer.writeheader()
            writer.writerows(records)
        print(f"\nTransaction data saved to {csv_file}")

    print(f"--- SNCF Salary Statements ---")
    print(f"Found {len(payslips)} salary statement files")
    total = sum(r['Amount'] for r in records)
    if total > 0:
        print(f"Total Salary Extracted: €{total:,.2f}")

    return records


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Process SNCF salary statements')
    parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
                        help='Directory containing SNCF PDF files')
    parser.add_argument('--output-dir', default='../../output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    # Process all PDF files in the directory
    process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)
|
||||
Reference in New Issue
Block a user