Refactor SNCF processor and add Revolut aggregator
- Fix SNCF NET PAYÉ EN EUROS extraction to correctly parse MENSUEL line - Extract month/year from PDF content instead of filename - Add new Revolut CSV processor to aggregate account statements - Organize Revolut data files into data/csv/revolut/ - Clean up redundant scripts and reports
This commit is contained in:
188
finanancial_processor.py
Normal file
188
finanancial_processor.py
Normal file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Main script to process all financial statements with a clean one-file-per-entity structure
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import subprocess
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
# Import functionality from the dynamic processor
|
||||
# Add the directory to the path so we can import it
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
from dynamic_processor import discover_pdf_directories, process_dynamic_pdf_files
|
||||
|
||||
def main():
    """
    Parse command-line options and process every supported account type.

    Paths are resolved relative to the project root (the parent of the
    directory containing this script). The PDF directories are discovered
    with ``discover_pdf_directories``; each supported account type is then
    handed to ``process_revolut`` (Revolut) or ``process_pdf_account``
    (everything else). Returns ``None``; progress is reported on stdout.
    """
    parser = argparse.ArgumentParser(description='Process financial statements with one file per entity')
    parser.add_argument('--data-dir',
                        help='Base directory containing PDF files (default: auto-discovered)')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files (default: auto-discovered)')
    parser.add_argument('--csv', action='store_true',
                        help='Generate CSV output files')
    parser.add_argument('--single', action='store_true',
                        help='Process only the current entity (for testing)')

    args = parser.parse_args()

    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Determine data directory. os.path.join() discards project_root when
    # the second component is absolute, so an absolute --data-dir is used
    # as given and a relative one is anchored at the project root.
    data_dir = os.path.join(project_root, args.data_dir or 'data/pdf')

    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    banner = '=' * 60
    print(f"\n{banner}")
    print("Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(banner)

    # Discover all PDF directories
    pdf_dirs = discover_pdf_directories(data_dir)

    if not pdf_dirs:
        print("No directories with PDF files found!")
        return

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f" - {account_type}: {info['count']} files in {info['path']}")

    # 'Revolut' must be listed here: it was previously filtered out before
    # the Revolut branch below could ever be reached.
    supported_types = (
        'Boursobank',
        'American Express',
        'Monabanq',
        'SNCF',
        'La Poste',
        'Revolut',
    )

    # Process each account type to its own file
    for account_type, info in pdf_dirs.items():
        if account_type not in supported_types:
            continue  # Skip unsupported types

        if account_type == 'Revolut':
            # Special case for Revolut (CSV files)
            process_revolut(data_dir, output_dir, args.csv, args.single)
        else:
            process_pdf_account(account_type, info, output_dir, args.csv, args.single)

    print(f"\n{banner}")
    print("Processing Complete")
|
||||
|
||||
def process_revolut(data_dir, output_dir, generate_csv, single_mode=False):
    """
    Process Revolut CSV files by delegating to ``process_expenses.py``.

    Args:
        data_dir: PDF data directory; the CSV files are looked up in the
            sibling ``raw_csv`` directory (same parent as ``data_dir``).
        output_dir: Directory passed to the child script as ``--output-dir``.
        generate_csv: When true, ``--csv`` is forwarded to the child script.
        single_mode: When true, ``--single`` is forwarded to the child script.

    Returns:
        None. Progress, child output and errors are printed to stdout.
    """
    # normpath() drops a trailing separator so that 'data/pdf/' and
    # 'data/pdf' resolve to the same sibling directory.
    csv_dir = os.path.join(os.path.dirname(os.path.normpath(data_dir)), 'raw_csv')

    # Lexicographic order; this is chronological only if the file names
    # encode the date in a sortable form (assumption, not checked here).
    csv_files = sorted(glob.glob(os.path.join(csv_dir, "*.csv")))

    if not csv_files:
        print(f"No Revolut CSV files found in {csv_dir}")
        return

    for csv_file in csv_files:
        print(f"Processing Revolut CSV: {os.path.basename(csv_file)}")

    # The child script receives the whole directory, not a single file, so
    # it is launched once rather than once per discovered CSV.
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'process_expenses.py')
    cmd = [
        sys.executable,
        script_path,
        '--csv-dir', csv_dir,
        '--output-dir', output_dir,
    ]

    if generate_csv:
        cmd.append('--csv')

    if single_mode:
        cmd.append('--single')

    try:
        # text=True decodes the captured streams so they print as text
        # instead of a bytes repr.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error processing Revolut CSV files in {csv_dir}: {e}")
        if e.stderr:
            # Captured stderr would otherwise be lost with the exception.
            print(e.stderr)
        return

    if result.stdout:
        print(result.stdout)
|
||||
|
||||
def process_pdf_account(account_type, info, output_dir, generate_csv, single_mode=False):
|
||||
"""Create and run a specialized processor for a PDF-based account"""
|
||||
# Create a temporary processor script
|
||||
processor_name = f"{account_type.lower().replace(' ', '_')}_processor.py"
|
||||
|
||||
processor_content = f'''#!/usr/bin/env python3
|
||||
"""
|
||||
Temporary processor for {account_type}
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import glob
|
||||
import csv
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process {account_type} statements')
|
||||
parser.add_argument('--pdf-dir')
|
||||
parser.add_argument('--output-dir')
|
||||
parser.add_argument('--csv')
|
||||
args = parser.parse_args()
|
||||
|
||||
cmd = [
|
||||
sys.executable, os.path.join(os.path.dirname(os.path.abspath(__file__))),
|
||||
'process_{"account_type.lower().replace(' ', '_')}.py',
|
||||
'--pdf-dir', args.pdf_dir,
|
||||
'--output-dir', args.output_dir
|
||||
]
|
||||
|
||||
if args.csv:
|
||||
cmd.append('--csv')
|
||||
|
||||
subprocess.run(cmd, check=True)
|
||||
'''
|
||||
|
||||
# Write the processor script
|
||||
processor_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), processor_name)
|
||||
with open(processor_path, 'w') as f:
|
||||
f.write(processor_content)
|
||||
|
||||
# Make it executable and run it
|
||||
os.chmod(processor_path, 0o755)
|
||||
|
||||
cmd = [sys.executable, processor_path, '--pdf-dir', info['path'], '--output-dir', output_dir]
|
||||
|
||||
if generate_csv:
|
||||
cmd.append('--csv')
|
||||
|
||||
if single_mode:
|
||||
cmd.append('--single')
|
||||
|
||||
print(f"Running: {' '.join(cmd[2:])}")
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, check=True, capture_output=True)
|
||||
if result.stdout:
|
||||
print(result.stdout)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error: {e}")
|
||||
|
||||
# Clean up the temporary script
|
||||
os.remove(processor_path)
|
||||
|
||||
# Run the processor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user