- Fix SNCF NET PAYÉ EN EUROS extraction to correctly parse MENSUEL line - Extract month/year from PDF content instead of filename - Add new Revolut CSV processor to aggregate account statements - Organize Revolut data files into data/csv/revolut/ - Clean up redundant scripts and reports
188 lines
6.1 KiB
Python
188 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Main script to process all financial statements, writing one output file per entity (account type).
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import glob
|
|
import subprocess
|
|
import argparse
|
|
from datetime import datetime
|
|
|
|
# Import functionality from the dynamic processor
|
|
# Add the directory to the path so we can import it
|
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
|
from dynamic_processor import discover_pdf_directories, process_dynamic_pdf_files
|
|
|
|
def main():
    """
    Parse command-line options and process every discovered statement directory.

    Options:
        --data-dir    Base directory containing the PDF files. A relative path
                      is resolved against the project root (the parent of this
                      script's directory). Defaults to ``<project_root>/data/pdf``.
        --output-dir  Directory for the generated output. Defaults to
                      ``<project_root>/output/csv``; it is created if missing.
        --csv         Forwarded to the per-account processors as ``--csv``.
        --single      Forwarded to the per-account processors as ``--single``.

    Returns:
        None. Progress is reported on stdout.
    """
    # Account types that are handled by a PDF processor script.
    supported_pdf_accounts = ('Boursobank', 'American Express', 'Monabanq', 'SNCF', 'La Poste')

    parser = argparse.ArgumentParser(description='Process financial statements with one file per entity')
    parser.add_argument('--data-dir',
                        help='Base directory containing PDF files (default: auto-discovered)')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files (default: auto-discovered)')
    parser.add_argument('--csv', action='store_true',
                        help='Generate CSV output files')
    parser.add_argument('--single', action='store_true',
                        help='Process only the current entity (for testing)')

    args = parser.parse_args()

    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Determine data directory
    if args.data_dir:
        data_dir = args.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(project_root, data_dir)
    else:
        data_dir = os.path.join(project_root, 'data/pdf')

    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    separator = '=' * 60
    print(f"\n{separator}")
    print("Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(separator)

    # Discover all PDF directories
    pdf_dirs = discover_pdf_directories(data_dir)

    if not pdf_dirs:
        print("No directories with PDF files found!")
        return

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f" - {account_type}: {info['count']} files in {info['path']}")

    # Process each account type to its own file
    for account_type, info in pdf_dirs.items():
        if account_type == 'Revolut':
            # Special case for Revolut (CSV files). This check must come before
            # the supported-types filter: 'Revolut' is not in that list, so
            # testing it afterwards made this branch unreachable.
            process_revolut(data_dir, output_dir, args.csv, args.single)
        elif account_type in supported_pdf_accounts:
            process_pdf_account(account_type, info, output_dir, args.csv, args.single)
        # Any other account type is unsupported and skipped.

    print(f"\n{separator}")
    print("Processing Complete")
|
|
|
|
def process_revolut(data_dir, output_dir, generate_csv, single_mode=False):
    """Process Revolut CSV files by delegating to ``process_expenses.py``.

    The CSV files are looked up in a ``raw_csv`` directory that is a sibling of
    ``data_dir``. Each file found is listed on stdout, then the processor
    script located next to this file is run once for the whole directory
    (its interface is ``--csv-dir``, not a single file).

    Args:
        data_dir: Base PDF data directory; only its parent is used here.
        output_dir: Directory passed to the processor as ``--output-dir``.
        generate_csv: When true, ``--csv`` is appended to the command.
        single_mode: When true, ``--single`` is appended to the command.

    Returns:
        None. Failures of the child process are reported on stdout, not raised.
    """
    # normpath strips a trailing separator; without it dirname('data/pdf/')
    # would yield 'data/pdf' instead of the intended parent 'data'.
    parent_dir = os.path.dirname(os.path.normpath(data_dir))
    csv_dir = os.path.join(parent_dir, 'raw_csv')

    # Lexicographic order by path; this is chronological only if the file
    # names start with a sortable date.
    csv_files = sorted(glob.glob(os.path.join(csv_dir, "*.csv")))

    if not csv_files:
        print(f"No Revolut CSV files found in {csv_dir}")
        return

    for csv_file in csv_files:
        print(f"Processing Revolut CSV: {os.path.basename(csv_file)}")

    # Build the command once: it targets the directory, so launching it per
    # file would only reprocess the same directory repeatedly.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    cmd = [
        sys.executable,
        os.path.join(script_dir, 'process_expenses.py'),
        '--csv-dir', csv_dir,
        '--output-dir', output_dir,
    ]

    if generate_csv:
        cmd.append('--csv')

    if single_mode:
        cmd.append('--single')

    try:
        # text=True decodes the captured streams so they print as text
        # instead of a bytes repr.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error processing {csv_dir}: {e}")
        if e.stderr:
            # Surface the child's diagnostics; they were captured, not shown.
            print(e.stderr)
        return

    if result.stdout:
        print(result.stdout)
|
|
|
|
def process_pdf_account(account_type, info, output_dir, generate_csv, single_mode=False):
    """Run the dedicated processor script for a PDF-based account.

    The script is expected next to this file and is named after the account
    type: lower-cased, spaces replaced by underscores, e.g.
    ``'American Express'`` -> ``process_american_express.py``. It is invoked
    directly with the current interpreter. The earlier approach of writing a
    temporary wrapper script into the source directory is gone: the wrapper
    never called its own ``main()``, rejected the bare ``--csv`` / ``--single``
    flags, and could be left behind on unexpected errors.

    Args:
        account_type: Display name of the account type.
        info: Mapping describing the discovered directory; only ``info['path']``
            is read and passed as ``--pdf-dir``.
        output_dir: Directory passed to the processor as ``--output-dir``.
        generate_csv: When true, ``--csv`` is appended to the command.
        single_mode: When true, ``--single`` is appended to the command.
            NOTE(review): assumes the per-account scripts accept ``--single``,
            as this file already assumes for ``process_expenses.py`` — confirm.

    Returns:
        None. A missing script or a failing child process is reported on
        stdout, not raised.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    slug = account_type.lower().replace(' ', '_')
    processor_path = os.path.join(script_dir, f"process_{slug}.py")

    # Fail early with a clear message instead of an opaque interpreter error.
    if not os.path.isfile(processor_path):
        print(f"Error: processor script not found: {processor_path}")
        return

    cmd = [sys.executable, processor_path, '--pdf-dir', info['path'], '--output-dir', output_dir]

    if generate_csv:
        cmd.append('--csv')

    if single_mode:
        cmd.append('--single')

    # Show only the arguments, not the interpreter and script path.
    print(f"Running: {' '.join(cmd[2:])}")

    try:
        # text=True decodes the captured streams so they print as text
        # instead of a bytes repr.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error: {e}")
        if e.stderr:
            # Surface the child's diagnostics; they were captured, not shown.
            print(e.stderr)
        return

    if result.stdout:
        print(result.stdout)
|
|
|
|
# Run the processor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()