#!/usr/bin/env python3
"""
Export All Posts to CSV for AI Decision Making

Fetches complete post data from all configured WordPress sites and exports
to CSV for AI-powered categorization and movement recommendations.
Uses credentials from .env file (via Config) for secure authentication.
"""

import csv
import logging
import re
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import requests
from requests.auth import HTTPBasicAuth

from config import Config

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Compiled once at module level; strips HTML tags from rendered WP content.
_TAG_RE = re.compile(r'<[^<]+?>')


class PostExporter:
    """Export posts from WordPress sites to CSV for AI analysis."""

    def __init__(self):
        """Initialize the exporter with sites from Config."""
        self.sites = Config.WORDPRESS_SITES
        self.all_posts = []
        self.category_cache = {}  # Cache category maps keyed by site name

    def fetch_posts_from_site(self, site_name: str, site_config: Dict) -> List[Dict]:
        """
        Fetch ALL posts (published and draft) from a site with full details.

        Pages through the WP REST API 100 posts at a time, once per status.

        Args:
            site_name: Website name
            site_config: Site configuration dict with 'url', 'username', 'password'

        Returns:
            List of raw WordPress post objects (possibly empty on error)
        """
        logger.info(f"\nFetching posts from {site_name}...")
        posts = []
        base_url = site_config['url'].rstrip('/')
        api_url = f"{base_url}/wp-json/wp/v2/posts"
        auth = HTTPBasicAuth(site_config['username'], site_config['password'])

        for status in ['publish', 'draft']:
            page = 1
            status_count = 0
            response = None
            while True:
                params = {
                    'page': page,
                    'per_page': 100,
                    'status': status,
                }
                try:
                    logger.info(f" Fetching page {page} ({status} posts)...")
                    response = requests.get(api_url, params=params, auth=auth, timeout=10)
                    response.raise_for_status()
                    page_posts = response.json()
                    if not page_posts:
                        break
                    posts.extend(page_posts)
                    status_count += len(page_posts)
                    logger.info(f" ✓ Got {len(page_posts)} posts (total: {len(posts)})")
                    page += 1
                    time.sleep(0.5)  # be polite to the API between pages
                except requests.exceptions.HTTPError as e:
                    # WordPress returns HTTP 400 when `page` exceeds the total
                    # number of pages — treat that as normal end-of-results.
                    if response is not None and response.status_code == 400:
                        logger.info(f" ℹ API limit reached (got {status_count} {status} posts)")
                        break
                    else:
                        logger.error(f"Error on page {page}: {e}")
                        break
                except requests.exceptions.RequestException as e:
                    logger.error(f"Error fetching from {site_name}: {e}")
                    break
            if status_count > 0:
                logger.info(f" ✓ Total {status} posts: {status_count}")

        logger.info(f"✓ Total posts from {site_name}: {len(posts)}\n")
        return posts

    def fetch_category_names(self, site_name: str, site_config: Dict) -> Dict[int, Dict]:
        """
        Fetch category names and slugs from a WordPress site (cached per site).

        Args:
            site_name: Website name
            site_config: Site configuration dict

        Returns:
            Dict mapping category IDs to {'name': ..., 'slug': ...} dicts.
            Empty on fetch failure (IDs then fall back to raw numbers downstream).
        """
        if site_name in self.category_cache:
            return self.category_cache[site_name]

        logger.info(f" Fetching categories from {site_name}...")
        categories = {}
        base_url = site_config['url'].rstrip('/')
        api_url = f"{base_url}/wp-json/wp/v2/categories"
        auth = HTTPBasicAuth(site_config['username'], site_config['password'])

        try:
            # Fetch all categories (per_page=100).
            # NOTE(review): sites with >100 categories would need pagination here.
            params = {'per_page': 100}
            response = requests.get(api_url, params=params, auth=auth, timeout=10)
            response.raise_for_status()
            cat_list = response.json()
            for cat in cat_list:
                categories[cat['id']] = {
                    'name': cat.get('name', ''),
                    'slug': cat.get('slug', ''),
                }
            logger.info(f" ✓ Fetched {len(categories)} categories")
        except Exception as e:
            # Best-effort: export still works, categories show as raw IDs.
            logger.warning(f" Could not fetch categories from {site_name}: {e}")

        self.category_cache[site_name] = categories
        return categories

    def extract_post_details(self, post: Dict, site_name: str,
                             category_map: Dict[int, Dict]) -> Dict:
        """
        Extract all relevant details from a post for AI analysis.

        Args:
            post: WordPress post object (raw REST API dict)
            site_name: Website name
            category_map: Dict mapping category IDs to {'name', 'slug'} dicts

        Returns:
            Flat dict with one value per CSV column
        """
        # Title (WP returns {'rendered': ...} for title/content/excerpt)
        title = post.get('title', {})
        if isinstance(title, dict):
            title = title.get('rendered', '')

        # Content: strip HTML from the FULL body so word_count is accurate,
        # then keep only the first 500 chars as a preview for the CSV.
        content = post.get('content', {})
        if isinstance(content, dict):
            content = content.get('rendered', '')
        content_full = _TAG_RE.sub('', content)
        content_text = content_full[:500]

        # Excerpt
        excerpt = post.get('excerpt', {})
        if isinstance(excerpt, dict):
            excerpt = excerpt.get('rendered', '')
        excerpt_text = _TAG_RE.sub('', excerpt)

        # Meta descriptions and SEO data (Rank Math preferred, Yoast fallback)
        meta_dict = post.get('meta', {}) if isinstance(post.get('meta'), dict) else {}
        rank_math_title = meta_dict.get('rank_math_title', '')
        rank_math_description = meta_dict.get('rank_math_description', '')
        rank_math_keyword = meta_dict.get('rank_math_focus_keyword', '')
        yoast_description = meta_dict.get('_yoast_wpseo_metadesc', '')
        meta_description = rank_math_description or yoast_description or ''

        # Categories - convert IDs to names using category_map; unknown IDs
        # fall back to their numeric string form.
        category_ids = post.get('categories', [])
        category_names = ', '.join([
            category_map.get(cat_id, {}).get('name', str(cat_id))
            for cat_id in category_ids
        ]) if category_ids else ''

        # Tags (raw IDs; no tag-name lookup is performed)
        tags = post.get('tags', [])
        tag_names = ', '.join([str(t) for t in tags]) if tags else ''

        return {
            'site': site_name,
            'post_id': post['id'],
            'status': post.get('status', 'publish'),
            'title': title.strip(),
            'slug': post.get('slug', ''),
            'url': post.get('link', ''),
            'author_id': post.get('author', ''),
            'date_published': post.get('date', ''),
            'date_modified': post.get('modified', ''),
            'categories': category_names,
            'tags': tag_names,
            'excerpt': excerpt_text.strip(),
            'content_preview': content_text.strip(),
            'seo_title': rank_math_title,
            'meta_description': meta_description,
            'focus_keyword': rank_math_keyword,
            # Count words over the full stripped content, not the preview.
            'word_count': len(content_full.split()),
        }

    def export_to_csv(self, output_file: Optional[str] = None) -> Optional[str]:
        """
        Export all collected posts to CSV.

        Args:
            output_file: Optional custom output path; defaults to
                ../output/all_posts_<YYYY-MM-DD>.csv relative to this file.

        Returns:
            Path to exported CSV file as a string, or None if nothing to export.
        """
        if not output_file:
            output_dir = Path(__file__).parent.parent / 'output'
            output_dir.mkdir(parents=True, exist_ok=True)
            date_str = datetime.now().strftime('%Y-%m-%d')
            output_file = output_dir / f'all_posts_{date_str}.csv'

        output_file = Path(output_file)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        if not self.all_posts:
            logger.error("No posts to export")
            return None

        fieldnames = [
            'site', 'post_id', 'status', 'title', 'slug', 'url',
            'author_id', 'date_published', 'date_modified',
            'categories', 'tags', 'excerpt', 'content_preview',
            'seo_title', 'meta_description', 'focus_keyword', 'word_count',
        ]

        logger.info(f"Exporting {len(self.all_posts)} posts to CSV...")
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for post in self.all_posts:
                writer.writerow({field: post.get(field, '') for field in fieldnames})

        logger.info(f"✓ CSV exported to: {output_file}")
        return str(output_file)

    def run(self, output_file: Optional[str] = None):
        """
        Run the complete export process: fetch, extract, sort, export, summarize.

        Args:
            output_file: Optional custom CSV output path (forwarded to
                export_to_csv). Defaults to the dated file in ../output.
        """
        logger.info("="*70)
        logger.info("EXPORTING ALL POSTS FOR AI DECISION MAKING")
        logger.info("="*70)
        logger.info("Sites configured: " + ", ".join(self.sites.keys()))
        logger.info("")

        # Fetch from all sites
        for site_name, config in self.sites.items():
            # Fetch categories for this site (cached), then its posts
            categories = self.fetch_category_names(site_name, config)
            posts = self.fetch_posts_from_site(site_name, config)
            if posts:
                for post in posts:
                    post_details = self.extract_post_details(post, site_name, categories)
                    self.all_posts.append(post_details)

        if not self.all_posts:
            logger.error("No posts found on any site")
            sys.exit(1)

        # Sort by site then by post_id for a stable, diff-friendly CSV
        self.all_posts.sort(key=lambda x: (x['site'], x['post_id']))

        # Export to CSV
        csv_file = self.export_to_csv(output_file)

        # Print summary
        logger.info("\n" + "="*70)
        logger.info("EXPORT SUMMARY")
        logger.info("="*70)
        by_site = {}
        for post in self.all_posts:
            site = post['site']
            if site not in by_site:
                by_site[site] = {'total': 0, 'published': 0, 'draft': 0}
            by_site[site]['total'] += 1
            if post['status'] == 'publish':
                by_site[site]['published'] += 1
            else:
                by_site[site]['draft'] += 1

        for site, stats in sorted(by_site.items()):
            logger.info(f"\n{site}:")
            logger.info(f" Total: {stats['total']}")
            logger.info(f" Published: {stats['published']}")
            logger.info(f" Drafts: {stats['draft']}")

        total_posts = len(self.all_posts)
        total_published = sum(1 for p in self.all_posts if p['status'] == 'publish')
        total_drafts = sum(1 for p in self.all_posts if p['status'] == 'draft')

        logger.info(f"\n{'─'*70}")
        logger.info(f"Total across all sites: {total_posts} posts")
        logger.info(f" Published: {total_published}")
        logger.info(f" Drafts: {total_drafts}")
        logger.info(f"{'─'*70}")
        logger.info(f"\n✓ Export complete!")
        logger.info(f"✓ CSV file: {csv_file}")
        logger.info(f"\nCSV includes:")
        logger.info(f" • Site, Post ID, Status, Title, URL")
        logger.info(f" • Publication dates, Categories, Tags")
        logger.info(f" • Content preview (500 chars)")
        logger.info(f" • SEO title, Meta description, Focus keyword")
        logger.info(f" • Word count")
        logger.info(f"\nNext step: Upload CSV to Claude or other AI for:")
        logger.info(f" 1. Categorize by topic (VPN, software, gaming, torrenting, etc.)")
        logger.info(f" 2. Recommend which site each post should be on")
        logger.info(f" 3. Identify duplicates for consolidation")
        logger.info(f" 4. Flag posts for deletion (low-traffic, thin content)")


def main():
    """Main entry point: parse CLI args and run the exporter."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Export all posts from WordPress sites for AI decision making'
    )
    parser.add_argument(
        '--output',
        help='Custom output CSV file path'
    )
    args = parser.parse_args()

    exporter = PostExporter()
    # Forward --output (previously parsed but ignored — bug fix).
    exporter.run(output_file=args.output)


if __name__ == '__main__':
    main()