diff --git a/src/seo/app.py b/src/seo/app.py index bbe4963..e1ea387 100644 --- a/src/seo/app.py +++ b/src/seo/app.py @@ -5,7 +5,7 @@ SEO Application Core - Integrated SEO automation functionality import logging from pathlib import Path from datetime import datetime -from typing import Optional, List +from typing import Optional, List, Tuple from .exporter import PostExporter from .analyzer import EnhancedPostAnalyzer @@ -64,9 +64,18 @@ class SEOApp: analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=fields) return analyzer.run(output_file=output, update_input=update) - def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> str: - """Propose categories for posts.""" - logger.info("šŸ·ļø Proposing categories with AI...") + def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> Tuple[str, str]: + """ + Propose categories for posts with editorial strategy alignment. + + Args: + csv_file: Path to CSV file (uses latest export if not provided) + output: Custom output file path + + Returns: + Tuple of (proposals_file, migrations_file) + """ + logger.info("šŸ·ļø Proposing categories with AI (editorial strategy aligned)...") if not csv_file: csv_file = self._find_latest_export() @@ -76,7 +85,7 @@ class SEOApp: logger.info(f"Using file: {csv_file}") - proposer = CategoryProposer(csv_file) + proposer = CategoryProposer(csv_file, use_editorial_strategy=True) return proposer.run(output_file=output) def category_apply(self, proposals_csv: str, site_name: str, diff --git a/src/seo/category_proposer.py b/src/seo/category_proposer.py index 6572ae6..853ab84 100644 --- a/src/seo/category_proposer.py +++ b/src/seo/category_proposer.py @@ -1,5 +1,6 @@ """ -Category Proposer - AI-powered category suggestions +Category Proposer - AI-powered category suggestions with editorial strategy alignment +Proposes categories based on content AND site editorial lines """ import csv @@ -7,26 +8,74 @@ import json 
# Per-site editorial profiles used to align category proposals.
# Each profile carries, in this order:
#   focus            - one-line description of the site's editorial line
#   ideal_categories - category names considered on-brand for the site
#   topic_keywords   - topic -> lower-case keywords searched in post text
# Insertion order of the sites is significant: consumers iterate the mapping
# in order, so it is built explicitly site by site.
EDITORIAL_LINES = {}

EDITORIAL_LINES['mistergeek.net'] = dict(
    focus='High-value tech content',
    ideal_categories=[
        'VPN',
        'Software/Tools',
        'Gaming',
        'SEO',
        'Content Marketing',
        'Tech Reviews',
        'Tutorials',
    ],
    topic_keywords={
        'VPN': ['vpn', 'proxy', 'privacy', 'security', 'encryption'],
        'Software': ['software', 'app', 'tool', 'download', 'install'],
        'Gaming': ['game', 'gaming', 'console', 'steam', 'playstation'],
        'SEO': ['seo', 'ranking', 'google', 'search', 'optimization'],
        'Tech': ['tech', 'technology', 'review', 'device', 'hardware'],
    },
)

EDITORIAL_LINES['webscroll.fr'] = dict(
    focus='Torrenting and file-sharing niche',
    ideal_categories=[
        'Torrenting',
        'File-Sharing',
        'Tracker Guides',
        'VPN for Torrenting',
        'Seedbox',
    ],
    topic_keywords={
        'Torrenting': ['torrent', 'download', 'upload', 'tracker', 'seed'],
        'File-Sharing': ['file-sharing', 'ddl', 'hosting', 'upload'],
        'Tracker Guides': ['tracker', 'ratio', 'invite', 'private'],
    },
)

EDITORIAL_LINES['hellogeek.net'] = dict(
    focus='Low-traffic, experimental, off-brand content',
    ideal_categories=['Experimental', 'Low-Traffic', 'Off-Brand', 'Testing'],
    # No keywords on purpose: this site is the catch-all for everything else.
    topic_keywords={},
)
def determine_best_site_for_post(
    self,
    post: Dict,
    editorial_lines: Optional[Dict[str, Dict]] = None,
) -> Tuple[str, str, float]:
    """
    Determine the best site for a post based on its content.

    Every site in the editorial table earns one point per topic that has at
    least one keyword occurring (as a substring) in the lower-cased
    title + content preview. The post's current site earns an extra 0.5 so
    that ties are resolved in favour of not moving the post.

    Args:
        post: Post record. Reads the 'title', 'content_preview' and 'site'
            keys; a missing key or a None value is treated as empty text.
        editorial_lines: Optional mapping of site name to a profile dict
            holding a 'topic_keywords' mapping (topic -> list of keywords).
            Defaults to the module-level EDITORIAL_LINES table.

    Returns:
        Tuple of (site_name, reason, confidence) where confidence is a
        float in the range 0..1.
    """
    if editorial_lines is None:
        editorial_lines = EDITORIAL_LINES

    def _field(key: str) -> str:
        # A key can be present with value None (csv.DictReader does this for
        # short rows by default), which would break string concatenation.
        value = post.get(key)
        return '' if value is None else str(value)

    searchable = f"{_field('title')} {_field('content_preview')}".lower()
    current_site = _field('site')

    best_site = None
    best_score = 0.0
    best_topics: List[str] = []

    for site_name, editorial in editorial_lines.items():
        matched_topics = [
            topic
            for topic, keywords in editorial.get('topic_keywords', {}).items()
            if any(kw.lower() in searchable for kw in keywords)
        ]
        score = float(len(matched_topics))

        # Bonus for staying on the current site (avoid unnecessary moves).
        if site_name == current_site:
            score += 0.5

        # Strict comparison keeps the first site on ties, matching the
        # table's declared order; it also tolerates an empty table.
        if best_site is None or score > best_score:
            best_site, best_score, best_topics = site_name, score, matched_topics

    # A score of at least 1 means at least one real topic match, since the
    # current-site bonus alone is only 0.5.
    if best_site is not None and best_score >= 1:
        return (
            best_site,
            f"Content matches {best_site} editorial line ({', '.join(best_topics)})",
            min(1.0, best_score / 3.0),  # three matched topics = full confidence
        )

    # No strong match: keep the post where it is, or send it to the catch-all.
    if current_site in ('mistergeek.net', 'webscroll.fr'):
        return (current_site, "Keep on current site (no better match)", 0.5)
    return ('hellogeek.net', "Low-traffic/off-brand content", 0.4)
file-sharing only + +hellogeek.net (Catch-all): +- Categories: Experimental, Low-Traffic, Off-Brand, Testing +- Focus: Everything else, low-traffic content + +""" + + # Format posts for AI formatted = [] for i, post in enumerate(batch, 1): text = f"{i}. ID: {post['post_id']}\n" + text += f" Site: {post.get('site', '')}\n" text += f" Title: {post.get('title', '')}\n" text += f" Current Categories: {post.get('categories', '')}\n" if 'content_preview' in post: @@ -67,22 +199,28 @@ class CategoryProposer: prompt = f"""Analyze these blog posts and propose optimal categories. +{editorial_context} +POSTS TO ANALYZE: + {posts_text} For EACH post, provide: {{ "post_id": , "current_categories": "", - "proposed_category": "", + "proposed_category": "", "alternative_categories": ["", ""], - "reason": "", - "confidence": "" + "recommended_site": "", + "reason": "", + "confidence": "", + "should_migrate": , + "migration_reason": "" }} Return ONLY a JSON array with one object per post.""" try: - logger.info(f" Getting category proposals...") + logger.info(f" Getting category proposals with editorial alignment...") response = requests.post( "https://openrouter.ai/api/v1/chat/completions", @@ -129,11 +267,15 @@ Return ONLY a JSON array with one object per post.""" return [] def propose_categories(self, batch_size: int = 10) -> bool: - """Propose categories for all posts.""" + """Propose categories with editorial strategy alignment.""" logger.info("\n" + "="*70) - logger.info("PROPOSING CATEGORIES WITH AI") + logger.info("PROPOSING CATEGORIES WITH EDITORIAL STRATEGY") logger.info("="*70 + "\n") + # Analyze editorial strategy first + if self.use_editorial_strategy: + self.analyze_editorial_strategy() + batches = [self.posts[i:i + batch_size] for i in range(0, len(self.posts), batch_size)] logger.info(f"Processing {len(self.posts)} posts in {len(batches)} batches...\n") @@ -158,23 +300,68 @@ Return ONLY a JSON array with one object per post.""" logger.info(f" API calls: 
{self.api_calls}") logger.info(f" Cost: ${self.ai_cost:.4f}") + # Process proposals with editorial alignment for post in self.posts: post_id = str(post['post_id']) proposal = all_proposals.get(post_id, {}) - - self.proposed_categories.append({ + + current_site = post.get('site', '') + + # Get AI recommendation or use editorial strategy + proposed_category = proposal.get('proposed_category', post.get('categories', '')) + recommended_site = proposal.get('recommended_site', current_site) + should_migrate = proposal.get('should_migrate', False) + + # If AI didn't specify, use editorial strategy + if not recommended_site or recommended_site == current_site: + recommended_site, migration_reason, confidence = self.determine_best_site_for_post(post) + should_migrate = (recommended_site != current_site) + else: + migration_reason = proposal.get('migration_reason', '') + confidence = proposal.get('confidence', 'Medium') + + # Build proposal record + proposal_record = { **post, - 'proposed_category': proposal.get('proposed_category', post.get('categories', '')), + 'proposed_category': proposed_category, 'alternative_categories': ', '.join(proposal.get('alternative_categories', [])), 'category_reason': proposal.get('reason', ''), 'category_confidence': proposal.get('confidence', 'Medium'), - 'current_categories': post.get('categories', '') - }) + 'current_categories': post.get('categories', ''), + 'recommended_site': recommended_site, + 'should_migrate': 'Yes' if should_migrate else 'No', + 'migration_reason': migration_reason, + 'current_site': current_site + } + + self.proposed_categories.append(proposal_record) + + # Track migration recommendations + if should_migrate: + self.migration_recommendations.append({ + 'post_id': post_id, + 'title': post.get('title', '')[:80], + 'from_site': current_site, + 'to_site': recommended_site, + 'reason': migration_reason, + 'category': proposed_category + }) + + # Summary + migration_count = len(self.migration_recommendations) + 
def export_migrations(self, output_file: Optional[str] = None) -> str:
    """
    Write the collected migration recommendations to their own CSV file.

    Args:
        output_file: Destination path. When omitted (or empty), a timestamped
            'migration_recommendations_<YYYYmmdd_HHMMSS>.csv' is created in
            the 'output' directory three levels above this module.

    Returns:
        The path of the written file as a string, or "" when there are no
        migration recommendations to export.
    """
    records = self.migration_recommendations
    if not records:
        logger.info("No migration recommendations to export")
        return ""

    if output_file:
        destination = Path(output_file)
    else:
        default_dir = Path(__file__).parent.parent.parent / 'output'
        default_dir.mkdir(parents=True, exist_ok=True)
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        destination = default_dir / f'migration_recommendations_{stamp}.csv'

    # Make sure the target directory exists for caller-supplied paths too.
    destination.parent.mkdir(parents=True, exist_ok=True)

    columns = ['post_id', 'title', 'from_site', 'to_site', 'reason', 'category']

    logger.info(f"\nExporting migrations to: {destination}")

    with destination.open('w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(records)

    logger.info(f"āœ“ Exported {len(records)} migration recommendations")
    return str(destination)