#!/usr/bin/env python3
"""
Twitter/X Radar Page Scraper
Scrapes posts from X Radar queries with multiple fallback methods
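
Usage (file name and example URL are illustrative; adjust to your environment):
    python twitter_radar_scraper.py "https://x.com/i/radar/1234567890" --method playwright -o posts.json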
"""

import requests
from bs4 import BeautifulSoup
import json
import time
import re
from datetime import datetime
from typing import List, Dict, Optional
import argparse
import sys

class TwitterRadarScraper:
    def __init__(self, radar_url: str, headless: bool = True):
        """
        Initialize the scraper

        Args:
            radar_url: The X Radar URL to scrape
            headless: Whether to run browser in headless mode (for Selenium)
        """
        self.radar_url = radar_url
        self.headless = headless
        self.posts = []

        # Extract query ID from URL
        self.query_id = self._extract_query_id(radar_url)

    def _extract_query_id(self, url: str) -> Optional[str]:
        """Extract query ID from Radar URL"""
        match = re.search(r'/radar/(\d+)', url)
        return match.group(1) if match else None

    def scrape_with_requests(self) -> List[Dict]:
        """
        Method 1: Try to scrape using requests + BeautifulSoup
        Note: This will likely fail due to Twitter's JS rendering
        """
        print("🔍 Attempting to scrape with requests library...")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

        try:
            response = requests.get(self.radar_url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Try to find posts (this structure may vary)
            articles = soup.find_all('article', attrs={'data-testid': 'tweet'})

            if not articles:
                print("⚠️  No posts found with requests method (expected - X uses JS rendering)")
                return []

            for article in articles:
                post_data = self._parse_article(article)
                if post_data:
                    self.posts.append(post_data)

            print(f"✅ Found {len(self.posts)} posts with requests method")
            return self.posts

        except Exception as e:
            print(f"❌ Requests method failed: {str(e)}")
            return []

    def scrape_with_selenium(self) -> List[Dict]:
        """
        Method 2: Scrape using Selenium (requires selenium + webdriver)
        This is more reliable for JS-heavy sites like X/Twitter
        """
        print("🔍 Attempting to scrape with Selenium...")

        try:
            from selenium import webdriver
            from selenium.webdriver.common.by import By
            from selenium.webdriver.support.ui import WebDriverWait
            from selenium.webdriver.support import expected_conditions as EC
            from selenium.webdriver.chrome.options import Options
        except ImportError:
            print("❌ Selenium not installed. Install with: pip install selenium")
            return []

        # Setup Chrome options
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
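        # Disabling the AutomationControlled blink feature hides navigator.webdriver from some
        # bot-detection scripts; it does not guarantee the page will treat us as a real browser.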
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

        driver = None
        try:
            # Initialize driver
            driver = webdriver.Chrome(options=chrome_options)
            driver.get(self.radar_url)

            # Wait for posts to load
            print("⏳ Waiting for posts to load...")
            wait = WebDriverWait(driver, 15)

            # Try multiple selectors
            selectors = [
                "//article[@data-testid='tweet']",
                "//div[@data-testid='cellInnerDiv']",
                "//article"
            ]
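            # Ordered from most specific (tweet articles) to most generic; the first selector
            # that shows up within the wait window counts as "posts loaded".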

            posts_found = False
            for selector in selectors:
                try:
                    wait.until(EC.presence_of_element_located((By.XPATH, selector)))
                    posts_found = True
                    break
                except Exception:
                    continue

            if not posts_found:
                print("⚠️  Could not find posts on page")
                return []

            # Scroll to load more posts
            print("📜 Scrolling to load more posts...")
            last_height = driver.execute_script("return document.body.scrollHeight")
            scroll_attempts = 0
            max_scrolls = 5
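            # Heuristic: keep scrolling until the document height stops growing (no more lazily
            # loaded posts) or we hit max_scrolls.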

            while scroll_attempts < max_scrolls:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break

                last_height = new_height
                scroll_attempts += 1

            # Parse posts
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            articles = soup.find_all('article')
            print(f"📊 Found {len(articles)} article elements")

            # Dedupe on stable fields: 'scraped_at' differs on every parse, so a plain
            # "post_data not in self.posts" membership check would never match.
            seen = {(p.get('handle'), p.get('text'), p.get('timestamp')) for p in self.posts}
            for article in articles:
                post_data = self._parse_article_selenium(article)
                if post_data and (post_data['handle'], post_data['text'], post_data['timestamp']) not in seen:
                    seen.add((post_data['handle'], post_data['text'], post_data['timestamp']))
                    self.posts.append(post_data)

            print(f"✅ Extracted {len(self.posts)} unique posts with Selenium")
            return self.posts

        except Exception as e:
            print(f"❌ Selenium method failed: {str(e)}")
            import traceback
            traceback.print_exc()
            return []
        finally:
            if driver:
                driver.quit()

    def scrape_with_playwright(self) -> List[Dict]:
        """
        Method 3: Scrape using Playwright (requires playwright)
        Modern alternative to Selenium
        """
        print("🔍 Attempting to scrape with Playwright...")

        try:
            from playwright.sync_api import sync_playwright
        except ImportError:
            print("❌ Playwright not installed. Install with: pip install playwright && playwright install")
            return []

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=self.headless)
                context = browser.new_context(
                    user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                )
                page = context.new_page()

                print("🌐 Loading page...")
                page.goto(self.radar_url, wait_until='networkidle', timeout=30000)

                # Wait for content
                print("⏳ Waiting for posts...")
                try:
                    page.wait_for_selector('article', timeout=15000)
                except Exception:
                    print("⚠️  No articles found")
                    browser.close()
                    return []

                # Scroll to load more
                print("📜 Scrolling to load posts...")
                for i in range(5):
                    page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
                    page.wait_for_timeout(2000)

                # Get page content
                content = page.content()
                soup = BeautifulSoup(content, 'html.parser')

                articles = soup.find_all('article')
                print(f"📊 Found {len(articles)} article elements")

                # Same dedupe as the Selenium path: compare on stable fields, not 'scraped_at'.
                seen = {(p.get('handle'), p.get('text'), p.get('timestamp')) for p in self.posts}
                for article in articles:
                    post_data = self._parse_article_selenium(article)
                    if post_data and (post_data['handle'], post_data['text'], post_data['timestamp']) not in seen:
                        seen.add((post_data['handle'], post_data['text'], post_data['timestamp']))
                        self.posts.append(post_data)

                browser.close()
                print(f"✅ Extracted {len(self.posts)} unique posts with Playwright")
                return self.posts

        except Exception as e:
            print(f"❌ Playwright method failed: {str(e)}")
            import traceback
            traceback.print_exc()
            return []

    def _parse_article(self, article) -> Optional[Dict]:
        """Parse article element (basic version)"""
        try:
            # Extract text content
            text_elements = article.find_all('div', attrs={'lang': True})
            text = ' '.join([elem.get_text() for elem in text_elements])

            if not text:
                return None

            return {
                'text': text,
                'timestamp': datetime.now().isoformat(),
                'method': 'requests'
            }
        except Exception:
            return None

    def _parse_article_selenium(self, article) -> Optional[Dict]:
        """Parse article element (detailed version for Selenium/Playwright)"""
        try:
            post_data = {
                'username': None,
                'handle': None,
                'text': None,
                'timestamp': None,
                'likes': 0,
                'retweets': 0,
                'replies': 0,
                'views': 0,
                'scraped_at': datetime.now().isoformat()
            }

            # Extract username and handle
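            # A profile link has an href of exactly "/<handle>" (a single path segment); the first
            # such link inside a tweet card is normally the author's.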
            user_link = article.find('a', href=re.compile(r'^/[^/]+$'))
            if user_link:
                post_data['handle'] = user_link.get('href', '').strip('/')

                # Username is usually in a span nearby
                user_name_elem = article.find('span', string=re.compile(r'.+'))
                if user_name_elem:
                    post_data['username'] = user_name_elem.get_text()

            # Extract tweet text
            text_divs = article.find_all('div', attrs={'lang': True})
            if text_divs:
                post_data['text'] = ' '.join([div.get_text() for div in text_divs])

            # Extract timestamp
            time_elem = article.find('time')
            if time_elem:
                post_data['timestamp'] = time_elem.get('datetime', '')

            # Extract engagement metrics
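            # aria-label text typically reads like "12 replies, 34 reposts, 567 likes, 8,910 views"
            # (wording observed on X and subject to change); counts may carry thousands separators.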
            engagement_pattern = re.compile(r'([\d,]+)\s*(replies|reply|reposts?|retweets?|likes?|views?)', re.IGNORECASE)
            aria_labels = article.find_all(attrs={'aria-label': True})

            for elem in aria_labels:
                label = elem.get('aria-label', '')
                matches = engagement_pattern.findall(label)

                for count, metric in matches:
                    count_int = int(count.replace(',', ''))
                    metric_lower = metric.lower()

                    if 'repl' in metric_lower:
                        post_data['replies'] = count_int
                    elif 'retweet' in metric_lower or 'repost' in metric_lower:
                        post_data['retweets'] = count_int
                    elif 'like' in metric_lower:
                        post_data['likes'] = count_int
                    elif 'view' in metric_lower:
                        post_data['views'] = count_int

            # Only return if we have text
            if post_data['text']:
                return post_data

            return None

        except Exception:
            return None

    def scrape(self, method: str = 'auto') -> List[Dict]:
        """
        Main scraping method

        Args:
            method: 'requests', 'selenium', 'playwright', or 'auto' (tries all)
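
        Example (illustrative):
            posts = scraper.scrape(method='playwright')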
        """
        if method == 'auto':
            # Try all methods in order of preference
            methods = [
                ('playwright', self.scrape_with_playwright),
                ('selenium', self.scrape_with_selenium),
                ('requests', self.scrape_with_requests)
            ]

            for method_name, method_func in methods:
                print(f"\n{'='*60}")
                print(f"Trying method: {method_name.upper()}")
                print(f"{'='*60}")

                result = method_func()
                if result:
                    return result

            print("\n❌ All methods failed")
            return []

        elif method == 'requests':
            return self.scrape_with_requests()
        elif method == 'selenium':
            return self.scrape_with_selenium()
        elif method == 'playwright':
            return self.scrape_with_playwright()
        else:
            raise ValueError(f"Unknown method: {method}")

    def save_to_json(self, filename: Optional[str] = None):
        """Save scraped posts to JSON file"""
        if not self.posts:
            print("⚠️  No posts to save")
            return

        if filename is None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"twitter_radar_{self.query_id}_{timestamp}.json"

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.posts, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved {len(self.posts)} posts to {filename}")

    def save_to_csv(self, filename: Optional[str] = None):
        """Save scraped posts to CSV file"""
        if not self.posts:
            print("⚠️  No posts to save")
            return

        import csv

        if filename is None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"twitter_radar_{self.query_id}_{timestamp}.csv"

        # Get all unique keys
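        # Different scrape methods emit different fields (the requests parser only records
        # text/timestamp/method), so the header is the union of every key seen.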
        fieldnames = set()
        for post in self.posts:
            fieldnames.update(post.keys())
        fieldnames = sorted(fieldnames)

        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.posts)

        print(f"💾 Saved {len(self.posts)} posts to {filename}")

    def print_summary(self):
        """Print summary of scraped posts"""
        if not self.posts:
            print("⚠️  No posts scraped")
            return

        print(f"\n{'='*60}")
        print(f"SCRAPING SUMMARY")
        print(f"{'='*60}")
        print(f"Query ID: {self.query_id}")
        print(f"Total posts: {len(self.posts)}")

        # Count posts with engagement data
        with_engagement = sum(1 for p in self.posts if any(p.get(k, 0) for k in ('likes', 'retweets', 'replies', 'views')))
        print(f"Posts with engagement data: {with_engagement}")

        # Top posts by engagement
        sorted_posts = sorted(self.posts, key=lambda x: x.get('likes', 0), reverse=True)

        print(f"\n🔥 Top 5 Posts by Likes:")
        for i, post in enumerate(sorted_posts[:5], 1):
            username = post.get('username', 'Unknown')
            handle = post.get('handle', 'unknown')
            likes = post.get('likes', 0)
            text = post.get('text', '')
            if len(text) > 80:
                text = text[:80] + '...'

            print(f"\n{i}. @{handle} ({username})")
            print(f"   ❤️  {likes:,} likes | 🔁 {post.get('retweets', 0):,} retweets | 💬 {post.get('replies', 0):,} replies")
            print(f"   {text}")


def main():
    parser = argparse.ArgumentParser(description='Scrape Twitter/X Radar pages')
    parser.add_argument('url', help='Radar URL to scrape')
    parser.add_argument('-m', '--method', choices=['auto', 'requests', 'selenium', 'playwright'],
                        default='auto', help='Scraping method (default: auto)')
    parser.add_argument('-o', '--output', help='Output filename (JSON)')
    parser.add_argument('--csv', help='Output filename (CSV)')
    parser.add_argument('--headless', action='store_true', default=True,
                        help='Run browser in headless mode (default: True)')
    parser.add_argument('--no-headless', action='store_false', dest='headless',
                        help='Show browser window')

    args = parser.parse_args()

    print(f"""
╔══════════════════════════════════════════════════════════════╗
║           Twitter/X Radar Scraper                           ║
║                                                              ║
║  URL: {args.url[:50]}{'...' if len(args.url) > 50 else ''}
║  Method: {args.method.upper()}
╚══════════════════════════════════════════════════════════════╝
    """)

    # Create scraper
    scraper = TwitterRadarScraper(args.url, headless=args.headless)

    # Scrape
    posts = scraper.scrape(method=args.method)

    if posts:
        # Print summary
        scraper.print_summary()

        # Save results
        if args.output:
            scraper.save_to_json(args.output)
        else:
            scraper.save_to_json()

        if args.csv:
            scraper.save_to_csv(args.csv)
    else:
        print("\n❌ No posts were scraped. This could be because:")
        print("   1. Twitter/X requires authentication to view this content")
        print("   2. The page structure has changed")
        print("   3. Rate limiting or anti-bot protection kicked in")
        print("\n💡 Try:")
        print("   - Installing selenium: pip install selenium")
        print("   - Installing playwright: pip install playwright && playwright install")
        print("   - Using --no-headless to see what's happening")
        sys.exit(1)


if __name__ == '__main__':
    main()
