#!/usr/bin/env python3
"""
YouTube Transcript Gateway API v4.1.0
Multi-strategy rate limit bypass with HiveProxy
"""

import html
import logging
import os
import random
import re
import secrets
import subprocess
import tempfile
import time
from typing import Optional

import yt_dlp
from flask import Flask, request, jsonify
from flask_cors import CORS

# Configure the root logger to emit INFO-and-above records.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the fetch strategies and the route handlers.
logger = logging.getLogger(__name__)

# Flask application object; CORS(app) applies flask_cors to it using the
# extension's default settings (no origin restrictions are configured here).
app = Flask(__name__)
CORS(app)

# Configuration
# Cookie file handed to yt-dlp by the cookies strategy. Override the location
# with the YOUTUBE_COOKIES_FILE environment variable.
COOKIES_FILE = os.environ.get(
    "YOUTUBE_COOKIES_FILE",
    os.path.expanduser("~/Services/data/youtube-cookies.txt"),
)

# HiveProxy - Residential Proxy (Brazil/Balneário Camboriú)
# Template with {SESSION} placeholder for IP rotation; the placeholder is
# filled in with str.format(), so any other literal braces must be doubled.
# Override with the HIVE_PROXY_TEMPLATE environment variable.
# SECURITY NOTE(review): the fallback below embeds the proxy username and
# password in source control. It is kept only so existing deployments keep
# working; rotate the credential and supply it through the environment.
HIVE_PROXY_TEMPLATE = os.environ.get(
    "HIVE_PROXY_TEMPLATE",
    "socks5://108cdc609632b4a5353a_c_BR_sd_2607_city_Balneário%20Camboriú_s_{SESSION}:RNW78Fm5@proxy.hivep.com:10000",
)

def get_rotating_proxy() -> str:
    """Return the proxy URL template filled in with a fresh random session ID.

    The session ID is 32 hexadecimal characters (16 random bytes); a new one
    is generated on every call so that each call yields a distinct URL.
    """
    return HIVE_PROXY_TEMPLATE.format(SESSION=secrets.token_hex(16))

# Player clients cycled through by the client-rotation strategy, in this order
# (per the original author, 'tv' is the best choice for avoiding PO token
# requirements, so it comes first).
PLAYER_CLIENTS = [
    'tv',
    'web_creator',
    'mweb',
    'ios',
    'android',
]


class TranscriptFetcher:
    """Fetch a video's subtitles as plain text through a chain of strategies.

    Every ``fetch_*`` strategy asks yt-dlp to write the VTT subtitle track for
    one language into a temporary directory, then returns the parsed text, or
    ``None`` when that strategy produced nothing. ``fetch_with_backoff`` is the
    entry point that walks the strategies in priority order.

    NOTE(review): the throttle timestamp and the client-rotation index are
    plain attributes updated without locking; if one instance is shared
    between threads, confirm that interleaved updates are acceptable.
    """

    # Any angle-bracket tag: styling tags such as <c>...</c> as well as inline
    # timing tags such as <00:00:01.000>.
    _TAG_RE = re.compile(r'<[^>]+>')
    _WHITESPACE_RE = re.compile(r'\s+')
    # Line prefixes that mark VTT header/metadata lines rather than cue text.
    _METADATA_PREFIXES = ('WEBVTT', 'NOTE', 'Kind:', 'Language:')

    def __init__(self):
        self.last_request_time = 0
        self.min_request_interval = 2  # seconds between requests
        self.current_client_index = 0

    def _rate_limit_wait(self):
        """Sleep if the previous request started less than the minimum interval ago."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_request_interval:
            # Random extra delay on top of the remaining interval.
            wait_time = self.min_request_interval - elapsed + random.uniform(0.5, 1.5)
            logger.info(f"Rate limiting: waiting {wait_time:.1f}s")
            time.sleep(wait_time)
        self.last_request_time = time.time()

    def _get_next_client(self) -> str:
        """Return the next player client, advancing the round-robin index."""
        client = PLAYER_CLIENTS[self.current_client_index]
        self.current_client_index = (self.current_client_index + 1) % len(PLAYER_CLIENTS)
        return client

    @staticmethod
    def _client_args(client: str) -> dict:
        """Build yt-dlp ``extractor_args`` selecting one YouTube player client.

        yt-dlp's Python API expects ``{extractor: {argument: [values]}}``. A
        flat list of ``'key=value'`` strings is not recognised, so the client
        selection would silently not take effect.
        """
        return {'youtube': {'player_client': [client]}}

    def _download_transcript(self, video_id: str, lang: str, extra_opts: dict) -> Optional[str]:
        """Run one yt-dlp subtitle download and return the parsed text.

        Args:
            video_id: YouTube video ID.
            lang: Subtitle language to request.
            extra_opts: Strategy-specific yt-dlp options (proxy, cookies, ...)
                merged over the shared defaults.

        Returns:
            The transcript text, or ``None`` when no VTT file was written.

        Raises:
            Exception: whatever yt-dlp raises; callers decide how to handle it.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            ydl_opts = {
                'skip_download': True,
                # Request uploaded subtitles as well as automatic captions so
                # videos that only have an uploaded track still yield text.
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitleslangs': [lang],  # Only request the specific language
                'subtitlesformat': 'vtt',
                'outtmpl': os.path.join(tmpdir, '%(id)s'),
                'quiet': True,
                'no_warnings': True,
            }
            ydl_opts.update(extra_opts)

            url = f"https://www.youtube.com/watch?v={video_id}"
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            # Must run before the temporary directory is removed.
            return self._read_vtt_files(tmpdir, lang)

    def fetch_with_hiveproxy(self, video_id: str, lang: str) -> Optional[str]:
        """Strategy 1: Use HiveProxy residential proxy (Brazil) - PRIORITY
        Uses rotating session ID for fresh IP on each request"""
        self._rate_limit_wait()

        # Get fresh proxy with new IP
        proxy = get_rotating_proxy()
        logger.info(f"Trying HiveProxy for {video_id} (new session)")

        try:
            result = self._download_transcript(video_id, lang, {
                'proxy': proxy,
                'socket_timeout': 60,
                # Use TV client to avoid PO token requirements
                'extractor_args': self._client_args('tv'),
            })
        except Exception as e:
            # Best-effort strategy: report the failure and let the caller
            # move on to the next one.
            logger.warning(f"❌ HiveProxy strategy failed: {str(e)[:100]}")
            return None

        if result:
            logger.info("✅ HiveProxy strategy succeeded!")
        return result

    def fetch_with_cookies(self, video_id: str, lang: str) -> Optional[str]:
        """Strategy 2: Use browser cookies"""
        if not os.path.exists(COOKIES_FILE):
            logger.info("No cookies file found, skipping cookies strategy")
            return None

        self._rate_limit_wait()
        logger.info(f"Trying cookies for {video_id}")

        try:
            result = self._download_transcript(video_id, lang, {
                'cookiefile': COOKIES_FILE,
            })
        except Exception as e:
            logger.warning(f"❌ Cookies strategy failed: {str(e)[:100]}")
            return None

        if result:
            logger.info("✅ Cookies strategy succeeded!")
        return result

    def fetch_with_client_rotation(self, video_id: str, lang: str) -> Optional[str]:
        """Strategy 3: Rotate player clients (uses rotating proxy)

        Tries each entry of PLAYER_CLIENTS once, starting from wherever the
        round-robin index currently points, and returns the first transcript.
        """
        self._rate_limit_wait()

        for _ in range(len(PLAYER_CLIENTS)):
            client = self._get_next_client()
            proxy = get_rotating_proxy()  # Fresh IP for each client
            logger.info(f"Trying client {client} for {video_id} (with proxy)")

            try:
                result = self._download_transcript(video_id, lang, {
                    'proxy': proxy,  # Use rotating proxy
                    'extractor_args': self._client_args(client),
                })
            except Exception as e:
                logger.warning(f"❌ Client {client} failed: {str(e)[:50]}")
                continue

            if result:
                logger.info(f"✅ Client {client} succeeded!")
                return result

        return None

    def fetch_direct(self, video_id: str, lang: str) -> Optional[str]:
        """Strategy 4: Direct fetch without proxy (last resort)"""
        self._rate_limit_wait()
        logger.info(f"Trying direct fetch for {video_id}")

        try:
            result = self._download_transcript(video_id, lang, {})
        except Exception as e:
            logger.warning(f"❌ Direct fetch failed: {str(e)[:100]}")
            return None

        if result:
            logger.info("✅ Direct fetch succeeded!")
        return result

    def fetch_with_backoff(self, video_id: str, lang: str, max_retries: int = 2) -> Optional[str]:
        """Fetch with multi-strategy and exponential backoff retry

        Args:
            video_id: YouTube video ID.
            lang: Subtitle language to request.
            max_retries: Attempts per strategy before moving to the next one.

        Returns:
            The first non-empty transcript, or ``None`` if every strategy
            failed on every attempt.
        """
        strategies = [
            ("hiveproxy", self.fetch_with_hiveproxy),  # Priority 1: Residential proxy
        ]
        # Without a cookie file the cookies strategy can never succeed, so
        # leave it out rather than sleeping through its retry backoff.
        if os.path.exists(COOKIES_FILE):
            strategies.append(("cookies", self.fetch_with_cookies))  # Priority 2: Authenticated session
        else:
            logger.info("No cookies file found, skipping cookies strategy")
        strategies.append(("client_rotation", self.fetch_with_client_rotation))  # Priority 3: Client rotation
        strategies.append(("direct", self.fetch_direct))  # Priority 4: Direct (last resort)

        for strategy_name, strategy_func in strategies:
            for attempt in range(max_retries):
                logger.info(f"=== Strategy: {strategy_name}, attempt {attempt + 1}/{max_retries} ===")

                result = strategy_func(video_id, lang)
                if result:
                    logger.info(f"🎉 Success with {strategy_name}!")
                    return result

                # Exponential backoff with jitter
                if attempt < max_retries - 1:
                    wait_time = (2 ** attempt) + random.uniform(0.5, 2)
                    logger.info(f"⏳ Waiting {wait_time:.1f}s before retry...")
                    time.sleep(wait_time)

            logger.info(f"Moving to next strategy after {strategy_name} failed")

        return None

    def _read_vtt_files(self, tmpdir: str, lang: str) -> Optional[str]:
        """Read and parse VTT subtitle files

        Picks the file whose name contains ``.<lang>.``; if none matches, falls
        back to the first VTT file in sorted name order. Returns ``None`` when
        the directory holds no VTT file at all.
        """
        # Sorted so the fallback choice does not depend on directory order.
        vtt_files = sorted(f for f in os.listdir(tmpdir) if f.endswith('.vtt'))

        if not vtt_files:
            logger.warning(f"No VTT files found in {tmpdir}")
            return None

        logger.info(f"Found VTT files: {vtt_files}")

        # Prefer requested language
        selected = next((f for f in vtt_files if f'.{lang}.' in f), vtt_files[0])

        logger.info(f"Selected: {selected}")

        with open(os.path.join(tmpdir, selected), 'r', encoding='utf-8') as f:
            vtt_content = f.read()

        return self._parse_vtt(vtt_content)

    def _parse_vtt(self, vtt_content: str) -> str:
        """Parse VTT to plain text

        Drops header/metadata lines, timing lines and numeric cue identifiers,
        strips inline tags, decodes character references such as ``&amp;`` and
        collapses consecutive duplicate lines and runs of whitespace.
        """
        lines = [raw.strip() for raw in vtt_content.split('\n')]
        texts = []

        for index, line in enumerate(lines):
            # Skip blanks, timing lines and VTT header/metadata lines.
            if not line or '-->' in line or line.startswith(self._METADATA_PREFIXES):
                continue
            # A digit-only line is a cue identifier only when the timing line
            # follows it directly; otherwise it is spoken text (e.g. "2024").
            if line.isdigit() and index + 1 < len(lines) and '-->' in lines[index + 1]:
                continue
            # Strip tags first, then decode entities, so an escaped "&lt;b&gt;"
            # survives as literal text instead of being removed as a tag.
            text = html.unescape(self._TAG_RE.sub('', line)).strip()
            # Avoid duplicates (VTT often has overlapping segments)
            if text and (not texts or text != texts[-1]):
                texts.append(text)

        # Join and collapse whitespace runs into single spaces.
        return self._WHITESPACE_RE.sub(' ', ' '.join(texts)).strip()


# Global fetcher instance
# Created once at import time and reused by the /transcript handler, so its
# throttle timestamp and client-rotation index persist across requests.
# NOTE(review): the server is started with threaded=True and this state is
# updated without a lock; confirm concurrent requests are acceptable.
fetcher = TranscriptFetcher()


def extract_video_id(url: str) -> Optional[str]:
    """Extract video ID from various YouTube URL formats

    Recognised inputs:
      * watch URLs, with ``v=`` anywhere in the query string
      * youtu.be short links
      * /embed/, /v/, /shorts/ and /live/ paths (also on youtube-nocookie.com)
      * a bare 11-character video ID

    Args:
        url: URL or bare ID; surrounding whitespace is ignored.

    Returns:
        The 11-character video ID, or ``None`` when nothing matches.
    """
    if not url:
        return None
    candidate = url.strip()

    patterns = (
        # Other query parameters may precede v=, e.g. ?list=...&v=ID
        r'youtube\.com/watch\?(?:[^#\s]*&)?v=([a-zA-Z0-9_-]{11})',
        r'youtu\.be/([a-zA-Z0-9_-]{11})',
        r'youtube(?:-nocookie)?\.com/(?:embed|v|shorts|live)/([a-zA-Z0-9_-]{11})',
        r'^([a-zA-Z0-9_-]{11})$',
    )
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)
    return None


@app.route('/health')
def health():
    """Report service identity, the strategy list and cookie-file availability."""
    cookies_present = os.path.exists(COOKIES_FILE)
    proxy_configured = bool(HIVE_PROXY_TEMPLATE)

    payload = {
        "status": "ok",
        "service": "youtube-transcript-gateway",
        "version": "4.1.0",
        "strategies": ["hiveproxy_rotating", "cookies", "client_rotation", "direct"],
        "cookies_available": cookies_present,
        "hiveproxy_configured": proxy_configured,
        "ip_rotation": True,
    }
    return jsonify(payload)


@app.route('/transcript')
def get_transcript():
    """Get transcript for a YouTube video

    Query parameters:
        url:  YouTube URL or bare video ID (required).
        lang: Subtitle language code; defaults to ``en`` when missing or empty.

    Responses:
        200 with the transcript, video ID, language and word count;
        400 when the URL or the language code is invalid;
        429 when every fetch strategy failed.
    """
    url = request.args.get('url', '')
    # An empty "?lang=" would otherwise be forwarded as an empty language.
    lang = request.args.get('lang', '').strip() or 'en'

    video_id = extract_video_id(url)
    if not video_id:
        return jsonify({"error": "Invalid YouTube URL"}), 400

    # yt-dlp treats each requested subtitle language as a regular expression
    # and also accepts "all" and "-lang" exclusions, so only plain language
    # codes (e.g. en, pt-BR, zh-Hans, en-orig) are allowed through.
    is_plain_code = re.fullmatch(r'[A-Za-z]{2,3}(?:[-_][A-Za-z0-9]+)*', lang)
    if not is_plain_code or len(lang) > 35 or lang.lower() == 'all':
        return jsonify({"error": "Invalid language code"}), 400

    logger.info(f"\n{'='*60}")
    logger.info(f"Fetching transcript: {video_id}, lang={lang}")
    logger.info(f"{'='*60}")

    transcript = fetcher.fetch_with_backoff(video_id, lang)

    if transcript:
        word_count = len(transcript.split())
        logger.info(f"✅ SUCCESS! {word_count} words fetched")
        return jsonify({
            "transcript": transcript,
            "video_id": video_id,
            "language": lang,
            "word_count": word_count
        })

    logger.error("❌ All strategies failed!")
    return jsonify({
        "error": "All strategies failed. YouTube may be temporarily blocking.",
        "rate_limited": True
    }), 429


@app.route('/languages')
def list_languages():
    """List available transcript languages for a video

    Responses:
        200 with ``video_id``, ``languages`` (sorted by code; each entry has
        ``code`` and ``is_generated``) and ``count``;
        400 when the URL is invalid;
        500 with the error message when metadata extraction fails.
    """
    url = request.args.get('url', '')

    video_id = extract_video_id(url)
    if not video_id:
        return jsonify({"error": "Invalid YouTube URL"}), 400

    try:
        ydl_opts = {
            'skip_download': True,
            'proxy': get_rotating_proxy(),
            'quiet': True,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(f"https://www.youtube.com/watch?v={video_id}", download=False)

        # "or {}" also covers a key that is present but set to None.
        subtitles = info.get('subtitles') or {}
        auto_captions = info.get('automatic_captions') or {}

        # Sorted so the response order is stable between calls. Every code in
        # the union that is not an uploaded subtitle comes from auto captions.
        languages = [
            {"code": lang_code, "is_generated": lang_code not in subtitles}
            for lang_code in sorted(subtitles.keys() | auto_captions.keys())
        ]

        return jsonify({
            "video_id": video_id,
            "languages": languages,
            "count": len(languages)
        })
    except Exception as e:
        # Record the failure server-side; the response alone leaves no trace.
        logger.exception(f"❌ Listing languages failed for {video_id}")
        return jsonify({"error": str(e)}), 500


@app.route('/video-info')
def get_video_info():
    """Get video metadata

    Responses:
        200 with ``video_id``, ``title``, ``channel``, ``duration`` and the
        canonical watch ``url``;
        400 when the URL is invalid;
        500 with the error message when metadata extraction fails.
    """
    url = request.args.get('url', '')

    video_id = extract_video_id(url)
    if not video_id:
        return jsonify({"error": "Invalid YouTube URL"}), 400

    watch_url = f"https://www.youtube.com/watch?v={video_id}"

    try:
        ydl_opts = {
            'skip_download': True,
            'proxy': get_rotating_proxy(),
            'quiet': True,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(watch_url, download=False)

        return jsonify({
            "video_id": video_id,
            "title": info.get('title', ''),
            "channel": info.get('channel', ''),
            "duration": info.get('duration', 0),
            "url": watch_url
        })
    except Exception as e:
        # Record the failure server-side, matching how the fetch strategies
        # report theirs; the response alone leaves no trace.
        logger.exception(f"❌ Video info lookup failed for {video_id}")
        return jsonify({"error": str(e)}), 500


if __name__ == '__main__':
    # Startup banner followed by the development server. The client list is
    # derived from PLAYER_CLIENTS so the banner cannot drift from the clients
    # that are actually rotated.
    rule = "=" * 60
    cookies_status = 'Available' if os.path.exists(COOKIES_FILE) else 'Not found'
    banner = [
        rule,
        "YouTube Transcript Gateway API v4.1.0",
        "Multi-strategy rate limit bypass with IP Rotation",
        rule,
        "Fallback Chain:",
        "  1. HiveProxy (rotating residential IPs from Brazil)",
        "  2. Browser cookies (if available)",
        f"  3. Client rotation ({', '.join(PLAYER_CLIENTS)})",
        "  4. Direct fetch (last resort)",
        rule,
        "Features:",
        "  - IP rotation per request (new session ID)",
        "  - TV client for PO token bypass",
        "  - Auto-retry with exponential backoff",
        rule,
        "Endpoints:",
        "  GET /health              - Health check",
        "  GET /transcript?url=...  - Get transcript",
        "  GET /languages?url=...   - List available languages",
        "  GET /video-info?url=...  - Get video info",
        rule,
        "HiveProxy: Rotating IPs from Balneário Camboriú, Brazil",
        f"Cookies: {cookies_status}",
        rule,
        "Starting server on http://0.0.0.0:8765",
        "Accessible via Tailscale at http://100.75.88.8:8765",
        rule,
    ]
    print("\n".join(banner))

    app.run(host='0.0.0.0', port=8765, debug=False, threaded=True)