#!/usr/bin/env python3
"""
Bank Finder - High-performance domain scanner for digital banks
Processes domains in parallel to find banks, fintechs, and payment services
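
Pass an optional integer argument to scan only the first N domains (testing mode).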
"""

import asyncio
import aiohttp
import ssl
import re
import json
import sys
from datetime import datetime
from pathlib import Path
from collections import defaultdict

# Configuration
CONCURRENT_REQUESTS = 50  # Number of parallel requests
TIMEOUT = 8  # Seconds per request
INPUT_FILE = "/Users/neog/Downloads/dominios_com_br_raiz.txt"
OUTPUT_DIR = Path("/Users/neog/Downloads/bank_results")

# Bank detection keywords with scores
BANK_KEYWORDS = {
    # High priority (25 points)
    "pix": 25,
    "chave pix": 25,
    "conta digital": 25,
    "banco digital": 25,
    "abrir conta": 25,

    # Medium priority (15 points)
    "ted": 15,
    "doc": 15,
    "fintech": 15,
    "neobank": 15,
    "transferência bancária": 15,
    "abertura de conta": 15,
    "conta corrente": 15,
    "conta pj": 15,
    "instituição de pagamento": 15,

    # Lower priority (10 points)
    "boleto": 10,
    "cartão de crédito": 10,
    "cartão de débito": 10,
    "pagamento instantâneo": 10,
    "open banking": 10,
    "api bancária": 10,
    "baas": 10,
    "banking as a service": 10,
    "bacen": 10,

    # Basic (5 points)
    "transferência": 5,
    "transferencia": 5,
    "pagamento": 5,
    "saldo": 5,
    "extrato": 5,
    "fatura": 5,
    "cashback": 5,
}
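
# A page's score is the sum of points for every keyword found in its HTML,
# clamped to 0-100; >= 30 classifies the site as a bank, >= 15 as a potential bank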

# Negative keywords (reduce score)
NEGATIVE_KEYWORDS = {
    "loja online": -20,
    "e-commerce": -15,
    "produtos": -10,
    "comprar agora": -10,
    "carrinho": -10,
    "frete grátis": -10,
}

# Betting/gambling keywords - these sites are NOT banks
BETTING_KEYWORDS = [
    "aposta", "apostas", "bet", "bets", "betting", "casino", "cassino",
    "slots", "poker", "roleta", "roulette", "jackpot", "spin", "gambl",
    "lucky", "fortune", "prize", "prêmio", "premiações", "rifa", "rifas",
    "loteria", "loterias", "jogos online", "gaming", "odds", "esportivas",
    "plataforma de apostas", "casa de apostas", "ganhe r$", "cadastrar"
]
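
# Betting keywords are matched as plain substrings against the page HTML, the
# <title> and the domain itself (see check_domain), so short stems like "gambl"
# also catch "gambling" and "gamble"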

class BankFinder:
    def __init__(self):
        self.results = {
            "banks": [],
            "potential_banks": [],
            "online": [],
            "offline": [],
            "errors": [],
        }
        self.stats = defaultdict(int)
        self.start_time = None
        self.total_domains = 0

    async def check_domain(self, session, domain, semaphore):
        """Check a single domain for bank keywords"""
        async with semaphore:
            result = {
                "domain": domain,
                "online": False,
                "https": False,
                "title": None,
                "score": 0,
                "keywords": [],
                "is_bank": False,
            }
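            # "error" and "is_betting" keys are added below only when relevant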

            # Try HTTPS first, then HTTP
            for protocol in ["https", "http"]:
                url = f"{protocol}://{domain}"
                try:
                    async with session.get(url, timeout=aiohttp.ClientTimeout(total=TIMEOUT)) as response:
                        if response.status < 400:
                            result["online"] = True
                            result["https"] = protocol == "https"
                            result.pop("error", None)  # Drop any error left by a failed HTTPS attempt

                            # Read content
                            try:
                                html = await response.text(errors='ignore')
                                html_lower = html.lower()

                                # Extract title (search the original HTML so casing is preserved)
                                title_match = re.search(r'<title[^>]*>([^<]+)</title>', html, re.IGNORECASE)
                                if title_match:
                                    result["title"] = title_match.group(1).strip()[:100]

                                # Calculate bank score
                                score = 0
                                found_keywords = []

                                # Match whole words/phrases so short keywords such as
                                # "doc" (inside "doctype") or "ted" (inside "united")
                                # don't inflate the score
                                for keyword, points in BANK_KEYWORDS.items():
                                    if re.search(r'\b' + re.escape(keyword) + r'\b', html_lower):
                                        score += points
                                        found_keywords.append(keyword)

                                for keyword, points in NEGATIVE_KEYWORDS.items():
                                    if re.search(r'\b' + re.escape(keyword) + r'\b', html_lower):
                                        score += points  # Points are negative

                                # Check if it's a betting/gambling site - NOT a bank
                                is_betting = False
                                title_lower = (result.get("title") or "").lower()
                                for betting_kw in BETTING_KEYWORDS:
                                    if betting_kw in html_lower or betting_kw in title_lower or betting_kw in domain.lower():
                                        is_betting = True
                                        result["is_betting"] = True
                                        break

                                score = max(0, min(100, score))
                                result["score"] = score
                                result["keywords"] = found_keywords[:10]
                                result["is_bank"] = score >= 30 and not is_betting

                            except Exception as e:
                                result["error"] = f"Parse error: {str(e)[:50]}"

                            break  # Success, don't try HTTP

                except asyncio.TimeoutError:
                    result["error"] = "timeout"
                except aiohttp.ClientError as e:
                    result["error"] = str(e)[:50]
                except Exception as e:
                    result["error"] = str(e)[:50]

            return result

    async def process_batch(self, domains, batch_num, total_batches):
        """Process a batch of domains"""
        ssl_ctx = ssl.create_default_context()
        ssl_ctx.check_hostname = False
        ssl_ctx.verify_mode = ssl.CERT_NONE

        connector = aiohttp.TCPConnector(ssl=ssl_ctx, limit=CONCURRENT_REQUESTS)
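        # The semaphore mirrors the connector limit so at most CONCURRENT_REQUESTS
        # requests are in flight at any time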
        semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)

        async with aiohttp.ClientSession(
            connector=connector,
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"}
        ) as session:
            tasks = [self.check_domain(session, domain, semaphore) for domain in domains]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            for result in results:
                if isinstance(result, Exception):
                    self.stats["errors"] += 1
                    self.results["errors"].append(str(result)[:100])
                    continue

                if result["online"]:
                    self.stats["online"] += 1
                    self.results["online"].append(result["domain"])

                    if result["is_bank"]:
                        self.stats["banks"] += 1
                        self.results["banks"].append(result)
                        print(f"  🏦 BANK FOUND: {result['domain']} (score: {result['score']})")
                    elif result["score"] >= 15 and not result.get("is_betting"):
                        self.stats["potential"] += 1
                        self.results["potential_banks"].append(result)
                else:
                    self.stats["offline"] += 1
                    self.results["offline"].append(result["domain"])

                self.stats["processed"] += 1

        # Progress update
        elapsed = (datetime.now() - self.start_time).total_seconds()
        rate = self.stats["processed"] / elapsed if elapsed > 0 else 0
        eta = (self.total_domains - self.stats["processed"]) / rate if rate > 0 else 0

        print(f"\n📊 Batch {batch_num}/{total_batches} complete")
        print(f"   Processed: {self.stats['processed']:,} | Online: {self.stats['online']:,}")
        print(f"   🏦 Banks: {self.stats['banks']} | Potential: {self.stats['potential']}")
        print(f"   Rate: {rate:.1f}/sec | ETA: {eta/60:.1f} min")

    async def run(self, domains):
        """Main processing loop"""
        self.start_time = datetime.now()
        self.total_domains = len(domains)  # Used for the ETA estimate in process_batch

        print(f"\n{'='*60}")
        print(f"🔍 Bank Finder - Processing {len(domains):,} domains")
        print(f"{'='*60}")
        print(f"   Concurrent requests: {CONCURRENT_REQUESTS}")
        print(f"   Timeout: {TIMEOUT}s")
        print(f"   Started: {self.start_time.strftime('%H:%M:%S')}")
        print(f"{'='*60}\n")

        # Process in batches
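        # (each batch gets its own ClientSession and prints a progress report when it finishes)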
        batch_size = 500
        batches = [domains[i:i+batch_size] for i in range(0, len(domains), batch_size)]

        for i, batch in enumerate(batches, 1):
            await self.process_batch(batch, i, len(batches))

        # Final summary
        elapsed = (datetime.now() - self.start_time).total_seconds()

        print(f"\n{'='*60}")
        print(f"✅ COMPLETE!")
        print(f"{'='*60}")
        print(f"   Total processed: {self.stats['processed']:,}")
        print(f"   Online: {self.stats['online']:,}")
        print(f"   Offline: {self.stats['offline']:,}")
        print(f"   🏦 Banks found: {self.stats['banks']}")
        print(f"   Potential banks: {self.stats['potential']}")
        print(f"   Time: {elapsed/60:.1f} minutes")
        print(f"   Rate: {self.stats['processed']/elapsed:.1f} domains/sec")
        print(f"{'='*60}\n")

        return self.results

    def save_results(self):
        """Save results to files"""
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save banks (detailed JSON)
        banks_file = OUTPUT_DIR / f"banks_{timestamp}.json"
        with open(banks_file, 'w', encoding='utf-8') as f:
            json.dump(self.results["banks"], f, indent=2, ensure_ascii=False)
        print(f"💾 Saved banks to: {banks_file}")

        # Save banks (simple list)
        banks_txt = OUTPUT_DIR / f"banks_{timestamp}.txt"
        with open(banks_txt, 'w', encoding='utf-8') as f:
            for bank in sorted(self.results["banks"], key=lambda x: x["score"], reverse=True):
                f.write(f"{bank['domain']}\t{bank['score']}\t{bank.get('title', '')}\n")
        print(f"💾 Saved banks list to: {banks_txt}")

        # Save potential banks
        potential_file = OUTPUT_DIR / f"potential_banks_{timestamp}.txt"
        with open(potential_file, 'w', encoding='utf-8') as f:
            for bank in sorted(self.results["potential_banks"], key=lambda x: x["score"], reverse=True):
                f.write(f"{bank['domain']}\t{bank['score']}\t{','.join(bank['keywords'][:5])}\n")
        print(f"💾 Saved potential banks to: {potential_file}")

        # Save all online domains
        online_file = OUTPUT_DIR / f"online_{timestamp}.txt"
        with open(online_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.results["online"]))
        print(f"💾 Saved online domains to: {online_file}")

        # Save summary
        summary_file = OUTPUT_DIR / f"summary_{timestamp}.txt"
        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write(f"Bank Finder Results - {timestamp}\n")
            f.write(f"{'='*50}\n")
            f.write(f"Total processed: {self.stats['processed']:,}\n")
            f.write(f"Online: {self.stats['online']:,}\n")
            f.write(f"Banks found: {self.stats['banks']}\n")
            f.write(f"Potential banks: {self.stats['potential']}\n")
            f.write(f"\nTop Banks by Score:\n")
            f.write(f"{'-'*50}\n")
            for bank in sorted(self.results["banks"], key=lambda x: x["score"], reverse=True)[:50]:
                title = (bank.get('title') or '')[:30]
                f.write(f"{bank['score']:3d}  {bank['domain']:<40} {title}\n")
        print(f"💾 Saved summary to: {summary_file}")


async def main():
    # Load domains
    print(f"📂 Loading domains from: {INPUT_FILE}")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        domains = [line.strip().lower() for line in f if line.strip()]
    print(f"   Loaded {len(domains):,} domains")

    # Optional: limit for testing
    if len(sys.argv) > 1:
        limit = int(sys.argv[1])
        domains = domains[:limit]
        print(f"   Limited to first {limit:,} domains (testing mode)")

    # Run finder
    finder = BankFinder()
    await finder.run(domains)
    finder.save_results()


if __name__ == "__main__":
    asyncio.run(main())
