import os
import re
import time
import logging
from datetime import datetime
from urllib.parse import urlparse

import requests
import mysql.connector
import textract
import pytesseract
from bs4 import BeautifulSoup
from docx import Document
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
from email_validator import validate_email, EmailNotValidError
from dotenv import load_dotenv
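# Optional: the HTTP calls below use verify=False, so silence the repeated
# InsecureRequestWarning output that would otherwise flood the console.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)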

load_dotenv()
# --- Setup logs ---
log_folder = "URLScraplogs"
os.makedirs(log_folder, exist_ok=True)
log_filename = os.path.join(log_folder, datetime.now().strftime('%Y-%m-%d') + ".log")
logging.basicConfig(filename=log_filename, level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')

# --- Strict email regex (extract_emails() below uses a looser pattern plus validation) ---
EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

# --- DB Config ---
DB_CONFIG = {
    'host': os.getenv("HOST"),
    'user': os.getenv("USER"),
    'password': os.getenv("PASSWORD"),
    'database': os.getenv("DATABASE")
}

def clean_email(email_str):
    try:
        v = validate_email(email_str, check_deliverability=False)
        return v.email
    except EmailNotValidError:
        return None


# --- DB Connection ---
def get_db_connection():
    return mysql.connector.connect(**DB_CONFIG)

# --- Fetch one pending URL ---
def fetch_pending_url():
    conn = get_db_connection()
    cursor = conn.cursor(dictionary=True)
    cursor.execute("""
        SELECT id, query, url, retry FROM google_search_results
        WHERE status = 'PENDING' AND retry < 2
        ORDER BY scraped_at ASC LIMIT 1
    """)
    row = cursor.fetchone()
    cursor.close()
    conn.close()
    return row

def decode_cf_email(encoded_str):
    """Decode Cloudflare email protection (data-cfemail)."""
    try:
        r = int(encoded_str[:2], 16)
        email = ''.join(
            chr(int(encoded_str[i:i + 2], 16) ^ r)
            for i in range(2, len(encoded_str), 2)
        )
        return email
    except Exception as e:
        logging.warning(f"❌ Failed to decode Cloudflare email: {e}")
        return None

def get_download_path(url):
    today = datetime.now().strftime('%Y-%m-%d')
    folder = os.path.join("downloads", today)
    os.makedirs(folder, exist_ok=True)
    filename = os.path.basename(urlparse(url).path) or f"file_{int(time.time())}"
    filepath = os.path.join(folder, filename)
    return filepath


# --- Update DB with result ---
def update_result(id, email=None):
    conn = get_db_connection()
    cursor = conn.cursor()
    if email:
        sql = "UPDATE google_search_results SET email=%s, status='DONE' WHERE id=%s"
        cursor.execute(sql, (email, id))
        logging.info(f"✅ Email found and updated for ID {id}: {email}")
    else:
        sql = "UPDATE google_search_results SET retry = retry + 1 WHERE id=%s"
        cursor.execute(sql, (id,))
        logging.info(f"❌ No email found, retry incremented for ID {id}")
    conn.commit()
    cursor.close()
    conn.close()

# --- Extract emails from text ---

def extract_emails(text):
    # Step 1: Match loose patterns that may be followed by junk
    raw_emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+(?:\.[a-zA-Z]{2,})?", text)
    
    cleaned = set()
    suffix_garbage = ['main', 'education', 'art', 'contact', 'price', 'made']

    for email in raw_emails:
        # Step 2: Trim garbage suffixes if stuck to email
        for garbage in suffix_garbage:
            if email.lower().endswith(garbage):
                email = email[: -len(garbage)]

        # Step 3: Ensure it ends with valid domain (.com, .net, etc.)
        if not re.search(r'\.[a-zA-Z]{2,}$', email):
            continue

        # Step 4: Validate
        try:
            valid = validate_email(email, check_deliverability=False)
            cleaned.add(valid.email)
        except EmailNotValidError:
            continue

    return list(cleaned)


# --- Process a webpage ---
def process_webpage(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        }
        resp = requests.get(url, headers=headers, timeout=15, verify=False)
        if "text/html" in resp.headers.get("Content-Type", ""):
            soup = BeautifulSoup(resp.text, 'html.parser')

            # 1. Decode Cloudflare-protected emails
            cf_emails = set()
            for tag in soup.select('[data-cfemail]'):  # spans as well as anchors can carry the encoded address
                encoded = tag.get('data-cfemail')
                if encoded:
                    decoded_email = decode_cf_email(encoded)
                    if decoded_email:
                        cf_emails.add(decoded_email)

            # 2. Extract emails from mailto: links
            mailto_emails = {
                a['href'].replace('mailto:', '').split('?')[0]
                for a in soup.find_all('a', href=True)
                if a['href'].lower().startswith('mailto:')
            }
            
            all_found = cf_emails.union(mailto_emails)

            # 3. Fallback: extract from page text if no email yet
            if not all_found:
                text_emails = extract_emails(soup.get_text())
                all_found.update(text_emails)

            return list(all_found)
    except Exception as e:
        logging.error(f"🌐 Error processing webpage {url}: {e}")
    return []

# --- Process PDF file ---
def process_pdf(url):
    try:
        filepath = get_download_path(url)
        resp = requests.get(url, timeout=15, verify=False)
        resp.raise_for_status()
        with open(filepath, "wb") as f:
            f.write(resp.content)

        # Try text extraction with pdfminer
        text = extract_text(filepath)
        emails = extract_emails(text)

        if emails:
            return emails

        # Fallback to OCR if no emails from text
        logging.info(f"📸 Running OCR on PDF: {filepath}")
        images = convert_from_path(filepath, dpi=300)
        ocr_text = ""
        for image in images:
            ocr_text += pytesseract.image_to_string(image)

        return extract_emails(ocr_text)

    except Exception as e:
        logging.error(f"📄 Error processing PDF {url}: {e}")
    return []


def fallback_docx_text(filepath):
    try:
        doc = Document(filepath)
        return "\n".join(p.text for p in doc.paragraphs)
    except Exception as e:
        logging.warning(f"⚠️ Fallback failed using python-docx: {e}")
        return ""

# --- Process DOCX file ---
def process_docx(url):
    try:
        filepath = get_download_path(url)
        resp = requests.get(url, timeout=15, verify=False)
        resp.raise_for_status()
        with open(filepath, "wb") as f:
            f.write(resp.content)

        logging.info(f"📝 Extracting text using textract: {filepath}")
        text = textract.process(filepath).decode("utf-8", errors="ignore")

        if not text.strip() and filepath.lower().endswith(".docx"):
            logging.warning(f"🛑 Textract returned empty, using fallback for: {filepath}")
            text = fallback_docx_text(filepath)

        return extract_emails(text)

    except Exception as e:
        logging.error(f"📝 Error processing DOC/DOCX {url}: {e}")
    return []

# --- Main processor ---
def process_url(entry):
    entry_id = entry["id"]
    url = entry["url"]
    parsed = urlparse(url)
    ext = os.path.splitext(parsed.path)[1].lower()

    logging.info(f"🔍 Processing ID {id}: {url}")

    if ext == ".pdf":
        emails = process_pdf(url)
    elif ext in [".docx", ".doc"]:
        emails = process_docx(url)
    else:
        emails = process_webpage(url)

    if emails:
        update_result(entry_id, emails[0])  # Save first found email
    else:
        update_result(entry_id)

# --- Continuous runner ---
def main():
    while True:
        entry = fetch_pending_url()
        if entry:
            process_url(entry)
            time.sleep(3)
        else:
            logging.info("🕐 No URLs to process. Sleeping...")
            time.sleep(20)

if __name__ == "__main__":
    main()
