import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import urllib.parse
import random
import mysql.connector
import requests
from dotenv import load_dotenv
import os
from datetime import datetime
from bs4 import BeautifulSoup

# Load environment variables
load_dotenv()
# NOTE: POSIX shells reject variable names starting with a digit; this key must come from .env.
CAPTCHA_API_KEY = os.getenv("2CAPTCHA_API")
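
if not CAPTCHA_API_KEY:
    # Fail fast: without a 2Captcha key, every CAPTCHA page would silently
    # fall through to scraping a blocked page.
    raise RuntimeError("2CAPTCHA_API is not set in the environment or .env file")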

# --- Logging Setup ---
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, f"{datetime.now().strftime('%Y-%m-%d')}.log")

def log(message):
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    full_message = f"[{timestamp}] {message}"
    print(full_message)
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(full_message + "\n")

# --- DB Insert ---
def insert_into_db(query, url, page_number):
    try:
        # Hardcoded local-dev credentials; move them into .env for anything
        # shared. A fresh connection per insert keeps things simple but is
        # slow at volume -- consider reusing one connection per page.
        conn = mysql.connector.connect(
            host="localhost",
            user="root",
            password="",
            database="web_scraper"
        )
        cursor = conn.cursor()
        sql = "INSERT INTO google_search_results (query, url, page_number) VALUES (%s, %s, %s)"
        cursor.execute(sql, (query, url, page_number))
        conn.commit()
        cursor.close()
        conn.close()
    except mysql.connector.Error as err:
        log(f"❌ DB Error: {err}")

# --- 2Captcha Request ---
def solve_captcha(site_key, page_url):
    log("🧩 Submitting CAPTCHA to 2Captcha...")

    payload = {
        "key": CAPTCHA_API_KEY,
        "method": "userrecaptcha",
        "googlekey": site_key,
        "pageurl": page_url,
        "json": 1
    }

    try:
        submit_resp = requests.post("https://2captcha.com/in.php", data=payload, timeout=30).json()

        if submit_resp.get("status") != 1:
            log(f"❌ Failed to submit CAPTCHA: {submit_resp}")
            return None

        captcha_id = submit_resp["request"]

        # Poll for up to ~100 s (20 tries x 5 s); reCAPTCHA solves usually
        # come back within 15-60 s.
        for _ in range(20):
            time.sleep(5)
            res = requests.get("https://2captcha.com/res.php", params={
                "key": CAPTCHA_API_KEY,
                "action": "get",
                "id": captcha_id,
                "json": 1
            }, timeout=30).json()

            if res.get("status") == 1:
                log("✅ CAPTCHA solved.")
                return res["request"]
            if res.get("request") != "CAPCHA_NOT_READY":
                # Any response other than "not ready" is a hard error.
                log(f"❌ 2Captcha error: {res}")
                return None
        log("❌ CAPTCHA solving timed out.")
    except Exception as e:
        log(f"❌ CAPTCHA solving error: {e}")
    return None

def inject_captcha_solution(driver, solution):
    # Reveal the hidden response textarea, then pass the token via
    # arguments[0] so special characters can't break the injected JS.
    driver.execute_script('document.getElementById("g-recaptcha-response").style.display = "block";')
    driver.execute_script('document.getElementById("g-recaptcha-response").value = arguments[0];', solution)
    time.sleep(2)
    driver.execute_script('document.querySelector("form").submit();')
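
# Note: this form-submit flow matches Google's /sorry interstitial. Sites that
# wire reCAPTCHA to a JS callback instead of a form ignore the textarea, and
# the token would have to be handed to that (site-specific) callback.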

# --- Scrape Google Results ---
def scrape_google_results(query, num_pages=10):
    # quote_plus() percent-encodes quotes, parentheses, colons, etc.;
    # a bare replace(' ', '+') would mangle operator-heavy queries.
    encoded_query = urllib.parse.quote_plus(query)
    
    # Create a Selenium WebDriver. Headless Chrome is much easier for Google
    # to fingerprint, so keep a visible window to reduce CAPTCHA frequency.
    options = uc.ChromeOptions()
    driver = uc.Chrome(options=options, use_subprocess=True)

    try:
        # Loop through pages (pagination handling)
        for page in range(num_pages):
            start = page * 10
            search_url = f"https://www.google.com/search?q={encoded_query}&hl=en&gl=us&num=10&start={start}"
            log(f"🌐 Opening: {search_url}")

            driver.get(search_url)
            time.sleep(random.uniform(6, 10))
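
            # A little human-like activity; whether this actually lowers
            # Google's bot heuristics is an assumption, but it costs nothing.
            driver.execute_script("window.scrollBy(0, arguments[0]);", random.randint(300, 900))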

            # Check for CAPTCHA (Google's /sorry interstitial)
            if "recaptcha" in driver.page_source.lower():
                try:
                    site_key = driver.find_element(By.CLASS_NAME, "g-recaptcha").get_attribute("data-sitekey")
                except NoSuchElementException:
                    log("❌ CAPTCHA detected but no g-recaptcha element/sitekey found.")
                    continue

                solution = solve_captcha(site_key, search_url)
                if solution:
                    inject_captcha_solution(driver, solution)
                    time.sleep(10)
                else:
                    log("⚠️ Skipping page: CAPTCHA not solved.")
                    continue

            # Parse the result page
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            links = []

            for el in soup.find_all('a', href=True):
                href = el['href']
                # Keep only absolute URLs to external sites; `'http' in href`
                # would also match Google's own /url?q=http... redirects and
                # its navigation links.
                if href.startswith('http') and 'google.' not in urllib.parse.urlparse(href).netloc:
                    links.append(href)

            log(f"✅ Page {page + 1}: Found {len(links)} links.")
            
            # Insert the found URLs into DB
            for link in links:
                insert_into_db(query, link, page + 1)

            time.sleep(random.uniform(8, 15))  # Sleep to avoid rate limits
    finally:
        driver.quit()

# --- Run ---
if __name__ == "__main__":
    scrape_google_results(
        '(intitle:resume OR inurl:resume) ("Sales manager" OR "regional sales manager" OR "sales director") AND ("CRM" OR "business development" OR "account management") (New York OR NYC OR "New York City") -job -jobs -sample -samples -templates',
        num_pages=10,
    )
