import random
import time
from urllib.parse import quote_plus

import mysql.connector
import requests
import undetected_chromedriver as uc
from fake_useragent import UserAgent
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Optional rotating proxies, as "ip:port" strings. When non-empty, one entry
# is picked at random per browser session (see get_organic_google_results).
PROXIES = [
    # "ip:port",
]

# 2Captcha API key used by solve_captcha — replace this placeholder before
# running, or CAPTCHA solving will always fail.
CAPTCHA_API_KEY = "YOUR_2CAPTCHA_API_KEY"

def insert_into_db(query, url, page_number):
    """Insert one scraped search result into the google_search_results table.

    Opens a short-lived connection per call. Cleanup runs in ``finally`` so a
    failure between ``connect()`` and the close calls cannot leak the
    connection or cursor (the original leaked both on any mid-flight error).

    Args:
        query: The search query string the link was found for.
        url: The result link to store.
        page_number: 1-based SERP page the link appeared on.
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(
            host="localhost",
            user="root",
            password="",
            database="web_scraper"
        )
        cursor = conn.cursor()
        # Parameterized statement — values are never interpolated into SQL.
        sql = "INSERT INTO google_search_results (query, url, page_number) VALUES (%s, %s, %s)"
        cursor.execute(sql, (query, url, page_number))
        conn.commit()
    except mysql.connector.Error as err:
        # Best-effort persistence: log and carry on scraping.
        print(f"❌ DB Error: {err}")
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

def human_scroll(driver):
    """Scroll the page in three steps (1/3, 1/2, bottom) with random pauses.

    A fresh pause is drawn for every step: the original sampled one value and
    reused it, producing three identical delays — exactly the kind of uniform
    timing bot-detection looks for.

    Args:
        driver: Selenium/undetected_chromedriver WebDriver for the open page.
    """
    for target in ("document.body.scrollHeight/3",
                   "document.body.scrollHeight/2",
                   "document.body.scrollHeight"):
        driver.execute_script(f"window.scrollTo(0, {target});")
        time.sleep(random.uniform(0.5, 1.5))

def solve_captcha(driver):
    """Solve a Google reCAPTCHA interstitial via the 2Captcha service.

    Extracts the sitekey from the reCAPTCHA iframe, submits it to 2Captcha,
    polls for the solution token (up to ~2 minutes), injects the token into
    the page, and submits the Google "sorry" form.

    Args:
        driver: WebDriver currently sitting on the CAPTCHA page.

    Returns:
        bool: True if a token was obtained and submitted, False otherwise.
    """
    print("🛡️ Solving CAPTCHA...")

    try:
        # Target the reCAPTCHA widget explicitly — the first <iframe> on the
        # page is not guaranteed to be the captcha (the original assumed it was).
        iframe = driver.find_element(By.CSS_SELECTOR, "iframe[src*='recaptcha']")
        src = iframe.get_attribute("src")
        if "k=" in src:
            # Sitekey travels in the iframe URL as the "k" query parameter.
            sitekey = src.split("k=")[1].split("&")[0]
        else:
            print("❌ Sitekey not found.")
            return False
    except Exception as e:
        print(f"❌ Error finding sitekey: {e}")
        return False

    current_url = driver.current_url
    print(f"🌐 Sitekey: {sitekey}")
    print(f"🌐 URL: {current_url}")

    payload = {
        'key': CAPTCHA_API_KEY,
        'method': 'userrecaptcha',
        'googlekey': sitekey,
        'pageurl': current_url,
        'json': 1
    }
    # Explicit timeout so a stalled 2Captcha endpoint cannot hang the scraper.
    response = requests.post('http://2captcha.com/in.php', data=payload, timeout=30).json()

    if response['status'] != 1:
        print(f"❌ 2Captcha error: {response}")
        return False

    captcha_id = response['request']
    print(f"🧠 CAPTCHA ID: {captcha_id}")

    solution = None
    for _ in range(24):  # 24 polls x 5 s ≈ 2 minutes maximum wait
        time.sleep(5)
        res = requests.get(
            f"http://2captcha.com/res.php?key={CAPTCHA_API_KEY}&action=get&id={captcha_id}&json=1",
            timeout=30,
        ).json()
        if res['status'] == 1:
            solution = res['request']
            break
        if res['request'] != 'CAPCHA_NOT_READY':
            # Hard failure (e.g. ERROR_CAPTCHA_UNSOLVABLE) — the original kept
            # polling the full 2 minutes for a result that would never come.
            print(f"❌ 2Captcha error: {res['request']}")
            return False
        print("⌛ Waiting for CAPTCHA solution...")

    if not solution:
        print("❌ CAPTCHA solving failed.")
        return False

    print("✅ CAPTCHA Solved!")

    # Inject the token into the hidden textarea reCAPTCHA validates against.
    driver.execute_script("""
    document.getElementById("g-recaptcha-response").innerHTML = arguments[0];
    """, solution)
    time.sleep(2)

    # Submit whichever form belongs to Google's /sorry interstitial.
    driver.execute_script("""
    for (const form of document.forms) {
        if (form.action.includes('sorry')) {
            form.submit();
            break;
        }
    }
    """)
    print("🚀 CAPTCHA submitted, waiting...")
    time.sleep(8)
    return True

def scrape_current_page(driver, query, current_page):
    """Scroll the current SERP, collect organic result links, store them.

    Args:
        driver: WebDriver sitting on a Google results page.
        query: The search query (stored alongside each link).
        current_page: 1-based page number (stored alongside each link).

    Returns:
        bool: True if at least the result-locator wait succeeded, False if the
        page yielded no results (caller stops pagination).
    """
    human_scroll(driver)

    try:
        result_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//a[@data-ved and contains(@href, 'http')]"))
        )
    except Exception:
        # Narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit and made the scraper unkillable here.
        print(f"⚠️ No results found on page {current_page}.")
        return False

    page_links = [el.get_attribute("href") for el in result_elements if el.get_attribute("href")]
    print(f"✅ Page {current_page}: {len(page_links)} links found.")

    for link in page_links:
        insert_into_db(query, link, current_page)

    return True

def get_organic_google_results(query, num_pages=10):
    """Scrape organic Google result links for *query* across SERP pages.

    Launches an undetected Chrome session with a randomized user agent and
    window size (optionally behind a random proxy), walks up to *num_pages*
    result pages, stores every link via insert_into_db, and hands CAPTCHA
    interstitials to solve_captcha.

    Args:
        query: Raw search query (may contain quotes, parentheses, operators).
        num_pages: Maximum number of result pages to visit. Defaults to 10.
    """
    try:
        ua = UserAgent()
        user_agent = ua.random
    except Exception:
        # fake_useragent can fail (e.g. its data file is unreachable);
        # fall back to a small pool of realistic desktop user agents.
        fallback_user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
        ]
        user_agent = random.choice(fallback_user_agents)
    options = uc.ChromeOptions()
    options.headless = False
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--lang=en-US,en;q=0.9")
    options.add_argument("--disable-blink-features=AutomationControlled")
    # Randomized window size: one more fingerprinting countermeasure.
    options.add_argument(f"--window-size={random.randint(1024,1920)},{random.randint(768,1080)}")

    if PROXIES:
        proxy = random.choice(PROXIES)
        print(f"🌐 Using proxy: {proxy}")
        options.add_argument(f'--proxy-server=http://{proxy}')

    with uc.Chrome(options=options, use_subprocess=True) as driver:
        driver.delete_all_cookies()

        # quote_plus percent-encodes quotes, parentheses, colons, etc. — the
        # previous space→'+' replacement left all of those raw in the URL,
        # corrupting dork-style queries like the one this script runs.
        encoded_query = quote_plus(query)
        search_url = f"https://www.google.com/search?q={encoded_query}&hl=en&gl=us&num=10"
        print(f"🌐 Opening: {search_url}")

        driver.get(search_url)

        current_page = 1
        while current_page <= num_pages:
            # Randomized think-time before touching each page.
            time.sleep(random.uniform(5, 9))

            # CAPTCHA detection
            if "sorry" in driver.current_url or "recaptcha" in driver.page_source.lower():
                print("❌ CAPTCHA detected!")
                if solve_captcha(driver):
                    print("✅ CAPTCHA solved, continuing...")
                    # Re-enter the loop to re-check the (now reloaded) page.
                    continue
                else:
                    print("❌ CAPTCHA solving failed, exiting...")
                    # Dump the page so the failure can be diagnosed offline.
                    with open("captcha_failed_page.html", "w", encoding="utf-8") as f:
                        f.write(driver.page_source)
                    break

            success = scrape_current_page(driver, query, current_page)
            if not success:
                break

            # Try to click "Next" or move to the next page
            try:
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "a.fl"))
                )

                pagination_links = driver.find_elements(By.CSS_SELECTOR, "a.fl")

                # Prefer the link whose aria-label marks it as Next/Page N.
                next_page_link = None
                for link in pagination_links:
                    aria_label = link.get_attribute('aria-label')
                    if aria_label and ("Next" in aria_label or "Page" in aria_label):
                        next_page_link = link
                        break

                if next_page_link:
                    driver.execute_script("arguments[0].scrollIntoView(true);", next_page_link)
                    next_page_link.click()
                    print(f"➡️ Clicked Next to Page {current_page + 1}")
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "div#search"))
                    )
                    current_page += 1
                else:
                    print("❌ No Next page link found, ending pagination.")
                    break

            except Exception as e:
                print(f"⚠️ Pagination error: {e}")
                break

            time.sleep(random.uniform(6, 12))

# --- Run the scraper ---
# Guarded so importing this module (e.g. to reuse insert_into_db) does not
# launch a browser session as a side effect.
if __name__ == "__main__":
    get_organic_google_results(
        "(intitle:resume OR inurl:resume) (\"Sales manager\" OR \"regional sales manager\" OR \"sales director\") AND (\"CRM\" OR \"business development\" OR \"account management\") (California OR CA OR \"Los Angeles\") -job -jobs -sample -samples -templates",
        num_pages=10
    )
