import requests
import random
import mysql.connector
import time
from fake_useragent import UserAgent
from dotenv import load_dotenv
import os
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import quote_plus
# Load environment variables
load_dotenv(override=True)
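# The .env file is expected to provide the variables read below via os.getenv()
# (placeholder values shown here for reference only):
#   CAPTCHA_API=<your ScraperAPI key>
#   HOST=<mysql host>
#   USER=<mysql user>
#   PASSWORD=<mysql password>
#   DATABASE=<mysql database>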
CAPTCHA_API_KEY = os.getenv("CAPTCHA_API")  # ScraperAPI key used to proxy all Google requests
# Logging
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, f"{datetime.now().strftime('%Y-%m-%d')}.log")

def log(message):
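    """Print a timestamped message and append it to today's log file."""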
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    full_message = f"[{timestamp}] {message}"
    print(full_message)
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(full_message + "\n")
log(f"API KEy {CAPTCHA_API_KEY}")

def is_valid_result_url(url):
    """Return False for Google policy, product, or internal script-related URLs; True otherwise."""
    exclude_keywords = [
        "policies.google.com",
        "google.com/webhp",
        "google.it/intl",
        "support.google.com",
        "accounts.google.com",
        "/httpservice/retry/enablejs",
        "google.com/preferences",
        "google.com/setprefs",
        "google.com/advanced_search",
        "google.com/history",
        "google.com/search/howsearchworks",
        "google.com/intl/en/ads",  # avoid ad-related pages too
    ]
    for keyword in exclude_keywords:
        if keyword in url:
            return False
    return True
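
# --- Assumed DB schema ---
# The tables used below are assumed to already exist. Column names are taken from
# the INSERT/SELECT/UPDATE statements in this script; the types and defaults are
# only an illustrative sketch, not the authoritative schema.
#
#   CREATE TABLE resume_queries (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       query_text TEXT NOT NULL,
#       status VARCHAR(20) DEFAULT 'pending'   -- set to 'done' after scraping
#   );
#
#   CREATE TABLE google_search_results (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       query TEXT,
#       url TEXT,
#       page_number INT,
#       resume_query_id INT
#   );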


# --- DB Insert ---
def insert_into_db(query, url, page_number, resume_query_id):
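    """Insert a single scraped result row into google_search_results."""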
    try:
        conn = mysql.connector.connect(
            host=os.getenv("HOST"),
            user=os.getenv("USER"),
            password=os.getenv("PASSWORD"),
            database=os.getenv("DATABASE")
        )
        cursor = conn.cursor()
        sql = "INSERT INTO google_search_results (query, url, page_number,resume_query_id) VALUES (%s, %s, %s, %s)"
        cursor.execute(sql, (query, url, page_number,resume_query_id))
        conn.commit()
        cursor.close()
        conn.close()
    except mysql.connector.Error as err:
        log(f"❌ DB Error: {err}")

# --- DB Update Status ---
def update_query_status(query_id):
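    """Mark the given resume_queries row as 'done'."""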
    try:
        conn = mysql.connector.connect(
            host=os.getenv("HOST"),
            user=os.getenv("USER"),
            password=os.getenv("PASSWORD"),
            database=os.getenv("DATABASE")
        )
        cursor = conn.cursor()
        cursor.execute("UPDATE resume_queries SET status = 'done' WHERE id = %s", (query_id,))
        conn.commit()
        cursor.close()
        conn.close()
        log(f"✅ Updated status to 'done' for query ID {query_id}")
    except mysql.connector.Error as err:
        log(f"❌ DB Error (update): {err}")

# --- Get Pending Queries ---
def get_pending_queries():
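    """Return all resume_queries rows still marked 'pending', as dicts with 'id' and 'query_text'."""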
    try:
        conn = mysql.connector.connect(
            host=os.getenv("HOST"),
            user=os.getenv("USER"),
            password=os.getenv("PASSWORD"),
            database=os.getenv("DATABASE")
        )
        cursor = conn.cursor(dictionary=True)
        cursor.execute("SELECT id, query_text FROM resume_queries WHERE status = 'pending'")
        results = cursor.fetchall()
        cursor.close()
        conn.close()
        return results
    except mysql.connector.Error as err:
        log(f"❌ DB Error (fetch): {err}")
        return []

# --- ScraperAPI Request ---
def fetch_with_scraperapi(query, page_number):
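    """Fetch one Google results page for the (already URL-encoded) query via ScraperAPI; return raw HTML or None."""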
    user_agent = UserAgent().random
    start = page_number * 10
    search_url = f"https://www.google.com/search?q={query}&hl=en&gl=us&num=10&start={start}"

    payload = {
        'api_key': CAPTCHA_API_KEY,  
        'url': search_url,
        'user-agent': user_agent,
    }

    try:
        # Generous timeout: requests proxied through ScraperAPI can take a while to resolve.
        r = requests.get('https://api.scraperapi.com/', params=payload, timeout=70)
        if r.status_code == 200:
            return r.text
        else:
            log(f"❌ ScraperAPI error: {r.status_code}")
            return None
    except Exception as e:
        log(f"❌ Error fetching with ScraperAPI: {e}")
        return None

# --- Scrape Google Results ---
def scrape_google_results(query_text, query_id, num_pages=10):
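    """Scrape up to num_pages of Google results for one query, store valid links, then mark the query 'done'."""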
    encoded_query = quote_plus(query_text)  # URL-encode the query (handles '&', '#', etc., not just spaces)

    for page in range(num_pages):
        log(f"🌐 Query ID {query_id} - Fetching page {page + 1}...")
        html = fetch_with_scraperapi(encoded_query, page)
        
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            links = []

            result_elements = soup.find_all('a', href=True)
            for el in result_elements:
                href = el.get('href')
                if 'http' in href:
                    links.append(href)


            log(f"✅ Page {page + 1}: Found {len(links)} valid organic links.")

            for link in links:
                if is_valid_result_url(link):
                    insert_into_db(query_text, link, page + 1, query_id)
                else:
                    log(f"⚠️ Skipped irrelevant URL: {link}")


            time.sleep(random.uniform(5, 10))
        else:
            log(f"❌ No results for page {page + 1}, skipping.")
    
    update_query_status(query_id)

# --- Main Runner ---
def run_all_pending_queries():
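    """Fetch every pending query from the database and scrape each one in turn."""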
    pending_queries = get_pending_queries()
    if not pending_queries:
        log("🚫 No pending queries found.")
        return
    
    for q in pending_queries:
        log(f"🚀 Processing Query ID {q['id']}...")
        scrape_google_results(q['query_text'], q['id'])

# --- Run ---
if __name__ == "__main__":
    while True:
        log("🔁 Checking for new pending queries...")
        run_all_pending_queries()
        log("⏳ Sleeping for 5 minutes before next check...\n")
        time.sleep(300)  # Wait 5 minutes (300 seconds)
