import logging
import sys
import pandas as pd
from datetime import datetime
from curl_cffi import requests
from bs4 import BeautifulSoup
import time
import os
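
# This script walks the Gymleco "all products" collection page by page,
# collects each product's Title, ID and Link, and writes the result to
# gymleco/products.xlsx. The log and Excel paths are relative, so run it
# from the directory that contains (or should contain) the gymleco/ folder.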

# Make sure the output directory for the log file and the Excel export exists
os.makedirs("gymleco", exist_ok=True)

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler("gymleco/product_scraper.log", mode='w', encoding='utf-8'),
                        logging.StreamHandler(sys.stdout)]
                    )

COLLECTION_URL = 'https://gymleco.com/collections/all?page='
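# The collection paginates via Shopify's ?page=N query parameter; the page
# number is appended to COLLECTION_URL for each request in main()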
MAX_RETRIES = 3
RETRY_DELAY = 2  # seconds


def scrape_products_from_page(soup, page_num):
    """
    Extract product information from a single page
    Returns list of dictionaries with product data
    """
    products = []

    try:
        # Find the container
        container = soup.find('div', class_='container')
        if not container:
            # Try alternative container searches
            container = soup.find('div', attrs={'class': lambda x: x and 'container' in x})
            if not container:
                logging.warning(f"Page {page_num}: Container not found")
                # Debug: print available div classes
                divs = soup.find_all('div', limit=20)
                classes = [div.get('class', []) for div in divs if div.get('class')]
                logging.debug(f"Page {page_num}: Available div classes: {classes[:10]}")
                return products

        # Find the collection listing - simplified to the working approach
        collection_listing = soup.find('div', attrs={'class': lambda x: x and 'collection-listing' in x})

        if not collection_listing:
            logging.warning(f"Page {page_num}: Collection listing not found")
            # Debug: print container structure
            if container:
                child_divs = container.find_all('div', recursive=False)
                logging.debug(f"Page {page_num}: Container has {len(child_divs)} direct child divs")
                for i, div in enumerate(child_divs[:3]):
                    logging.debug(f"Page {page_num}: Child div {i}: classes={div.get('class', [])}")
            return products

        # Find all product blocks within collection listing
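        # (<product-block> appears to be a custom element used by the shop's
        # Shopify theme; each one wraps a single product card)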
        product_blocks = collection_listing.find_all('product-block', class_='product-block')

        # Check for "No products found" message
        no_products_elements = collection_listing.find_all('h5', class_='align-centre fully-spaced-row')
        for element in no_products_elements:
            if 'No products found' in element.get_text():
                logging.info(f"Page {page_num}: No products found - reached end of collection")
                return None  # Signal to stop iteration

        # Also check for other "no products" indicators
        if not product_blocks:
            # Look for any indication this might be an empty page
            if soup.find(string=lambda text: text and 'no products' in text.lower()):
                logging.info(f"Page {page_num}: Empty page detected - stopping")
                return None

        for block in product_blocks:
            try:
                # Extract product ID from data-product-id attribute
                product_id = block.get('data-product-id')
                if not product_id:
                    logging.warning(f"Page {page_num}: Product ID not found in block")
                    continue

                # Extract title from product-block__title
                title_element = block.find('div', class_='product-block__title')
                if not title_element:
                    logging.warning(f"Page {page_num}: Title not found for product {product_id}")
                    continue
                title = title_element.get_text().strip()

                # Extract link from product-link href
                link_element = block.find('a', class_='product-link')
                if not link_element:
                    logging.warning(f"Page {page_num}: Link not found for product {product_id}")
                    continue
                link = link_element.get('href')

                # Add product to list
                product_data = {
                    'Title': title,
                    'ID': product_id,
                    'Link': link
                }
                products.append(product_data)
                logging.info(f"Page {page_num}: Found product - {title} (ID: {product_id})")

            except Exception as e:
                logging.error(f"Page {page_num}: Error processing product block - {str(e)}")
                continue

        logging.info(f"Page {page_num}: Extracted {len(products)} products")
        return products

    except Exception as e:
        logging.error(f"Page {page_num}: Error scraping page - {str(e)}")
        return products


def fetch_page_with_retry(session, url, page_num):
    """
    Fetch a page with retry mechanism using session
    """
    for attempt in range(MAX_RETRIES):
        try:
            logging.info(f"Page {page_num}: Fetching (attempt {attempt + 1}/{MAX_RETRIES})")
            response = session.get(url, timeout=30)
            response.raise_for_status()
            return response
        except Exception as e:
            logging.error(f"Page {page_num}: Attempt {attempt + 1} failed - {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)
            else:
                logging.error(f"Page {page_num}: All attempts failed")
                return None


def save_to_excel(products_df, filename="gymleco/products.xlsx"):
    """
    Save products dataframe to Excel file
    """
    try:
        products_df.to_excel(filename, index=False)
        logging.info(f"Products saved to {filename}")
        return True
    except Exception as e:
        logging.error(f"Error saving to Excel: {str(e)}")
        return False


def main():
    logging.info("Program started")
    now = datetime.now()
    formatted_date = now.strftime("%Y-%m-%d %H:%M")

    # Create a session that reuses connections and impersonates a Chrome
    # browser's TLS/HTTP fingerprint via curl_cffi's impersonate option
    session = requests.Session(impersonate='chrome')

    # Initialize products list and dataframe
    all_products = []
    products_df = pd.DataFrame(columns=['Title', 'ID', 'Link'])

    page = 1
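    # Track failures in a row; the crawl aborts once this hits max_consecutive_errors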
    consecutive_errors = 0
    max_consecutive_errors = 3

    try:
        while True:
            try:
                # Construct URL for current page
                current_url = f"{COLLECTION_URL}{page}"
                logging.info(f"Processing page {page}: {current_url}")

                # Fetch page with retry mechanism using session
                response = fetch_page_with_retry(session, current_url, page)
                if not response:
                    consecutive_errors += 1
                    logging.error(f"Failed to fetch page {page}")

                    if consecutive_errors >= max_consecutive_errors:
                        logging.error(f"Too many consecutive errors ({consecutive_errors}). Stopping.")
                        break

                    page += 1
                    continue

                # Parse HTML
                soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), 'html.parser')

                # Extract products from current page
                page_products = scrape_products_from_page(soup, page)

                # Check if we've reached the end (no products found)
                if page_products is None:
                    logging.info("Reached end of collection - stopping iteration")
                    break

                # Add products to our collection
                if page_products:
                    all_products.extend(page_products)
                    consecutive_errors = 0  # Reset error counter on success

                    # Update dataframe
                    page_df = pd.DataFrame(page_products)
                    products_df = pd.concat([products_df, page_df], ignore_index=True)

                    logging.info(f"Total products found so far: {len(all_products)}")
                else:
                    # Treat an empty page like an error so the loop cannot run forever
                    consecutive_errors += 1
                    logging.warning(f"No products found on page {page}")
                    if consecutive_errors >= max_consecutive_errors:
                        logging.error(f"Too many consecutive empty pages ({consecutive_errors}). Stopping.")
                        break

                # Move to next page
                page += 1
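                # Brief pause between pages to stay polite to the server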
                time.sleep(2)

            except KeyboardInterrupt:
                logging.info("Scraping interrupted by user")
                break
            except Exception as e:
                consecutive_errors += 1
                logging.error(f"Unexpected error on page {page}: {str(e)}")

                if consecutive_errors >= max_consecutive_errors:
                    logging.error(f"Too many consecutive errors ({consecutive_errors}). Stopping.")
                    break

                page += 1

    finally:
        # Close the session
        session.close()
        logging.info("Session closed")

    # Final summary
    logging.info(f"Scraping completed. Total products found: {len(all_products)}")
    logging.info(f"Pages processed: {page - 1}")

    # Save results
    if len(all_products) > 0:
        # Remove duplicates based on ID
        products_df = products_df.drop_duplicates(subset=['ID'], keep='first')
        logging.info(f"After removing duplicates: {len(products_df)} unique products")

        # Save to Excel
        if save_to_excel(products_df):
            logging.info("Results saved successfully")

        # Display summary
        print(f"\n=== SCRAPING SUMMARY ===")
        print(f"Total unique products: {len(products_df)}")
        print(f"Pages processed: {page - 1}")

        return products_df
    else:
        logging.warning("No products were found during scraping")
        return pd.DataFrame()


if __name__ == "__main__":
    try:
        products = main()
    except Exception as e:
        logging.error(f"Fatal error: {str(e)}")
        sys.exit(1)