import logging
import sys
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from curl_cffi import requests
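
# Scrape every product from the fitnesstech.es "todos los productos" (all
# products) collection, paging until an empty page signals the end, and save
# the results to an Excel file.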

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler("product_scraper.log", mode='w', encoding='utf-8'),
                        logging.StreamHandler(sys.stdout)]
                    )

COLLECTION_URL = 'https://www.fitnesstech.es/collections/todos-los-productos?page='
MAX_RETRIES = 3
RETRY_DELAY = 2  # seconds


def scrape_products_from_page(soup, page_num):
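    """Extract product data from one parsed collection page.

    Returns a list of product dicts, an empty list if the grid container is
    missing or parsing fails, or None when the grid holds no items (the end
    of the collection).
    """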
    products = []

    try:
        # Find the grid container with products (matches class tokens in any order)
        grid_container = soup.select_one('div.grid.grid--uniform')

        if not grid_container:
            logging.warning(f"Page {page_num}: Grid container not found")
            return products

        # Select items that carry both the grid__item and grid-product classes
        product_items = grid_container.select('div.grid__item.grid-product')

        # If no products found, this is likely the end of the collection
        if not product_items:
            logging.info(f"Page {page_num}: No product items found - stopping iteration")
            return None

        for item in product_items:
            try:
                # Extract product ID from data-product-id attribute
                product_id = item.get('data-product-id')
                if not product_id:
                    logging.warning(f"Page {page_num}: Product ID not found in item")
                    continue

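                # 'Agotado' is Spanish for "sold out"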
                sold_out_element = item.find('div', class_='grid-product__content')
                if sold_out_element and 'Agotado' in sold_out_element.get_text():
                    logging.info(f"Product ID {product_id} is sold out")
                    in_stock = False
                else:
                    in_stock = True

                # Extract title from grid-product__title
                title_element = item.find('div', class_='grid-product__title')
                if not title_element:
                    logging.warning(f"Page {page_num}: Title not found for product {product_id}")
                    continue
                title = title_element.get_text().strip()

                # Extract link from grid-product__link
                link_element = item.find('a', class_='grid-product__link')
                if not link_element:
                    logging.warning(f"Page {page_num}: Link not found for product {product_id}")
                    continue
                # hrefs on collection pages are typically relative; make them absolute
                link = urljoin('https://www.fitnesstech.es', link_element.get('href'))

                # Add product to list
                product_data = {
                    'Title': title,
                    'ID': product_id,
                    'Link': link,
                    'In Stock': in_stock
                }
                products.append(product_data)
                logging.info(f"Page {page_num}: Found product - {title} (ID: {product_id})")

            except Exception as e_item:
                logging.error(f"Page {page_num}: Error processing product item - {str(e_item)}")
                continue

        logging.info(f"Page {page_num}: Extracted {len(products)} products")
        return products

    except Exception as e_page:
        logging.error(f"Page {page_num}: Error scraping page - {str(e_page)}")
        return products


def fetch_page_with_retry(session, url, page_num):
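    """Fetch a URL, retrying up to MAX_RETRIES times with RETRY_DELAY between attempts.

    Returns the response on success, or None if all attempts fail.
    """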
    for attempt in range(MAX_RETRIES):
        try:
            logging.info(f"Page {page_num}: Fetching (attempt {attempt + 1}/{MAX_RETRIES})")
            response = session.get(url, timeout=30)
            response.raise_for_status()
            return response
        except Exception as e_fail:
            logging.error(f"Page {page_num}: Attempt {attempt + 1} failed - {str(e_fail)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)
    return None


def save_to_excel(products_df, filename="products.xlsx"):
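    """Write the products DataFrame to an Excel file; return True on success."""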
    try:
        products_df.to_excel(filename, index=False)
        logging.info(f"Products saved to {filename}")
        return True
    except Exception as e:
        logging.error(f"Error saving to Excel: {str(e)}")
        return False


def main():
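    """Page through the collection, scraping products until the end is reached."""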
    logging.info("Program started")

    # Create a session that reuses connections and impersonates Chrome's TLS fingerprint
    session = requests.Session(impersonate='chrome')

    # Collect product dicts; the DataFrame is built once after scraping finishes
    all_products = []

    page = 1
    consecutive_errors = 0
    max_consecutive_errors = 3

    try:
        while True:
            try:
                # Construct URL for current page
                current_url = f"{COLLECTION_URL}{page}"
                logging.info(f"Processing page {page}: {current_url}")

                # Fetch page with retry mechanism using session
                response = fetch_page_with_retry(session, current_url, page)
                if not response:
                    consecutive_errors += 1
                    logging.error(f"Failed to fetch page {page}")

                    if consecutive_errors >= max_consecutive_errors:
                        logging.error(f"Too many consecutive errors ({consecutive_errors}). Stopping.")
                        break

                    page += 1
                    continue

                # Parse HTML
                soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), 'html.parser')

                # Extract products from current page
                page_products = scrape_products_from_page(soup, page)

                # Check if we've reached the end (no products found)
                if page_products is None:
                    logging.info("Reached end of collection - stopping iteration")
                    break

                # Add products to our collection
                if page_products:
                    all_products.extend(page_products)
                    consecutive_errors = 0  # Reset error counter on success

                    logging.info(f"Total products found so far: {len(all_products)}")
                else:
                    # An empty list means the grid container was missing; count it
                    # as an error so a layout change cannot loop forever
                    consecutive_errors += 1
                    logging.warning(f"No products found on page {page}")
                    if consecutive_errors >= max_consecutive_errors:
                        logging.error(f"Too many consecutive empty pages ({consecutive_errors}). Stopping.")
                        break

                # Move to next page
                page += 1
                time.sleep(2)

            except KeyboardInterrupt:
                logging.info("Scraping interrupted by user")
                break
            except Exception as e:
                consecutive_errors += 1
                logging.error(f"Unexpected error on page {page}: {str(e)}")

                if consecutive_errors >= max_consecutive_errors:
                    logging.error(f"Too many consecutive errors ({consecutive_errors}). Stopping.")
                    break

                page += 1

    finally:
        # Close the session
        session.close()
        logging.info("Session closed")

    # Final summary
    logging.info(f"Scraping completed. Total products found: {len(all_products)}")
    logging.info(f"Pages processed: {page - 1}")

    # Save results
    if all_products:
        # Build the DataFrame once, then remove duplicates based on ID
        products_df = pd.DataFrame(all_products, columns=['Title', 'ID', 'Link', 'In Stock'])
        products_df = products_df.drop_duplicates(subset=['ID'], keep='first')
        logging.info(f"After removing duplicates: {len(products_df)} unique products")

        # Save to Excel
        if save_to_excel(products_df):
            logging.info("Results saved successfully")

        # Display summary
        print(f"\n=== SCRAPING SUMMARY ===")
        print(f"Total unique products: {len(products_df)}")
        print(f"Pages processed: {page - 1}")

        return products_df
    else:
        logging.warning("No products were found during scraping")
        return pd.DataFrame()


if __name__ == "__main__":
    try:
        products = main()
    except Exception as e:
        logging.error(f"Fatal error: {str(e)}")
        sys.exit(1)
