import html
import json
import logging
import sys
import time
from datetime import datetime
from zoneinfo import ZoneInfo
import pandas as pd
from bs4 import BeautifulSoup
from curl_cffi import requests
from deep_translator import GoogleTranslator

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler("product_details_scraper.log", mode='w', encoding='utf-8'),
                        logging.StreamHandler(sys.stdout)]
                    )

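# Number of fetch attempts per URL, and delay (seconds) between requests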
MAX_RETRIES = 3
TIME_DELAY = 2

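# When DEBUG is True, stop after DEBUG_LIMIT products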
DEBUG = False
DEBUG_LIMIT = 5


def escape_xml_content(text):
    """
    Escape special XML characters in text content
    Handles None values and ensures string output
    """
    if text is None:
        return ""

    # Convert to string and strip whitespace
    text = str(text).strip()

    # Remove control characters (except tab, newline, carriage return)
    cleaned_text = ''.join(char for char in text if ord(char) >= 32 or char in '\t\n\r')

    # Collapse runs of whitespace (including the preserved tabs/newlines) into single spaces
    cleaned_text = ' '.join(cleaned_text.split())

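    # html.escape with quote=True covers all five XML special characters: & < > " '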
    escaped_text = html.escape(cleaned_text, quote=True)

    return escaped_text


def escape_product_fields(product_data):
    """
    Escape XML special characters in specified fields before XML writing
    """
    if not isinstance(product_data, dict):
        return product_data

    # Fields that need XML escaping (URLs may contain '&', which is invalid in raw XML)
    fields_to_escape = ['Title', 'Tags', 'Description', 'Category', 'Availability', 'URL', 'MainImg']

    escaped_data = product_data.copy()

    for field in fields_to_escape:
        if field in escaped_data:
            escaped_data[field] = escape_xml_content(escaped_data[field])
            if escaped_data[field] != product_data[field]:
                logging.debug(f"Escaped {field}: '{product_data[field]}' -> '{escaped_data[field]}'")

    return escaped_data


def fetch_with_retry(session, url, url_type="page"):
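    """
    GET a URL with up to MAX_RETRIES attempts, sleeping TIME_DELAY seconds between them
    Returns the response, or None if every attempt fails
    """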
    for attempt in range(MAX_RETRIES):
        try:
            logging.info(f"Fetching {url_type}: {url} (attempt {attempt + 1}/{MAX_RETRIES})")
            response = session.get(url, timeout=30)
            response.raise_for_status()
            return response
        except Exception as e:
            logging.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(TIME_DELAY)
            else:
                logging.error(f"All attempts failed for {url}")
                return None


def get_stock_status(session, base_url, translator):
    """
    Fetch the product page and extract stock status text
    Returns translated availability text
    """
    try:
        stock_response = fetch_with_retry(session, base_url, "Stock Check Page")
        if not stock_response:
            logging.error(f"Failed to fetch stock page for {base_url}")
            return "Out of Stock"

        soup = BeautifulSoup(stock_response.text, 'html.parser')
        span_element = soup.find('span', class_='js-incoming-text')

        if span_element:
            stock_text = span_element.get_text(strip=True)
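            # 'Pedido pendiente, envío pronto' means 'Pending order, shipping soon'; treat it as out of stock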
            if stock_text == 'Pedido pendiente, envío pronto':
                availability = 'Out of Stock'
            else:
                try:
                    availability = translator.translate(stock_text)
                    logging.info(f"Translated stock status: '{stock_text}' -> '{availability}'")
                except Exception as e:
                    logging.error(f"Translation failed for stock text '{stock_text}': {str(e)}")
                    availability = "Out of Stock"
        else:
            availability = "Out of Stock"
            logging.warning(f"Stock text element not found for {base_url}")

        return availability

    except Exception as e:
        logging.error(f"Error getting stock status for {base_url}: {str(e)}")
        return "Out of Stock"


def extract_json_data(json_data, base_url):
    """
    Extract product data from JSON response
    """
    try:
        product = json_data.get('product', {})

        # Extract basic info
        category = product.get('product_type', '')
        title = product.get('title', '')
        tags = product.get('tags', '')
        product_id = product.get('id', '')
        body_html = product.get('body_html', '')

        # Remove 'quote' from tags if present
        if tags and 'quote' in tags:
            tags = tags.replace('quote', '').replace(', ,', ',').strip(', ')
        # Normalize the Swedish 'Träningsbollar shop' tag to English
        if 'Träningsbollar shop' in tags:
            tags = tags.replace('Träningsbollar shop', 'Training Balls').strip()

        # Extract variants and options
        variants = product.get('variants', [])
        options = product.get('options', [])

        # Tag specific training-ball SKUs
        if variants:
            sku = variants[0].get('sku', '')
            if sku in ('A-D55', 'N85'):
                tags = f'{tags}, Training Balls' if tags else 'Training Balls'

        # Extract images
        images = product.get('images', [])
        main_img = ''
        other_images = []

        for img in images:
            img_src = img.get('src', '')
            # Remove the Shopify parameters and use clean URL
            if '?' in img_src:
                img_src = img_src.split('?')[0]

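            # Shopify marks the featured image with position == 1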
            if img.get('position') == 1:
                main_img = img_src
            else:
                other_images.append(img_src)

        result = {
            'category': category,
            'title': title,
            'tags': tags,
            'id': product_id,
            'url': base_url,
            'main_img': main_img,
            'other_images': other_images,
            'body_html': body_html,
            'variants': variants,
            'options': options
        }

        return result
    except Exception as e:
        logging.error(f"Error extracting JSON data: {str(e)}")
        return None


def process_product(session, link, translator, in_stock):
    """
    Process a single product by fetching JSON data
    """
    try:
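        # Shopify serves a JSON representation of each product at <product URL>.json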
        base_url = f"https://www.fitnesstech.es{link}"
        json_url = f"{base_url}.json"

        # Fetch JSON data
        json_response = fetch_with_retry(session, json_url, "JSON")
        if not json_response:
            logging.error(f"Failed to fetch JSON for {link}")
            return None

        try:
            json_data = json_response.json()
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse JSON for {link}: {str(e)}")
            return None

        # Extract JSON data
        json_info = extract_json_data(json_data, base_url)
        if not json_info:
            logging.error(f"Failed to extract JSON data for {link}")
            return None

        # Handle availability - check if product is in stock
        if in_stock:
            availability = "In Stock"
        else:
            # Add delay before fetching HTML page for stock status
            time.sleep(TIME_DELAY)
            availability = get_stock_status(session, base_url, translator)

        # Build product data
        product_data = {
            'Category': json_info['category'],
            'Title': json_info['title'],
            'Tags': json_info['tags'],
            'ID': json_info['id'],
            'URL': json_info['url'],
            'Availability': availability,
            'MainImg': json_info['main_img'],
            'Description': json_info['body_html']
        }

        # Add other images
        for i, img_url in enumerate(json_info['other_images']):
            product_data[f'Image{i + 1}'] = img_url

        # Handle variants
        variants = json_info['variants']
        options = json_info['options']

        if len(variants) == 1:
            # No variants - single product
            product_data['SKU'] = variants[0].get('sku', '')
            product_data['Price'] = variants[0].get('price', '')
        else:
            # Multiple variants
            var_name = ''
            if options and len(options) > 0:
                var_name = options[0].get('name', '')

            product_data['VariationName'] = var_name
            product_data['Variants'] = []

            for variant in variants:
                var_data = {
                    'sku': variant.get('sku', ''),
                    'title': variant.get('title', ''),
                    'price': variant.get('price', '')
                }
                product_data['Variants'].append(var_data)

        logging.info(
            f"Successfully processed product: {json_info['title']} (ID: {json_info['id']}, Availability: {availability})\n")
        return product_data

    except Exception as e:
        logging.error(f"Error processing product {link}: {str(e)}\n")
        return None


def write_xml_product(file_handle, product_data):
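    """
    Write a single product as a <product> element to the open XML file handle
    """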
    try:
        # ESCAPE XML SPECIAL CHARACTERS BEFORE WRITING
        escaped_product = escape_product_fields(product_data)

        file_handle.write('<product>\n')

        # Write basic fields in the correct order (including Availability)
        basic_fields = ['Category', 'Title', 'Tags', 'ID', 'Availability', 'URL', 'MainImg']
        for field in basic_fields:
            if field in escaped_product and escaped_product[field]:
                file_handle.write(f'<{field}>{escaped_product[field]}</{field}>\n')

        # Write additional images (escaped, since URLs may contain characters invalid in raw XML)
        image_count = 1
        while f'Image{image_count}' in escaped_product:
            img_url = escape_xml_content(escaped_product[f'Image{image_count}'])
            file_handle.write(f'<Image{image_count}>{img_url}</Image{image_count}>\n')
            image_count += 1

        # Write description (escaped text)
        if 'Description' in escaped_product and escaped_product['Description']:
            file_handle.write(f'<Description>{escaped_product["Description"]}</Description>\n')

        # Write variations if present
        if 'VariationName' in escaped_product and 'Variants' in product_data:
            variation_name = escaped_product['VariationName']
            file_handle.write('<Variation>\n')
            file_handle.write(f'<var_name>{variation_name}</var_name>\n')

            # Write variants
            for variant in product_data['Variants']:
                file_handle.write('<variant>\n')
                file_handle.write(f'<sku>{escape_xml_content(variant.get("sku", ""))}</sku>\n')
                file_handle.write(f'<title>{escape_xml_content(variant.get("title", ""))}</title>\n')
                file_handle.write(f'<price>{escape_xml_content(variant.get("price", ""))}</price>\n')
                file_handle.write('</variant>\n')

            file_handle.write('</Variation>\n')
        elif 'SKU' in escaped_product:
            # Single product with SKU
            file_handle.write(f'<SKU>{escaped_product["SKU"]}</SKU>\n')
            if 'Price' in escaped_product:
                file_handle.write(f'<Price>{escaped_product["Price"]}</Price>\n')

        file_handle.write('</product>\n')

    except Exception as e:
        logging.error(f"Error writing XML for product: {str(e)}")


def save_to_excel(products_df, filename="product_details.xlsx"):
    """
    Save products dataframe to Excel file
    """
    try:
        products_df.to_excel(filename, index=False)
        logging.info(f"Product details saved to {filename}")
        return True
    except Exception as e:
        logging.error(f"Error saving to Excel: {str(e)}")
        return False


def main():
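    """
    Read product links from products.xlsx, scrape each product's details,
    and write the results to products.xml and product_details.xlsx
    """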
    logging.info("Product details scraper started")
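    # Timestamp the XML output in Athens local time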
    greek_tz = ZoneInfo('Europe/Athens')
    now = datetime.now(greek_tz)
    formatted_date = now.strftime("%Y-%m-%d %H:%M")

    # Create a reusable session that impersonates Chrome's TLS fingerprint
    session = requests.Session(impersonate='chrome')

    # Initialize translator (Spanish to English)
    translator = GoogleTranslator(source='es', target='en')

    try:
        # Read product links from the existing Excel file
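        # products.xlsx is produced by the listing scraper and must contain 'Link' and 'In Stock' columns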
        try:
            products_df = pd.read_excel("products.xlsx")
            if 'Link' not in products_df.columns:
                logging.error("'Link' column not found in products.xlsx")
                return

            if 'In Stock' not in products_df.columns:
                logging.error("'In Stock' column not found in products.xlsx")
                return

            links = products_df['Link'].tolist()
            in_stock_list = products_df['In Stock'].tolist()

        except FileNotFoundError:
            logging.error("products.xlsx file not found. Please run the product listing scraper first.")
            return
        except Exception as e:
            logging.error(f"Error reading products.xlsx: {str(e)}")
            return

        # Initialize data storage
        all_product_data = []

        # Open XML file for writing
        with open("products.xml", "w", encoding="utf-8") as xml_file:
            xml_file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            xml_file.write('<products>\n')
            xml_file.write(f"<created_at>{formatted_date}</created_at>\n")

            processed_count = 0
            for i, (link, in_stock) in enumerate(zip(links, in_stock_list), 1):

                if DEBUG and processed_count >= DEBUG_LIMIT:
                    logging.info(f"DEBUG: Reached limit of {DEBUG_LIMIT} products")
                    break

                logging.info(f"Processing product {i}/{len(links)}: {link} (In Stock: {in_stock})")

                # Pass translator and in_stock to process_product
                product_data = process_product(session, link, translator, in_stock)

                if product_data:
                    write_xml_product(xml_file, product_data)
                    all_product_data.append(product_data)
                    processed_count += 1
                else:
                    logging.warning(f"Failed to process product {i}: {link}\n")

                # Add delay between products
                if i < len(links):  # Don't sleep after the last product
                    time.sleep(TIME_DELAY)

            xml_file.write('</products>\n')

        # Save to Excel
        if all_product_data:
            # Flatten the data for Excel export
            excel_data = []
            for product in all_product_data:
                row = {
                    'Category': product.get('Category', ''),
                    'Title': product.get('Title', ''),
                    'Tags': product.get('Tags', ''),
                    'ID': product.get('ID', ''),
                    'URL': product.get('URL', ''),
                    'Availability': product.get('Availability', ''),
                    'MainImg': product.get('MainImg', ''),
                    'Description': product.get('Description', '')
                }

                # Add images
                image_count = 1
                while f'Image{image_count}' in product:
                    row[f'Image{image_count}'] = product[f'Image{image_count}']
                    image_count += 1

                # Add variant info
                if 'SKU' in product:
                    row['SKU'] = product['SKU']
                    row['Price'] = product.get('Price', '')
                elif 'Variants' in product:
                    row['VariationName'] = product.get('VariationName', '')
                    # Store variants as JSON string for Excel
                    row['Variants'] = str(product['Variants'])

                excel_data.append(row)

            excel_df = pd.DataFrame(excel_data)

            if save_to_excel(excel_df):
                logging.info("Excel file saved successfully")

        # Final summary
        logging.info(
            f"Processing completed. Successfully processed {len(all_product_data)} out of {len(links)} products\n\n")

    except KeyboardInterrupt:
        logging.info("Processing interrupted by user\n\n")
    except Exception as e:
        logging.error(f"Fatal error: {str(e)}\n\n")
    finally:
        # Close the session
        session.close()


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logging.error(f"Fatal error: {str(e)}")
        sys.exit(1)