import copy
import html
import json
import logging
import re
import sys
import time
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from curl_cffi import requests

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler("gymleco/product_details_scraper.log", mode='w', encoding='utf-8'),
                        logging.StreamHandler(sys.stdout)]
                    )

MAX_RETRIES = 3   # attempts per URL before giving up
TIME_DELAY = 2    # seconds to pause between requests and between retries

DEBUG = False     # when True, stop after DEBUG_LIMIT products
DEBUG_LIMIT = 5


def escape_xml_content(text):
    """
    Escape special XML characters in text content
    Handles None values and ensures string output
    """
    if text is None:
        return ""

    # Convert to string and strip whitespace
    text = str(text).strip()

    # Remove or replace problematic characters
    # Remove control characters (except tab, newline, carriage return)
    cleaned_text = ''.join(char for char in text if ord(char) >= 32 or char in '\t\n\r')

    # Replace multiple spaces with single space
    cleaned_text = ' '.join(cleaned_text.split())

    escaped_text = html.escape(cleaned_text, quote=True)

    return escaped_text


def escape_product_fields(product_data):
    """
    Escape XML special characters in specified fields before XML writing
    """
    if not isinstance(product_data, dict):
        return product_data

    # Fields that need XML escaping when written as plain element text
    fields_to_escape = ['Title', 'Tags', 'Description', 'Category', 'VariationName', 'VariationOptions']

    escaped_data = product_data.copy()

    for field in fields_to_escape:
        if field in escaped_data:
            escaped_data[field] = escape_xml_content(escaped_data[field])
            if escaped_data[field] != product_data[field]:
                logging.debug(f"Escaped {field}: '{product_data[field]}' -> '{escaped_data[field]}'")

    return escaped_data


def fetch_with_retry(session, url, url_type="page"):
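    """
    Fetch a URL with up to MAX_RETRIES attempts, waiting TIME_DELAY seconds between retries.
    Returns the response object on success, or None if every attempt fails.
    """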
    for attempt in range(MAX_RETRIES):
        try:
            logging.info(f"Fetching {url_type}: {url} (attempt {attempt + 1}/{MAX_RETRIES})")
            response = session.get(url, timeout=30)
            response.raise_for_status()
            return response
        except Exception as e:
            logging.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(TIME_DELAY)
            else:
                logging.error(f"All attempts failed for {url}")
                return None


def extract_json_data(json_data):
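    """
    Extract category, title, tags, SKU, images and variation data from a
    Shopify product JSON payload. Returns a dict of fields, or None on error.
    """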
    try:
        product = json_data.get('product', {})

        # Extract basic info
        category = product.get('product_type', '')
        title = product.get('title', '')
        tags = product.get('tags', '')

        # Remove 'quote' from tags if present
        if tags and 'quote' in tags:
            tags = tags.replace('quote', '').replace(', ,', ',').strip(', ')
        if 'Träningsbollar shop' in tags:  # Swedish: 'Träningsbollar' = training balls
            tags = tags.replace('Träningsbollar shop', 'Training Balls').strip()

        # Extract SKU from variants
        sku = ''
        variants = product.get('variants', [])
        if variants:
            sku = variants[0].get('sku', '')

        # Add the 'Training Balls' tag for specific SKUs, avoiding duplicates and a leading comma
        if sku in ('A-D55', 'N85') and 'Training Balls' not in tags:
            tags = f"{tags}, Training Balls" if tags else 'Training Balls'

        # Extract variations
        variation_name = ''
        variation_options = ''
        options = product.get('options', [])

        if options:
            first_option = options[0]
            option_name = first_option.get('name', '')
            if option_name == 'Välj färg':  # Swedish: "Choose colour"; colour variations are intentionally skipped
                pass

            elif option_name.lower() != 'title':
                values = first_option.get('values', [])
                if values:
                    # Check if any value contains 'kg' to determine if it's weights
                    if any('kg' in str(value).lower() for value in values):
                        variation_name = 'Weights'
                    # Check if any value contains 'cm' to determine if it's sizes
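                    # Swedish keywords below: 'storlek'/'storlekar' = size/sizes, 'välj modell' = choose model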
                    elif (any('cm' in str(value).lower() for value in values) or
                          any(any(size in f' {str(value)} ' for size in
                                  [' S ', ' M ', ' L ', ' XL ', ' XXL ', ' Small ', ' Medium ', ' Large ']) for value in
                              values) or
                          'storlek' in option_name.lower() or
                          'storlekar' in option_name.lower() or
                          'välj modell' in option_name.lower()):
                        variation_name = 'Sizes'
                    elif 'poster' in option_name.lower():
                        variation_name = 'Posters'
                    else:
                        # Convert option name to underscore format
                        # Replace spaces and special characters with underscores
                        variation_name = option_name.replace(' ', '_').replace('-', '_').replace('.', '_')
                        # Remove any double underscores
                        while '__' in variation_name:
                            variation_name = variation_name.replace('__', '_')
                        # Remove leading/trailing underscores
                        variation_name = variation_name.strip('_')

                    # Join all values with |
                    variation_options = ' | '.join(str(value) for value in values)

        # Extract images
        images = product.get('images', [])
        main_img = ''
        other_images = []

        for img in images:
            img_src = img.get('src', '')
            # Remove the Shopify parameters and use clean URL
            if '?' in img_src:
                img_src = img_src.split('?')[0]

            if img.get('position') == 1:
                main_img = img_src
            else:
                other_images.append(img_src)

        result = {
            'category': category,
            'title': title,
            'tags': tags,
            'sku': sku,
            'main_img': main_img,
            'other_images': other_images
        }

        # Add variation data if present
        if variation_name and variation_options:
            result['variation_name'] = variation_name
            result['variation_options'] = variation_options

        return result
    except Exception as e:
        logging.error(f"Error extracting JSON data: {str(e)}")
        return None


def extract_video_url(product_column):
    """
    Extract YouTube video URL from the entire product column
    """
    try:
        # Look for span with youtube-video class anywhere in the product column
        youtube_spans = product_column.find_all('span', class_='youtube-video')

        for span in youtube_spans:
            iframe = span.find('iframe')
            if iframe:
                src = iframe.get('src', '')
                if 'youtube.com/embed/' in src:
                    video_id = src.split('/embed/')[1].split('?')[0]
                    watch_url = f"https://www.youtube.com/watch?v={video_id}"
                    return watch_url

        # Look for any iframe elements in the product column
        iframes = product_column.find_all('iframe')
        logging.info(f"Found {len(iframes)} iframe(s) in product column")

        for iframe in iframes:
            src = iframe.get('src', '')

            if 'youtube.com/embed/' in src:
                video_id = src.split('/embed/')[1].split('?')[0]
                watch_url = f"https://www.youtube.com/watch?v={video_id}"
                logging.info(f"Converted embed to watch URL: {watch_url}")
                return watch_url
            elif 'youtube.com/watch' in src:
                logging.info(f"Found direct watch URL: {src}")
                return src

        # Search for any YouTube URLs in the HTML content of the entire product column
        html_content = str(product_column)
        youtube_patterns = [
            r'https?://(?:www\.)?youtube\.com/embed/([a-zA-Z0-9_-]+)',
            r'https?://(?:www\.)?youtube\.com/watch\?v=([a-zA-Z0-9_-]+)',
            r'https?://(?:www\.)?youtu\.be/([a-zA-Z0-9_-]+)'
        ]

        for pattern in youtube_patterns:
            matches = re.findall(pattern, html_content)
            if matches:
                video_id = matches[0]
                watch_url = f"https://www.youtube.com/watch?v={video_id}"
                logging.info(f"Found YouTube URL via regex: {watch_url}")
                return watch_url

        logging.info("No video URL found in product column")
        return ''
    except Exception as e:
        logging.error(f"Error extracting video URL: {str(e)}")
        return ''


def extract_page_data(soup, link):
    """
    Extract data from product page HTML
    """
    try:
        # Find the main product column
        product_column = soup.find('div', class_='detail product-column-right')
        if not product_column:
            logging.warning(f"Product column not found for {link}")
            return None

        # Extract description
        description = ''
        description_div = product_column.find('div', class_='product-description rte cf')
        if description_div:
            # Get text content, removing video elements
            desc_copy = copy.copy(description_div)
            # Remove video spans for description text
            for video_span in desc_copy.find_all('span', class_='youtube-video'):
                video_span.decompose()
            description = desc_copy.get_text(strip=True)

        # Extract video URL - search in the entire product column, not just description
        video_url = extract_video_url(product_column)

        # Extract accordion sections
        specifications = ''
        more_info = ''
        features = ''

        accordion_sections = product_column.find_all('div', class_='product-info-accordion not-in-quickbuy')

        for section in accordion_sections:
            try:
                # Find the title
                title_element = section.find('summary', class_='disclosure__title')
                if not title_element:
                    continue

                title_text = title_element.get_text().lower().strip()

                # Find the content
                content_div = section.find('div', class_='metafield-rich_text_field')
                if not content_div:
                    continue

                content_html = str(content_div)
                # Remove the outer div tags to get inner HTML
                content_html = re.sub(r'^<div[^>]*>', '', content_html)
                content_html = re.sub(r'</div>$', '', content_html)

                # Categorize based on title
                if 'specification' in title_text:
                    specifications = content_html
                elif 'more about' in title_text:
                    more_info = content_html
                elif any(keyword in title_text for keyword in ['feature', 'detail', 'information']):
                    features = content_html
                else:
                    # If we can't categorize, put it in features as fallback
                    if not features:
                        features = content_html

            except Exception as e:
                logging.error(f"Error processing accordion section: {str(e)}")
                continue

        return {
            'description': description,
            'video_url': video_url,
            'specifications': specifications,
            'more_info': more_info,
            'features': features
        }

    except Exception as e:
        logging.error(f"Error extracting page data for {link}: {str(e)}")
        return None


def process_product(session, link):
    """
    Process a single product by fetching both JSON and HTML data
    """
    try:
        base_url = f"https://gymleco.com{link}"
        json_url = f"{base_url}.json"

        # Fetch JSON data
        json_response = fetch_with_retry(session, json_url, "JSON")
        if not json_response:
            logging.error(f"Failed to fetch JSON for {link}")
            return None

        try:
            json_data = json_response.json()
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse JSON for {link}: {str(e)}")
            return None

        # Extract JSON data
        json_info = extract_json_data(json_data)
        if not json_info:
            logging.error(f"Failed to extract JSON data for {link}")
            return None

        # Add delay between requests
        time.sleep(TIME_DELAY)

        # Fetch HTML page
        page_response = fetch_with_retry(session, base_url, "HTML page")
        if not page_response:
            logging.error(f"Failed to fetch HTML page for {link}")
            return None

        # Parse HTML
        soup = BeautifulSoup(page_response.content.decode('utf-8', 'ignore'), 'html.parser')

        # Extract page data
        page_info = extract_page_data(soup, link)
        if not page_info:
            logging.error(f"Failed to extract page data for {link}")
            return None

        # Combine all data
        product_data = {
            'Title': json_info['title'],
            'Category': json_info['category'],
            'Tags': json_info['tags'],
            'SKU': json_info['sku'],
            'URL': base_url,
            'MainImg': json_info['main_img'],
            'Video': page_info['video_url'],
            'Description': page_info['description'],
            'Specifications': page_info['specifications'],
            'MoreInfo': page_info['more_info'],
            'Features': page_info['features']
        }

        # Add variations if present
        if 'variation_name' in json_info and 'variation_options' in json_info:
            product_data['VariationName'] = json_info['variation_name']
            product_data['VariationOptions'] = json_info['variation_options']

        # Add other images
        for i, img_url in enumerate(json_info['other_images']):
            product_data[f'Image{i + 1}'] = img_url

        logging.info(f"Successfully processed product: {json_info['title']} (SKU: {json_info['sku']})\n")
        return product_data

    except Exception as e:
        logging.error(f"Error processing product {link}: {str(e)}\n")
        return None


def write_xml_product(file_handle, product_data):
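    """
    Write a single <product> element to the open XML file handle, escaping
    plain-text fields and wrapping the HTML fields in CDATA sections.
    """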
    try:
        # Escape XML special characters before writing
        escaped_product = escape_product_fields(product_data)

        file_handle.write('<product>\n')

        # Write basic fields in the correct order
        basic_fields = ['Category', 'Title', 'Tags', 'SKU', 'Video', 'URL', 'MainImg']
        for field in basic_fields:
            if field in escaped_product and escaped_product[field]:
                file_handle.write(f'<{field}>{escaped_product[field]}</{field}>\n')

        # Write additional images
        image_count = 1
        while f'Image{image_count}' in escaped_product:
            file_handle.write(f'<Image{image_count}>{escaped_product[f"Image{image_count}"]}</Image{image_count}>\n')
            image_count += 1

        # Write description (escaped text)
        if 'Description' in escaped_product and escaped_product['Description']:
            file_handle.write(f'<Description>{escaped_product["Description"]}</Description>\n')

        # Write HTML fields wrapped in CDATA (raw HTML, so no entity escaping needed)
        html_fields = ['Specifications', 'MoreInfo', 'Features']
        for field in html_fields:
            if field in product_data and product_data[field]:  # Use original data for CDATA
                # Split any literal ']]>' so it cannot terminate the CDATA section early
                safe_html = product_data[field].replace(']]>', ']]]]><![CDATA[>')
                file_handle.write(f'<{field}><![CDATA[\n{safe_html}\n]]></{field}>\n')

        # Write variations if present
        if 'VariationName' in escaped_product and 'VariationOptions' in escaped_product:
            variation_name = escaped_product['VariationName']
            variation_options = escaped_product['VariationOptions']
            file_handle.write('<Variation>\n')
            file_handle.write(f'<var_name>{variation_name}</var_name>\n')
            file_handle.write(f'<var_values>{variation_options}</var_values>\n')
            file_handle.write('</Variation>\n')

        file_handle.write('</product>\n')

    except Exception as e:
        logging.error(f"Error writing XML for product: {str(e)}")


def save_to_excel(products_df, filename="gymleco/product_details.xlsx"):
    """
    Save products dataframe to Excel file
    """
    try:
        products_df.to_excel(filename, index=False)
        logging.info(f"Product details saved to {filename}")
        return True
    except Exception as e:
        logging.error(f"Error saving to Excel: {str(e)}")
        return False


def main():
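    """
    Read product links from gymleco/products.xlsx, scrape each product's JSON and
    HTML page, and write the results to gymleco/products.xml and an Excel summary.
    """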
    logging.info("Product details scraper started")
    now = datetime.now()
    formatted_date = now.strftime("%Y-%m-%d %H:%M")

    # Create a session for reusing connections
    session = requests.Session(impersonate='chrome')

    try:
        # Read product links from the existing Excel file
        try:
            products_df = pd.read_excel("gymleco/products.xlsx")
            if 'Link' not in products_df.columns:
                logging.error("'Link' column not found in products.xlsx")
                return

            links = products_df['Link'].tolist()

        except FileNotFoundError:
            logging.error("products.xlsx file not found. Please run the product listing scraper first.")
            return
        except Exception as e:
            logging.error(f"Error reading products.xlsx: {str(e)}")
            return

        # Initialize data storage
        all_product_data = []

        # Open XML file for writing
        with open("gymleco/products.xml", "w", encoding="utf-8") as xml_file:
            xml_file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            xml_file.write('<products>\n')
            xml_file.write(f"<created_at>{formatted_date}</created_at>\n")

            processed_count = 0
            for i, link in enumerate(links, 1):

                if DEBUG and processed_count >= DEBUG_LIMIT:
                    logging.info(f"DEBUG: Reached limit of {DEBUG_LIMIT} products")
                    break

                logging.info(f"Processing product {i}/{len(links)}: {link}")

                product_data = process_product(session, link)
                if product_data:
                    write_xml_product(xml_file, product_data)
                    all_product_data.append(product_data)
                    processed_count += 1
                else:
                    logging.warning(f"Failed to process product {i}: {link}\n")

                # Add delay between products
                if i < len(links):  # Don't sleep after the last product
                    time.sleep(TIME_DELAY)

            xml_file.write('</products>\n')

        # Save to Excel
        if all_product_data:
            # Create DataFrame with all possible columns
            excel_df = pd.DataFrame(all_product_data)

            # Reorder columns for better readability
            column_order = ['Title', 'Category', 'Tags', 'SKU', 'URL', 'Video', 'MainImg', 'Description',
                            'Specifications', 'MoreInfo', 'Features']

            # Add variation columns if they exist
            if 'VariationName' in excel_df.columns:
                column_order.extend(['VariationName', 'VariationOptions'])

            # Add image columns
            image_cols = [col for col in excel_df.columns if col.startswith('Image')]
            image_cols.sort(key=lambda x: int(x.replace('Image', '')))
            column_order.extend(image_cols)

            # Reorder DataFrame
            available_cols = [col for col in column_order if col in excel_df.columns]
            remaining_cols = [col for col in excel_df.columns if col not in available_cols]
            final_column_order = available_cols + remaining_cols

            excel_df = excel_df[final_column_order]

            if save_to_excel(excel_df):
                logging.info("Excel file saved successfully")

        # Final summary
        logging.info(
            f"Processing completed. Successfully processed {len(all_product_data)} out of {len(links)} products\n\n")

    except KeyboardInterrupt:
        logging.info("Processing interrupted by user\n\n")
    except Exception as e:
        logging.error(f"Fatal error: {str(e)}\n\n")
    finally:
        # Close the session
        session.close()


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logging.error(f"Fatal error: {str(e)}")
        sys.exit(1)
