import requests
import os
import pdfkit
from urllib.parse import urlparse, parse_qs, unquote
import re
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import random
import subprocess


def _extract_filename_from_url(url, response=None):
    """
    Extract filename from URL, query parameters, or response headers.
    Tries multiple strategies to get the correct filename with extension.

    Args:
        url (str): The URL to extract filename from
        response (requests.Response, optional): HTTP response object to check headers

    Returns:
        tuple: (filename, extension) or (None, None) if unable to determine
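
    Example (illustrative):
        _extract_filename_from_url('https://example.com/docs/report.pdf')
        returns ('report.pdf', '.pdf') via the URL-path strategy.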
    """
    filename = None
    extension = None

    # Strategy 1: Check Content-Disposition header if response is provided
    if response:
        content_disposition = response.headers.get('content-disposition', '')
        if content_disposition:
            # Parse Content-Disposition for a filename, e.g.
            # 'attachment; filename="document.docx"' or the RFC 5987 form
            # "filename*=UTF-8''document.docx"
            match = re.search(r"filename\*=[^']*'[^']*'([^;]+)", content_disposition, re.IGNORECASE)
            if not match:
                match = re.search(r'filename=["\']?([^"\';]+)["\']?', content_disposition, re.IGNORECASE)
            if match:
                filename = unquote(match.group(1).strip('"\' '))
                if '.' in filename:
                    extension = os.path.splitext(filename)[1].lower()
                    print(f"Extracted filename from Content-Disposition: {filename}")
                    return filename, extension

    # Strategy 2: Parse query parameters for filename hints
    # Common patterns: fileDownloadPath, filename, file, document
    parsed_url = urlparse(url)
    if parsed_url.query:
        query_params = parse_qs(parsed_url.query)

        # Check common query parameter names
        for param_name in ['fileDownloadPath', 'filename', 'file', 'document', 'path']:
            if param_name in query_params:
                param_value = query_params[param_name][0]
                # Extract just the filename from full paths
                potential_filename = os.path.basename(unquote(param_value))
                if potential_filename and '.' in potential_filename:
                    filename = potential_filename
                    extension = os.path.splitext(filename)[1].lower()
                    print(f"Extracted filename from query parameter '{param_name}': {filename}")
                    return filename, extension

    # Strategy 3: Extract from URL path
    path_filename = os.path.basename(unquote(parsed_url.path))
    if path_filename and '.' in path_filename:
        filename = path_filename
        extension = os.path.splitext(filename)[1].lower()
        print(f"Extracted filename from URL path: {filename}")
        return filename, extension

    return None, None


def _get_extension_from_content_type(content_type):
    """
    Map Content-Type header to file extension.

    Args:
        content_type (str): Content-Type header value

    Returns:
        str: File extension with dot (e.g., '.docx') or None
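
    Example (illustrative):
        _get_extension_from_content_type('application/pdf; charset=binary')
        returns '.pdf'; unmapped content types return None.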
    """
    content_type = content_type.lower().split(';')[0].strip()

    content_type_map = {
        'application/pdf': '.pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        'application/msword': '.doc',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
        'application/vnd.ms-excel': '.xls',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
        'application/vnd.ms-powerpoint': '.ppt',
        'text/plain': '.txt',
        'text/csv': '.csv',
    }

    return content_type_map.get(content_type)


def download_file_main(url, save_dir='downloads', convert_webpage_to_pdf=True, max_retries=3, retry_delay=2):
    """
    Download a file from URL or convert webpage to PDF with retry mechanism.
    Windows-compatible version with wkhtmltopdf path configuration.
    
    Args:
        url (str): URL to download from
        save_dir (str): Directory to save files
        convert_webpage_to_pdf (bool): Whether to convert webpages to PDF
        max_retries (int): Maximum number of retry attempts
        retry_delay (int): Base delay between retries in seconds (uses exponential backoff)
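
    Returns:
        str: Path to the saved file or generated PDF, or None on failure.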
    """
    
    # Skip if not a web URL
    if not url.startswith(('http://', 'https://', 'www.')):
        print(f"Skipping non-web URL: {url}")
        return

    # Skip known login-gated URLs
    normalized_url = url.strip().lower()
    if (normalized_url == 'https://sso.gem.gov.in/arxsso/oauth/dologin'
            or normalized_url.startswith('https://bidplus.gem.gov.in/bidding/downloadomppdfile/')):
        print(f"Skipping login URL: {url}")
        return
    
    # Add https:// if URL starts with www.
    if url.startswith('www.'):
        url = 'https://' + url
    
    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Create a requests session with retry strategy
    session = _create_retry_session(max_retries)
    
    for attempt in range(max_retries + 1):
        try:
            if attempt > 0:
                wait_time = retry_delay * (2 ** (attempt - 1))  # Exponential backoff
                print(f"Retrying in {wait_time} seconds... (Attempt {attempt + 1}/{max_retries + 1})")
                time.sleep(wait_time)
            
            print(f"Accessing: {url}" + (f" (Attempt {attempt + 1})" if attempt > 0 else ""))
            
            # First, make a HEAD request to check content type
            try:
                head_response = session.head(url, allow_redirects=True, timeout=(15, 30))
                content_type = head_response.headers.get('content-type', '').lower()
                head_success = head_response.status_code == 200
            except Exception as e:
                print(f"HEAD request failed: {e}")
                head_success = False
                content_type = ''
            
            # Make a GET request
            response = session.get(url, stream=True, timeout=(15, 30))
            response.raise_for_status()
            
            # Update content type if HEAD failed
            if not head_success:
                content_type = response.headers.get('content-type', '').lower()
            
            # Check if it's a webpage (HTML content). Peek at the first chunk
            # instead of response.text, which would load the entire (possibly
            # large, binary) body into memory on a streamed response.
            first_chunk = next(response.iter_content(chunk_size=512), b'')
            is_webpage = ('text/html' in content_type or
                          'application/xhtml' in content_type or
                          first_chunk.lstrip().lower().startswith((b'<!doctype', b'<html')))
            response.close()
            
            if is_webpage and convert_webpage_to_pdf:
                return _convert_webpage_to_pdf(url, save_dir)
            else:
                return download_file(url, save_dir)
                
        except requests.exceptions.ConnectTimeout as e:
            print(f"Connection timeout on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to connection timeout")
                return None
                
        except requests.exceptions.ReadTimeout as e:
            print(f"Read timeout on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to read timeout")
                return None
                
        except requests.exceptions.ConnectionError as e:
            print(f"Connection error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to connection error")
                return None
                
        except requests.exceptions.HTTPError as e:
            print(f"HTTP error on attempt {attempt + 1}: {e}")
            # Don't retry client errors (auth/not-found); retryable server
            # errors are already handled by the session's Retry adapter
            status = e.response.status_code if e.response is not None else None
            if status in (401, 403, 404):
                print(f"Not retrying for HTTP {status} error")
                return None
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to HTTP error")
                return None
                
        except requests.exceptions.RequestException as e:
            print(f"Request error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to request error")
                return None
                
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to process {url} after {max_retries + 1} attempts due to unexpected error")
                return None
    
    return None

def download_file(url, save_dir='downloads', timeout=30, retries=3):
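    """
    Download a document from a URL using browser-like headers and retry logic.

    Returns:
        str: Path to the downloaded file, or None on failure.
    """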
    # Skip if not a web URL (simple check)
    if not url.startswith(('http://', 'https://', 'www.')):
        return None

    # Prepend a scheme for bare www. URLs so requests can handle them
    if url.startswith('www.'):
        url = 'https://' + url

    # Supported document extensions (used to sanity-check the downloaded file)
    SUPPORTED_EXTENSIONS = ['.pdf', '.xlsx', '.xls', '.doc', '.docx', '.pptx', '.ppt', '.txt', '.csv']

    # Add proper headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
    }

    # Create session to maintain cookies
    session = requests.Session()
    session.headers.update(headers)

    # Try to extract filename from URL first (without making request)
    filename, extension = _extract_filename_from_url(url)

    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    # Temporary placeholder for file_path (will be updated after getting response)
    if filename:
        file_path = os.path.join(save_dir, filename)
    else:
        file_path = None

    # Retry logic
    for attempt in range(retries):
        try:
            print(f"Downloading (attempt {attempt + 1}): {url}")
            
            # Add random delay to avoid rate limiting
            if attempt > 0:
                delay = random.uniform(1, 3)
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)
            
            # Make the request with proper settings
            response = session.get(
                url, 
                stream=True, 
                timeout=timeout,
                allow_redirects=True,
                verify=True  # SSL verification
            )
            response.raise_for_status()

            # If we didn't get filename from URL, try to get it from response headers
            if not filename:
                filename, extension = _extract_filename_from_url(url, response)

            # If still no filename or extension, use Content-Type to determine extension
            if not filename or not extension:
                content_type = response.headers.get('content-type', '')
                detected_extension = _get_extension_from_content_type(content_type)

                if not filename:
                    # Create filename with timestamp and detected extension
                    if detected_extension:
                        filename = f"downloaded_file_{int(time.time())}{detected_extension}"
                    else:
                        filename = f"downloaded_file_{int(time.time())}.pdf"
                elif not extension and detected_extension:
                    # Filename exists but no extension - add detected extension
                    filename = f"{filename}{detected_extension}"
                elif not extension:
                    # No extension and couldn't detect - default to .pdf
                    filename = f"{filename}.pdf"

            # Finalize file path
            file_path = os.path.join(save_dir, filename)

            # Check if we actually got content
            content_length = response.headers.get('content-length')
            if content_length and int(content_length) < 100:  # Suspiciously small file
                print(f"Warning: File seems very small ({content_length} bytes)")

            # Check content type and extension
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' in content_type and not filename.endswith(('.html', '.htm')):
                print("Warning: Received HTML content when expecting a document - might be a block page or login page")
            ext = os.path.splitext(filename)[1].lower()
            if ext and ext not in SUPPORTED_EXTENSIONS:
                print(f"Warning: Unexpected document extension: {ext}")

            # Download the file
            with open(file_path, 'wb') as file:
                downloaded_size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Filter out keep-alive chunks
                        file.write(chunk)
                        downloaded_size += len(chunk)
                
                print(f"Downloaded {downloaded_size} bytes")

            # Verify the file was downloaded properly
            if os.path.getsize(file_path) == 0:
                print("Error: Downloaded file is empty")
                os.remove(file_path)
                if attempt < retries - 1:
                    continue
                return None
            
            print(f"Successfully downloaded to: {file_path}")
            return file_path
            
        except requests.exceptions.Timeout:
            print(f"Timeout error on attempt {attempt + 1}")
        except requests.exceptions.RequestException as e:
            print(f"Request failed on attempt {attempt + 1}: {e}")
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1}: {e}")

        # If this was the last attempt, clean up any partial file
        if attempt == retries - 1 and file_path and os.path.exists(file_path):
            os.remove(file_path)
    
    print(f"Failed to download {url} after {retries} attempts")
    return None

def _create_retry_session(max_retries=3):
    """
    Create a requests session with built-in retry strategy.
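
    Note: these transport-level retries stack with the retry loop in
    download_file_main, so a failing URL may be attempted more than
    max_retries times in total.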
    """
    session = requests.Session()
    
    # Define retry strategy
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,  # Will create delays like: 1, 2, 4, 8 seconds
        status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry
        allowed_methods=["HEAD", "GET", "OPTIONS"]  # HTTP methods to retry
    )
    
    # Mount the adapter with retry strategy
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    # Set common headers to appear more like a real browser
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    })
    
    return session

def _get_wkhtmltopdf_config():
    """
    Get the appropriate wkhtmltopdf configuration for Windows.
    Returns the configuration dict for pdfkit.
    """
    # Common installation paths on Windows
    possible_paths = [
        r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe',
        r'C:\Program Files (x86)\wkhtmltopdf\bin\wkhtmltopdf.exe',
        r'C:\wkhtmltopdf\bin\wkhtmltopdf.exe',
        r'/usr/bin/wkhtmltopdf',  # Linux fallback
        # Add your custom installation path here if different
    ]
    
    # Check if wkhtmltopdf is in PATH first
    try:
        result = subprocess.run(['wkhtmltopdf', '--version'],
                                capture_output=True, text=True, timeout=5)
        if result.returncode == 0:
            print("Found wkhtmltopdf in system PATH")
            return {}  # No config needed if it's in PATH
    except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
        pass
    
    # Check common installation paths
    for path in possible_paths:
        if os.path.exists(path):
            print(f"Found wkhtmltopdf at: {path}")
            return {'wkhtmltopdf': path}
    
    # If not found, let user know
    print("wkhtmltopdf not found in common locations.")
    print("Please either:")
    print("1. Add wkhtmltopdf to your system PATH, or")
    print("2. Update the 'possible_paths' list in the function with your installation path")
    return None

def _convert_webpage_to_pdf(url, save_dir):
    """Convert webpage to PDF using pdfkit with Windows-specific configuration."""
    try:
        # Get wkhtmltopdf configuration
        config = _get_wkhtmltopdf_config()
        if config is None:
            return None
        
        # Create filename from URL
        parsed_url = urlparse(url)
        # Clean the filename by removing special characters
        clean_name = re.sub(r'[^\w\-_]', '_', parsed_url.netloc + parsed_url.path)
        clean_name = re.sub(r'_+', '_', clean_name).strip('_')
        filename = f"webpage_{clean_name}.pdf"
        
        file_path = os.path.join(save_dir, filename)
        
        print(f"Converting webpage to PDF: {filename}")
        
        # Configure pdfkit options for better PDF generation
        options = {
            'page-size': 'A4',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
            'no-outline': None,
            'enable-local-file-access': None,
            'javascript-delay': 3000,  # Wait for JS to load
            'load-error-handling': 'ignore',
            'load-media-error-handling': 'ignore',
            'disable-smart-shrinking': '',  # Prevent content shrinking
            'print-media-type': '',  # Use print CSS
        }
        
        # Convert webpage to PDF; pass an explicit configuration only when a
        # specific executable path was found (an empty config means the
        # executable is on PATH, so pdfkit can discover it itself)
        configuration = (pdfkit.configuration(wkhtmltopdf=config['wkhtmltopdf'])
                         if config else None)
        pdfkit.from_url(url, file_path, options=options, configuration=configuration)
        
        print(f"Webpage converted to PDF: {file_path}")
        return file_path
        
    except Exception as e:
        print(f"Failed to convert webpage to PDF: {e}")
        print("Troubleshooting tips:")
        print("1. Ensure wkhtmltopdf is properly installed")
        print("2. Check if the executable path is correct")
        print("3. Try running 'wkhtmltopdf --version' in command prompt")
        return None
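

if __name__ == '__main__':
    # Minimal usage sketch (illustrative): the URL below is a placeholder,
    # not part of the original module. Downloading assumes network access;
    # webpage conversion additionally requires a local wkhtmltopdf install.
    result = download_file_main('https://example.com/sample.pdf', save_dir='downloads')
    print(f"Result: {result}")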