# location_finder_api.py
"""
Location Finder API for Minaions Tender System
Extracts delivery locations from GeM tender PDFs and updates backend
To be imported and included in main app.py
"""

import os
import json
import logging
import requests
import tempfile
from typing import Dict, List, Optional, Any
from fastapi import APIRouter, HTTPException, BackgroundTasks
from pydantic import BaseModel, Field
import time
from datetime import datetime
from pathlib import Path

# Import the location extraction module
from location_finder_gem import TenderLocationExtractor

# Configure logging at import time and create this module's logger.
# NOTE(review): basicConfig() acts on the root logger and does nothing when the
# root logger already has handlers, so the importing application's own logging
# setup (if it runs first) takes precedence.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Router carrying the endpoints defined at the bottom of this module; every
# route is served under the /api/location-finder prefix once the main app
# includes this router.
router = APIRouter(prefix="/api/location-finder", tags=["Location Finder"])

# Load configuration
def load_config(config_path: str = "config.json") -> Dict:
    """Load configuration from a JSON file.

    Args:
        config_path: Path of the JSON configuration file.

    Returns:
        The parsed configuration when the file exists and its top-level value
        is a JSON object; otherwise an empty dict, so callers can always use
        ``.get`` and fall back to environment variables.
    """
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: file missing/unreadable. ValueError: invalid JSON or bad
        # encoding (JSONDecodeError and UnicodeDecodeError both derive from it).
        logger.warning(f"Failed to load configuration: {e}. Using environment variables.")
        return {}

    if not isinstance(loaded, dict):
        # A list/string/number at the top level would break every
        # ``config.get(...)`` lookup done by the callers.
        logger.warning(
            f"Failed to load configuration: expected a JSON object in {config_path}, "
            f"got {type(loaded).__name__}. Using environment variables."
        )
        return {}

    return loaded

config = load_config()

# Backend URL and internal key: values under "nodejs_service" in the config
# file win; environment variables (and, for the URL, a localhost default) are
# used only when the corresponding key is absent from that section.
_nodejs_service_config = config.get("nodejs_service", {})
BACKEND_API_URL = _nodejs_service_config.get(
    "api_url",
    os.getenv("BACKEND_API_URL", "http://localhost:5000/internal-api"),
)
INTERNAL_API_KEY = _nodejs_service_config.get(
    "api_key",
    os.getenv("INTERNAL_API_KEY"),
)

# =================================================================
# API CLIENT FOR BACKEND COMMUNICATION
# =================================================================

class BackendApiClient:
    """Handles API calls to the Node.js backend.

    Every request carries the ``x-internal-api-key`` header and is bounded by
    ``timeout`` so that an unresponsive backend cannot stall the caller
    indefinitely. Any HTTP status >= 400 is turned into an ``Exception`` whose
    message starts with ``"API Error <status>"``.
    """

    # Default number of seconds to wait for the backend to connect/send data.
    # requests applies this per socket operation, not to the whole transfer,
    # so large downloads are not cut off as long as data keeps arriving.
    DEFAULT_TIMEOUT = 30

    def __init__(self, base_url, api_key, timeout=DEFAULT_TIMEOUT):
        """Create a client.

        Args:
            base_url: Root URL of the internal API; a trailing ``/`` is dropped.
            api_key: Value sent in the ``x-internal-api-key`` header.
            timeout: Seconds passed as ``timeout`` to every ``requests`` call.
        """
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        self.timeout = timeout
        self.headers = {
            'x-internal-api-key': api_key,
            'Content-Type': 'application/json'
        }

    def get_all_tenders(self, limit: int = 1000, skip: int = 0) -> List[Dict]:
        """Get all tenders (paged with ``limit``/``skip``)."""
        return self._list_tenders("/tenders", limit, skip)

    def get_tenders_without_location(self, limit: int = 1000, skip: int = 0) -> List[Dict]:
        """Get tenders that don't have location information."""
        return self._list_tenders("/tenders/without-location", limit, skip)

    def update_tender(self, tender_id: str, update_data: Dict) -> Dict:
        """Update tender with location information.

        Sends ``update_data`` as the JSON body of a PUT request and returns
        the ``data`` member of the JSON response.
        """
        url = f"{self.base_url}/tenders/{tender_id}"
        response = requests.put(
            url, json=update_data, headers=self.headers, timeout=self.timeout
        )
        self._check_response(response)
        return response.json()['data']

    def download_file(self, s3_key: str, tenant_id: str) -> bytes:
        """Download file from S3 via backend API using S3 key."""
        params = {
            'tenant_id': tenant_id,
            'key': s3_key,
            'decrypt': 'true'
        }
        return self._download("/storage/download", params)

    def download_document_by_id(self, document_id: str, tenant_id: str) -> bytes:
        """Download document content via backend API using document ID."""
        return self._download(f"/documents/{document_id}/content", {'tenant_id': tenant_id})

    def _list_tenders(self, path: str, limit: int, skip: int) -> List[Dict]:
        """GET a tender listing endpoint and return the ``data`` member."""
        response = requests.get(
            f"{self.base_url}{path}",
            headers=self.headers,
            params={'limit': limit, 'skip': skip},
            timeout=self.timeout,
        )
        self._check_response(response)
        return response.json()['data']

    def _download(self, path: str, params: Dict) -> bytes:
        """GET a binary endpoint and return the raw response body."""
        response = requests.get(
            f"{self.base_url}{path}",
            # Only the API key is sent: a GET has no body to describe.
            headers={'x-internal-api-key': self.api_key},
            params=params,
            timeout=self.timeout,
        )
        self._check_response(response)
        return response.content

    def _check_response(self, response: "requests.Response") -> None:
        """Check response status and raise exception if error.

        Raises:
            Exception: for any status code >= 400. The message is
                ``"API Error <status>"``, followed by the ``message`` field of
                a JSON-object body when present, or by the raw body text when
                the body is not a JSON object.
        """
        if response.status_code < 400:
            return

        error_msg = f"API Error {response.status_code}"
        try:
            error_data = response.json()
        except ValueError:
            # Body is not JSON (the decode errors raised by requests derive
            # from ValueError); fall back to the raw text.
            error_data = None
            error_msg = f"{error_msg}: {response.text}"
        else:
            if not isinstance(error_data, dict):
                error_msg = f"{error_msg}: {response.text}"
            elif 'message' in error_data:
                error_msg = f"{error_msg}: {error_data['message']}"
        raise Exception(error_msg)

# =================================================================
# REQUEST/RESPONSE MODELS
# =================================================================

# Request body of POST /process-all-tenders. Every field has a default, so an
# empty JSON object is a valid request.
class ProcessAllTendersRequest(BaseModel):
    only_missing: bool = Field(True, description="Only process tenders without location data")
    use_fallback: bool = Field(False, description="Use regex fallback instead of LLM")
    use_anthropic: bool = Field(False, description="Use Anthropic Claude (default: OpenAI)")
    # ge=0: the handler slices the fetched list with ``tenders[:limit]``; a
    # negative value would silently drop tenders from the END of the list
    # instead of capping the batch, so it is rejected at validation time.
    limit: int = Field(100, ge=0, description="Maximum number of tenders to process")

# Response body of POST /process-all-tenders. The field names mirror the keys
# of the summary dict built by process_all_tenders_handler; the endpoint in
# this module itself only returns the initial "processing" acknowledgement
# with all counters at zero and an empty results list.
class ProcessAllTendersResponse(BaseModel):
    status: str  # "processing" from the endpoint; the handler uses "completed"/"error"
    message: str  # human-readable summary
    total_tenders: Optional[int] = 0  # number of tenders selected for the run
    processed: Optional[int] = 0  # number of tenders handed to the per-tender routine
    successful: Optional[int] = 0  # per-tender results with status "success"
    failed: Optional[int] = 0  # per-tender results that are neither "success" nor "skipped"
    skipped: Optional[int] = 0  # per-tender results with status "skipped"
    processing_time: Optional[float] = 0  # wall-clock seconds, measured with time.time()
    results: Optional[List[Dict[str, Any]]] = []  # one result dict per tender

# =================================================================
# HELPER FUNCTIONS
# =================================================================

def get_api_key(use_anthropic: bool = False) -> str:
    """Return the LLM provider API key read from the environment.

    Args:
        use_anthropic: When true, read ``ANTHROPIC_API_KEY``; otherwise read
            ``OPENAI_API_KEY``.

    Raises:
        ValueError: if the selected variable is unset or empty.
    """
    if use_anthropic:
        env_name, provider = "ANTHROPIC_API_KEY", "Anthropic"
    else:
        env_name, provider = "OPENAI_API_KEY", "OpenAI"

    key = os.getenv(env_name)
    if key:
        return key
    raise ValueError(f"{env_name} not found in environment. Set it to use {provider}.")

def extract_pdf_url_from_tender(tender: Dict) -> Optional[str]:
    """
    Extract the FIRST document ID/URL/S3 key from tender data.
    Priority: originalDocuments[0] > documentUrl > s3Key > documents[0]

    A source only wins when it actually yields a non-empty reference; an
    empty/None field, or a first document entry without ``url``/``s3Key``,
    falls through to the next source instead of ending the search.

    Args:
        tender: Tender record as returned by the backend.

    Returns:
        Document ID, S3 key, or URL of the first tender document, or ``None``
        when no source provides one.
    """
    # PRIORITY 1: originalDocuments array - only the FIRST entry is considered
    original_docs = tender.get('originalDocuments')
    if isinstance(original_docs, list) and original_docs:
        first_doc = original_docs[0]
        logger.debug(f"Found {len(original_docs)} documents, using FIRST: {first_doc}")

        if isinstance(first_doc, dict):
            # Document object carrying a url or s3Key
            result = first_doc.get('url') or first_doc.get('s3Key')
            if result:
                logger.info(f"Extracted from originalDocuments[0] (dict): {result}")
                return result
        elif isinstance(first_doc, str) and first_doc:
            # Plain document ID or S3 key string
            logger.info(f"Extracted from originalDocuments[0] (string): {first_doc}")
            return first_doc

    # PRIORITY 2: documentUrl field (fallback)
    document_url = tender.get('documentUrl')
    if document_url:
        logger.info(f"Extracted from documentUrl: {document_url}")
        return document_url

    # PRIORITY 3: s3Key field (fallback)
    s3_key = tender.get('s3Key')
    if s3_key:
        logger.info(f"Extracted from s3Key: {s3_key}")
        return s3_key

    # PRIORITY 4: documents array (fallback) - only the FIRST entry is considered
    documents = tender.get('documents')
    if isinstance(documents, list) and documents:
        first_doc = documents[0]
        logger.debug(f"Found {len(documents)} in documents array, using FIRST: {first_doc}")

        if isinstance(first_doc, dict):
            result = first_doc.get('url') or first_doc.get('s3Key')
            if result:
                logger.info(f"Extracted from documents[0]: {result}")
                return result
        elif isinstance(first_doc, str) and first_doc:
            logger.info(f"Extracted from documents[0]: {first_doc}")
            return first_doc

    logger.warning("No document reference found in tender data")
    return None

def _store_pdf_in_temp_file(file_content: bytes, source: str) -> Optional[str]:
    """Validate downloaded bytes as a PDF and write them to a temp file.

    Args:
        file_content: Raw bytes returned by the backend.
        source: Origin of the bytes as it should appear in log messages,
            e.g. ``"document ID: <id>"``.

    Returns:
        Path of a new ``.pdf`` temporary file holding the content, or ``None``
        when the content is empty or does not start with ``%PDF-``. The file is
        created with ``delete=False``; the caller must remove it.

    Raises:
        Exception: whatever the write raises; the partially written file is
            removed first so it is not leaked.
    """
    if not file_content:
        logger.error(f"Downloaded file is empty for {source}")
        return None

    # PDFs start with %PDF-; anything else is typically an error page.
    if not file_content.startswith(b'%PDF-'):
        logger.error(f"Downloaded file is not a valid PDF (missing PDF header) for {source}")
        logger.error(f"File starts with: {file_content[:20]}")
        return None

    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
    # Remember the path BEFORE writing so a failed write can still be cleaned up.
    temp_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(file_content)
    except Exception:
        try:
            os.unlink(temp_path)
        except OSError:
            # Best effort: the write error raised below is the one that matters.
            pass
        raise

    logger.info(f"Downloaded PDF to: {temp_path} ({len(file_content)} bytes)")
    return temp_path


def download_pdf_by_document_id(api_client: "BackendApiClient", document_id: str, tenant_id: str) -> Optional[str]:
    """Download PDF using document ID and save to temp file, return path.

    Returns ``None`` (after logging the reason) when the download fails, the
    content is empty, or it is not a PDF. The caller owns the returned file
    and must delete it.
    """
    try:
        logger.info(f"Downloading PDF for document ID: {document_id}")
        file_content = api_client.download_document_by_id(document_id, tenant_id)
        return _store_pdf_in_temp_file(file_content, f"document ID: {document_id}")
    except Exception as e:
        # Boundary handler: any download/write problem is reported as "no PDF".
        logger.error(f"Error downloading PDF for document ID {document_id}: {str(e)}")
        return None


def download_pdf_from_s3(api_client: "BackendApiClient", s3_key: str, tenant_id: str) -> Optional[str]:
    """Download PDF from S3 using S3 key and save to temp file, return path.

    Returns ``None`` (after logging the reason) when the download fails, the
    content is empty, or it is not a PDF. The caller owns the returned file
    and must delete it.
    """
    try:
        logger.info(f"Downloading PDF from S3: {s3_key}")
        file_content = api_client.download_file(s3_key, tenant_id)
        return _store_pdf_in_temp_file(file_content, f"S3 key: {s3_key}")
    except Exception as e:
        # Boundary handler: any download/write problem is reported as "no PDF".
        logger.error(f"Error downloading PDF from S3: {str(e)}")
        return None

def _build_location_update(
    location_info: Dict[str, Any],
    action: str,
    details: Dict[str, Any]
) -> Dict[str, Any]:
    """Build the update payload that stores a location and appends a log entry."""
    state = location_info["state"]
    return {
        "$set": {
            "metadata.location_info": location_info,
            "metadata.location": state,
            "location": state
        },
        "$push": {
            "logs": {
                "timestamp": time.time(),
                "action": action,
                "details": details
            }
        }
    }


def _record_other_location(api_client, tender_id: str, reason: str, result: Dict[str, Any]) -> None:
    """Store the placeholder location "Other" and record the outcome in *result*.

    On success ``result`` is marked ``success`` with the "Other" location. On
    failure the status is left untouched and ``result['error']`` states that
    the placeholder could NOT be stored, so the summary never claims an update
    that did not happen.
    """
    update_data = _build_location_update(
        {
            "city": None,
            "state": "Other",
            "fullAddress": None,
            "confidence": "low",
            "extractedAt": datetime.now().isoformat(),
            "extractionMethod": "fallback"
        },
        "location_extraction_failed",
        {"state": "Other", "reason": reason}
    )
    try:
        api_client.update_tender(tender_id, update_data)
    except Exception as e:
        logger.error(f"Failed to update tender with 'Other' location: {str(e)}")
        result['error'] = f'{reason}; failed to set location to "Other": {str(e)}'
        return

    result['status'] = 'success'
    result['location'] = {"state": "Other", "city": None}


def process_single_tender(
    tender: Dict,
    api_client: "BackendApiClient",
    extractor: "TenderLocationExtractor",
    use_fallback: bool = False
) -> Dict[str, Any]:
    """Process a single tender to extract and update location.

    Steps: skip tenders that already have a location, find the tender's first
    document reference, download it as a PDF, run the extractor on it and
    store the result in the backend. Whenever no location can be determined
    (no document, download failure, nothing extracted) the placeholder
    location "Other" is stored instead.

    Args:
        tender: Tender record; ``_id``, ``tenant``, ``bidNumber`` and
            ``deliveryLocation`` are read here.
        api_client: Backend client used for downloads and the update.
        extractor: Object whose ``process_tender_pdf(path, use_fallback=...)``
            returns the extracted location data.
        use_fallback: Forwarded to the extractor; also disables the
            "already has a location" skip.

    Returns:
        Dict with keys ``tender_id``, ``bid_number``, ``tenant_id``,
        ``status`` (``success``/``skipped``/``failed``), ``location`` and
        ``error``. This function does not raise; failures are reported
        through ``status``/``error``.
    """
    tender_id = str(tender.get('_id', ''))
    tenant_id = str(tender.get('tenant', ''))
    bid_number = tender.get('bidNumber', 'Unknown')

    result = {
        'tender_id': tender_id,
        'bid_number': bid_number,
        'tenant_id': tenant_id,
        'status': 'failed',
        'location': None,
        'error': None
    }

    try:
        # NOTE(review): the skip test reads 'deliveryLocation' while the updates
        # below write 'location'/'metadata.location' - confirm the backend keeps
        # these in sync. With use_fallback=True tenders are never skipped.
        if not use_fallback and tender.get('deliveryLocation'):
            result['status'] = 'skipped'
            result['error'] = 'Location already exists'
            return result

        # Extract PDF URL/S3 key/Document ID from tender
        pdf_reference = extract_pdf_url_from_tender(tender)

        if not pdf_reference:
            logger.warning(f"Tender {bid_number} ({tender_id}): No PDF document found, setting location to 'Other'")
            result['error'] = 'No PDF document found in tender, set to "Other"'
            _record_other_location(api_client, tender_id, "No PDF document found", result)
            return result

        # MongoDB ObjectIds are 24 character hex strings; anything else is
        # handed to the S3 download endpoint. (An all-hex string cannot
        # contain '/' or '.', so no separate check is needed for those.)
        is_document_id = (
            isinstance(pdf_reference, str) and
            len(pdf_reference) == 24 and
            all(c in '0123456789abcdefABCDEF' for c in pdf_reference)
        )
        reference_kind = "document ID" if is_document_id else "S3 key"

        # Download PDF using appropriate method
        logger.info(f"Detected {reference_kind}: {pdf_reference}")
        if is_document_id:
            pdf_path = download_pdf_by_document_id(api_client, pdf_reference, tenant_id)
        else:
            pdf_path = download_pdf_from_s3(api_client, pdf_reference, tenant_id)

        if not pdf_path:
            logger.warning(f"Tender {bid_number} ({tender_id}): Failed to download PDF from {reference_kind}: {pdf_reference}, setting location to 'Other'")
            result['error'] = f'Failed to download PDF ({reference_kind}: {pdf_reference}), set to "Other"'
            _record_other_location(api_client, tender_id, "Failed to download PDF", result)
            return result

        try:
            # Extract location from the downloaded PDF
            logger.info(f"Processing tender {bid_number} ({tender_id})")
            location_data = extractor.process_tender_pdf(pdf_path, use_fallback=use_fallback)

            if location_data and location_data.get('state'):
                update_data = _build_location_update(
                    {
                        "city": location_data.get('city'),
                        "state": location_data.get('state'),
                        "fullAddress": location_data.get('full_address'),
                        "confidence": location_data.get('confidence', 'medium'),
                        "extractedAt": datetime.now().isoformat(),
                        "extractionMethod": location_data.get('method', 'llm')
                    },
                    "location_extracted",
                    {
                        "state": location_data.get('state'),
                        "city": location_data.get('city'),
                        "confidence": location_data.get('confidence')
                    }
                )

                api_client.update_tender(tender_id, update_data)

                result['status'] = 'success'
                result['location'] = location_data
                logger.info(f"Updated tender {bid_number} with location: {location_data.get('state')}")
            else:
                # No valid location extracted, set location to "Other"
                result['error'] = 'No valid location extracted, set to "Other"'
                _record_other_location(api_client, tender_id, "No valid location extracted", result)
                if result['status'] == 'success':
                    logger.warning(f"No valid location extracted for tender {bid_number}, set location to 'Other'")

        finally:
            # Best-effort cleanup: a leftover temp file must not overwrite the
            # outcome already recorded in `result`.
            if pdf_path and os.path.exists(pdf_path):
                try:
                    os.unlink(pdf_path)
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove temporary file {pdf_path}: {cleanup_error}")

    except Exception as e:
        # Boundary handler so that one bad tender never aborts a batch run.
        result['error'] = str(e)
        logger.error(f"Error processing tender {bid_number}: {str(e)}", exc_info=True)

    return result

# =================================================================
# MAIN PROCESSING FUNCTION
# =================================================================

def process_all_tenders_handler(
    only_missing: bool = True,
    use_fallback: bool = False,
    use_anthropic: bool = False,
    limit: int = 100
) -> Dict[str, Any]:
    """
    Run one batch of location extraction over tenders from the backend.

    Fetches up to ``limit`` tenders (only those lacking a location when
    ``only_missing`` is set), processes them one by one and tallies the
    per-tender statuses.

    Returns:
        Summary dict with ``status`` ("completed" or "error"), ``message``,
        the counters ``total_tenders``/``processed``/``successful``/
        ``failed``/``skipped``, ``processing_time`` in seconds and the list of
        per-tender ``results``. Any exception is caught and reported as an
        "error" summary with zeroed counters.
    """
    started_at = time.time()

    try:
        logger.info(f"Starting location extraction for tenders (only_missing={only_missing}, limit={limit})")

        # Backend client first, then the LLM key, then the extractor
        api_client = BackendApiClient(BACKEND_API_URL, INTERNAL_API_KEY)
        extractor = TenderLocationExtractor(
            api_key=get_api_key(use_anthropic=use_anthropic),
            use_anthropic=use_anthropic
        )

        if not only_missing:
            tenders = api_client.get_all_tenders(limit=limit)
            logger.info(f"Fetched {len(tenders)} tenders")
        else:
            try:
                # Preferred: dedicated endpoint for tenders without location
                tenders = api_client.get_tenders_without_location(limit=limit)
                logger.info(f"Fetched {len(tenders)} tenders without location")
            except Exception as e:
                # Otherwise fetch everything and filter on this side
                logger.warning(f"Specialized endpoint failed: {str(e)}, fetching all tenders")
                fetched = api_client.get_all_tenders(limit=limit)
                tenders = [candidate for candidate in fetched if not candidate.get('deliveryLocation')]
                logger.info(f"Filtered to {len(tenders)} tenders without location")

        # Never handle more than the requested batch size
        tenders = tenders[:limit]
        total = len(tenders)

        # Anything that is neither 'success' nor 'skipped' counts as failed
        tally = {'success': 0, 'skipped': 0, 'failed': 0}
        results = []
        for position, tender in enumerate(tenders, start=1):
            logger.info(f"Processing tender {position}/{total}")
            outcome = process_single_tender(tender, api_client, extractor, use_fallback)
            results.append(outcome)

            status = outcome['status']
            tally[status if status in ('success', 'skipped') else 'failed'] += 1

        successful = tally['success']
        failed = tally['failed']
        skipped = tally['skipped']
        elapsed = time.time() - started_at

        return {
            "status": "completed",
            "message": f"Processed {total} tenders: {successful} successful, {failed} failed, {skipped} skipped",
            "total_tenders": total,
            "processed": total,
            "successful": successful,
            "failed": failed,
            "skipped": skipped,
            "processing_time": elapsed,
            "results": results
        }

    except Exception as e:
        logger.error(f"Error in process_all_tenders_handler: {str(e)}", exc_info=True)
        failure_summary = {
            "status": "error",
            "message": f"Processing failed: {str(e)}",
        }
        for counter in ("total_tenders", "processed", "successful", "failed", "skipped"):
            failure_summary[counter] = 0
        failure_summary["processing_time"] = time.time() - started_at
        failure_summary["results"] = []
        return failure_summary

# =================================================================
# API ENDPOINT
# =================================================================

@router.post("/process-all-tenders", response_model=ProcessAllTendersResponse)
async def process_all_tenders(request: ProcessAllTendersRequest, background_tasks: BackgroundTasks):
    """
    Process all tenders to extract location information (NON-BLOCKING)

    This endpoint:
    1. Starts processing in the background immediately
    2. Returns a response indicating processing has started
    3. Processing happens asynchronously without blocking other API requests

    Background processing:
    1. Fetches all tenders from the backend
    2. For each tender, downloads the PDF document from S3
    3. Extracts location information using LLM (city, state, full address)
    4. Updates the tender record in the backend with the extracted location

    Parameters:
    - only_missing: Default True - Only process tenders without location data
    - use_fallback: Default False - Use regex fallback instead of LLM
    - use_anthropic: Default False - Use Anthropic Claude instead of OpenAI
    - limit: Default 100 - Maximum number of tenders to process in one run

    This endpoint is designed to be run as a cron job once a day.
    """
    try:
        logger.info(f"Received process-all-tenders request: only_missing={request.only_missing}, limit={request.limit}")
        logger.info("Starting background task for tender processing (non-blocking)")

        # Queue the batch run; the handler's summary dict is not returned to
        # this caller, so the outcome is only visible through the logs.
        handler_options = {
            "only_missing": request.only_missing,
            "use_fallback": request.use_fallback,
            "use_anthropic": request.use_anthropic,
            "limit": request.limit,
        }
        background_tasks.add_task(process_all_tenders_handler, **handler_options)

        # Acknowledge right away; the counters are placeholders, not results.
        acknowledgement = ProcessAllTendersResponse(
            status="processing",
            message=f"Location extraction started in background for up to {request.limit} tenders. Processing will continue asynchronously.",
            total_tenders=0,
            processed=0,
            successful=0,
            failed=0,
            skipped=0,
            processing_time=0,
            results=[]
        )
        return acknowledgement

    except Exception as e:
        logger.error(f"Error starting background task: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

@router.get("/health")
async def health_check():
    """Health check endpoint for location finder API"""
    # Static payload: reports the configured backend URL but does not contact
    # the backend, so "healthy" only means this service is answering.
    payload = dict(
        status="healthy",
        service="location-finder",
        version="2.0.0",
        backend_api=BACKEND_API_URL,
    )
    return payload