import pdfplumber
import re
import json
from typing import Dict, Optional
from pathlib import Path

# For LLM - supports both Anthropic and OpenAI
from anthropic import Anthropic
from openai import OpenAI

# Base URL handed to the OpenAI client when use_anthropic is False. The host is
# DeepInfra's OpenAI-style endpoint, not api.openai.com.
OPENAI_URL = "https://api.deepinfra.com/v1/openai"
# Model identifier requested from that endpoint for chat completions.
OPENAI_MODEL = "Qwen/Qwen3-Next-80B-A3B-Instruct"

class TenderLocationExtractor:
    """Extract delivery location from GeM tender PDFs.

    Text is pulled from the PDF with pdfplumber and then handed either to an
    LLM (Anthropic, or an OpenAI-compatible endpoint) or to a regex-based
    fallback in order to identify the city, state and delivery address.
    """

    # Model requested when the Anthropic client is selected.
    ANTHROPIC_MODEL = "claude-sonnet-4-20250514"

    # Markers that precede the consignee address, in priority order.  The first
    # entry has every Devanagari glyph doubled; presumably that is how
    # pdfplumber emits the bold table header -- TODO confirm.  The plain
    # spelling is only tried when the doubled one is absent.
    ADDRESS_MARKERS = ("पपतताा/Address", "पता/Address")

    # The 28 states followed by the 8 union territories; mirrors the list the
    # LLM prompt restricts the "state" field to.
    STATES_AND_UTS = (
        'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chhattisgarh',
        'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jharkhand', 'Karnataka',
        'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram',
        'Nagaland', 'Odisha', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu',
        'Telangana', 'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal',
        'Andaman and Nicobar Islands', 'Chandigarh',
        'Dadra and Nagar Haveli and Daman and Diu', 'Delhi', 'Jammu and Kashmir',
        'Ladakh', 'Lakshadweep', 'Puducherry',
    )

    # Case-folded name -> canonical spelling, used to normalise regex hits.
    _STATE_LOOKUP = {name.casefold(): name for name in STATES_AND_UTS}

    # One alternation over every name so a single search yields the name that
    # occurs EARLIEST in the text (longest names first to avoid partial hits).
    _STATE_PATTERN = re.compile(
        r'\b(?:'
        + '|'.join(map(re.escape, sorted(STATES_AND_UTS, key=len, reverse=True)))
        + r')\b',
        re.IGNORECASE,
    )

    # Address label followed by 20-200 characters on a single line.
    _ADDRESS_PATTERN = re.compile(
        r'(?:पता|Address|परेषती)[:\s]*([^\n]{20,200})', re.IGNORECASE
    )

    def __init__(self, api_key: str, use_anthropic: bool = True):
        """
        Initialize with API key for LLM service

        Args:
            api_key: API key for the selected provider (Anthropic when
                use_anthropic is True, otherwise the OpenAI-compatible
                endpoint at OPENAI_URL)
            use_anthropic: If True, use Anthropic Claude; if False, use the
                OpenAI client pointed at OPENAI_URL
        """
        self.use_anthropic = use_anthropic
        if use_anthropic:
            self.client = Anthropic(api_key=api_key)
        else:
            self.client = OpenAI(api_key=api_key, base_url=OPENAI_URL)

    @staticmethod
    def _empty_location(error: str) -> Dict[str, Optional[str]]:
        """Return a low-confidence, all-null location result carrying *error*."""
        return {
            "city": None,
            "state": None,
            "full_address": None,
            "confidence": "low",
            "error": error,
        }

    @classmethod
    def _parse_llm_response(cls, result_text: Optional[str]) -> Dict[str, Optional[str]]:
        """Turn the raw LLM reply into a location dictionary.

        Tolerates markdown code fences and prose around the JSON object by
        parsing only the span from the first "{" to the last "}".  Replies
        that are missing, not valid JSON, or not a JSON object yield the
        all-null failure result instead of raising.
        """
        try:
            if not isinstance(result_text, str):
                raise ValueError("LLM returned no text content")
            start = result_text.find("{")
            end = result_text.rfind("}")
            candidate = result_text[start:end + 1] if 0 <= start < end else result_text
            location_data = json.loads(candidate.strip())
            if not isinstance(location_data, dict):
                raise ValueError("LLM response is not a JSON object")
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            print(f"Error parsing LLM response: {e}")
            print(f"Raw response: {result_text}")
            return cls._empty_location("Failed to parse LLM response")

        # The prompt template shows "... or null" inside quotes, so models
        # sometimes answer with the string "null"; map that (and missing
        # keys) to a real None.
        for key in ("city", "state", "full_address"):
            value = location_data.get(key)
            if isinstance(value, str) and value.strip().casefold() in ("", "null", "none"):
                value = None
            location_data[key] = value
        return location_data

    @classmethod
    def _find_state(cls, *candidates: Optional[str]) -> Optional[str]:
        """Return the canonical name of the first state/UT found.

        Candidates are searched in the order given; within one candidate the
        earliest occurrence wins.
        """
        for chunk in candidates:
            if not chunk:
                continue
            match = cls._STATE_PATTERN.search(chunk)
            if match:
                found = match.group(0)
                return cls._STATE_LOOKUP.get(found.casefold(), found)
        return None

    @classmethod
    def _trim_to_address_context(cls, text: str, context_chars: int = 400) -> tuple:
        """Cut *text* down to the characters following the last address marker.

        Args:
            text: Full text extracted from the PDF
            context_chars: How many characters to keep after the marker

        Returns:
            (trimmed_text, marker) when a marker from ADDRESS_MARKERS is
            found, otherwise (text, None) with the text untouched

        Raises:
            ValueError: If context_chars is negative
        """
        if context_chars < 0:
            raise ValueError("context_chars must be non-negative")
        for marker in cls.ADDRESS_MARKERS:
            idx = text.rfind(marker)
            if idx != -1:
                start_idx = idx + len(marker)
                return text[start_idx:start_idx + context_chars], marker
        return text, None

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text from PDF file using pdfplumber

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Text of all pages joined by newlines, stripped of surrounding
            whitespace (pages without extractable text are skipped)

        Raises:
            FileNotFoundError: If nothing exists at pdf_path
            ValueError: If the file is empty or is not a valid PDF
            RuntimeError: If pdfplumber fails for any other reason
        """
        path = Path(pdf_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file does not exist at path: {pdf_path}")

        file_size = path.stat().st_size
        if file_size == 0:
            raise ValueError(f"PDF file is empty (0 bytes): {pdf_path}")

        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
        except Exception as e:
            error_msg = str(e)
            # Chain the original error so the pdfplumber traceback is kept.
            if "No /Root object" in error_msg or "not a PDF" in error_msg:
                raise ValueError(
                    f"Invalid or corrupted PDF file (size: {file_size} bytes): {error_msg}"
                ) from e
            raise RuntimeError(f"Error extracting text from PDF: {error_msg}") from e

        return "\n".join(t for t in page_texts if t).strip()

    def extract_location_with_llm(self, text: str) -> Dict[str, Optional[str]]:
        """
        Use LLM to extract location information from tender text

        Args:
            text: Extracted text from PDF

        Returns:
            Dictionary with city, state, full_address and confidence as
            reported by the model.  If the reply cannot be parsed as a JSON
            object, the location fields are None, confidence is "low" and an
            "error" key is added.
        """
        prompt = f"""Analyze the following GeM (Government e-Marketplace) tender document text and extract the delivery location information.

Focus on finding:
1. The consignee address (परेषती/Consignee address)
2. Office location (कार्यालय का नाम/Office Name)
3. Any delivery address mentioned

From this information, identify:
- City name
- State name (if city is mentioned but not state, infer the state from the city)
- Full delivery address

Tender Document Text:
{text}

Please respond in JSON format with the following structure:
{{
    "city": "city name or null",
    "state": "full state name or null",
    "full_address": "complete delivery address or null",
    "confidence": "high/medium/low"
}}

Important:
- For city names, map them to their respective states (e.g., Kanchipuram -> Tamil Nadu, Mumbai -> Maharashtra, Kadapa -> Andhra Pradesh)
- Use full state names, not abbreviations (e.g., "Andhra Pradesh" not "AP")
- Name of the state has to be one out of 28 states or 8 UTs. That is one out of: Andhra Pradesh, Arunachal Pradesh, Assam, Bihar, Chhattisgarh, Goa, Gujarat, Haryana, Himachal Pradesh, Jharkhand, Karnataka, Kerala, Madhya Pradesh, Maharashtra, Manipur, Meghalaya, Mizoram, Nagaland, Odisha, Punjab, Rajasthan, Sikkim, Tamil Nadu, Telangana, Tripura, Uttar Pradesh, Uttarakhand, and West Bengal. Andaman and Nicobar Islands, Chandigarh, Dadra and Nagar Haveli and Daman and Diu, Delhi, Jammu and Kashmir, Ladakh, Lakshadweep, and Puducherry.
- If multiple addresses exist, prioritize the consignee/delivery address
- Return null for fields you cannot confidently extract"""

        if self.use_anthropic:
            response = self.client.messages.create(
                model=self.ANTHROPIC_MODEL,
                max_tokens=1000,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # Concatenate every block that carries text instead of indexing
            # content[0], which fails on an empty content list.
            result_text = "".join(
                getattr(block, "text", None) or "" for block in response.content
            )
        else:
            print("\nInvoking LLM with OpenAI model:", OPENAI_MODEL)
            response = self.client.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that extracts location (Indian State) information from tender documents."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"}
            )
            result_text = response.choices[0].message.content

        print("\nLLM response is:", result_text)

        return self._parse_llm_response(result_text)

    def extract_location_regex_fallback(self, text: str) -> Dict[str, Optional[str]]:
        """
        Fallback method using regex patterns to extract location

        The address is the first single-line run of 20-200 characters that
        follows an address label.  The state is looked up in that address
        first and, failing that, anywhere in the text; the earliest mention
        wins.  The city is never determined by this method.

        Args:
            text: Extracted text from PDF

        Returns:
            Dictionary with extracted location information
        """
        text = text or ""

        address_match = self._ADDRESS_PATTERN.search(text)
        full_address = address_match.group(1).strip() if address_match else None

        return {
            "city": None,
            "state": self._find_state(full_address, text),
            "full_address": full_address,
            "confidence": "low",
            "method": "regex_fallback"
        }

    def process_tender_pdf(self, pdf_path: str, use_fallback: bool = False,
                           context_chars: int = 400) -> Dict:
        """
        Main method to process tender PDF and extract location

        Args:
            pdf_path: Path to the PDF file
            use_fallback: If True, use regex fallback instead of LLM
            context_chars: Number of characters kept after the last address
                marker before the text is analysed

        Returns:
            Dictionary with file_name, file_path and the extracted location
            fields
        """
        print(f"Processing: {pdf_path}")

        # Extract text
        text = self.extract_text_from_pdf(pdf_path)
        print(f"Extracted {len(text)} characters from PDF")

        # Keep only the text right after the last address marker, if any
        text, marker = self._trim_to_address_context(text, context_chars)
        if marker is not None:
            print(f"Trimmed text to {context_chars} characters after last '{marker}'")
        else:
            print("No address marker found, using full text")

        # Extract location
        if use_fallback:
            location_info = self.extract_location_regex_fallback(text)
        elif not text.strip():
            # Nothing to analyse (e.g. an image-only PDF): skip the LLM call.
            location_info = self._empty_location("No extractable text found in PDF")
        else:
            location_info = self.extract_location_with_llm(text)

        # Add metadata; it takes precedence over same-named keys that the
        # model may have put into its JSON.
        result = {
            "file_name": Path(pdf_path).name,
            "file_path": pdf_path,
        }
        for key, value in location_info.items():
            result.setdefault(key, value)

        return result