import os
import shutil
import time
import re
import requests
import PyPDF2
import anthropic
import hashlib
import logging
from pathlib import Path
import bid_prep_automation as bpa
import bid_queries as bq
import file_download_robust as fdr

import argparse
import sys
import json
from typing import Dict, List, Any, Optional, Tuple

# The Gemini call sites below use the google-genai SDK (client.models.generate_content
# with types.GenerateContentConfig), so import that SDK rather than the legacy
# google.generativeai package.
from google import genai
from google.genai import types

from openai import OpenAI

from document_extractor import extract_documents_text_compatible, create_document_extractor

from dotenv import load_dotenv
load_dotenv()
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
ANTHROPIC_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-3-5-haiku-latest")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "Qwen/Qwen3-Next-80B-A3B-Instruct")
OPENAI_URL = os.getenv("OPENAI_URL", "https://api.deepinfra.com/v1/openai")

# Configuration
COMPANY_INFO_DOC = "/Path/to/Company/Info/summary"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Initialize Gemini client (google-genai SDK)
gemini_client = genai.Client(api_key=GEMINI_API_KEY)

llm_model = os.getenv("GENAI_ENGINE", "claude")


# Initialize Claude client
claude_client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

claude_model = ANTHROPIC_MODEL

# Create an OpenAI-compatible client with the configured API key and endpoint
openai = OpenAI(
    api_key=OPENAI_API_KEY,
    base_url=OPENAI_URL,
)

class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

def extract_links_from_pdf(pdf_path):
    """
    Extract embedded hyperlinks from PDF files.
    Skip all hyperlinks that appear AFTER the 'अस्वीकरण/Disclaimer' text starts.
    Include hyperlinks that appear BEFORE the disclaimer, even on the same page.
    """
    
    def find_disclaimer_info(pdf_reader):
        """
        Find page number and the text position where disclaimer starts.
        Returns (page_num, char_position, text_before_disclaimer, text_after_disclaimer)
        """
        # The comparison below lowercases both sides, so one casing per marker is enough
        disclaimer_markers = [
            'अस्वीकरण/disclaimer',
            'disclaimer',
        ]
        
        for page_num, page in enumerate(pdf_reader.pages):
            page_text = page.extract_text()
            page_text_lower = page_text.lower()
            
            for marker in disclaimer_markers:
                pos = page_text_lower.find(marker.lower())
                if pos != -1:
                    text_before = page_text[:pos]
                    text_after = page_text[pos:]
                    logger.info(f"Disclaimer found at Page {page_num + 1}, character position {pos}")
                    return page_num, pos, text_before, text_after
        
        return None, None, None, None
    
    def get_annotation_y_position(annotation_obj):
        """
        Get the Y-coordinate of an annotation from its rectangle.
        Returns Y position or None if not available.
        In PDF coordinates, Y increases from bottom to top.
        """
        try:
            if '/Rect' in annotation_obj:
                rect = annotation_obj['/Rect']
                # rect is [x1, y1, x2, y2]
                # Use the bottom Y coordinate (y1) as the position
                return float(rect[1])
        except Exception:
            pass
        return None
    
    def estimate_disclaimer_y_position(page, page_text, disclaimer_position):
        """
        Estimate the Y-coordinate where the disclaimer text appears.
        This is approximate since we're working with extracted text.
        """
        try:
            # Get page height
            mediabox = page.mediabox
            page_height = float(mediabox.height)
            
            # Estimate: assume text flows from top to bottom
            # Calculate what fraction of the text the disclaimer position represents
            text_length = len(page_text)

            
            if text_length > 0:
                fraction = disclaimer_position / text_length
                # Convert to Y coordinate (higher Y = higher on page in PDF)
                # Text at top of page has high Y, text at bottom has low Y
                # 1.45 is an empirical scaling factor compensating for the rough text-fraction estimate
                estimated_y = page_height * (1 - fraction * 1.45)
                return estimated_y
        except Exception:
            pass
        return None
    
    def is_url_before_disclaimer(annotation_obj, page_num, page, page_text, disclaimer_page, 
                                  disclaimer_char_pos, text_before_disclaimer, text_after_disclaimer):
        """
        Determine if a URL appears before the disclaimer text.
        """
        # If no disclaimer found, include everything
        if disclaimer_page is None:
            return True
        
        # If URL is on a page before disclaimer page, include it
        if page_num < disclaimer_page:
            return True
        
        # If URL is on a page after disclaimer page, skip it
        if page_num > disclaimer_page:
            return False
        
        # URL is on the same page as disclaimer
        # Need to check if it appears before or after the disclaimer position
        
        # Method 1: Try to find the URL's actual text in the page and check position
        if '/A' in annotation_obj and '/URI' in annotation_obj['/A']:
            uri = annotation_obj['/A']['/URI']
            
            # Check if URI appears as visible text
            if uri in text_before_disclaimer:
                logger.debug(f"URI found before disclaimer (text match)")
                return True
            if uri in text_after_disclaimer:
                logger.debug(f"URI found after disclaimer (text match)")
                return False
        
        # Method 2: Use Y-coordinates to estimate position
        annotation_y = get_annotation_y_position(annotation_obj)
        disclaimer_y = estimate_disclaimer_y_position(page, page_text, disclaimer_char_pos)
        
        if annotation_y is not None and disclaimer_y is not None:
            # In PDF coordinates, higher Y = higher on page
            # If annotation Y > disclaimer Y, annotation is ABOVE (before) disclaimer
            # If annotation Y < disclaimer Y, annotation is BELOW (after) disclaimer
            if annotation_y > disclaimer_y:
                logger.info(f"Annotation Y ({annotation_y:.1f}) > Disclaimer Y ({disclaimer_y:.1f}) - before disclaimer")
                return True
            else:
                logger.info(f"Annotation Y ({annotation_y:.1f}) <= Disclaimer Y ({disclaimer_y:.1f}) - after disclaimer")
                return False
        
        # Method 3: Try checking annotation display text properties
        for key in ['/Contents', '/T', '/TU']:
            if key in annotation_obj:
                context_text = str(annotation_obj[key])
                if context_text:
                    if context_text in text_before_disclaimer:
                        logger.debug(f"Display text found before disclaimer")
                        return True
                    if context_text in text_after_disclaimer:
                        logger.debug(f"Display text found after disclaimer")
                        return False
        
        # Default: if uncertain and on disclaimer page, skip it to be safe
        logger.info(f"Cannot determine URL position on Page {page_num + 1} - skipping to be safe")
        return False
    
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            
            # First pass: find where disclaimer section starts
            disclaimer_page, disclaimer_char_pos, text_before, text_after = find_disclaimer_info(pdf_reader)
            
            if disclaimer_page is None:
                logger.info("No disclaimer section found - will extract all URLs")
            else:
                logger.info(f"Disclaimer section starts on Page {disclaimer_page + 1} at character position {disclaimer_char_pos}")
                logger.info(f"Will include URLs before disclaimer, skip URLs after disclaimer")
            
            all_links = []
            seen_urls = set()
            
            # Second pass: extract URLs
            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                
                # Quick check: if page is entirely after disclaimer, skip it
                if disclaimer_page is not None and page_num > disclaimer_page:
                    logger.info(f"Page {page_num + 1}: Skipping (entirely after disclaimer)")
                    continue
                
                # Extract URLs from annotations
                if '/Annots' in page:
                    annotations = page['/Annots']
                    if annotations:
                        for annotation in annotations:
                            try:
                                annotation_object = annotation.get_object()
                                
                                if annotation_object.get('/Subtype') == '/Link':
                                    if '/A' in annotation_object and '/URI' in annotation_object['/A']:
                                        uri = annotation_object['/A']['/URI']
                                        
                                        if isinstance(uri, str) and uri not in seen_urls:
                                            # Check if URL appears before disclaimer
                                            if is_url_before_disclaimer(annotation_object, page_num, page,
                                                                       page_text, disclaimer_page, 
                                                                       disclaimer_char_pos, text_before, text_after):
                                                all_links.append({
                                                    'url': uri,
                                                    'page': page_num + 1
                                                })
                                                seen_urls.add(uri)
                                                logger.info(f"Page {page_num + 1}: ✓ Extracted URL: {uri}")
                                            else:
                                                logger.info(f"Page {page_num + 1}: ✗ Skipped URL (after disclaimer): {uri}")
                            except Exception as e:
                                logger.debug(f"Error processing annotation: {e}")
                                pass
                
                # Try alternative method for newer PyPDF2 versions
                try:
                    page_links = page.get_links()
                    if page_links:
                        for link in page_links:
                            if hasattr(link, 'url') and link.url and link.url not in seen_urls:
                                # For page links, we can only check page number
                                if disclaimer_page is None or page_num < disclaimer_page:
                                    all_links.append({
                                        'url': link.url,
                                        'page': page_num + 1
                                    })
                                    seen_urls.add(link.url)
                                    logger.info(f"Page {page_num + 1}: ✓ Extracted URL: {link.url}")
                                else:
                                    logger.info(f"Page {page_num + 1}: ✗ Skipped URL: {link.url}")
                except (AttributeError, TypeError):
                    pass
            
            # Fallback: extract URLs from text
            if not all_links:
                logger.info("No embedded links found, using text extraction...")
                
                if disclaimer_page is None:
                    # No disclaimer, extract from all pages
                    for page_num, page in enumerate(pdf_reader.pages):
                        page_text = page.extract_text()
                        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', page_text)
                        for url in urls:
                            if url not in seen_urls:
                                all_links.append({'url': url, 'page': page_num + 1})
                                seen_urls.add(url)
                                logger.info(f"Page {page_num + 1}: Extracted URL from text: {url}")
                else:
                    # Extract from pages before disclaimer
                    for page_num in range(disclaimer_page):
                        page = pdf_reader.pages[page_num]
                        page_text = page.extract_text()
                        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', page_text)
                        for url in urls:
                            if url not in seen_urls:
                                all_links.append({'url': url, 'page': page_num + 1})
                                seen_urls.add(url)
                                logger.info(f"Page {page_num + 1}: Extracted URL from text: {url}")
                    
                    # For disclaimer page, only extract URLs from text before disclaimer
                    if disclaimer_page is not None and text_before:
                        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', text_before)
                        for url in urls:
                            if url not in seen_urls:
                                all_links.append({'url': url, 'page': disclaimer_page + 1})
                                seen_urls.add(url)
                                logger.info(f"Page {disclaimer_page + 1}: Extracted URL from text (before disclaimer): {url}")
            
            logger.info(f"\n{'='*60}")
            logger.info(f"SUMMARY: Total URLs extracted: {len(all_links)}")
            logger.info(f"{'='*60}")
            
            if not all_links:
                return "No links found in the PDF document"
            
            return [link['url'] for link in all_links]
    
    except Exception as e:
        logger.error(f"Error: {str(e)}")
        return f"Error extracting links from PDF: {str(e)}"
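
# Illustrative usage sketch for extract_links_from_pdf. The path below is a
# hypothetical example, not a file shipped with this project. The function
# returns a list of URLs on success, otherwise an informational or error string.
def _demo_extract_links(sample_pdf_path="tender_document.pdf"):
    """Run the link extractor on a sample PDF and print what comes back."""
    result = extract_links_from_pdf(sample_pdf_path)
    if isinstance(result, list):
        for url in result:
            print(f"Found link: {url}")
    else:
        # Non-list results are status or error messages
        print(result)
    return result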

def download_linked_documents(bid_dir, links, process_logger=None):
    """
    Download additional documents from links found in tender documents

    Args:
        bid_dir (str): Directory to save downloaded files to
        links (list): List of links to download
        process_logger: Optional process logger; falls back to the module logger

    Returns:
        list: List of paths to downloaded files
    """
    if process_logger is None:
        process_logger = logger
    downloaded_files = []
    
    for link in links:
        try:
            process_logger.info(f"Downloading file from link: {link}")
            file_path=fdr.download_file_main(link, bid_dir)
            process_logger.info(f"Downloaded file from link {link} and saved to path {file_path}")
            downloaded_files.append(file_path)
        except Exception as e:
            process_logger.error(f"Error downloading {link}: {str(e)}")
    
    process_logger.info(f"Total downloaded files {len(downloaded_files)} at paths {downloaded_files}")
    return downloaded_files

def calculate_file_hash(file_path):
    """
    Calculate SHA-256 hash of file content
    
    Args:
        file_path (Path): Path to the file
        
    Returns:
        str: Hexadecimal hash of the file content
    """
    sha256_hash = hashlib.sha256()
    
    # Read and update hash in chunks to handle large files efficiently
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    
    return sha256_hash.hexdigest()
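
# A minimal alternative sketch: on Python 3.11+, hashlib.file_digest replaces the
# manual chunked loop above with a single call. Kept as a separate helper so the
# original function stays compatible with older interpreters.
def calculate_file_hash_py311(file_path):
    """Equivalent SHA-256 helper built on hashlib.file_digest (Python 3.11+ only)."""
    with open(file_path, "rb") as f:
        return hashlib.file_digest(f, "sha256").hexdigest()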

def remove_duplicate_pdfs(directory_path):
    """
    Remove duplicate PDF files from a directory based on content.
    For each set of identical files, the first one found is kept and others are removed.
    
    Args:
        directory_path (str): Path to directory containing PDF files
        
    Returns:
        tuple: (kept_files, removed_files) lists of filenames
    """
    logger.info(f"🔍 Checking for duplicate PDFs in: {directory_path}")
    
    # Get all PDF files in the directory
    pdf_files = list(Path(directory_path).glob("*.pdf"))
    
    if not pdf_files:
        logger.info("No PDF files found")
        return [], []
    
    # Dictionary to store hash -> [file_paths]
    hash_map = {}
    
    # Calculate hash for each file and group by hash
    for file_path in pdf_files:
        logger.info(f"Analyzing: {file_path.name}")
        file_hash = calculate_file_hash(file_path)
        
        if file_hash in hash_map:
            hash_map[file_hash].append(file_path)
        else:
            hash_map[file_hash] = [file_path]
    
    # Keep track of which files were kept and which were removed
    kept_files = []
    removed_files = []
    
    # Process each group of files with the same hash
    for file_hash, file_paths in hash_map.items():
        # Keep the first file
        kept_file = file_paths[0]
        kept_files.append(kept_file)
        
        # Remove all duplicates
        for duplicate in file_paths[1:]:
            try:
                os.remove(duplicate)
                removed_files.append(duplicate)
                logger.info(f"Removed duplicate: {duplicate.name} (same as {kept_file.name})")
            except Exception as e:
                logger.error(f"Error removing {duplicate.name}: {str(e)}")
    
    logger.info(f"Kept {len(kept_files)} unique files, removed {len(removed_files)} duplicates")
    return [f.name for f in kept_files], [f.name for f in removed_files]
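
# Self-contained demo sketch for remove_duplicate_pdfs: it builds a temporary
# directory holding two byte-identical "PDFs" plus one distinct file (plain bytes
# suffice, since deduplication is purely content-hash based) and runs the dedup pass.
def _demo_remove_duplicate_pdfs():
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        Path(tmp_dir, "a.pdf").write_bytes(b"%PDF-1.4 same content")
        Path(tmp_dir, "b.pdf").write_bytes(b"%PDF-1.4 same content")   # duplicate of a.pdf
        Path(tmp_dir, "c.pdf").write_bytes(b"%PDF-1.4 other content")  # unique
        kept, removed = remove_duplicate_pdfs(tmp_dir)
        print(f"Kept: {kept}, removed: {removed}")
        return kept, removed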

def list_files_in_directory(directory_path):
    """
    Create a list of all files in the specified directory with their full paths.
    
    Args:
        directory_path (str): Path to the directory to scan
        
    Returns:
        list: List of full paths to all files in the directory
    """
    # Convert to Path object for easier handling
    dir_path = Path(directory_path)
    
    # Check if the directory exists
    if not dir_path.exists():
        logger.info(f"Directory does not exist: {directory_path}")
        return []
    
    if not dir_path.is_dir():
        logger.info(f"Path is not a directory: {directory_path}")
        return []
    
    # List to store the full paths
    file_paths = []
    
    # Iterate through all items in the directory
    for item in dir_path.iterdir():
        # Only include files, not directories
        if item.is_file():
            # Add the full path as a string
            file_paths.append(str(item.absolute()))
    
    logger.info(f"Found {len(file_paths)} files in {directory_path}")
    return file_paths

# def format_list_of_dicts(data_list):
#     formatted_string = ""
#     for i, item_dict in enumerate(data_list):
#         formatted_string += f"Annexure {i + 1}:\n"
#         for key, value in item_dict.items():
#             formatted_string += f"  {key}: {value}\n"
#         formatted_string += "\n"  # Add an extra line break between dictionaries
#     return formatted_string

def analyze_tender_with_LLM(documents_text, annexure_hints, process_logger=None):
    """
    Analyze tender documents with the configured LLM (Gemini, Claude, or an OpenAI-compatible model)

    Args:
        documents_text (dict): Dictionary mapping file paths to their text content
        annexure_hints (str): Pre-extracted annexure/form information to attach to the result
        process_logger: Optional process logger for detailed logging

    Returns:
        tuple: (extracted information dict with citations, list of per-chunk analyses)
    """
    # Helper function to log with both loggers
    def log_info(msg):
        logger.info(msg)
        if process_logger:
            process_logger.info(msg)

    def log_error(msg):
        logger.error(msg)
        if process_logger:
            process_logger.error(msg)

    def log_warning(msg):
        logger.warning(msg)
        if process_logger:
            process_logger.warning(msg)

    # Information to extract
    info_to_extract = [
        "Eligibility/Qualification Criteria or conditions for bidder",
        "Pre-bid meeting",
        "Evaluation criteria or method",
        "Documents needed to submit the bid",
        "Scope of work of the whole project",
        "Amount of EMD fee",
        "Relaxation or preference given to any kind of company or bidder",
        "Payment terms",
        "BOQ requirements",
        "Risks",
        "Redlining"
    ]

    # Initialize the result dictionary with structured format
    extracted_info_with_citations = {}
    for item in info_to_extract:
        extracted_info_with_citations[item] = {
            "content": "",
            "citations": [],
            "summary": ""
        }

    # Process all documents, not just the main one
    # First, calculate the total text size to determine chunking strategy
    total_text_size = sum(len(text) for text in documents_text.values())
    log_info(f"Total text size across all documents: {total_text_size} characters")

    # Approach: Process each document separately and then combine the results
    doc_analyses = []

    log_info(f"Starting LLM analysis of {len(documents_text)} documents")
    for doc_path, doc_text in documents_text.items():
        doc_name = os.path.basename(doc_path)
        log_info(f"Analyzing document: {doc_name} ({len(doc_text)} characters)")
        
        # Skip empty documents
        if not doc_text.strip():
            log_info(f"Skipping empty document: {doc_name}")
            continue

        # Extract page markers from the document
        page_markers = extract_page_markers(doc_text)

        # Create chunks based on document size
        chunk_size = 30000  # Adjust based on LLM's token limits
        doc_chunks = create_chunks_with_page_tracking(doc_text, chunk_size, page_markers)

        log_info(f"Split document into {len(doc_chunks)} chunks")
        
        # Process each chunk with enhanced citation tracking
        for chunk_idx, chunk_data in enumerate(doc_chunks):
            chunk_text = chunk_data['text']
            chunk_page_range = chunk_data['page_range']

            # Create enhanced prompt for citation extraction
            prompt = create_citation_prompt(
                doc_name, chunk_idx + 1, len(doc_chunks),
                chunk_text, info_to_extract, chunk_page_range
            )

            try:
                if llm_model == 'gemini':
                    response = gemini_client.models.generate_content(
                        model="gemini-2.5-flash",
                        contents=[prompt],
                        config=types.GenerateContentConfig(
                            system_instruction="You are an expert in analyzing tender documents. Extract information with precise citations.",
                            max_output_tokens=8000,
                            temperature=0.1
                        )
                    )
                    response_text = response.text

                elif llm_model == 'claude':
                    response = claude_client.messages.create(
                        model=claude_model,
                        max_tokens=8000,
                        temperature=0,
                        system="You are an expert in analyzing tender documents. Extract information with precise citations in the specified JSON format.",
                        messages=[
                            {"role": "user", "content": prompt}
                        ]
                    )
                    response_text = response.content[0].text

                elif llm_model == 'open_llm':
                    response = openai.chat.completions.create(
                        model=OPENAI_MODEL,
                        messages=[
                            {"role": "system", "content": "You are an expert in analyzing tender documents. Extract information with precise citations in the specified JSON format."},
                            {"role": "user", "content": prompt},
                        ],
                    )
                    response_text = response.choices[0].message.content

                else:
                    raise ValueError(f"Unsupported GENAI_ENGINE value: {llm_model}")

                # Parse the structured response
                chunk_analysis = parse_llm_response_with_citations(
                    response_text, doc_name, chunk_idx, chunk_text
                )

                doc_analyses.append({
                    "doc_name": doc_name,
                    "chunk_idx": chunk_idx,
                    "page_range": chunk_page_range,
                    "response": response_text,
                    "parsed_data": chunk_analysis
                })

                log_info(f"Successfully analyzed chunk {chunk_idx+1}/{len(doc_chunks)} of {doc_name}")

            except Exception as e:
                log_error(f"Error analyzing chunk {chunk_idx+1} of {doc_name} with LLM: {str(e)}")
                continue

    log_info(f"Completed analysis of all documents. Total chunks analyzed: {len(doc_analyses)}")

    # Combine all analyses into final structured output
    log_info("Combining analyses with citations...")
    combine_analyses_with_citations(extracted_info_with_citations, doc_analyses, info_to_extract)

    # Generate summaries for each category
    log_info("Generating category summaries...")
    generate_category_summaries(extracted_info_with_citations)
    extracted_info_with_citations['Annexures or forms or formats'] = {
        'content': annexure_hints,
        'citations': [],
        'summary': '',
        'citation_count': 0,
        'documents_referenced': []
    }
    log_info("LLM analysis completed successfully")
    return extracted_info_with_citations, doc_analyses

def extract_page_markers(doc_text):
    """
    Extract page markers from document text to track page numbers

    Args:
        doc_text (str): Document text content

    Returns:
        list: List of tuples (page_number, text_position)
    """
    page_markers = []

    # Common page marker patterns
    patterns = [
        r'Doc Page Number:\s*(\d+)'
    ]

    for pattern in patterns:
        for match in re.finditer(pattern, doc_text, re.IGNORECASE):
            page_num = int(match.group(1))
            position = match.start()
            page_markers.append((page_num, position))

    # Sort by position in text
    page_markers.sort(key=lambda x: x[1])

    # Remove duplicates (keep first occurrence of each page)
    seen_pages = set()
    unique_markers = []
    for page_num, position in page_markers:
        if page_num not in seen_pages:
            unique_markers.append((page_num, position))
            seen_pages.add(page_num)

    return unique_markers
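
# Quick illustration of the page-marker format the extractor looks for. The sample
# text is synthetic; in real runs these "Doc Page Number: X" markers are injected
# by the upstream text-extraction step.
def _demo_extract_page_markers():
    sample_text = (
        "Doc Page Number: 1\nIntroduction...\n"
        "Doc Page Number: 2\nEligibility criteria...\n"
        "Doc Page Number: 3\nPayment terms...\n"
    )
    markers = extract_page_markers(sample_text)
    # Expected shape: [(1, 0), (2, <offset>), (3, <offset>)] - page numbers with text positions
    print(markers)
    return markers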

def create_chunks_with_page_tracking(doc_text, chunk_size, page_markers):
    """
    Create text chunks while tracking which pages each chunk spans

    Args:
        doc_text (str): Document text
        chunk_size (int): Maximum chunk size
        page_markers (list): List of page markers with positions

    Returns:
        list: List of chunk dictionaries with text and page range
    """
    chunks = []

    if len(doc_text) <= chunk_size:
        # Single chunk
        page_range = get_page_range_for_text(0, len(doc_text), page_markers)
        chunks.append({
            'text': doc_text,
            'page_range': page_range,
            'start_pos': 0,
            'end_pos': len(doc_text)
        })
        return chunks

    start = 0
    while start < len(doc_text):
        end = start + chunk_size

        # Adjust to end at paragraph boundary if possible
        if end < len(doc_text):
            paragraph_end = doc_text.rfind('\n\n', start, end)
            if paragraph_end > start + (chunk_size * 0.7):
                end = paragraph_end
            else:
                line_end = doc_text.rfind('\n', start, end)
                if line_end > start + (chunk_size * 0.8):
                    end = line_end

        # Get page range for this chunk
        page_range = get_page_range_for_text(start, end, page_markers)

        chunks.append({
            'text': doc_text[start:end],
            'page_range': page_range,
            'start_pos': start,
            'end_pos': end
        })

        start = end

    return chunks
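
# Sketch showing how marker extraction and chunking fit together on synthetic text.
# The tiny chunk_size is purely for demonstration; the production call above uses
# roughly 30,000 characters per chunk.
def _demo_chunking():
    sample_text = (
        "Doc Page Number: 1\n" + ("Scope of work details. " * 20) + "\n\n" +
        "Doc Page Number: 2\n" + ("Payment terms details. " * 20)
    )
    markers = extract_page_markers(sample_text)
    chunks = create_chunks_with_page_tracking(sample_text, 300, markers)
    for chunk in chunks:
        print(f"pages {chunk['page_range']['pages']}: "
              f"{chunk['end_pos'] - chunk['start_pos']} characters")
    return chunks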

def get_page_range_for_text(start_pos, end_pos, page_markers):
    """
    Determine which pages a text span covers

    Args:
        start_pos (int): Start position in text
        end_pos (int): End position in text
        page_markers (list): List of page markers

    Returns:
        dict: Page range information
    """
    if not page_markers:
        return {"start_page": None, "end_page": None, "pages": []}

    start_page = None
    end_page = None

    # Find the page that contains start_pos
    for i, (page_num, position) in enumerate(page_markers):
        if position <= start_pos:
            start_page = page_num
        else:
            break

    # Find the page that contains end_pos
    for i, (page_num, position) in enumerate(page_markers):
        if position <= end_pos:
            end_page = page_num
        else:
            break

    # Generate list of pages spanned
    pages = []
    if start_page is not None and end_page is not None:
        pages = list(range(start_page, end_page + 1))
    elif start_page is not None:
        pages = [start_page]

    return {
        "start_page": start_page,
        "end_page": end_page,
        "pages": pages
    }

def create_citation_prompt(doc_name, chunk_idx, total_chunks, chunk_text, info_to_extract, page_range):
    """
    Create a balanced prompt that extracts all categories with equal attention
    while providing category-specific guidance where needed.
    """
    categories_list = '\n'.join([f"{i+1}. {item}" for i, item in enumerate(info_to_extract)])

    page_info = ""
    if page_range['pages']:
        page_info = f"This chunk spans pages: {page_range['pages']}"

    prompt = f"""
You are analyzing tender/RFP documents for comprehensive information extraction with precise citations.

Document: {doc_name}
Chunk: {chunk_idx}/{total_chunks}
{page_info}

═══════════════════════════════════════════════════════════════════════════════
TASK: Extract ALL of the following information categories from this chunk diligently.
═══════════════════════════════════════════════════════════════════════════════

CATEGORIES TO EXTRACT:
{categories_list}

═══════════════════════════════════════════════════════════════════════════════
SECTION 1: UNIVERSAL RULES (Apply to ALL categories)
═══════════════════════════════════════════════════════════════════════════════

【CITATION RULES】- Apply to EVERY category extracted:
• Extract the EXACT text span that contains the relevant information
• Use page numbers from "--- Doc Page Number: X ---" markers in the document
• If no specific page number found, use null
• Confidence ratings:
  - high: Direct, unambiguous information
  - medium: Implied or partially stated information  
  - low: Inferred or unclear information

【FORMATTING RULES】- Apply to EVERY category's content:
• Clean up and rephrase content - don't copy verbatim. Summarize and rephrase information.
• Use markdown: **bold headings**, *italics for emphasis*, bullet points
• Use proper paragraph breaks and logical organization with clear spacing
• Make content user-friendly and easy to understand

【OUTPUT FORMAT】- Required JSON structure:
{{
  "extracted_data": [
    {{
      "category": "Category name",
      "found": true/false,
      "content": "Well-formatted, rephrased information with proper structure and line breaks",
      "citations": [
        {{
          "text_span": "Exact text from document that supports this information",
          "page_number": page_number_or_null,
          "confidence": "high/medium/low"
        }}
      ]
    }}
  ]
}}

═══════════════════════════════════════════════════════════════════════════════
SECTION 2: CATEGORY-SPECIFIC GUIDANCE
═══════════════════════════════════════════════════════════════════════════════

Below are specific instructions for categories that need additional clarity.
For categories not listed here, apply standard extraction using the universal rules above.

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: Eligibility/Qualification Criteria                                │
└─────────────────────────────────────────────────────────────────────────────┘
Extract ALL criteria bidders must meet:
• Financial criteria (turnover, net worth requirements)
• Technical criteria (experience, past work orders, certifications)
• Legal criteria (registration requirements, not blacklisted)
• Capacity criteria (equipment, manpower)
• Any minimum/maximum thresholds specified
Include specific values, percentages, and time periods mentioned.

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: Pre-bid meeting                                                   │
└─────────────────────────────────────────────────────────────────────────────┘
Extract if any information is given:
• Date and time of the meeting
• Mode (online/offline/hybrid)
• Venue or online meeting link
• Contact details for queries

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: Evaluation criteria or method                                     │
└─────────────────────────────────────────────────────────────────────────────┘
Extract the complete evaluation methodology:
• Scoring criteria and weightages
• Technical vs commercial evaluation split
• Minimum qualifying marks/scores
• Complete scoring tables if specified
• L1/lowest bid selection criteria

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: Documents needed to submit the bid                                │
└─────────────────────────────────────────────────────────────────────────────┘
Focus on documents that bidders should submit with their bids, 
such as certificates, experience letters, financial documents, compliance statements etc. 
If a document is mentioned as 'Additional Doc 1 (Requested in ATC) or Additional Document 2 (Requested in ATC) etc' 
in the RFP/Tender, it needs to be ignored. These are documents with generic description like 
'Additional document format <n> as specified in ATC'. These documents should not be included in the final list 
of documents. 
*Also DO NOT include any annexures, forms, proformas given in the RFP document chunk in this list.* Annexures will be taken care of separately.

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: Amount of EMD fee                                                 │
└─────────────────────────────────────────────────────────────────────────────┘
Extract:
• Exact EMD amount
• Mode of payment (DD, BG, online, etc.)
• Bank details if specified

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: Relaxation or preference                                          │
└─────────────────────────────────────────────────────────────────────────────┘
Extract any preferences given to:
• MSE/MSME bidders
• Startups (DIPP registered)
• SC/ST owned enterprises
• Local suppliers
• Make in India preferences
• Any turnover/experience relaxations
• Any EMD Fee relaxations

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: Payment terms                                                     │
└─────────────────────────────────────────────────────────────────────────────┘
Extract complete payment structure:
• Payment milestones and percentages
• Advance payment provisions
• Running bill/monthly payment terms
• Final payment conditions
• Payment timeline after bill submission
• Any deductions (retention, taxes)

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: BOQ requirements                                                  │
└─────────────────────────────────────────────────────────────────────────────┘
Extract:
• Bill of Quantities structure
• How rates should be quoted (per unit, lump sum, percentage)
• Any specific BOQ filling instructions
• Price variation clauses
• Quantity variation limits

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: Scope of work                                                     │
└─────────────────────────────────────────────────────────────────────────────┘
Extract:
• Main deliverables and activities
• Work location/site details
• Duration/timeline
• Key milestones
• Contractor's responsibilities
• Items/services included and excluded

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: Risks                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
Extract all risky clauses:
• Penalty clauses
• Liquidated Damages (LD) / Mutually Agreed Damages (MAD)
• Indemnification requirements
• Termination conditions
• Forfeiture conditions
• Liability caps or unlimited liability
• Force majeure exclusions

┌─────────────────────────────────────────────────────────────────────────────┐
│ CATEGORY: Redlining                                                         │
└─────────────────────────────────────────────────────────────────────────────┘
Identify potential queries/clarifications for:
• Ambiguous terms or conditions
• One-sided or unfair clauses
• Missing information that should be clarified
• Terms that could be negotiated in pre-bid meeting
• Inconsistencies in the document

═══════════════════════════════════════════════════════════════════════════════
SECTION 3: FINAL REMINDERS
═══════════════════════════════════════════════════════════════════════════════

✦ Extract ALL listed categories - do not skip any
✦ If a category is not found in this chunk, mark found: false and content: "Not found in this chunk"
✦ Apply CITATION RULES to every extraction
✦ Apply FORMATTING RULES to all content
✦ Each category deserves equal attention and thoroughness

═══════════════════════════════════════════════════════════════════════════════
DOCUMENT CHUNK TO ANALYZE:
═══════════════════════════════════════════════════════════════════════════════

{chunk_text}

═══════════════════════════════════════════════════════════════════════════════
Return ONLY the JSON response covering every listed category. No additional text.
Use proper markdown formatting (**bold**, *italic*, bullet points, proper spacing, line breaks) in content fields.
"""

    return prompt


def safe_json_parse(json_string, default=None):
    """Safely parse JSON with multiple fallback strategies"""
    
    # Strategy 1: Try normal parsing
    try:
        return json.loads(json_string)
    except json.JSONDecodeError:
        pass
    
    # Strategy 2: Try fixing backslashes
    try:
        fixed_string = json_string.replace('\\', '\\\\')
        # But fix valid escape sequences back
        fixed_string = fixed_string.replace('\\\\n', '\\n')
        fixed_string = fixed_string.replace('\\\\t', '\\t')
        fixed_string = fixed_string.replace('\\\\r', '\\r')
        fixed_string = fixed_string.replace('\\\\\\\\', '\\\\')
        fixed_string = fixed_string.replace('\\\\"', '\\"')
        
        return json.loads(fixed_string)
    except json.JSONDecodeError:
        pass
    
    # Strategy 3: Remove all backslashes (last resort)
    try:
        cleaned_string = json_string.replace('\\', '')
        return json.loads(cleaned_string)
    except json.JSONDecodeError:
        pass
    
    # If all strategies fail, return default value
    logger.warning(f"Could not parse JSON: {json_string[:100]}...")
    return default
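
# Small sketch of the fallback behaviour: the first input parses directly, the second
# contains a stray backslash that plain json.loads rejects but strategy 2 repairs, and
# the third is unrecoverable, so the supplied default comes back.
def _demo_safe_json_parse():
    ok = safe_json_parse('{"found": true, "content": "EMD of INR 50,000"}')
    repaired = safe_json_parse('{"content": "Path \\ value"}')
    failed = safe_json_parse('not json at all', default={"extracted_data": []})
    print(ok, repaired, failed)
    return ok, repaired, failed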

def parse_llm_response_with_citations(response_text, doc_name, chunk_idx, chunk_text):
    """
    Parse LLM response and extract structured data with citations

    Args:
        response_text (str): Raw LLM response
        doc_name (str): Document name
        chunk_idx (int): Chunk index
        chunk_text (str): Original chunk text

    Returns:
        dict: Parsed structured data
    """
    try:
        # Try to extract JSON from response
        json_match = re.search(r'```json\s*(.*?)\s*```', response_text, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)
        else:
            # Look for JSON without markdown
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
            else:
                raise ValueError("No JSON found in response")

        # Default to an empty structure so downstream validation never receives None
        parsed_data = safe_json_parse(json_str, default={"extracted_data": []})

        # Validate and enhance the parsed data
        enhanced_data = validate_and_enhance_citations(
            parsed_data, doc_name, chunk_idx, chunk_text
        )

        return enhanced_data

    except Exception as e:
        logger.warning(f"Error parsing LLM response for {doc_name} chunk {chunk_idx}: {str(e)}")
        # Fallback to text parsing
        return fallback_text_parsing(response_text, doc_name, chunk_idx)

def validate_and_enhance_citations(parsed_data, doc_name, chunk_idx, chunk_text):
    """
    Validate and enhance citation data from LLM response

    Args:
        parsed_data (dict): Parsed JSON data from LLM
        doc_name (str): Document name
        chunk_idx (int): Chunk index
        chunk_text (str): Original chunk text

    Returns:
        dict: Enhanced citation data
    """
    enhanced_data = {"extracted_data": []}

    if "extracted_data" not in parsed_data:
        return enhanced_data

    for item in parsed_data["extracted_data"]:
        if not item.get("found", False):
            continue

        enhanced_item = {
            "category": item.get("category", "Unknown"),
            "found": True,
            "content": item.get("content", ""),
            "citations": []
        }

        # Process citations
        for citation in item.get("citations", []):
            enhanced_citation = {
                "document_name": doc_name,
                "text_span": citation.get("text_span", ""),
                "page_number": citation.get("page_number"),
                "chunk_index": chunk_idx,
                "confidence": citation.get("confidence", "medium")
            }

            # Validate text span exists in chunk
            text_span = enhanced_citation["text_span"]
            if text_span and text_span.lower() in chunk_text.lower():
                # Find the position of the text span for better accuracy
                span_position = chunk_text.lower().find(text_span.lower())
                if span_position != -1:
                    enhanced_citation["text_position"] = span_position

            enhanced_item["citations"].append(enhanced_citation)

        if enhanced_item["citations"]:  # Only add if we have citations
            enhanced_data["extracted_data"].append(enhanced_item)

    return enhanced_data
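
# Tiny sanity-check sketch for the citation validator: one citation whose text span
# really occurs in the chunk (so a text_position gets attached) and one category
# marked as not found (so it is dropped from the enhanced output).
def _demo_validate_and_enhance_citations():
    chunk_text = "The EMD amount is INR 50,000 payable via demand draft."
    parsed = {
        "extracted_data": [
            {
                "category": "Amount of EMD fee",
                "found": True,
                "content": "**EMD:** INR 50,000 via demand draft",
                "citations": [{"text_span": "EMD amount is INR 50,000",
                               "page_number": 3, "confidence": "high"}],
            },
            {"category": "Pre-bid meeting", "found": False, "content": "", "citations": []},
        ]
    }
    enhanced = validate_and_enhance_citations(parsed, "sample_tender.pdf", 0, chunk_text)
    print(json.dumps(enhanced, indent=2))
    return enhanced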

def fallback_text_parsing(response_text, doc_name, chunk_idx):
    """
    Fallback method to parse text when JSON parsing fails

    Args:
        response_text (str): Raw response text
        doc_name (str): Document name
        chunk_idx (int): Chunk index
    Returns:
        dict: Basic parsed data structure
    """
    extracted_data = {"extracted_data": []}

    # Basic text parsing logic here
    # This is a simplified version - you can enhance based on your needs

    return extracted_data
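
# A minimal heuristic sketch for the fallback path above, usable when the model never
# returns valid JSON. It assumes the free-text reply uses "Category: ..." style headings,
# which is only a guess about the response shape, and tags everything it finds with
# low-confidence citations.
def heuristic_text_parsing(response_text, doc_name, chunk_idx):
    extracted_data = {"extracted_data": []}
    # Match "Some Category: content..." blocks, capturing up to the next heading or the end
    pattern = re.compile(
        r'^([A-Z][^:\n]{3,80}):\s*(.+?)(?=^\S[^:\n]{3,80}:|\Z)',
        re.MULTILINE | re.DOTALL,
    )
    for match in pattern.finditer(response_text):
        category, content = match.group(1).strip(), match.group(2).strip()
        if not content:
            continue
        extracted_data["extracted_data"].append({
            "category": category,
            "found": True,
            "content": content,
            "citations": [{
                "document_name": doc_name,
                "text_span": content[:200],
                "page_number": None,
                "chunk_index": chunk_idx,
                "confidence": "low",
            }],
        })
    return extracted_data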

def combine_analyses_with_citations(extracted_info_with_citations, doc_analyses, info_to_extract):
    """
    Combine all chunk analyses into final structured output with citations

    Args:
        extracted_info_with_citations (dict): Final output dictionary
        doc_analyses (list): List of chunk analyses
        info_to_extract (list): Categories to extract
    """
    for analysis in doc_analyses:
        if "parsed_data" not in analysis:
            continue

        parsed_data = analysis["parsed_data"]
        doc_name = analysis["doc_name"]
        chunk_idx = analysis["chunk_idx"]

        for item in parsed_data.get("extracted_data", []):
            category = item.get("category", "")

            # Find matching category in our extraction list
            matching_category = None
            for extract_category in info_to_extract:
                if category.lower() in extract_category.lower() or extract_category.lower() in category.lower():
                    matching_category = extract_category
                    break

            if not matching_category:
                continue

            # Add content and citations
            content = item.get("content", "")
            if content and content.strip():
                if extracted_info_with_citations[matching_category]["content"]:
                    extracted_info_with_citations[matching_category]["content"] += f"\n\n--- From {doc_name} (chunk {chunk_idx+1}) ---\n{content}"
                else:
                    extracted_info_with_citations[matching_category]["content"] = f"--- From {doc_name} (chunk {chunk_idx+1}) ---\n{content}"

                # Add citations
                extracted_info_with_citations[matching_category]["citations"].extend(
                    item.get("citations", [])
                )


def generate_category_summaries(extracted_info_with_citations):
    """
    Generate summaries for each category

    Args:
        extracted_info_with_citations (dict): Extracted information with citations
    """
    for category, data in extracted_info_with_citations.items():
        content = data["content"]
        citations = data["citations"]

        if not content or content.strip() == "":
            data["summary"] = "No information found"
        else:
            # Generate a simple summary
            sentences = content.split('.')
            if len(sentences) <= 2:
                data["summary"] = content[:200] + "..." if len(content) > 200 else content
            else:
                data["summary"] = sentences[0] + "."

        # Add citation count
        data["citation_count"] = len(citations)
        data["documents_referenced"] = list(set([c["document_name"] for c in citations])) 

def save_extracted_info(bid_dir, extracted_info):
    """
    Save extracted information to a file
    
    Args:
        bid_dir (str): Directory to save the file to
        extracted_info (dict): Extracted information to save
        
    Returns:
        tuple: (path to the saved JSON file, eligibility criteria text)
    """
    output_path = os.path.join(bid_dir, "tender_analysis")
    os.makedirs(output_path, exist_ok=True) # Create the directory. If the target directory already exists, do not raise an exception.
    output_text_file = os.path.join(output_path, "tender_analysis.txt")
    output_json_file = os.path.join(output_path, "tender_analysis.json")

    # Write to a JSON file
    with open(output_json_file, "w") as file:
        json.dump(extracted_info, file, indent=4)
    
    # Write to a TXT file
    with open(output_text_file, 'w', encoding='utf-8') as f:
        f.write("TENDER ANALYSIS REPORT\n")
        f.write("=" * 50 + "\n\n")
        
        eligibility = ""
        for category, info in extracted_info.items():
            # Each entry is a dict with 'content', 'citations', 'summary', etc.
            content = (info.get('content', '') if isinstance(info, dict) else str(info)).strip()
            f.write(f"{category}\n")
            f.write("-" * len(category) + "\n")
            f.write(content or "Not found in the documents")
            f.write("\n\n" + "=" * 50 + "\n\n")
            if "Eligibility" in category:
                eligibility = content

    logger.info(f"Saved extracted information to {output_text_file}")
    return output_json_file, eligibility

def get_company_info():
    """
    Get company information from Google Docs
    
    Returns:
        str: Company information text
    """
    # In a real implementation, this would use the Google Docs API
    # For now, we'll simulate this with a placeholder
    logger.info(f"Getting company information from {COMPANY_INFO_DOC}")
    
    # This is a placeholder - in a real implementation, you would:
    # 1. Authenticate with Google
    # 2. Use the Docs API to get the document content
    # 3. Parse and return the content
    
    # For demo purposes, let's return a sample company info
    return """
    Yugasa Company Information for Government tendering:

Company Name: Yugasa Software Labs Pvt Ltd
Office addresses: 
Gurgaon Address: Yugasa Software Labs, 3rd floor, Tower B, Unitech Cyber Park, Sector 39, Gurgaon 122001, Haryana
Lucknow Address: Yugasa Software Labs, 3rd floor, TC-14, Vibhuti Khand, Gomti Nagar, Lucknow, Uttar Pradesh 226010
US Address: Yugasa Software LLC, 370 Campus Drive, Somerset, New Jersey 08873

Company registration:
Yugasa Software Labs Pvt Ltd is a legal entity in India registered under Indian Companies Act, 2013. Registered as Private Limited Company with Registrar of Companies, Delhi.
The CIN of the company is U72900HR2015PTC056837

Company website: www.yugasa.com
Company Phone: +918800522257
Company Email: contact@yugasa.com
Contact Person: Dharmesh Jaggi
Person Authorized to sign Bid Documents: Dharmesh Jaggi

PAN of Yugasa: AAACY7582J

Certifications:
CMMI 3
ISO 27001:2022
ISO 9001:2015

Valid GST registration. GST Number of Yugasa: 06AAACY7582J1ZU

Yugasa is the official Meta Business Partner as ISV solution provider for WhatsApp.

Turnover of previous years:

2024-25: INR 3.52 Crores
2023-24: INR 3.29 Crores
2022-23: INR 3.19 Crores
2021-22: INR 3.35 Crores
2020-21: INR 2.18 Crores

Yugasa software Labs Pvt Ltd is not barred or blacklisted by any PSU, government department, or private sector entity. 

Yugasa software labs pvt ltd is an MSME and registered Startup

Manpower on Yugasa’s payroll:
Currently Yugasa has 40 employees on its payroll.

Some previously done projects of Yugasa:

Project 1
Client Name: Narayana Hospitals
Project Title: Development and Implementation of WhatsApp Business API Solution and AI-enabled Chatbot for support automation
Project Scope:
- Integration of WhatsApp Business API with NH's existing systems
- Development of AI-enabled chatbot for patient interaction and support management
- Multi-language support including English and regional languages
- Real-time response and query resolution for patients
- Continuous support and maintenance of the chatbot system
Project Value: The total value of the project till date is INR 30,35,605, and the project is ongoing.
Project Duration: September 30th, 2022 to Present (Ongoing)

Project 2
Client Name: NSC Guwahati
NSC Guwahati is a Ministry of Culture organization 
Project Title: Cashless ticket booking chatbot on WhatsApp.
Project Value: INR 14 lakhs 

    """

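# A minimal alternative sketch: if the company profile is kept as a local text export
# rather than a Google Doc, it can simply be read from disk. COMPANY_INFO_DOC above is
# a placeholder path, so this helper falls back to the built-in sample when the file
# cannot be read.
def get_company_info_from_file(info_path=COMPANY_INFO_DOC):
    """Load company information from a local text file, falling back to the built-in sample."""
    try:
        with open(info_path, "r", encoding="utf-8") as f:
            return f.read()
    except OSError:
        logger.warning(f"Could not read company info from {info_path}; using built-in sample")
        return get_company_info()
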
def check_eligibility(extracted_info, company_info, process_logger=None):
    """
    Check if the company is eligible for the bid

    Args:
        extracted_info (dict): Extracted tender information
        company_info (str): Company information
        process_logger: Optional process logger for detailed logging

    Returns:
        tuple: (is_eligible, full assessment text from the LLM)
    """
    # Helper function to log with both loggers
    def log_info(msg):
        logger.info(msg)
        if process_logger:
            process_logger.info(msg)

    def log_error(msg):
        logger.error(msg)
        if process_logger:
            process_logger.error(msg)

    log_info("Starting eligibility check...")

    # Prepare prompt for Claude to assess eligibility
    eligibility_criteria = extracted_info.get("Eligibility/Qualification Criteria or conditions for bidder", {}).get('content', '')
    exemptions = extracted_info.get("Relaxation or preference given to any kind of company or bidder", {}).get('content', '')

    log_info(f"Eligibility criteria length: {len(eligibility_criteria)} characters")
    log_info(f"Exemptions length: {len(exemptions)} characters")

    prompt = f"""
    You need to determine if the company is eligible to apply for a tender based on the eligibility criteria, exemptions to some special type of companies (if any), and company information.

    Eligibility Criteria:
    {eligibility_criteria}

    Exemptions:
    {exemptions}

    Company Information:
    {company_info}

    Please analyze if the company meets all the eligibility criteria. Return your answer in the following format:

    Eligible: [Yes/No]
    Reason: [Detailed explanation of why the company is eligible or not]
    Missing Requirements: [List any requirements the company doesn't meet, if applicable]
    """

    try:
        log_info(f"Calling LLM ({llm_model}) for eligibility assessment...")

        if llm_model == 'gemini':
            response = gemini_client.models.generate_content(
                model="gemini-2.5-flash-preview-04-17", #"gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(
                    system_instruction="You are an expert in tender eligibility assessment. Be thorough and accurate in your analysis.",
                    max_output_tokens=2000,
                    temperature=0.1
                )
            )
            response_text = response.text

        elif llm_model == 'claude':
            # Call Claude API
            response = claude_client.messages.create(
                model=claude_model,
                max_tokens=2000,
                temperature=0,
                system="You are an expert in tender eligibility assessment. Be thorough and accurate in your analysis.",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            # Parse response
            response_text = response.content[0].text

        elif llm_model == 'open_llm':
            response = openai.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are an expert in tender eligibility assessment. Be thorough and accurate in your analysis."},
                    {"role": "user", "content": prompt},
                ],
            )
            response_text = response.choices[0].message.content

        else:
            raise ValueError(f"Unsupported GENAI_ENGINE value: {llm_model}")

        log_info(f"Eligibility response received: {len(response_text)} characters")
        log_info(f"Eligibility response: {response_text}")

        # Extract eligibility decision
        is_eligible = "eligible: yes" in response_text.lower()

        # Extract reason
        reason_match = re.search(r'Reason:\s*(.*?)(?:\n\n|\n[A-Z]|$)', response_text, re.DOTALL)
        reason = reason_match.group(1).strip() if reason_match else "No detailed reason provided"

        log_info(f"Eligibility check completed. Result: {'Eligible' if is_eligible else 'Not Eligible'}")

        return is_eligible, response_text

    except Exception as e:
        log_error(f"Error checking eligibility with LLM: {str(e)}")
        return False, f"Error during eligibility check: {str(e)}"