import os
import shutil
import json
import re
import anthropic
import logging
from PyPDF2 import PdfReader, PdfWriter
from pathlib import Path, PosixPath
from datetime import datetime
from typing import List, Dict, Tuple, Any, Optional, Union
import argparse
import sys
import json
import csv
import base64

from dotenv import load_dotenv
load_dotenv()
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
ANTHROPIC_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-3-5-haiku-latest")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "Qwen/Qwen3-Next-80B-A3B-Instruct")
OPENAI_URL = os.getenv("OPENAI_URL", "https://api.deepinfra.com/v1/openai")

import google.generativeai as genai
from google.generativeai import types

from openai import OpenAI

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)  # module-level logger; also the default "process logger" below

# Configuration
# NOTE(review): hard-coded, machine-specific Windows paths — consider moving to env vars.
BASE_DIR = r"D:\Tenders"
COMPANY_REPOSITORY = r"C:\Users\Vivek\minaions\sample_company_docs"
CLAUDE_API_KEY = ANTHROPIC_API_KEY
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Selects the LLM backend used throughout this module: 'claude' or 'open_llm'.
llm_model = os.getenv("GENAI_ENGINE", "claude")

# Initialize Claude client
claude_client = anthropic.Anthropic(api_key=CLAUDE_API_KEY)
# Initialize Gemini client
# NOTE(review): "gemini" is not a concrete model id, and genai.configure() is never
# called with GEMINI_API_KEY anywhere in this file — verify before using this client.
gemini_model = genai.GenerativeModel(model_name="gemini")

claude_model_main = ANTHROPIC_MODEL            # primary Claude model (from env; haiku by default)
claude_model_lite = "claude-3-5-haiku-latest"  # cheaper model for lightweight tasks

# Create an OpenAI client with your token and endpoint
# NOTE(review): this rebinding shadows the `openai` package name imported via `from openai import OpenAI`.
openai = OpenAI(
    api_key=OPENAI_API_KEY,
    base_url=OPENAI_URL,
)

class color:
    """ANSI escape codes for colored/styled terminal output."""
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'  # resets all styling back to the terminal default

class DocumentType:
    """String constants categorising bid documents.

    The values double as subfolder names under the bid's final_docs directory
    (see prepare_bid_documents).
    """
    STANDARD = "standard"       # Company registration, certificates, etc.
    EXPERIENCE = "experience"   # Past work experience documents
    CUSTOM = "custom"           # Documents to be created specifically for the bid
    ANNEXURE = "annexure"       # Templates from the RFP to be filled
    REGENERATED = "regenerated" # Documents regenerated by the user prompt


def get_annexure_content(pdf_path, page_numbers, output_path):
    """
    Extracts text from specified pages of a PDF.

    The selected pages are also written out as a standalone PDF at
    ``output_path``. If those pages contain no extractable text (typically
    scanned images), the excerpt PDF is sent to Claude for OCR-style
    transcription instead.

    Args:
        pdf_path (str): Path to the PDF file.
        page_numbers (list): List of page numbers to extract (1-based indexing).
        output_path (str): Where the excerpt PDF of the selected pages is written.

    Returns:
        str: Concatenated text from the specified pages, or None if an error occurs.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PdfReader(pdf_file)
            num_pages = len(reader.pages)
            extracted_text = ""
            writer = PdfWriter()

            for page_num in page_numbers:
                if 1 <= page_num <= num_pages:
                    page = reader.pages[page_num - 1]  # Adjust to 0-based indexing
                    extracted_text += page.extract_text() + "\n"
                    writer.add_page(page)
                else:
                    print(f"Page number {page_num} is out of range. Skipping.")

        # Persist the selected pages; this file is also the attachment for the
        # OCR fallback below.
        with open(output_path, 'wb') as out_file:
            writer.write(out_file)
        if extracted_text:
            return extracted_text
        else:
            # No embedded text — pages are likely scanned images; ask Claude
            # to transcribe them.
            prompt = f"""
            This message contains scanned pages from a PDF document which will contain a format or annexure.
            
            For EACH PAGE, extract ALL text content visible in it along with its formatting information like tables etc.

            For preserving the tables formatting, you may use underscores '_' and pipe '|' symbols.

            NOTE that the text in the scanned pages can be in any language.
            """
            with open(output_path, 'rb') as pdf_file:
                pdf_content = pdf_file.read()

            # BUG FIX: this is a module-level function; the original code called
            # self.claude_client, which raised NameError whenever the OCR
            # fallback path was taken.
            response = claude_client.messages.create(
                model="claude-haiku-4-5",  # NOTE(review): hard-coded; consider reusing claude_model_lite
                max_tokens=4000,
                temperature=0.2,
                system="You are an expert at extracting annexure or table content from scanned documents. Extract ALL text visible on EACH page, preserving the original formatting as much as possible.",
                messages=[
                    {
                        "role": "user", 
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            },
                            {
                                "type": "document",
                                "source": {
                                    "type": "base64",
                                    "media_type": "application/pdf",
                                    "data": base64.b64encode(pdf_content).decode('utf-8')
                                }
                            }
                        ]
                    }
                ]
            )
            extracted_text = response.content[0].text
            return extracted_text
    except FileNotFoundError:
        print(f"Error: PDF file not found at {pdf_path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def document_formatting(doc_text, pdf_path):
    """Convert raw text into print-ready HTML styled after a reference PDF.

    The reference PDF is attached (base64-encoded) to a Claude request
    together with the raw text; Claude returns a complete HTML document that
    mimics the reference layout.

    Args:
        doc_text (str): Raw text content to be formatted.
        pdf_path (str): Path to the reference PDF used for styling cues.

    Returns:
        str: Generated HTML code.

    Raises:
        FileNotFoundError: If the reference PDF does not exist.
    """
    # Guard clause: fail early with a clear message if the reference is missing.
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"Reference PDF not found: {pdf_path}")

    with open(pdf_path, 'rb') as reference_file:
        reference_pdf_bytes = reference_file.read()

    # Enhanced prompt with structured formatting requirements
    prompt = f"""Analyze the attached reference format and styling of a document in the PDF and create PDF-compatible HTML for this text content:
    
    <raw_content>
    {doc_text}
    </raw_content>

    Follow these exact requirements:
    
    1. Layout Analysis:
    - Study the reference PDF's structure (headers, margins, spacing)
    - Identify font styles (sizes, families, weights) through visual patterns
    - Note paragraph spacing and indentation rules
    
    2. Formatting Rules:
    - Preserve ALL original text styling (bold/italic/underline) using semantic HTML
    - Maintain EXACT table structures from reference (columns, borders, alignment)
    - Replicate list formatting (bullet styles, numbering, indentation)
    
    3. HTML Requirements:
    - Use A4 page size: <style>@page {{ size: A4 portrait; margin: 2cm }}</style>
    - Tables: Use <table> with inline CSS for borders/padding
    - Placeholders: <span style="background-color: #ffff00; border: 1px dashed #000">fill_data_here</span>
    - Lists: Convert bullets to <ul> with 1em left margin
    - Headers: Match hierarchy (h1-h6) from reference
    
    4. Output Constraints:
    - No markdown, ONLY HTML/CSS
    - Include complete <html> document structure
    - Ensure print compatibility with PDF converters
    - Preserve white-space: pre-wrap for code blocks
    
    Return ONLY the HTML code with no additional commentary."""

    # Attach the reference PDF as base64 so Claude can inspect its layout.
    reference_attachment = {
        "type": "document",
        "source": {
            "type": "base64",
            "media_type": "application/pdf",
            "data": base64.b64encode(reference_pdf_bytes).decode('utf-8'),
        },
    }
    user_content = [{"type": "text", "text": prompt}, reference_attachment]

    response = claude_client.messages.create(
        model=claude_model_main,
        max_tokens=4000,
        temperature=0.2,
        system="You are a professional document formatting expert specializing in PDF-to-HTML conversion with pixel-perfect accuracy.",
        messages=[{"role": "user", "content": user_content}],
    )

    return response.content[0].text

def identify_required_documents(extracted_info: Dict, process_logger = None) -> List[Dict]:
    """
    Use LLM to intelligently parse and identify required documents from extracted information

    Args:
        extracted_info (dict): Dictionary with extracted tender information;
            each section is expected to be {"content": ...}.
        process_logger (logging.Logger, optional): Logger for progress messages;
            defaults to the module logger.

    Returns:
        list: List of document requirements with metadata (name/type/description).
    """
    if process_logger is None:
        process_logger = logger

    def _content(key: str) -> str:
        # Tolerate a missing key or a raw-string section. The previous code
        # did extracted_info.get(key, "")['content'], which raised TypeError
        # whenever the key was absent.
        section = extracted_info.get(key) or {}
        return section.get("content", "") if isinstance(section, dict) else section

    document_info = _content("Documents needed to submit the bid")

    if not document_info or document_info == "Not found in any document":
        process_logger.warning("No document requirements found in extracted information")
        return []

    # Get scope of work and eligibility criteria for context
    scope_of_work = _content("Scope of work of the whole project")
    eligibility_criteria = _content("Eligibility/Qualification Criteria or conditions for bidder")

    # Create prompt for LLM to analyze document requirements
    prompt = f"""
    You are an expert in government tender document preparation. I'll provide you with text 
    describing document requirements for a tender bid. Analyze this text and extract a structured list 
    of all required documents.
    
    For each document, identify:
    1. Document name (brief but descriptive)
    2. Document type (exactly one of: STANDARD, EXPERIENCE, CUSTOM)
       - STANDARD: Standard Company documents like company registrations/incorporation, certificates, accreditations, 
         PAN, GST, MSME certificate, employee details, Team CVs, Turn over & financial documents etc. that the company already has
       - EXPERIENCE: Past work experience documents like work orders, completion certificates, PO, contracts etc.
       - CUSTOM: Documents to be created specifically for this bid (declarations, proposals, cover letter etc.)
    3. Description of what the document should contain
    
    IMPORTANT EXCLUSION RULE:
    DO NOT include any document that is an Annexure, Format, Form, or Proforma specifically provided 
    in the RFP document with a proper name or number (e.g., "Annexure-I", "Format A", "Form 6", "Proforma-B").
    These annexure/format documents will be handled separately. Only extract STANDARD, EXPERIENCE, and 
    CUSTOM documents that the bidder needs to prepare or already possesses.
    
    ADDITIONAL NOTES:
    a) No two documents should be mentioned in one entry. Each document should be a separate entry in the list.
       DO NOT repeat any same document entry in the output.
    b) If a document name is 'Additional Document 1, 2, 3 or 4 (Requested in ATC)', drop it. These are 
       documents with generic description like 'Additional document format <n> as specified in ATC'. 
       These documents should not be included in the final list of documents.
    c) If a document requirement says "as per Annexure X" or "in Format Y" or "use Form Z", do NOT include 
       it in your output - these are annexure-based documents that will be handled separately.
    
    Tender context for reference:
    SCOPE OF WORK SUMMARY: {scope_of_work[:2000] if scope_of_work else "Not provided"}
    ELIGIBILITY CRITERIA SUMMARY: {eligibility_criteria[:500] if eligibility_criteria else "Not provided"}
    
    Document requirements text:
    {document_info}
    
    Return your analysis as a JSON array of objects with these keys:
    - "name": String (Document name)
    - "type": String (One of: STANDARD, EXPERIENCE, CUSTOM)
    - "description": String (Detailed description of the document)
    
    Make sure to identify every required document (excluding annexures/formats/forms), even if it's 
    mentioned in passing or in a complex format.
    """

    try:
        if llm_model == 'claude':
            # Call Claude API with extended thinking enabled
            response = claude_client.messages.create(
                model=claude_model_main,
                max_tokens=5000,
                temperature=1,
                thinking={
                    "type": "enabled",
                    "budget_tokens": 3000
                },
                system="You are an expert in analyzing tender documents. Extract the requested information accurately in JSON format only.",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # With thinking enabled, content[0] is the thinking block and
            # content[1] is the answer text.
            process_logger.info(f"\n*************************************************\n🧠 Minaions Thinking: {response.content[0].thinking}")
            response_text = response.content[1].text
        elif llm_model == 'open_llm':
            response = openai.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are an expert in analyzing tender documents. Extract the requested information accurately in JSON format only."},
                    {"role": "user", "content": prompt},
                ],
            )
            response_text = response.choices[0].message.content
        else:
            # Previously response_text was silently undefined here (NameError);
            # fail explicitly so the except-branch fallback is still taken.
            raise ValueError(f"Unsupported GENAI_ENGINE: {llm_model}")

        # Extract JSON from the response, trying progressively looser strategies.
        json_match = re.search(r'```json\s*(.*?)\s*```', response_text, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)
        else:
            # Try to find JSON without markdown code blocks
            json_match = re.search(r'\[\s*\{.*\}\s*\]', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
            else:
                # Last resort: grab the lines from the first '[' to the first
                # line ending in ']'.
                lines = response_text.split('\n')
                json_lines = []
                started = False

                for line in lines:
                    if line.strip().startswith('[') or started:
                        started = True
                        json_lines.append(line)
                        if line.strip().endswith(']'):
                            break

                if json_lines:
                    json_str = '\n'.join(json_lines)
                else:
                    json_str = response_text  # Use the entire response as a fallback

        # Clean up the JSON string for better parsing
        json_str = json_str.strip()
        if not json_str.startswith('['):
            json_str = '[' + json_str
        if not json_str.endswith(']'):
            json_str = json_str + ']'

        # Parse the JSON
        try:
            required_docs = json.loads(json_str)
        except json.JSONDecodeError:
            # If we can't parse it, try a simplified approach
            process_logger.warning("Failed to parse LLM response as JSON, trying to extract individual documents")
            # Extract document sections from the response
            required_docs = extract_documents_from_text(response_text)

        # Log the results
        process_logger.info(f"👨‍💻 MinAIons identified {len(required_docs)} required documents")
        for i, doc in enumerate(required_docs):
            process_logger.info(f"📄Document {i+1}: {doc.get('name', 'Unknown')} - Type: {doc.get('type', 'Unknown')}")
            if doc.get('type') == DocumentType.ANNEXURE and doc.get('source_document'):

                # Clean source document string down to a bare filename
                pattern = r'[\w\-_.]+\.(?:pdf|doc|docx|txt|xls|xlsx|PDF)\b'
                matches = re.findall(pattern, doc.get('source_document'), re.IGNORECASE)
                if matches:
                    doc['source_document'] = matches[0]  # Keep the first match

                process_logger.info(f"  Found in: {doc.get('source_document')}")

        return required_docs

    except Exception as e:
        process_logger.error(f"Error identifying required documents with LLM: {str(e)}")
        # Fallback to a simpler approach if LLM fails
        return simple_document_extraction(document_info)

def extract_documents_from_text(text: str) -> List[Dict]:
    """
    Helper function to extract document information from text when JSON parsing fails

    Scans the response line by line for "Document:"/"Name:", "Type:" and
    "Description:" markers and assembles one dict per document.

    Args:
        text (str): The text response from Claude

    Returns:
        list: List of document dictionaries
    """
    name_re = re.compile(r'(?:^|\s)(?:Document|Name):\s*(.+)')
    type_re = re.compile(r'(?:^|\s)Type:\s*(STANDARD|EXPERIENCE|CUSTOM)')
    desc_re = re.compile(r'(?:^|\s)Description:\s*(.+)')

    parsed: List[Dict] = []
    pending: Dict = {}

    for raw_line in text.split('\n'):
        stripped = raw_line.strip()

        # A new name marker starts a new document and flushes the previous one.
        match = name_re.search(stripped)
        if match:
            if pending and 'name' in pending:
                parsed.append(pending)
            pending = {'name': match.group(1)}
            continue

        match = type_re.search(stripped)
        if match and pending:
            pending['type'] = match.group(1)
            continue

        match = desc_re.search(stripped)
        if match and pending:
            pending['description'] = match.group(1)

    # Flush the trailing document, if any.
    if pending and 'name' in pending:
        parsed.append(pending)

    return parsed

def simple_document_extraction(document_info: str) -> List[Dict]:
    """
    Simple fallback method to extract document requirements when LLM fails

    Treats every bulleted/numbered line as one required document and guesses
    its category from keywords in the name.

    Args:
        document_info (str): Document requirements text

    Returns:
        list: List of basic document dictionaries
    """
    bullet_re = re.compile(r'^(\d+\.|\-|\*|\•|\–) ')
    standard_terms = (
        "certificate", "registration", "pan", "gst", "msme", "iso", "cmmi",
        "balance sheet", "income tax return", "audit report",
    )
    experience_terms = ("work order", "completion certificate", "experience", "portfolio")

    required_docs: List[Dict] = []
    for raw in document_info.split('\n'):
        entry = raw.strip()
        # Only bulleted/numbered lines count as document entries.
        if not entry or not bullet_re.match(entry):
            continue

        doc_name = bullet_re.sub('', entry)
        lowered = doc_name.lower()

        # Best-guess category from keywords; CUSTOM is the default.
        if any(term in lowered for term in standard_terms):
            doc_type = DocumentType.STANDARD
        elif any(term in lowered for term in experience_terms):
            doc_type = DocumentType.EXPERIENCE
        else:
            doc_type = DocumentType.CUSTOM

        required_docs.append({
            "name": doc_name,
            "type": doc_type,
            "description": doc_name,
            "source_document": None
        })

    return required_docs

def get_available_company_documents(company_docs_dir) -> Dict:
    """
    Get a list of available company documents in the repository

    Scans the "Standard_Documents" and "Experience_Documents" subfolders of
    the repository for supported file types.

    Returns:
        dict: Dictionary mapping document types to lists of available documents,
        each entry holding "name", "path" and "description".
    """
    supported_exts = ('.pdf', '.docx', '.jpg', '.png', '.xlsx', '.csv')

    def scan_folder(folder: str) -> List[Dict]:
        # Collect metadata for every supported file in one repository subfolder.
        if not os.path.exists(folder):
            return []
        entries = []
        for file in os.listdir(folder):
            if file.endswith(supported_exts):
                full_path = os.path.join(folder, file)
                entries.append({
                    "name": file,
                    "path": full_path,
                    "description": get_document_description(full_path)
                })
        return entries

    company_docs = {
        DocumentType.STANDARD: scan_folder(os.path.join(company_docs_dir, "Standard_Documents")),
        DocumentType.EXPERIENCE: scan_folder(os.path.join(company_docs_dir, "Experience_Documents")),
    }

    logger.info(f"Found {len(company_docs[DocumentType.STANDARD])} standard documents and "
                f"{len(company_docs[DocumentType.EXPERIENCE])} experience documents")

    return company_docs

def get_document_description(file_path: str) -> str:
    """
    Get description for a document based on filename or content

    A sidecar "<file>.desc" text file takes precedence; otherwise the
    description is derived from the filename itself.

    Args:
        file_path (str): Path to the document

    Returns:
        str: Document description
    """
    # Prefer an explicit sidecar description file when one exists.
    sidecar = f"{file_path}.desc"
    if os.path.exists(sidecar):
        with open(sidecar, 'r', encoding='utf-8') as handle:
            return handle.read().strip()

    # No sidecar: turn "Some_File-Name.pdf" into "Some File Name".
    stem, _ext = os.path.splitext(os.path.basename(file_path))
    return stem.replace('_', ' ').replace('-', ' ')

def write_list_of_docs_to_csv(data, filename):
    """Writes a list of document dictionaries to a CSV file.

    The header is a fixed superset of keys; rows missing some keys get empty
    cells, and keys outside the header are ignored.

    Args:
        data: A list of dictionaries (at minimum 'name'/'type'/'description').
        filename: The name of the CSV file to write to.
    """
    if not data:
        print("❌ No required documents found in the RFP details.")
        return

    print(f"\n📄 {len(data)} documents identified to be submitted for this bid.")

    fieldnames = ['name', 'type', 'source_document', 'description', 'pages', 'isSelected']

    # encoding fixed to UTF-8 (document names may be non-ASCII);
    # extrasaction='ignore' prevents ValueError when rows carry extra keys.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(data)

def prepare_bid_documents(bid_dir: str, company_docs_dir: str, company_info: str, required_docs: List[Dict[str, Any]], process_logger = None) -> Union[str, Tuple[str, Dict, List]]:
    """
    Main function to prepare all bid documents

    Args:
        bid_dir (str): Path to the bid directory
        company_docs_dir (str): Root of the company document repository
        company_info (str): Company profile text used for generated documents
        required_docs (list): Pre-identified document requirements
        process_logger (logging.Logger, optional): Logger for progress messages

    Returns:
        tuple: (final_docs_dir, document_status, referenced_company_docs) on
        success. NOTE: when the tender has not been analysed yet, the bare
        placeholder string "/final/docs/directory" is returned instead —
        callers that unpack three values must handle this case.
    """
    if process_logger is None:
        process_logger = logger

    extracted_info_path = os.path.join(bid_dir, "tender_analysis")
    extracted_info_file = os.path.join(extracted_info_path, "tender_analysis.json")

    if os.path.exists(extracted_info_file):
        with open(extracted_info_file, 'r', encoding='utf-8') as f:
            extracted_info = json.load(f)
    else:
        print("❌ Looks like this RFP has not been analysed by Minaions before.")
        print("Please check the RFP directory path or analyze the RFP through Minaions first.")
        return "/final/docs/directory"

    # Get scope of work for context. Guard against a missing key or raw-string
    # section: the previous code indexed ['content'] on the "" default, which
    # raised TypeError whenever the key was absent.
    section = extracted_info.get("Scope of work of the whole project") or {}
    scope_of_work = section.get("content", "") if isinstance(section, dict) else section

    # Create final documents directory
    final_docs_dir = os.path.join(bid_dir, "final_docs")
    os.makedirs(final_docs_dir, exist_ok=True)

    # Create subdirectories for document types
    for doc_type in [DocumentType.STANDARD, DocumentType.EXPERIENCE, DocumentType.CUSTOM, DocumentType.ANNEXURE]:
        os.makedirs(os.path.join(final_docs_dir, doc_type), exist_ok=True)

    # Step 1: Persist the required-document list passed in by the caller
    required_documents = required_docs
    write_list_of_docs_to_csv(required_documents, os.path.join(extracted_info_path, "required_documents.csv"))

    # Step 2: Get available company documents
    available_documents = get_available_company_documents(company_docs_dir)

    process_logger.info(f"Available company documents are: \n{available_documents}")

    # Step 3: Match and prepare documents
    document_status = {
        "prepared": [],
        "missing": [],
        "index": []
    }

    # Track referenced company documents (not copied, just referenced)
    referenced_company_docs = []

    # Process each required document, dispatching on its declared type
    for req_doc in required_documents:
        document_path = None
        referenced_doc_info = None

        print("Required Document:", req_doc)
        if req_doc["type"] == 'STANDARD':
            document_path, referenced_doc_info = process_standard_document(req_doc, available_documents, final_docs_dir, process_logger)
        elif req_doc["type"] == 'EXPERIENCE':
            document_path, referenced_doc_info = process_experience_document(req_doc, available_documents, final_docs_dir, scope_of_work, process_logger)
        elif req_doc["type"] == 'CUSTOM':
            document_path = generate_custom_document(req_doc, extracted_info, company_info, scope_of_work, final_docs_dir, process_logger)
        elif req_doc["type"] == 'ANNEXURE':
            document_path = process_annexure_format(req_doc, extracted_info, company_info, final_docs_dir, bid_dir, process_logger)

        # Track referenced company documents
        if referenced_doc_info:
            referenced_company_docs.extend(referenced_doc_info)

        # Update status: either a prepared file or at least one reference counts
        if document_path or referenced_doc_info:
            rel_path = os.path.relpath(document_path, final_docs_dir) if document_path else ""
            document_status["prepared"].append({
                "name": req_doc["name"],
                "type": req_doc["type"],
                "path": rel_path
            })
            document_status["index"].append({
                "name": req_doc["name"],
                "type": req_doc["type"],
                "path": rel_path,
                "status": "Prepared"
            })
        else:
            document_status["missing"].append(req_doc["name"])
            document_status["index"].append({
                "name": req_doc["name"],
                "type": req_doc["type"],
                "path": "",
                "status": "Missing"
            })

    # Log summary
    process_logger.info(f"📚 Bid documents preparation completed")
    process_logger.info(f"📃 Prepared documents: {len(document_status['prepared'])}")
    process_logger.info(f"📎 Referenced company documents: {len(referenced_company_docs)}")
    process_logger.info(f"⚠️ Missing documents: {len(document_status['missing'])}")
    if document_status["missing"]:
        process_logger.warning(f"🔺 Missing documents: {', '.join(document_status['missing'])}")

    return final_docs_dir, document_status, referenced_company_docs


def process_standard_document(req_doc: Dict, available_documents: Dict, final_docs_dir: str, process_logger = None) -> Tuple[Optional[str], Optional[List[Dict]]]:
    """
    Process standard company document requirement

    Uses the configured LLM to match the requirement against the company's
    standard document repository. Matching documents are referenced (not
    copied into final_docs_dir).

    Args:
        req_doc (dict): Required document information with 'name' and 'description'
        available_documents (dict): Dictionary of available company documents with 'standard' key
        final_docs_dir (str): Directory for final documents (currently unused; kept for a uniform processor signature)
        process_logger (logging.Logger, optional): Logger for progress messages

    Returns:
        tuple: (None, list of referenced document dicts) when matches are found,
        otherwise (None, None).
    """
    if process_logger is None:
        process_logger = logger
    process_logger.info(f"🕵️ Processing standard document: {req_doc['name']}")

    # 'standard' is the same string as DocumentType.STANDARD, the key used
    # when the repository is scanned.
    standard_docs = available_documents.get('standard', [])

    if not standard_docs:
        process_logger.warning("No standard documents available")
        return None, None

    # Create prompt for the LLM to find relevant standard documents
    prompt = f"""
    You are an expert in government tender bidding and document matching. I need to find the most relevant 
    company standard documents that match the following requirement:
    
    REQUIRED DOCUMENT:
    Name: {req_doc['name']}
    Description: {req_doc['description']}
    
    Here are the available standard company documents (number, filename, and description):
    """

    # Add document descriptions (limited to avoid token limit)
    for i, doc in enumerate(standard_docs[:100]):
        prompt += f"\n{i+1}. {doc['name']}: {doc['description']}"

    prompt += """
    
    Please identify ALL standard documents that are relevant to the required document. 
    Consider document types, certifications, registrations, and any keywords that indicate relevance.
    Return only the numbers of the relevant documents separated by commas, in order of relevance.
    If no documents are relevant, return "NONE".
    
    For example: "3,7,1" or "5" or "NONE"
    NOTE: PLEASE DO NOT RESPOND WITH ANY ADDITIONAL COMMENTARY OR INFORMATION OTHER THAN THE NUMBERS OF RELEVANT DOCUMENTS SEPARATED BY COMMAS.
    """

    try:
        if llm_model == 'claude':
            response = claude_client.messages.create(
                model=claude_model_main,
                max_tokens=1000,
                temperature=0,
                system="You are an expert in analyzing tender documents and matching company standard documents to requirements.",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            response_text = response.content[0].text.strip()
        elif llm_model == 'open_llm':
            response = openai.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are an expert in analyzing tender documents and matching company standard documents to requirements."},
                    {"role": "user", "content": prompt},
                ],
            )

            response_text = response.choices[0].message.content.strip()
        else:
            # Previously response_text was silently undefined here (NameError);
            # fail explicitly so the except-branch below handles it uniformly.
            raise ValueError(f"Unsupported GENAI_ENGINE: {llm_model}")
        process_logger.info(f"Minaions response for standard document matching: {response_text}")

        # Handle "NONE" response
        if response_text.upper() == "NONE":
            process_logger.info(f"No relevant standard documents found for: {req_doc['name']}")
            return None, None

        # Extract document numbers - look for numbers separated by commas
        rankings = re.findall(r'\d+', response_text)

        # Map 1-based ranks back to repository entries, dropping out-of-range ones
        relevant_docs = []
        for rank in rankings:
            index = int(rank) - 1
            if 0 <= index < len(standard_docs):
                relevant_docs.append(standard_docs[index])
                process_logger.info(f"Found relevant document: {standard_docs[index]['name']}")

        if relevant_docs:
            process_logger.info(f"Found {len(relevant_docs)} relevant standard document(s) for: {req_doc['name']}")

            # Instead of copying, return reference information for all relevant docs
            referenced_docs = []
            for doc in relevant_docs:
                referenced_docs.append({
                    "name": os.path.basename(doc["path"]),
                    "local_path": doc["path"],
                    "type": DocumentType.STANDARD,
                    "category": DocumentType.STANDARD,
                    "required_doc_name": req_doc['name'],
                    "description": doc.get("description", "")
                })
                process_logger.info(f"Referencing standard document: {os.path.basename(doc['path'])}")

            return None, referenced_docs
        else:
            process_logger.warning(f"No valid standard documents found for: {req_doc['name']}")

        return None, None

    except Exception as e:
        process_logger.error(f"Error finding relevant standard documents: {str(e)}")
        return None, None


def process_experience_document(req_doc: Dict, available_documents: Dict, final_docs_dir: str, scope_of_work: str, process_logger = None) -> Tuple[None, Optional[List[Dict]]]:
    """
    Process an experience-document requirement by referencing matching company docs.

    Tries a scope-of-work-aware LLM ranking first, then falls back to a simple
    term-overlap match. Documents are referenced in place, never copied.

    Args:
        req_doc (dict): Required document information (expects 'name').
        available_documents (dict): Company documents keyed by DocumentType.
        final_docs_dir (str): Directory for final documents (kept for interface
            parity with sibling processors; nothing is written here).
        scope_of_work (str): Scope of work used as ranking context.
        process_logger: Optional logger; falls back to the module logger.

    Returns:
        tuple: (None, list of reference dicts) when matches are found,
               (None, None) otherwise.  NOTE: the previous `-> str` annotation
               did not match the actual tuple returns.
    """
    if process_logger is None:
        process_logger = logger

    def _reference(doc: Dict) -> Dict:
        # Build the reference entry for one matched experience document.
        return {
            "name": os.path.basename(doc["path"]),
            "local_path": doc["path"],
            "type": DocumentType.EXPERIENCE,
            "category": DocumentType.EXPERIENCE,
            "required_doc_name": req_doc['name'],
            "description": doc.get("description", "")
        }

    process_logger.info(f"🕵️ Processing experience document: {req_doc['name']}")

    # Use scope of work to find relevant experience documents
    if scope_of_work:
        relevant_docs = select_relevant_experience_documents(
            req_doc,
            available_documents[DocumentType.EXPERIENCE],
            scope_of_work
        )

        if relevant_docs:
            # Instead of copying, return reference information for all relevant docs
            referenced_docs = [_reference(doc) for doc in relevant_docs]
            for ref in referenced_docs:
                process_logger.info(f"Referencing experience document: {ref['name']}")
            return None, referenced_docs

    # Fallback: find best match without considering scope
    best_match = find_best_document_match(req_doc, available_documents[DocumentType.EXPERIENCE])

    if best_match:
        process_logger.info(f"Referencing experience document: {os.path.basename(best_match['path'])}")
        return None, [_reference(best_match)]

    process_logger.warning(f"No matching experience document found for: {req_doc['name']}")
    return None, None

def find_best_document_match(req_doc: Dict, available_docs: List[Dict]) -> Optional[Dict]:
    """
    Find the best matching document by simple term overlap.

    Args:
        req_doc (dict): Required document info; 'name' is tokenized for matching.
        available_docs (list): Candidate documents with 'name' and an optional
            'description' key.

    Returns:
        dict | None: Best match when at least 30% of the required terms overlap
        with the candidate's name/description terms, else None.
    """
    if not available_docs:
        return None

    req_terms = set(req_doc["name"].lower().split())

    best_match = None
    best_score = 0.0

    for doc in available_docs:
        # Score = fraction of required terms present in the doc's name/description.
        # Bug fix: 'description' is optional elsewhere in this module, so use
        # .get() instead of doc["description"] which raised KeyError.
        doc_terms = set(doc["name"].lower().split()) | set(doc.get("description", "").lower().split())
        score = len(req_terms & doc_terms) / max(len(req_terms), 1)

        if score > best_score:
            best_score = score
            best_match = doc

    # Only return if the match is good enough (>= 30% term overlap).
    return best_match if best_score >= 0.3 else None

def select_relevant_experience_documents(req_doc: Dict, experience_docs: List[Dict], scope_of_work: str) -> List[Dict]:
    """
    Select relevant experience documents based on scope of work.

    Asks the configured LLM engine to rank (up to 20 of) the available
    experience documents against the tender's scope of work.

    Args:
        req_doc (dict): Required document information ('name', 'description').
        experience_docs (list): List of available experience documents.
        scope_of_work (str): Scope of work for context.

    Returns:
        list: Ranked, de-duplicated list of relevant experience documents.
              Empty when inputs are missing, the engine is unsupported, or
              the LLM call fails.
    """
    if not experience_docs or not scope_of_work:
        return []

    # Create prompt for Claude to rank experience documents
    prompt = f"""
    You are an expert in government tender bidding. I need to select the most relevant past experience 
    documents for a tender with the following scope of work:
    
    SCOPE OF WORK:
    {scope_of_work[:2000]}  # Limit scope to avoid token limit issues
    
    The tender requires: {req_doc['name']} - {req_doc['description']}
    
    Here are the available experience documents (filename and description):
    """
    
    # Add document descriptions (limited to 20 to avoid token limit)
    for i, doc in enumerate(experience_docs[:20]):
        prompt += f"\n{i+1}. {doc['name']}: {doc['description']}"
    
    prompt += """
    
    Please rank the top 3 most relevant experience documents for this tender requirement, 
    considering relevance to the scope of work. Return only the numbers of the documents in order 
    of relevance, separated by commas. For example: "5,12,3"
    """
    
    try:
        if llm_model == 'claude':
            response = claude_client.messages.create(
                model=claude_model_main,
                max_tokens=1000,
                temperature=0,
                system="You are an expert in analyzing tender documents and selecting relevant experience documents.",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            response_text = response.content[0].text.strip()
        elif llm_model == 'open_llm':
            response = openai.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are an expert in analyzing tender documents and selecting relevant experience documents."},
                    {"role": "user", "content": prompt},
                ],
            )
            response_text = response.choices[0].message.content.strip()
        else:
            # Bug fix: previously an unsupported engine left response_text
            # undefined and the resulting NameError was swallowed below.
            logger.error(f"Unsupported LLM engine for experience ranking: {llm_model}")
            return []

        # Extract rankings - look for numbers separated by commas
        rankings = re.findall(r'\d+', response_text)

        # Map 1-based ranks to documents, skipping out-of-range or repeated picks.
        # (re.findall with \d+ guarantees int() cannot fail here.)
        ranked_docs = []
        seen_indices = set()
        for rank in rankings:
            index = int(rank) - 1
            if 0 <= index < len(experience_docs) and index not in seen_indices:
                seen_indices.add(index)
                ranked_docs.append(experience_docs[index])

        return ranked_docs

    except Exception as e:
        logger.error(f"Error selecting relevant experience documents: {str(e)}")
        return []

def generate_custom_document(req_doc: Dict, extracted_info: Dict, company_info: str, scope_of_work: str, final_docs_dir: str, process_logger = None) -> Optional[str]:
    """
    Generate a custom tender document with the configured LLM and save it as HTML.

    Args:
        req_doc (dict): Required document information ('name', 'description').
        extracted_info (dict): Extracted tender information sections; each value
            is expected to be a dict carrying a 'content' key.
        company_info (str): Bidder company details used to fill the document.
        scope_of_work (str): Scope of work for context.
        final_docs_dir (str): Directory for final documents.
        process_logger: Optional logger; falls back to the module logger.

    Returns:
        str | None: Path to the generated HTML document, or None on failure.
    """
    if process_logger is None:
        process_logger = logger

    process_logger.info(f"🤖 Generating custom document: {req_doc['name']}")

    # Get relevant sections from extracted info.
    # Bug fix: the old `extracted_info.get(key, "")['content']` indexed into the
    # "" default (TypeError) whenever a section was missing.
    eligibility = (extracted_info.get("Eligibility/Qualification Criteria or conditions for bidder") or {}).get('content', '')
    payment_terms = (extracted_info.get("Payment terms") or {}).get('content', '')

    # Create prompt for Claude to generate document
    prompt = f"""
    You are an expert in government tender document preparation. I need you to create a {req_doc['name']} 
    for a tender bid. The document should be professional, complete, and follow standard formats.
    
    Document Required: {req_doc['name']}
    Description: {req_doc['description']}
    
    Relevant Tender Information:
    
    SCOPE OF WORK:
    {scope_of_work[:1500]}
    
    ELIGIBILITY CRITERIA:
    {eligibility[:1500]}
    
    PAYMENT TERMS:
    {payment_terms[:500]}
    
    Bidder Company Details:
    {company_info}
    
    Please generate the complete document text in a professional format. Include:
    - Appropriate header with company letterhead elements
    - Date and reference number
    - Professional salutation and closure
    - All necessary declarations or statements
    - Any legal language typically required for such a document
    - Proper formatting with bold, line breaks, paragraphs and tabs etc using HTML tags.
    
    
    The document should be ready to print and sign without further modifications. 

    PLEASE NOTE that:
    1. The output should be in portrait A4 size PDF compatible HTML ONLY.
    2. For tables in the output response, please use HTML table tags.
    3. For any bullet points or serial points, please put them in new lines instead of putting them in one blob of text.
    4. DO NOT make up or create imaginary details for Company's critical information like employee details, designations, previous projects etc, unless provided explicitly in 'Bidder Company Details'. 
    """


    try:
        if llm_model == 'claude':
            response = claude_client.messages.create(
                model=claude_model_main,
                max_tokens=5000,
                temperature=1,
                thinking={
                    "type": "enabled",
                    "budget_tokens": 3000
                },
                system="You are an expert in preparing professional tender documents.",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # With extended thinking enabled, content[0] is the thinking block
            # and content[1] carries the actual text.
            process_logger.info(f"\n*************************************************\n🧠 Minaions Thinking: {response.content[0].thinking}")
            document_text = response.content[1].text
        elif llm_model == 'open_llm':
            response = openai.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are an expert in preparing professional tender documents."},
                    {"role": "user", "content": prompt},
                ],
            )
            document_text = response.choices[0].message.content
        else:
            # Bug fix: previously an unsupported engine left document_text
            # undefined and the NameError was swallowed by the except below.
            process_logger.error(f"Unsupported LLM engine for document generation: {llm_model}")
            return None

        # Create document file
        dest_dir = os.path.join(final_docs_dir, DocumentType.CUSTOM)
        # Bug fix: the destination directory was never created before writing.
        os.makedirs(dest_dir, exist_ok=True)
        safe_name = re.sub(r'[^\w\s-]', '', req_doc['name']).strip().replace(' ', '_')
        dest_path = os.path.join(dest_dir, f"{safe_name}.html")

        # Remove all ```html and closing ``` (case-insensitive for html)
        document_text = re.sub(r'```html', '', document_text, flags=re.IGNORECASE)
        document_text = re.sub(r'```', '', document_text)

        with open(dest_path, 'w', encoding='utf-8') as f:
            f.write(document_text)

        process_logger.info(f"Generated custom document: {os.path.basename(dest_path)}")
        return dest_path

    except Exception as e:
        process_logger.error(f"Error generating custom document: {str(e)}")
        return None

def count_words(text):
    """
    Count the whitespace-separated words in *text*.

    None and empty/whitespace-only strings count as zero words.
    """
    if not text:
        return 0
    # str.split() with no separator collapses runs of whitespace and drops
    # empty tokens, so no explicit cleanup pass is needed.
    return len(text.split())

def regenerate_document(org_doc_file_path: str, user_prompt: str, company_info: str, final_docs_dir: str, extracted_info: Dict) -> Union[Tuple[Optional[str], float], str]:
    """
    Regenerate a document using the configured LLM for bid submission purposes.

    Args:
        org_doc_file_path (str): Path to the original document file.
        user_prompt (str): User's instructions on what needs to be changed.
        company_info (str): Bidder company's information and details.
        final_docs_dir (str): Directory to save the final document.
        extracted_info (dict): Extracted tender information sections; each value
            is expected to be a dict carrying a 'content' key.

    Returns:
        (path, cost) tuple on success, (None, 0) when the LLM call fails or the
        engine is unsupported, or an error-message string for file I/O problems
        (legacy contract preserved for existing callers).
    """
    print(f"Regenerating the requested document: {org_doc_file_path}")

    try:
        with open(org_doc_file_path, 'r', encoding='utf-8') as file:
            original_document = file.read()
    except FileNotFoundError:
        return f"Error: Original document file not found at {org_doc_file_path}"
    except Exception as e:
        return f"Error reading original document: {str(e)}"

    # Context sections for the prompt.
    # Bug fix: `extracted_info.get(key, "")['content']` indexed into the ""
    # default (TypeError) whenever a section was missing.  (The previously
    # fetched "Documents needed to submit the bid" value was never used.)
    scope_of_work = (extracted_info.get("Scope of work of the whole project") or {}).get('content', '')
    eligibility_criteria = (extracted_info.get("Eligibility/Qualification Criteria or conditions for bidder") or {}).get('content', '')

    # Estimate regeneration cost from the total word volume sent to the LLM.
    base_cost_per_word = 0.005  # INR per word
    total_words = sum(count_words(part) for part in (
        original_document, user_prompt, company_info, scope_of_work, eligibility_criteria))
    regen_cost = round(2 * total_words * base_cost_per_word, 2)

    # Construct the comprehensive prompt
    system_prompt = """You are an expert document writer specializing in creating professional bid submission documents for tenders and RFPs. Your task is to regenerate, recreate, or rephrase documents based on user requirements while maintaining professionalism and accuracy."""

    prompt = f"""
Please regenerate the following document based on the user's requirements and provided information:

**ORIGINAL DOCUMENT:**
{original_document}

**USER'S REQUIREMENTS FOR CHANGES:**
{user_prompt}

**BIDDER COMPANY INFORMATION:**
{company_info}

**SCOPE OF WORK FROM RFP:**
{scope_of_work}

**ELIGIBILITY CRITERIA:**
{eligibility_criteria}

**IMPORTANT FORMATTING AND CONTENT REQUIREMENTS:**
1. The output should be in portrait A4 size PDF compatible HTML ONLY.
2. For tables in the output response, please use HTML table tags with proper styling.
3. For any bullet points or serial points, please put them in new lines instead of putting them in one blob of text.
4. DO NOT make up or create imaginary details for Company's critical information like employee details, designations, previous projects etc, unless provided explicitly in 'Bidder Company Details'.
5. Maintain professional tone and format suitable for bid submission, ensuring that the user's requirements for changes are properly addressed.
6. Ensure the document addresses relevant aspects from the scope of work and eligibility criteria where applicable.
7. Use proper HTML structure with appropriate CSS styling for A4 portrait format.

Please provide the complete regenerated document in HTML format that can be converted to PDF while maintaining A4 portrait layout.
"""

    try:
        if llm_model == 'gemini':
            # Bug fix: this branch previously stored the output in
            # `document_text`, leaving `generated_content` undefined further
            # down (NameError).  The call also passed `model=`/`config=`
            # kwargs that google.generativeai's GenerativeModel.generate_content
            # does not accept — TODO confirm against installed SDK version.
            response = gemini_model.generate_content(
                [prompt],
                generation_config=genai.GenerationConfig(
                    max_output_tokens=4000,
                    temperature=0.1
                )
            )
            generated_content = response.text
        elif llm_model == 'claude':
            response = claude_client.messages.create(
                model=claude_model_main,
                max_tokens=4000,
                temperature=0.1,
                system=system_prompt,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            generated_content = response.content[0].text
        elif llm_model == 'open_llm':
            response = openai.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt},
                ],
            )
            generated_content = response.choices[0].message.content
        else:
            # Bug fix: an unsupported engine previously fell through to an
            # uncaught NameError on `generated_content`.
            logger.error(f"Unsupported LLM engine for document regeneration: {llm_model}")
            return None, 0

    except Exception as e:
        logger.error(f"Error calling Claude API: {str(e)}")
        return None, 0

    # Generate filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    original_filename = Path(org_doc_file_path).stem
    output_filename = f"regenerated_{original_filename}_{timestamp}.html"
    dest_dir = os.path.join(final_docs_dir, DocumentType.REGENERATED)
    # Bug fix: only final_docs_dir was created before; the REGENERATED
    # subdirectory must exist for the write below to succeed.
    Path(dest_dir).mkdir(parents=True, exist_ok=True)
    output_path = os.path.join(dest_dir, output_filename)

    # Add HTML document structure if not present
    if not generated_content.strip().startswith('<!DOCTYPE html>'):
        html_template = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Regenerated Bid Document</title>
    <style>
        @media print {{
            @page {{
                size: A4 portrait;
                margin: 1in;
            }}
        }}
        body {{
            font-family: Arial, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 8.27in;
            margin: 0 auto;
            padding: 20px;
        }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
        }}
        table, th, td {{
            border: 1px solid #ddd;
        }}
        th, td {{
            padding: 12px;
            text-align: left;
        }}
        th {{
            background-color: #f2f2f2;
            font-weight: bold;
        }}
        ul, ol {{
            margin: 10px 0;
            padding-left: 30px;
        }}
        li {{
            margin: 5px 0;
        }}
        h1, h2, h3, h4, h5, h6 {{
            color: #2c3e50;
            margin-top: 30px;
            margin-bottom: 15px;
        }}
        .header {{
            text-align: center;
            margin-bottom: 30px;
        }}
    </style>
</head>
<body>
{generated_content}
</body>
</html>"""
        generated_content = html_template

    # Remove all ```html and closing ``` (case-insensitive for html)
    generated_content = re.sub(r'```html', '', generated_content, flags=re.IGNORECASE)
    generated_content = re.sub(r'```', '', generated_content)

    # Save the generated document
    try:
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(generated_content)
    except Exception as e:
        return f"Error saving re-generated document: {str(e)}"

    return output_path, regen_cost


def parse_annexure_log(log_text: str) -> Dict[str, Dict]:
    """
    Extract annexure page ranges from the annexure-scan log output.

    Recognizes lines of the form:
        ✅ ANNEXURE XIV: pages 74-75 (2 pages)
    including single-page entries such as "pages 16-16 (1 pages)".

    Args:
        log_text: Raw log text produced by the annexure scanning step.

    Returns:
        Mapping of upper-cased annexure name to a dict with keys
        "start", "end", "page_count" and "pages" (inclusive page list), e.g.
        {"ANNEXURE XIV": {"start": 74, "end": 75, "page_count": 2, "pages": [74, 75]}}
    """
    pattern = r'✅\s*(ANNEXURE\s*[\w\-]+):\s*pages\s*(\d+)-(\d+)\s*\((\d+)\s*pages?\)'

    results: Dict[str, Dict] = {}
    for m in re.finditer(pattern, log_text, re.IGNORECASE):
        name = m.group(1).strip().upper()
        first_page = int(m.group(2))
        last_page = int(m.group(3))
        results[name] = {
            "start": first_page,
            "end": last_page,
            "page_count": int(m.group(4)),
            # Inclusive list of every page in the range.
            "pages": list(range(first_page, last_page + 1)),
        }
    return results

def generate_filled_annexure(annexure_name: str, annexure_number: str, annexure_format: str, company_info: str, final_docs_dir: str, pdf_path, process_logger = None) -> Optional[str]:
    """
    Generate a filled annexure document using the exact format extracted from the RFP.

    Args:
        annexure_name (str): Name of the annexure as found in the RFP.
        annexure_number (str): Parsed annexure number, may be empty/None.
        annexure_format (str): Exact annexure format text extracted from the RFP.
        company_info (str): Bidder company details used to fill the format.
        final_docs_dir (str): Directory for final documents.
        pdf_path: Source PDF path, forwarded to document_formatting for layout.
        process_logger: Optional logger; falls back to the module logger.

    Returns:
        str | None: Path to the generated HTML document, or None on failure.
    """
    if process_logger is None:
        process_logger = logger

    # Create prompt for LLM to fill the annexure
    annexure_desc = f"Annexure {annexure_number}" if annexure_number else annexure_name

    prompt = f"""
    You are an expert in government tender document preparation. I need to fill out an annexure format
    for a bid submission. I've extracted the exact format from the tender document.

    Required Document: {annexure_desc}

    Here is the exact format from the RFP:

    {annexure_format}

    Please fill this format with appropriate information from this Company Information:
    
    {company_info}
    
    Follow EXACTLY the original layout, tables, and structure, but fill in all blank fields.
    If there are tables in the format, then please use keys like | or _ or - or tabs and spaces to represent the exact format of the table in the output.

    Again please fill in ALL fields with appropriate information for this type of tender from the company information provided above.
    Completely replace the place holder text in the format with the actual information, wherever found.
    If a field needs specific technical information not provided here, use the string '<fill_data_here>'. DO NOT make up Company's critical information like employee details, designations, previous projects etc, unless provided explicitly in Bidder Company Details.

    IMPORTANT: Maintain the EXACT formatting and layout of the original. Return ONLY the filled document.
    """

    try:
        if llm_model == 'gemini':
            # Bug fix: this call previously passed `model=`/`config=` kwargs,
            # which google.generativeai's GenerativeModel.generate_content does
            # not accept — TODO confirm against installed SDK version.
            response = gemini_model.generate_content(
                [prompt],
                generation_config=genai.GenerationConfig(
                    max_output_tokens=4000,
                    temperature=0.1
                )
            )
            filled_document = response.text

        elif llm_model == 'claude':
            response = claude_client.messages.create(
                model=claude_model_main,
                max_tokens=5000,
                temperature=1,
                thinking={
                    "type": "enabled",
                    "budget_tokens": 3000
                },
                system="You are an expert in preparing professional tender annexure documents. Fill the exact format with appropriate information.",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # With extended thinking enabled, content[0] is the thinking block
            # and content[1] carries the actual text.
            process_logger.info(f"\n*************************************************\n🧠 Minaions Thinking: {response.content[0].thinking}")
            filled_document = response.content[1].text

        elif llm_model == 'open_llm':
            response = openai.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are an expert in preparing professional tender annexure documents. Fill the exact format with appropriate information."},
                    {"role": "user", "content": prompt},
                ],
            )
            filled_document = response.choices[0].message.content
        else:
            # Bug fix: previously an unsupported engine left filled_document
            # undefined and the NameError was swallowed by the except below.
            process_logger.error(f"Unsupported LLM engine for annexure filling: {llm_model}")
            return None

        formatted_document = document_formatting(filled_document, pdf_path)

        # Remove all ```html and closing ``` (case-insensitive for html)
        formatted_document = re.sub(r'```html', '', formatted_document, flags=re.IGNORECASE)
        formatted_document = re.sub(r'```', '', formatted_document)

        # Create document file
        dest_dir = os.path.join(final_docs_dir, DocumentType.ANNEXURE)
        safe_name = re.sub(r'[^\w\s-]', '', annexure_name).strip().replace(' ', '_')
        dest_path = os.path.join(dest_dir, f"{safe_name}.html")
        os.makedirs(dest_dir, exist_ok=True)

        with open(dest_path, 'w', encoding='utf-8') as f:
            f.write(formatted_document)

        process_logger.info(f"✅ Generated filled annexure document: {os.path.basename(dest_path)}")
        return dest_path

    except Exception as e:
        process_logger.error(f"🛑 Error generating filled annexure document: {str(e)}")
        return None

def generate_annexure_fallback(annexure_name: str, file_desc: str, company_info: str, final_docs_dir: str, eligibility: str, req_docs: str, scope_of_work: str, process_logger  = None) -> Optional[str]:
    """
    Fallback generator for an annexure whose exact format could not be located.

    Asks the configured LLM to draft a standard professional format for the
    annexure from the company info and tender context, then saves it as HTML.

    Args:
        annexure_name (str): Name of the required annexure.
        file_desc (str): Description of the required annexure.
        company_info (str): Bidder company details used to fill the document.
        final_docs_dir (str): Directory for final documents.
        eligibility (str): Eligibility criteria text for context.
        req_docs (str): List/text of documents required for the bid.
        scope_of_work (str): Scope of work for context.
        process_logger: Optional logger; falls back to the module logger.

    Returns:
        str | None: Path to the generated HTML document, or None on failure.
    """
    # Bug fix: unlike its siblings, this function never defaulted
    # process_logger, so calling it without a logger crashed with
    # AttributeError on the first .info() call.
    if process_logger is None:
        process_logger = logger

    process_logger.info(f"Using fallback method to generate annexure: {annexure_name}")

    # Create prompt for Claude to create a standard format

    annexure_desc = f"Annexure {annexure_name}" + ": " + file_desc

    prompt = f"""
    You are an expert in government tender document preparation. I need to create an annexure document
    for a bid submission, but I dont have the exact format for this document.

    Required Document:

    {annexure_desc}

    Please create a standard professional format typically used for this type of annexure in government tenders.
    This should look like a authentic, professional government tender annexure.

    To create this document, you may please use the appropriate information from this Company Information:

    {company_info}

    Other related information about this tender/RFP is as following:

    Eligibility Criteria:
    {eligibility}

    Scope of Work:
    {scope_of_work}

    Documents required to be sumitted in the bid:
    {req_docs}

    The format should include:
    - Professional header with annexure title
    - All standard fields typically found in this type of annexure
    - Appropriate spaces for signatures, dates, and stamps
    - Any declarations or statements typically required
    - Proper formatting with bold, line breaks, paragraphs and tabs etc using HTML tags.

    Please create a complete and professional document ready for submission without further modifications.

    PLEASE NOTE that:
    1. The output should be in portrait A4 size PDF compatible HTML ONLY.
    2. For tables in the output response, please use HTML table tags.
    3. For any bullet points or serial points, please put them in new lines instead of putting them in one blob of text.
    4. DO NOT make up or create imaginary details for Company's critical information like employee details, designations, previous projects etc, unless provided explicitly in 'Bidder Company Information'. 
    
    """
    
    try:
        if llm_model == 'gemini':
            # Bug fix: this branch referenced the undefined name
            # `gemini_client`; use the module-level gemini_model like the
            # sibling generators (SDK call shape TODO confirm, as above).
            response = gemini_model.generate_content(
                [prompt],
                generation_config=genai.GenerationConfig(
                    max_output_tokens=4000,
                    temperature=0.1
                )
            )
            document_text = response.text
        elif llm_model == 'claude':
            response = claude_client.messages.create(
                model=claude_model_main,
                max_tokens=5000,
                temperature=1,
                thinking={
                    "type": "enabled",
                    "budget_tokens": 3000
                },
                system="You are an expert in preparing professional tender documents.",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # With extended thinking enabled, content[0] is the thinking block
            # and content[1] carries the actual text.
            process_logger.info(f"\n*************************************************\n🧠 Minaions Thinking: {response.content[0].thinking}")
            document_text = response.content[1].text
        elif llm_model == 'open_llm':
            response = openai.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are an expert in preparing professional tender documents."},
                    {"role": "user", "content": prompt},
                ],
            )
            document_text = response.choices[0].message.content
        else:
            # Bug fix: previously an unsupported engine left document_text
            # undefined and the NameError was swallowed by the except below.
            process_logger.error(f"Unsupported LLM engine for annexure generation: {llm_model}")
            return None

        # Create document file
        dest_dir = os.path.join(final_docs_dir, DocumentType.ANNEXURE)
        safe_name = re.sub(r'[^\w\s-]', '', annexure_name).strip().replace(' ', '_')
        dest_path = os.path.join(dest_dir, f"{safe_name}.html")
        os.makedirs(dest_dir, exist_ok=True)

        # Remove all ```html and closing ``` (case-insensitive for html)
        document_text = re.sub(r'```html', '', document_text, flags=re.IGNORECASE)
        document_text = re.sub(r'```', '', document_text)

        with open(dest_path, 'w', encoding='utf-8') as f:
            f.write(document_text)

        process_logger.info(f"✅ Generated Annexure Custom document: {os.path.basename(dest_path)}")
        return dest_path

    except Exception as e:
        process_logger.error(f"Error generating fallback annexure document: {str(e)}")
        return None

def process_annexure_format(req_doc: Dict, extracted_info: Dict, company_info: str, final_docs_dir: str, bid_dir: str, process_logger = None) -> str:
    """
    Process annexure format document by locating, extracting, and filling the exact format from RFP

    Args:
        req_doc (dict): Required document information (expects 'name'; optional
            'source_document', 'description', 'pages')
        extracted_info (dict): Previously extracted tender sections keyed by section
            title; each value is expected to be a dict carrying a 'content' key
        company_info (str): Information of available company
        final_docs_dir (str): Directory for final documents
        bid_dir (str): Path to the bid directory containing all RFP documents
        process_logger (logging.Logger, optional): Logger for progress messages;
            defaults to the module-level logger

    Returns:
        str: Path to the processed document, or None if processing failed
    """
    # NOTE: the docstring originally sat after the first statement, which made it a
    # bare string expression instead of the function docstring; moved to the top.
    if process_logger is None:
        process_logger = logger

    process_logger.info(f"🤖 Processing annexure format: {req_doc['name']}")

    # Extract key information for searching
    annexure_name = req_doc['name']
    source_hint = req_doc.get('source_document', 'Unknown')
    file_desc = req_doc.get('description', 'No Description given')
    pages = req_doc.get('pages', [])
    process_logger.info(f"\nTrying to find annexure {annexure_name} in file {source_hint} on pages: {pages}")

    # BUG FIX: the original used extracted_info.get(key, "")['content'], which raises
    # TypeError (indexing a str with a string key) whenever a section is missing.
    # Default to {} and read 'content' with .get() so missing sections yield "".
    eligibility = extracted_info.get("Eligibility/Qualification Criteria or conditions for bidder", {}).get('content', '')
    req_docs = extracted_info.get("Documents needed to submit the bid", {}).get('content', '')
    scope_of_work = extracted_info.get("Scope of work of the whole project", {}).get('content', '')

    # Normalize annexure name/number for searching, e.g. "Annexure - IV" -> "iv".
    # (.lower() plus re.IGNORECASE is redundant but harmless; kept as-is.)
    annexure_match = re.search(r'(?:annexure|format|form|proforma|checklist|annex)\s*[-\s]*([\dIVXivx]+|[a-zA-Z])',
                             annexure_name.lower(), re.IGNORECASE)
    annexure_number = annexure_match.group(1) if annexure_match else None

    process_logger.info(f"\nannexure_name and annexure_number are: {annexure_name} and {annexure_number}")
    process_logger.info(f"\n🔍 Looking for annexure {annexure_name}")

    annexure_content = None
    source_file = None
    # BUG FIX: annex_path was only bound inside the loop body; initialize it here so
    # the later reference can never raise NameError if the loop finds nothing.
    annex_path = None

    # Step 1: Find all PDF files in the tender directory (recursive)
    all_pdf_files = list(Path(bid_dir).glob("**/*.pdf"))

    # If we have a source document hint, and it is present in all pdf files, then get annexure from it directly
    for pdf_file in all_pdf_files:

        process_logger.info(f"\nProcessing Files: {pdf_file} with {source_hint} and {pdf_file.name}")
        if (source_hint != 'Unknown' and source_hint in pdf_file.name):

            # Extracted annexure pages are staged as a temp PDF under tender_analysis/
            out_dir = os.path.join(bid_dir, "tender_analysis")
            annex_path = os.path.join(out_dir, "temp_annexure.pdf")

            annexure_content = get_annexure_content(pdf_file, pages, annex_path)
            if annexure_content:
                source_file = pdf_file
                process_logger.info(f"🎉 Found annexure content in {pdf_file.name}")
                break
            else:
                process_logger.error(f"Annexure not found in document {pdf_file.name}")

    # Step 3: Generate the filled annexure document
    if annexure_content:
        # print("Found annexure content:\n", annexure_content)
        return generate_filled_annexure(annexure_name, annexure_number, annexure_content, company_info, final_docs_dir, annex_path, process_logger)
    else:
        process_logger.error(f"Could not find annexure format in any tender document. Generating the document as custom document now.")
        # return None
        return generate_annexure_fallback(annexure_name, file_desc, company_info, final_docs_dir, eligibility, req_docs, scope_of_work, process_logger)