import os
import json
import time
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any, Union, Tuple
import re
import requests
from urllib.parse import urljoin
import tempfile
import random
import shutil
import hashlib
import zipfile

import anthropic
from botocore.exceptions import ClientError

from log_forwarder import setup_api_logging

# Import custom modules
import bid_prep_automation as bpa
import bid_queries as bq
import chat_with_rfp as chat
import tender_automation as ta
from document_extractor import extract_documents_text_compatible, create_document_extractor
import extract_annexures_auto as eaa
import pdf_merger
import main_document_processor as mdp

from dotenv import load_dotenv
load_dotenv()
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
AWS_REGION = os.getenv("AWS_REGION")
AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET") 

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Company Document Mapping - Centralized configuration
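# Each entry maps a company-detail field name to the metadata used when matching and
# describing its supporting document:
#   - category: "standard" or "experience", used to group documents during matching
#   - description_template: human-readable description; placeholders such as {value},
#     {year}, {name}, {pan}, {din}, {project}, {customer} or {description} are filled
#     from the corresponding company detail
#   - relevant_sections: section headings of the formatted company information where
#     this document is relevant
#   - display_name: label shown for the document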
COMPANY_DOCUMENT_MAPPING = {
    # Basic Details Documents
    "companyName": {
        "category": "standard",
        "description_template": "Company incorporation/registration document containing company name: {value}",
        "relevant_sections": ["BASIC COMPANY DETAILS", "LEGAL & REGISTRATION DETAILS"],
        "display_name": "Company Name Document"
    },
    "entityType": {
        "category": "standard", 
        "description_template": "Entity type documentation showing: {value}",
        "relevant_sections": ["BASIC COMPANY DETAILS", "LEGAL & REGISTRATION DETAILS"],
        "display_name": "Entity Type Document"
    },
    "companyWebsite": {
        "category": "standard",
        "description_template": "Company website verification document for: {value}",
        "relevant_sections": ["BASIC COMPANY DETAILS"],
        "display_name": "Website Document"
    },
    "uploadedEmployeeDetails": {
        "category": "standard",
        "description_template": "Employee details document containing staff information and organizational structure",
        "relevant_sections": ["EMPLOYEE INFORMATION", "BASIC COMPANY DETAILS"],
        "display_name": "Employee Details"
    },
    
    # Legal & Registration Documents
    "cin_number": {
        "category": "standard",
        "description_template": "Company Incorporation Certificate - CIN: {value}",
        "relevant_sections": ["LEGAL & REGISTRATION DETAILS", "BASIC COMPANY DETAILS"],
        "display_name": "Company Incorporation Certificate"
    },
    "pan_number": {
        "category": "standard", 
        "description_template": "Permanent Account Number (PAN) Certificate - PAN: {value}",
        "relevant_sections": ["LEGAL & REGISTRATION DETAILS", "BASIC COMPANY DETAILS"],
        "display_name": "PAN Certificate"
    },
    "gst_number": {
        "category": "standard",
        "description_template": "Goods and Services Tax (GST) Registration Certificate - GST: {value}",
        "relevant_sections": ["LEGAL & REGISTRATION DETAILS", "BASIC COMPANY DETAILS"],
        "display_name": "GST Registration Certificate"
    },
    "msme_number": {
        "category": "standard",
        "description_template": "MSME Registration Certificate - Registration Number: {value}",
        "relevant_sections": ["LEGAL & REGISTRATION DETAILS", "BASIC COMPANY DETAILS"],
        "display_name": "MSME Registration Certificate"
    },
    "startUpRegistrationNumber": {
        "category": "standard",
        "description_template": "DPIIT Startup Recognition Certificate - Registration Number: {value}",
        "relevant_sections": ["LEGAL & REGISTRATION DETAILS", "BASIC COMPANY DETAILS"],
        "display_name": "Startup Recognition Certificate"
    },
    
    # Financial Documents
    "financial_itr": {
        "category": "standard",
        "description_template": "Income Tax Return for Financial Year: {year}",
        "relevant_sections": ["FINANCIAL INFORMATION", "BASIC COMPANY DETAILS"],
        "display_name": "Income Tax Return"
    },
    "financial_ca_cert": {
        "category": "standard",
        "description_template": "Chartered Accountant Certificate: {description}",
        "relevant_sections": ["FINANCIAL INFORMATION", "BASIC COMPANY DETAILS"],
        "display_name": "CA Certificate"
    },
    "financial_balance_sheet": {
        "category": "standard",
        "description_template": "Audited Balance Sheet for Financial Year: {year}",
        "relevant_sections": ["FINANCIAL INFORMATION", "BASIC COMPANY DETAILS"],
        "display_name": "Audited Balance Sheet"
    },
    
    # Director Documents
    "director_details": {
        "category": "standard",
        "description_template": "Director documentation for {name} - PAN: {pan}, DIN: {din}",
        "relevant_sections": ["DIRECTOR INFORMATION", "BASIC COMPANY DETAILS"],
        "display_name": "Director Documents"
    },
    
    # Experience Documents
    "past_experience": {
        "category": "experience",
        "description_template": "Experience document for project '{project}' with client '{customer}' - Value: {value}",
        "relevant_sections": ["PAST EXPERIENCE", "BASIC COMPANY DETAILS"],
        "display_name": "Past Experience Document"
    },
    
    # Certificate Documents
    "certificate": {
        "category": "standard",
        "description_template": "Certificate: {description}",
        "relevant_sections": ["CERTIFICATIONS", "BASIC COMPANY DETAILS"],
        "display_name": "Company Certificate"
    }
}
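
# Illustrative use of a mapping entry (the value below is a placeholder, not real data):
#   COMPANY_DOCUMENT_MAPPING["gst_number"]["description_template"].format(value="<GST number>")
#   -> "Goods and Services Tax (GST) Registration Certificate - GST: <GST number>"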

class Color:
    """Utility class for colored terminal output"""
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'


class ApiClient:
    """Handles all API calls to the Node.js service"""
    
    def __init__(self, base_url: str, api_key: str):
        """Initialize API client"""
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        self.headers = {
            'x-internal-api-key': api_key,
            'Content-Type': 'application/json'
        }
    
    def get_tenant(self, tenant_id: str) -> Dict:
        """Get tenant by ID"""
        url = f"{self.base_url}/tenants/{tenant_id}"
        response = requests.get(url, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def get_company_info(self, tenant_id: str, company_id: Optional[str] = None) -> Dict:
        """Get formatted company information for a tenant, optionally scoped to a company"""
        url = f"{self.base_url}/tenants/{tenant_id}/company-info"
        params = {'company_id': company_id} if company_id else None
        response = requests.get(url, headers=self.headers, params=params)
        self._check_response(response)
        return response.json()['data']

    def get_company_analysis_data(self, company_id: str) -> Dict:
        """Get raw analysis of company's document from database"""
        url = f"{self.base_url}/rawAnalysisById/{company_id}"
        response = requests.get(url, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def get_tender(self, tender_id: str, tenant_id: str) -> Dict:
        """Get tender by ID and tenant"""
        url = f"{self.base_url}/tenders/{tender_id}?tenant_id={tenant_id}"
        response = requests.get(url, headers=self.headers)
        self._check_response(response)
        
        # Log the response data for debugging
        tender_data = response.json()['data']
        
        # Debug log the originalDocuments field
        if 'originalDocuments' in tender_data:
            logger.info(f"Original documents in tender response: {type(tender_data['originalDocuments'])}")
            logger.info(f"Document IDs: {tender_data['originalDocuments']}")
            
            # Convert originalDocuments to a list of strings if they aren't already
            if isinstance(tender_data['originalDocuments'], list):
                string_docs = []
                for doc in tender_data['originalDocuments']:
                    if isinstance(doc, dict) and '_id' in doc:
                        string_docs.append(str(doc['_id']))
                    elif isinstance(doc, str):
                        string_docs.append(doc)
                    else:
                        logger.warning(f"Unexpected document ID format: {type(doc)} - {doc}")
                        # Try to convert to string anyway
                        try:
                            string_docs.append(str(doc))
                        except Exception:
                            logger.error(f"Could not convert document ID to string: {doc}")
                
                tender_data['originalDocuments'] = string_docs
                logger.info(f"Converted document IDs: {tender_data['originalDocuments']}")
        
        return tender_data
    
    def update_tender(self, tender_id: str, update_data: Dict) -> Dict:
        """Update tender with new data"""
        url = f"{self.base_url}/tenders/{tender_id}"
        response = requests.put(url, json=update_data, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def get_documents(self, document_ids: List[str]) -> List[Dict]:
        """Get documents by IDs"""
        url = f"{self.base_url}/documents/get-many"
        
        # Ensure all IDs are properly formatted as strings
        string_ids = []
        for doc_id in document_ids:
            if doc_id is None:
                continue
                
            # Handle different input types
            if isinstance(doc_id, dict) and '_id' in doc_id:
                # If it's a document object with _id field
                string_ids.append(str(doc_id['_id']))
            elif isinstance(doc_id, dict) and '$oid' in doc_id:
                # Handle MongoDB extended JSON format
                string_ids.append(str(doc_id['$oid']))
            else:
                # Regular ID string or other object
                string_ids.append(str(doc_id))
        
        # Log the document IDs we're requesting
        logger.debug(f"Requesting documents with IDs: {string_ids}")
        
        response = requests.post(url, json={'document_ids': string_ids}, headers=self.headers)
        
        # Log response status and info on error
        if response.status_code >= 400:
            logger.error(f"Error getting documents: HTTP {response.status_code}")
            try:
                error_info = response.json()
                logger.error(f"Error details: {error_info}")
            except Exception:
                logger.error(f"Raw response: {response.text[:200]}")
                
        self._check_response(response)
        return response.json()['data']
    
    def create_document(self, document_data: Dict) -> Dict:
        """Create a new document record"""
        url = f"{self.base_url}/documents"
        response = requests.post(url, json=document_data, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def get_document_content(self, document_id: str, tenant_id: str) -> bytes:
        """Get document content"""
        # Ensure we're passing a string ID
        doc_id = str(document_id).replace('"', '').replace("'", '')
        
        # Log the document ID we're trying to access
        logger.info(f"Getting content for document ID: {doc_id}")
        
        url = f"{self.base_url}/documents/{doc_id}/content?tenant_id={tenant_id}"
        response = requests.get(url, headers={
            'x-internal-api-key': self.api_key
        })
        
        # Log response status and info on error
        if response.status_code >= 400:
            logger.error(f"Error getting document content: HTTP {response.status_code}")
            try:
                error_info = response.json()
                logger.error(f"Error details: {error_info}")
            except Exception:
                logger.error(f"Raw response: {response.text[:200]}")
            
        self._check_response(response)
        return response.content
    
    def create_analysis(self, analysis_data: Dict) -> Dict:
        """Create a new analysis record"""
        url = f"{self.base_url}/analyses"
        response = requests.post(url, json=analysis_data, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def get_analysis(self, analysis_id: str) -> Dict:
        """Get analysis by ID"""
        url = f"{self.base_url}/analyses/{analysis_id}"
        response = requests.get(url, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def update_analysis(self, analysis_id: str, update_data: Dict) -> Dict:
        """Update analysis with new data"""
        url = f"{self.base_url}/analyses/{analysis_id}"
        response = requests.put(url, json=update_data, headers=self.headers)
        self._check_response(response)
        return response.json()['data']

    def update_company_info(self, company_id: str, tenant_id: str, update_data: Dict) -> Dict:
        """Update company info with extracted data"""
        url = f"{self.base_url}/companyDetails/{company_id}/{tenant_id}"
        response = requests.put(url, json=update_data, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def create_bid_generation(self, bid_data: Dict) -> Dict:
        """Create a new bid generation record"""
        url = f"{self.base_url}/bid-generations"
        response = requests.post(url, json=bid_data, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def get_bid_generation(self, bid_generation_id: str, tenant_id: str) -> Dict:
        """Get bid generation by ID and tenant"""
        url = f"{self.base_url}/bid-generations/{bid_generation_id}?tenant_id={tenant_id}"
        response = requests.get(url, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def update_bid_generation(self, bid_generation_id: str, update_data: Dict) -> Dict:
        """Update bid generation with new data"""
        url = f"{self.base_url}/bid-generations/{bid_generation_id}"
        response = requests.put(url, json=update_data, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def upload_file(self, file_content: bytes, s3_key: str, content_type: str, 
                   tenant_id: str, encrypt: bool = False) -> Dict:
        """Upload file to S3"""
        url = f"{self.base_url}/storage/upload"
        
        files = {
            'file': (os.path.basename(s3_key), file_content, content_type)
        }
        
        data = {
            'tenant_id': tenant_id,
            'key': s3_key,
            'content_type': content_type,
            'encrypt': str(encrypt).lower()
        }
        
        headers = {'x-internal-api-key': self.api_key}
        
        response = requests.post(url, files=files, data=data, headers=headers)
        self._check_response(response)
        return response.json()['data']
    
    def download_file(self, s3_key: str, tenant_id: str, decrypt: bool = True) -> bytes:
        """Download file from S3"""
        url = f"{self.base_url}/storage/download?tenant_id={tenant_id}&key={s3_key}&decrypt={str(decrypt).lower()}"
        response = requests.get(url, headers={
            'x-internal-api-key': self.api_key
        })
        self._check_response(response)
        return response.content
    
    def list_company_documents(self, tenant_id: str) -> List[Dict]:
        """List company documents for a tenant"""
        url = f"{self.base_url}/storage/company-documents/{tenant_id}/list"
        response = requests.get(url, headers=self.headers)
        self._check_response(response)
        return response.json()['data']
    
    def download_company_documents(self, tenant_id: str, download_dir: str) -> int:
        """Download all company documents for a tenant"""
        url = f"{self.base_url}/storage/company-documents/{tenant_id}/download"
        response = requests.get(url, headers={
            'x-internal-api-key': self.api_key
        })
        self._check_response(response)
        
        # Save the zip file
        zip_path = os.path.join(download_dir, 'company_documents.zip')
        with open(zip_path, 'wb') as f:
            f.write(response.content)
        
        # Extract the zip file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(download_dir)
        
        # Remove the zip file
        os.remove(zip_path)
        
        # Count extracted files
        file_count = 0
        for _, _, files in os.walk(download_dir):
            file_count += len(files)
        
        return file_count
    
    def upload_company_documents(self, tenant_id: str, local_docs_path: str) -> Dict:
        """Upload company documents from local path"""
        url = f"{self.base_url}/storage/company-documents/{tenant_id}/upload"
        
        files = []
        try:
            for root, _, filenames in os.walk(local_docs_path):
                for filename in filenames:
                    if filename.startswith('.') or filename.endswith('.tmp'):
                        continue
                    
                    file_path = os.path.join(root, filename)
                    files.append(('files', (filename, open(file_path, 'rb'), self._get_content_type(filename))))
            
            headers = {'x-internal-api-key': self.api_key}
            
            response = requests.post(url, files=files, headers=headers)
        finally:
            # Close all file handles, even if the upload request fails
            for _, file_tuple in files:
                file_tuple[1].close()
        
        self._check_response(response)
        return response.json()['data']
    
    def get_signed_url(self, s3_key: str, tenant_id: str, expires_in: int = 3600) -> str:
        """Get signed URL for S3 object"""
        url = f"{self.base_url}/storage/signed-url?tenant_id={tenant_id}&key={s3_key}&expires_in={expires_in}"
        response = requests.get(url, headers=self.headers)
        self._check_response(response)
        return response.json()['data']['url']
    
    def _check_response(self, response: requests.Response) -> None:
        """Check response status and raise exception if error"""
        if response.status_code >= 400:
            error_msg = f"API Error {response.status_code}"
            try:
                error_data = response.json()
                if 'message' in error_data:
                    error_msg = f"{error_msg}: {error_data['message']}"
                if 'error' in error_data:
                    error_msg = f"{error_msg} - {error_data['error']}"
            except Exception:
                pass
            
            raise Exception(error_msg)
    
    def _get_content_type(self, filename: str) -> str:
        """Get content type based on file extension"""
        ext = os.path.splitext(filename)[1].lower()
        content_types = {
            '.pdf': 'application/pdf',
            '.html': 'text/html',
            '.txt': 'text/plain',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.json': 'application/json'
        }
        return content_types.get(ext, 'application/octet-stream')
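
# Minimal usage sketch for ApiClient (illustrative only; assumes the Node.js internal API is
# reachable at the configured base URL and the internal API key is valid):
#   api = ApiClient("http://localhost:5000/internal-api", os.environ["INTERNAL_API_KEY"])
#   tender = api.get_tender("<tender_id>", "<tenant_id>")
#   docs = api.get_documents(tender.get("originalDocuments", []))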


class MinaionsRFPService:
    """Main service for RFP processing using Minaions AI"""
    
    def __init__(self, config_path: str = "config.json"):
        """Initialize the service with configuration"""
        # Load configuration
        try:
            with open(config_path, 'r') as f:
                self.config = json.load(f)
        except Exception as e:
            logger.error(f"Failed to load configuration: {e}")
            raise
        
        # Initialize Claude client
        self.client = anthropic.Client(api_key=self.config["anthropic"]["api_key"])
        
        # Create base directories for temporary file storage
        self.base_dir = self.config.get("file_storage", {}).get("base_dir", "/tmp/rfp_service")
        os.makedirs(self.base_dir, exist_ok=True)
        
        # Initialize API client for Node.js service
        api_base_url = self.config.get("nodejs_service", {}).get("api_url", "http://localhost:5000/internal-api")
        api_key = self.config.get("nodejs_service", {}).get("api_key", os.environ.get("INTERNAL_API_KEY"))
        
        if not api_key:
            raise ValueError("Internal API key is required. Please set it in config.json or as INTERNAL_API_KEY environment variable.")
        
        self.api_client = ApiClient(api_base_url, api_key)
        
        # Initialize document extractor
        self.extractor = create_document_extractor(self.config["anthropic"]["api_key"])
        
        # Initialize the current process context (will be set in each method)
        self.current_tenant_id = None
        self.current_process_id = None
        self.current_process_type = None

    # =================================================================
    # COMPANY INFORMATION METHODS
    # =================================================================

    def _format_company_information(self, tenant_id: str, company_id: Optional[str] = None) -> str:
        """
        Get and format company information from API (structured data only)
        
        Args:
            tenant_id: Tenant ID
            company_id: Optional company ID used to scope the lookup
            
        Returns:
            str: Formatted company information from structured data
        """
        try:
            # Get company information from API
            company_data = self.api_client.get_company_info(tenant_id, company_id)
            company_info = company_data.get("company_info", {})
            raw_analysis = company_data.get("raw_analysis", {})
            
            self.process_logger.info("Retrieved company info (structured data only)")
            
            if not raw_analysis:
                # Convert details array to dictionary for easier processing
                details_info = {}
                details_list = company_info.get("details", [])
                for item in details_list:
                    name = item.get("name", "")
                    value = item.get("value", "")
                    file_url = item.get("file", "")
                    if name and (value or file_url):
                        details_info[name] = value if value else file_url
                
                # Start building the formatted company information
                formatted_info = "COMPANY INFORMATION FOR TENDER SUBMISSION\n"
                formatted_info += "=" * 60 + "\n\n"
                
                # Basic Company Information
                formatted_info += "BASIC COMPANY DETAILS:\n"
                formatted_info += "-" * 22 + "\n"
                
                if details_info.get("companyName"):
                    formatted_info += f"Company Name: {details_info['companyName']}\n"
                
                if details_info.get("entityType"):
                    formatted_info += f"Entity Type: {details_info['entityType']}\n"
                
                if details_info.get("registeredAddress"):
                    formatted_info += f"Registered Address: {details_info['registeredAddress']}\n"
                
                # Branch Office Addresses
                branch_addresses = company_info.get("branchOfficeAddress", [])
                for i, address in enumerate(branch_addresses, 1):
                    if address:
                        formatted_info += f"Branch Office Address {i}: {address}\n"
                
                if details_info.get("companyWebsite"):
                    formatted_info += f"Website: {details_info['companyWebsite']}\n"
                
                if details_info.get("officePhoneNumber"):
                    formatted_info += f"Phone: {details_info['officePhoneNumber']}\n"
                
                if details_info.get("officeEmail"):
                    formatted_info += f"Email: {details_info['officeEmail']}\n"
                
                if details_info.get("officeFax"):
                    formatted_info += f"Fax: {details_info['officeFax']}\n"
                
                if details_info.get("authorizedPersonToSignDocs"):
                    formatted_info += f"Authorized Signatory: {details_info['authorizedPersonToSignDocs']}\n"
                
                if details_info.get("authorizedPersonDesignation"):
                    formatted_info += f"Designation: {details_info['authorizedPersonDesignation']}\n"
                
                formatted_info += "\n"
                
                # Employee Information
                formatted_info += "EMPLOYEE INFORMATION:\n"
                formatted_info += "-" * 21 + "\n"
                
                if details_info.get("totalEmployees"):
                    formatted_info += f"Total Employees: {details_info['totalEmployees']}\n"
                
                if details_info.get("technicalStaff"):
                    formatted_info += f"Technical Staff: {details_info['technicalStaff']}\n"
                
                if details_info.get("uploadedEmployeeDetails"):
                    formatted_info += f"Employee Details Document: {details_info['uploadedEmployeeDetails']}\n"
                
                formatted_info += "\n"
                
                # Legal and Registration Details
                formatted_info += "LEGAL & REGISTRATION DETAILS:\n"
                formatted_info += "-" * 28 + "\n"
                
                if details_info.get("cin_number"):
                    formatted_info += f"Company Incorporation Number (CIN): {details_info['cin_number']}\n"
                
                if details_info.get("pan_number"):
                    formatted_info += f"Permanent Account Number (PAN): {details_info['pan_number']}\n"
                
                if details_info.get("gst_number"):
                    formatted_info += f"Goods and Services Tax (GST): {details_info['gst_number']}\n"
                
                if details_info.get("msme_number"):
                    formatted_info += f"Micro, Small and Medium Enterprises (MSME) Registration: {details_info['msme_number']}\n"
                
                if details_info.get("typeOfMsme"):
                    formatted_info += f"MSME Type: {details_info['typeOfMsme']}\n"
                
                if details_info.get("startUpRegistrationNumber"):
                    formatted_info += f"Department for Promotion of Industry and Internal Trade (DPIIT) Startup Registration: {details_info['startUpRegistrationNumber']}\n"
                
                formatted_info += "\n"
                
                # Director Details
                director_details = company_info.get("directorDetails", [])
                if director_details:
                    formatted_info += "DIRECTOR INFORMATION:\n"
                    formatted_info += "-" * 21 + "\n"
                    
                    for i, director in enumerate(director_details, 1):
                        if director.get("name"):
                            formatted_info += f"Director {i}:\n"
                            formatted_info += f"  Name: {director['name']}\n"
                            if director.get("address"):
                                formatted_info += f"  Address: {director['address']}\n"
                            if director.get("pan"):
                                formatted_info += f"  Permanent Account Number (PAN): {director['pan']}\n"
                            if director.get("din"):
                                formatted_info += f"  Director Identification Number (DIN): {director['din']}\n"
                            formatted_info += "\n"
                
                # Financial Details
                financial_details = company_info.get("financialDetails", {})
                if financial_details:
                    formatted_info += "FINANCIAL INFORMATION:\n"
                    formatted_info += "-" * 22 + "\n"
                    
                    # Turnover Details
                    turnover_details = financial_details.get("turnoverDetails", [])
                    for turnover in turnover_details:
                        if turnover.get("turnoverYear") and turnover.get("turnover"):
                            unit = turnover.get("turnoverUnit", "")
                            formatted_info += f"Annual Turnover {turnover['turnoverYear']}: {unit} {turnover['turnover']}\n"
                    
                    # ITR Details
                    itr_details = financial_details.get("itrDetails", [])
                    for itr in itr_details:
                        if itr.get("itrYear"):
                            formatted_info += f"Income Tax Return (ITR) Year: {itr['itrYear']}\n"
                    
                    # CA Certificate Details
                    ca_cert_details = financial_details.get("ca_certificate_details", [])
                    for ca_cert in ca_cert_details:
                        if ca_cert.get("ca_certificate_description"):
                            formatted_info += f"Chartered Accountant (CA) Certificate: {ca_cert['ca_certificate_description']}\n"
                    
                    # Balance Sheet Details
                    balance_sheet_details = financial_details.get("balanceSheetDetails", [])
                    for balance_sheet in balance_sheet_details:
                        if balance_sheet.get("auditedBalanceSheetYear"):
                            formatted_info += f"Balance Sheet Year: {balance_sheet['auditedBalanceSheetYear']}\n"
                    
                    formatted_info += "\n"
                
                # Past Experience Details
                past_experience = company_info.get("pastExperienceDetails", [])
                if past_experience:
                    formatted_info += "PAST EXPERIENCE:\n"
                    formatted_info += "-" * 16 + "\n"
                    
                    for i, experience in enumerate(past_experience, 1):
                        if experience.get("customer") or experience.get("project"):
                            formatted_info += f"Project {i}:\n"
                            if experience.get("customer"):
                                formatted_info += f"  Client Name: {experience['customer']}\n"
                            if experience.get("clientLocation"):
                                formatted_info += f"  Client Location: {experience['clientLocation']}\n"
                            if experience.get("project"):
                                formatted_info += f"  Project Title: {experience['project']}\n"
                            if experience.get("projectValue"):
                                formatted_info += f"  Project Value: {experience['projectValue']}\n"
                            if experience.get("projectScope"):
                                formatted_info += f"  Project Scope: {experience['projectScope']}\n"
                            if experience.get("projectStartDate"):
                                formatted_info += f"  Start Date: {experience['projectStartDate']}\n"
                            if experience.get("projectEndDate"):
                                formatted_info += f"  End Date: {experience['projectEndDate']}\n"
                            formatted_info += "\n"
                
                # Certifications
                certificates = company_info.get("certificate", [])
                if certificates:
                    formatted_info += "CERTIFICATIONS:\n"
                    formatted_info += "-" * 14 + "\n"
                    
                    for cert in certificates:
                        if cert.get("description"):
                            formatted_info += f"- {cert['description']}\n"
                    
                    formatted_info += "\n"
                
                formatted_info += "=" * 60 + "\n"
                
                self.process_logger.info(f"Successfully formatted company information ({len(formatted_info)} characters)")
                
                self.process_logger.info(f"Company information: \n {formatted_info}")
            else:
                # Convert details array to dictionary for easier processing
                details_info = {}
                details_list = company_info.get("details", [])
                for item in details_list:
                    name = item.get("name", "")
                    value = item.get("value", "")
                    file_url = item.get("file", "")
                    if name and (value or file_url):
                        details_info[name] = value if value else file_url
                
                # Start building the formatted company information
                formatted_info = "COMPANY INFORMATION FOR TENDER SUBMISSION\n"
                formatted_info += "=" * 60 + "\n\n"
                
                # Basic Company Information
                formatted_info += "BASIC COMPANY DETAILS:\n"
                formatted_info += "-" * 22 + "\n"
                
                if details_info.get("companyName"):
                    formatted_info += f"Company Name: {details_info['companyName']}\n"
                
                if details_info.get("entityType"):
                    formatted_info += f"Entity Type: {details_info['entityType']}\n"
                
                if details_info.get("registeredAddress"):
                    formatted_info += f"Registered Address: {details_info['registeredAddress']}\n"
                
                # Branch Office Addresses
                branch_addresses = company_info.get("branchOfficeAddress", [])
                for i, address in enumerate(branch_addresses, 1):
                    if address:
                        formatted_info += f"Branch Office Address {i}: {address}\n"
                
                if details_info.get("companyWebsite"):
                    formatted_info += f"Website: {details_info['companyWebsite']}\n"
                
                if details_info.get("officePhoneNumber"):
                    formatted_info += f"Phone: {details_info['officePhoneNumber']}\n"
                
                if details_info.get("officeEmail"):
                    formatted_info += f"Email: {details_info['officeEmail']}\n"
                
                if details_info.get("officeFax"):
                    formatted_info += f"Fax: {details_info['officeFax']}\n"
                
                if details_info.get("authorizedPersonToSignDocs"):
                    formatted_info += f"Authorized Signatory: {details_info['authorizedPersonToSignDocs']}\n"
                
                if details_info.get("authorizedPersonDesignation"):
                    formatted_info += f"Designation: {details_info['authorizedPersonDesignation']}\n"

                if details_info.get("about"):
                    formatted_info += f"About Company (Additional Info): {details_info['about']}\n"
                
                formatted_info += "\n"
                seen = set()
                formatted_info += "DOCUMENT ANALYSIS INFORMATION\n"
                formatted_info += "=" * 60 + "\n\n"

                for _, value in raw_analysis.items():
                    # Build unique identity to remove duplicates
                    identity = (
                        value.get("doc_type"),
                        json.dumps(value.get("key_info", {}), sort_keys=True),
                        value.get("description"),
                        value.get("file_path"),
                    )
                    if identity in seen:
                        continue
                    seen.add(identity)

                    # Extract only required fields
                    doc_type = value.get("doc_type", "N/A")
                    description = value.get("description", "N/A")
                    file_path = value.get("file_path", "N/A")
                    key_info = value.get("key_info", {})

                    formatted_info += f"Document Type: {doc_type}\n"
                    formatted_info += f"Description: {description}\n"
                    formatted_info += f"File Path: {file_path}\n"

                    if key_info:
                        formatted_info += "Key Information:\n"
                        for k, v in key_info.items():
                            formatted_info += f"  - {k}: {v}\n"

                    formatted_info += "-" * 60 + "\n\n"

                self.process_logger.info(f"Formatted {len(seen)} unique documents from raw_analysis")

            return formatted_info
            
        except Exception as e:
            self.process_logger.error(f"Error formatting company information: {e}")
            # Fallback to basic information if available
            try:
                company_data = self.api_client.get_company_info(tenant_id, company_id)
                company_info = company_data.get("company_info", {})
                details_list = company_info.get("details", [])
                
                # Extract basic info
                basic_info = {}
                for item in details_list:
                    name = item.get("name", "")
                    value = item.get("value", "")
                    if name and value:
                        basic_info[name] = value
                
                fallback_info = f"Company Name: {basic_info.get('companyName', 'Not specified')}\n"
                fallback_info += f"Entity Type: {basic_info.get('entityType', 'Not specified')}\n"
                fallback_info += f"Contact: {basic_info.get('officeEmail', 'Not specified')}\n"
                
                return fallback_info
            except Exception:
                return "Company information not available"

    # =================================================================
    # GET RFP DOCUMENTS METHODS
    # =================================================================

    def get_rfp_documents(self, tender_id: str, tenant_id: str) -> Dict[str, Any]:
        """Extract links from existing RFP documents and download additional documents"""
        try:
            # Set up process context
            self._setup_process_context(tenant_id, tender_id, "get_rfp_documents")

            self.api_client.update_tender(tender_id, {
                "$set": {
                    "logs": {
                        "timestamp": time.time(),
                        "action": "get_rfp_documents_started"
                    }
                }
            })

            # Get tender and tenant details from API
            self.process_logger.info(f"Getting tender {tender_id} for tenant {tenant_id}")
            tender = self.api_client.get_tender(tender_id, tenant_id)
            if not tender:
                raise ValueError(f"Tender not found: {tender_id}")
            
            self.process_logger.info(f"Getting tenant {tenant_id}")
            tenant = self.api_client.get_tenant(tenant_id)
            if not tenant:
                raise ValueError(f"Tenant not found: {tenant_id}")
            
            # Create directory for processing
            processing_dir = os.path.join(self.base_dir, f"rfp_docs_{tender_id}")
            os.makedirs(processing_dir, exist_ok=True)
            
            downloaded_documents = []
            
            self.process_logger.info(f"Getting additional RFP documents for tender {tender.get('bidNumber', tender_id)}")
            
            # Get existing tender documents using API
            document_ids = tender.get("originalDocuments", [])
            
            # Ensure all IDs are strings
            document_ids = [str(doc_id) for doc_id in document_ids if doc_id]
            
            self.process_logger.info(f"Document IDs from tender: {document_ids}")
            
            if not document_ids:
                logger.warning("No document IDs found in tender")
                self.api_client.update_tender(tender_id, {
                    "$push": {
                        "logs": {
                            "timestamp": time.time(),
                            "action": "get_rfp_documents_completed",
                            "details": {
                                "links_found": 0,
                                "documents_downloaded": 0,
                                "message": "No documents found in tender to process"
                            }
                        }
                    }
                })
                return {
                    "status": "success",
                    "tender_id": tender_id,
                    "links_found": 0,
                    "documents_downloaded": 0,
                    "downloaded_documents": [],
                    "message": "No documents found in tender to process"
                }
            
            self.process_logger.info(f"Getting {len(document_ids)} documents")
            documents = self.api_client.get_documents(document_ids)
            self.process_logger.info(f"Retrieved {len(documents)} documents")
            
            # Extract all links from existing documents
            all_links = []
            
            for doc in documents:
                try:
                    doc_id = str(doc.get("_id", ""))
                    doc_name = doc.get("name", f"doc_{doc_id}")

                    doc_type = str(doc.get("type", ""))
                    doc_category = str(doc.get("category", ""))

                    if doc_type == "bid_document" and doc_category == "tender":
                        
                        self.process_logger.info(f"Processing document: {doc_name} (ID: {doc_id})")
                        
                        # Extract document content using API
                        logger.info(f"Getting content for document {doc_id}")
                        doc_content = self.api_client.get_document_content(doc_id, tenant_id)
                        self.process_logger.info(f"Received content for document {doc_id}: {len(doc_content)} bytes")
                        
                        # Save to temporary file for link extraction
                        temp_path = os.path.join(processing_dir, doc_name)
                        
                        with open(temp_path, 'wb') as f:
                            f.write(doc_content)
                        
                        self.process_logger.info(f"Saved document to {temp_path}")
                        
                        # Extract links only from PDF files using tender_automation function
                        if doc_name.lower().endswith('.pdf'):
                            self.process_logger.info(f"Extracting links from PDF: {doc_name}")
                            links = ta.extract_links_from_pdf(temp_path)
                            if links:
                                all_links.extend(links)
                                self.process_logger.info(f"Extracted {len(links)} links from {doc_name}")
                        
                        # Clean up temp file
                        try:
                            os.remove(temp_path)
                        except Exception as cleanup_err:
                            self.process_logger.warning(f"Could not delete temp file {temp_path}: {cleanup_err}")
                        
                except Exception as e:
                    self.process_logger.error(f"Error processing document {doc.get('_id')}: {e}", exc_info=True)
            
            # Remove duplicates from links
            unique_links = list(set(all_links))
            self.process_logger.info(f"Found {len(unique_links)} unique links across all documents")
            
            # Download documents from links using tender_automation function
            if unique_links:
                downloaded_files = ta.download_linked_documents(processing_dir, unique_links, process_logger = self.process_logger)

                ta.remove_duplicate_pdfs(processing_dir)

                downloaded_files = ta.list_files_in_directory(processing_dir)
                
                # Process downloaded files and save via API
                for file_path in downloaded_files:
                    try:
                        filename = os.path.basename(file_path)
                        
                        with open(file_path, 'rb') as f:
                            content = f.read()
                        
                        # Upload to S3 via API
                        bid_number = tender.get("bidNumber", str(tender_id))
                        bid_number = re.sub(r'[\/\\]', '_', bid_number)
                        s3_key = f"tenders/{tenant_id}/{bid_number}/linked/{filename}"
                        
                        content_type = self._get_content_type(filename)
                        storage_details = self.api_client.upload_file(
                            content, s3_key, content_type, tenant_id
                        )
                        
                        # Create document record via API
                        document_record = {
                            "tenant": tenant_id,
                            "name": filename,
                            "type": "rfp",
                            "category": "tender",
                            "storageType": "s3",
                            "storageDetails": storage_details,
                            "metadata": {
                                "tenderId": tender_id,
                                "bidNumber": bid_number,
                                "source": "extracted_link",
                                "parentTenderId": tender_id
                            }
                        }
                        
                        doc = self.api_client.create_document(document_record)
                        doc_id = str(doc["_id"])
                        
                        downloaded_documents.append({
                            "document_id": doc_id,
                            "filename": filename,
                            "size": len(content)
                        })
                        
                        self.process_logger.info(f"Saved linked document: {filename}")
                        
                    except Exception as e:
                        self.process_logger.error(f"Error saving linked document {file_path}: {e}")
                
                # Update tender with new linked documents via API
                new_doc_ids = [doc["document_id"] for doc in downloaded_documents]
                existing_doc_ids = [str(doc_id) for doc_id in tender.get("originalDocuments", [])]
                all_doc_ids = existing_doc_ids + new_doc_ids
                
                update_data = {
                    "$addToSet": {"originalDocuments": {"$each": new_doc_ids}},
                    "$push": {
                        "logs": {
                            "timestamp": time.time(),
                            "action": "linked_documents_added",
                            "details": {
                                "links_found": len(unique_links),
                                "documents_downloaded": len(downloaded_documents)
                            }
                        }
                    }
                }
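                # The update payload uses MongoDB-style operators ($addToSet, $push), which the
                # Node.js service is assumed to pass through to the tender record.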
                
                self.api_client.update_tender(tender_id, update_data)

            self.api_client.update_tender(tender_id, {
                "$push": {
                    "logs": {
                        "timestamp": time.time(),
                        "action": "get_rfp_documents_completed",
                        "details": {
                            "links_found": len(unique_links),
                            "documents_downloaded": len(downloaded_documents)
                        }
                    }
                }
            })
            
            # Clean up processing directory
            try:
                shutil.rmtree(processing_dir)
            except Exception:
                pass
            
            return {
                "status": "success",
                "tender_id": tender_id,
                "links_found": len(unique_links),
                "documents_downloaded": len(downloaded_documents),
                "downloaded_documents": downloaded_documents,
                "message": f"Successfully downloaded {len(downloaded_documents)} additional RFP documents"
            }
            
        except Exception as e:
            logger.error(f"Error getting RFP documents: {str(e)}")
            try:
                self.api_client.update_tender(tender_id, {
                    "$push": {
                        "logs": {
                            "timestamp": time.time(),
                            "action": "get_rfp_documents_failed",
                            "details": {"error": str(e)}
                        }
                    }
                })
            except Exception:
                pass
            return {
                "status": "error",
                "message": f"Error getting RFP documents: {str(e)}"
            }

    # =================================================================
    # ANALYSIS ESTIMATION METHODS
    # =================================================================

    def create_analysis_placeholder(self, tender_id: str, tenant_id: str, company_id: str) -> str:
        """Create a placeholder analysis record immediately and return analysis_id"""
        try:
            # Get tender details to validate it exists
            tender = self.api_client.get_tender(tender_id, tenant_id)
            if not tender:
                raise ValueError(f"Tender not found: {tender_id}")

            # Create placeholder analysis record
            analysis_record = {
                "tenant": tenant_id,
                "tender": tender_id,
                "company": company_id,
                "type": "analysis_estimation",
                "status": "estimating",  # Initial status
                "estimatedCost": 0,
                "currency": "INR",
                "analysisMetrics": {},
                "logs": [{
                    "timestamp": time.time(),
                    "action": "analysis_estimation_started",
                    "details": {
                        "message": "Cost estimation in progress"
                    }
                }]
            }

            analysis = self.api_client.create_analysis(analysis_record)
            analysis_id = str(analysis["_id"])

            logger.info(f"Created placeholder analysis record: {analysis_id}")
            return analysis_id

        except Exception as e:
            logger.error(f"Error creating analysis placeholder: {e}")
            raise
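
    # create_analysis_placeholder and process_analysis_cost_estimation_background are intended to
    # be used together: the placeholder returns an analysis_id immediately, and the background
    # step later updates that record with the estimated cost and analysis metrics.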

    def process_analysis_cost_estimation_background(self, tender_id: str, tenant_id: str, company_id: str, analysis_id: str) -> None:
        """Process analysis cost estimation in background and update the analysis record"""
        try:
            self._setup_process_context(tenant_id, analysis_id, "analysis_estimation")
            self.process_logger.info(f"Analyzing document complexity and preparing for analysis...")
            logger.info(f"[INTERNAL] Starting cost estimation for analysis_id: {analysis_id}")

            # Get tender and documents
            tender = self.api_client.get_tender(tender_id, tenant_id)
            if not tender:
                raise ValueError(f"Tender not found: {tender_id}")

            # Get all related documents via API
            document_ids = [str(doc_id) for doc_id in tender.get("originalDocuments", [])]
            documents = self.api_client.get_documents(document_ids)

            temp_dir = os.path.join(self.base_dir, f"analysis_cost_{tender_id}")
            output_dir = os.path.join(temp_dir, "analysis_cost_output")
            os.makedirs(output_dir, exist_ok=True)
            out_txt_file = os.path.join(output_dir, "merged.txt")
            out_json_file = os.path.join(output_dir, "doc_text.json")

            # Download if extracted text is present
            s3_key = f"tender_analysis/{tender_id}/doc_text.json"
            doc_content = self.api_client.download_file(s3_key, tenant_id, False)
            if doc_content and doc_content != b"false":
                with open(out_json_file, 'wb') as f:
                    f.write(doc_content)
            else:
                # Download documents to temporary directory for processing
                unique_links = []

                for doc in documents:
                    try:
                        storage_details = doc.get("storageDetails", {})
                        doc_url = storage_details.get("url")
                        if doc_url:
                            unique_links.append(doc_url)
                        else:
                            self.process_logger.warning(f"No storage URL for document {doc.get('_id')}")

                    except Exception as e:
                        self.process_logger.error(f"Failed to get document URL {doc.get('_id')}: {e}")

                downloaded_files = ta.download_linked_documents(temp_dir, unique_links, process_logger=self.process_logger)

            s3_key = f"tender_analysis/{tender_id}/merged.txt"
            doc_content = self.api_client.download_file(s3_key, tenant_id, False)
            if doc_content and doc_content != b"false":
                with open(out_txt_file, 'wb') as f:
                    f.write(doc_content)

            # Extract text from all documents to get word count
            documents_text = {}
            total_word_count = 0
            base_cost_per_word = 0.005  # INR per word

            self.process_logger.info("Attempting document extraction with external extractor for cost analysis...")
            documents_text = extract_documents_text_compatible(temp_dir, output_dir, self.config["anthropic"]["api_key"], process_logger=self.process_logger)
            total_word_count = self.extractor.count_total_words(documents_text)

            merged_file_path = os.path.join(output_dir, "merged.txt")
            
            company_info = self._format_company_information(tenant_id, company_id)
            self.process_logger.info(f"Company Info: {company_info}")

            company_info_word_count = self.extractor.count_words(company_info)
            self.process_logger.info(f"Company Info length: {company_info_word_count} words")

            total_word_count = total_word_count + company_info_word_count

            self.process_logger.info(f"Total word count is {total_word_count}\nUploading files to S3 now.")

            # Upload extracted text files to S3
            content = ''
            with open(out_txt_file, 'rb') as f:
                content = f.read()
            s3_key = f"tender_analysis/{tender_id}/merged.txt"
            content_type = 'text/plain'
            storage_details = self.api_client.upload_file(content, s3_key, content_type, tenant_id)
            self.process_logger.info(f"Uploaded merged.txt file to S3")

            content = ''
            with open(out_json_file, 'rb') as f:
                content = f.read()
            s3_key = f"tender_analysis/{tender_id}/doc_text.json"
            content_type = 'application/json'
            storage_details = self.api_client.upload_file(content, s3_key, content_type, tenant_id)
            self.process_logger.info(f"Uploaded doc_text.json file to S3")

            # Calculate base cost based on word count and complexity
            base_cost = max(10, total_word_count * base_cost_per_word)  # Minimum 10 INR

            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
                self.process_logger.info(f"Directory '{temp_dir}' deleted successfully.")
            else:
                self.process_logger.info(f"Directory '{temp_dir}' does not exist.")

            # Adjust cost based on tender complexity
            complexity_factor = 1.0
            complexity_indicators = []

            if total_word_count > 10000:
                complexity_factor = 1.2
                complexity_indicators.append("Large document size")
            if total_word_count > 25000:
                complexity_factor = 1.5
                complexity_indicators.append("Very large document size")
            if len(documents) > 5:
                complexity_factor *= 1.1
                complexity_indicators.append("Multiple documents")

            estimated_cost = round(base_cost * complexity_factor, 2)

            # Update the analysis record with results
            update_data = {
                "status": "estimated",
                "estimatedCost": estimated_cost,
                "analysisMetrics": {
                    "wordCount": total_word_count,
                    "documentCount": len(documents),
                    "complexityFactor": complexity_factor,
                    "complexityIndicators": complexity_indicators,
                    "baseCost": base_cost,
                    "costPerWord": base_cost_per_word
                }
            }

            # Add log entry
            self.api_client.update_analysis(analysis_id, update_data)

            # Show as part of analysis preparation (hide cost from users)
            self.process_logger.info(f"Analysis complexity assessment completed")
            self.process_logger.info(f"Total words analyzed: {total_word_count}")
            logger.info(f"[INTERNAL] Estimated cost: ₹{estimated_cost}")  # Internal log only

        except Exception as e:
            logger.error(f"Error in background analysis cost estimation: {e}")
            # Update analysis record with error status
            try:
                update_data = {
                    "status": "estimation_failed",
                    "logs": [{
                        "timestamp": time.time(),
                        "action": "analysis_estimation_failed",
                        "details": {
                            "error": str(e)
                        }
                    }]
                }
                self.api_client.update_analysis(analysis_id, update_data)
            except Exception as update_error:
                logger.error(f"Failed to update analysis with error status: {update_error}")
    
    def estimate_analysis_cost(self, tender_id: str, tenant_id: str, company_id: str) -> Dict[str, Any]:
        """Estimate the cost for RFP analysis based on document complexity and size"""
        # Get tender details via API
        self._setup_process_context(tenant_id, tender_id + "_" + company_id, "analysis_estimation")
        tender = self.api_client.get_tender(tender_id, tenant_id)
        if not tender:
            raise ValueError(f"Tender not found: {tender_id}")
        
        # Get all related documents via API
        document_ids = [str(doc_id) for doc_id in tender.get("originalDocuments", [])]
        documents = self.api_client.get_documents(document_ids)

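        # Reuse any text previously extracted and cached in S3 (doc_text.json / merged.txt);
        # otherwise download the tender documents and extract them locally, then price the
        # analysis from the total word count and persist an analysis record.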
        temp_dir = os.path.join(self.base_dir, f"analysis_cost_{tender_id}")
        output_dir = os.path.join(temp_dir, "analysis_cost_output")
        os.makedirs(output_dir, exist_ok=True)
        out_txt_file = os.path.join(output_dir, "merged.txt")
        out_json_file = os.path.join(output_dir, "doc_text.json")
        
        # Download the extracted text from S3 if it is already present
        s3_key = f"tender_analysis/{tender_id}/doc_text.json"
        doc_content = self.api_client.download_file(s3_key, tenant_id, False)
        if ((len(doc_content) != 0) and (doc_content != "false")):
            with open(out_json_file, 'wb') as f:
                f.write(doc_content)
        else:
            # Download documents to temporary directory for processing
            unique_links = []
            
            for doc in documents:
                try:
                    filename = doc.get("name", f"doc_{doc['_id']}")
                    storage_details = doc.get("storageDetails", {})
                    unique_links.append(storage_details.get("url"))
                    
                except Exception as e:
                    self.process_logger.error(f"Failed to get document URL {doc.get('_id')}: {e}")

            downloaded_files = ta.download_linked_documents(temp_dir, unique_links, process_logger=self.process_logger)

        s3_key = f"tender_analysis/{tender_id}/merged.txt"
        doc_content = self.api_client.download_file(s3_key, tenant_id, False)
        if ((len(doc_content) != 0) and (doc_content != "false")):
            with open(out_txt_file, 'wb') as f:
                f.write(doc_content)

        # Extract text from all documents to get word count
        documents_text = {}
        total_word_count = 0
        base_cost_per_word = 0.005  # INR per word

        self.process_logger.info("Attempting document extraction with external extractor for cost analysis...")
        documents_text = extract_documents_text_compatible(temp_dir, output_dir, self.config["anthropic"]["api_key"], process_logger=self.process_logger)
        total_word_count = self.extractor.count_total_words(documents_text)

        company_info = self._format_company_information(tenant_id, company_id)
        self.process_logger.info(f"Company Info: {company_info}")

        company_info_word_count = self.extractor.count_words(company_info)
        self.process_logger.info(f"Company Info length: {company_info_word_count} words")

        total_word_count = total_word_count + company_info_word_count

        self.process_logger.info(f"Total word count is {total_word_count}\nUploading files to S3 now.")
        # Upload extracted text files to S3
        content = ''
        with open(out_txt_file, 'rb') as f:
            content = f.read()
        s3_key = f"tender_analysis/{tender_id}/merged.txt"
        content_type = 'text/plain'
        storage_details = self.api_client.upload_file(
            content, s3_key, content_type, tenant_id
        )
        self.process_logger.info("Uploaded merged.txt file to S3")

        content = ''
        with open(out_json_file, 'rb') as f:
            content = f.read()
        s3_key = f"tender_analysis/{tender_id}/doc_text.json"
        content_type = 'application/json'
        storage_details = self.api_client.upload_file(
            content, s3_key, content_type, tenant_id
        )
        self.process_logger.info("Uploaded doc_text.json file to S3")

        # Calculate base cost based on word count and complexity
        base_cost = max(10, total_word_count * base_cost_per_word)  # Minimum 10 INR

        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
            self.process_logger.info(f"Directory '{temp_dir}' deleted successfully.")
        else:
            self.process_logger.info(f"Directory '{temp_dir}' does not exist.")

        # Adjust cost based on tender complexity
        complexity_factor = 1.0
        complexity_indicators = []
        
        if total_word_count > 10000:
            complexity_factor = 1.2
            complexity_indicators.append("Large document size")
        if total_word_count > 25000:
            complexity_factor = 1.5
            complexity_indicators.append("Very large document size")
        if len(documents) > 5:
            complexity_factor *= 1.1
            complexity_indicators.append("Multiple documents")
            
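        # Final estimate: word-based base cost (minimum 10 INR) scaled by the complexity
        # factor (at most 1.5 * 1.1 = 1.65 for very large, multi-document tenders).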
        estimated_cost = round(base_cost * complexity_factor, 2)
        
        # Store the estimation in database via API
        analysis_record = {
            "tenant": tenant_id,
            "tender": tender_id,
            "company": company_id,
            "type": "analysis_estimation",
            "status": "estimated",
            "estimatedCost": estimated_cost,
            "currency": "INR",
            "analysisMetrics": {
                "wordCount": total_word_count,
                "documentCount": len(documents),
                "complexityFactor": complexity_factor,
                "complexityIndicators": complexity_indicators,
                "baseCost": base_cost,
                "costPerWord": base_cost_per_word
            },
            "logs": [{
                "timestamp": time.time(),
                "action": "analysis_cost_estimation",
                "details": {
                    "estimated_cost": estimated_cost,
                    "word_count": total_word_count,
                    "document_count": len(documents)
                }
            }]
        }
        
        analysis = self.api_client.create_analysis(analysis_record)
        analysis_id = str(analysis["_id"])
        
        # Show as analysis preparation (hide cost from users)
        self.process_logger.info(f"Analysis complexity assessment completed")
        self.process_logger.info(f"Total words analyzed: {total_word_count}")
        logger.info(f"[INTERNAL] Estimated cost: ₹{estimated_cost}")  # Internal log only
        
        return {
            "analysis_id": analysis_id,
            "estimated_cost": estimated_cost,
            "currency": "INR",
            "details": {
                "document_count": len(documents),
                "word_count": total_word_count,
                "complexity_factor": complexity_factor,
                "complexity_indicators": complexity_indicators
            }
        }

    # =================================================================
    # COMPANY DOCUMENT ANALYSIS METHODS  
    # =================================================================
    def analyze_company_docs(self, tenant_id: str, folder_list: List[Dict[str, Any]], company_id: str = None) -> Dict[str, Any]:
        """Analyze Company documents to extract key information"""
        try:
            self._setup_process_context(tenant_id, company_id, "company_doc_analysis")
            self.process_logger.info(f"Company Id: {company_id}")

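            # Flow: walk the provided folder tree, download each file, and use content hashes
            # to skip duplicates (both within this request and against files analyzed in an
            # earlier request). Only unique files go through the document processor; results
            # are keyed by file_id and merged back into the company's rawAnalysis record.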
            # Create directory for processing
            download_dir = os.path.join(self.base_dir, f"company_{company_id}_{tenant_id}")
            os.makedirs(download_dir, exist_ok=True)

            output_dir = os.path.join(download_dir, f"company_merged_{company_id}")
            os.makedirs(output_dir, exist_ok=True)

            file_count = 0

            # Dictionary to store the mapping of temp_path to file._id
            directory_map = {}

            # NEW: Dictionary to track file hashes and duplicates
            file_hash_map = {}  # hash -> original_file_path
            duplicate_map = {}  # duplicate_file_path -> original_file_path
            
            # NEW: Get existing analysis to check for cross-request duplicates
            existing_analysis = self.api_client.get_company_analysis_data(company_id).get("rawAnalysis", {})
            existing_file_hashes = {}  # hash -> existing_file_id
            
            # Build hash map from existing analysis
            if existing_analysis:
                for file_id, analysis_data in existing_analysis.items():
                    if isinstance(analysis_data, dict) and "file_hash" in analysis_data:
                        existing_file_hashes[analysis_data["file_hash"]] = file_id
                self.process_logger.info(f"Found {len(existing_file_hashes)} existing file hashes in database")

            def process_folder(folder, base_path):
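                """Recursively download this folder's files, map them to file IDs, and
                skip content-hash duplicates (within this request or from earlier analyses)."""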
                nonlocal file_count

                # Create local folder path
                folder_path = os.path.join(base_path, folder["folder_name"])
                os.makedirs(folder_path, exist_ok=True)

                # Download files in this folder
                for file_entry in folder.get("files", []):
                    doc = file_entry.get("documentId", {})
                    file_id = file_entry.get("_id")
                    storage = doc.get("storageDetails", {})
                    s3_key = storage.get("key")
                    filename = doc.get("name")

                    if not s3_key or not filename:
                        continue

                    # Local file path
                    local_file = os.path.join(folder_path, filename)

                    # Fetch content using wrapper
                    content = self.api_client.download_file(s3_key, tenant_id, False)

                    if not content or content == b"False" or content == "False":
                        continue

                    # Save to disk
                    with open(local_file, "wb") as f:
                        f.write(content)
                    
                    # Add to directory mapping if file_id exists
                    if file_id:
                        directory_map[local_file] = file_id

                    # Calculate file hash and check for duplicates
                    file_hash = self._calculate_file_hash(local_file)
                    
                    # NEW: Check against existing files first (cross-request duplicates)
                    if file_hash in existing_file_hashes:
                        existing_file_id = existing_file_hashes[file_hash]
                        self.process_logger.info(f"Cross-request duplicate detected: {local_file} matches existing file_id: {existing_file_id}")
                        
                        # Copy existing analysis and update file_path for this duplicate
                        if file_id and existing_file_id in existing_analysis:
                            duplicate_analysis = existing_analysis[existing_file_id].copy()
                            # Update file_path to reflect this duplicate's path
                            duplicate_analysis["file_path"] = self._transform_file_path(local_file)
                            duplicate_analysis["file_hash"] = file_hash  # Ensure hash is preserved
                            
                            # Store in existing_analysis for this file_id
                            existing_analysis[file_id] = duplicate_analysis
                            # Also update directory_map for this duplicate
                            directory_map[local_file] = file_id
                            self.process_logger.info(f"Copied existing analysis to new duplicate file_id: {file_id}")
                        
                        # Remove file since it's already processed
                        os.remove(local_file)
                        
                    elif file_hash in file_hash_map:
                        # This is a duplicate within current request
                        original_file = file_hash_map[file_hash]
                        duplicate_map[local_file] = original_file
                        
                        # Remove the duplicate file to avoid processing
                        os.remove(local_file)
                        
                        self.process_logger.info(f"Current-request duplicate detected and removed: {local_file} -> original: {original_file}")
                    else:
                        # This is a unique file - keep it for processing
                        file_hash_map[file_hash] = local_file

                    file_count += 1

                # Recurse into subfolders
                for subfolder in folder.get("subfolders", []):
                    process_folder(subfolder, folder_path)

            # Process each root folder
            for root_folder in folder_list:
                process_folder(root_folder, download_dir)

            # Save directory mapping to JSON file
            directory_map_file = os.path.join(output_dir, "directory_map.json")
            try:
                with open(directory_map_file, "w", encoding="utf-8") as f:
                    json.dump(directory_map, f, indent=2, ensure_ascii=False)
                self.process_logger.info(f"Directory mapping saved to {directory_map_file}")
            except Exception as e:
                self.process_logger.warning(f"Failed to save directory mapping: {e}")

            # Save duplicate mapping for debugging (only if duplicates exist)
            if duplicate_map:
                duplicate_map_file = os.path.join(output_dir, "duplicate_map.json")
                try:
                    with open(duplicate_map_file, "w", encoding="utf-8") as f:
                        json.dump(duplicate_map, f, indent=2, ensure_ascii=False)
                    self.process_logger.info(f"Duplicate mapping saved to {duplicate_map_file}")
                    self.process_logger.info(f"Total current-request duplicates found and skipped: {len(duplicate_map)}")
                except Exception as e:
                    self.process_logger.warning(f"Failed to save duplicate mapping: {e}")

            # Process download_dir only if there are unique files to process
            unique_files_count = len(file_hash_map)
            if unique_files_count > 0:
                self.process_logger.info(f"Processing {unique_files_count} unique files")
                results = mdp.process_directory(download_dir, output_dir, ANTHROPIC_API_KEY)

                if results["success"]:
                    self.process_logger.info("Processing completed successfully!")
                else:
                    self.process_logger.error(f"Processing failed: {results['error']}")
            else:
                self.process_logger.info("No unique files to process - all files were duplicates")
                results = {"success": True}  # Set success since no processing was needed

            # Read raw_analysis.json if it exists
            raw_analysis_file = os.path.join(output_dir, "raw_analysis.json")
            raw_analysis = existing_analysis.copy()  # Start with existing analysis
            
            if os.path.exists(raw_analysis_file):
                try:
                    with open(raw_analysis_file, "r", encoding="utf-8") as f:
                        temp_raw_analysis = json.load(f)

                    # Add duplicate entries by copying analysis from original files
                    if duplicate_map:
                        for duplicate_path, original_path in duplicate_map.items():
                            if original_path in temp_raw_analysis:
                                # NEW: Create a copy and update the file_path for this specific duplicate
                                duplicate_analysis = temp_raw_analysis[original_path].copy()
                                duplicate_analysis["file_path"] = self._transform_file_path(duplicate_path)
                                temp_raw_analysis[duplicate_path] = duplicate_analysis
                                self.process_logger.info(f"Copied analysis from {original_path} to duplicate {duplicate_path}")
                            else:
                                self.process_logger.warning(f"Original file {original_path} not found in analysis for duplicate {duplicate_path}")
                    
                    # NEW: Add file_hash to analysis data and replace temp_path keys with file_id
                    for temp_path, analysis_data in temp_raw_analysis.items():
                        # Ensure file_hash is stored in analysis_data
                        if "file_hash" not in analysis_data:
                            # First try to find hash in file_hash_map (for unique files)
                            hash_found = False
                            for hash_val, file_path in file_hash_map.items():
                                if file_path == temp_path:
                                    analysis_data["file_hash"] = hash_val
                                    hash_found = True
                                    break
                            
                            # If not found, check if this is a duplicate and get hash from original
                            if not hash_found and temp_path in duplicate_map:
                                original_path = duplicate_map[temp_path]
                                for hash_val, file_path in file_hash_map.items():
                                    if file_path == original_path:
                                        analysis_data["file_hash"] = hash_val
                                        hash_found = True
                                        break
                            
                            # Last resort: calculate hash if file still exists or log warning
                            if not hash_found:
                                if os.path.exists(temp_path):
                                    analysis_data["file_hash"] = self._calculate_file_hash(temp_path)
                                    self.process_logger.info(f"Calculated missing hash for {temp_path}")
                                else:
                                    self.process_logger.warning(f"Could not determine file_hash for {temp_path} - file not found and no mapping available")
                        
                        # Update file_path if it matches temp_path
                        if analysis_data.get("file_path") == temp_path:
                            analysis_data["file_path"] = self._transform_file_path(temp_path)
                        
                        # Map to file_id
                        file_id = directory_map.get(temp_path)
                        if file_id:
                            raw_analysis[file_id] = analysis_data
                            self.process_logger.info(f"Mapped {temp_path} -> {file_id}")
                        else:
                            # Keep original path if no mapping found
                            raw_analysis[temp_path] = analysis_data
                            self.process_logger.warning(f"No file_id mapping found for: {temp_path}")
                    
                    # Save the updated raw_analysis back to file
                    try:
                        with open(raw_analysis_file, "w", encoding="utf-8") as f:
                            json.dump(raw_analysis, f, indent=2, ensure_ascii=False)
                        self.process_logger.info("Updated raw_analysis.json with file_id mappings")
                    except Exception as e:
                        self.process_logger.warning(f"Failed to save updated raw_analysis.json: {e}")
                                    
                except Exception as e:
                    self.process_logger.error(f"Failed to process raw_analysis.json: {e}")

            self.process_logger.info("Updating Company Info in DB")
            self.api_client.update_company_info(company_id, tenant_id, {
                "$set": {"rawAnalysis": raw_analysis, "status": "processing_completed"},
                "$push": {
                    "logs": {
                        "timestamp": time.time(),
                        "action": "processing_completed"
                    }
                }
            })            

            results_dict = {
                "company_id": company_id,
                "tenant_id": tenant_id,
                "company_info": raw_analysis,
                "status": "processed"
            }

            # Clean up temporary directory
            try:
                shutil.rmtree(download_dir)
            except Exception as e:
                self.process_logger.warning(f"Failed to cleanup download directory: {e}")

            return results_dict

        except Exception as e:
            logger.error(f"Failed to analyze Company documents: {e}")
            raise

    # =================================================================
    # RFP ANALYSIS METHODS  
    # =================================================================

    def analyze_rfp(self, analysis_id: str, tenant_id: str, company_id: str = None) -> Dict[str, Any]:
        """Analyze RFP documents to extract key information and check eligibility"""
        try:
            self._setup_process_context(tenant_id, analysis_id, "analysis")
            self.process_logger.info(f"Company Id: {company_id}")

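            # Flow: validate the analysis record against the tenant, run document extraction
            # and LLM analysis unless the tender already carries cached analysis results,
            # then check company eligibility and persist the combined results.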
            # Get analysis record via API
            analysis = self.api_client.get_analysis(analysis_id)
            if not analysis:
                raise ValueError(f"Analysis record not found: {analysis_id}")
            
            # Manual tenant validation
            if str(analysis["tenant"]) != tenant_id:
                self.process_logger.error(f"Tenant mismatch: {analysis['tenant']} vs {tenant_id}")
                raise ValueError(f"Analysis record not found: {analysis_id}")
            
            # Get tender details via API
            tender_id = str(analysis["tender"])
            
            self.process_logger.info(f"✅ Analysis found and tenant validated: {analysis_id}")

            tender = self.api_client.get_tender(tender_id, tenant_id)
            if not tender:
                raise ValueError(f"Tender not found: {analysis['tender']}")
            
            # Update analysis status via API
            self.api_client.update_analysis(analysis_id, {
                "$set": {"status": "analysis_in_progress"},
                "$push": {
                    "logs": {
                        "timestamp": time.time(),
                        "action": "rfp_analysis_started"
                    }
                }
            })

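            # Only run extraction and LLM analysis when the tender has no cached analysis;
            # otherwise reuse its previously extracted info and required documents.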
            if not tender.get("analysis"):
                # Get tender documents via API
                document_ids = [str(doc_id) for doc_id in tender.get("originalDocuments", [])]
                documents = self.api_client.get_documents(document_ids)
                
                # Create temporary directory for processing
                temp_dir = os.path.join(self.base_dir, f"analysis_{analysis_id}")
                os.makedirs(temp_dir, exist_ok=True)
                
                # Download documents to temporary directory for processing
                unique_links = []
                
                for doc in documents:
                    try:
                        filename = doc.get("name", f"doc_{doc['_id']}")
                        storage_details = doc.get("storageDetails", {})
                        unique_links.append(storage_details.get("url"))

                    except Exception as e:
                        self.process_logger.error(f"Failed to get document URL {doc.get('_id')}: {e}")

                downloaded_files = ta.download_linked_documents(temp_dir, unique_links, process_logger=self.process_logger)
                
                # Use document extractor to get documents text
                output_dir = os.path.join(temp_dir, "analysis_output")
                os.makedirs(output_dir, exist_ok=True)

                out_txt_file = os.path.join(output_dir, "merged.txt")
                out_json_file = os.path.join(output_dir, "doc_text.json")
                
                # Download the extracted text from S3 if it is already present
                s3_key = f"tender_analysis/{tender_id}/doc_text.json"
                doc_content = self.api_client.download_file(s3_key, tenant_id, False)
                if ((len(doc_content) != 0) and (doc_content != "false")):
                    with open(out_json_file, 'wb') as f:
                        f.write(doc_content)

                s3_key = f"tender_analysis/{tender_id}/merged.txt"
                doc_content = self.api_client.download_file(s3_key, tenant_id, False)
                if ((len(doc_content) != 0) and (doc_content != "false")):
                    with open(out_txt_file, 'wb') as f:
                        f.write(doc_content)
                
                try:
                    self.process_logger.info("Attempting document extraction with external extractor...")
                    documents_text = extract_documents_text_compatible(temp_dir, output_dir, self.config["anthropic"]["api_key"], process_logger=self.process_logger)
                    self.process_logger.info(f"Document extractor returned {len(documents_text) if documents_text else 0} documents")

                    # Upload extracted text files to S3
                    content = ''
                    with open(out_txt_file, 'rb') as f:
                        content = f.read()
                    s3_key = f"tender_analysis/{tender_id}/merged.txt"
                    # Upload to S3 via API
                    content_type = 'text/plain'
                    storage_details = self.api_client.upload_file(
                        content, s3_key, content_type, tenant_id
                    )
                    content = ''
                    with open(out_json_file, 'rb') as f:
                        content = f.read()
                    s3_key = f"tender_analysis/{tender_id}/doc_text.json"
                    # Upload to S3 via API
                    content_type = 'application/json'
                    storage_details = self.api_client.upload_file(
                        content, s3_key, content_type, tenant_id
                    )

                except Exception as e:
                    self.process_logger.warning(f"Document extractor failed: {e}")
                    documents_text = None
                
                # Debug the extraction results
                self._debug_extraction_results(documents_text, temp_dir)
                
                if not documents_text:
                    raise Exception("No text was extracted from documents using either method")

                # Analyze documents with LLM using tender_automation function
                self.process_logger.info("Starting tender document analysis with LLM...")

                annexure_hints = eaa.extract_annexures_info(output_dir, documents_text)

                # self.process_logger.info(f"\nannexure_hints returned from extract_annexures_info:\n {annexure_hints}\n")
                # logger.info(f"\nannexure_hints returned from extract_annexures_info:\n {annexure_hints}\n")

                extracted_info, doc_analyses = ta.analyze_tender_with_LLM(documents_text, annexure_hints, process_logger=self.process_logger)
                self.process_logger.info("Tender document analysis completed")

                # Identify required documents
                required_docs = bpa.identify_required_documents(extracted_info, process_logger=self.process_logger)
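                # Append each detected annexure as an additional required document,
                # recording its source document and inclusive page range.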
                for annexure in annexure_hints:
                    annexure_name = annexure['annexure_name']
                    document_name = annexure['document_name']
                    page_start = annexure['page_start']
                    page_end = annexure['page_end']
                    
                    # Generate page list from start to end (inclusive)
                    pages = list(range(page_start, page_end + 1))
                    
                    # Create description
                    if page_start == page_end:
                        description = f"{annexure_name} on page {page_start} of {document_name}"
                    else:
                        description = f"{annexure_name} on pages {page_start}-{page_end} of {document_name}"
                    
                    required_docs.append({
                        "name": annexure_name,
                        "type": bpa.DocumentType.ANNEXURE.upper(),
                        "description": description,
                        "source_document": document_name,
                        "pages": pages
                    })

                # Update tender record
                self.api_client.update_tender(tender_id, {
                    "$set": {
                        "analysis":{
                            "extracted_info": extracted_info,
                            "required_docs": required_docs,
                            "analysisMetadata": {
                                "documentsProcessed": len(documents_text),
                                "analysisChunks": len(doc_analyses),
                                "totalWordCount": sum(len(text.split()) for text in documents_text.values())
                            }
                        }
                    },
                    "$push":{
                        "logs": {
                            "timestamp": time.time(),
                            "action": "tender_analysis_completed"
                        }
                    }
                })
                documents_processed = len(documents_text)
            else:
                extracted_info = tender.get("analysis", {}).get("extracted_info")
                required_docs = tender.get("analysis", {}).get("required_docs")
                documents_processed = tender.get("analysis", {}).get("analysisMetadata",{}).get("documentsProcessed")
            
            # Get company information for eligibility check via API
            company_info = self._format_company_information(tenant_id, company_id)

            self.process_logger.info(f"Company Info: {company_info}")

            self.process_logger.info(f"Company Info length: {len(company_info)} characters")

            # Check eligibility using tender_automation function
            self.process_logger.info("Starting eligibility check...")
            is_eligible, eligibility_reason = ta.check_eligibility(extracted_info, company_info, process_logger=self.process_logger)
            self.process_logger.info(f"Eligibility check completed: {is_eligible}")
            
            # Save analysis results via API
            analysis_result = {
                "extractedInfo": extracted_info,
                "required_documents": required_docs,
                "eligibility": {
                    "isEligible": is_eligible,
                    "reason": eligibility_reason
                }
            }
            
            # Update analysis with results via API
            self.api_client.update_analysis(analysis_id, {
                "$set": {
                    "status": "analysis_completed",
                    "results": analysis_result
                },
                "$push": {
                    "logs": {
                        "timestamp": time.time(),
                        "action": "rfp_analysis_completed",
                        "details": {
                            "is_eligible": is_eligible,
                            "documents_processed": documents_processed
                        }
                    }
                }
            })
            
            # Clean up temporary directory
            try:
                shutil.rmtree(temp_dir)
            except Exception as e:
                self.process_logger.warning(f"Failed to cleanup temporary directory: {e}")
            
            # Log results
            if is_eligible:
                self.process_logger.info(f"🚀 Company is eligible for tender {tender.get('bidNumber')}. Reason: {eligibility_reason}")
            else:
                self.process_logger.warning(f"⚠️ Company is NOT eligible for tender {tender.get('bidNumber')}. Reason: {eligibility_reason}")
            
            return {
                "status": "analysis_completed",
                "is_eligible": is_eligible,
                "eligibility_reason": eligibility_reason,
                "extracted_info": extracted_info,
                "required_documents":required_docs
            }
            
        except Exception as e:
            logger.error(f"Failed to analyze RFP: {e}")
            
            # Update analysis status to failed via API
            try:
                self.api_client.update_analysis(analysis_id, {
                    "$set": {"status": "analysis_failed"},
                    "$push": {
                        "logs": {
                            "timestamp": time.time(),
                            "action": "rfp_analysis_failed",
                            "details": {"error": str(e)}
                        }
                    }
                })
            except Exception as update_error:
                logger.error(f"Failed to update analysis with failed status: {update_error}")
            
            raise

    # =================================================================
    # BID GENERATION ESTIMATION METHODS
    # =================================================================

    def estimate_bid_generation_cost(self, analysis_id: str, tenant_id: str, required_docs: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Estimate the cost for bid document generation"""
        self._setup_process_context(tenant_id, analysis_id, "estimate_bid_generation")
        # Get analysis record via API
        analysis = self.api_client.get_analysis(analysis_id)
        if not analysis:
            raise ValueError(f"Analysis record not found: {analysis_id}")
        
        # Validate tenant
        if str(analysis["tenant"]) != tenant_id:
            self.process_logger.error(f"Tenant mismatch: {analysis['tenant']} vs {tenant_id}")
            raise ValueError(f"Analysis record not found: {analysis_id}")
        
        self.process_logger.info(f"✅ Analysis found and tenant validated: {analysis_id}")
        
        if analysis.get("status") != "analysis_completed":
            raise ValueError("Analysis must be completed before estimating bid generation cost")
        
        # Check eligibility
        results = analysis.get("results", {})
        eligibility = results.get("eligibility", {})
        
        # if not eligibility.get("isEligible"):
        #     return {
        #         "status": "not_eligible",
        #         "message": "Cannot generate bid documents - company is not eligible",
        #         "reason": eligibility.get("reason")
        #     }
        
        # Extract information for cost calculation
        extracted_info = results.get("extractedInfo", {})
        
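        # Cost model: flat base of 500 INR plus ~125 INR per annexure and ~100 INR per
        # custom document, each scaled by a random factor between 1.0 and 1.2.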
        # Calculate base cost for bid generation
        base_generation_cost = 500  # Base cost in INR
        
        # Filter documents to only include CUSTOM and ANNEXURE types
        billable_docs = [doc for doc in required_docs if doc.get("type","").upper() in ["CUSTOM", "ANNEXURE"]]
        
        # Count billable documents by type
        custom_docs = [doc for doc in billable_docs if doc.get("type","").upper() == "CUSTOM"]
        annexure_docs = [doc for doc in billable_docs if doc.get("type","").upper() == "ANNEXURE"]
        
        custom_count = len(custom_docs)
        annexure_count = len(annexure_docs)
        total_billable_count = len(billable_docs)
        
        # Calculate additional costs based on document types        
        additional_cost = 0
        cost_breakdown = []
        
        # Cost for each annexure document
        for annexure in annexure_docs:
            annexure_cost = 125 * random.uniform(1.0, 1.2)
            additional_cost += annexure_cost
            cost_breakdown.append({
                "document": annexure.get("name", "Annexure"),
                "type": "ANNEXURE",
                "cost": round(annexure_cost, 2)
            })
        
        # Cost for each custom document
        for custom_doc in custom_docs:
            custom_cost = 100 * random.uniform(1.0, 1.2)
            additional_cost += custom_cost
            cost_breakdown.append({
                "document": custom_doc.get("name", "Custom Document"),
                "type": "CUSTOM",
                "cost": round(custom_cost, 2)
            })
        
        estimated_cost = round(base_generation_cost + additional_cost, 2)
        
        # Create bid generation record via API
        bid_generation_record = {
            "tenant": tenant_id,
            "tender": str(analysis["tender"]),
            "analysisId": analysis_id,
            "type": "bid_generation_estimation",
            "status": "estimated",
            "estimatedCost": estimated_cost,
            "currency": "INR",
            "generationMetrics": {
                "totalBillableDocuments": total_billable_count,
                "customDocumentCount": custom_count,
                "annexureCount": annexure_count,
                "baseCost": base_generation_cost,
                "additionalCost": round(additional_cost, 2),
                "costBreakdown": cost_breakdown,
                "billableDocuments": billable_docs
            },
            "logs": [{
                "timestamp": time.time(),
                "action": "bid_generation_cost_estimation",
                "details": {
                    "estimated_cost": estimated_cost,
                    "base_cost": base_generation_cost,
                    "additional_cost": round(additional_cost, 2),
                    "billable_document_count": total_billable_count
                }
            }]
        }
        
        bid_generation = self.api_client.create_bid_generation(bid_generation_record)
        bid_gen_id = str(bid_generation["_id"])
        
        self.process_logger.info("=" * 50)
        self.process_logger.info(f"Estimated Bid Generation Cost: ₹{estimated_cost}")
        self.process_logger.info(f"Base Cost: ₹{base_generation_cost}")
        self.process_logger.info(f"Additional Cost: ₹{round(additional_cost, 2)}")
        self.process_logger.info(f"Billable Documents: {total_billable_count} (Custom: {custom_count}, Annexure: {annexure_count})")
        self.process_logger.info("=" * 50)
        
        return {
            "bid_generation_id": bid_gen_id,
            "estimated_cost": estimated_cost,
            "currency": "INR",
            "details": {
                "base_cost": base_generation_cost,
                "additional_cost": round(additional_cost, 2),
                "total_billable_documents": total_billable_count,
                "custom_document_count": custom_count,
                "annexure_count": annexure_count,
                "cost_breakdown": cost_breakdown,
                "billable_documents": billable_docs
            }
        }

    # =================================================================
    # BID DOCUMENT GENERATION METHODS
    # =================================================================

    def generate_bid_documents(self, bid_generation_id: str, tenant_id: str, required_docs: List[Dict[str, Any]], company_id: str = None) -> Dict[str, Any]:
        """Generate complete bid documents using enhanced automation"""
        self._setup_process_context(tenant_id, bid_generation_id, "bid_generation")
        self.process_logger.info(f"Company Id: {company_id}")

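        # Flow: load the bid generation, analysis and tender records, download tender and
        # company documents locally, run the bid preparation automation, upload the generated
        # documents to S3, and link both generated and referenced documents to this record.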
        # Get bid generation record via API
        bid_gen = self.api_client.get_bid_generation(bid_generation_id, tenant_id)
        if not bid_gen:
            raise ValueError(f"Bid generation record not found: {bid_generation_id}")
        
        # Get analysis record via API
        analysis_id = str(bid_gen["analysisId"])
        analysis = self.api_client.get_analysis(analysis_id)
        if not analysis:
            raise ValueError(f"Analysis record not found: {analysis_id}")
        
        # Get tender details via API
        tender_id = str(bid_gen["tender"])
        tender = self.api_client.get_tender(tender_id, tenant_id)
        if not tender:
            raise ValueError(f"Tender not found: {bid_gen['tender']}")
        
        # Update status via API
        self.api_client.update_bid_generation(bid_generation_id, {
            "$set": {"status": "generation_in_progress"},
            "$push": {
                "logs": {
                    "timestamp": time.time(),
                    "action": "bid_document_generation_started"
                }
            }
        })
        
        try:
            # Create temporary directories for processing
            temp_rfp_dir = os.path.join(self.base_dir, f"bid_gen_{bid_generation_id}")
            temp_company_docs_dir = os.path.join(self.base_dir, f"company_docs_{bid_generation_id}")
            os.makedirs(temp_rfp_dir, exist_ok=True)
            os.makedirs(temp_company_docs_dir, exist_ok=True)
            
            # Download all tender documents to the temporary directory via API
            document_ids = [str(doc_id) for doc_id in tender.get("originalDocuments", [])]
            documents = self.api_client.get_documents(document_ids)
            
            # Save tender documents to temp directory
            for doc in documents:
                try:
                    doc_content = self.api_client.get_document_content(str(doc["_id"]), tenant_id)
                    filename = doc.get("name", f"doc_{doc['_id']}")
                    
                    if not filename.endswith(".pdf") and doc.get("storageDetails", {}).get("mimeType") == "application/pdf":
                        filename = f"{filename}.pdf"
                        
                    file_path = os.path.join(temp_rfp_dir, filename)
                    
                    with open(file_path, 'wb') as f:
                        f.write(doc_content)
                        
                    self.process_logger.info(f"Saved document to {file_path}")
                    
                except Exception as e:
                    self.process_logger.error(f"Failed to save document {doc.get('_id')}: {e}")
            
            # Get and format company information for fallback descriptions
            formatted_company_info = self._format_company_information(tenant_id, company_id)
            
            # Download and organize company documents with improved logic
            downloaded_company_docs = self._download_and_organize_company_documents(
                tenant_id, temp_company_docs_dir, formatted_company_info, company_id
            )
            
            # Save analysis results to temp directory
            analysis_dir = os.path.join(temp_rfp_dir, "tender_analysis")
            os.makedirs(analysis_dir, exist_ok=True)
            
            analysis_file = os.path.join(analysis_dir, "tender_analysis.json")
            with open(analysis_file, 'w', encoding='utf-8') as f:
                json.dump(analysis.get("results", {}).get("extractedInfo", {}), f, indent=4)
            
            # Use enhanced bid preparation automation with company docs from S3
            self.process_logger.info(f"Generating bid documents for tender {tender.get('bidNumber')}")
            final_docs_dir, document_status, referenced_company_docs = bpa.prepare_bid_documents(temp_rfp_dir, temp_company_docs_dir, formatted_company_info, required_docs, process_logger = self.process_logger)

            # Process the generated documents and save to S3
            generated_document_ids = self._save_generated_documents_to_s3(
                final_docs_dir, bid_generation_id, tenant_id, tender.get("bidNumber", "")
            )

            # Create database entries for referenced company documents (without uploading to S3)
            referenced_document_ids = self._create_referenced_document_entries(
                referenced_company_docs, bid_generation_id, tenant_id, tender.get("bidNumber", ""), downloaded_company_docs
            )
            
            # Clean up temporary directories
            try:
                shutil.rmtree(temp_rfp_dir)
                shutil.rmtree(temp_company_docs_dir)
            except Exception as e:
                self.process_logger.warning(f"Failed to cleanup temporary directories: {e}")
            
            # Update bid generation record via API with SIMPLIFIED data structure
            simplified_company_docs = []
            company_docs_urls = []
            for doc in downloaded_company_docs:
                file_url = doc.get("file_url", "")
                simplified_doc = {
                    "filename": doc.get("filename", ""),
                    "category": doc.get("category", ""),
                    "source": doc.get("source", ""),
                    "type": doc.get("type", ""),
                    "file_url": file_url
                }
                simplified_company_docs.append(simplified_doc)
                # if file_url:
                #     company_docs_urls.append(file_url)

            self.process_logger.info(f"company document urls {company_docs_urls}")

            # Combine generated and referenced document IDs
            all_document_ids = generated_document_ids + referenced_document_ids

            self.api_client.update_bid_generation(bid_generation_id, {
                "$addToSet": {
                    "generatedDocuments": all_document_ids,
                    "companyDocumentUrls": company_docs_urls
                },
                "$set": {
                    "status": "generation_completed",
                    "outputDirectory": final_docs_dir,
                    "documentStatus": document_status
                },
                "$push": {
                    "logs": {
                        "timestamp": time.time(),
                        "action": "bid_document_generation_completed",
                        "details": {
                            "document_count": len(all_document_ids),
                            "generated_count": len(generated_document_ids),
                            "referenced_count": len(referenced_document_ids),
                            "output_directory": final_docs_dir,
                            "company_docs_count": len(downloaded_company_docs)
                        }
                    }
                }
            })

            self.process_logger.info(f"Successfully generated {len(generated_document_ids)} bid documents and referenced {len(referenced_document_ids)} company documents")

            return {
                "status": "completed",
                "generated_documents": all_document_ids,
                "company_document_urls": company_docs_urls,
                "output_directory": final_docs_dir,
                "document_count": len(all_document_ids),
                "message": f"Generated {len(generated_document_ids)} bid documents and referenced {len(referenced_document_ids)} company documents successfully"
            }
            
        except Exception as e:
            logger.error(f"Failed to generate bid documents: {e}")
            
            # Clean up temporary directories on error
            try:
                if 'temp_rfp_dir' in locals() and os.path.exists(temp_rfp_dir):
                    shutil.rmtree(temp_rfp_dir)
                if 'temp_company_docs_dir' in locals() and os.path.exists(temp_company_docs_dir):
                    shutil.rmtree(temp_company_docs_dir)
            except Exception as cleanup_error:
                logger.warning(f"Failed to cleanup temporary directories after error: {cleanup_error}")
            
            # Update status to failed via API
            self.api_client.update_bid_generation(bid_generation_id, {
                "$set": {"status": "generation_failed"},
                "$push": {
                    "logs": {
                        "timestamp": time.time(),
                        "action": "bid_document_generation_failed",
                        "details": {"error": str(e)}
                    }
                }
            })
            
            raise

    # =================================================================
    # NEW: REGENERATE BID DOCUMENTS METHODS
    # =================================================================
    
    def regenerate_bid_document(self, bid_generation_id: str, tenant_id: str, 
                            document_id: str, user_prompt: str, company_id: str = None) -> Dict[str, Any]:
        """Regenerate a specific bid document based on user feedback"""
        self._setup_process_context(tenant_id, bid_generation_id, "regenerate_bid_document")
        
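        # Flow: download the original document plus the tender documents for context, re-run
        # single-document generation with the user's prompt via bpa.regenerate_document, then
        # upload the regenerated file to S3 and attach it to the bid generation record.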
        try:
            # Get bid generation record via API
            bid_gen = self.api_client.get_bid_generation(bid_generation_id, tenant_id)
            if not bid_gen:
                raise ValueError(f"Bid generation record not found: {bid_generation_id}")
            
            # Get analysis record via API
            analysis_id = str(bid_gen["analysisId"])
            analysis = self.api_client.get_analysis(analysis_id)
            if not analysis:
                raise ValueError(f"Analysis record not found: {analysis_id}")
            
            # Get tender details via API
            tender_id = str(bid_gen["tender"])
            tender = self.api_client.get_tender(tender_id, tenant_id)
            if not tender:
                raise ValueError(f"Tender not found: {bid_gen['tender']}")
            
            # Get the document to regenerate via API
            documents = self.api_client.get_documents([document_id])
            if not documents:
                raise ValueError(f"Document not found: {document_id}")
            
            document = documents[0]
            original_filename = document.get("name", f"doc_{document_id}")
            
            self.process_logger.info(f"Regenerating document: {original_filename} (ID: {document_id})")
            
            # Create temporary directory for processing
            temp_rfp_dir = os.path.join(self.base_dir, f"regenerate_{bid_generation_id}")
            os.makedirs(temp_rfp_dir, exist_ok=True)
            
            try:
                # Download the document to regenerate locally
                doc_content = self.api_client.get_document_content(document_id, tenant_id)
                
                # Ensure proper file extension
                # if not original_filename.lower().endswith(('.pdf', '.docx', '.doc')):
                #     # Try to determine extension from content type or default to .pdf
                #     storage_details = document.get("storageDetails", {})
                #     mime_type = storage_details.get("mimeType", "")
                #     if "pdf" in mime_type:
                #         original_filename += ".pdf"
                #     elif "word" in mime_type:
                #         original_filename += ".docx"
                #     else:
                #         original_filename += ".pdf"  # Default to PDF
                
                local_document_path = os.path.join(temp_rfp_dir, original_filename)
                
                with open(local_document_path, 'wb') as f:
                    f.write(doc_content)
                
                self.process_logger.info(f"Downloaded document locally: {local_document_path}")
                
                # Download all tender documents for context
                document_ids = [str(doc_id) for doc_id in tender.get("originalDocuments", [])]
                tender_documents = self.api_client.get_documents(document_ids)
                
                # Save tender documents to temp directory for context
                for doc in tender_documents:
                    try:
                        doc_content = self.api_client.get_document_content(str(doc["_id"]), tenant_id)
                        filename = doc.get("name", f"doc_{doc['_id']}")
                        
                        if not filename.endswith(".pdf") and doc.get("storageDetails", {}).get("mimeType") == "application/pdf":
                            filename = f"{filename}.pdf"
                            
                        file_path = os.path.join(temp_rfp_dir, filename)
                        
                        # Skip if it's the same file we already downloaded
                        if file_path != local_document_path:
                            with open(file_path, 'wb') as f:
                                f.write(doc_content)
                            
                    except Exception as e:
                        self.process_logger.error(f"Failed to save context document {doc.get('_id')}: {e}")
                
                # Save analysis results to temp directory
                analysis_dir = os.path.join(temp_rfp_dir, "tender_analysis")
                os.makedirs(analysis_dir, exist_ok=True)
                
                analysis_file = os.path.join(analysis_dir, "tender_analysis.json")
                with open(analysis_file, 'w', encoding='utf-8') as f:
                    json.dump(analysis.get("results", {}).get("extractedInfo", {}), f, indent=4)
                
                # Get company information via API and format it
                company_info = self._format_company_information(tenant_id, company_id)
                
                # Create final docs directory for regenerated documents
                final_docs_dir = os.path.join(temp_rfp_dir, "regenerated_docs")
                os.makedirs(final_docs_dir, exist_ok=True)
                
                # Create nested regenerated directory that BPA expects
                regenerated_subdir = os.path.join(final_docs_dir, "regenerated")
                os.makedirs(regenerated_subdir, exist_ok=True)
                
                self.process_logger.info(f"Created output directories: {final_docs_dir} and {regenerated_subdir}")
                
                regen_cost = 0.00
                
                # Use the regenerate_document function from bid_prep_automation with LOCAL path
                self.process_logger.info(f"Calling BPA regenerate_document with path: {local_document_path}")
                regenerated_doc_path, regen_cost = bpa.regenerate_document(
                    org_doc_file_path=local_document_path,  # Use local path instead of remote path
                    user_prompt=user_prompt,
                    company_info=company_info,
                    final_docs_dir=final_docs_dir,
                    extracted_info=analysis.get("results", {}).get("extractedInfo", {})
                )
                
                if not regenerated_doc_path:
                    raise Exception("Failed to regenerate document - no output path returned")
                
                if not os.path.exists(regenerated_doc_path):
                    # List contents of final_docs_dir for debugging
                    try:
                        self.process_logger.error(f"Regenerated file not found at: {regenerated_doc_path}")
                        self.process_logger.error(f"Contents of {final_docs_dir}:")
                        for root, dirs, files in os.walk(final_docs_dir):
                            for file in files:
                                file_path = os.path.join(root, file)
                                self.process_logger.error(f"  Found file: {file_path}")
                    except Exception as debug_error:
                        self.process_logger.error(f"Error listing directory contents: {debug_error}")
                    
                    raise Exception(f"Regenerated document not found at expected path: {regenerated_doc_path}")
                
                self.process_logger.info(f"Successfully regenerated document at: {regenerated_doc_path}")
                
                # Upload regenerated document to S3
                with open(regenerated_doc_path, 'rb') as f:
                    content = f.read()
                
                filename = os.path.basename(regenerated_doc_path)
                bid_number = tender.get("bidNumber", str(tender_id))
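                # Replace path separators so the bid number can be embedded safely in the S3 key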
                bid_number = re.sub(r'[\/\\]', '_', bid_number)
                s3_key = f"bid_documents/{tenant_id}/{bid_number}/regenerated/{filename}"
                
                content_type = self._get_content_type(filename)
                # Handle HTML files specifically
                if filename.lower().endswith(('.html', '.htm')):
                    content_type = 'text/html'
                storage_details = self.api_client.upload_file(
                    content, s3_key, content_type, tenant_id
                )
                
                # Create document record via API
                document_record = {
                    "tenant": tenant_id,
                    "name": filename,
                    "type": "bid_document_regenerated",
                    "category": "regenerated",
                    "storageType": "s3",
                    "storageDetails": storage_details,
                    "metadata": {
                        "bidGenerationId": bid_generation_id,
                        "bidNumber": bid_number,
                        "originalDocumentId": document_id,  # Store original document ID
                        "originalDocumentName": original_filename,
                        "userPrompt": user_prompt,
                        "regeneratedAt": time.time()
                    }
                }
                
                doc = self.api_client.create_document(document_record)
                regenerated_doc_id = str(doc["_id"])
                
                # Update bid generation record with regenerated document
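                # $addToSet prevents duplicate entries if the same document is regenerated more than once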
                self.api_client.update_bid_generation(bid_generation_id, {
                    "$addToSet": {"regeneratedDocuments": regenerated_doc_id},
                    "$push": {
                        "logs": {
                            "timestamp": time.time(),
                            "action": "document_regenerated",
                            "details": {
                                "document_id": regenerated_doc_id,
                                "original_document_id": document_id,
                                "original_filename": original_filename,
                                "user_prompt": user_prompt
                            }
                        }
                    }
                })
                
                self.process_logger.info(f"Successfully regenerated document: {filename}")
                
                return {
                    "status": "completed",
                    "regenerated_document_id": regenerated_doc_id,
                    "original_document_id": document_id,
                    "original_filename": original_filename,
                    "regenerated_filename": filename,
                    "s3_key": s3_key,
                    "message": f"Document regenerated successfully: {filename}"
                }
                
            finally:
                # Clean up temporary directory
                try:
                    shutil.rmtree(temp_rfp_dir)
                except Exception as e:
                    self.process_logger.warning(f"Failed to cleanup temporary directory: {e}")
            
        except Exception as e:
            logger.error(f"Failed to regenerate bid document: {e}")
            
            # Clean up temporary directory on error
            try:
                if 'temp_rfp_dir' in locals() and os.path.exists(temp_rfp_dir):
                    shutil.rmtree(temp_rfp_dir)
            except Exception:
                pass
            
            raise

    def upload_company_documents_to_s3(self, tenant_id: str, local_docs_path: str) -> Dict[str, Any]:
        """Upload company documents to S3 for future use in bid generation"""
        # Validate local path
        if not os.path.exists(local_docs_path):
            raise ValueError(f"Local documents path does not exist: {local_docs_path}")
        
        # Call API to upload company documents
        result = self.api_client.upload_company_documents(tenant_id, local_docs_path)
        
        return {
            "status": "completed" if not result["upload_errors"] else "partial",
            "uploaded_files": result["uploaded_files"],
            "upload_errors": result["upload_errors"],
            "uploaded_count": result["uploaded_count"],
            "error_count": result["error_count"],
            "message": f"Uploaded {result['uploaded_count']} company documents to S3"
        }

    # =================================================================
    # CHAT WITH RFP METHODS
    # =================================================================

    def setup_rfp_chat(self, analysis_id: str, tenant_id: str) -> Dict[str, Any]:
        """Setup chat functionality for RFP queries using enhanced search"""
        try:
            # Set up process context
            self._setup_process_context(tenant_id, analysis_id, "setup_rfp_chat")

            # Get analysis record via API
            analysis = self.api_client.get_analysis(analysis_id)
            if not analysis:
                raise ValueError(f"Analysis record not found: {analysis_id}")
            
            # Validate tenant
            if str(analysis["tenant"]) != tenant_id:
                raise ValueError(f"Analysis record not found: {analysis_id}")
            
            # Get tender details via API
            tender_id = str(analysis["tender"])
            tender = self.api_client.get_tender(tender_id, tenant_id)
            if not tender:
                raise ValueError(f"Tender not found: {analysis['tender']}")
            
            try:
                # Create directory for processing
                temp_dir = os.path.join(self.base_dir, f"chat_setup_{analysis_id}")
                os.makedirs(temp_dir, exist_ok=True)
                
                # Download all tender documents via API
                document_ids = [str(doc_id) for doc_id in tender.get("originalDocuments", [])]
                documents = self.api_client.get_documents(document_ids)
                
                for doc in documents:
                    doc_content = self.api_client.get_document_content(str(doc["_id"]), tenant_id)
                    filename = doc.get("name", f"doc_{doc['_id']}")
                    
                    if not filename.endswith(".pdf"):
                        filename = f"{filename}.pdf"
                        
                    file_path = os.path.join(temp_dir, filename)
                    
                    with open(file_path, 'wb') as f:
                        f.write(doc_content)
                
                # Process documents for chat using bid_queries module
                self.process_logger.info(f"Setting up chat functionality for analysis {analysis_id}")
                data_file = os.path.join(temp_dir, "tender_analysis", "chunks.json")
                logger.info(f"Chunk Data file path {data_file}")
                if not os.path.exists(data_file):
                    status = bq.process_task(temp_dir)
                else:
                    status = True

                if status:
                    # Setup Elasticsearch index for chat
                    bid_id = f"analysis_{analysis_id}"
                    index_config_file = "index_es.json"
                    
                    
                    # Create index config if it doesn't exist
                    if not os.path.exists(index_config_file):
                        self._create_default_index_config(index_config_file)
                    
                    # Convert Excel chunks to JSON if needed
                    if not os.path.exists(data_file):
                        self._convert_chunks_to_json(temp_dir, data_file)
                    
                    # Validate chunks data before indexing
                    if self._validate_chunks_data(data_file):
                        # Setup Elasticsearch index with error handling
                        tag_responses = chat.es_index_data(bid_id, index_config_file, data_file)
                        
                        # Upload chunks file to S3
                        with open(data_file, 'rb') as f:
                            content = f.read()
                        s3_key = f"tender_analysis/{bid_id}/chunks.json"
                        # Upload to S3 via API
                        content_type = 'application/json'
                        storage_details = self.api_client.upload_file(
                            content, s3_key, content_type, tenant_id
                        )
                        
                        # Update analysis record with chat setup info via API
                        self.api_client.update_analysis(analysis_id, {
                            "$set": {
                                "chatSetup": {
                                    "bidId": bid_id,
                                    "setupCompleted": True,
                                    "chunksPath": storage_details['url'],
                                    "indexConfig": index_config_file
                                }
                            },
                            "$push": {
                                "logs": {
                                    "timestamp": time.time(),
                                    "action": "chat_setup_completed",
                                    "details": {"bid_id": bid_id}
                                }
                            }
                        })
                        
                        return {
                            "status": "completed",
                            "bid_id": bid_id,
                            "chunks_path": data_file,
                            "message": "Chat functionality setup successfully"
                        }
                    else:
                        raise Exception("Failed to validate chunks data")
                else:
                    raise Exception("Failed to process documents for chat setup")
                    
            except Exception as e:
                self.process_logger.error(f"Failed to setup RFP chat: {e}")
                raise

        except Exception as e:
            logger.error(f"Failed to setup RFP chat: {e}")
            raise

    # Background function for actual chat setup
    def setup_rfp_chat_background(self, analysis_id: str, tenant_id: str):
        """Background task for setting up chat functionality"""

        # Get analysis record via API
        analysis = self.api_client.get_analysis(analysis_id)
        if not analysis:
            raise ValueError(f"Analysis record not found: {analysis_id}")
        
        # Get tender details via API
        tender_id = str(analysis["tender"])
        tender = self.api_client.get_tender(tender_id, tenant_id)
        if not tender:
            raise ValueError(f"Tender not found: {analysis['tender']}")

        # temp_dir = os.path.join(self.base_dir, f"analysis_cost_{tender_id}")
        # output_dir = os.path.join(temp_dir, "analysis_cost_output")
        # os.makedirs(output_dir, exist_ok=True)
        # merged_txt_file = os.path.join(output_dir, "merged.txt")

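        # merged.txt holds the combined text of the tender documents (presumably produced by the earlier analysis step) and drives the size check below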
        s3_key = f"tender_analysis/{tender_id}/merged.txt"
        merged_txt_file = self.api_client.download_file(s3_key, tenant_id, False)
        # Decode bytes to string
        if isinstance(merged_txt_file, bytes):
            merged_txt_file = merged_txt_file.decode("utf-8")

        bid_id = f"analysis_{tender_id}"
        if not tender.get("chatSetup") or not tender["chatSetup"].get("setupCompleted"):

            if bq.count_words(merged_txt_file) < 6000:
                logger.info("RFP documents are small; no chunking required")

                # Update tender record
                self.api_client.update_tender(tender_id, {
                    "$set": {
                        "chatSetup": {
                            "bidId": bid_id,
                            "setupCompleted": True,
                            "setupInProgress": False,
                            "chunksPath": "",
                            "indexConfig": "",
                            "setupCompletedAt": time.time()
                        }
                    },
                    "$push":{
                        "logs": {
                            "timestamp": time.time(),
                            "action": "chat_setup_completed"
                        }
                    }
                })

                # Update analysis record with successful setup
                self.api_client.update_analysis(analysis_id, {
                    "$push": {
                        "logs": {
                            "timestamp": time.time(),
                            "action": "chat_setup_completed",
                            "details": {
                                "bid_id": bid_id,
                                "indexed_chunks": 1,
                                "success": True
                            }
                        }
                    }
                })

                return
            else:
                # The RFP is long, so split it into chunks and index them for retrieval
                try:
                    logger.info(f"Starting background chat setup for analysis {analysis_id}")
                    
                    # Set up process context
                    self._setup_process_context(tenant_id, analysis_id, "setup_rfp_chat_bg")
                    
                    # Create directory for processing
                    temp_dir = os.path.join(self.base_dir, f"chat_setup_{analysis_id}")
                    os.makedirs(temp_dir, exist_ok=True)
                    
                    # Download all tender documents via API
                    document_ids = [str(doc_id) for doc_id in tender.get("originalDocuments", [])]
                    documents = self.api_client.get_documents(document_ids)
                    
                    self.process_logger.info(f"Downloading {len(documents)} documents...")
                    
                    for doc in documents:
                        doc_content = self.api_client.get_document_content(str(doc["_id"]), tenant_id)
                        filename = doc.get("name", f"doc_{doc['_id']}")
                        
                        if not filename.endswith(".pdf"):
                            filename = f"{filename}.pdf"
                            
                        file_path = os.path.join(temp_dir, filename)
                        
                        with open(file_path, 'wb') as f:
                            f.write(doc_content)
                    
                    # Process documents for chat using bid_queries module
                    self.process_logger.info(f"Processing documents for chat setup...")
                    data_file = os.path.join(temp_dir, "tender_analysis", "chunks.json")

                    logger.info(f"Chunk Data file path {data_file}")
                    if not os.path.exists(data_file):
                        status = bq.process_task(temp_dir)
                    else:
                        status = True
                    
                    if status:
                        # Setup Elasticsearch index for chat
                        index_config_file = "index_es.json"
                        
                        # Create index config if it doesn't exist
                        if not os.path.exists(index_config_file):
                            self._create_default_index_config(index_config_file)
                        
                        # Convert Excel chunks to JSON if needed
                        if not os.path.exists(data_file):
                            self._convert_chunks_to_json(temp_dir, data_file)
                        
                        self.process_logger.info("Starting Elasticsearch indexing...")
                        
                        # Validate and setup Elasticsearch index with error handling
                        if self._validate_chunks_data(data_file):
                            try:
                                tag_responses = chat.es_index_data(bid_id, index_config_file, data_file)

                                # Upload chunks file to S3
                                with open(data_file, 'rb') as f:
                                    content = f.read()
                                s3_key = f"tender_analysis/{bid_id}/chunks.json"
                                # Upload to S3 via API
                                content_type = 'application/json'
                                storage_details = self.api_client.upload_file(
                                    content, s3_key, content_type, tenant_id
                                )
                                
                                # Update tender record
                                self.api_client.update_tender(tender_id, {
                                    "$set": {
                                        "chatSetup": {
                                            "bidId": bid_id,
                                            "setupCompleted": True,
                                            "setupInProgress": False,
                                            "chunksPath": storage_details['url'],
                                            "indexConfig": index_config_file,
                                            "setupCompletedAt": time.time()
                                        }
                                    },
                                    "$push":{
                                        "logs": {
                                            "timestamp": time.time(),
                                            "action": "chat_setup_completed"
                                        }
                                    }
                                })

                                # Update analysis record with successful setup
                                self.api_client.update_analysis(analysis_id, {
                                    "$push": {
                                        "logs": {
                                            "timestamp": time.time(),
                                            "action": "chat_setup_completed",
                                            "details": {
                                                "bid_id": bid_id,
                                                "indexed_chunks": len(tag_responses),
                                                "success": True
                                            }
                                        }
                                    }
                                })
                                
                                self.process_logger.info(f"Chat setup completed successfully for analysis {analysis_id}")
                                
                            except Exception as es_error:
                                # Handle Elasticsearch errors gracefully
                                self.process_logger.error(f"Elasticsearch indexing failed: {es_error}")
                                
                                # Update tender record
                                self.api_client.update_tender(tender_id, {
                                "$set": {
                                        "chatSetup": {
                                            "bidId": bid_id,
                                            "setupCompleted": False,
                                            "setupInProgress": False,
                                            "setupFailed": True,
                                            "errorMessage": str(es_error),
                                            "failedAt": time.time()
                                        }
                                    },
                                    "$push": {
                                        "logs": {
                                            "timestamp": time.time(),
                                            "action": "chat_setup_failed",
                                            "details": {
                                                "error": str(es_error),
                                                "stage": "elasticsearch_indexing"
                                            }
                                        }
                                    }
                                })

                                # Update analysis with partial failure
                                self.api_client.update_analysis(analysis_id, {
                                    "$push": {
                                        "logs": {
                                            "timestamp": time.time(),
                                            "action": "chat_setup_failed",
                                            "details": {
                                                "error": str(es_error),
                                                "stage": "elasticsearch_indexing"
                                            }
                                        }
                                    }
                                })
                        else:
                            raise Exception("Failed to validate chunks data")
                    else:
                        raise Exception("Failed to process documents for chat setup")
                        
                    # Clean up temporary directory
                    try:
                        shutil.rmtree(temp_dir)
                    except Exception as cleanup_error:
                        self.process_logger.warning(f"Failed to cleanup temp directory: {cleanup_error}")
                except Exception as e:
                    logger.error(f"Background chat setup failed for analysis {analysis_id}: {e}")
                    
                    try:

                        # Update tender record
                        self.api_client.update_tender(tender_id, {
                            "$set": {
                                "chatSetup": {
                                    "bidId": f"analysis_{tender_id}",
                                    "setupCompleted": False,
                                    "setupInProgress": False,
                                    "setupFailed": True,
                                    "errorMessage": str(e),
                                    "failedAt": time.time()
                                }
                            },
                            "$push": {
                                "logs": {
                                    "timestamp": time.time(),
                                    "action": "chat_setup_failed",
                                    "details": {
                                        "error": str(e),
                                        "stage": "background_processing"
                                    }
                                }
                            }
                        })
                        # Update analysis with failure status
                        self.api_client.update_analysis(analysis_id, {
                            "$push": {
                                "logs": {
                                    "timestamp": time.time(),
                                    "action": "chat_setup_failed",
                                    "details": {
                                        "error": str(e),
                                        "stage": "background_processing"
                                    }
                                }
                            }
                        })
                    except Exception as update_error:
                        logger.error(f"Failed to update analysis with error status: {update_error}")
        else:
            logger.info("Chat setup already completed. Reusing the same.")
            # Update analysis record with successful setup
            self.api_client.update_analysis(analysis_id, {
                "$push": {
                    "logs": {
                        "timestamp": time.time(),
                        "action": "chat_setup_completed",
                        "details": {
                            "bid_id": bid_id,
                            "success": True
                        }
                    }
                }
            })
            
            self.process_logger.info(f"Chat setup completed successfully for analysis {analysis_id}")

    def chat_with_rfp(self, analysis_id: str, tenant_id: str, tender_id: str, query: str, client_id: str = None) -> str:
        """Chat with RFP documents using enhanced search and Claude"""
        # Get analysis record via API
        analysis = self.api_client.get_analysis(analysis_id)
        if not analysis:
            raise ValueError(f"Analysis record not found: {analysis_id}")
        
        # Validate tenant
        if str(analysis["tenant"]) != tenant_id:
            raise ValueError(f"Analysis record not found: {analysis_id}")
        
        # temp_dir = os.path.join(self.base_dir, f"analysis_cost_{tender_id}")
        # output_dir = os.path.join(temp_dir, "analysis_cost_output")
        # os.makedirs(output_dir, exist_ok=True)
        # merged_txt_file = os.path.join(output_dir, "merged.txt")

        s3_key = f"tender_analysis/{tender_id}/merged.txt"
        merged_txt_file = self.api_client.download_file(s3_key, tenant_id, False)
        # Decode bytes to string
        if isinstance(merged_txt_file, bytes):
            merged_txt_file = merged_txt_file.decode("utf-8")

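        # Same ~6000-word heuristic as chat setup: small RFPs are answered directly from the merged text, without Elasticsearch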
        no_es_query = bq.count_words(merged_txt_file) < 6000

        # Get tender record
        tender = self.api_client.get_tender(tender_id, tenant_id)
        if not tender:
            raise ValueError(f"Tender not found: {tender_id}")

        # Check if chat is setup
        chat_setup = tender.get("chatSetup", {})
        if not chat_setup.get("setupCompleted"):
            chat_setup = analysis.get("chatSetup", {})
            if not chat_setup:
                raise ValueError("Chat functionality not setup. Please setup chat first.")
            else:
                # update tender
                self.api_client.update_tender(tender_id, {
                    "$set": {
                        "chatSetup": chat_setup
                    },
                    "$push":{
                        "logs": {
                            "timestamp": time.time(),
                            "action": "chat_setup_completed"
                        }
                    }
                })
                        
        
        bid_id = chat_setup.get("bidId")
        if not client_id:
            client_id = f"tenant_{tenant_id}"
        
        try:
            temp_dir = os.path.join(self.base_dir, f"chat_setup_{analysis_id}")
            #bid_id = f"analysis_{tender_id}"
            data_file = os.path.join(temp_dir, "tender_analysis", "chunks.json")

            # Download chunks.json from S3 if it is not already present locally
            s3_key = f"tender_analysis/{bid_id}/chunks.json"
            if not os.path.exists(data_file):
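                # The local workspace may have been cleaned up (e.g. after a worker restart), so re-fetch the chunks from S3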
                output_dir = os.path.join(temp_dir, "tender_analysis")
                os.makedirs(output_dir, exist_ok=True)
                doc_content = self.api_client.download_file(s3_key, tenant_id, False)
                with open(data_file, 'wb') as f:
                    f.write(doc_content)
                    
            # Use the chat_with_rfp function
            response = chat.chat_with_rfp(query, bid_id, client_id, data_file, no_es_query, merged_txt_file)
            
            # Log the interaction via API
            self.api_client.update_analysis(analysis_id, {
                "$push": {
                    "chatHistory": {
                        "timestamp": time.time(),
                        "clientId": client_id,
                        "query": query,
                        "response": response
                    }
                }
            })
            
            return response
            
        except Exception as e:
            logger.error(f"Failed to process chat query: {e}")
            raise

    # =================================================================
    # PDF MERGER AND COMPRESSION METHODS
    # =================================================================


    def merge_bid_documents(self, bid_generation_id: str, tenant_id: str,
                            generated_document_ids: List[str],
                            document_categories: Optional[List[str]] = None,
                            add_page_numbers: bool = True) -> Dict[str, Any]:
            """
            Merge bid documents into a single PDF with index
            
            Args:
                bid_generation_id: Bid generation record ID
                tenant_id: Tenant ID
                generated_document_ids: List of generated document IDs to merge
                document_categories: List of document categories to include (optional)
                add_page_numbers: Whether to add page numbers to merged PDF
                
            Returns:
                Dict containing merged document information
            """
            try:
                self._setup_process_context(tenant_id, bid_generation_id, "merge_bid_documents")
                
                # Validate input parameters
                if not generated_document_ids:
                    raise ValueError("No generated document IDs provided")

                # Get bid generation record for metadata (tender info, etc.)
                bid_gen = self.api_client.get_bid_generation(bid_generation_id, tenant_id)
                if not bid_gen:
                    raise ValueError(f"Bid generation record not found: {bid_generation_id}")
                
                # Use the provided document IDs instead of getting from database
                generated_doc_ids = generated_document_ids
                
                # Get document records
                documents = self.api_client.get_documents(generated_doc_ids)
                # Filter by categories if specified
                # if document_categories:
                #     documents = [doc for doc in documents if doc.get("category") in document_categories]
                
                # Process ALL documents (not just PDFs)
                processable_documents = documents
                
                if not processable_documents:
                    raise ValueError("No documents found to merge")
                
                self.process_logger.info(f"Merging {len(processable_documents)} documents (will convert non-PDFs to PDF)")
                
                # Create temporary directory for processing
                temp_dir = os.path.join(self.base_dir, f"merge_{bid_generation_id}")
                conversion_dir = os.path.join(temp_dir, "converted")
                os.makedirs(temp_dir, exist_ok=True)
                os.makedirs(conversion_dir, exist_ok=True)
                # Create the 'processed' subdirectory if it doesn't exist
                processed_dir = os.path.join(temp_dir, "processed")
                os.makedirs(processed_dir, exist_ok=True)
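                # Only the files staged in processed_dir are handed to pdf_merger below, so this directory defines exactly what gets merged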

                try:
                    # Download and convert all documents to PDF
                    converted_pdf_files = []
                    for doc in processable_documents:
                        try:
                            doc_content = self.api_client.get_document_content(str(doc["_id"]), tenant_id)
                            filename = doc.get("name", f"doc_{doc['_id']}")
                            
                            # Download original file
                            original_file_path = os.path.join(temp_dir, filename)
                            with open(original_file_path, 'wb') as f:
                                f.write(doc_content)
                            
                            # Preprocess HTML files to remove markdown code block syntax
                            if filename.lower().endswith('.html'):
                                self._clean_html_file(original_file_path)
                            
                            # Convert to PDF if needed
                            converted_pdf_path = self._convert_to_pdf(original_file_path, conversion_dir)
                            
                            if converted_pdf_path and os.path.exists(converted_pdf_path):
                                converted_pdf_files.append(converted_pdf_path)
                                self.process_logger.info(f"Successfully processed: {filename}")
                            else:
                                self.process_logger.warning(f"Failed to convert document: {filename}")
                            
                        except Exception as e:
                            self.process_logger.error(f"Failed to process document {doc.get('_id')}: {e}")
                    
                    if not converted_pdf_files:
                        raise ValueError("No documents were successfully converted to PDF")
                    
                    # Copy converted PDFs to main temp directory for merging
                    for index, pdf_file in enumerate(converted_pdf_files):
                        # Extract the base filename (e.g., "document.pdf")
                        basename = os.path.basename(pdf_file)
                        new_filename = f"{index + 100}_{basename}" # Start numbering from 100

                        # Construct the destination path using the new filename
                        dest_path = os.path.join(processed_dir, new_filename)

                        # Copy the file to the destination with the new name
                        shutil.copy(pdf_file, dest_path)

                    # Generate output filename
                    tender_id = str(bid_gen.get("tender", ""))
                    tender = self.api_client.get_tender(tender_id, tenant_id)
                    bid_number = tender.get("bidNumber", "bid") if tender else "bid"
                    bid_number = re.sub(r'[\/\\]', '_', bid_number)

                    # Create timestamp in milliseconds
                    timestamp = int(time.time() * 1000)
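                    # A millisecond timestamp keeps repeated merges from colliding on the same filename / S3 key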
                    
                    output_filename = f"{bid_number}_merged_documents_{timestamp}.pdf"
                    output_path = os.path.join(processed_dir, output_filename)
                    
                    # Merge PDFs using pdf_merger
                    self.process_logger.info("Starting PDF merge process...")
                    pdf_merger.merge_pdfs_with_index(processed_dir, output_filename, add_page_numbers)
                    
                    if not os.path.exists(output_path):
                        raise Exception("PDF merge failed - output file not created")
                    
                    # Upload merged PDF to S3
                    with open(output_path, 'rb') as f:
                        merged_content = f.read()
                    
                    s3_key = f"bid_documents/{tenant_id}/{bid_number}/merged/{output_filename}"
                    storage_details = self.api_client.upload_file(
                        merged_content, s3_key, 'application/pdf', tenant_id
                    )
                    
                    # Create document record
                    document_record = {
                        "tenant": tenant_id,
                        "name": output_filename,
                        "type": "bid_document_merged",
                        "category": "merged",
                        "storageType": "s3",
                        "storageDetails": storage_details,
                        "metadata": {
                            "bidGenerationId": bid_generation_id,
                            "bidNumber": bid_number,
                            "mergedDocumentCount": len(processable_documents),
                            "convertedDocumentCount": len(converted_pdf_files),
                            "addedPageNumbers": add_page_numbers,
                            "mergedCategories": document_categories or "all",
                            "mergedAt": time.time(),
                            "providedDocumentIds": generated_document_ids
                        }
                    }
                    
                    doc = self.api_client.create_document(document_record)
                    merged_doc_id = str(doc["_id"])
                    
                    # Update bid generation record
                    self.api_client.update_bid_generation(bid_generation_id, {
                        "$addToSet": {"mergedDocuments": merged_doc_id},
                        "$push": {
                            "logs": {
                                "timestamp": time.time(),
                                "action": "documents_merged",
                                "details": {
                                    "merged_document_id": merged_doc_id,
                                    "document_count": len(processable_documents),
                                    "converted_count": len(converted_pdf_files),
                                    "output_filename": output_filename,
                                    "provided_document_ids": generated_document_ids
                                }
                            }
                        }
                    })
                    
                    self.process_logger.info(f"Successfully merged {len(processable_documents)} documents ({len(converted_pdf_files)} converted to PDF) into {output_filename}")
                    
                    return {
                        "status": "completed",
                        "merged_document_id": merged_doc_id,
                        "filename": output_filename,
                        "document_count": len(processable_documents),
                        "converted_count": len(converted_pdf_files),
                        "file_size": len(merged_content),
                        "s3_url": storage_details.get("url"),
                        "message": f"Successfully merged {len(processable_documents)} documents (converted {len(converted_pdf_files)} to PDF)"
                    }
                    
                finally:
                    # Clean up temporary directory
                    try:
                        shutil.rmtree(temp_dir)  # also removes the processed/ and converted/ subdirectories
                    except Exception as e:
                        self.process_logger.warning(f"Failed to cleanup temp directory: {e}")
                
            except Exception as e:
                self.process_logger.error(f"Failed to merge bid documents: {e}")
                raise

    # def merge_bid_documents(self, bid_generation_id: str, tenant_id: str,
    #                             generated_document_ids: List[str],
    #                             document_categories: List[str] = None, 
    #                             add_page_numbers: bool = True) -> Dict[str, Any]:
    #         """
    #         Merge bid documents into a single PDF with index
            
    #         Args:
    #             bid_generation_id: Bid generation record ID
    #             tenant_id: Tenant ID
    #             generated_document_ids: List of generated document IDs to merge
    #             document_categories: List of document categories to include (optional)
    #             add_page_numbers: Whether to add page numbers to merged PDF
                
    #         Returns:
    #             Dict containing merged document information
    #         """
    #         try:
    #             self._setup_process_context(tenant_id, bid_generation_id, "merge_bid_documents")
                
    #             # Validate input parameters
    #             if not generated_document_ids:
    #                 raise ValueError("No generated document IDs provided")

    #             # Get bid generation record for metadata (tender info, etc.)
    #             bid_gen = self.api_client.get_bid_generation(bid_generation_id, tenant_id)
    #             if not bid_gen:
    #                 raise ValueError(f"Bid generation record not found: {bid_generation_id}")
                
    #             # Use the provided document IDs instead of getting from database
    #             generated_doc_ids = generated_document_ids
                
    #             # Get document records
    #             documents = self.api_client.get_documents(generated_doc_ids)
    #             # Filter by categories if specified
    #             # if document_categories:
    #             #     documents = [doc for doc in documents if doc.get("category") in document_categories]
                
    #             # Process ALL documents (not just PDFs)
    #             processable_documents = documents
                
    #             if not processable_documents:
    #                 raise ValueError("No documents found to merge")
                
    #             self.process_logger.info(f"Merging {len(processable_documents)} documents (will convert non-PDFs to PDF)")
                
    #             # Create temporary directory for processing
    #             temp_dir = os.path.join(self.base_dir, f"merge_{bid_generation_id}")
    #             conversion_dir = os.path.join(temp_dir, "converted")
    #             os.makedirs(temp_dir, exist_ok=True)
    #             os.makedirs(conversion_dir, exist_ok=True)
    #             # Create the 'processed' subdirectory if it doesn't exist
    #             processed_dir = os.path.join(temp_dir, "processed")
    #             os.makedirs(processed_dir, exist_ok=True)

    #             try:
    #                 # Download and convert all documents to PDF
    #                 converted_pdf_files = []
    #                 for doc in processable_documents:
    #                     try:
    #                         doc_content = self.api_client.get_document_content(str(doc["_id"]), tenant_id)
    #                         filename = doc.get("name", f"doc_{doc['_id']}")
                            
    #                         # Download original file
    #                         original_file_path = os.path.join(temp_dir, filename)
    #                         with open(original_file_path, 'wb') as f:
    #                             f.write(doc_content)
                            
    #                         # Convert to PDF if needed (single line call as requested)
    #                         converted_pdf_path = self._convert_to_pdf(original_file_path, conversion_dir)
                            
    #                         if converted_pdf_path and os.path.exists(converted_pdf_path):
    #                             converted_pdf_files.append(converted_pdf_path)
    #                             self.process_logger.info(f"Successfully processed: {filename}")
    #                         else:
    #                             self.process_logger.warning(f"Failed to convert document: {filename}")
                            
    #                     except Exception as e:
    #                         self.process_logger.error(f"Failed to process document {doc.get('_id')}: {e}")
                    
    #                 if not converted_pdf_files:
    #                     raise ValueError("No documents were successfully converted to PDF")
                    
    #                 # Copy converted PDFs to main temp directory for merging
    #                 for index, pdf_file in enumerate(converted_pdf_files):
    #                     # Extract the base filename (e.g., "document.pdf")
    #                     basename = os.path.basename(pdf_file)
    #                     new_filename = f"{index + 100}_{basename}" # Start numbering from 100

    #                     # Construct the destination path using the new filename
    #                     dest_path = os.path.join(processed_dir, new_filename)

    #                     # Copy the file to the destination with the new name
    #                     shutil.copy(pdf_file, dest_path)
                    
    #                 tender_id = str(bid_gen.get("tender", ""))
    #                 tender = self.api_client.get_tender(tender_id, tenant_id)
    #                 bid_number = tender.get("bidNumber", "bid") if tender else "bid"
    #                 bid_number = re.sub(r'[\/\\]', '_', bid_number)
                    
    #                 # Create timestamp in milliseconds
    #                 timestamp = int(time.time() * 1000)
                    
    #                 output_filename = f"{bid_number}_merged_documents_{timestamp}.pdf"
    #                 output_path = os.path.join(processed_dir, output_filename)
                    
    #                 # Merge PDFs using pdf_merger
    #                 self.process_logger.info("Starting PDF merge process...")
    #                 pdf_merger.merge_pdfs_with_index(processed_dir, output_filename, add_page_numbers)
                    
    #                 if not os.path.exists(output_path):
    #                     raise Exception("PDF merge failed - output file not created")
                    
    #                 # Upload merged PDF to S3
    #                 with open(output_path, 'rb') as f:
    #                     merged_content = f.read()
                    
    #                 s3_key = f"bid_documents/{tenant_id}/{bid_number}/merged/{output_filename}"
    #                 storage_details = self.api_client.upload_file(
    #                     merged_content, s3_key, 'application/pdf', tenant_id
    #                 )
                    
    #                 # Create document record
    #                 document_record = {
    #                     "tenant": tenant_id,
    #                     "name": output_filename,
    #                     "type": "bid_document_merged",
    #                     "category": "merged",
    #                     "storageType": "s3",
    #                     "storageDetails": storage_details,
    #                     "metadata": {
    #                         "bidGenerationId": bid_generation_id,
    #                         "bidNumber": bid_number,
    #                         "mergedDocumentCount": len(processable_documents),
    #                         "convertedDocumentCount": len(converted_pdf_files),
    #                         "addedPageNumbers": add_page_numbers,
    #                         "mergedCategories": document_categories or "all",
    #                         "mergedAt": time.time(),
    #                         "providedDocumentIds": generated_document_ids,
    #                         "timestamp": timestamp  # Add timestamp to metadata
    #                     }
    #                 }
                    
    #                 doc = self.api_client.create_document(document_record)
    #                 merged_doc_id = str(doc["_id"])
                    
    #                 # Update bid generation record
    #                 # First remove the document if it exists, then add it at the beginning
    #                 self.api_client.update_bid_generation(bid_generation_id, {
    #                     "$pull": {"mergedDocuments": merged_doc_id}
    #                 })
                    
    #                 self.api_client.update_bid_generation(bid_generation_id, {
    #                     "$push": {
    #                         "mergedDocuments": {
    #                             "$each": [merged_doc_id],
    #                             "$position": 0
    #                         },
    #                         "logs": {
    #                             "timestamp": time.time(),
    #                             "action": "documents_merged",
    #                             "details": {
    #                                 "merged_document_id": merged_doc_id,
    #                                 "document_count": len(processable_documents),
    #                                 "converted_count": len(converted_pdf_files),
    #                                 "output_filename": output_filename,
    #                                 "provided_document_ids": generated_document_ids,
    #                                 "merge_timestamp": timestamp  # Add timestamp to logs
    #                             }
    #                         }
    #                     }
    #                 })
                    
    #                 self.process_logger.info(f"Successfully merged {len(processable_documents)} documents ({len(converted_pdf_files)} converted to PDF) into {output_filename}")
                    
    #                 return {
    #                     "status": "completed",
    #                     "merged_document_id": merged_doc_id,
    #                     "filename": output_filename,
    #                     "document_count": len(processable_documents),
    #                     "converted_count": len(converted_pdf_files),
    #                     "file_size": len(merged_content),
    #                     "s3_url": storage_details.get("url"),
    #                     "timestamp": timestamp,  # Include timestamp in response
    #                     "message": f"Successfully merged {len(processable_documents)} documents (converted {len(converted_pdf_files)} to PDF)"
    #                 }
                    
    #             finally:
    #                 # Clean up temporary directory
    #                 try:
    #                     shutil.rmtree(processed_dir)
    #                     shutil.rmtree(temp_dir)
    #                 except Exception as e:
    #                     self.process_logger.warning(f"Failed to cleanup temp directory: {e}")
                
    #         except Exception as e:
    #             self.process_logger.error(f"Failed to merge bid documents: {e}")
    #             raise

    def compress_document(self, bid_generation_id: str, document_id: str, tenant_id: str, 
                         target_size_percent: int = 50, 
                         preserve_quality: bool = True) -> Dict[str, Any]:
        """
        Compress a PDF document
        
        Args:
            bid_generation_id: Bid generation record ID to update with the compressed document
            document_id: Document ID to compress
            tenant_id: Tenant ID
            target_size_percent: Target size as percentage of original (25, 50, or 75)
            preserve_quality: Whether to prioritize quality over exact size target
            
        Returns:
            Dict containing compressed document information
        """
        try:
            self._setup_process_context(tenant_id, document_id, "compress_document")
            
            # Validate target size
            if target_size_percent not in [25, 50, 75]:
                raise ValueError("Target size must be 25%, 50%, or 75%")
            
            # Get document record
            documents = self.api_client.get_documents([document_id])
            if not documents:
                raise ValueError(f"Document not found: {document_id}")
            
            document = documents[0]
            doc_name = document.get("name", f"doc_{document_id}")
            
            # Ensure it's a PDF
            if not doc_name.lower().endswith('.pdf'):
                raise ValueError("Only PDF documents can be compressed")
            
            self.process_logger.info(f"Compressing document: {doc_name} to {target_size_percent}%")
            
            # Create temporary directory
            temp_dir = os.path.join(self.base_dir, f"compress_{document_id}")
            os.makedirs(temp_dir, exist_ok=True)
            
            try:
                # Download original document
                doc_content = self.api_client.get_document_content(document_id, tenant_id)
                original_size = len(doc_content)
                
                input_path = os.path.join(temp_dir, doc_name)
                with open(input_path, 'wb') as f:
                    f.write(doc_content)
                
                # Generate output filename
                base_name = os.path.splitext(doc_name)[0]
                output_filename = f"{base_name}_compressed_{target_size_percent}percent.pdf"
                output_path = os.path.join(temp_dir, output_filename)
                
                # Compress PDF using pdf_merger
                self.process_logger.info("Starting PDF compression...")
                success, final_path, stats = pdf_merger.compress_pdf(
                    input_path, output_path, target_size_percent, 
                    max_attempts=10, preserve_quality=preserve_quality
                )
                
                if not success:
                    raise Exception(f"PDF compression failed: {stats.get('error', 'Unknown error')}")
                
                # Read compressed file
                with open(final_path, 'rb') as f:
                    compressed_content = f.read()
                
                compressed_size = len(compressed_content)
                compression_ratio = ((original_size - compressed_size) / original_size) * 100
                
                # Upload compressed PDF to S3
                # Extract metadata for S3 key
                metadata = document.get("metadata", {})
                bid_number = metadata.get("bidNumber", "document")
                bid_number = re.sub(r'[\/\\]', '_', bid_number)
                
                s3_key = f"bid_documents/{tenant_id}/{bid_number}/compressed/{output_filename}"
                storage_details = self.api_client.upload_file(
                    compressed_content, s3_key, 'application/pdf', tenant_id
                )
                
                # Create document record
                document_record = {
                    "tenant": tenant_id,
                    "name": output_filename,
                    "type": "bid_document_compressed",
                    "category": "compressed",
                    "storageType": "s3",
                    "storageDetails": storage_details,
                    "metadata": {
                        **metadata,
                        "originalDocumentId": document_id,
                        "originalSize": original_size,
                        "compressedSize": compressed_size,
                        "compressionRatio": compression_ratio,
                        "targetSizePercent": target_size_percent,
                        "preserveQuality": preserve_quality,
                        "compressionStats": stats,
                        "compressedAt": time.time()
                    }
                }
                
                doc = self.api_client.create_document(document_record)
                compressed_doc_id = str(doc["_id"])

                # Update bid generation record
                self.api_client.update_bid_generation(bid_generation_id, {
                    "$addToSet": {"compressedDocuments": compressed_doc_id},
                    "$push": {
                        "logs": {
                            "timestamp": time.time(),
                            "action": "document_compressed",
                            "details": {
                                "compressed_document_id": compressed_doc_id,
                                "document_count": 1,
                                "output_filename": output_filename,
                                "provided_document_id": document_id
                            }
                        }
                    }
                })
                
                self.process_logger.info(f"Successfully compressed document: {compression_ratio:.1f}% reduction")
                
                return {
                    "status": "completed",
                    "bid_generation_id":bid_generation_id,
                    "compressed_document_id": compressed_doc_id,
                    "original_filename": doc_name,
                    "compressed_filename": output_filename,
                    "original_size": original_size,
                    "compressed_size": compressed_size,
                    "compression_ratio": compression_ratio,
                    "target_achieved": stats.get("achieved_percent", 0),
                    "s3_url": storage_details.get("url"),
                    "compression_stats": stats,
                    "message": f"Compressed {doc_name} by {compression_ratio:.1f}%"
                }
                
            finally:
                # Clean up temporary directory
                try:
                    shutil.rmtree(temp_dir)
                except Exception as e:
                    self.process_logger.warning(f"Failed to cleanup temp directory: {e}")
            
        except Exception as e:
            self.process_logger.error(f"Failed to compress document: {e}")
            raise

    def batch_compress_bid_documents(self, bid_generation_id: str, tenant_id: str, 
                                   target_size_percent: int = 50,
                                   preserve_quality: bool = True,
                                   document_categories: List[str] = None) -> Dict[str, Any]:
        """
        Batch compress all PDF documents in a bid generation
        
        Args:
            bid_generation_id: Bid generation record ID
            tenant_id: Tenant ID
            target_size_percent: Target size as percentage of original (25, 50, or 75)
            preserve_quality: Whether to prioritize quality over exact size target
            document_categories: List of document categories to compress (optional)
            
        Returns:
            Dict containing batch compression results
        """
        try:
            self._setup_process_context(tenant_id, bid_generation_id, "batch_compress_documents")
            
            # Validate target size
            if target_size_percent not in [25, 50, 75]:
                raise ValueError("Target size must be 25%, 50%, or 75%")
            
            # Get bid generation record
            bid_gen = self.api_client.get_bid_generation(bid_generation_id, tenant_id)
            if not bid_gen:
                raise ValueError(f"Bid generation record not found: {bid_generation_id}")
            
            # Get generated documents
            generated_doc_ids = bid_gen.get("generatedDocuments", [])
            if not generated_doc_ids:
                raise ValueError("No generated documents found to compress")
            
            # Get document records
            documents = self.api_client.get_documents(generated_doc_ids)
            
            # Filter by categories if specified
            if document_categories:
                documents = [doc for doc in documents if doc.get("category") in document_categories]
            
            # Filter only PDF documents
            pdf_documents = [doc for doc in documents if doc.get("name", "").lower().endswith('.pdf')]
            
            if not pdf_documents:
                raise ValueError("No PDF documents found to compress")
            
            self.process_logger.info(f"Batch compressing {len(pdf_documents)} PDF documents to {target_size_percent}%")
            
            # Create temporary directory
            temp_dir = os.path.join(self.base_dir, f"batch_compress_{bid_generation_id}")
            os.makedirs(temp_dir, exist_ok=True)
            
            try:
                # Download all PDF documents
                downloaded_files = []
                doc_id_mapping = {}
                
                for doc in pdf_documents:
                    try:
                        doc_content = self.api_client.get_document_content(str(doc["_id"]), tenant_id)
                        filename = doc.get("name", f"doc_{doc['_id']}.pdf")
                        
                        # Ensure .pdf extension
                        if not filename.lower().endswith('.pdf'):
                            filename += '.pdf'
                        
                        file_path = os.path.join(temp_dir, filename)
                        
                        with open(file_path, 'wb') as f:
                            f.write(doc_content)
                        
                        downloaded_files.append(file_path)
                        doc_id_mapping[filename] = str(doc["_id"])
                        self.process_logger.info(f"Downloaded for compression: {filename}")
                        
                    except Exception as e:
                        self.process_logger.error(f"Failed to download document {doc.get('_id')}: {e}")
                
                if not downloaded_files:
                    raise ValueError("No documents were successfully downloaded")
                
                # Batch compress using pdf_merger
                self.process_logger.info("Starting batch PDF compression...")
                pdf_merger.batch_compress_pdfs(temp_dir, target_size_percent, "_compressed")
                
                # Upload compressed files and create records
                compressed_docs = []
                total_original_size = 0
                total_compressed_size = 0
                
                # Find compressed files
                for filename in os.listdir(temp_dir):
                    if filename.endswith(f"_compressed_{target_size_percent}percent.pdf"):
                        # Extract original filename
                        original_filename = filename.replace(f"_compressed_{target_size_percent}percent.pdf", ".pdf")
                        original_doc_id = doc_id_mapping.get(original_filename)
                        
                        if not original_doc_id:
                            continue
                        
                        compressed_path = os.path.join(temp_dir, filename)
                        original_path = os.path.join(temp_dir, original_filename)
                        
                        if os.path.exists(compressed_path) and os.path.exists(original_path):
                            # Read compressed file
                            with open(compressed_path, 'rb') as f:
                                compressed_content = f.read()
                            
                            # Calculate compression stats
                            original_size = os.path.getsize(original_path)
                            compressed_size = len(compressed_content)
                            compression_ratio = ((original_size - compressed_size) / original_size) * 100
                            
                            total_original_size += original_size
                            total_compressed_size += compressed_size
                            
                            # Upload to S3
                            tender_id = str(bid_gen.get("tender", ""))
                            tender = self.api_client.get_tender(tender_id, tenant_id)
                            bid_number = tender.get("bidNumber", "bid") if tender else "bid"
                            bid_number = re.sub(r'[\/\\]', '_', bid_number)
                            
                            s3_key = f"bid_documents/{tenant_id}/{bid_number}/compressed/{filename}"
                            storage_details = self.api_client.upload_file(
                                compressed_content, s3_key, 'application/pdf', tenant_id
                            )
                            
                            # Create document record
                            document_record = {
                                "tenant": tenant_id,
                                "name": filename,
                                "type": "bid_document_compressed",
                                "category": "compressed",
                                "storageType": "s3",
                                "storageDetails": storage_details,
                                "metadata": {
                                    "bidGenerationId": bid_generation_id,
                                    "bidNumber": bid_number,
                                    "originalDocumentId": original_doc_id,
                                    "originalFilename": original_filename,
                                    "originalSize": original_size,
                                    "compressedSize": compressed_size,
                                    "compressionRatio": compression_ratio,
                                    "targetSizePercent": target_size_percent,
                                    "preserveQuality": preserve_quality,
                                    "compressedAt": time.time()
                                }
                            }
                            
                            doc = self.api_client.create_document(document_record)
                            compressed_doc_id = str(doc["_id"])
                            
                            compressed_docs.append({
                                "compressed_document_id": compressed_doc_id,
                                "original_document_id": original_doc_id,
                                "original_filename": original_filename,
                                "compressed_filename": filename,
                                "original_size": original_size,
                                "compressed_size": compressed_size,
                                "compression_ratio": compression_ratio,
                                "s3_url": storage_details.get("url")
                            })
                            
                            self.process_logger.info(f"Compressed {original_filename}: {compression_ratio:.1f}% reduction")
                
                overall_compression = ((total_original_size - total_compressed_size) / total_original_size) * 100 if total_original_size > 0 else 0
                
                # Update bid generation record
                self.api_client.update_bid_generation(bid_generation_id, {
                    "$push": {
                        "logs": {
                            "timestamp": time.time(),
                            "action": "batch_documents_compressed",
                            "details": {
                                "compressed_count": len(compressed_docs),
                                "target_size_percent": target_size_percent,
                                "total_original_size": total_original_size,
                                "total_compressed_size": total_compressed_size,
                                "overall_compression_ratio": overall_compression
                            }
                        }
                    }
                })
                
                self.process_logger.info(f"Successfully batch compressed {len(compressed_docs)} documents")
                self.process_logger.info(f"Overall compression: {overall_compression:.1f}% reduction")
                
                return {
                    "status": "completed",
                    "compressed_documents": compressed_docs,
                    "total_documents": len(compressed_docs),
                    "total_original_size": total_original_size,
                    "total_compressed_size": total_compressed_size,
                    "overall_compression_ratio": overall_compression,
                    "target_size_percent": target_size_percent,
                    "message": f"Successfully compressed {len(compressed_docs)} documents with {overall_compression:.1f}% overall reduction"
                }
                
            finally:
                # Clean up temporary directory
                try:
                    shutil.rmtree(temp_dir)
                except Exception as e:
                    self.process_logger.warning(f"Failed to cleanup temp directory: {e}")
            
        except Exception as e:
            self.process_logger.error(f"Failed to batch compress documents: {e}")
            raise

    def merge_tender_documents(self, tender_id: str, tenant_id: str, 
                              add_page_numbers: bool = True,
                              include_linked_docs: bool = True) -> Dict[str, Any]:
        """
        Merge all tender/RFP documents into a single PDF with index
        
        Args:
            tender_id: Tender ID
            tenant_id: Tenant ID  
            add_page_numbers: Whether to add page numbers to merged PDF
            include_linked_docs: Whether to include linked documents
            
        Returns:
            Dict containing merged document information
        """
        try:
            self._setup_process_context(tenant_id, tender_id, "merge_tender_documents")
            
            # Get tender record
            tender = self.api_client.get_tender(tender_id, tenant_id)
            if not tender:
                raise ValueError(f"Tender not found: {tender_id}")
            
            # Get tender documents
            document_ids = [str(doc_id) for doc_id in tender.get("originalDocuments", [])]
            if not document_ids:
                raise ValueError("No documents found in tender")
            
            documents = self.api_client.get_documents(document_ids)
            
            # Filter documents based on include_linked_docs
            if include_linked_docs:
                # Include all documents
                pdf_documents = [doc for doc in documents if doc.get("name", "").lower().endswith('.pdf')]
            else:
                # Keep only original tender documents (category == "tender"), excluding linked RFP docs
                pdf_documents = [
                    doc for doc in documents 
                    if (doc.get("name", "").lower().endswith('.pdf') and 
                        doc.get("category") == "tender" and
                        doc.get("type") != "rfp")  # Exclude linked RFP docs
                ]
            
            if not pdf_documents:
                raise ValueError("No PDF documents found to merge")
            
            self.process_logger.info(f"Merging {len(pdf_documents)} tender PDF documents")
            
            # Create temporary directory
            temp_dir = os.path.join(self.base_dir, f"merge_tender_{tender_id}")
            os.makedirs(temp_dir, exist_ok=True)
            
            try:
                # Download all PDF documents
                downloaded_files = []
                for doc in pdf_documents:
                    try:
                        doc_content = self.api_client.get_document_content(str(doc["_id"]), tenant_id)
                        filename = doc.get("name", f"doc_{doc['_id']}.pdf")
                        
                        # Ensure .pdf extension
                        if not filename.lower().endswith('.pdf'):
                            filename += '.pdf'
                        
                        file_path = os.path.join(temp_dir, filename)
                        
                        with open(file_path, 'wb') as f:
                            f.write(doc_content)
                        
                        downloaded_files.append(file_path)
                        self.process_logger.info(f"Downloaded: {filename}")
                        
                    except Exception as e:
                        self.process_logger.error(f"Failed to download document {doc.get('_id')}: {e}")
                
                if not downloaded_files:
                    raise ValueError("No documents were successfully downloaded")
                
                # Generate output filename
                bid_number = tender.get("bidNumber", "tender")
                bid_number = re.sub(r'[\/\\]', '_', bid_number)
                
                output_filename = f"{bid_number}_merged_tender_documents.pdf"
                output_path = os.path.join(temp_dir, output_filename)
                
                # Merge PDFs using pdf_merger
                self.process_logger.info("Starting tender PDF merge process...")
                pdf_merger.merge_pdfs_with_index(temp_dir, output_filename, add_page_numbers)
                
                if not os.path.exists(output_path):
                    raise Exception("PDF merge failed - output file not created")
                
                # Upload merged PDF to S3
                with open(output_path, 'rb') as f:
                    merged_content = f.read()
                
                s3_key = f"tenders/{tenant_id}/{bid_number}/merged/{output_filename}"
                storage_details = self.api_client.upload_file(
                    merged_content, s3_key, 'application/pdf', tenant_id
                )
                
                # Create document record
                document_record = {
                    "tenant": tenant_id,
                    "name": output_filename,
                    "type": "tender_merged",
                    "category": "merged",
                    "storageType": "s3",
                    "storageDetails": storage_details,
                    "metadata": {
                        "tenderId": tender_id,
                        "bidNumber": bid_number,
                        "mergedDocumentCount": len(pdf_documents),
                        "addedPageNumbers": add_page_numbers,
                        "includeLinkedDocs": include_linked_docs,
                        "mergedAt": time.time()
                    }
                }
                
                doc = self.api_client.create_document(document_record)
                merged_doc_id = str(doc["_id"])
                
                # Update tender record
                self.api_client.update_tender(tender_id, {
                    "$addToSet": {"mergedDocuments": merged_doc_id},
                    "$push": {
                        "logs": {
                            "timestamp": time.time(),
                            "action": "tender_documents_merged",
                            "details": {
                                "merged_document_id": merged_doc_id,
                                "document_count": len(pdf_documents),
                                "output_filename": output_filename
                            }
                        }
                    }
                })
                
                self.process_logger.info(f"Successfully merged {len(pdf_documents)} tender documents into {output_filename}")
                
                return {
                    "status": "completed",
                    "merged_document_id": merged_doc_id,
                    "filename": output_filename,
                    "document_count": len(pdf_documents),
                    "file_size": len(merged_content),
                    "s3_url": storage_details.get("url"),
                    "include_linked_docs": include_linked_docs,
                    "message": f"Successfully merged {len(pdf_documents)} tender documents"
                }
                
            finally:
                # Clean up temporary directory
                try:
                    shutil.rmtree(temp_dir)
                except Exception as e:
                    self.process_logger.warning(f"Failed to cleanup temp directory: {e}")
            
        except Exception as e:
            self.process_logger.error(f"Failed to merge tender documents: {e}")
            raise

    # =================================================================
    # HELPER METHODS
    # =================================================================

    def _debug_extraction_results(self, documents_text: Dict[str, str], temp_dir: str) -> None:
        """Debug helper to log extraction results"""
        logger.info(f"=== Document Extraction Debug Info ===")
        logger.info(f"Temp directory: {temp_dir}")
        logger.info(f"Files in temp directory: {os.listdir(temp_dir) if os.path.exists(temp_dir) else 'Directory not found'}")
        
        if documents_text:
            logger.info(f"Extracted text from {len(documents_text)} documents:")
            for doc_path, content in documents_text.items():
                content_preview = content[:200].replace('\n', ' ') if content else "No content"
                logger.info(f"  - {os.path.basename(doc_path)}: {len(content)} chars - {content_preview}...")
        else:
            logger.warning("No documents_text returned from extraction")
        
        logger.info(f"=== End Debug Info ===")

    def _check_document_extractor_output(self, output_dir: str) -> Dict[str, str]:
        """Check and load results from document extractor output"""
        doc_text_file = os.path.join(output_dir, "doc_text.json")
        
        if os.path.exists(doc_text_file):
            try:
                with open(doc_text_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                
                logger.info(f"Found existing doc_text.json with {len(data)} entries")
                
                # Check if the data is valid
                if isinstance(data, dict) and data:
                    # Validate that entries have actual content
                    valid_entries = {}
                    for key, value in data.items():
                        if isinstance(value, str) and value.strip():
                            valid_entries[key] = value
                        else:
                            logger.warning(f"Skipping invalid entry for {key}")
                    
                    if valid_entries:
                        logger.info(f"Loaded {len(valid_entries)} valid documents from existing extraction")
                        return valid_entries
                    else:
                        logger.warning("No valid content found in existing doc_text.json")
                else:
                    logger.warning("doc_text.json exists but contains invalid data")
                    
            except Exception as e:
                logger.error(f"Error reading existing doc_text.json: {e}")
        
        return {}

    def _get_content_type(self, filename: str) -> str:
        """Get content type based on file extension"""
        ext = os.path.splitext(filename)[1].lower()
        content_types = {
            '.pdf': 'application/pdf',
            '.html': 'text/html',
            '.htm': 'text/html',
            '.txt': 'text/plain',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.doc': 'application/msword',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.xls': 'application/vnd.ms-excel',
            '.json': 'application/json'
        }
        return content_types.get(ext, 'application/octet-stream')

    def _save_generated_documents_to_s3(self, final_docs_dir: str, bid_generation_id: str, 
                                       tenant_id: str, bid_number: str) -> List[str]:
        """Save generated bid documents to S3 and create database records"""
        document_ids = []
        
        for root, dirs, files in os.walk(final_docs_dir):
            for filename in files:
                file_path = os.path.join(root, filename)
                
                # Skip temporary files and directories
                if filename.startswith('.') or filename.endswith('.tmp'):
                    continue
                
                try:
                    with open(file_path, 'rb') as f:
                        content = f.read()
                    
                    bid_number = re.sub(r'[\/\\]', '_', bid_number)
                    # Create S3 key
                    rel_path = os.path.relpath(file_path, final_docs_dir)
                    s3_key = f"bid_documents/{tenant_id}/{bid_number}/{rel_path}"
                    
                    # Upload to S3 via API
                    content_type = self._get_content_type(filename)
                    storage_details = self.api_client.upload_file(
                        content, s3_key, content_type, tenant_id
                    )
                    
                    # Create document record via API
                    document = {
                        "tenant": tenant_id,
                        "name": filename,
                        "type": "bid_document",
                        "category": os.path.basename(root),  # Document category based on folder
                        "storageType": "s3",
                        "storageDetails": storage_details,
                        "metadata": {
                            "bidGenerationId": bid_generation_id,
                            "bidNumber": bid_number,
                            "relativePath": rel_path
                        }
                    }
                    
                    doc = self.api_client.create_document(document)
                    doc_id = str(doc["_id"])
                    document_ids.append(doc_id)
                    
                except Exception as e:
                    logger.error(f"Failed to save generated document {filename}: {e}")
        
        return document_ids

    def _create_referenced_document_entries(self, referenced_company_docs: List[Dict],
                                           bid_generation_id: str, tenant_id: str,
                                           bid_number: str, downloaded_company_docs: List[Dict]) -> List[str]:
        """
        Create database entries for referenced company documents without uploading to S3.
        Uses the original S3 URL from the company documents.

        Args:
            referenced_company_docs: List of referenced company documents from bid preparation
            bid_generation_id: Bid generation ID
            tenant_id: Tenant ID
            bid_number: Bid number
            downloaded_company_docs: List of downloaded company documents with S3 URLs

        Returns:
            List[str]: List of created document IDs
        """
        document_ids = []

        # Create a mapping from local_path to S3 details
        local_path_to_s3 = {}
        for company_doc in downloaded_company_docs:
            local_path = company_doc.get("local_path")
            if local_path:
                local_path_to_s3[local_path] = {
                    "file_url": company_doc.get("file_url"),
                    "filename": company_doc.get("filename"),
                    "category": company_doc.get("category"),
                    "description": company_doc.get("description", ""),
                    "size": company_doc.get("size"),
                    "mimeType": company_doc.get("mimeType")
                }

        for ref_doc in referenced_company_docs:
            try:
                local_path = ref_doc.get("local_path")

                # Find the original S3 URL for this document
                s3_info = local_path_to_s3.get(local_path)
                if not s3_info:
                    self.process_logger.warning(f"Could not find S3 URL for referenced document: {ref_doc.get('name')}")
                    continue

                # Extract S3 key from the file_url
                # Assuming file_url is an S3 URL, we need to extract the key
                file_url = s3_info["file_url"]

                # Parse the S3 URL to extract key
                # URL format: https://bucket.s3.region.amazonaws.com/key or https://s3.region.amazonaws.com/bucket/key
                import urllib.parse
                parsed_url = urllib.parse.urlparse(file_url)

                # Extract key from path (removing leading slash)
                s3_key = parsed_url.path.lstrip('/')
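                # Example (illustrative): for "https://my-bucket.s3.ap-south-1.amazonaws.com/company_docs/pan_card.pdf",
                # parsed_url.path is "/company_docs/pan_card.pdf", so s3_key becomes "company_docs/pan_card.pdf".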

                # If the bucket is in the hostname (virtual-hosted-style URL), the key is already the full path
                if parsed_url.hostname and '.s3.' in parsed_url.hostname:
                    # Format: bucket.s3.region.amazonaws.com/key
                    pass
                # else:
                #     # Format: s3.region.amazonaws.com/bucket/key - remove bucket from key
                #     parts = s3_key.split('/', 1)
                #     if len(parts) > 1:
                #         s3_key = parts[1]

                bid_number_safe = re.sub(r'[\/\\]', '_', bid_number)

                # Create storage details using the original S3 URL with size and mimeType
                storage_details = {
                    "key": s3_key,
                    "url": file_url,
                    "path": s3_key
                }

                # Add size and mimeType if available
                if s3_info.get("size") is not None:
                    storage_details["size"] = s3_info["size"]
                if s3_info.get("mimeType"):
                    storage_details["mimeType"] = s3_info["mimeType"]

                # Create document record via API
                document = {
                    "tenant": tenant_id,
                    "name": ref_doc.get("name", s3_info["filename"]),
                    "type": "bid_document",
                    "category": ref_doc.get("category", s3_info["category"]),
                    "storageType": "s3",
                    "storageDetails": storage_details,
                    "metadata": {
                        "bidGenerationId": bid_generation_id,
                        "bidNumber": bid_number_safe,
                        "isReferencedDocument": True,  # Mark as referenced (not copied)
                        "originalCompanyDoc": True,
                        "requiredDocName": ref_doc.get("required_doc_name", ""),
                        "description": ref_doc.get("description", s3_info.get("description", ""))
                    }
                }

                doc = self.api_client.create_document(document)
                doc_id = str(doc["_id"])
                document_ids.append(doc_id)

                self.process_logger.info(f"Created referenced document entry: {ref_doc.get('name')} (ID: {doc_id})")

            except Exception as e:
                logger.error(f"Failed to create referenced document entry for {ref_doc.get('name')}: {e}")

        return document_ids

    def _create_default_index_config(self, config_path: str):
        """Create default Elasticsearch index configuration"""
        config = {
            "settings": {
                "number_of_shards": 2,
                "number_of_replicas": 1
            },
            "mappings": {
                "dynamic": "true",
                "_source": {
                    "enabled": "true"
                },
                "properties": {
                    "text": {
                        "type": "text"
                    },
                    "title_vector": {
                        "type": "dense_vector",
                        "dims": 1536
                    },
                    "tag": {
                        "type": "keyword"
                    }
                }
            }
        }
        
        os.makedirs(os.path.dirname(config_path), exist_ok=True)
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

    def _convert_chunks_to_json(self, temp_dir: str, output_file: str):
        """Convert Excel chunks to JSON format for Elasticsearch indexing"""
        import pandas as pd
        
        excel_file = os.path.join(temp_dir, "tender_analysis", "chunks.xlsx")
        if os.path.exists(excel_file):
            df = pd.read_excel(excel_file)
            
            json_data = []
            for _, row in df.iterrows():
                json_data.append({
                    "tagName": row.get("Tag", ""),
                    "question": row.get("question", ""),
                    "answer": row.get("answer", "")
                })
            
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(json_data, f, indent=2, ensure_ascii=False)

    def _setup_process_context(self, tenant_id: str, process_id: str, process_type: str):
        """
        Set up process context for logging
        
        Args:
            tenant_id: Tenant ID
            process_id: Process ID (tender_id, analysis_id, etc.)
            process_type: Process type (discovery, analysis, bid_generation)
        """
        self.current_tenant_id = tenant_id
        self.current_process_id = process_id
        self.current_process_type = process_type
        
        # Set up process-specific logger
        self.process_logger = setup_api_logging(
            f"rfp_service.{process_type}.{process_id}",
            tenant_id,
            process_id,
            process_type
        )

        # Update extractor's process_logger so it logs to the process-specific logger
        if hasattr(self, 'extractor') and self.extractor:
            self.extractor.process_logger = self.process_logger

        self.process_logger.info(f"Starting {process_type} process for {process_id}")

    def _categorize_company_document(self, doc_info: Dict) -> str:
        """
        Categorize company document as standard or experience based on type and content
        
        Args:
            doc_info: Document information dictionary
            
        Returns:
            str: Category ("standard" or "experience")
        """
        doc_type = doc_info.get("type", "").lower()
        doc_name = doc_info.get("name", "").lower()
        doc_value = doc_info.get("value", "").lower()
        
        # Experience document indicators
        experience_indicators = [
            "past_experience", "experience", "project", "work_order", 
            "completion", "portfolio", "client", "customer"
        ]
        
        # Check if it's an experience document
        if (doc_type in experience_indicators or 
            any(indicator in doc_name for indicator in experience_indicators) or
            any(indicator in doc_value for indicator in experience_indicators)):
            return "experience"
        
        # Default to standard documents
        return "standard"

    def _download_and_save_company_document(self, doc_info: Dict, target_dir: str) -> str:
        """
        Download company document and create .desc file
        
        Args:
            doc_info: Document information dictionary
            target_dir: Target directory to save the document
            
        Returns:
            str: Path to saved document or None if failed
        """
        try:
            file_url = doc_info.get("file_url")
            if not file_url:
                return None
            
            # Generate filename based on document info
            doc_name = doc_info.get("name", "document")
            # Clean filename
            safe_filename = re.sub(r'[^\w\s-]', '', doc_name).strip().replace(' ', '_')
            
            # Get file extension from URL
            import urllib.parse
            parsed_url = urllib.parse.urlparse(file_url)
            file_extension = os.path.splitext(parsed_url.path)[1] or '.pdf'
            
            filename = f"{safe_filename}{file_extension}"
            file_path = os.path.join(target_dir, filename)
            desc_path = file_path + ".desc"
            
            # Download the file
            response = requests.get(file_url, timeout=30)
            response.raise_for_status()
            
            with open(file_path, 'wb') as f:
                f.write(response.content)
            
            # Create .desc file
            description_content = ""
            if doc_info.get("description") and doc_info["description"].strip():
                # Use existing description
                description_content = doc_info["description"]
            else:
                # Create description from document metadata
                desc_parts = []
                if doc_info.get("name"):
                    desc_parts.append(f"Name: {doc_info['name']}")
                if doc_info.get("type"):
                    desc_parts.append(f"Type: {doc_info['type']}")
                if doc_info.get("value"):
                    desc_parts.append(f"Value: {doc_info['value']}")
                
                # Add specific data if available
                for key in ['director_data', 'experience_data', 'itr_data', 'ca_cert_data', 'balance_sheet_data', 'certificate_data']:
                    if key in doc_info:
                        desc_parts.append(f"Data: {json.dumps(doc_info[key], indent=2)}")
                
                description_content = "\n".join(desc_parts)
            
            # Write .desc file
            with open(desc_path, 'w', encoding='utf-8') as f:
                f.write(description_content)
            
            self.process_logger.info(f"Downloaded company document: {filename}")
            return file_path
            
        except Exception as e:
            self.process_logger.error(f"Failed to download company document {doc_info.get('name', 'unknown')}: {e}")
            return None

    # Additional helper function to validate chunks data
    def _validate_chunks_data(self, data_file):
        """Validate chunks data before processing"""
        logger.info(f"Validating chunks data file: {data_file}")
        try:
            with open(data_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            valid_chunks = []
            invalid_count = 0
            
            for i, chunk in enumerate(data):
                # Check required fields
                if not isinstance(chunk, dict):
                    print(f"Chunk {i}: Not a dictionary")
                    invalid_count += 1
                    continue
                    
                if 'tagName' not in chunk or not chunk['tagName']:
                    print(f"Chunk {i}: Missing or empty tagName")
                    invalid_count += 1
                    continue
                    
                if 'question' not in chunk or not chunk['question']:
                    print(f"Chunk {i}: Missing or empty question")
                    invalid_count += 1
                    continue
                    
                if 'answer' not in chunk or not chunk['answer']:
                    print(f"Chunk {i}: Missing or empty answer")
                    invalid_count += 1
                    continue
                
                # Clean up the chunk
                cleaned_chunk = {
                    'tagName': str(chunk['tagName']).strip(),
                    'question': str(chunk['question']).strip(),
                    'answer': str(chunk['answer']).strip()
                }
                
                valid_chunks.append(cleaned_chunk)
            
            print(f"Validation complete: {len(valid_chunks)} valid chunks, {invalid_count} invalid chunks")
            
            # Write cleaned data back
            if valid_chunks:
                with open(data_file, 'w', encoding='utf-8') as f:
                    json.dump(valid_chunks, f, indent=2, ensure_ascii=False)
                print(f"Wrote {len(valid_chunks)} cleaned chunks back to file")
            
            return len(valid_chunks) > 0
            
        except Exception as e:
            print(f"Error validating chunks data: {e}")
            return False

    def _is_valid_file_url(self, url: str) -> bool:
        """
        Check if URL is a valid file URL (not just a website)
        
        Args:
            url: URL to check
            
        Returns:
            bool: True if it's a file URL, False if it's a website
        """
        if not url:
            return False
        
        # Skip obvious website URLs
        if url.startswith(('http://www.', 'https://www.')) and not any(ext in url.lower() for ext in ['.pdf', '.doc', '.xls', '.jpg', '.png']):
            return False
        
        # Skip URLs that don't contain file-like paths
        if url.count('/') < 3:  # Simple domain URLs
            return False
        
        # Check for file-like patterns
        file_indicators = [
            'documents/', 'files/', 'uploads/', 'attachments/',
            '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.jpg', '.png', '.zip'
        ]
        
        return any(indicator in url.lower() for indicator in file_indicators)

    def _download_and_organize_company_documents(self, tenant_id: str, temp_company_docs_dir: str, 
                                               formatted_company_info: str, company_id: str = None) -> List[Dict]:
        """
        Download and organize company documents using centralized mapping
        
        Args:
            tenant_id: Tenant ID
            temp_company_docs_dir: Temporary directory for company documents
            formatted_company_info: Formatted company information for fallback descriptions
            company_id: Optional company ID used when fetching company information
            
        Returns:
            List[Dict]: List of downloaded company documents with metadata
        """
        try:
            # Get structured company information from API
            company_data = self.api_client.get_company_info(tenant_id, company_id)
            company_info = company_data.get("company_info", {})

            raw_analysis_docs = company_data.get("raw_analysis", {})
            
            # Create proper directory structure
            std_docs_dir = os.path.join(temp_company_docs_dir, "Standard_Documents")
            exp_docs_dir = os.path.join(temp_company_docs_dir, "Experience_Documents")
            os.makedirs(std_docs_dir, exist_ok=True)
            os.makedirs(exp_docs_dir, exist_ok=True)
            
            downloaded_company_docs = []
            
            if raw_analysis_docs:
                self.process_logger.info("Processing documents from raw_analysis...")

                seen_files = set()
                for _, ra in raw_analysis_docs.items():
                    self.process_logger.info(f"Raw Analysis Item...{ra}")
                    file_path = ra.get("file_path")
                    if not file_path or not self._is_valid_file_url(file_path):
                        continue
                    if file_path in seen_files:
                        continue
                    seen_files.add(file_path)

                    raw_doc_type = ra.get("doc_type") or ""

                    # Expected format "<type>: <name>"; fall back to the full string when no colon is present
                    type_part, name_part = raw_doc_type.split(":", 1) if ":" in raw_doc_type else (raw_doc_type, raw_doc_type)

                    # Normalize → lowercase, underscores
                    doc_type = re.sub(r'[^a-z0-9]+', '_', type_part.lower()).strip('_')
                    doc_name = re.sub(r'[^a-z0-9]+', '_', name_part.lower()).strip('_')
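                    # e.g. raw_doc_type "Experience: Highway Widening Project" (illustrative)
                    #   -> doc_type "experience", doc_name "highway_widening_project"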

                    # Directory + category mapping
                    if type_part.lower().startswith("experience"):
                        target_dir = exp_docs_dir
                        mapping_config = {"category": "experience"}
                    else:
                        target_dir = std_docs_dir
                        mapping_config = {"category": "standard"}

                    # Use key_info as value
                    key_info_str = json.dumps(ra.get("key_info", {}), ensure_ascii=False)

                    description = ra.get("description", "") or f"{raw_doc_type} document from raw_analysis"

                    doc_info = {
                        "name": doc_name,
                        "type": doc_type,
                        "value": key_info_str,
                        "description": description,
                        "file_url": file_path,
                        "source": "raw_analysis",
                        "mapping_config": mapping_config
                    }

                    saved_info = self._download_and_save_company_document_improved_v2(
                        doc_info, target_dir, formatted_company_info
                    )
                    if saved_info:
                        downloaded_company_docs.append(saved_info)
            else:
                self.process_logger.info("Starting company documents download using centralized mapping...")
                
                # Process documents from details array - ONLY if they have VALID file URLs
                details_list = company_info.get("details", [])
                for i, item in enumerate(details_list):
                    file_url = item.get("file")
                    if not file_url or not self._is_valid_file_url(file_url):  # Skip invalid/website URLs
                        continue
                        
                    doc_name = item.get("name", f"detail_doc_{i+1}")
                    doc_type = item.get("type", "basic_details")
                    doc_value = item.get("value", "")
                    
                    # Get mapping configuration
                    mapping_config = COMPANY_DOCUMENT_MAPPING.get(doc_name, {})
                    
                    doc_info = {
                        "name": doc_name,
                        "type": doc_type,
                        "value": doc_value,
                        "description": item.get("description", ""),
                        "file_url": file_url,
                        "source": "details",
                        "mapping_config": mapping_config
                    }
                    
                    # Use mapping to determine category and target directory
                    category = mapping_config.get("category", "standard")
                    target_dir = exp_docs_dir if category == "experience" else std_docs_dir
                    
                    saved_info = self._download_and_save_company_document_improved_v2(
                        doc_info, target_dir, formatted_company_info
                    )
                    if saved_info:
                        downloaded_company_docs.append(saved_info)
                
                # Process director documents - ONLY if they have VALID files
                director_details = company_info.get("directorDetails", [])
                for i, director in enumerate(director_details):
                    file_url = director.get("file")
                    if not file_url or not self._is_valid_file_url(file_url):  # Skip invalid URLs
                        continue
                        
                    mapping_config = COMPANY_DOCUMENT_MAPPING.get("director_details", {})
                    
                    # Use director name for filename if available
                    director_name = director.get("name", f"director_{i+1}")
                    safe_director_name = re.sub(r'[^\w\s\-_]', '', director_name).strip().replace(' ', '_')
                    
                    doc_info = {
                        "name": f"director_{safe_director_name}_document",
                        "type": "director_details",
                        "value": f"Director: {director.get('name', 'Unknown')}",
                        "description": "",
                        "file_url": file_url,
                        "source": "director_details",
                        "mapping_config": mapping_config,
                        "director_data": director
                    }
                    
                    saved_info = self._download_and_save_company_document_improved_v2(
                        doc_info, std_docs_dir, formatted_company_info
                    )
                    if saved_info:
                        downloaded_company_docs.append(saved_info)
                
                # Process past experience documents - ONLY if they have VALID files
                past_experience = company_info.get("pastExperienceDetails", [])
                for i, experience in enumerate(past_experience):
                    file_url = experience.get("file")
                    if not file_url or not self._is_valid_file_url(file_url):  # Skip invalid URLs
                        continue
                        
                    mapping_config = COMPANY_DOCUMENT_MAPPING.get("past_experience", {})
                    
                    # Use project name for filename if available
                    project_name = experience.get("project", f"project_{i+1}")
                    safe_project_name = re.sub(r'[^\w\s\-_]', '', project_name).strip().replace(' ', '_')[:50]  # Limit length
                    
                    doc_info = {
                        "name": f"experience_{safe_project_name}",
                        "type": "past_experience",
                        "value": f"Project: {experience.get('project', '')}",
                        "description": "",
                        "file_url": file_url,
                        "source": "past_experience",
                        "mapping_config": mapping_config,
                        "experience_data": experience
                    }
                    
                    saved_info = self._download_and_save_company_document_improved_v2(
                        doc_info, exp_docs_dir, formatted_company_info
                    )
                    if saved_info:
                        downloaded_company_docs.append(saved_info)
                
                # Process financial documents - ONLY if they have VALID files
                financial_details = company_info.get("financialDetails", {})
                
                # ITR documents
                itr_details = financial_details.get("itrDetails", [])
                for i, itr in enumerate(itr_details):
                    file_url = itr.get("file")
                    if not file_url or not self._is_valid_file_url(file_url):  # Skip invalid URLs
                        continue
                        
                    mapping_config = COMPANY_DOCUMENT_MAPPING.get("financial_itr", {})
                    itr_year = itr.get('itrYear', f'year_{i+1}')
                    
                    doc_info = {
                        "name": f"ITR_{itr_year}",
                        "type": "financial_itr",
                        "value": f"ITR Year: {itr_year}",
                        "description": "",
                        "file_url": file_url,
                        "source": "financial_itr",
                        "mapping_config": mapping_config,
                        "itr_data": itr
                    }
                    
                    saved_info = self._download_and_save_company_document_improved_v2(
                        doc_info, std_docs_dir, formatted_company_info
                    )
                    if saved_info:
                        downloaded_company_docs.append(saved_info)
                
                # CA Certificate documents
                ca_cert_details = financial_details.get("ca_certificate_details", [])
                for i, ca_cert in enumerate(ca_cert_details):
                    file_url = ca_cert.get("file")
                    if not file_url or not self._is_valid_file_url(file_url):  # Skip invalid URLs
                        continue
                        
                    mapping_config = COMPANY_DOCUMENT_MAPPING.get("financial_ca_cert", {})
                    
                    doc_info = {
                        "name": f"CA_Certificate_{i+1}",
                        "type": "financial_ca_cert",
                        "value": ca_cert.get("ca_certificate_description", ""),
                        "description": "",
                        "file_url": file_url,
                        "source": "financial_ca_cert",
                        "mapping_config": mapping_config,
                        "ca_cert_data": ca_cert
                    }
                    
                    saved_info = self._download_and_save_company_document_improved_v2(
                        doc_info, std_docs_dir, formatted_company_info
                    )
                    if saved_info:
                        downloaded_company_docs.append(saved_info)
                
                # Balance Sheet documents
                balance_sheet_details = financial_details.get("balanceSheetDetails", [])
                for i, balance_sheet in enumerate(balance_sheet_details):
                    file_url = balance_sheet.get("file")
                    if not file_url or not self._is_valid_file_url(file_url):  # Skip invalid URLs
                        continue
                        
                    mapping_config = COMPANY_DOCUMENT_MAPPING.get("financial_balance_sheet", {})
                    bs_year = balance_sheet.get('auditedBalanceSheetYear', f'year_{i+1}')
                    
                    doc_info = {
                        "name": f"Balance_Sheet_{bs_year}",
                        "type": "financial_balance_sheet",
                        "value": f"Balance Sheet Year: {bs_year}",
                        "description": "",
                        "file_url": file_url,
                        "source": "financial_balance_sheet",
                        "mapping_config": mapping_config,
                        "balance_sheet_data": balance_sheet
                    }
                    
                    saved_info = self._download_and_save_company_document_improved_v2(
                        doc_info, std_docs_dir, formatted_company_info
                    )
                    if saved_info:
                        downloaded_company_docs.append(saved_info)
                
                # Certificate documents
                certificates = company_info.get("certificate", [])
                for i, cert in enumerate(certificates):
                    file_url = cert.get("file")
                    if not file_url or not self._is_valid_file_url(file_url):  # Skip invalid URLs
                        continue
                        
                    mapping_config = COMPANY_DOCUMENT_MAPPING.get("certificate", {})
                    cert_desc = cert.get("description", f"Certificate_{i+1}")
                    safe_cert_name = re.sub(r'[^\w\s\-_]', '', cert_desc).strip().replace(' ', '_')[:50]  # Limit length
                    
                    doc_info = {
                        "name": f"Certificate_{safe_cert_name}",
                        "type": "certificate",
                        "value": cert_desc,
                        "description": "",
                        "file_url": file_url,
                        "source": "certificate",
                        "mapping_config": mapping_config,
                        "certificate_data": cert
                    }
                    
                    saved_info = self._download_and_save_company_document_improved_v2(
                        doc_info, std_docs_dir, formatted_company_info
                    )
                    if saved_info:
                        downloaded_company_docs.append(saved_info)
                
            self.process_logger.info(f"Successfully downloaded and organized {len(downloaded_company_docs)} company documents")
            self.process_logger.info(f"Standard documents: {len([d for d in downloaded_company_docs if d['category'] == 'standard'])}")
            self.process_logger.info(f"Experience documents: {len([d for d in downloaded_company_docs if d['category'] == 'experience'])}")
            
            return downloaded_company_docs
            
        except Exception as e:
            self.process_logger.error(f"Failed to download and organize company documents: {e}")
            return []

    def _extract_filename_from_url(self, file_url: str, default_name: str) -> str:
        """
        Extract meaningful filename from URL, fallback to default name
        
        Args:
            file_url: URL of the file
            default_name: Default name to use if extraction fails
            
        Returns:
            str: Meaningful filename
        """
        try:
            import urllib.parse
            parsed_url = urllib.parse.urlparse(file_url)
            path = parsed_url.path
            
            # Extract filename from path
            filename = os.path.basename(path)
            
            # Check if filename is meaningful (not just hash)
            if filename and len(filename) > 3 and not filename.startswith('.'):
                # Clean the filename
                name_part, ext = os.path.splitext(filename)
                # If name part looks like a hash (long alphanumeric), use default
                if len(name_part) > 20 and name_part.replace('-', '').replace('_', '').isalnum():
                    return default_name
                else:
                    return filename
            
            return default_name
            
        except Exception:
            return default_name
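
    # Illustrative behavior of _extract_filename_from_url (hypothetical URLs, for documentation only):
    #   _extract_filename_from_url("https://cdn.example.com/docs/PAN_Certificate.pdf", "pan_certificate")
    #       -> "PAN_Certificate.pdf"   (meaningful filename is kept)
    #   _extract_filename_from_url("https://cdn.example.com/docs/9f2c1a7b4e8d6c3a5f1b2e4d7c9a8b6d.pdf", "pan_certificate")
    #       -> "pan_certificate"       (hash-like name falls back to the supplied default)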

    def _download_and_save_company_document_improved_v2(self, doc_info: Dict, target_dir: str, 
                                                      formatted_company_info: str) -> Dict:
        """
        IMPROVED VERSION - Download and save company document with concise, relevant descriptions
        
        Args:
            doc_info: Document information dictionary with mapping_config
            target_dir: Target directory to save the document
            formatted_company_info: Formatted company information for fallback descriptions
            
        Returns:
            Dict: Simple document information with local path or None if failed
        """
        try:
            file_url = doc_info.get("file_url")
            if not file_url:
                self.process_logger.warning(f"No file URL found for document: {doc_info.get('name', 'unknown')}")
                return None
            
            # Generate meaningful filename - try to extract from URL first
            doc_name = doc_info.get("name", "document")
            url_filename = self._extract_filename_from_url(file_url, doc_name)
            
            # Use URL filename if it's meaningful, otherwise use doc_name
            if url_filename != doc_name and len(url_filename) < 50:
                safe_filename = re.sub(r'[^\w\s\-_\.]', '', url_filename).strip()
            else:
                safe_filename = re.sub(r'[^\w\s\-_]', '', doc_name).strip().replace(' ', '_')
                # Get file extension from URL
                import urllib.parse
                parsed_url = urllib.parse.urlparse(file_url)
                file_extension = os.path.splitext(parsed_url.path)[1]
                if not file_extension:
                    file_extension = '.pdf'
                safe_filename += file_extension
            
            file_path = os.path.join(target_dir, safe_filename)
            desc_path = file_path + ".desc"
            
            # Download the file
            self.process_logger.info(f"Downloading document: {safe_filename}")

            response = requests.get(file_url, timeout=60, stream=True)
            response.raise_for_status()

            # Capture size and mimeType from response
            file_size = 0
            mime_type = response.headers.get('content-type', 'application/pdf')

            # Remove charset and other parameters from mime type
            if ';' in mime_type:
                mime_type = mime_type.split(';')[0].strip()

            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    file_size += len(chunk)

            # If content-length header was present, use it as authoritative size
            if 'content-length' in response.headers:
                try:
                    file_size = int(response.headers['content-length'])
                except (ValueError, TypeError):
                    pass  # Keep the counted size

            # Create CONCISE description using template
            description_content = self._create_concise_description_from_mapping(doc_info)

            # Write .desc file
            with open(desc_path, 'w', encoding='utf-8') as f:
                f.write(description_content)

            # Get category from mapping
            mapping_config = doc_info.get("mapping_config", {})
            category = mapping_config.get("category", "standard")

            self.process_logger.info(f"Successfully downloaded company document: {safe_filename} ({category}), size: {file_size} bytes, mimeType: {mime_type}")

            # Return ultra-simplified structure for API compatibility
            return {
                "local_path": file_path,
                "file_url": file_url,
                "category": category,
                "filename": safe_filename,
                "description": description_content,  # Keep full description for .desc file
                "source": doc_info.get("source", "unknown"),
                "type": doc_info.get("type", "unknown"),
                "size": file_size,
                "mimeType": mime_type
            }
            
        except requests.exceptions.RequestException as e:
            self.process_logger.error(f"Failed to download document {doc_info.get('name', 'unknown')}: Network error - {e}")
            return None
        except Exception as e:
            self.process_logger.error(f"Failed to save document {doc_info.get('name', 'unknown')}: {e}")
            return None

    def _create_concise_description_from_mapping(self, doc_info: Dict) -> str:
        """
        Create CONCISE description using mapping template - NO long company info repetition
        
        Args:
            doc_info: Document information dictionary
            
        Returns:
            str: Concise description using template
        """
        mapping_config = doc_info.get("mapping_config", {})
        
        # Use template from mapping if available
        description_template = mapping_config.get("description_template", "")
        if description_template:
            try:
                # Prepare template variables
                template_vars = {
                    "value": doc_info.get("value", "N/A"),
                    "name": doc_info.get("name", "N/A"),
                    "type": doc_info.get("type", "N/A")
                }
                
                # Add specific data based on document type
                if doc_info.get("director_data"):
                    director = doc_info["director_data"]
                    template_vars.update({
                        "name": director.get("name", "N/A"),
                        "pan": director.get("pan", "N/A"),
                        "din": director.get("din", "N/A")
                    })
                
                elif doc_info.get("experience_data"):
                    experience = doc_info["experience_data"]
                    template_vars.update({
                        "project": experience.get("project", "N/A"),
                        "customer": experience.get("customer", "N/A"),
                        "value": experience.get("projectValue", "N/A")
                    })
                
                elif doc_info.get("itr_data"):
                    itr = doc_info["itr_data"]
                    template_vars.update({
                        "year": itr.get("itrYear", "N/A")
                    })
                
                elif doc_info.get("balance_sheet_data"):
                    bs = doc_info["balance_sheet_data"]
                    template_vars.update({
                        "year": bs.get("auditedBalanceSheetYear", "N/A")
                    })
                
                elif doc_info.get("ca_cert_data"):
                    ca_cert = doc_info["ca_cert_data"]
                    template_vars.update({
                        "description": ca_cert.get("ca_certificate_description", "N/A")
                    })
                
                elif doc_info.get("certificate_data"):
                    cert = doc_info["certificate_data"]
                    template_vars.update({
                        "description": cert.get("description", "N/A")
                    })
                
                # Format the template
                formatted_description = description_template.format(**template_vars)
                return formatted_description
                
            except Exception as e:
                # Fallback if template formatting fails
                self.process_logger.warning(f"Template formatting failed: {e}")
        
        # Fallback: Use simple description
        doc_value = doc_info.get("value", "")
        existing_desc = doc_info.get("description", "").strip()
        
        if existing_desc:
            return existing_desc
        elif doc_value:
            return doc_value
        else:
            return f"Document: {doc_info.get('name', 'Unknown')}"

    # Updated helper method for backward compatibility (SIMPLIFIED)
    def _categorize_company_document_improved(self, doc_info: Dict) -> str:
        """
        Categorize company document using centralized mapping (backward compatibility)
        
        Args:
            doc_info: Document information dictionary
            
        Returns:
            str: Category ("standard" or "experience")
        """
        doc_name = doc_info.get("name", "").lower()
        doc_type = doc_info.get("type", "").lower()
        doc_source = doc_info.get("source", "").lower()
        
        # Check mapping first
        mapping_config = COMPANY_DOCUMENT_MAPPING.get(doc_name) or COMPANY_DOCUMENT_MAPPING.get(doc_type)
        if mapping_config:
            return mapping_config.get("category", "standard")
        
        # Fallback to original logic
        experience_indicators = [
            "past_experience", "experience", "project", "work_order", 
            "completion", "portfolio", "client", "customer", "contract"
        ]
        
        if (doc_source in ["past_experience"] or 
            doc_type in experience_indicators or 
            any(indicator in doc_name for indicator in experience_indicators)):
            return "experience"
        
        return "standard"

    def _convert_to_pdf(self, file_path: str, output_dir: str) -> str:
            """
            Convert various file formats to PDF
            
            Args:
                file_path: Path to the input file
                output_dir: Directory to save the converted PDF
                
            Returns:
                str: Path to the converted PDF file
            """
            try:
                filename = os.path.basename(file_path)
                name_without_ext = os.path.splitext(filename)[0]
                output_path = os.path.join(output_dir, f"{name_without_ext}.pdf")
                
                file_extension = os.path.splitext(file_path)[1].lower()
                
                if file_extension == '.pdf':
                    # Already a PDF, just copy it
                    shutil.copy2(file_path, output_path)
                    return output_path
                    
                elif file_extension in ['.html', '.htm']:
                    # Convert HTML to PDF using pdfkit
                    try:
                        import pdfkit

                        # Set your desired margins (units: mm, cm, in, px)
                        margin_options = {
                            'page-size': 'Letter',
                            'margin-top': '2cm',
                            'margin-right': '2cm',
                            'margin-bottom': '2cm',
                            'margin-left': '2cm'
                        }

                        pdfkit.from_file(file_path, output_path, options=margin_options)

                        self.process_logger.info(f"Converted HTML to PDF using pdfkit: {filename}")
                        return output_path
                    except ImportError:
                        self.process_logger.error(f"pdfkit is not installed; cannot convert HTML to PDF: {filename}")
                        return None
                            
                elif file_extension in ['.xls', '.xlsx']:
                    # Convert Excel to PDF
                    try:
                        import pandas as pd
                        from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
                        from reportlab.lib.pagesizes import letter, A4
                        from reportlab.lib import colors
                        from reportlab.lib.styles import getSampleStyleSheet
                        
                        # Read Excel file
                        excel_file = pd.ExcelFile(file_path)
                        
                        # Create PDF
                        doc = SimpleDocTemplate(output_path, pagesize=A4)
                        elements = []
                        styles = getSampleStyleSheet()
                        
                        for sheet_name in excel_file.sheet_names:
                            df = pd.read_excel(file_path, sheet_name=sheet_name)
                            
                            # Add sheet title
                            elements.append(Paragraph(f"Sheet: {sheet_name}", styles['Heading1']))
                            elements.append(Spacer(1, 12))
                            
                            # Convert DataFrame to table data
                            data = [df.columns.tolist()] + df.fillna('').astype(str).values.tolist()
                            
                            # Limit columns to fit page (max 6 columns)
                            if len(data[0]) > 6:
                                data = [row[:6] + ['...'] if len(row) > 6 else row for row in data]
                            
                            # Create table
                            table = Table(data)
                            table.setStyle(TableStyle([
                                ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
                                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                                ('FONTSIZE', (0, 0), (-1, 0), 8),
                                ('FONTSIZE', (0, 1), (-1, -1), 6),
                                ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                                ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
                                ('GRID', (0, 0), (-1, -1), 1, colors.black)
                            ]))
                            
                            elements.append(table)
                            elements.append(Spacer(1, 20))
                        
                        doc.build(elements)
                        self.process_logger.info(f"Converted Excel to PDF: {filename}")
                        return output_path
                        
                    except Exception as e:
                        self.process_logger.error(f"Failed to convert Excel to PDF {filename}: {e}")
                        return None
                        
                elif file_extension in ['.doc', '.docx']:
                    # Convert Word to PDF
                    try:
                        from docx import Document
                        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
                        from reportlab.lib.pagesizes import letter
                        from reportlab.lib.styles import getSampleStyleSheet
                        
                        # Read Word document
                        doc_word = Document(file_path)
                        
                        # Create PDF
                        doc_pdf = SimpleDocTemplate(output_path, pagesize=letter)
                        elements = []
                        styles = getSampleStyleSheet()
                        
                        for paragraph in doc_word.paragraphs:
                            if paragraph.text.strip():
                                elements.append(Paragraph(paragraph.text, styles['Normal']))
                                elements.append(Spacer(1, 6))
                        
                        doc_pdf.build(elements)
                        self.process_logger.info(f"Converted Word to PDF: {filename}")
                        return output_path
                        
                    except Exception as e:
                        self.process_logger.error(f"Failed to convert Word to PDF {filename}: {e}")
                        return None
                        
                elif file_extension in ['.txt']:
                    # Convert text to PDF
                    try:
                        from reportlab.platypus import SimpleDocTemplate, Paragraph
                        from reportlab.lib.pagesizes import letter
                        from reportlab.lib.styles import getSampleStyleSheet
                        
                        with open(file_path, 'r', encoding='utf-8') as f:
                            text_content = f.read()
                        
                        doc = SimpleDocTemplate(output_path, pagesize=letter)
                        styles = getSampleStyleSheet()
                        
                        # Split text into paragraphs and create PDF
                        paragraphs = text_content.split('\n\n')
                        elements = []
                        
                        for para in paragraphs:
                            if para.strip():
                                elements.append(Paragraph(para.strip(), styles['Normal']))
                        
                        doc.build(elements)
                        self.process_logger.info(f"Converted text to PDF: {filename}")
                        return output_path
                        
                    except Exception as e:
                        self.process_logger.error(f"Failed to convert text to PDF {filename}: {e}")
                        return None
                        
                else:
                    self.process_logger.warning(f"Unsupported file format for conversion: {file_extension}")
                    return None
                    
            except Exception as e:
                self.process_logger.error(f"Failed to convert file to PDF {file_path}: {e}")
                return None
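
    # Usage sketch for _convert_to_pdf (paths are hypothetical):
    #   pdf_path = self._convert_to_pdf("/tmp/company_docs/board_resolution.docx", "/tmp/company_docs/pdf")
    #   -> "/tmp/company_docs/pdf/board_resolution.pdf" on success, or None if conversion fails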


    def _clean_html_file(self, file_path: str) -> None:
        """
        Clean HTML file by removing markdown code block syntax (```html and ```)
        
        Args:
            file_path: Path to the HTML file to clean
        """
        try:
            # Read the file content
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            
            # Remove all ```html and closing ``` (case-insensitive for html)
            content = re.sub(r'```html', '', content, flags=re.IGNORECASE)
            content = re.sub(r'```', '', content)
            
            # Write the cleaned content back
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content.strip())
                
            self.process_logger.info(f"Cleaned HTML file: {os.path.basename(file_path)}")
            
        except Exception as e:
            self.process_logger.warning(f"Failed to clean HTML file {file_path}: {e}")
            # Continue with the original file if cleaning fails
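
    # Sketch of what _clean_html_file strips (content shown inline for illustration):
    #   before cleaning: "```html\n<html><body>Bid letter</body></html>\n```"
    #   after cleaning:  "<html><body>Bid letter</body></html>"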

    def _transform_file_path(self, file_path: str) -> str:
        """
        Transform local file_path into S3 public URL.
        Handles dynamic sub-paths after company_<id>/.
        """

        # Split path
        parts = file_path.split("/")

        # Find the company_x_y part
        company_part = [p for p in parts if p.startswith("company_")]
        if not company_part:
            raise ValueError("Company ID not found in path")

        # Remove "company_" prefix
        company_info = company_part[0].replace("company_", "")

        # Split into company_id and tenant_id
        company_id, tenant_id = company_info.split("_", 1)

        print("Company ID:", company_id)
        print("Tenant ID:", tenant_id)

        # Everything after company_<id>/ is dynamic
        company_index = parts.index(company_part[0])
        dynamic_path = "/".join(parts[company_index + 1 :])

        # Build S3 key
        s3_key = f"documents/{tenant_id}/company_docs/{company_id}/{dynamic_path}"

        # Final URL
        s3_url = f"https://{AWS_S3_BUCKET}.{AWS_REGION}.linodeobjects.com/{s3_key}"

        return s3_url
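
    # Illustrative transformation (local path, bucket and region values are placeholders):
    #   _transform_file_path("/tmp/run_1/company_123_tenantA/standard_docs/PAN_Certificate.pdf")
    #       -> "https://<AWS_S3_BUCKET>.<AWS_REGION>.linodeobjects.com/"
    #          "documents/tenantA/company_docs/123/standard_docs/PAN_Certificate.pdf"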
    
    # Helper method to calculate a SHA-256 hash of a file's contents
    def _calculate_file_hash(self, file_path: str) -> str:
        """
        Calculate SHA-256 hash of file content
        
        Args:
            file_path (str): Path to the file
            
        Returns:
            str: Hexadecimal hash of the file content
        """
        sha256_hash = hashlib.sha256()
        
        # Read and update hash in chunks to handle large files efficiently
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        
        return sha256_hash.hexdigest()

if __name__ == "__main__":
    # Example usage
    service = MinaionsRFPService()
    
    # Get RFP Documents
    # rfp_docs = service.get_rfp_documents("tender_id", "tenant_id")
    
    # Analysis Estimation
    # cost_estimate = service.estimate_analysis_cost("tender_id", "tenant_id")
    
    # RFP Analysis
    # analysis = service.analyze_rfp("analysis_id", "tenant_id")
    
    # Bid Generation Estimation
    # bid_cost = service.estimate_bid_generation_cost("analysis_id", "tenant_id")
    
    # Bid Document Generation
    # bid_docs = service.generate_bid_documents("bid_generation_id", "tenant_id")
    
    # Regenerate Bid Document
    # regenerated = service.regenerate_bid_document("bid_generation_id", "tenant_id", "document_path", "user_prompt")
    
    # Setup Chat
    # chat_setup = service.setup_rfp_chat("analysis_id", "tenant_id")
    
    # Chat with RFP
    # response = service.chat_with_rfp("analysis_id", "tenant_id", "What are the eligibility criteria?")
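
    # Verify integrity of a downloaded document (illustrative local path)
    # file_hash = service._calculate_file_hash("/path/to/downloaded_document.pdf")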