import os
import shutil
import time
import re
import requests
import PyPDF2
import anthropic
import hashlib
from pathlib import Path
from datetime import datetime
from browser_use import Agent, Browser, BrowserConfig
from langchain_anthropic import ChatAnthropic  # chat model wrapper passed to the browser-use Agent
import bid_prep_automation as bpa
import bid_queries as bq

import argparse
import asyncio
import sys
import json

from google import genai
from google.genai import types

# Configuration
DOWNLOAD_DIR = r"D:\Downloads"
BASE_DIR = r"D:\Tenders"
COMPANY_INFO_DOC = "https://docs.google.com/document/d/1sEVXc8RJYzys26fLNwiM-Ctx9349TLbcS2tjYyi_NPQ/edit"
COMPANY_DOCS_DRIVE = "https://drive.google.com/drive/u/2/folders/1XRkNDiWk1dxOWDitr7cQyaDGBzrr6XJj"
# Read API keys from the environment rather than hard-coding secrets in source
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
os.environ["ANONYMIZED_TELEMETRY"] = "false"
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

# Initialize Gemini client
gemini_client = genai.Client(api_key=GEMINI_API_KEY)
# Initialize Claude client
claude_client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

llm_model = "claude"  # switch to "gemini" to use the Gemini client instead

class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

def setup_directories():
    """Create base directories if they don't exist"""
    os.makedirs(BASE_DIR, exist_ok=True)
    return BASE_DIR

async def search_and_download_tenders(keywords):
    """
    Search for tenders with given keywords and download relevant documents
    
    Args:
        keywords (list): List of keywords to search for
        
    Returns:
        dict: Mapping of bid numbers to their download information
    """
    bid_info = {}
    
    # Initialize browser
    browser = Browser(
        config=BrowserConfig(
        # Specify the path to your Chrome executable
        browser_binary_path='C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
    ))

    for keyword in keywords:
        tender_search_agent = Agent(
            task= f"""You are an expert tender application downloader. Your task is to:
            
                    1. Navigate to the GeM bidding portal (https://bidplus.gem.gov.in/all-bids)
                    2. In the "Contains" search field search for bids with the keyword: {keyword}
                    3. For each result:
                       a. Click on the showbidDocument (in front of the BID NO: text) link to download the main tender document
                       b. Check for and download any corrigendum documents, if present.
                       c. Capture the bid number to create a folder/directory with that name.
                """,
            llm=ChatAnthropic(model='claude-3-5-haiku-20241022'),
            browser=browser,
        )
        result = await tender_search_agent.run()
    
    try:
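        # NOTE: the manual-navigation code below assumes a hypothetical synchronous
        # browser wrapper exposing goto/find/click/type helpers. The browser_use
        # Browser object does not provide this API out of the box, so treat this
        # block as an illustrative fallback sketch rather than a working code path.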
        # Navigate to the tender website
        browser.goto("https://bidplus.gem.gov.in/all-bids")
        time.sleep(3)  # Wait for page to load
        
        for keyword in keywords:
            print(f"Searching for keyword: {keyword}")
            
            # Clear any existing search and enter new keyword
            search_box = browser.find("input[placeholder='Search Bid']")
            browser.clear(search_box)
            browser.type(search_box, keyword)
            browser.click("button.search-button")
            time.sleep(5)  # Wait for search results
            
            # Check if any results found
            results = browser.find_all("table tbody tr")
            if not results:
                print(f"No results found for keyword: {keyword}")
                continue
                
            # Process each result
            for result in results:
                # Extract bid number
                bid_number_elem = browser.find_within(result, "td:nth-child(1)")
                if not bid_number_elem:
                    continue
                    
                bid_number = browser.text(bid_number_elem)
                print(f"Found bid: {bid_number}")
                
                if bid_number in bid_info:
                    continue  # Skip if already processed
                
                # Download main tender document
                view_link = browser.find_within(result, "a.bidding-details")
                if view_link:
                    browser.click(view_link)
                    time.sleep(5)  # Wait for details page to load
                    
                    # Download main document
                    download_btn = browser.find("a.downlodaBiddingDoc")
                    if download_btn:
                        browser.click(download_btn)
                        time.sleep(5)  # Wait for download to complete
                        
                        # Check for corrigendum/representation
                        corr_link = browser.find("a:contains('View Corrigendum/Representation')")
                        if corr_link:
                            browser.click(corr_link)
                            time.sleep(3)
                            
                            # Download any available corrigendum documents
                            corr_downloads = browser.find_all("a.downlodaBiddingDoc")
                            for corr_download in corr_downloads:
                                browser.click(corr_download)
                                time.sleep(3)
                    
                    # Record bid info
                    bid_info[bid_number] = {
                        'keyword': keyword,
                        'download_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }
                    
                    # Go back to results
                    browser.go_back()
                    time.sleep(3)
            
            print(f"Completed search for keyword: {keyword}")
    
    except Exception as e:
        print(f"Error during search and download: {str(e)}")
    
    finally:
        # Close the browser
        browser.close()
        
    return bid_info

def get_latest_downloads(minutes=10):
    """
    Get list of recently downloaded files
    
    Args:
        minutes (int): Time window in minutes to consider files as recent
        
    Returns:
        list: List of paths to recently downloaded files
    """
    download_path = Path(DOWNLOAD_DIR)
    now = datetime.now().timestamp()
    
    # Get all PDF files in downloads directory
    downloaded_files = []
    for file_path in download_path.glob("*.pdf"):
        # Check if file was downloaded recently
        file_time = file_path.stat().st_mtime
        if (now - file_time) < (minutes * 60):  # Convert minutes to seconds
            downloaded_files.append(file_path)
    
    return downloaded_files

def organize_files(bid_number, downloaded_files):
    """
    Organize downloaded files into bid-specific directories
    
    Args:
        bid_number (str): Bid number to create directory for
        downloaded_files (list): List of downloaded file paths
        
    Returns:
        tuple: (bid_dir, moved_files) - path to the bid directory and the list of moved file paths
    """
    # Create bid directory
    bid_dir = os.path.join(BASE_DIR, bid_number)
    os.makedirs(bid_dir, exist_ok=True)
    
    # Move downloaded files to bid directory
    moved_files = []
    for file_path in downloaded_files:
        dest_path = os.path.join(bid_dir, file_path.name)
        shutil.move(str(file_path), dest_path)
        moved_files.append(dest_path)
        
    print(f"Moved {len(moved_files)} files to {bid_dir}")
    return bid_dir, moved_files

def extract_links_from_pdf(pdf_path):
    """Extract embedded hyperlinks from PDF files, skipping specific display text"""
    # Define the display texts to skip
    skip_phrases = [
        'attached categories',
        'General Terms and Conditions',
        'Service Level Agreement'
    ]
    
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            # Initialize list to store all links
            links = []

            # Extract links from each page
            for page_num, page in enumerate(pdf_reader.pages):
                # Check if we're in a disclaimer section - simple text-based check
                page_text = page.extract_text().lower()
                if "disclaimer" in page_text:
                    continue  # Skip disclaimer sections

                # Access annotations (which include hyperlinks)
                if '/Annots' in page:
                    annotations = page['/Annots']
                    if annotations:
                        # Process each annotation
                        for annotation in annotations:
                            annotation_object = annotation.get_object()
                            # Check if it's a link annotation
                            if annotation_object.get('/Subtype') == '/Link':
                                # Extract the actual URL
                                if '/A' in annotation_object and '/URI' in annotation_object['/A']:
                                    uri = annotation_object['/A']['/URI']
                                    if isinstance(uri, str):
                                        # Check if this link should be skipped based on its display text
                                        skip_link = False
                                        
                                        # Try to get the display text from the annotation
                                        display_text = None
                                        
                                        # Look for text in various annotation properties
                                        if '/Contents' in annotation_object:
                                            display_text = annotation_object['/Contents']
                                        elif '/T' in annotation_object:
                                            display_text = annotation_object['/T']
                                        elif '/TU' in annotation_object:
                                            display_text = annotation_object['/TU']
                                        
                                        # If we still don't have display text, try to extract it from the annotation's text
                                        if not display_text and '/Rect' in annotation_object:
                                            try:
                                                # Get text in the annotation rectangle
                                                rect = annotation_object['/Rect']
                                                # Extract text from the page and look for text near the rectangle
                                                page_text = page.extract_text()
                                                # This is a simplified approach - in practice, you might need more sophisticated text extraction
                                                display_text = page_text
                                            except:
                                                pass
                                        
                                        # Check if display text contains any skip phrases
                                        if display_text and isinstance(display_text, str):
                                            display_text_lower = display_text.lower()
                                            for skip_phrase in skip_phrases:
                                                if skip_phrase.lower() in display_text_lower:
                                                    skip_link = True
                                                    break
                                        
                                        # Only add the link if it shouldn't be skipped
                                        if not skip_link:
                                            links.append({
                                                'url': uri,
                                                'page': page_num + 1
                                            })

                # Alternative extraction path, guarded because get_links() is not available in every PyPDF2/pypdf version
                try:
                    page_links = page.get_links()
                    for link in page_links:
                        if hasattr(link, 'url') and link.url:
                            # Check if this link should be skipped
                            skip_link = False
                            
                            # Try to get display text for this link
                            if hasattr(link, 'text'):
                                display_text = link.text
                                if display_text and isinstance(display_text, str):
                                    display_text_lower = display_text.lower()
                                    for skip_phrase in skip_phrases:
                                        if skip_phrase.lower() in display_text_lower:
                                            skip_link = True
                                            break
                            
                            # Only add the link if it shouldn't be skipped
                            if not skip_link:
                                links.append({
                                    'url': link.url,
                                    'page': page_num + 1
                                })
                                
                except (AttributeError, TypeError):
                    # get_links method not available or failed
                    pass

        # Fall back to text extraction for visible URLs if no embedded links found
        if not links:
            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()

                # Skip disclaimer sections
                if "disclaimer" in page_text.lower():
                    continue

                # Extract URLs from text as a fallback
                # Split text into lines to analyze context
                lines = page_text.split('\n')
                
                for i, line in enumerate(lines):
                    urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', line)
                    for url in urls:
                        # Check if the line or surrounding lines contain skip phrases
                        skip_url = False
                        
                        # Check current line and previous/next lines for context
                        context_lines = []
                        if i > 0:
                            context_lines.append(lines[i-1])
                        context_lines.append(line)
                        if i < len(lines) - 1:
                            context_lines.append(lines[i+1])
                        
                        context_text = ' '.join(context_lines).lower()
                        
                        for skip_phrase in skip_phrases:
                            if skip_phrase.lower() in context_text:
                                skip_url = True
                                break
                        
                        # Only add the URL if it shouldn't be skipped
                        if not skip_url:
                            links.append({
                                'url': url,
                                'page': page_num + 1,
                                'note': 'Extracted from text (not embedded)'
                            })

        # Format the results
        formatted_links = []
        for link in links:
            formatted_links.append(f"{link['url']}")

        if not formatted_links:
            print("No links found in the PDF document")
            return []

        return formatted_links

    except Exception as e:
        print(f"Error extracting links from PDF: {str(e)}")
        return []

def download_linked_documents(bid_dir, links):
    """
    Download additional documents from links found in tender documents
    
    Args:
        bid_dir (str): Directory to save downloaded files to
        links (list): List of links to download
        
    Returns:
        list: List of paths to downloaded files
    """
    downloaded_files = []
    
    for link in links:
        try:
            # Skip if not a web URL (simple check)
            if not link.startswith(('http://', 'https://')):
                continue
                
            # Extract filename from URL
            filename = os.path.basename(link)
            if not filename.endswith('.pdf'):
                filename = f"linked_doc_{len(downloaded_files)+1}.pdf"
                
            # Download the file, streaming the body to disk in chunks
            response = requests.get(link, stream=True, timeout=60)
            if response.status_code == 200:
                file_path = os.path.join(bid_dir, filename)
                
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                    
                downloaded_files.append(file_path)
                print(f"Downloaded {filename} from {link}")
            else:
                print(f"Failed to download {link}, status code: {response.status_code}")
                
        except Exception as e:
            print(f"Error downloading {link}: {str(e)}")
    
    return downloaded_files

def calculate_file_hash(file_path):
    """
    Calculate SHA-256 hash of file content
    
    Args:
        file_path (Path): Path to the file
        
    Returns:
        str: Hexadecimal hash of the file content
    """
    sha256_hash = hashlib.sha256()
    
    # Read and update hash in chunks to handle large files efficiently
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    
    return sha256_hash.hexdigest()

def remove_duplicate_pdfs(directory_path):
    """
    Remove duplicate PDF files from a directory based on content.
    For each set of identical files, the first one found is kept and others are removed.
    
    Args:
        directory_path (str): Path to directory containing PDF files
        
    Returns:
        tuple: (kept_files, removed_files) lists of filenames
    """
    print(f"🔍 Checking for duplicate PDFs in: {directory_path}")
    
    # Get all PDF files in the directory
    pdf_files = list(Path(directory_path).glob("*.pdf"))
    
    if not pdf_files:
        print("No PDF files found")
        return [], []
    
    # Dictionary to store hash -> [file_paths]
    hash_map = {}
    
    # Calculate hash for each file and group by hash
    for file_path in pdf_files:
        print(f"Analyzing: {file_path.name}")
        file_hash = calculate_file_hash(file_path)
        
        if file_hash in hash_map:
            hash_map[file_hash].append(file_path)
        else:
            hash_map[file_hash] = [file_path]
    
    # Keep track of which files were kept and which were removed
    kept_files = []
    removed_files = []
    
    # Process each group of files with the same hash
    for file_hash, file_paths in hash_map.items():
        # Keep the first file
        kept_file = file_paths[0]
        kept_files.append(kept_file)
        
        # Remove all duplicates
        for duplicate in file_paths[1:]:
            try:
                os.remove(duplicate)
                removed_files.append(duplicate)
                print(f"Removed duplicate: {duplicate.name} (same as {kept_file.name})")
            except Exception as e:
                print(f"Error removing {duplicate.name}: {str(e)}")
    
    print(f"Kept {len(kept_files)} unique files, removed {len(removed_files)} duplicates")
    return [f.name for f in kept_files], [f.name for f in removed_files]

def list_files_in_directory(directory_path):
    """
    Create a list of all files in the specified directory with their full paths.
    
    Args:
        directory_path (str): Path to the directory to scan
        
    Returns:
        list: List of full paths to all files in the directory
    """
    # Convert to Path object for easier handling
    dir_path = Path(directory_path)
    
    # Check if the directory exists
    if not dir_path.exists():
        print(f"Directory does not exist: {directory_path}")
        return []
    
    if not dir_path.is_dir():
        print(f"Path is not a directory: {directory_path}")
        return []
    
    # List to store the full paths
    file_paths = []
    
    # Iterate through all items in the directory
    for item in dir_path.iterdir():
        # Only include files, not directories
        if item.is_file():
            # Add the full path as a string
            file_paths.append(str(item.absolute()))
    
    print(f"Found {len(file_paths)} files in {directory_path}")
    return file_paths

def extract_text_from_pdf(pdf_path):
    """
    Extract text content from PDF files
    
    Args:
        pdf_path (str): Path to the PDF file
        
    Returns:
        str: Extracted text content
    """
    text = ""
    
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text += page.extract_text() + "\n\n"
    
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_path}: {str(e)}")
    
    return text

def count_total_words(documents_text):
    """
    Count the total number of words in all PDF text content stored in documents_text dictionary
    
    Args:
        documents_text (dict): Dictionary mapping file paths to their extracted text content
        
    Returns:
        int: Total word count across all documents
    """
    # Initialize word count
    total_word_count = 0
    
    # Iterate through each document's text content
    for pdf_path, text_content in documents_text.items():
        # Skip if the text content is empty or None
        if not text_content:
            continue
            
        # Split the text into words and count them
        words = text_content.split()
        word_count = len(words)
        
        # Add to the total count
        total_word_count += word_count
        
        # Optionally print individual document word counts
        print(f"Document: {os.path.basename(pdf_path)} - {word_count} words")
    
    print(f"\nTotal words across all documents: {total_word_count}")
    print("\n" + "="*50)
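    # 0.00425 below is an assumed per-word analysis cost (in ₹) used only for a rough estimate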
    print(color.BOLD + color.BLUE + f"Estimated cost of RFP Analysis: ₹{round(total_word_count*0.00425, 2)}" + color.END)
    print("="*50)
    return total_word_count

def analyze_tender_with_LLM(documents_text):
    """
    Analyze tender documents with Claude LLM
    
    Args:
        documents_text (dict): Dictionary mapping file paths to their text content
        
    Returns:
        dict: Extracted information from tender documents
    """
    # Information to extract
    info_to_extract = [
        "Eligibility/Qualification Criteria or conditions for bidder",
        "Evaluation criteria or method",
        "Documents needed to submit the bid",
        "Scope of work of the whole project",
        "Amount of EMD fee",
        "Relaxation or preference given to any kind of company or bidder",
        "Payment terms",
        "BOQ requirements",
        "Annexures or forms or formats"
    ]
    
    # Initialize the result dictionary with empty strings
    extracted_info = {item: "" for item in info_to_extract}
    
    # Process all documents, not just the main one
    # First, calculate the total text size to determine chunking strategy
    total_text_size = sum(len(text) for text in documents_text.values())
    print(f"Total text size across all documents: {total_text_size} characters")
    
    # Approach: Process each document separately and then combine the results
    doc_analyses = []
    
    for doc_path, doc_text in documents_text.items():
        doc_name = os.path.basename(doc_path)
        print(f"Analyzing document: {doc_name} ({len(doc_text)} characters)")
        
        # Skip empty documents
        if not doc_text.strip():
            print(f"Skipping empty document: {doc_name}")
            continue
        
        # Create chunks based on document size
        chunk_size = 50000  # Adjust based on LLM's token limits
        doc_chunks = []
        
        if len(doc_text) > chunk_size:
            # Split into chunks, but try to break at paragraph boundaries
            start = 0
            while start < len(doc_text):
                end = start + chunk_size
                
                # Adjust to end at paragraph boundary if possible
                if end < len(doc_text):
                    # Look for double newline (paragraph break) before the cutoff
                    paragraph_end = doc_text.rfind('\n\n', start, end)
                    # If found and not too far from the chunk size, use it
                    if paragraph_end > start + (chunk_size * 0.7):
                        end = paragraph_end
                    else:
                        # Otherwise look for single newline
                        line_end = doc_text.rfind('\n', start, end)
                        if line_end > start + (chunk_size * 0.8):
                            end = line_end
                
                # Add the chunk
                doc_chunks.append(doc_text[start:end])
                start = end
        else:
            # Document is small enough to process in one chunk
            doc_chunks = [doc_text]
        
        print(f"Split document into {len(doc_chunks)} chunks")
        
        # Process each chunk with Claude
        for chunk_idx, chunk in enumerate(doc_chunks):
            prompt = f"""
            You are analyzing tender documents. I'll provide you with a chunk ({chunk_idx+1}/{len(doc_chunks)}) 
            from the document {doc_name}.
            
            Please extract the following information if present in this chunk:
            1. Eligibility/Qualification Criteria or conditions for bidder
            2. Evaluation criteria or method
            3. Documents needed to submit the bid 
            4. Scope of work of the whole project
            5. Amount of EMD fee
            6. Relaxation or preference given to any kind of company or bidder
            7. Payment terms
            8. BOQ requirements
            9. Annexures or forms or formats
            
            For each category, provide the exact text from the document. If the information isn't in this chunk, 
            just say "Not found in this chunk." Please structure your response clearly with appropriate headers
            for each section. For evaluation criteria or method, extract the complete scoring table if specified.
            For Annexures or specific document formats, separate all annexures found with their names or numbers.
            
            Here is the document chunk:
            {chunk}
            """
            
            try:
                if llm_model == 'gemini':
                    response = gemini_client.models.generate_content(
                        model="gemini-2.5-flash-preview-04-17", #"gemini-2.0-flash",
                        contents=[prompt],
                        config=types.GenerateContentConfig(
                            system_instruction="You are an expert in analyzing tender documents. Extract the requested information accurately.",
                            max_output_tokens=6000,
                            temperature=0.1
                        )
                    )
                    response_text = response.text

                elif llm_model == 'claude':
                    # Call Claude API
                    response = claude_client.messages.create(
                        model="claude-3-7-sonnet-latest", #claude-3-5-haiku-20241022
                        max_tokens=6000,
                        temperature=0,
                        system="You are an expert in analyzing tender documents. Extract the requested information accurately.",
                        messages=[
                            {"role": "user", "content": prompt}
                        ]
                    )
                    
                    # Get the response text
                    response_text = response.content[0].text
                
                # Save this analysis
                doc_analyses.append({
                    "doc_name": doc_name,
                    "chunk_idx": chunk_idx,
                    "response": response_text
                })
                
            except Exception as e:
                print(f"Error analyzing chunk {chunk_idx+1} with Claude: {str(e)}")
    
    # Now process all the analysis responses to extract the information
    # Use a more robust approach to extract information from Claude's responses
    for analysis in doc_analyses:
        response_text = analysis["response"]
        doc_name = analysis["doc_name"]
        chunk_idx = analysis["chunk_idx"]
        
        print(f"Processing analysis of {doc_name} (chunk {chunk_idx+1})")
        
        # Process each category of information to extract
        for idx, item in enumerate(info_to_extract):
            # Possible section headers Claude might use
            section_markers = [
                f"{idx+1}. {item}",  # 1. Eligibility/Qualification Criteria
                f"## {idx+1}. {item}",  # ## 1. Eligibility/Qualification Criteria 
                f"**{idx+1}. {item}**",  # **1. Eligibility/Qualification Criteria**
                f"#{idx+1} {item}",  # #1 Eligibility/Qualification Criteria
                f"{item}:",  # Eligibility/Qualification Criteria:
                f"**{item}**",  # **Eligibility/Qualification Criteria**
                f"## {item}",  # ## Eligibility/Qualification Criteria
                f"### {item}",  # ### Eligibility/Qualification Criteria
                item  # Plain text
            ]
            
            # Find the section
            section_start = -1
            used_marker = ""
            
            for marker in section_markers:
                pos = response_text.find(marker)
                if pos != -1:
                    section_start = pos
                    used_marker = marker
                    break
            
            if section_start == -1:
                # Section not found
                continue
            
            # Find the end of this section (start of next section or end of response)
            section_end = len(response_text)
            
            # Check where the next section starts
            for next_idx, next_item in enumerate(info_to_extract):
                if next_idx <= idx:  # Skip current and previous sections
                    continue
                
                # Check all possible markers for the next section
                for marker in [
                    f"{next_idx+1}. {next_item}", 
                    f"## {next_idx+1}. {next_item}", 
                    f"**{next_idx+1}. {next_item}**",
                    f"#{next_idx+1} {next_item}",
                    f"{next_item}:", 
                    f"**{next_item}**",
                    f"## {next_item}",
                    f"### {next_item}",
                    next_item
                ]:
                    next_pos = response_text.find(marker, section_start)
                    if next_pos != -1 and next_pos < section_end:
                        section_end = next_pos
                        break
            
            # Extract the section content
            section_content = response_text[section_start + len(used_marker):section_end].strip()
            
            # Skip if the content indicates "not found"
            if any(phrase in section_content.lower() for phrase in [
                "not found in this chunk", 
                "not mentioned in this chunk",
                "no information found",
                "not provided in this chunk",
                "not specified in this chunk"
            ]):
                continue
            
            # Add the extracted content to the result
            if section_content:
                # If we already have content for this item, add a separator
                if extracted_info[item]:
                    extracted_info[item] += f"\n\n--- From {doc_name} (chunk {chunk_idx+1}) ---\n"
                else:
                    extracted_info[item] += f"--- From {doc_name} (chunk {chunk_idx+1}) ---\n"
                
                extracted_info[item] += section_content
    
    # Final cleanup - remove any empty sections and format for readability
    for item in info_to_extract:
        if not extracted_info[item]:
            extracted_info[item] = "Not found in any document"
        else:
            # Clean up formatting and remove duplicative information
            lines = extracted_info[item].split('\n')
            cleaned_lines = []
            seen_content = set()
            
            for line in lines:
                # Skip empty lines and source markers at this stage
                if not line.strip() or line.strip().startswith('---'):
                    cleaned_lines.append(line)
                    continue
                
                # Normalize and hash the line for deduplication
                normalized = ' '.join(line.lower().split())
                if normalized not in seen_content and len(normalized) > 5:
                    seen_content.add(normalized)
                    cleaned_lines.append(line)
            
            # Combine back while preserving source markers
            extracted_info[item] = '\n'.join(cleaned_lines)
    
    # Print a summary of what was found
    found_items = [item for item, content in extracted_info.items() if content != "Not found in any document"]
    print(f"Successfully extracted information for {len(found_items)} categories:")
    for item in found_items:
        content_preview = extracted_info[item].split('\n', 1)[0]
        print(f"- {item}: {content_preview[:50]}...")
    
    return extracted_info, doc_analyses

def save_extracted_info(bid_dir, extracted_info):
    """
    Save extracted information to a file
    
    Args:
        bid_dir (str): Directory to save the file to
        extracted_info (dict): Extracted information to save
        
    Returns:
        tuple: (json_file_path, eligibility_text) - path to the saved JSON report and the extracted eligibility section
    """
    output_path = os.path.join(bid_dir, "tender_analysis")
    os.makedirs(output_path, exist_ok=True) # Create the directory. If the target directory already exists, do not raise an exception.
    output_text_file = os.path.join(output_path, "tender_analysis.txt")
    output_json_file = os.path.join(output_path, "tender_analysis.json")

    # Write to a JSON file
    with open(output_json_file, "w") as file:
        json.dump(extracted_info, file, indent=4)
    
    # Write to a TXT file
    eligibility = ""  # captured below so the return value is defined even if no Eligibility section exists
    with open(output_text_file, 'w', encoding='utf-8') as f:
        f.write("TENDER ANALYSIS REPORT\n")
        f.write("=" * 50 + "\n\n")
        
        for category, info in extracted_info.items():
            f.write(f"{category}\n")
            f.write("-" * len(category) + "\n")
            f.write(info.strip() or "Not found in the documents")
            f.write("\n\n" + "=" * 50 + "\n\n")
            if "Eligibility" in category:
                eligibility = info.strip()
    
    print(f"Saved extracted information to {output_text_file}")
    return output_json_file, eligibility

def get_company_info():
    """
    Get company information from Google Docs
    
    Returns:
        str: Company information text
    """
    # In a real implementation, this would use the Google Docs API
    # For now, we'll simulate this with a placeholder
    print(f"Getting company information from {COMPANY_INFO_DOC}")
    
    # This is a placeholder - in a real implementation, you would:
    # 1. Authenticate with Google
    # 2. Use the Docs API to get the document content
    # 3. Parse and return the content
    
    # For demo purposes, let's return a sample company info
    return """
    Yugasa Company Information for Government tendering:

Company Name: Yugasa Software Labs Pvt Ltd
Office addresses: 
Gurgaon Address: Yugasa Software Labs, 3rd floor, Tower B, Unitech Cyber Park, Sector 39, Gurgaon 122001, Haryana
Lucknow Address: Yugasa Software Labs, 3rd floor, TC-14, Vibhuti Khand, Gomti Nagar, Lucknow, Uttar Pradesh 226010
US Address: Yugasa Software LLC, 370 Campus Drive, Somerset, New Jersey 08873

Company registration:
Yugasa Software Labs Pvt Ltd is a legal entity in India registered under Indian Companies Act, 2013. Registered as Private Limited Company with Registrar of Companies, Delhi.
The CIN of the company is U72900HR2015PTC056837

PAN of Yugasa: AAACY7582J

Certifications:
CMMI 3
ISO 27001:2022
ISO 9001:2015

Valid GST registration. GST Number of Yugasa: 06AAACY7582J1ZU

Yugasa is the official Meta Business Partner as ISV solution provider for WhatsApp.

Turnover of previous years:

2024-25: INR 3.52 Crores
2023-24: INR 3.29 Crores
2022-23: INR 3.19 Crores
2021-22: INR 3.35 Crores
2020-21: INR 2.18 Crores

Yugasa Software Labs Pvt Ltd is not barred or blacklisted by any PSU, government department, or private sector entity.

Yugasa Software Labs Pvt Ltd is an MSME and a registered Startup.

Manpower on Yugasa’s payroll:
Currently Yugasa has 40 employees on its payroll.
    """

def check_eligibility(extracted_info, company_info):
    """
    Check if the company is eligible for the bid
    
    Args:
        extracted_info (dict): Extracted tender information
        company_info (str): Company information
        
    Returns:
        tuple: (is_eligible, reason)
    """
    # Prepare prompt for Claude to assess eligibility
    eligibility_criteria = extracted_info.get("Eligibility/Qualification Criteria or conditions for bidder", "")
    
    prompt = f"""
    You need to determine if the company is eligible to apply for a tender based on the eligibility criteria and company information.
    
    Eligibility Criteria:
    {eligibility_criteria}
    
    Company Information:
    {company_info}
    
    Please analyze if the company meets all the eligibility criteria. Return your answer in the following format:
    
    Eligible: [Yes/No]
    Reason: [Detailed explanation of why the company is eligible or not]
    Missing Requirements: [List any requirements the company doesn't meet, if applicable]
    """
    
    try:
        if llm_model == 'gemini':
            response = gemini_client.models.generate_content(
                model="gemini-2.5-flash-preview-04-17", #"gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(
                    system_instruction="You are an expert in tender eligibility assessment. Be thorough and accurate in your analysis.",
                    max_output_tokens=2000,
                    temperature=0.1
                )
            )
            response_text = response.text

        elif llm_model == 'claude':
            # Call Claude API
            response = claude_client.messages.create(
                model="claude-3-7-sonnet-latest",
                max_tokens=2000,
                temperature=0,
                system="You are an expert in tender eligibility assessment. Be thorough and accurate in your analysis.",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            
            # Parse response
            response_text = response.content[0].text
        
        # Extract eligibility decision
        is_eligible = "eligible: yes" in response_text.lower()
        
        # Extract reason
        reason_match = re.search(r'Reason:\s*(.*?)(?:\n\n|\n[A-Z]|$)', response_text, re.DOTALL)
        reason = reason_match.group(1).strip() if reason_match else "No detailed reason provided"
        
        return is_eligible, reason
        
    except Exception as e:
        print(f"Error checking eligibility with Claude: {str(e)}")
        return False, f"Error during eligibility check: {str(e)}"
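
# The helper below is a hedged sketch of how get_company_info() above could pull the
# live document text via the Google Docs API instead of returning the hard-coded sample.
# It assumes a service-account credentials file (the "service_account.json" filename is
# a placeholder) plus the google-api-python-client and google-auth packages; it is not
# wired into the workflow in this script.
def fetch_company_info_from_gdocs(document_id, service_account_file="service_account.json"):
    """Fetch plain text from a Google Doc using the Docs API (illustrative sketch)."""
    # Local imports keep these optional dependencies out of the module-level imports
    from google.oauth2 import service_account
    from googleapiclient.discovery import build

    creds = service_account.Credentials.from_service_account_file(
        service_account_file,
        scopes=["https://www.googleapis.com/auth/documents.readonly"],
    )
    docs_service = build("docs", "v1", credentials=creds)
    document = docs_service.documents().get(documentId=document_id).execute()

    # Walk the document body and collect the text runs from each paragraph
    text_parts = []
    for element in document.get("body", {}).get("content", []):
        paragraph = element.get("paragraph")
        if not paragraph:
            continue
        for run in paragraph.get("elements", []):
            text_run = run.get("textRun")
            if text_run and "content" in text_run:
                text_parts.append(text_run["content"])
    return "".join(text_parts)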

def create_tender_docs_directory(bid_dir):
    """
    Create directory for tender documents if eligible
    
    Args:
        bid_dir (str): Bid directory
        
    Returns:
        str: Path to the created directory
    """
    tender_docs_dir = os.path.join(bid_dir, "tender_documents")
    os.makedirs(tender_docs_dir, exist_ok=True)
    print(f"Created tender documents directory: {tender_docs_dir}")
    return tender_docs_dir

def process_tender(keywords):
    """
    Process tender search and analysis workflow
    
    Args:
        keywords (list): Keywords to search for
        
    Returns:
        list: List of processed bid numbers
    """
    # Setup directories
    base_dir = setup_directories()
    
    # Search and download tenders
    bid_info = asyncio.run(search_and_download_tenders(keywords))
    
    processed_bids = []
    
    # Process each bid
    for bid_number, info in bid_info.items():
        print(f"\nProcessing bid: {bid_number}")
        
        # Get recently downloaded files
        downloaded_files = get_latest_downloads()
        
        # Organize files into bid directory
        bid_dir, moved_files = organize_files(bid_number, downloaded_files)
        
        # Process main document to find additional links
        main_doc = next((f for f in moved_files if "GeM-Bidding-" in f), None)
        if main_doc:
            # Extract links from the main document
            links = extract_links_from_pdf(main_doc)
            
            #########################################
            # Will be done by playwright script
            #########################################
            # Download additional documents
            # additional_docs = download_linked_documents(bid_dir, links) #Will be done by playwright script
            # moved_files.extend(additional_docs)

        kept, removed = remove_duplicate_pdfs(bid_dir)
        print(f"Kept files: {kept}")
        print(f"Removed Duplicate files: {removed}")

        moved_files = list_files_in_directory(bid_dir)

        # Extract text from all documents
        documents_text = {}
        for doc_path in moved_files:
            text = extract_text_from_pdf(doc_path)
            documents_text[doc_path] = text
        
        # After extracting text from all PDFs and storing in documents_text dictionary
        word_count = count_total_words(documents_text)

        # Analyze documents with Claude
        extracted_info, docs_analysis = analyze_tender_with_LLM(documents_text)
        
        # Save extracted information
        analysis_file, eligibility = save_extracted_info(bid_dir, extracted_info)
        
        # Get company information
        company_info = get_company_info()
        
        # Check eligibility
        is_eligible, reason = check_eligibility(extracted_info, company_info)
        
        # Create tender documents directory if eligible
        if is_eligible:
            tender_docs_dir = create_tender_docs_directory(bid_dir)
            print(f"Company is eligible for bid {bid_number}. Reason: {reason}")
        else:
            print(f"Company is NOT eligible for bid {bid_number}. Reason: {reason}")
        
        processed_bids.append(bid_number)
    
    return processed_bids

def process_tender_from_dir(bid_dir):
    """
    Process tender analysis workflow
    
    Args:
        bid_dir (str): Directory path containing RFP documents 
        
    Returns:
        list: List of processed bid numbers
    """
    
    processed_bids = []
    
    # Process the bid
    print(f"\nProcessing bid from: {bid_dir}")

    kept, removed = remove_duplicate_pdfs(bid_dir)
    print(f"Kept files: {kept}")
    print(f"Removed Duplicate files: {removed}")

    moved_files = list_files_in_directory(bid_dir)

    # Extract text from all documents
    documents_text = {}
    for doc_path in moved_files:
        text = extract_text_from_pdf(doc_path)
        documents_text[doc_path] = text
    
    # After extracting text from all PDFs and storing in documents_text dictionary
    word_count = count_total_words(documents_text)

    # Analyze documents with Claude
    extracted_info, docs_analysis = analyze_tender_with_LLM(documents_text)
    
    # Save extracted information
    analysis_file, eligibility = save_extracted_info(bid_dir, extracted_info)
    
    # Get company information
    company_info = get_company_info()
    
    # Check eligibility
    is_eligible, reason = check_eligibility(extracted_info, company_info)
    
    # Create tender documents directory if eligible
    if is_eligible:
        tender_docs_dir = create_tender_docs_directory(bid_dir)
        print(f"🚀 Company is eligible for the RFP {bid_dir}. Reason: {reason}")
    else:
        print(f"⚠️ Company is NOT eligible for RFP {bid_dir}. Reason: {reason}")
    
    processed_bids.append(bid_dir)
    
    return processed_bids

def main():
    """Main function to run the tender processing workflow"""
    # Set up argument parser
    parser = argparse.ArgumentParser(description='Tender Processing Tool')
    parser.add_argument('--rfp-docs-dir', type=str, help='Directory path containing RFP documents')
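    # Example invocation (script name and path are illustrative):
    #   python tender_automation.py --rfp-docs-dir "D:\Tenders\<BID_NUMBER>"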
    
    # Parse arguments
    args = parser.parse_args()
    
    # If directory path is provided as argument, directly process the tenders
    if args.rfp_docs_dir:
        print("\n" + "="*50)
        print(f"🔄 Processing tenders from directory: {args.rfp_docs_dir}")
        print("="*50)
        processed_bids = process_tender_from_dir(args.rfp_docs_dir)
        
        print(f"\n✅ Completed processing {len(processed_bids)} bids:")
        for bid in processed_bids:
            print(f"- {bid}")
    else:
        # Ask user what they want to do
        while True:
            print("\nWhat would you like to do?")
            print("1. Search for new tenders")
            print("2. Process existing RFP documents")
            print("3. Prepare bid documents")
            print("4. Prepare bid documentation RAG chunks")
            
            choice = input("Enter your choice (1, 2, 3 or 4): ").strip()
            
            if choice == '1':
                # Get keywords from user
                keywords_input = input("Enter comma-separated keywords: ").strip()
                
                # Process keywords
                if keywords_input:
                    keywords = [keyword.strip() for keyword in keywords_input.split(',')]
                    print(f"Searching for tenders with keywords: {keywords}")
                    
                    # Run the full search, download and analysis workflow
                    processed_bids = process_tender(keywords)
                    print(f"Search completed. Processed {len(processed_bids)} bids.")
                else:
                    print("❌ No keywords provided. Please try again.")
                    continue
                break
                
            elif choice == '2':
                # Get directory path from user
                rfp_docs_dir = input("Enter the path to the directory containing RFP documents: ").strip()
                
                if rfp_docs_dir:
                    print("\n" + "="*50)
                    print(f"🔄 Processing tenders from directory: {rfp_docs_dir}")
                    print("="*50)
                    processed_bids = process_tender_from_dir(rfp_docs_dir)
                    
                    print(f"\n✅ Completed processing {len(processed_bids)} bids:")
                    for bid in processed_bids:
                        print(f"- {bid}")
                else:
                    print("❌ No directory path provided. Please try again.")
                    continue
                break
                
            elif choice == '3':
                # Get directory path from user
                rfp_docs_dir = input("Enter the path to the directory containing RFP documents: ").strip()
                std_company_docs = input("Enter the path to the directory containing Company documents: ").strip()
                
                if rfp_docs_dir:
                    print(f"Preparing bid documents for RFP documents from directory: {rfp_docs_dir}")
                    final_docs_dir = bpa.prepare_bid_documents(rfp_docs_dir, std_company_docs, get_company_info())
                    
                    print(f"\nCompleted preparing documents at: {final_docs_dir}")
                else:
                    print("❌ No directory path provided. Please try again.")
                    continue
                break

            elif choice == '4':
                # Get directory path from user
                rfp_docs_dir = input("Enter the path to the directory containing RFP documents: ").strip()
                
                if rfp_docs_dir:
                    print(f"Preparing for bid documentation RAG from: {rfp_docs_dir}")
                    status = bq.process_task(rfp_docs_dir)
                    
                    print(f"\nCompleted preparing chunks at: {rfp_docs_dir}/tender_analysis/chunks.xlsx")
                else:
                    print("❌ No directory path provided. Please try again.")
                    continue
                break
                
            else:
                print("Invalid choice. Please enter 1, 2, 3 or 4.")

if __name__ == "__main__":
    main()
