"""
Document Processing Service

This service processes documents (PDFs, TXTs) stored in S3:
- Extracts text from documents
- Merges text into a single file
- Chunks text using LLMs for better semantic organization
- Generates summaries using LLMs
- Saves results back to S3
- Provides an API endpoint for processing requests
"""

# Standard library imports
import os
import sys
import time
from datetime import datetime

# Third-party imports
import anthropic
# import fitz  # PyMuPDF for extracting text from PDF files
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import pdfkit
from docx2pdf import convert

# ===== Configuration =====

# Download required NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')


# ===== Utility Functions =====

def get_llm_client(llm_type, api_key):
    """
    Returns an initialized LLM client based on the specified type.
    
    Args:
        llm_type (str): Type of LLM to use ("anthropic", "openai", or "deepseek")
        api_key (str): API key for authentication
        
    Returns:
        object: Initialized LLM client
    """
    if not api_key:
        raise ValueError("API key is required for LLM client initialization")
        
    if llm_type == "anthropic":
        return anthropic.Anthropic(api_key=api_key)
    # elif llm_type == "openai":
    #     return OpenAI(api_key=api_key)
    # elif llm_type == "deepseek":
    #     return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
    # else:
    #     raise ValueError(f"Unsupported LLM type: {llm_type}")
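
# Usage sketch (not part of the original service; the environment variable name
# below is an assumption):
#
#   client = get_llm_client("anthropic", os.environ.get("ANTHROPIC_API_KEY"))
#   reply = client.messages.create(
#       model="claude-3-7-sonnet-latest",
#       max_tokens=100,
#       messages=[{"role": "user", "content": "ping"}],
#   )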

# ===== Text Processing Functions =====

def count_words(text):
    """
    Count words in text.
    
    Args:
        text (str): Text to count words in
        
    Returns:
        int: Number of words in text
    """
    return len(word_tokenize(text))

def create_primary_chunks(text, chunk_size=2000, overlap=100):
    """
    Split text into overlapping chunks for LLM processing.
    
    Args:
        text (str): Text to split into chunks
        chunk_size (int): Target size of each chunk in words
        overlap (int): Number of words to overlap between chunks
        
    Returns:
        list: List of text chunks
    """
    chunks = []
    sentences = sent_tokenize(text)
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        sentence_word_count = count_words(sentence)

        # If adding this sentence exceeds chunk size
        if current_word_count + sentence_word_count > chunk_size:
            # Save current chunk
            if current_chunk:
                chunks.append(' '.join(current_chunk))

            # Start new chunk with some overlap
            overlap_sentences = current_chunk[-2:] if len(current_chunk) >= 2 else current_chunk  
            current_chunk = overlap_sentences + [sentence]
            current_word_count = count_words(' '.join(current_chunk))
        else:
            current_chunk.append(sentence)
            current_word_count += sentence_word_count

    # Add the last chunk if not empty
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
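
# Example (sketch): pre-chunking extracted text before LLM refinement; the PDF
# path is hypothetical.
#
#   raw_text = extract_text_from_pdf("tender.pdf")
#   primary_chunks = create_primary_chunks(raw_text, chunk_size=2000)
#   print(f"{len(primary_chunks)} primary chunks, {count_words(raw_text)} words in total")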

def extract_text_from_pdf(pdf_path):
    """
    Extract text content from a single PDF file using PyPDF2.
    
    Args:
        pdf_path (str): Path to the PDF file
        
    Returns:
        str: Extracted text content
    """
    text = ""
    
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text += page.extract_text() + "\n\n"
    
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_path}: {str(e)}")
    
    return text

# def extract_text_from_pdf(pdf_path):
#     """
#     Extract clean text from a single PDF file using PyMuPDF.
    
#     Args:
#         pdf_path (str): Path to the PDF file
        
#     Returns:
#         str or None: Extracted text or None if error occurs
#     """
#     try:
#         # Open PDF document
#         doc = fitz.open(pdf_path)

#         # Get filename for logging
#         filename = os.path.basename(pdf_path)
#         total_pages = len(doc)

#         print(f"Processing {filename} ({total_pages} pages)...")
#         sys.stdout.flush()

#         # Extract text from each page
#         text_chunks = []

#         # Process each page
#         for page_num, page in enumerate(doc):
#             # Extract text with block organization
#             blocks = page.get_text("blocks")

#             # Sort blocks by vertical position then horizontal
#             blocks.sort(key=lambda b: (b[1], b[0]))

#             # Extract and clean text from each block
#             page_text = []
#             for block in blocks:
#                 text = block[4].strip()
#                 if text:
#                     page_text.append(text)

#             # Join blocks with proper spacing
#             if page_text:
#                 text_chunks.append("\n".join(page_text))

#             # Print progress for large documents
#             if total_pages > 10 and (page_num + 1) % 10 == 0:
#                 print(f"Processed {page_num + 1}/{total_pages} pages...")
#                 sys.stdout.flush()

#         # Close the document
#         doc.close()

#         # Join all text chunks with proper spacing
#         final_text = "\n\n".join(text_chunks)

#         # Basic validation of extracted text
#         if not final_text.strip():
#             print(f"Warning: No text was extracted from {filename}")
#             sys.stdout.flush()
#             return None

#         return final_text

#     except Exception as error:
#         print(f"Error processing PDF {pdf_path}: {error}")
#         sys.stdout.flush()
#         return None

# ===== LLM Processing Functions =====

def process_chunk_with_llm(llm_model, llm_type, api_key, chunk):
    """
    Send chunk to LLM for processing into smaller, semantically coherent chunks.
    
    Args:
        llm_model (str): Model identifier for the LLM
        llm_type (str): Type of LLM to use ("anthropic", "openai", or "deepseek")
        api_key (str): API key for authentication
        chunk (str): Text chunk to process
        
    Returns:
        list: List of processed sub-chunks
    """
    try:
        client = get_llm_client(llm_type, api_key)
        
        # Construct the prompt for chunking text
        prompt = [
            {"role": "user", "content": "You have to work as a Data Engineer who converts unstructured data into structured format for making it usable for ML Training."},
            {"role": "assistant", "content": "Sure, I will help you in text data retructuring. Please let me know the exact requirements."},
            {"role": "user", "content": "Great! So you have to analyze a text segment and break it into smaller chunks following these rules: \
             1. Each chunk must be more than 400 words; \
             2. Each chunk must maintain complete semantic meaning; \
             3. Never break in the middle of a sentence or logical thought; \
             4. If the input contains any headers, titles or section names, headings or subheadings:\
               - Identify such contextual content\
               - Prepend these relevant headers or titles or section name to each chunk to maintain hierarchical context; \
             5. Ensure proper handling of:\
               - Lists (keep items together when contextually related)\
               - Tables (keep with their captions and context)\
               - Quotes (preserve complete quotes in single chunks when possible)\
               - Citations (keep with their referenced text); \
             6. Please delimit different chunks with this delimiter: '============break============'.\
             Only create the chunks of the text and use the original text. DO NOT make up any text or content on your own. \
             \
             And please do not add any explanations in the output.\
             \nHere is the text to process:\n" + chunk},
        ]

        # Process with the appropriate LLM
        if llm_type == "anthropic":
            message = client.messages.create(
                model=llm_model,
                max_tokens=4000,
                messages=prompt
            )
            sub_chunks = message.content[0].text.split('============break============') 
        else:  # OpenAI or Deepseek
            response = client.chat.completions.create(
                model=llm_model,
                max_tokens=4096,
                messages=prompt,
                stream=False
            )
            sub_chunks = response.choices[0].message.content.split('============break============')
            
        # Clean up the chunks
        return [chunk.strip() for chunk in sub_chunks if chunk.strip()]
            
    except Exception as error:
        print(f"Error processing chunk with LLM: {error}")
        sys.stdout.flush()
        return []
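
# Usage sketch (assumption: an Anthropic key is available in the environment and
# `primary_chunks` was produced by create_primary_chunks above):
#
#   sub_chunks = process_chunk_with_llm(
#       "claude-3-7-sonnet-latest", "anthropic",
#       os.environ.get("ANTHROPIC_API_KEY"), primary_chunks[0])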

# ===== Document Processing Functions =====

def process_pdf(input_dir, output_dir):
    """
    Process all PDF and TXT files from an S3 bucket and create a merged text file.
    
    Args:
        bucket_name (str): S3 bucket containing the files
        endpoint_url (str): S3 endpoint URL
        input_dir (str): Directory path within bucket for input files
        output_dir (str): Directory path within bucket for output files
        
    Returns:
        bool: True if successful, False otherwise
    """
    excel_files = []
    pdf_files = []
    word_files = []

    for filename in os.listdir(input_dir):
        if filename.endswith(('.xls', '.xlsx')):
            file_path = os.path.join(input_dir, filename)
            excel_files.append(file_path)
        elif filename.endswith(('.pdf')):
            file_path = os.path.join(input_dir, filename)
            pdf_files.append(file_path)
        elif filename.endswith(('.doc', '.docx')):
            file_path = os.path.join(input_dir, filename)
            word_files.append(file_path)
    
    if not pdf_files and not word_files and not excel_files:
        print(f"No tender document files found in {input_dir}")
        sys.stdout.flush()
        return False, "No tender documents found"

    print(f"Found {len(pdf_files)} PDF files to process")
    sys.stdout.flush()
    print(f"Found {len(excel_files)} EXCEL files to process")
    sys.stdout.flush()
    print(f"Found {len(word_files)} DOC files to process")
    sys.stdout.flush()



    all_texts = []
    successful_files = []
    failed_files = []

    start_time = datetime.now()

    # Process PDFs
    for file in pdf_files:
        try:                
            text = extract_text_from_pdf(file)
            if text:
                all_texts.append(text)
                successful_files.append(file)
                print(f"Successfully extracted text from {file}")
                sys.stdout.flush()
            else:
                failed_files.append(file)
                print(f"Failed to extract text from {file}")
                sys.stdout.flush()
        except Exception as e:
            print(f"Error processing PDF {file}: {e}")
            sys.stdout.flush()
            failed_files.append(file)

    # Process Excel files (converted to PDF via HTML before text extraction)
    for file in excel_files:
        try:
            base_name = os.path.splitext(os.path.basename(file))[0]
            html_filename = os.path.join(input_dir, f"{base_name}.html")
            pdf_filename = os.path.join(input_dir, f"{base_name}.pdf")

            print(f"base_name: {base_name}, html_filename: {html_filename} and pdf_filename: {pdf_filename}")

            df = pd.read_excel(file)
            print(df)
            df.to_html(html_filename)
            pdfkit.from_file(html_filename, pdf_filename)
            
            text = extract_text_from_pdf(pdf_filename)
            os.remove(html_filename)
            os.remove(pdf_filename)
            if text:
                all_texts.append(text)
                successful_files.append(file)
                print(f"Successfully extracted text from {file}")
                sys.stdout.flush()
            else:
                failed_files.append(file)
                print(f"Failed to extract text from {file}")
                sys.stdout.flush()
        except Exception as error:
            print(f"Error processing file {file}: {error}")
            sys.stdout.flush()
            failed_files.append(file)

    # Process Word files
    for file in word_files:
        try:
            base_name = os.path.splitext(os.path.basename(file))[0]
            pdf_filename = os.path.join(input_dir, f"{base_name}.pdf")

            convert(file, pdf_filename)
            
            text = extract_text_from_pdf(pdf_filename)
            os.remove(pdf_filename)
            if text:
                all_texts.append(text)
                successful_files.append(file)
                print(f"Successfully extracted text from {file}")
                sys.stdout.flush()
            else:
                failed_files.append(file)
                print(f"Failed to extract text from {file}")
                sys.stdout.flush()
        except Exception as error:
            print(f"Error processing file {file}: {error}")
            sys.stdout.flush()
            failed_files.append(file)

    if not all_texts:
        print("No text was successfully extracted from RFP files")
        sys.stdout.flush()
        return False, "No text was extracted from RFP files"

    processing_time = datetime.now() - start_time

    # Write the merged text file to the output directory
    output_path = f"{output_dir.rstrip('/')}/merged.txt"
    merged_text = "\n\n".join(all_texts)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(merged_text)

    print(f"Merged text ({len(merged_text)} chars) from the RFP files in {input_dir} "
          f"saved to {output_path} in {processing_time}.")
    sys.stdout.flush()
    
    return True, "Successfully created merged text file"

def process_text_file(output_dir, api_key, llm_type, llm_model):
    """
    Process the merged text file into chunks using LLM.
    
    Args:
        output_dir (str): Directory path within bucket for output files
        api_key (str): API key for LLM
        llm_type (str): Type of LLM to use
        llm_model (str): Model identifier for the LLM
        
    Returns:
        bool: True if successful, False otherwise
    """
    # Read input file
    file_path = f"{output_dir}/merged.txt"
    try:
        with open(file_path, 'r') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return False

    if not text:
        print(f"Failed to read merged text file from S3")
        sys.stdout.flush()
        return False

    print("Creating primary chunks...")
    sys.stdout.flush()
    primary_chunks = create_primary_chunks(text)

    # Process each chunk
    all_processed_chunks = []
    for i, chunk in enumerate(primary_chunks):
        print(f"\nProcessing chunk {i+1} of {len(primary_chunks)}...")
        sys.stdout.flush()
        
        # Process with LLM
        processed_chunks = process_chunk_with_llm(llm_model, llm_type, api_key, chunk)
        
        # Add processed chunks to collection
        all_processed_chunks.extend(processed_chunks)

        # Add small delay between API calls
        time.sleep(1)
    
    # Filter out any empty chunks
    all_processed_chunks = [chunk for chunk in all_processed_chunks if chunk]


    # Create DataFrame with required structure
    df = pd.DataFrame({
        'Tag': [f'tag{i}' for i in range(len(all_processed_chunks))],
        'question': all_processed_chunks,
        'answer': all_processed_chunks  # Duplicate the text for both columns
    })

    output_excel_file = f"{output_dir.rstrip('/')}/chunks.xlsx"

    with pd.ExcelWriter(output_excel_file) as writer:
        df.to_excel(writer, sheet_name='input', index=False)

    print(f"Successfully saved FAQ Excel to: {output_dir}/chunks.xlsx")
    sys.stdout.flush()
    return True
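
# Quick check sketch (hypothetical): inspect the generated chunks after processing.
#
#   chunks_df = pd.read_excel(f"{output_dir}/chunks.xlsx", sheet_name='input')
#   print(chunks_df.head())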


def process_task(bid_dir):
    """
    Process a document processing task from the queue.
    
    Args:
        task_data (dict): Task data including client_id, request_id, and configuration
    """
    #Create output Directory if it doesnt exist
    output_dir = os.path.join(bid_dir, "tender_analysis")
    os.makedirs(output_dir, exist_ok=True)

    # Process documents
    success, err_msg = process_pdf(bid_dir, output_dir)

    # LLM configuration; the API key is read from the environment instead of being
    # hardcoded in source.
    genai_key = os.environ.get("ANTHROPIC_API_KEY", "")
    genai_engine = "anthropic"
    genai_version = "claude-3-7-sonnet-latest"

    if success:
        return process_text_file(output_dir, genai_key, genai_engine, genai_version)
    else:
        print(err_msg)
        return False
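
# Minimal command-line entry point (a sketch, not part of the original service;
# the positional argument name is an assumption about how bid directories are passed in).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract, merge, and chunk tender documents in a bid directory")
    parser.add_argument("bid_dir", help="Directory containing PDF/Excel/Word tender documents")
    args = parser.parse_args()

    ok = process_task(args.bid_dir)
    sys.exit(0 if ok else 1)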