"""
Document Processing Service

This service processes documents (PDFs, TXTs) stored in S3:
- Extracts text from documents
- Merges text into a single file
- Chunks text using LLMs for better semantic organization
- Generates summaries using LLMs
- Saves results back to S3
- Provides an API endpoint for processing requests
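
Configuration comes from environment variables loaded with python-dotenv.
A minimal .env sketch (variable names taken from the getenv calls below;
the values here are placeholders, not real credentials):

    ANTHROPIC_API_KEY=<anthropic key>
    ANTHROPIC_MODEL=claude-3-5-haiku-latest
    OPENAI_API_KEY=<key for the OpenAI-compatible endpoint>
    OPENAI_MODEL=Qwen/Qwen3-Next-80B-A3B-Instruct
    OPENAI_URL=https://api.deepinfra.com/v1/openai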
    N)datetime)load_dotenv)sent_tokenizeword_tokenize)convert)extract_RFP_text_compatiblepunkt	punkt_tab'AIzaSyCzr6L3E8yywy8Ls2errRBOPx740VcjV1gANTHROPIC_API_KEYANTHROPIC_MODELzclaude-3-5-haiku-latestOPENAI_API_KEYOPENAI_MODELz Qwen/Qwen3-Next-80B-A3B-Instruct
OPENAI_URLz#https://api.deepinfra.com/v1/openai)OpenAI)api_keybase_urlopen_llm)r   c                 C   s   t t| S )z
    Count words in text.
    
    Args:
        text (str): Text to count words in
        
    Returns:
        int: Number of words in text
    """
    return len(word_tokenize(text))


def create_primary_chunks(text, chunk_size=800, overlap=100):
    """
    Split text into overlapping chunks for LLM processing.
    
    Args:
        text (str): Text to split into chunks
        chunk_size (int): Target size of each chunk in words
        overlap (int): Number of words to overlap between chunks
        
    Returns:
        list: List of text chunks
    """
    chunks = []
    sentences = sent_tokenize(text)
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        sentence_word_count = count_words(sentence)
        if current_word_count + sentence_word_count > chunk_size:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            # Carry trailing sentences into the next chunk so chunks overlap.
            overlap_sentences = current_chunk[-2:] if len(current_chunk) > 2 else current_chunk
            current_chunk = overlap_sentences + [sentence]
            current_word_count = count_words(' '.join(current_chunk))
        else:
            current_chunk.append(sentence)
            current_word_count += sentence_word_count

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
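
# Usage sketch (illustrative; in this service the input is the merged RFP
# text read by process_text_file):
#
#     for i, chunk in enumerate(create_primary_chunks(text)):
#         print(i, count_words(chunk))
#
# The overlap is sentence-based: up to the last two sentences of a finished
# chunk are repeated at the start of the next one, so the LLM pass keeps
# local context across chunk boundaries.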


def process_chunk_with_llm(chunk):
    """
    Send chunk to LLM for processing into smaller, semantically coherent chunks.
    
    Args:
        chunk (str): Text chunk to process
        
    Returns:
        list: List of processed sub-chunks
    userzYou have to work as a Data Engineer who converts unstructured data into structured format for making it usable for ML Training.)rolecontent	assistantz[Sure, I will help you in text data retructuring. Please let me know the exact requirements.a  Great! So you have to analyze a text segment and break it into smaller chunks following these rules:              1. Each chunk must be more than 400 words;              2. Each chunk must maintain complete semantic meaning;              3. Never break in the middle of a sentence or logical thought;              4. If the input contains any headers, titles or section names, headings or subheadings:               - Identify such contextual content               - Prepend these relevant headers or titles or section name to each chunk to maintain hierarchical context;              5. Ensure proper handling of:               - Lists (keep items together when contextually related)               - Tables (keep with their captions and context)               - Quotes (preserve complete quotes in single chunks when possible)               - Citations (keep with their referenced text);              6. Please delimt different chunks with this delimiter: '============break============'.             Only create the chunks of the text and use the original text. DO NOT make up any text or content on your own.                           And please do not add any explanations in the output.             
Here is the text to process:
claudei  )model
max_tokensmessagesr   z============break============r   )r/   r1   c                 S   s   g | ]
}|  r|  qS r   )strip.0chunkr   r   r   
<listcomp>   s    z*process_chunk_with_llm.<locals>.<listcomp>z!Error processing chunk with LLM: N)	llm_modelclaude_clientr1   creater,   r   splitopenaichatcompletionsr   choicesmessage	Exceptionprintsysstdoutflush)r5   promptr?   
sub_chunkserrorr   r   r   process_chunk_with_llm|   s6   
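
# The LLM is asked to separate sub-chunks with a fixed delimiter, so the
# response parses with a plain split. A hypothetical response and its parsed
# result (illustrative only):
#
#     response = "Intro text...\n============break============\nNext part..."
#     parts = [p.strip() for p in response.split('============break============') if p.strip()]
#     # parts == ['Intro text...', 'Next part...']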



def extract_RFP_text(input_dir, output_dir):
    """
    Process all PDF, Excel, and Doc files from a directory and create a merged text file.
    
    Args:
        input_dir (str): Directory path for input files
        output_dir (str): Directory path for output files
        
    Returns:
        tuple: (success, message) where success is a bool and message is a
            status or error string from the underlying extractor
    zError: )r   r   rA   )	input_dir
output_dirsuccessr?   r   r   r   extract_RFP_text   s   rL   c                 C   s  t j| d}zt|ddd}| }W d   n1 sw   Y  W n ty5   td|  Y dS w |sCtd tj	  dS td	 tj	  t
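
# Contract note: on success, extract_RFP_text_compatible is expected to have
# written a merged.txt into output_dir; process_text_file below reads that
# file (inferred from its os.path.join(output_dir, 'merged.txt') call).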
|}g }t|D ]&\}}td
|d  dt| d tj	  t|}|| td qVdd |D }dd t|D }	t j| d}
t|
ddd}tj|	|ddd W d   n1 sw   Y  td|
  tj	  dS )z
    Process the merged text file into chunks using LLM.
    
    Args:
        output_dir (str): Directory path within bucket for output files
        
    Returns:
        bool: True if successful, False otherwise
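
    Output format (written to <output_dir>/chunks.json; field layout taken
    from the json_data construction below, values illustrative):
        [
            {
                "tagName": "tag1",
                "question": ["<sub-chunk text>"],
                "answer": "<sub-chunk text>",
                "question_neg": []
            },
            ...
        ]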
    """
    file_path = os.path.join(output_dir, 'merged.txt')
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return False

    if not text:
        print("Failed to read merged text file from S3")
        sys.stdout.flush()
        return False

    print("Creating primary chunks...")
    sys.stdout.flush()
    primary_chunks = create_primary_chunks(text)

    all_processed_chunks = []
    for i, chunk in enumerate(primary_chunks):
        print(f"Processing chunk {i + 1} of {len(primary_chunks)}...")
        sys.stdout.flush()
        processed_chunks = process_chunk_with_llm(chunk)
        all_processed_chunks.extend(processed_chunks)
        time.sleep(1)  # throttle between LLM calls; exact delay is unreadable in the dump

    all_processed_chunks = [chunk for chunk in all_processed_chunks if chunk]
    json_data = [
        {
            "tagName": f"tag{i + 1}",
            "question": [chunk_for_json],
            "answer": chunk_for_json,
            "question_neg": [],
        }
        for i, chunk_for_json in enumerate(all_processed_chunks)
    ]

    output_json_file = os.path.join(output_dir, 'chunks.json')
    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=4, ensure_ascii=False)
    print(f"Successfully saved BID Chunks Json to: {output_json_file}")
    sys.stdout.flush()
    return True


def process_task(bid_dir):
    """
    Process a document processing task from the queue.
    
    Args:
        bid_dir (str): Directory containing the bid's input documents

    Returns:
        bool: True if extraction and chunking both succeeded, False otherwise
    """
    output_dir = os.path.join(bid_dir, 'tender_analysis')
    os.makedirs(output_dir, exist_ok=True)

    success, err_msg = extract_RFP_text(bid_dir, output_dir)
    if success:
        chunk_success = process_text_file(output_dir)
        if chunk_success:
            return True
        return False
    print(err_msg)
    return False