o
    if                     @   sj  d Z ddlZddlZddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
mZmZ ddlZddlZddlZddlmZ ddlZddlmZ ddlmZ ddlmZ ddlZddlZejejdd	 ee Z!zej"d
dd ej"ddd W n   Y dZ#G dd dZ$de%de$fddZ&de%de%de%de
e'e%f fddZ(dde%de%de%de	e%e%f fddZ)dS )z
Unified Document Text Extraction Module

This module provides comprehensive text extraction capabilities for tender RFP documents.
It handles PDF, Excel, Word documents and scanned images, returning both individual 
document texts and merged content.
    N)datetime)DictTupleListOptional)docx2python)word_tokenize)content_typesz4%(asctime)s - %(name)s - %(levelname)s - %(message)s)levelformatpunktT)quiet	punkt_tabclaudec                
   @   s  e Zd ZdZd,defddZdd Zdd	 Zd
d Zdede	fddZ
defddZd-dede	defddZd.dededefddZdededefddZd,d ed!ee deeeef eef fd"d#Zd$eeef d%edefd&d'Zd$eeef de	fd(d)Zd*d+ ZdS )/DocumentExtractorz
    Unified document text extraction class that handles multiple file types
    and provides both individual and merged text outputs.
    Nllm_api_keyc                 C   sT   || _ || _tdkr|rtj|dnd| _ntdkr%|r"tj|dnd| _i | _	dS )z
        Initialize the document extractor.

        Args:
            llm_api_key (str): API key for Gemini (used for scanned documents)
            process_logger: Optional process logger for detailed logging
        gemini)api_keyNr   )
r   process_logger	llm_modelgenaiClientgemini_client	anthropic	Anthropicclaude_client_extraction_cache)selfr   r    r   =/var/www/html/minaions-tender/ai-engine/document_extractor.py__init__0   s   
zDocumentExtractor.__init__c                 C   $   t | | jr| j| dS dS )z9Log info message to both module logger and process loggerN)loggerinfor   r   msgr   r   r   	_log_info@      
zDocumentExtractor._log_infoc                 C   r!   )z<Log warning message to both module logger and process loggerN)r"   warningr   r$   r   r   r   _log_warningF   r'   zDocumentExtractor._log_warningc                 C   r!   )z:Log error message to both module logger and process loggerN)r"   errorr   r$   r   r   r   
_log_errorL   r'   zDocumentExtractor._log_errortextreturnc                 C   s&   zt t|W S    t |  Y S )zCount words in text.)lenr   split)r   r,   r   r   r   count_wordsR   s   zDocumentExtractor.count_wordsc                 C   s   |  }| pt| dk S )z
        Determine if a PDF page is a scanned image using PyPDF2
        
        Args:
            page: PyPDF2 page object
            
        Returns:
            bool: True if the page appears to be scanned, False otherwise
           )extract_textr.   strip)r   pager,   r   r   r   is_scanned_pageY   s   
z!DocumentExtractor.is_scanned_page   pdf_path
batch_sizec                 C   s  i }g }zvt |de}t|}t|j}| dtj| d| d t	|D ].}|j| }	| 
|	rH|| | d|d  d q+|	 }
d|d  d	|
 d
||< q+| dt| d |rjt	dt||D ]}||||  }|s}qp| dt| ddd |D   t }|D ]
}||j|  qt }|| |d ddd |D }dt| d}tdkr| }| jjjdtjj|dd|gd}|j}n1tdkrt|  d}| j!j"j#dddd d!d"|d#d$d%d|d&d'gd(gd)}|j$d j}|%d*}|r%t|t|kr%|dd+ }t&|D ]:\}}|t|k rX|| ' }|rLd|d  d	| d
||< q)d|d  d,||< q)d|d  d-||< q)t()d qpW d+   n	1 suw   Y  W n' t*y } z| d.| d/t+|  d0t+| W  Y d+}~S d+}~ww d1}t	|D ]}||v r||| 7 }q| d2| d3t| d4 |S )5a  
        Extract text content from PDF files, using Gemini for scanned pages in batches
        and PyPDF2 for pages with selectable text
        
        Args:
            pdf_path (str): Path to the PDF file
            batch_size (int): Maximum number of scanned pages to send to Gemini in one request
            
        Returns:
            str: Extracted text content
        rbzProcessing PDF z with z pageszPage    z appears to be a scanned page.z--- Doc Page Number: z ---


Found z scanned pages in the documentr   zProcessing batch of z scanned pages: c                 S   s   g | ]}|d  qS r:   r   .0pr   r   r   
<listcomp>   s    z;DocumentExtractor.extract_text_from_pdf.<locals>.<listcomp>z, c                 S   s   g | ]}t |d  qS r=   strr>   r   r   r   rA      s    z/
                        This message contains a>   scanned pages from a PDF document.
                        
                        For EACH PAGE, extract ALL text content visible in it. Format your response with clear page number headers like:
                        
                        "=== NEW PAGE ==="
                        [extracted text for page 1]
                        
                        "=== NEW PAGE ==="
                        [extracted text for page 2]
                        
                        And so on for each page. Format the text naturally, preserving paragraphs, bullet points, and other structural elements.
                        Please just give out the extracted text as output without any additional commentary.

                        NOTE that the text in the scanned pages can be in any language.
                        r   zgemini-2.5-flash-preview-04-17zapplication/pdf)data	mime_type)modelcontentsr   utf-8zclaude-3-5-haiku-latesti  g?zYou are an expert at extracting text of any language from scanned documents. Extract ALL text visible on EACH page, preserving the original formatting as much as possible.userr,   )typer,   documentbase64)rJ   
media_typerD   )rJ   source)rolecontent)rF   
max_tokenstemperaturesystemmessagesz=== NEW PAGE ===Nz" ---
[No text content extracted]

z' ---
[Could not extract page content]

zError extracting text from PDF : zError processing PDF:  zProcessing complete. Processed z pages, including z scanned pages.),openPyPDF2	PdfReaderr.   pagesr&   ospathbasenameranger5   appendr2   	PdfWriteradd_pageioBytesIOwriteseekjoinr   getvaluer   modelsgenerate_contenttypesPart
from_bytesr,   rL   	b64encodereaddecoder   rT   createrP   r/   	enumerater3   timesleep	ExceptionrC   )r   r7   r8   extracted_textscanned_pagesfile
pdf_readertotal_pagespage_numr4   r,   ibatchwriteroutput_buffer	page_listpromptbatch_pdf_bytesresponsellm_text_out
pdf_base64scanned_pages_textjpage_contentecombined_textr   r   r   extract_text_from_pdfg   s   

 


$



  
z'DocumentExtractor.extract_text_from_pdf  ||  excel_filepath	separatorc           
   
      s   zOt j|dd  d  t  fdd jD }|dd t j|D }g }  D ]\}}|dd t||D }|	| q0d	|g| W S  t
yu }	 z| d
| dt|	  dt|	 W  Y d}	~	S d}	~	ww )ai  
        Converts an Excel file to a formatted text file, maintaining table structure.

        Args:
            excel_filepath (str): Path to the Excel file.
            separator (str, optional): Separator character between columns. Defaults to "  ||  ".
            
        Returns:
            str: Formatted text representation of the Excel file
        r   )headerrV   c                    s*   g | ]}t d tdd  | jD qS )2   c                 s   s    | ]	}t t|V  qd S N)r.   rC   )r?   xr   r   r   	<genexpr>  s    zHDocumentExtractor.extract_table_from_excel.<locals>.<listcomp>.<genexpr>)minmaxvalues)r?   coldfr   r   rA     s   * z>DocumentExtractor.extract_table_from_excel.<locals>.<listcomp>c                 s   s"    | ]\}}|d | V  qdS <Nr   )r?   r   widthr   r   r   r     s    
z=DocumentExtractor.extract_table_from_excel.<locals>.<genexpr>c                 s   s&    | ]\}}t |d | V  qdS r   rB   )r?   cellr   r   r   r   r   $  s    

zError extracting from Excel rU   zError processing Excel: N)pd
read_excelfillnaastyperC   columnsrf   zipiterrowsr_   rt   r+   )
r   r   r   
col_widthsheader_line
data_lines_row	data_liner   r   r   r   extract_table_from_excel  s(   




z*DocumentExtractor.extract_table_from_excel	word_pathtemp_dirc              
   C   s   z)t jt j|d }t j|| d}| |}t j|r't | |W S  tyO } z| 	d| dt
|  dt
| W  Y d}~S d}~ww )a#  
        Extract text from Word document by converting to PDF first.
        
        Args:
            word_path (str): Path to the Word document
            temp_dir (str): Directory to store temporary PDF file
            
        Returns:
            str: Extracted text content
        r   .pdfzError extracting from Word rU   z Error processing Word document: N)r[   r\   splitextr]   rf   get_structured_docx_contentexistsremovert   r+   rC   )r   r   r   	base_namepdf_filenamer,   r   r   r   r   extract_text_from_word.  s   

z(DocumentExtractor.extract_text_from_word	input_dir	cache_keyc                 C   s  |r|| j v r| d|  | j | S tj|s#i dd| dfS g }g }g }t|D ])}tj||}|drB|| q.|drM|| q.|drW|| q.|sf|sf|sfi dd| fS | d	t	| d
t	| dt	| d i }g }	g }
t
 }|D ]Y}z5| |}|r|ds|||< |	| | dtj|  n|
| | dtj|  W q ty } z| d| d|  |
| W Y d}~qd}~ww |D ]\}z7| |}|r
|ds
|||< |	| | dtj|  n|
| | dtj|  W q ty? } z| d| d|  |
| W Y d}~qd}~ww |D ]_}z9| ||}|rj|dsj|||< |	| | dtj|  n|
| | dtj|  W qB ty } z| d| d|  |
| W Y d}~qBd}~ww t
 | }|si ddf}ntdd | D }|ddt	|	 d| d| f}|r|| j |< |S )a  
        Extract text from all supported documents in a directory.
        
        Args:
            input_dir (str): Directory containing documents to process
            cache_key (str, optional): Key for caching results
            
        Returns:
            Tuple[Dict[str, str], bool, str]: 
                - Dictionary mapping file paths to extracted text
                - Success status
                - Status message
        z$Using cached extraction results for Fz
Directory z does not exist)z.xlsz.xlsxz.XLSz.XLSX)r   z.PDF)z.docz.docxz.DOCz.DOCXz%No supported document files found in r<   z PDF files, z Excel files, z Word files to processErrorz!Successfully extracted text from zFailed to extract text from zError processing PDF rU   NzError processing Excel zError processing Word z5No text was successfully extracted from any documentsc                 s   s    | ]}t |V  qd S r   )r.   )r?   r,   r   r   r   r     s    z;DocumentExtractor.extract_documents_text.<locals>.<genexpr>Tz documents (z total characters) in )r   r&   r[   r\   r   listdirrf   endswithr_   r.   r   nowr   
startswithr]   r)   rt   r+   r   r   sumr   )r   r   r   	pdf_filesexcel_files
word_filesfilename	file_pathdocuments_textsuccessful_filesfailed_files
start_timer,   r   processing_timeresulttotal_charsr   r   r   extract_documents_textK  s   




*







 
z(DocumentExtractor.extract_documents_textr   output_pathc              
   C   s   z>d | }tjtj|dd t|ddd}|| W d   n1 s*w   Y  | dt	| d	|  W dS  t
yY } z| d
|  W Y d}~dS d}~ww )aC  
        Create a merged text file from individual document texts.
        
        Args:
            documents_text (Dict[str, str]): Dictionary mapping file paths to text content
            output_path (str): Path where to save the merged text file
            
        Returns:
            bool: Success status
        r;   T)exist_okwrH   encodingNzMerged text (z  chars) successfully written to z!Error creating merged text file: F)rf   r   r[   makedirsr\   dirnamerW   rd   r&   r.   rt   r+   )r   r   r   merged_textrw   r   r   r   r   create_merged_text_file  s   z)DocumentExtractor.create_merged_text_filec                 C   sf   d}|  D ]!\}}|sq| |}||7 }| dtj| d|dd q| d|d |S )a  
        Count the total number of words in all document texts.
        
        Args:
            documents_text (Dict[str, str]): Dictionary mapping file paths to their text content
            
        Returns:
            int: Total word count across all documents
        r   z
Document: z - ,z wordsz#
Total words across all documents: )itemsr0   r&   r[   r\   r]   )r   r   total_word_countr   text_content
word_countr   r   r   count_total_words  s   

$z#DocumentExtractor.count_total_wordsc              
   C   sj   zt |}t|jW  d   W S 1 sw   Y  W dS  ty4 } zd| W  Y d}~S d}~ww )z9Returns the content of a .docx file as a structured list.NzError reading file: )r   rC   bodyrt   )r   r   docx_contentr   r   r   r   r     s   
(z-DocumentExtractor.get_structured_docx_contentr   )r6   )r   )__name__
__module____qualname____doc__rC   r    r&   r)   r+   intr0   boolr5   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   *   s      %#.ir   r   r-   c                 C   s   t | S )z
    Factory function to create a DocumentExtractor instance.
    
    Args:
        llm_api_key (str): API key for Gemini
        
    Returns:
        DocumentExtractor: Initialized extractor instance
    )r   )r   r   r   r   create_document_extractor  s   
r   r   
output_dirc                 C   s   t j|d}t j|rdd| dfS t|}|j| | d\}}}|s)d|fS t j|d}t|dd	d
}	tj||	ddd W d   n1 sKw   Y  |	||}
|
rZdS dS )aD  
    Backward compatible version of extract_RFP_text for bid_queries.py
    
    Args:
        input_dir (str): Directory containing input documents
        output_dir (str): Directory for output files
        llm_api_key (str): API key for LLM
        
    Returns:
        Tuple[bool, str]: Success status and message
    
merged.txtT
The file '' already exists.r   Fdoc_text.jsonr   rH   r      ensure_asciiindentN)T%Successfully created merged text file)F!Failed to create merged text file)
r[   r\   rf   r   r   r   rW   jsondumpr   )r   r   r   merged_file_path	extractorr   successmessagedocuments_text_pathfmerge_successr   r   r   extract_RFP_text_compatible  s   r   c                    s   fdd} fdd} fdd}t j|d}t j|rs|d| d	 z)t|d
dd}t|}	W d   n1 s?w   Y  |dt|	 d |	W S  tya   |d|  Y dS  tj	yr   |d|  Y dS w |d|   t
| d}
|d |
j| | d\}	}}|d|  t|ddd}tj|	|ddd W d   n1 sw   Y  |r|d|  |dt|	 d t j|d}|d|  |
|	|}|r|d |	S |d  |	S |d!|  i S )"aH  
    Backward compatible function for tender_automation.py

    Args:
        input_dir (str): Directory containing documents
        llm_api_key (str): API key for LLM
        process_logger: Optional process logger for detailed logging

    Returns:
        Dict[str, str]: Dictionary mapping file paths to extracted text
    c                        t |   r |  d S d S r   )r"   r#   r%   r   r   r   log_info4     
z3extract_documents_text_compatible.<locals>.log_infoc                    r   r   )r"   r*   r   r   r   r   	log_error9  r   z4extract_documents_text_compatible.<locals>.log_errorc                    r   r   )r"   r(   r   r   r   r   log_warning>  r   z6extract_documents_text_compatible.<locals>.log_warningr   r   r   rrH   r   Nz*Loaded existing document text from cache: z
 documentszError: File not found: zError: Invalid JSON format in: z"Starting document extraction from r   z!Extracting text from documents...r   zSaving extracted text to r   Fr   r   z Document extraction successful: zExtracted text from r   zCreating merged text file at r   r   zDocument extraction failed: )r[   r\   rf   r   rW   r   loadr.   FileNotFoundErrorJSONDecodeErrorr   r   r   r   )r   r   r   r   r   r   r   r   rw   r   r   r   r   r   r   r   r   r   r   !extract_documents_text_compatible'  sP   r  r   )*r   r[   rb   sysr   rL   pathlibr   typingr   r   r   r   rr   rX   pandasr   r   nltknltk.tokenizer   google.generativeaigenerativeair   google.generativeai.typesr	   rj   r   loggingbasicConfigINFO	getLoggerr   r"   downloadr   r   rC   r   r   r   r  r   r   r   r   <module>   sD    
   I"()