o
    [2iX                     @   s8  d Z ddlZddlZddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
mZmZ ddlZddlZddlZddlmZ ddlZddlmZ ddlmZ ddlmZ ddlZddlZddlmZ ddl Z ej!ej"d	d
 e#e$Z%zej&ddd ej&ddd W n   Y dZ'G dd dZ(de)de(fddZ*dS )z
Enhanced Document Text Extraction Module

Extends the original document_extractor.py to handle additional file types
including CSV, images, and text files, with recursive folder processing.
    N)datetime)DictTupleListOptional)convert)word_tokenize)content_types)Imagez4%(asctime)s - %(name)s - %(levelname)s - %(message)s)levelformatpunktT)quiet	punkt_tabclaudec                
   @   s(  e Zd ZdZdefddZdedefddZdefd	d
Z	d.dededefddZ
d/dededefddZdedefddZdedefddZdedefddZdededefd d!Zd"edefd#d$Zd0d"ededefd&d'Zd(edee fd)d*Zd0d(ed+ee deeeef eef fd,d-Zd%S )1EnhancedDocumentExtractorz
    Enhanced document text extraction class that handles multiple file types
    and provides recursive folder processing capabilities.
    llm_api_keyc                 C   st   || _ tdkr|rtj|dnd| _ntdkr"|rtj|dnd| _i | _dgddgdd	gd
gdgg dd| _	dS )z
        Initialize the enhanced document extractor.
        
        Args:
            llm_api_key (str): API key for Claude/Gemini (used for scanned documents and images)
        gemini)api_keyNr   .pdfz.xlsz.xlsxz.docz.docxz.csvz.txtz.jpgz.jpegz.pngz.tiffz.tifz.bmp)pdfexcelwordcsvtextimage)
r   	llm_modelgenaiClientgemini_client	anthropic	Anthropicclaude_client_extraction_cachesupported_extensions)selfr    r'   F/var/www/html/minaions-tender/ai-engine/enhanced_document_extractor.py__init__0   s   z"EnhancedDocumentExtractor.__init__r   returnc                 C   s&   zt t|W S    t |  Y S )zCount words in text.)lenr   split)r&   r   r'   r'   r(   count_wordsH   s   z%EnhancedDocumentExtractor.count_wordsc                 C   s   |  }| pt| dk S )z
        Determine if a PDF page is a scanned image using PyPDF2
        
        Args:
            page: PyPDF2 page object
            
        Returns:
            bool: True if the page appears to be scanned, False otherwise
           )extract_textr+   strip)r&   pager   r'   r'   r(   is_scanned_pageO   s   
z)EnhancedDocumentExtractor.is_scanned_page   pdf_path
batch_sizec                 C   s6  i }g }zIt |d8}t|}t|j}tdtj	| d| d t
|D ].}|j| }	| |	rH|| td|d  d q+|	 }
d|d  d	|
 d
||< q+tdt| d |r=| jr=t
dt||D ]}||||  }|sqttdt| ddd |D   t }|D ]
}||j|  qt }|| |d dt| d}t| d}| jjjdddddd|dddd|dd gd!gd"}|jd j}|d#}|rt|t|kr|dd$ }t|D ]7\}}|t|k r,||  }|r!d|d  d	| d
||< qd|d  d%||< qd|d  d&||< qt !d qtW d$   n	1 sHw   Y  W n' t"yu } ztd'| d(t#|  d)t#| W  Y d$}~S d$}~ww d*}t
|D ]}||v r||| 7 }q|td+| d,t| d- |S ).a  
        Extract text content from PDF files, using Claude for scanned pages in batches
        and PyPDF2 for pages with selectable text
        
        Args:
            pdf_path (str): Path to the PDF file
            batch_size (int): Maximum number of scanned pages to send to Claude in one request
            
        Returns:
            str: Extracted text content
        rbzProcessing PDF z with z pageszPage    z appears to be a scanned page.z--- Doc Page Number:  ---


Found z scanned pages in the documentr   zProcessing batch of z scanned pages: c                 S   s   g | ]}|d  qS )r7   r'   ).0pr'   r'   r(   
<listcomp>   s    zCEnhancedDocumentExtractor.extract_text_from_pdf.<locals>.<listcomp>z/
                        This message contains a>   scanned pages from a PDF document.
                        
                        For EACH PAGE, extract ALL text content visible in it. Format your response with clear page number headers like:
                        
                        "=== NEW PAGE ==="
                        [extracted text for page 1]
                        
                        "=== NEW PAGE ==="
                        [extracted text for page 2]
                        
                        And so on for each page. Format the text naturally, preserving paragraphs, bullet points, and other structural elements.
                        Please just give out the extracted text as output without any additional commentary.

                        NOTE that the text in the scanned pages can be in any language.
                        utf-8claude-haiku-4-5  皙?zYou are an expert at extracting text of any language from scanned documents. Extract ALL text visible on EACH page, preserving the original formatting as much as possible.userr   typer   documentbase64zapplication/pdfrD   
media_typedatarD   sourcerolecontentmodel
max_tokenstemperaturesystemmessagesz=== NEW PAGE ===Nz" ---
[No text content extracted]

z' ---
[Could not extract page content]

zError extracting text from PDF : zError processing PDF:  zProcessing complete. Processed z pages, including z scanned pages.)$openPyPDF2	PdfReaderr+   pagesloggerinfoospathbasenameranger2   appendr/   r#   	PdfWriteradd_pageioBytesIOwriteseekrF   	b64encodereaddecoderT   createrN   r   r,   	enumerater0   timesleep	Exceptionstr)r&   r4   r5   extracted_textscanned_pagesfile
pdf_readertotal_pagespage_numr1   r   ibatchwriteroutput_bufferprompt
pdf_base64responsellm_text_outscanned_pages_textjpage_contentecombined_textr'   r'   r(   extract_text_from_pdf\   s   

 


$


o
z/EnhancedDocumentExtractor.extract_text_from_pdf  ||  excel_filepath	separatorc              
      s   zit j|ddd}d}| D ]W\} |d| d7 } d  t  fdd jD }|d	d
 t j|D }g } 	 D ]\}	}
|dd
 t|
|D }|
| qC|d|g| d 7 }q|W S  ty } ztd| dt|  dt| W  Y d}~S d}~ww )ai  
        Converts an Excel file to a formatted text file, maintaining table structure.

        Args:
            excel_filepath (str): Path to the Excel file.
            separator (str, optional): Separator character between columns. Defaults to "  ||  ".
            
        Returns:
            str: Formatted text representation of the Excel file
        Nr   )
sheet_nameheaderrV   z
--- Sheet: r8   c                    s*   g | ]}t d tdd  | jD qS )2   c                 s   s    | ]	}t t|V  qd S N)r+   rp   )r;   xr'   r'   r(   	<genexpr>   s    zOEnhancedDocumentExtractor.extract_text_from_excel.<locals>.<listcomp>.<genexpr>)minmaxvalues)r;   coldfr'   r(   r=      s   * zEEnhancedDocumentExtractor.extract_text_from_excel.<locals>.<listcomp>c                 s   s"    | ]\}}|d | V  qdS <Nr'   )r;   r   widthr'   r'   r(   r     s    
zDEnhancedDocumentExtractor.extract_text_from_excel.<locals>.<genexpr>c                 s   s&    | ]\}}t |d | V  qdS r   )rp   )r;   cellr   r'   r'   r(   r     s    

r9   zError extracting from Excel rU   zError processing Excel: )pd
read_excelitemsfillnaastyperp   columnsjoinzipiterrowsra   ro   r[   r\   )r&   r   r   
all_sheetsr   r   
col_widthsheader_line
data_lines_row	data_liner   r'   r   r(   extract_text_from_excel   s0   




z1EnhancedDocumentExtractor.extract_text_from_excelcsv_filepathc              
   C   sz   zt |}|d}|t}|jddW S  ty< } ztd| dt|  dt| W  Y d}~S d}~ww )z
        Extract text from CSV file.
        
        Args:
            csv_filepath (str): Path to the CSV file
            
        Returns:
            str: Formatted text representation of the CSV file
        rV   F)indexzError extracting from CSV rU   zError processing CSV: N)	r   read_csvr   r   rp   	to_stringro   r[   r\   )r&   r   r   r   r'   r'   r(   extract_text_from_csv  s   



z/EnhancedDocumentExtractor.extract_text_from_csvtxt_filepathc                 C   s.  zt |ddd}| W  d   W S 1 sw   Y  W dS  tyq   z"t |ddd}| W  d   W  Y S 1 s@w   Y  W Y dS  typ } ztd| dt|  dt| W  Y d}~ Y S d}~ww  ty } ztd	| dt|  dt| W  Y d}~S d}~ww )
z
        Extract text from text file.
        
        Args:
            txt_filepath (str): Path to the text file
            
        Returns:
            str: Text content of the file
        rr>   )encodingNzlatin-1zError reading text file rU   zError processing text file: z Error extracting from text file )rW   ri   UnicodeDecodeErrorro   r[   r\   rp   )r&   r   rs   r   r'   r'   r(   extract_text_from_txt(  s&   
(.z/EnhancedDocumentExtractor.extract_text_from_txt
image_pathc           	      C   s  za| j sW dS t|d}t| d}W d   n1 s!w   Y  tj|d 	 }dddddd	d
}|
|d}| j jjdddddddddd||ddgdgd}|jd jW S  ty } ztd| dt|  dt| W  Y d}~S d}~ww )z
        Extract text from image file using Claude.
        
        Args:
            image_path (str): Path to the image file
            
        Returns:
            str: Extracted text content
        z6Error: No Claude API key provided for image processingr6   r>   Nr7   z
image/jpegz	image/pngz
image/tiffz	image/bmpr   r?   r@   rA   zYou are an expert at extracting text from images. Extract ALL text visible in the image, preserving the original formatting as much as possible.rB   r   zExtract all text content from this image. Preserve formatting, structure, and layout as much as possible. If this appears to be a document, maintain the document structure.rC   r   rF   rG   rJ   rL   rO   r   z!Error extracting text from image rU   zError processing image: )r#   rW   rF   rh   ri   rj   r]   r^   splitextlowergetrT   rk   rN   r   ro   r[   r\   rp   )	r&   r   
image_file
image_data	image_extmime_type_map	mime_typer}   r   r'   r'   r(   extract_text_from_image@  sP   
z1EnhancedDocumentExtractor.extract_text_from_image	word_pathtemp_dirc              
   C   s   z.t jt j|d }t j|| d}t|| | |}t j|r,t | |W S  t	yT } zt
d| dt|  dt| W  Y d}~S d}~ww )a#  
        Extract text from Word document by converting to PDF first.
        
        Args:
            word_path (str): Path to the Word document
            temp_dir (str): Directory to store temporary PDF file
            
        Returns:
            str: Extracted text content
        r   r   zError extracting from Word rU   z Error processing Word document: N)r]   r^   r   r_   r   r   r   existsremovero   r[   r\   rp   )r&   r   r   	base_namepdf_filenamer   r   r'   r'   r(   extract_text_from_word~  s   


z0EnhancedDocumentExtractor.extract_text_from_word	file_pathc                 C   s<   t j|d  }| j D ]\}}||v r|  S qdS )z
        Determine the file type based on extension.
        
        Args:
            file_path (str): Path to the file
            
        Returns:
            str: File type category
        r7   unsupported)r]   r^   r   r   r%   r   )r&   r   ext	file_type
extensionsr'   r'   r(   get_file_type  s   
z'EnhancedDocumentExtractor.get_file_typeNc              
   C   s   |  |}|du rtj|}zC|dkr| |W S |dkr$| |W S |dkr/| ||W S |dkr9| |W S |dkrC| |W S |dkrM| 	|W S d| W S  t
yx } ztd	| d
t|  dt| W  Y d}~S d}~ww )a  
        Extract text from a single file based on its type.
        
        Args:
            file_path (str): Path to the file
            temp_dir (str): Temporary directory for intermediate files
            
        Returns:
            str: Extracted text content
        Nr   r   r   r   r   r   zUnsupported file type: zError extracting text from rU   zError processing file: )r   r]   r^   dirnamer   r   r   r   r   r   ro   r[   r\   rp   )r&   r   r   r   r   r'   r'   r(   extract_text_from_file  s,   
z0EnhancedDocumentExtractor.extract_text_from_fileroot_dirc           	      C   sR   g }t |D ]\}}}|D ]}t j||}| |}|dkr%|| qq|S )z
        Recursively find all supported document files in a directory tree.
        
        Args:
            root_dir (str): Root directory to search
            
        Returns:
            List[str]: List of file paths
        r   )r]   walkr^   r   r   ra   )	r&   r   found_filesrootdirsfilesrs   r   r   r'   r'   r(   find_documents_recursively  s   


z4EnhancedDocumentExtractor.find_documents_recursively	cache_keyc                 C   s  |r|| j v rtd|  | j | S tj|s#i dd| dfS | |}|s2i dd| fS tdt| d i }g }g }t	 }|D ]l}zHtdtj
| d	 | j|tj|d
}	|	r|	ds|	||< || tdtj
|  n|| tdtj
|  W qI ty }
 ztd| d|
  || W Y d}
~
qId}
~
ww t	 | }|si ddf}ntdd | D }|ddt| d| d| f}|r|| j |< |S )a  
        Recursively extract text from all supported documents in a directory tree.
        
        Args:
            root_dir (str): Root directory containing documents to process
            cache_key (str, optional): Key for caching results
            
        Returns:
            Tuple[Dict[str, str], bool, str]: 
                - Dictionary mapping file paths to extracted text
                - Success status
                - Status message
        z$Using cached extraction results for Fz
Directory z does not existz%No supported document files found in r:   z supported files to processzProcessing z...)r   Errorz!Successfully extracted text from zFailed to extract text from zError processing rU   Nz5No text was successfully extracted from any documentsc                 s   s    | ]}t |V  qd S r   )r+   )r;   r   r'   r'   r(   r     s    zMEnhancedDocumentExtractor.extract_documents_text_recursive.<locals>.<genexpr>Tz documents (z total characters) in )r$   r[   r\   r]   r^   r   r   r+   r   nowr_   r   r   
startswithra   ro   sumr   )r&   r   r   	all_filesdocuments_textsuccessful_filesfailed_files
start_timer   r   r   processing_timeresulttotal_charsr'   r'   r(    extract_documents_text_recursive  sJ   



 
z:EnhancedDocumentExtractor.extract_documents_text_recursive)r3   )r   r   )__name__
__module____qualname____doc__rp   r)   intr-   boolr2   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r'   r'   r'   r(   r   *   s      *>#2r   r   r*   c                 C   s   t | S )z
    Factory function to create an EnhancedDocumentExtractor instance.
    
    Args:
        llm_api_key (str): API key for Claude/Gemini
        
    Returns:
        EnhancedDocumentExtractor: Initialized extractor instance
    )r   )r   r'   r'   r(   "create_enhanced_document_extractor(  s   
r   )+r   r]   rd   sysjsonrF   pathlibr   typingr   r   r   r   rm   rX   pandasr   docx2pdfr   nltknltk.tokenizer   google.generativeaigenerativeair   google.generativeai.typesr	   typesr!   loggingPILr
   r   basicConfigINFO	getLoggerr   r[   downloadr   r   rp   r   r'   r'   r'   r(   <module>   sF    
    