o
    h$                     @   s   d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	 ddl
m
Z
 ddlmZ ddlmZ ddlmZ ejejded	eejgd
 eeZG dd dZdededefddZdededefddZdS )z
Main Document Processor

This is the main application that orchestrates the entire document processing pipeline:
1. Recursive document discovery and text extraction
2. Document analysis using Claude LLM
3. Data formatting into required output formats
    N)DictAnyOptional)datetime)"create_enhanced_document_extractor)create_document_analyzer)create_data_formatterz4%(asctime)s - %(name)s - %(levelname)s - %(message)szdocument_processor.log)levelformathandlersc                   @   sx   e Zd ZdZdefddZ	ddededed	eeef fd
dZ	deded	eeef fddZ
d	eeef fddZdS )DocumentProcessorzH
    Main document processor that orchestrates the entire pipeline.
    claude_api_keyc                 C   s&   || _ t|| _t|| _t | _dS )z
        Initialize the document processor.
        
        Args:
            claude_api_key (str): API key for Claude
        N)r   r   	extractorr   analyzerr   	formatter)selfr    r   B/var/www/html/minaions-tender/ai-engine/main_document_processor.py__init__'   s   

zDocumentProcessor.__init__T	input_dir
output_dir	use_cachereturnc              
   C   s  t  }td td|  td|  tj|dd | ||i d}ztd t  }|r8|nd}| jj||d	\}}	}
t  | }|	|
|	 t
|d
|d d< |	sotd|
  d|d< |
|d< |W S tdt
| d tj|d}t|ddd}tj||ddd W d   n1 sw   Y  td t  }| j|}t  | }| j|}d|	 t
||d|d d< tdt
| d td|d d td  t  }| j||}t  | }d|	 |d!|d d"< t  | }d|d< t   |d#< |	 |d$< ||d%< td&|  td'|  |W S  ty` } z"td(|  d|d< t||d< t   |d#< |W  Y d}~S d}~ww ))a  
        Process all documents in the input directory and generate outputs.
        
        Args:
            input_dir (str): Directory containing documents to process
            output_dir (str): Directory to save output files
            use_cache (bool): Whether to use caching for extraction
            
        Returns:
            Dict[str, Any]: Processing results and statistics
        z%Starting document processing pipelinezInput directory: zOutput directory: T)exist_ok)
start_timer   r   stepsz)Step 1: Extracting text from documents...N)	cache_key)successmessagedurationdocuments_foundr   
extractionzText extraction failed: Fr   errorz!Successfully extracted text from z
 documentszextracted_text.jsonwzutf-8)encoding   )indentensure_asciiz*Step 2: Analyzing documents with Claude...)r   r   documents_analyzedsummaryanalysiszSuccessfully analyzed zAverage confidence: avg_confidence.2fz(Step 3: Formatting and saving results...)r   r   output_files
formattingend_timetotal_durationr-   z.Document processing completed successfully in zOutput files saved to: z"Error during document processing: )r   nowloggerinfoosmakedirs	isoformatr    extract_documents_text_recursivetotal_secondslenr"   pathjoinopenjsondumpr   analyze_documents_batchget_analysis_summaryr   save_formatted_data	Exceptionstr)r   r   r   r   r   processing_resultsstep1_startr   documents_textextraction_successextraction_messagestep1_durationextracted_text_pathfstep2_startanalysis_resultsstep2_durationanalysis_summarystep3_startr-   step3_durationr0   er   r   r   process_documents3   s   



z#DocumentProcessor.process_documents	file_pathc              
   C   s   t d|  tj|sdd| dS z*| j|}|dr(d|dW S | j	||}||i}| j
||}d||dW S  tya } zt d|  dt|dW  Y d	}~S d	}~ww )
a	  
        Process a single document file.
        
        Args:
            file_path (str): Path to the document file
            output_dir (str): Directory to save output files
            
        Returns:
            Dict[str, Any]: Processing results
        zProcessing single document: FzFile not found: )r   r"   ErrorT)r   analysis_resultr-   z"Error processing single document: N)r2   r3   r4   r:   existsr   extract_text_from_file
startswithr   analyze_documentr   rA   rB   r"   rC   )r   rT   r   textrV   rM   r-   rR   r   r   r   process_single_document   s&   
z)DocumentProcessor.process_single_documentc                 C   s   | j jS )z
        Get the list of supported file types.
        
        Returns:
            Dict[str, list]: Dictionary of supported file extensions by category
        )r   supported_extensions)r   r   r   r   get_supported_file_types   s   z*DocumentProcessor.get_supported_file_typesN)T)__name__
__module____qualname____doc__rC   r   boolr   r   rS   r\   listr^   r   r   r   r   r   "   s    

n(r   r   r   api_keyc                 C   sP   t |}|| |}|d rtd td|d   |S td|d   |S )z
    Example function showing how to process a directory of documents.
    
    Args:
        input_dir (str): Directory containing documents
        output_dir (str): Directory to save outputs
        api_key (str): Claude API key
    r   z"Processing completed successfully!zOutput files: r-   Processing failed: r"   )r   rS   print)r   r   re   	processorresultsr   r   r   process_directory   s   	rj   rT   c                 C   sl   t |}|| |}|d r+td td|d d   td|d d d dS td	|d
   dS )z
    Example function showing how to process a single document.
    
    Args:
        file_path (str): Path to document file
        output_dir (str): Directory to save outputs
        api_key (str): Claude API key
    r   z Document processed successfully!zDocument type: rV   doc_typezConfidence: 
confidencer,   rf   r"   N)r   r\   rg   )rT   r   re   rh   ri   r   r   r   process_single_file   s   	rm   )rb   r4   sysr=   loggingargparsetypingr   r   r   r   enhanced_document_extractorr   document_analyzerr   data_formatterr   basicConfigINFOFileHandlerStreamHandlerstdout	getLoggerr_   r2   r   rC   rj   rm   r   r   r   r   <module>   s.    	

 3