o
    Gfht4                     @   s   d Z ddlZddlZddlZddlmZmZmZmZ ddl	Z	ddl
m
Z
 ddlmZ e  edZeddZed	Zed
dZeddZddlmZ eddZeeZG dd dZdedefddZdS )z
Document Analyzer Module

This module uses Claude LLM to analyze extracted document text and identify
document types, extract key information, and provide descriptions.
    N)DictAnyListOptional)datetime)load_dotenvANTHROPIC_API_KEYANTHROPIC_MODELzclaude-3-5-haiku-latestOPENAI_API_KEYOPENAI_MODELz Qwen/Qwen3-Next-80B-A3B-Instruct
OPENAI_URLz#https://api.deepinfra.com/v1/openai)OpenAIGENAI_ENGINEclaudec                   @   s   e Zd ZdZdefddZdefddZded	edeeef fd
dZ	deeef deeeeef f fddZ
deeeeef f deeef fddZdS )DocumentAnalyzerzp
    Document analyzer that uses Claude to analyze document content
    and extract structured information.
    claude_api_keyc                 C   sd   t dkrtj|d| _nt dkrtttd| _g dg dg dg dg d	g d
g dd| _dS )z
        Initialize the document analyzer.
        
        Args:
            claude_api_key (str): API key for Claude
        r   )api_keyopen_llm)r   base_url)Incorporation_CertificateMOAAOAPAN_CardPAN_CertificateGST_CertificateMSME_CertificateMSME_RegistrationCIN_CertificateDirector_PANDirector_AadhaarAddress_ProofLease_AgreementPartnership_DeedLicenseTrade_LicenseRegistration_Certificate)ITRIncome_Tax_ReturnBalance_SheetAudit_ReportCA_CertificateTurnover_CertificateRevenue_CertificateFinancial_StatementBank_StatementTax_Compliance_CertificateCopy_of_Cheque)ISO_CertificateCMMI_CertificatePatent_CertificateMembership_CertificateQuality_CertificateCompliance_CertificateAward_CertificateTraining_Certificate)

Work_OrderPurchase_OrderContract	AgreementCompletion_CertificatePerformance_CertificateClient_CertificateProject_CertificateReference_LetterTestimonial)
Product_BrochureTechnical_Specification
Price_List	Quotation
OEM_LetterAuthorization_LetterProduct_CertificateTechnical_DocumentManual	Datasheet)Employee_ResumeEmployee_CertificateSalary_CertificateExperience_LetterAppointment_LetterEmployee_ListEmployee_Details)LetterEmailCorrespondenceInvoiceReceiptGeneral_DocumentMiscellaneous)Legal	FinancialCertificates
Experience	TechnicalHROtherN)		llm_model	anthropic	Anthropicclaude_clientr   r
   r   openaidocument_categories)selfr    ri   </var/www/html/minaions-tender/ai-engine/document_analyzer.py__init__%   s   zDocumentAnalyzer.__init__returnc                 C   s&   d dd | j D }d| dS )zr
        Get the prompt for document analysis.
        
        Returns:
            str: Analysis prompt
        
c                 S   s&   g | ]\}}d | dd | qS )z- : z, )join).0categorysubtypesri   ri   rj   
<listcomp>b   s    z8DocumentAnalyzer.get_analysis_prompt.<locals>.<listcomp>z
You are an expert document analyzer. Analyze the provided document text and extract structured information.

DOCUMENT CATEGORIES AND TYPES:
a  

Your task is to:
1. Identify the document type using the format "Category:Subtype" (e.g., "Financial:ITR", "Experience:Work_Order")
2. Extract key information relevant to the document type
3. Provide a concise description
4. Note the original filename if mentioned

RESPONSE FORMAT - Return ONLY a valid JSON object with this exact structure:
{
    "doc_type": "Category:Subtype",
    "key_info": {
        "key1": "value1",
        "key2": "value2"
    },
    "description": "Brief description of the document and its contents",
    "confidence": 0.95
}

KEY INFORMATION EXTRACTION GUIDELINES:
- For Financial documents (ITR, Balance Sheet, etc.): Extract assessment year, amounts, company name, total revenue
- For Experience documents (Work Orders, Contracts, etc.): Extract customer, location, project details, value, dates
- For Legal documents: Extract company name, registration numbers, dates, document number/ID
- For Certificates: Extract issuing authority, validity period, certificate number, scope, purpose of certificate
- For Technical documents: Extract product details, specifications, prices, suppliers, OEM name

IMPORTANT RULES:
- Always respond with valid JSON only
- Use "Other:General_Document" if the document type cannot be clearly determined
- Extract dates in a consistent format when possible
- Include currency symbols and units where applicable
- Set confidence between 0.1 and 1.0 based on how certain you are about the classification
- If key information is not available, use empty strings or null values
- Keep descriptions concise but informative (1-3 sentences)
- If the document is signed or verified or attested, then you may please ignore the information about signing authority or person

DO NOT include any text outside of the JSON structure.
)ro   rg   items)rh   categories_textri   ri   rj   get_analysis_prompt[   s   
z$DocumentAnalyzer.get_analysis_promptdocument_text	file_pathc                 C   s4  z|   }tj|}d| d| }t|dkr!|dd d }tdkrC| jjjt	ddd	d
| d| dgd}|j
d j }n"tdkre| jjjjtdd	dd
| d| dgd}|jd jj
}|drv|dddd }z2t|}||d< ||d< t  |d< g d}	|	D ]
}
|
|vrd||
< qd|vrd|d< |W W S  tjy } z,td| d|  td|  di d| d||t  d d!W  Y d}~W S d}~ww  ty } z+td"| d|  di d#t| d|tj|t  t|d!W  Y d}~S d}~ww )$a*  
        Analyze a single document and extract structured information.
        
        Args:
            document_text (str): Extracted text from the document
            file_path (str): Path to the original file
            
        Returns:
            Dict[str, Any]: Analysis results
        z
FILENAME: z

DOCUMENT CONTENT:
i Nz#
[Document truncated due to length]r   i  g?zYou are an expert document analyzer. Analyze documents and return structured JSON data about their type, key information, and description.userz

ANALYZE THIS DOCUMENT:
)rolecontent)model
max_tokenstemperaturesystemmessagesr   r   r   )r|   r   z```json z```rx   	file_nameanalysis_date)doc_typekey_infodescription
confidenceg      ?z"Failed to parse JSON response for rn   zResponse was: zOther:General_Documentz8Document analysis failed - could not parse response for zJSON parsing failed)r   r   r   r   rx   r   r   errorzError analyzing document z'Document analysis failed due to error: )rv   ospathbasenamelenrb   re   r   creater	   r{   textstriprf   chatcompletionsr   choicesmessage
startswithreplacejsonloadsr   now	isoformatJSONDecodeErrorloggerr   	Exceptionstr)rh   rw   rx   promptfilename	full_textresponseresponse_textanalysis_resultrequired_fieldsfielderi   ri   rj   analyze_document   s   





z!DocumentAnalyzer.analyze_documentdocuments_textc           
      C   s   i }t |}td| d t| dD ];\}\}}td| d| dtj|  | ||}|||< |	dd}|	d	d
}	td| d|	dd qtd| d |S )a!  
        Analyze multiple documents in batch.
        
        Args:
            documents_text (Dict[str, str]): Dictionary mapping file paths to extracted text
            
        Returns:
            Dict[str, Dict[str, Any]]: Dictionary mapping file paths to analysis results
        zStarting analysis of z
 documents   zAnalyzing document /rn   r   Unknownr   r   z  -> Classified as: z (confidence: z.2f)zCompleted analysis of )
r   r   info	enumeratert   r   r   r   r   get)
rh   r   analysis_results
total_docsirx   r   r   r   r   ri   ri   rj   analyze_documents_batch   s   
$z(DocumentAnalyzer.analyze_documents_batchr   c                 C   s   |sdi ddS i }g }|  D ]$}|dd}|dd}||v r+||  d7  < nd||< || q|r?t|t| nd}t|||dd |  D d	S )
z
        Generate a summary of the analysis results.
        
        Args:
            analysis_results (Dict[str, Dict[str, Any]]): Analysis results
            
        Returns:
            Dict[str, Any]: Summary statistics
        r   )total_documents
categoriesavg_confidencer   zOther:Unknownr   r   c                 S   s$   g | ]}| d ddk r|d qS )r   r   g333333?r   )r   )rp   resultri   ri   rj   rs   7  s
    z9DocumentAnalyzer.get_analysis_summary.<locals>.<listcomp>)r   r   r   low_confidence_docs)valuesr   appendsumr   )rh   r   r   confidencesr   r   r   r   ri   ri   rj   get_analysis_summary  s&   
z%DocumentAnalyzer.get_analysis_summaryN)__name__
__module____qualname____doc__r   rk   rv   r   r   r   r   r   ri   ri   ri   rj   r      s    67*g.r   r   rl   c                 C   s   t | S )z
    Factory function to create a DocumentAnalyzer instance.
    
    Args:
        claude_api_key (str): API key for Claude
        
    Returns:
        DocumentAnalyzer: Initialized analyzer instance
    )r   )r   ri   ri   rj   create_document_analyzer?  s   
r   )r   r   loggingr   typingr   r   r   r   rc   r   dotenvr   getenvr   r	   r
   r   r   rf   r   rb   	getLoggerr   r   r   r   r   ri   ri   ri   rj   <module>   s*    


  "