o
    7i>!                     @   sf   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZ dZdZG dd	 d	ZdS )
    N)DictOptional)Path)	Anthropic)OpenAIz#https://api.deepinfra.com/v1/openaiz Qwen/Qwen3-Next-80B-A3B-Instructc                   @   s   e Zd ZdZddedefddZdedefd	d
Zdedeee	e f fddZ
dedeee	e f fddZddededefddZdS )TenderLocationExtractorz.Extract delivery location from GeM tender PDFsTapi_keyuse_anthropicc                 C   s,   || _ |rt|d| _dS t|td| _dS )z
        Initialize with API key for LLM service

        Args:
            api_key: API key for OpenAI
            use_anthropic: If True, use Anthropic Claude; if False, use OpenAI
        )r   )r   base_urlN)r	   r   clientr   
OPENAI_URL)selfr   r	    r   >/var/www/html/minaions-tender/ai-engine/location_finder_gem.py__init__   s   z TenderLocationExtractor.__init__pdf_pathreturnc           
   
   C   s   ddl }|j|std| |j|}|dkr"td| z.d}t|}|jD ]}| }|r<||d 7 }q.W d   n1 sGw   Y  |	 W S  tyy } zt
|}	d|	v sdd|	v rntd	| d
|	 td|	 d}~ww )z
        Extract text from PDF file using pdfplumber

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Extracted text as string
        r   Nz!PDF file does not exist at path: zPDF file is empty (0 bytes):  
zNo /Root objectz	not a PDFz%Invalid or corrupted PDF file (size: z	 bytes): z Error extracting text from PDF: )ospathexists	Exceptiongetsize
pdfplumberopenpagesextract_textstripstr)
r   r   r   	file_sizetextpdfpage	page_texte	error_msgr   r   r   extract_text_from_pdf   s0   


z-TenderLocationExtractor.extract_text_from_pdfr!   c              
   C   s  d| d}| j r| jjjddd|dgd}|jd j}n!td	t | jjj	jtd
ddd|dgddid}|j
d jj}td| ztdd|}tdd|}t| }|W S  tjy } ztd|  td|  ddddddW  Y d}~S d}~ww )z
        Use LLM to extract location information from tender text

        Args:
            text: Extracted text from PDF

        Returns:
            Dictionary with city, state, and full_address
        u  Analyze the following GeM (Government e-Marketplace) tender document text and extract the delivery location information.

Focus on finding:
1. The consignee address (परेषती/Consignee address)
2. Office location (कार्यालय का नाम/Office Name)
3. Any delivery address mentioned

From this information, identify:
- City name
- State name (if city is mentioned but not state, infer the state from the city)
- Full delivery address

Tender Document Text:
af  

Please respond in JSON format with the following structure:
{
    "city": "city name or null",
    "state": "full state name or null",
    "full_address": "complete delivery address or null",
    "confidence": "high/medium/low"
}

Important:
- For city names, map them to their respective states (e.g., Kanchipuram -> Tamil Nadu, Mumbai -> Maharashtra, Kadapa -> Andhra Pradesh)
- Use full state names, not abbreviations (e.g., "Andhra Pradesh" not "AP")
- Name of the state has to be one out of 28 states or 8 UTs. That is one out of: Andhra Pradesh, Arunachal Pradesh, Assam, Bihar, Chhattisgarh, Goa, Gujarat, Haryana, Himachal Pradesh, Jharkhand, Karnataka, Kerala, Madhya Pradesh, Maharashtra, Manipur, Meghalaya, Mizoram, Nagaland, Odisha, Punjab, Rajasthan, Sikkim, Tamil Nadu, Telangana, Tripura, Uttar Pradesh, Uttarakhand, and West Bengal. Andaman and Nicobar Islands, Chandigarh, Dadra and Nagar Haveli and Daman and Diu, Delhi, Jammu and Kashmir, Ladakh, Lakshadweep, and Puducherry.
- If multiple addresses exist, prioritize the consignee/delivery address
- Return null for fields you cannot confidently extractzclaude-sonnet-4-20250514i  user)rolecontent)model
max_tokensmessagesr   z 
Invoking LLM with OpenAI model:systemzdYou are a helpful assistant that extracts location (Indian State) information from tender documents.typejson_object)r+   r-   response_formatz
LLM response is:z
```json\s*r   z```\s*$zError parsing LLM response: zRaw response: NlowzFailed to parse LLM response)citystatefull_address
confidenceerror)r	   r   r-   creater*   r!   printOPENAI_MODELchatcompletionschoicesmessageresubjsonloadsr   JSONDecodeError)r   r!   promptresponseresult_textlocation_datar%   r   r   r   extract_location_with_llmB   sJ   



z1TenderLocationExtractor.extract_location_with_llmc                 C   sp   g d}d}t ||t j}|r|d nd}d}|D ]}t dt | d |t jr/|} nqd||dddS )	z
        Fallback method using regex patterns to extract location

        Args:
            text: Extracted text from PDF

        Returns:
            Dictionary with extracted location information
        )zAndhra PradeshzArunachal PradeshAssamBiharChhattisgarhGoaGujaratHaryanazHimachal Pradesh	Jharkhand	KarnatakaKeralazMadhya PradeshMaharashtraManipur	MeghalayaMizoramNagalandOdishaPunjab	RajasthanSikkimz
Tamil Nadu	TelanganaTripurazUttar PradeshUttarakhandzWest Bengalu=   (?:पता|Address|परेषती)[:\s]*([^\n]{20,200})r   Nz\br2   regex_fallback)r3   r4   r5   r6   method)r?   findall
IGNORECASEsearchescape)r   r!   statesaddress_patternaddress_matchesr5   r4   sr   r   r   extract_location_regex_fallback   s    	z7TenderLocationExtractor.extract_location_regex_fallbackFuse_fallbackc           	      C   s   t d|  | |}t dt| d d}||}|dkr8|t| }|||d  }t d| d nt d| d	 |rH| |}n| |}t|j|d
|}|S )a  
        Main method to process tender PDF and extract location

        Args:
            pdf_path: Path to the PDF file
            use_fallback: If True, use regex fallback instead of LLM

        Returns:
            Dictionary with extracted information
        zProcessing: z
Extracted z characters from PDFu   पपतताा/Addressi  z+Trimmed text to 400 characters after last ''z' not found, using full text)	file_name	file_path)r9   r'   lenrfindrh   rH   r   name)	r   r   ri   r!   
search_stridx	start_idxlocation_inforesultr   r   r   process_tender_pdf   s&   


z*TenderLocationExtractor.process_tender_pdfN)T)F)__name__
__module____qualname____doc__r   boolr   r'   r   r   rH   rh   rv   r   r   r   r   r      s    #Q(r   )r   r?   rA   typingr   r   pathlibr   	anthropicr   openair   r   r:   r   r   r   r   r   <module>   s    