o
    L%i                     @   sN  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlZddlZddlZddlZddlmZ ddlZddlZddlZddlmZ e  dZed	d
dZdZdZeddZedZedZejejde de! gd e"e#Z$G dd dZ%dd Z&e#dkre&  dd Z&e#dkre&  dS dS )z
Bid Processing Script - Extract and process government tender data
Reads PDF work orders and JSON files, uses Claude LLM for PDF extraction
    N)datetime)Path)DictListOptionalAny)OpenAI)load_dotenvzclaude-haiku-4-5 gpZ6FfGe2DKNfIIM2hzqIctuVnntHgvSz#https://api.deepinfra.com/v1/openai)api_keybase_urlopen_llmz Qwen/Qwen3-Next-80B-A3B-InstructAPI_URLzhttp://localhost:5000INTERNAL_API_KEYMASTER_TENANT_IDz)%(asctime)s - %(levelname)s - %(message)szbid_processing.log)levelformathandlersc                   @   s.  e Zd Z		d.dededefddZdefd	d
Zdededee fddZ		d/dededededee f
ddZ
d/dededee dee fddZdd ZdedefddZdededefddZdedefd d!Zd/ded"ee dee fd#d$Z	d/d%ed&ed'ed(ed)ee defd*d+Zd,d- ZdS )0BidProcessorprocessed_bids_data.jsonNroot_directoryoutput_fileanthropic_api_keyc                 C   sP   t || _t || _|ptd| _| jstdtj| jd| _	| 
 | _dS )z
        Initialize the bid processor

        Args:
            root_directory: Root directory containing bid folders
            output_file: Output JSON file path
            anthropic_api_key: Anthropic API key for Claude
        ANTHROPIC_API_KEYz[Anthropic API key required. Set ANTHROPIC_API_KEY environment variable or pass it directly.)r   N)r   r   r   osgetenvr   
ValueError	anthropic	Anthropicclient_load_existing_dataprocessed_data)selfr   r   r    r#   =/var/www/html/minaions-tender/ai-engine/process_eproc_data.py__init__9   s   


zBidProcessor.__init__returnc              
   C   s   | j  rOz/t| j ddd}t|}W d   n1 sw   Y  tdt|dg  d |W S  t	yN } zt
d|  W Y d}~nd}~ww t  d	d	d
d	ddg dS )z)Load existing processed data if availablerutf-8encodingNzLoaded existing data with bidsz bidszError loading existing data: r   zGovernment Tender Processingz0.0%)	scrapedAt	totalBids
totalFoundsourcewithDetailedInfodetailFetchSuccess)metadatar+   )r   existsopenjsonloadloggerinfolenget	Exceptionerrorr   now	isoformat)r"   fdataer#   r#   r$   r    M   s(   

z BidProcessor._load_existing_data	file_path
bid_numberc                 C   sr  z|  std|  W dS trtsgtd td td ts+td ts_td td td td	 td
 td td td td td td W dS |dddd}|j}dt d| d| }ddddddddddddd}|j	 }|
|d}t|do}	d ||	|fi}
t||d!d"}d#ti}tjt d$|
||d%d&}|jd'kr| }|
d(rtd)|  |
d*W  d   W S td+|
d,  	 W d   W dS td-|j d.|j  	 W d   W dS 1 sw   Y  W dS  ty8 } ztd/|j d0|  W Y d}~dS d}~ww )1z
        Upload file to backend via API (stores in S3)

        Args:
            file_path: Path to file
            bid_number: Bid number for organizing files

        Returns:
            Dict with storage details or None if failed
        zFile not found: NzP================================================================================zHCONFIGURATION ERROR: INTERNAL_API_KEY or MASTER_TENANT_ID not configuredu.   ❌ INTERNAL_API_KEY is missing from .env fileu.   ❌ MASTER_TENANT_ID is missing from .env file zTo fix this:z.1. Connect to MongoDB and find your tenant ID:z   mongo> use tenderdbz:   mongo> db.tenants.findOne({}, {_id: 1, companyName: 1})z(2. Add the tenant _id to ai-engine/.env:z)   MASTER_TENANT_ID=<your_tenant_id_here>/_\znexray_training/application/pdfzapplication/mswordzGapplication/vnd.openxmlformats-officedocument.wordprocessingml.documentzapplication/vnd.ms-excelzAapplication/vnd.openxmlformats-officedocument.spreadsheetml.sheetz
text/plainapplication/jsonzapplication/zipzapplication/x-rar-compressedz	image/pngz
image/jpeg).pdf.doc.docx.xls.xlsxz.txtz.jsonz.zipz.rarz.pngz.jpgz.jpegzapplication/octet-streamrbfilefalse)	tenant_idkeycontent_typeencryptx-internal-api-keyz/internal-api/storage/upload<   )filesr@   headerstimeout   successu   ✅ Uploaded file to S3: r@   zBackend upload failed: messagezUpload API error:  - zError uploading file : )r3   r7   warningr   r   r<   replacenamesuffixlowerr:   r4   requestspostBACKEND_API_URLstatus_coder5   r8   textr;   )r"   rB   rC   safe_bid_number	file_namefile_keycontent_type_mapfile_extrT   r?   rX   r@   rY   responseresultrA   r#   r#   r$   _upload_file_to_backende   s   

















* z$BidProcessor._upload_file_to_backendstorage_detailsr2   c                 C   s.  z{|sW dS |j }| }d}t||dd||d|t  dd|p#i d}td	d
}	tjt	 d||	dd}
|
j
dv rl|
 }|dr^|di d}td| d| d |W S td|d  W dS td|
j
 d|
j  W dS  ty } ztd|  W Y d}~dS d}~ww )a:  
        Create Document record in database via API

        Args:
            storage_details: Storage details from S3 upload
            file_path: Original file path
            bid_number: Bid number
            metadata: Additional metadata

        Returns:
            Document ID or None if failed
        Nnexray_documentnexray_trainings3eproc_historicalT)	bidNumberr/   originalFileName
uploadedAtisNexRayTraining)tenantrb   typecategorystorageTypestorageDetailsr2   rI   )rV   zContent-Typez/internal-api/documents   )r5   rY   rZ   )r[      r\   r@   _idu   ✅ Created document record: z (ID: )zDocument creation failed: r]   zDocument API error: r^   z Error creating document record: )rb   rd   r   r   r=   r>   r   re   rf   rg   rh   r5   r:   r7   r8   r<   ri   r;   )r"   rr   rB   rC   r2   rk   
file_lowerdoc_typedocument_datarY   ro   rp   document_idrA   r#   r#   r$   _create_document_record   sZ   


z$BidProcessor._create_document_recordfolder_pathexisting_documentsc                 C   s  g }|rt |dkrtd| d |S zh d}g }| D ]}| r2|j |v r2|| q |s@td|  |W S tdt | d|j	  |D ]f}zF| 
||}|r| j|||d|j	id	}	|	r||	|j	|d
|d|dt  d ntd|j	  n	td|j	  W qP ty }
 ztd|j	 d|
  W Y d}
~
qPd}
~
ww W |S  ty }
 ztd|
  W Y d}
~
|S d}
~
ww )ai  
        Find all files in folder, upload to S3, and create document records

        Args:
            folder_path: Path to folder containing files
            bid_number: Bid number for organization
            existing_documents: List of already uploaded documents to avoid duplicates

        Returns:
            List of document information dicts
        r   z Documents already exist for bid z, skipping upload>   rK   rJ   rM   rL   rN   z9No uploadable files (pdf, doc, docx, xls, xlsx) found in Found z uploadable file(s) in 
folderName)r2   urlrS   size)
documentIdfileName
storageUrl
storageKeyfileSizery   z%Failed to create document record for zFailed to upload zError processing file r_   Nz"Error processing files in folder: )r9   r7   r8   iterdiris_filerc   rd   appendr`   rb   rq   r   r:   r   r=   r>   r;   r<   )r"   r   rC   r   	documentsallowed_extensions	all_filesrP   rr   r   rA   r#   r#   r$   _process_and_upload_files
  sb   


	#z&BidProcessor._process_and_upload_filesc              
   C   s  zgt | jd | jd d< tdd | jd D | jd d< | jd d td| jd d  d d	d
| jd d< t| jddd}tj| j|ddd W d   n1 sWw   Y  t	d| j  W dS  t
y } ztd|  W Y d}~dS d}~ww )zSave processed data to diskr+   r2   r-   c                 s   s    | ]	}d |v rdV  qdS )detailedInfo   Nr#   ).0bidr#   r#   r$   	<genexpr>W  s    z*BidProcessor._save_data.<locals>.<genexpr>r0   r   d   z.1f%r1   wr(   r)      F)indentensure_asciiNzData saved to zError saving data: )r9   r!   summaxr4   r   r5   dumpr7   r8   r;   r<   )r"   r?   rA   r#   r#   r$   
_save_dataR  s   ,zBidProcessor._save_datapdf_pathc           	   
   C   s  zd}z:t |}|jD ]}| }|r||d 7 }qW d   n1 s&w   Y  | r<td|j  |W W S W n ty[ } zt	d|j d|  W Y d}~nd}~ww z;t|d}t
|}|jD ]
}|| d 7 }qkW d   n1 sw   Y  | rtd|j  |W W S W n ty } zt	d	|j d|  W Y d}~nd}~ww t	d
|j d W dS  ty } ztd|j d|  W Y d}~dS d}~ww )zExtract text from PDF filerD   
Nz2Successfully extracted text using pdfplumber from zpdfplumber failed for r_   rO   z.Successfully extracted text using PyPDF2 from zPyPDF2 failed for zCould not extract text from z - might be scannedzError processing PDF )
pdfplumberr4   pagesextract_textstripr7   r8   rb   r;   r`   PyPDF2	PdfReaderr<   )	r"   r   ri   pdfpage	page_textrA   rP   
pdf_readerr#   r#   r$   _extract_text_from_pdfg  sR   
$

$z#BidProcessor._extract_text_from_pdfpdf_textc              
   C   s6  zd| d}|  rAtdkr&| jjjtdd|dgd}|jd j  }nhtd	kr@tj	j
jtd|dgd
}|jd jj  }nMtd|j d t|d}| }t|d}	W d   n1 sgw   Y  | jjjtddddd|	ddd|dgdgd}|jd j  }d|v r|dd }
|d|
}||
|   }nd|v r|dd }
|d}||
|   }zt|}td|j d |W W S  tjy } ztd|j d|  td|  i W  Y d}~W S d}~ww  ty } ztd |  i W  Y d}~S d}~ww )!zDUse Claude to analyze PDF content and extract structured informationz
            Analyze this government work order/purchase order document and extract the following information in JSON format:

            a  

            Please extract and return ONLY a JSON object with this structure:
            {
                "seller_details": {
                    "name": "Company name",
                    "address": "Full address",
                    "contact": "Phone/email if available"
                },
                "work_order_details": {
                    "order_number": "Work order number",
                    "date": "Order date",
                    "total_amount": "Total contract value",
                    "completion_period": "Work completion period"
                },
                "scope_of_work": "Complete description of work/items to be supplied",
                "items": [
                    {
                        "description": "Item description",
                        "quantity": "Quantity",
                        "unit_rate": "Rate per unit",
                        "total_amount": "Total for this item"
                    }
                ],
                "terms_and_conditions": "Key terms and conditions if any"
            }

            Extract only the information that is clearly present in the document. Use null for missing information.
            claudei  user)rolecontent)model
max_tokensmessagesr   r   )r   r   zPDF z0 appears to be scanned - using document analysisrO   r(   Ndocumentbase64rH   )r|   
media_typer@   )r|   r/   ri   )r|   ri   z```json   z```   zSuccessfully analyzed PDF z with Claudez.Failed to parse JSON from Claude response for r_   zClaude response was: z!Error analyzing PDF with Claude: )r   	llm_modelr   r   createCLAUDE_MODELr   ri   openaichatcompletionsopen_llm_modelchoicesr]   r7   r8   rb   r4   readr   	b64encodedecodefindrfindr5   loadsJSONDecodeErrorr<   debugr;   )r"   r   r   promptr]   response_textro   pdf_filepdf_data
pdf_base64
json_startjson_endextracted_datarA   r#   r#   r$   _analyze_pdf_with_llm  s   !
	

z"BidProcessor._analyze_pdf_with_llm	json_pathc              
   C   s   z(t |ddd}t|}W d   n1 sw   Y  td|j  |W S  tyH } ztd|j d|  i W  Y d}~S d}~ww )zLoad and parse JSON datar'   r(   r)   NzSuccessfully loaded zError loading JSON r_   )r4   r5   r6   r7   r8   rb   r;   r<   )r"   r   r?   r@   rA   r#   r#   r$   _load_json_data  s   zBidProcessor._load_json_dataexisting_bid_datac                 C   sR  t d|j  d}d}| D ]}|j dkr|}q|jdkr$|}q|r)|s4t d|j  dS | |}|sFt d|j  dS |di dd	pR|j}d}|rl|d
g }|rlt dt	| d|  t d| d | 
|||}	|	rt dt	|	 d n	t d|j  | |}
d	g i d	d}| |j||||	S )z0Process a single folder containing bid documentszProcessing folder: NrJ   zstage_summary_data.jsonzMissing required files in zCould not load JSON data from 
headerInfotenderIDrD   nexray_documentsr   z existing documents for bid u&   📤 Checking/Uploading files for bid ...u   ✅ Successfully uploaded z document(s) to S3u&   ⚠️ No documents were uploaded for )scope_of_workitemsseller_detailsterms_and_conditions)r7   r8   rb   r   rc   rd   r`   r   r:   r9   r   r   _structure_bid_data)r"   r   r   r   	json_filerP   	json_datarC   r   uploaded_documentsr   pdf_analysisr#   r#   r$   _process_folder  sD   


zBidProcessor._process_folderfolder_namer   r   r   r   c                 C   sr  |du rg }| di }| dg }| dd}| dg }	d| | ddgt| d	dd
kr>| d	ddd
 d n| d	dg| d	dg| dddgt|dkrYdndg|d}
d	|v rj|d	 g|
d< | d|t|dk|| dddtt| dg d| ddddddd| ddv r| dddd ndd| ddv r| dddd ndddg |
d r|
d d ndt|dddd| di dd d!d"}g }t|dD ]\}}| d#d| d$d%d&| d$dv rd&ndd'}| d#t|v r	 |	D ]L}| d#| d#krX| d(d)|d*< d+|d,< | d#d|d- d. d/ d0< | drG| d|d- d. d/ d1< |t| d(d)d2d  nq|d- d. d3 | q|rt||d- d. d/ d4< t	||d- d. d/ d5< t	|t| |d- d. d/ d6< | d7r|
d8 r|
d8  d9| d7 7  < n	d:| d7 |
d8< ||
d;< |
S )<z;Structure the extracted data according to the target formatNr   bidsListSectionfinanceBidOpeningSummaryrD   awardedBidsfolder_r   tenderTitler   r   r   Fr   r   )b_idb_bid_numberb_category_namebd_category_nameb_scope_workis_high_valueba_is_single_packetr   	bbt_titleActiver   z
30 ( Days)organisationChainz||z, Ministry
Department)addressministry
departmentorganisationoffice)rw   	bidStatusquantitybidValiditybuyerDetailsr   r   )totalSellerslowestPricehighestPrice
priceRangel1Winnerl1Winner_details)sellersofferedItemssummary)
bidDetails
evaluation)bidIdisSinglePacketr   sectionszBidder NameStatusUnknownMSE)
sellerNamestatus
statusTypezAwarded Value0
totalPriceL1rankr  r  r  r  r  ,r  r  r  r  r   r   z

Terms & Conditions:
zTerms & Conditions:
r   )
r:   r9   strra   split	enumerater   floatminr   )r"   r   r   r   r   r   header_info	bids_listfinance_summaryawarded_bidsbid_datadetailed_infoseller_pricesir   sellerawardedr#   r#   r$   r   D  s   8



((
"


 
z BidProcessor._structure_bid_datac                 C   s  t d| j  dd | j D }t|}t d| d d}|D ]}zd|j }d}d}t| jd	 D ]\}}	|	d
d|krJ|	}|} nq8|rzd|v }
|do_t|dg dk}|
rp|rpt d|j d W q$t d|j d | 	||}|r|dur|| jd	 |< t d|j  n| jd	 
| |d7 }|   t d| d| d td W q$ ty } zt d|j d|  W Y d}~q$d}~ww t d| d | jS )z)Process all folders in the root directoryzStarting to process folders in c                 S   s   g | ]}|  r|qS r#   )is_dir)r   r?   r#   r#   r$   
<listcomp>  s    z4BidProcessor.process_all_folders.<locals>.<listcomp>r   z folders to processr   r   Nr+   r   rD   r   r   zFolder z4 already fully processed with documents, skipping...z( exists but incomplete, re-processing...zUpdated existing bid data for r   z
Processed rE   z folderszError processing folder r_   zProcessing complete. Processed z new folders.)r7   r8   r   r   r9   rb   r%  r!   r:   r   r   r   timesleepr;   r<   )r"   folderstotal_foldersprocessed_countfolder	folder_idr   existing_bid_indexidxr   has_detailed_infohas_documentsr,  rA   r#   r#   r$   process_all_folders  sR   
z BidProcessor.process_all_folders)r   N)N)__name__
__module____qualname__r#  r%   r   r    r   r   rq   r   r   r   r   r   r   r   r   r   r?  r#   r#   r#   r$   r   8   sB    
`
$FH*q 8
mr   c               
   C   s   t jdd} | jddd | jdddd	 |  }|j}|j}tjd
}tj	
|s5td| d dS z#t|||}| }td tdt|d   td|  W dS  tyz } ztd|  td|  W Y d}~dS d}~ww )&Main function to run the bid processorzProcess bid folders)descriptionr   z0Root directory containing bid folders to process)helpz--outputr   z4Output file name (default: processed_bids_data.json))defaultrE  r   Error: Directory  does not exist!N
Processing complete!Total bids processed: r+   Results saved to: Error in main processing: Error: )argparseArgumentParseradd_argument
parse_argsr   outputr   environr:   pathr3   printr   r?  r9   r;   r7   r<   )parserargsROOT_DIRECTORYOUTPUT_FILEr   	processorrp   rA   r#   r#   r$   main  s8   r[  __main__c               
   C   s   d} d}d}t j| std|  d dS z#t| ||}| }td tdt|d	   td
|  W dS  ty[ } zt	d|  td|  W Y d}~dS d}~ww )rC  z/contentr   zl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  rH  NrI  rJ  r+   rK  rL  rM  )
r   rT  r3   rU  r   r?  r9   r;   r7   r<   )rX  rY  r   rZ  rp   rA   r#   r#   r$   r[    s"   )'__doc__r   r5   loggingr4  r   pathlibr   typingr   r   r   r   r   r   r   r   r   r   rN  re   dotenvr	   r   r   r   r   rg   r   r   basicConfigINFOFileHandlerStreamHandler	getLoggerr@  r7   r   r[  r#   r#   r#   r$   <module>   sb   


     >(
