o
    !6i[                     @   st   d Z ddlZddlZddlZddlmZmZmZmZ ddl	m
Z
 ddlZe
G dd dZG dd dZd	d
 ZdS )z
Final Annexure Extractor with Table of Contents Detection
- Skips table of contents pages
- Finds actual format pages only
- No duplicates, includes A and B
    N)DictListTupleOptional)	dataclassc                   @   s^   e Zd ZU dZeed< eed< eed< eed< eed< dZeed< dZ	eed	< d
Z
eed< dS )Annexurez Represents an extracted annexuretitlenumber
page_startpage_endcontentF	has_tablehas_formr   field_countN)__name__
__module____qualname____doc__str__annotations__intr   boolr   r    r   r   A/var/www/html/minaions-tender/ai-engine/extract_annexures_auto.pyr      s   
 r   c                   @   sR  e Zd ZdZd.defddZdefddZd	ed
efddZdede	d
efddZ
dede	d
eeee ee ee f fddZded
ee fddZded
efddZdee d
ee fddZdee d
efddZdee d
ee fdd Zded
efd!d"Zde	d
efd#d$Zd%ee d
efd&d'Zd(eee  d
efd)d*Zd+d, Zd-S )/FinalAnnexureExtractorz8
    Final version with Table of Contents detection
    Tverbosec                 C   s&   || _ g | _t | _t | _d| _d S )NF)r   	annexuressetseen_numbers	toc_pagesin_list_section)selfr   r   r   r   __init__#   s
   
zFinalAnnexureExtractor.__init__msgc                 C   s   | j rtd|  d S d S )Nz[ANNEXURE] )r   print)r!   r#   r   r   r   log*   s   zFinalAnnexureExtractor.logr	   returnc                 C   s   t dd| S )zNormalize annexure numberz\s+ )resubupper)r!   r	   r   r   r   _normalize_number.   s   z(FinalAnnexureExtractor._normalize_numbertextpage_numc                    sb   d}t ||}t|dkr/t fdd|D }t|dkr/ d| dt| d dS d	S )
z
        Detect if this page is a table of contents.

        KEY HEURISTIC: If we find MULTIPLE different annexure numbers (3+)
        on the SAME page, it's likely a table of contents, not actual formats.
        u(   (?i)ANNEXURE[\s\-–—]+([IVXABCD0-9]+)   c                 3   s    | ]}  |V  qd S )N)r+   ).0mr!   r   r   	<genexpr>?   s    zDFinalAnnexureExtractor._is_table_of_contents_page.<locals>.<genexpr>u     📋 Page z  detected as Table of Contents (z annexures listed)TF)r(   findalllenr   r%   )r!   r,   r-   patternmatchesunique_numbersr   r1   r   _is_table_of_contents_page2   s   z1FinalAnnexureExtractor._is_table_of_contents_pagelinec                 C   s\  |  }|rt|dk rdS g d}|D ]}t||}|r|d  }|d  }t| dkr;|d  nd}	| |}
|
| jv rX| d| d| d	|   dS t|d
kra dS g d}|	r|D ]}t	||	tj
r| d| d| d| d   dS qi|	rt|	dkr| d| d| dt|	 d  dS d||	|f  S qdS )zn
        Detect if a line is an annexure heading.
        Returns: (is_match, number, title, keyword)
        r.   )FNNN)uF   (?i)^(ANNEXURE)[\s\-–—]+([IVXABCD0-9\-]+)[\s\-–—:]*(.{0,100})$uC   (?i)^(ANNEX)[\s\-–—]+([IVXABCD0-9\-]+)[\s\-–—:]*(.{0,100})$uB   (?i)^(FORM)[\s\-–—]+([IVXABCD0-9\-]+)[\s\-–—:]*(.{0,100})$uF   (?i)^(APPENDIX)[\s\-–—]+([IVXABCD0-9\-]+)[\s\-–—:]*(.{0,100})$uD   (?i)^(FORMAT)[\s\-–—]+([IVXABCD0-9\-]+)[\s\-–—:]*(.{0,100})$ub   (?i)^FORMAT[\s\-–—:]*([A-Z0-9]+|[IVXLCDM]+)(?:[\s\-–—:]+([A-Z0-9][A-Z0-9\-\s]*[A-Z0-9]))?$u  (?<![A-Za-z])(FORMAT|FORM|PROFORMA|PRO[\s\-]?FORMA)[\s\-:–—]*([A-Z][\s\-–—]+\d+[A-Z]?|[A-Z]{1,2}[\-–—]\d+[A-Z]?|\d{1,3}[\-–—][A-Z]{1,2}|[A-Z]{1,2}\d+[A-Z]?|\d{1,3}[A-Z]{1,2}|X{0,3}(?:IX|IV|V?I{1,3}|VI{0,3})|\d{1,3}|[A-Z]{1,2})(?=[\s\-:,;.\)\]\}\n\r]|$)u@   ^(F)[\s]*[\-–—][\s]*(\d{1,2}[A-Z]?)[\s\-–—:]*(.{0,100})$      r'   u       ⚠️  Skipping duplicate  	 on page    )z\bin compliance\bz\border no\.?\bz\bdt\.?\s+\d{2}z
\bas per\bz\brefer to\bz\bsubmit\b.*\bat\bz\bundertaking at\bz\bgiven in\bz\bmentioned in\bz\bspecified in\bz\bamended from time to time\bz\blist of forms\bz\bform no\.u"       ⚠️  Skipping reference to z (contains 'z')<   u,       ⚠️  Skipping potential reference to z (title too long: z chars)T)stripr4   r(   matchgroupgroupsr+   r   r%   search
IGNORECASE)r!   r9   r-   
line_cleanpatternsr5   rA   keywordr	   r   norm_numberreference_keywordsref_patternr   r   r   _is_annexure_headingF   s:   "

"0z+FinalAnnexureExtractor._is_annexure_headingfile_txtc              
      s<    d  fdd| D } |}|r|d d nd}  dt| d| d	 g }d
}g }d}d
}	|D ]2}
|
d }|
d }|d
d D ]}td|rad _|}	  d|   nqJ jrx|	d
urx||	krxd _  d|  d}|d
d D ]} ||\}}}}|r! jr  d| d| d qd}|r|s|d }||jk r|j}||_	 
||_||   d|j d|j	 d n	 d
}g }d}| dkrd}n| }|r| d| d| }n| d| }t||||dd}| dkrdn| }  d| d| d |  |
g}d} nq|r6|r1|d d |kr6||
 |rl|d!d
 D ],} |rj||_	 
||_||   d"|j d#| d$ d
}g }d} nq?q9|r|d d }||_	 
||_||   d%|j d&| d   d't| d( |S ))z
        Extract annexures from a plain text file, using page markers if present.
        Returns Annexure objects with page_start/page_end ranges and merged content.
        z#Processing TXT file (page-aware)...c                    s   g | ]}  |qS r   )_fix_encoding_artifacts)r/   r9   r1   r   r   
<listcomp>   s    z@FinalAnnexureExtractor.extract_all_annexures.<locals>.<listcomp>r-   r   z	Detected z pages (last page number: )NFlines
   z.(?i)list\s+of\s+forms?\s*(?:&|and)?\s*formats?Tu.     📋 Entering LIST OF FORMS section on page u+     ✓ Exited LIST OF FORMS section at page    u       ⚠️  Skipping r<   z (in LIST OF FORMS section)r:   u     ⚠️ Closed z	 at page z (no signature found)FFormatr'   r   r	   r
   r   r   FORMATu     ✓ Found r=   iu     ✅ z ended on page z (signature found)u	     ℹ️ z closed at last page Found z annexures in text file)r%   
splitlines_split_text_into_pagesr4   r(   rD   r    rL   r
   r   _merge_pages_contentr   appendr   r*   r   _is_signature_line)r!   rM   	raw_linespagestotal_pagesr   current_anncurrent_pages_accsaw_signature_for_currentlist_section_start_pagepagepnumrR   lnheading_foundis_annexurer	   r   rH   prev_page_numkeyword_display
full_titlelog_keywordlast_page_numr   r1   r   extract_all_annexures   s   




	6


z,FinalAnnexureExtractor.extract_all_annexuresc                 C   s:   |sdS |  }g d}td|tj}t||S )z
        Return True if the line looks like a signature block or signatory phrase.
        Checks last lines of a page for phrases such as 'signature', 'signed', 'authorized signatory', etc.
        F)z\bsignature\bz
\bsigned\bz\bsign(ed)?\s+and\s+stamp\bz\bsign\s+and\s+stamp\bz\bauthorized\s+signatory\bz\bauthorised\s+signatory\bz\bfor\s+and\s+on\s+behalf\bz \bfor\s+and\s+on\s+behalf\s+of\bz)\b(approved|auth)\.?[ -]?sign(?:atory)?\bz	\bsig\.\bz\b(Sign|Signed|Signature):|)r@   r(   compilejoinrE   r   rD   )r!   r,   sig_patternsr5   r   r   r   r^     s   z)FinalAnnexureExtractor._is_signature_linerR   c                    s   g g  dd}t dt j}dtf fdd}|D ]$}||}|r3d}t|d}|| qdu r;|d  | qdurO d	 S dg d	 S )
z
        Return list of pages: [{'page_num': int, 'lines': [...]}, ...]
        Detects patterns like:
          --- Doc Page Number: 1 ---
          Page 1 of 96
        Falls back to single page if no page markers are found.
        NFz+---\s*Doc\s+Page\s+Number\s*:\s*(\d+)\s*---rg   c                    s$   d ur  d | g  d S )Nr-   rR   )r]   )rg   current_page_linescurrent_page_numr`   r   r   _start_new_page<  s   zFFinalAnnexureExtractor._split_text_into_pages.<locals>._start_new_pageTr:   ru   )r(   rr   rE   r   rD   rB   r]   )r!   rR   found_any_header
header_re1ry   rh   m1r-   r   rv   r   r[   *  s*   
z-FinalAnnexureExtractor._split_text_into_pagesr`   c                 C   sB   g }|D ]}d |dg }||  qd dd |D S )zr
        Merge a list of page dicts (as returned by _split_text_into_pages) into a single content string.
        
rR   z

c                 S   s   g | ]}|r|qS r   r   )r/   pr   r   r   rO   v  s    z?FinalAnnexureExtractor._merge_pages_content.<locals>.<listcomp>)rs   getr]   r@   )r!   r`   partsr~   	page_textr   r   r   r\   m  s
   z+FinalAnnexureExtractor._merge_pages_contentc              	   C   s@  d }g }g }t |dD ]u\}}| |}| ||\}}}	}
|ry|r2|d |_d||_|| |
 dkr;d}n|
 }|	rL| d| d|	 }n| d| }t	||||dd}g }|
 dkrfdn|
 }| 
d	| d| d
|  q|r|| q|rt||_d||_|| | 
dt| d |S )Nr:   r}   rU   rV   r<   r'   rW   rX   u
   ✓ Found z	 at line rY   z
 annexures)	enumeraterN   rL   r   rs   r   r]   r*   r   r   r%   r4   )r!   rR   currentannexure_contentr   idxr9   rj   r	   r   rH   rl   rm   rn   r   r   r   _extract_annexures_from_linesy  sJ   





z4FinalAnnexureExtractor._extract_annexures_from_linesc              
   C   s   |s|S i ddddddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+}|  D ]
\}}|||}qH|S ),zDFix common mis-encoded UTF-8 sequences found in PDFs, DOCX, and TXT.u   â€“u   –u   â€”u   —u   â€˜u   ‘u   â€™u   ’u   â€œu   “u   â€u   ”u   â€¦u   …u   â€¢u   •u   â€¡u   ‡u   â€ u   †u   Â©   ©u   Â®   ®u   Â°   °u   Â±   ±u   Â£   £u   Â¢   ¢u   Â¥   ¥   ×   ÷   á   é   ó   ú   ñr<   )u   Ã—u   Ã·u   Ã¡u   Ã©u   Ã³u   Ãºu   Ã±u   Â )itemsreplace)r!   r,   replacementswrongcorrectr   r   r   rN     sb   	
z.FinalAnnexureExtractor._fix_encoding_artifactsc                 C   sB   ||j ddg d}| }|r|D ]}|d | | q|S )NT)layout)r-   r,   tablesr   )extract_textextract_tablesr]   _format_table_text)r!   rf   r-   r   r   tabler   r   r   _extract_page_content  s   
z,FinalAnnexureExtractor._extract_page_contentpages_contentc                 C   s~   g }|D ]5}|d r)t |d dD ]\}}|d| d || |d q|d r4||d  |d qd|S )Nr   r:   z
[TABLE z]
r'   r,   r}   )r   r]   rs   )r!   r   output	page_datair   r   r   r   _merge_page_contents  s   

z+FinalAnnexureExtractor._merge_page_contentsr   c                    s  |r|d sdS dd |D }g }t t|d D ] t fdd|D }|t|dd  qg }d	d	d
d |D  d	 }t|D ]=\}}|dkrR|| g }	t|D ]\ }
 t|k rl|	|
|   qX|dd|	 d  |dkr|| qE|| d|S )Nr   z[Empty table]c                 S   s   g | ]	}d d |D qS )c                 S   s   g | ]
}t |pd  qS )r'   )r   r@   )r/   cellr   r   r   rO     s    zHFinalAnnexureExtractor._format_table_text.<locals>.<listcomp>.<listcomp>r   r/   rowr   r   r   rO     s    z=FinalAnnexureExtractor._format_table_text.<locals>.<listcomp>c                 3   s,    | ]} t |k rt |  nd V  qdS )r   N)r4   r   col_idxr   r   r2     s   * z<FinalAnnexureExtractor._format_table_text.<locals>.<genexpr>r.   r;   +c                 s   s    | ]}d | V  qdS )-Nr   )r/   wr   r   r   r2      s    rq   r}   )ranger4   maxr]   rs   r   ljust)r!   r   cleaned
col_widthswidthrR   	separatorrow_idxr   cellsr   r   r   r   r     s.   



z)FinalAnnexureExtractor._format_table_textc                 C   s8   g }t |ddD ]\}}|||j|j|jd q|S )zf
        Returns a JSON-friendly list of dicts:
        S. No. | Name | Page Start | Page End
        r:   )start)document_nameannexure_namer
   r   )r   r]   r   r
   r   )r!   doc_namer   summaryr   annr   r   r   to_summary_table  s   
z'FinalAnnexureExtractor.to_summary_tableN)T)r   r   r   r   r   r"   r   r%   r+   r   r8   r   r   rL   r   r   rp   r^   r   r[   r\   r   rN   r   r   r   r   r   r   r   r   r      s"    .LC9(r   c              
   C   s  g }z_t  }| D ]'\}}tj|}||}tdt| d| d |||}|	| q
tj
| d}	tj|ddd}
t|	dd	d
}||
 W d    |W S 1 sZw   Y  |W S  tjyt   td| d Y d S  ty } ztd|  |W  Y d }~S d }~ww )Nz
Extracted z annexures from 'z'.zannexures_info.txt   F)indentensure_asciir   zutf-8)encodingz#Error: Could not decode JSON from 'z)'. Check if the file contains valid JSON.zAn unexpected error occurred: )r   r   ospathbasenamerp   r$   r4   r   extendrs   jsondumpsopenwriteJSONDecodeError	Exception)
output_dirdatacombined_result	extractordoc_pathdoc_textr   r   resultannexures_info_pathformatted_resultfer   r   r   extract_annexures_info   s2   

r   )r   r   sysr(   typingr   r   r   r   dataclassesr   r   r   r   r   r   r   r   r   <module>   s       