#!/usr/bin/env python3
"""
Final Annexure Extractor with Table of Contents Detection
- Skips table of contents pages
- Finds actual format pages only
- No duplicates, includes A and B
"""

import os
import re
import json
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass


@dataclass
class Annexure:
    """Represents an extracted annexure"""
    title: str
    number: str
    page_start: int
    page_end: int
    content: str
    has_table: bool = False
    has_form: bool = False
    field_count: int = 0


class FinalAnnexureExtractor:
    """
    Final version with Table of Contents detection
    """

    def __init__(self, verbose: bool = True):
        self.verbose = verbose
        self.annexures: List[Annexure] = []
        self.seen_numbers = set()
        self.toc_pages = set()  # Track table of contents pages
        self.in_list_section = False  # Track if we're in "LIST OF FORMS" section

    def log(self, msg: str):
        if self.verbose:
            print(f"[ANNEXURE] {msg}")

    def _normalize_number(self, number: str) -> str:
        """Normalize annexure number"""
        return re.sub(r'\s+', '', number.upper())
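
    # Example: _normalize_number("ii a") -> "IIA"; _normalize_number(" A-1 ") -> "A-1"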

    def _is_table_of_contents_page(self, text: str, page_num: int) -> bool:
        """
        Detect if this page is a table of contents.

        KEY HEURISTIC: If we find MULTIPLE different annexure numbers (3+)
        on the SAME page, it's likely a table of contents, not actual formats.
        """
        # Find all annexure mentions
        pattern = r'(?i)ANNEXURE[\s\-–—]+([IVXABCD0-9]+)'
        matches = re.findall(pattern, text)

        if len(matches) >= 3:
            # Multiple annexures on one page = likely TOC
            unique_numbers = set(self._normalize_number(m) for m in matches)
            if len(unique_numbers) >= 3:
                self.log(f"  📋 Page {page_num} detected as Table of Contents ({len(unique_numbers)} annexures listed)")
                return True

        return False
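
    # Illustration (hypothetical page text): a page reading
    #   "ANNEXURE-I .... 5   ANNEXURE-II .... 9   ANNEXURE-III .... 12"
    # lists three distinct numbers, so _is_table_of_contents_page returns True,
    # whereas a page with a single "ANNEXURE-I" heading returns False.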

    def _is_annexure_heading(self, line: str, page_num: int) -> Tuple[bool, Optional[str], Optional[str], Optional[str]]:
        """
        Detect if a line is an annexure heading.
        Returns: (is_match, number, title, keyword)
        """
        line_clean = line.strip()

        if not line_clean or len(line_clean) < 3:
            return False, None, None, None
        # print("Clean Line is:  ", line_clean)
        # Capture the keyword (ANNEXURE/FORM/APPENDIX/F-) in group 1, number in group 2, title in group 3
        patterns = [
            r'(?i)^(ANNEXURE)[\s\-–—]+([IVXABCD0-9\-]+)[\s\-–—:]*(.{0,100})$',
            r'(?i)^(ANNEX)[\s\-–—]+([IVXABCD0-9\-]+)[\s\-–—:]*(.{0,100})$',
            r'(?i)^(FORM)[\s\-–—]+([IVXABCD0-9\-]+)[\s\-–—:]*(.{0,100})$',
            r'(?i)^(APPENDIX)[\s\-–—]+([IVXABCD0-9\-]+)[\s\-–—:]*(.{0,100})$',
            r'(?i)^(FORMAT)[\s\-–—]+([IVXABCD0-9\-]+)[\s\-–—:]*(.{0,100})$',
            r'(?i)^FORMAT[\s\-–—:]*([A-Z0-9]+|[IVXLCDM]+)(?:[\s\-–—:]+([A-Z0-9][A-Z0-9\-\s]*[A-Z0-9]))?$',
            r'(?<![A-Za-z])(FORMAT|FORM|PROFORMA|PRO[\s\-]?FORMA)[\s\-:–—]*([A-Z][\s\-–—]+\d+[A-Z]?|[A-Z]{1,2}[\-–—]\d+[A-Z]?|\d{1,3}[\-–—][A-Z]{1,2}|[A-Z]{1,2}\d+[A-Z]?|\d{1,3}[A-Z]{1,2}|X{0,3}(?:IX|IV|V?I{1,3}|VI{0,3})|\d{1,3}|[A-Z]{1,2})(?=[\s\-:,;.\)\]\}\n\r]|$)',
            # F-1, F-2A, F-10, F-13, etc. (shorthand for Format)
            r'^(F)[\s]*[\-–—][\s]*(\d{1,2}[A-Z]?)[\s\-–—:]*(.{0,100})$',
        ]

        for pattern in patterns:
            match = re.match(pattern, line_clean)
            if match:
                keyword = match.group(1).strip()  # ANNEXURE/FORM/APPENDIX
                number = match.group(2).strip()
                title = match.group(3).strip() if len(match.groups()) > 2 else ""

                norm_number = self._normalize_number(number)

                # Skip if already seen
                if norm_number in self.seen_numbers:
                    self.log(f"    ⚠️  Skipping duplicate {keyword} {number} on page {page_num}")
                    return False, None, None, None

                # Line should be short (not middle of paragraph)
                if len(line_clean) > 150:
                    return False, None, None, None

                # Check if this is a reference/instruction rather than an actual heading
                # References often contain these keywords
                reference_keywords = [
                    r'\bin compliance\b',
                    r'\border no\.?\b',
                    r'\bdt\.?\s+\d{2}',  # date like "dt 23/7/2020" or "dt. 23/7/2020"
                    r'\bas per\b',
                    r'\brefer to\b',
                    r'\bsubmit\b.*\bat\b',  # "submit ... at Annexure"
                    r'\bundertaking at\b',
                    r'\bgiven in\b',
                    r'\bmentioned in\b',
                    r'\bspecified in\b',
                    r'\bamended from time to time\b',
                    r'\blist of forms\b',  # Skip entries in "LIST OF FORMS" section
                    r'\bform no\.',  # Skip "Form No." column headers
                ]

                # Check if title contains reference indicators
                if title:
                    for ref_pattern in reference_keywords:
                        if re.search(ref_pattern, title, re.IGNORECASE):
                            self.log(f"    ⚠️  Skipping reference to {keyword} {number} (contains '{ref_pattern}')")
                            return False, None, None, None

                # Check if title is suspiciously long (likely part of a sentence)
                if title and len(title) > 60:
                    self.log(f"    ⚠️  Skipping potential reference to {keyword} {number} (title too long: {len(title)} chars)")
                    return False, None, None, None

                return True, number, title, keyword

        return False, None, None, None
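
    # Examples (assuming a fresh extractor, so nothing is in seen_numbers):
    #   "ANNEXURE - IV : PRICE BID"          -> (True, "IV", "PRICE BID", "ANNEXURE")
    #   "F-2A Undertaking"                   -> (True, "2A", "Undertaking", "F")
    #   "ANNEXURE - II as per order no. 12"  -> rejected: the title trips a
    #                                           reference keyword ("as per")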

    # --- Page-aware text extractor with signature-driven end detection ---
    def extract_all_annexures(self, file_txt: str) -> List[Annexure]:
        """
        Extract annexures from a plain text file, using page markers if present.
        Returns Annexure objects with page_start/page_end ranges and merged content.
        """
        self.log("Processing TXT file (page-aware)...")

        # clean encoding artifacts line by line (input is already plain text)
        raw_lines = [self._fix_encoding_artifacts(line) for line in file_txt.splitlines()]

        pages = self._split_text_into_pages(raw_lines)
        total_pages = pages[-1]['page_num'] if pages else 0
        self.log(f"Detected {len(pages)} pages (last page number: {total_pages})")

        annexures = []
        current_ann = None
        current_pages_acc = []  # pages belonging to the annexure currently being built
        list_section_start_page = None  # page that started the LIST OF FORMS section

        for page in pages:
            pnum = page['page_num']
            lines = page['lines']

            # Skip table-of-contents pages outright (only while no annexure is
            # open, so a format page that merely cites other annexures is kept)
            if current_ann is None and self._is_table_of_contents_page("\n".join(lines), pnum):
                self.toc_pages.add(pnum)
                continue

            # Check if this page contains "LIST OF FORMS" - if so, skip form detection until next page
            for ln in lines[:10]:
                if re.search(r'(?i)list\s+of\s+forms?\s*(?:&|and)?\s*formats?', ln):
                    self.in_list_section = True
                    list_section_start_page = pnum
                    self.log(f"  📋 Entering LIST OF FORMS section on page {pnum}")
                    break

            # Exit list section when we move to a new page
            if self.in_list_section and list_section_start_page is not None and pnum > list_section_start_page:
                self.in_list_section = False
                self.log(f"  ✓ Exited LIST OF FORMS section at page {pnum}")

            # 1) detect an annexure heading in the first ~30 lines of the page;
            # headings inside a LIST OF FORMS section are skipped below
            for ln in lines[:30]:
                is_annexure, number, title, keyword = self._is_annexure_heading(ln, pnum)
                if is_annexure:
                    if self.in_list_section:
                        # Skip - we're in a list section
                        self.log(f"    ⚠️  Skipping {keyword} {number} (in LIST OF FORMS section)")
                        continue
                    # If an annexure is still open here, no signature was found
                    # for it (a signature close resets current_ann), so close
                    # it at the previous page
                    if current_ann:
                        prev_page_num = max(pnum - 1, current_ann.page_start)
                        current_ann.page_end = prev_page_num
                        current_ann.content = self._merge_pages_content(current_pages_acc)
                        annexures.append(current_ann)
                        self.log(f"  ⚠️ Closed {current_ann.title} at page {current_ann.page_end} (no signature found)")
                        # reset
                        current_ann = None
                        current_pages_acc = []

                    # Start new annexure - use actual keyword
                    # Special handling for F-[NUMBER] (shorthand for Format)
                    if keyword.upper() == 'F':
                        keyword_display = 'Format'
                    else:
                        keyword_display = keyword.title()

                    # If title exists, prepend keyword and number to it
                    if title:
                        full_title = f"{keyword_display} {number} {title}"
                    else:
                        full_title = f"{keyword_display} {number}"

                    current_ann = Annexure(
                        title=full_title,
                        number=number,
                        page_start=pnum,
                        page_end=pnum,
                        content=""
                    )
                    # Record the number so later references to this annexure
                    # are skipped as duplicates by _is_annexure_heading
                    self.seen_numbers.add(self._normalize_number(number))

                    # For logging, use FORMAT instead of F
                    log_keyword = "FORMAT" if keyword.upper() == 'F' else keyword.upper()
                    self.log(f"  ✓ Found {log_keyword} {number} on page {pnum}")
                    current_pages_acc = [page]
                    break

            # 2) if there's an active annexure and this page wasn't already appended, append it
            if current_ann and (not current_pages_acc or current_pages_acc[-1]['page_num'] != pnum):
                current_pages_acc.append(page)

            # 3) check bottom lines of this page for signature markers (search last 15 lines)
            if current_ann:
                for ln in lines[-15:]:
                    if self._is_signature_line(ln):
                        # mark end at this page
                        current_ann.page_end = pnum
                        current_ann.content = self._merge_pages_content(current_pages_acc)
                        annexures.append(current_ann)
                        self.log(f"  ✅ {current_ann.title} ended on page {pnum} (signature found)")
                        # reset
                        current_ann = None
                        current_pages_acc = []
                        break

        # After loop, if an annexure still open (no signature and no subsequent annexure header)
        if current_ann:
            # close at last page
            last_page_num = pages[-1]['page_num']
            current_ann.page_end = last_page_num
            current_ann.content = self._merge_pages_content(current_pages_acc)
            annexures.append(current_ann)
            self.log(f"  ℹ️ {current_ann.title} closed at last page {last_page_num} (no signature found)")

        self.log(f"Found {len(annexures)} annexures in text file")
        return annexures
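
    # extract_all_annexures expects plain text whose pages are delimited by
    # "--- Doc Page Number: N ---" markers (see _split_text_into_pages); text
    # without markers is treated as a single page 1. A runnable end-to-end
    # example is sketched in the __main__ block at the bottom of this file.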

    # --- Helper: signature detection ---
    def _is_signature_line(self, text: str) -> bool:
        """
        Return True if the line looks like a signature block or signatory phrase.
        Checks last lines of a page for phrases such as 'signature', 'signed', 'authorized signatory', etc.
        """
        if not text:
            return False
        text = text.strip()
        sig_patterns = [
            r'\bsignature\b',
            r'\bsigned\b',
            r'\bsign(ed)?\s+and\s+stamp\b',
            r'\bsign\s+and\s+stamp\b',
            r'\bauthorized\s+signatory\b',
            r'\bauthorised\s+signatory\b',
            r'\bfor\s+and\s+on\s+behalf\b',
            r'\bfor\s+and\s+on\s+behalf\s+of\b',
            r'\b(approved|auth)\.?[ -]?sign(?:atory)?\b',
            r'\bsig\.\b',
            r'\b(Sign|Signed|Signature):',
        ]
        pattern = re.compile('|'.join(sig_patterns), re.IGNORECASE)
        return bool(pattern.search(text))
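
    # Examples: "Signature of Authorized Signatory", "For and on behalf of
    # XYZ Ltd." and "Signed: ____" all match; "Design of the building" does
    # not, since there is no word boundary before the "sign" inside "Design".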

    # --- Helper: split lines into pages by detecting page headers in the text file ---
    def _split_text_into_pages(self, lines: List[str]) -> List[Dict]:
        """
        Return list of pages: [{'page_num': int, 'lines': [...]}, ...]
        Detects patterns like:
          --- Doc Page Number: 1 ---
          Page 1 of 96
        Falls back to single page if no page markers are found.
        """
        pages = []
        current_page_lines = []
        current_page_num = None

        header_re = re.compile(r'---\s*Doc\s+Page\s+Number\s*:\s*(\d+)\s*---', re.IGNORECASE)

        def _start_new_page(pnum: int):
            nonlocal current_page_lines, pages, current_page_num
            if current_page_num is not None:
                pages.append({'page_num': current_page_num, 'lines': current_page_lines})
            current_page_num = pnum
            current_page_lines = []

        for ln in lines:
            m = header_re.search(ln)
            if m:
                _start_new_page(int(m.group(1)))
                continue  # the header line itself is not page content
            # Normal content line: append to the current page if one is open;
            # otherwise start page 1 implicitly (input without a leading header)
            if current_page_num is None:
                _start_new_page(1)
            current_page_lines.append(ln)

        # flush the last page
        if current_page_num is not None:
            pages.append({'page_num': current_page_num, 'lines': current_page_lines})
        else:
            # no header lines and no content at all -> single empty page
            pages.append({'page_num': 1, 'lines': []})

        return pages
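
    # Example: ["--- Doc Page Number: 1 ---", "foo", "--- Doc Page Number: 2 ---", "bar"]
    # -> [{'page_num': 1, 'lines': ['foo']}, {'page_num': 2, 'lines': ['bar']}]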


    # --- Helper: merge page contents into text ---
    def _merge_pages_content(self, pages: List[Dict]) -> str:
        """
        Merge a list of page dicts (as returned by _split_text_into_pages) into a single content string.
        """
        parts = []
        for p in pages:
            # join lines within a page, preserve page boundary with a page-break token or blank line
            page_text = "\n".join(p.get('lines', []))
            parts.append(page_text.strip())
        return "\n\n".join([p for p in parts if p])  # remove empty pages


    def _extract_annexures_from_lines(self, lines: List[str]) -> List[Annexure]:
        """
        Line-based fallback extractor for text without page markers. Note that
        page_start/page_end hold *line* numbers here, not page numbers.
        """
        current = None
        annexure_content = []
        annexures = []
        for idx, line in enumerate(lines, 1):
            line = self._fix_encoding_artifacts(line)

            is_annexure, number, title, keyword = self._is_annexure_heading(line, idx)

            if is_annexure:
                if current:
                    current.page_end = idx - 1
                    current.content = "\n".join(annexure_content)
                    annexures.append(current)

                # Use actual keyword in title
                # Special handling for F-[NUMBER] (shorthand for Format)
                if keyword.upper() == 'F':
                    keyword_display = 'Format'
                else:
                    keyword_display = keyword.title()

                # If title exists, prepend keyword and number to it
                if title:
                    full_title = f"{keyword_display} {number} {title}"
                else:
                    full_title = f"{keyword_display} {number}"

                current = Annexure(
                    title=full_title,
                    number=number,
                    page_start=idx,
                    page_end=idx,
                    content=""
                )
                # Record the number so later references are not re-extracted
                self.seen_numbers.add(self._normalize_number(number))
                annexure_content = []

                # For logging, use FORMAT instead of F
                log_keyword = "FORMAT" if keyword.upper() == 'F' else keyword.upper()
                self.log(f"✓ Found {log_keyword} {number} at line {idx}")
                continue

            if current:
                annexure_content.append(line)

        if current:
            current.page_end = len(lines)
            current.content = "\n".join(annexure_content)
            annexures.append(current)

        self.log(f"Found {len(annexures)} annexures")
        return annexures
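
    # Note: this fallback stores *line* numbers in page_start/page_end, e.g. a
    # heading on line 12 followed by the next heading on line 40 yields
    # page_start=12, page_end=39 for the first annexure.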

    def _fix_encoding_artifacts(self, text: str) -> str:
        """Fix common mis-encoded UTF-8 sequences found in PDFs, DOCX, and TXT."""

        if not text:
            return text

        # NOTE: replacement order matters: "â€" (right double quote) is a
        # prefix of several longer sequences, so it must be replaced after them
        replacements = {
            "â€“": "–",   # en dash
            "â€”": "—",   # em dash
            "â€˜": "‘",   # left single quote
            "â€™": "’",   # right single quote
            "â€œ": "“",   # left double quote
            "â€¦": "…",   # ellipsis
            "â€¢": "•",   # bullet
            "â€¡": "‡",   # double dagger
            "â€ ": "†",   # dagger
            "â€": "”",    # right double quote (last of the "â€" family)
            "Â©": "©",
            "Â®": "®",
            "Â°": "°",
            "Â±": "±",
            "Â£": "£",
            "Â¢": "¢",
            "Â¥": "¥",
            "Ã—": "×",
            "Ã·": "÷",
            "Ã¡": "á",
            "Ã©": "é",
            "Ã³": "ó",
            "Ãº": "ú",
            "Ã±": "ñ",
            "Â ": " ",    # non-breaking space → normal space
        }

        for wrong, correct in replacements.items():
            text = text.replace(wrong, correct)

        return text
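
    # e.g. _fix_encoding_artifacts("cost â€“ Â£5 â€¦") -> "cost – £5 …"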


    def _extract_page_content(self, page, page_num: int) -> Dict:
        """Extract text and tables from a PDF page (expects a pdfplumber-style Page)."""
        content = {
            'page_num': page_num,
            'text': page.extract_text(layout=True),
            'tables': []
        }

        tables = page.extract_tables()
        if tables:
            for table in tables:
                content['tables'].append(self._format_table_text(table))

        return content

    def _merge_page_contents(self, pages_content: List[Dict]) -> str:
        """Merge per-page dicts from _extract_page_content into one string."""
        output = []
        for page_data in pages_content:
            if page_data['tables']:
                for i, table in enumerate(page_data['tables'], 1):
                    output.append(f"\n[TABLE {i}]\n")
                    output.append(table)
                    output.append("")
            if page_data['text']:
                output.append(page_data['text'])
            output.append("")
        return "\n".join(output)

    def _format_table_text(self, table: List[List[str]]) -> str:
        """Render an extracted table (list of rows) as an ASCII grid."""
        if not table or not table[0]:
            return "[Empty table]"

        cleaned = [[str(cell or "").strip() for cell in row] for row in table]
        col_widths = []
        for col_idx in range(len(cleaned[0])):
            width = max(len(row[col_idx]) if col_idx < len(row) else 0 for row in cleaned)
            col_widths.append(max(width, 3) + 2)

        lines = []
        separator = "+" + "+".join("-" * w for w in col_widths) + "+"

        for row_idx, row in enumerate(cleaned):
            if row_idx == 0:
                lines.append(separator)
            cells = []
            for col_idx, cell in enumerate(row):
                if col_idx < len(col_widths):
                    cells.append(cell.ljust(col_widths[col_idx]))
            lines.append("|" + "|".join(cells) + "|")
            if row_idx == 0:
                lines.append(separator)
        lines.append(separator)
        return "\n".join(lines)

    def to_summary_table(self, doc_name, annexures):
        """
        Returns a JSON-friendly list of dicts:
        S. No. | Document Name | Annexure Name | Page Start | Page End
        """
        summary = []

        for idx, ann in enumerate(annexures, start=1):
            summary.append({
                "s_no": idx,
                "document_name": doc_name,
                "annexure_name": ann.title,   # preserve the exact detected name
                "page_start": ann.page_start,
                "page_end": ann.page_end
            })

        return summary
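
    # Example output row (all values depend on the actual document):
    #   {"s_no": 1, "document_name": "tender.txt",
    #    "annexure_name": "Annexure I BID SUBMISSION FORM",
    #    "page_start": 5, "page_end": 7}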

def extract_annexures_info(output_dir, data):
    """
    Extract annexures from a mapping of {document_path: document_text},
    write a combined JSON summary to <output_dir>/annexures_info.txt,
    and return the combined summary list (or None on failure).
    """
    combined_result = []
    try:
        for doc_path, doc_text in data.items():
            doc_name = os.path.basename(doc_path)

            # Fresh extractor per document so per-document state
            # (seen_numbers, toc_pages, in_list_section) does not leak
            extractor = FinalAnnexureExtractor()
            annexures = extractor.extract_all_annexures(doc_text)
            print(f"Extracted {len(annexures)} annexures from '{doc_name}'.")
            combined_result.extend(extractor.to_summary_table(doc_name, annexures))

        # Save combined result to file with proper formatting
        annexures_info_path = os.path.join(output_dir, "annexures_info.txt")
        formatted_result = json.dumps(combined_result, indent=4, ensure_ascii=False)
        with open(annexures_info_path, 'w', encoding='utf-8') as f:
            f.write(formatted_result)
        return combined_result

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
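

# Minimal usage sketch. The sample text, file name, and output directory below
# are made up for illustration; real input would come from a PDF/DOCX-to-text
# converter that emits the "--- Doc Page Number: N ---" markers.
if __name__ == "__main__":
    sample_text = (
        "--- Doc Page Number: 1 ---\n"
        "ANNEXURE - I : BID SUBMISSION FORM\n"
        "We hereby offer to execute the captioned work.\n"
        "Signature of Authorized Signatory\n"
        "--- Doc Page Number: 2 ---\n"
        "ANNEXURE - II : DECLARATION\n"
        "We declare that the information above is true.\n"
    )
    # Writes ./annexures_info.txt as a side effect
    demo = extract_annexures_info(output_dir=".", data={"sample_tender.txt": sample_text})
    print(json.dumps(demo, indent=2, ensure_ascii=False))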