o
    iK                     @   s   d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlZdddZdd ZdddZdddZdddZdd Zdd ZdS )    N)urlparseparse_qsunquote)HTTPAdapter)Retryc                 C   sL  d}d}|r9|j dd}|r9td|tj}|r9t|d}d|v r9tj	|d 
 }td|  ||fS t| }|jr~t|j}dD ]6}||v r}|| d	 }	tjt|	}
|
r}d|
v r}|
}tj	|d 
 }td
| d|  ||f  S qGtjt|j}|rd|v r|}tj	|d 
 }td|  ||fS dS )a  
    Extract filename from URL, query parameters, or response headers.
    Tries multiple strategies to get the correct filename with extension.

    Args:
        url (str): The URL to extract filename from
        response (requests.Response, optional): HTTP response object to check headers

    Returns:
        tuple: (filename, extension) or (None, None) if unable to determine
    Nzcontent-disposition z#filename[*]?=["\']?([^"\';]+)["\']?   .z-Extracted filename from Content-Disposition: )fileDownloadPathfilenamefiledocumentpathr   z)Extracted filename from query parameter 'z': z"Extracted filename from URL path: )NN)headersgetresearch
IGNORECASEr   grouposr   splitextlowerprintr   queryr   basename)urlresponser   	extensioncontent_dispositionmatch
parsed_urlquery_params
param_nameparam_valuepotential_filenamepath_filename r&   ?/var/www/html/minaions-tender/ai-engine/file_download_robust.py_extract_filename_from_url   s@   
r(   c              
   C   s8   |   dd  } ddddddd	d
dd	}|| S )z
    Map Content-Type header to file extension.

    Args:
        content_type (str): Content-Type header value

    Returns:
        str: File extension with dot (e.g., '.docx') or None
    ;r   .pdf.docx.doc.xlsx.xls.pptx.ppt.txt.csv)	zapplication/pdfzGapplication/vnd.openxmlformats-officedocument.wordprocessingml.documentzapplication/mswordzAapplication/vnd.openxmlformats-officedocument.spreadsheetml.sheetzapplication/vnd.ms-excelzIapplication/vnd.openxmlformats-officedocument.presentationml.presentationzapplication/vnd.ms-powerpointz
text/plainztext/csv)r   splitstripr   )content_typecontent_type_mapr&   r&   r'    _get_extension_from_content_typeF   s   

r7   	downloadsT      c                 C   s  |  dstd|   dS |   dks|    dr(td|   dS |  dr1d|  } tj|d	d
 t|}t|d D ]}z|dkri|d|d   }td| d|d  d|d  d t	| td|  |dkrzd|d  dnd  z|j
| d	dd}|jdd }	|jdk}
W n ty } ztd|  d}
d}	W Y d}~nd}~ww |j| d	dd}|  |
s|jdd }	d|	v pd|	v p|j   d}|r|rt| |W   S t| |W   S  tjjy+ } z,td|d  d |  ||kr!td!|  d"|d  d# W Y d}~ dS W Y d}~qBd}~w tjjye } z,td$|d  d |  ||kr[td!|  d"|d  d% W Y d}~ dS W Y d}~qBd}~w tjjy } z,td&|d  d |  ||krtd!|  d"|d  d' W Y d}~ dS W Y d}~qBd}~w tjjy } zCtd(|d  d |  |jd)v rtd*|j d+ W Y d}~ dS ||krtd!|  d"|d  d, W Y d}~ dS W Y d}~qBd}~w tjjy* } z,td-|d  d |  ||kr td!|  d"|d  d. W Y d}~ dS W Y d}~qBd}~w tyb } z,td/|d  d |  ||krXtd0|  d"|d  d1 W Y d}~ dS W Y d}~qBd}~ww dS )2a  
    Download a file from URL or convert webpage to PDF with retry mechanism.
    Windows-compatible version with wkhtmltopdf path configuration.
    
    Args:
        url (str): URL to download from
        save_dir (str): Directory to save files
        convert_webpage_to_pdf (bool): Whether to convert webpages to PDF
        max_retries (int): Maximum number of retry attempts
        retry_delay (int): Base delay between retries in seconds (uses exponential backoff)
    http://https://www.zSkipping non-web URL: Nz+https://sso.gem.gov.in/arxsso/oauth/dologinz5https://bidplus.gem.gov.in/bidding/downloadomppdfile/zSkipping login URL: r>   r=   Texist_okr   r   r:   zRetrying in z seconds... (Attempt /)zAccessing: z
 (Attempt r   )      )allow_redirectstimeoutcontent-type   zHEAD request failed: F)streamrF   	text/htmlzapplication/xhtml)z	<!doctypez<htmlzConnection timeout on attempt : zFailed to access  after z# attempts due to connection timeoutzRead timeout on attempt z attempts due to read timeoutzConnection error on attempt z! attempts due to connection errorzHTTP error on attempt )i  i  i  zNot retrying for HTTP z errorz attempts due to HTTP errorzRequest error on attempt z attempts due to request errorUnexpected error on attempt zFailed to process z! attempts due to unexpected error)
startswithr   r4   r   r   makedirs_create_retry_sessionrangetimesleepheadr   r   status_code	Exceptionraise_for_statustext_convert_webpage_to_pdfdownload_filerequests
exceptionsConnectTimeoutReadTimeoutConnectionError	HTTPErrorRequestException)r   save_dirconvert_webpage_to_pdfmax_retriesretry_delaysessionattempt	wait_timehead_responser5   head_successer   
is_webpager&   r&   r'   download_file_maina   s   
"
$
*





rm   rD   c                 C   s  |  dsd S g d}dddddddd	d
ddd}t }|j| t| \}}tj|dd |r:tj	||}	nd }	t
|D ]y}
ztd|
d  d|   |
dkritdd}td|dd t| |j| d|ddd}|  |st| |\}}|r|s|jdd}t|}|s|rdtt  | }ndtt  d}n|s|r| | }n|s| d}tj	||}	|jd}|rt|dk rtd| d  |jdd }d!|v r|d"std# t|	d$)}d}|jd%d&D ]}|r|| |t|7 }qtd'| d( W d    n	1 s!w   Y  tj|	dkrEtd) t|	 |
|d k rAW q@W  d S td*|	  |	W   S  tjjye   td+|
d   Y n? tjjy } ztd,|
d  d-|  W Y d }~n#d }~w ty } ztd.|
d  d-|  W Y d }~nd }~ww |
|d kr|	rtj |	rt|	 q@td/|  d0| d1 d S )2Nr;   )	r*   r-   r.   r,   r+   r/   r0   r1   r2   sMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36zJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zgzip, deflate1z
keep-aliver   navigatenonez	max-age=0)
User-AgentAcceptzAccept-LanguagezAccept-EncodingDNT
ConnectionzUpgrade-Insecure-RequestszSec-Fetch-DestzSec-Fetch-ModezSec-Fetch-SitezCache-ControlTr?   zDownloading (attempt r   z): r   r9   zWaiting z.1fz seconds before retry...)rI   rF   rE   verifyrG   r   downloaded_file_r*   zcontent-lengthd   z Warning: File seems very small (z bytes)rJ   )z.htmlz.htmzWWarning: Received HTML content when expecting document - might be blocked or login pagewbi    )
chunk_sizezDownloaded z byteszError: Downloaded file is emptyzSuccessfully downloaded to: zTimeout error on attempt zRequest failed on attempt rK   rM   zFailed to download rL   z	 attempts)!rN   r[   Sessionr   updater(   r   rO   r   joinrQ   r   randomuniformrR   rS   r   rW   r7   intr   endswithopeniter_contentwritelengetsizeremover\   Timeoutra   rV   exists)r   rb   rF   retriesSUPPORTED_EXTENSIONSr   rf   r   r   	file_pathrg   delayr   r5   detected_extensioncontent_lengthr   downloaded_sizechunkrk   r&   r&   r'   rZ      s   






$$"
rZ   c                 C   sV   t  }t| dg dg dd}t|d}|d| |d| |jdd	i |S )
zA
    Create a requests session with built-in retry strategy.
    r   )i  i  i  i  i  )HEADGETOPTIONS)totalbackoff_factorstatus_forcelistallowed_methods)rd   r<   r=   rr   rn   )r[   r{   r   r   mountr   r|   )rd   rf   retry_strategyadapterr&   r&   r'   rP   S  s   
rP   c               
   C   s   g d} zddl }|jddgdddd}|jdkr td	 i W S W n |jt|jfy/   Y nw | D ]}tj	|rGtd
|  d|i  S q2td td td td dS )zs
    Get the appropriate wkhtmltopdf configuration for Windows.
    Returns the configuration dict for pdfkit.
    )z0C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exez6C:\Program Files (x86)\wkhtmltopdf\bin\wkhtmltopdf.exez"C:\wkhtmltopdf\bin\wkhtmltopdf.exez/usr/bin/wkhtmltopdfr   Nwkhtmltopdfz	--versionT   )capture_outputrX   rF   z Found wkhtmltopdf in system PATHzFound wkhtmltopdf at: z*wkhtmltopdf not found in common locations.zPlease either:z*1. Add wkhtmltopdf to your system PATH, orzO2. Update the 'possible_paths' list in the function with your installation path)

subprocessrun
returncoder   TimeoutExpiredFileNotFoundErrorSubprocessErrorr   r   r   )possible_pathsr   resultr   r&   r&   r'   _get_wkhtmltopdf_configm  s.   	

r   c           	      C   s  zat  }|du rW dS t| }tdd|j|j }tdd|d}d| d}tj||}t	d|  dd	d	d	d	d
dddddddd}t
j| ||t
j|ddd t	d|  |W S  ty } zt	d|  t	d t	d t	d t	d W Y d}~dS d}~ww )zHConvert webpage to PDF using pdfkit with Windows-specific configuration.Nz[^\w\-_]_z_+webpage_r*   zConverting webpage to PDF: A4z0.75inzUTF-8i  ignorer   )z	page-sizez
margin-topzmargin-rightzmargin-bottomzmargin-leftencodingz
no-outlinezenable-local-file-accesszjavascript-delayzload-error-handlingzload-media-error-handlingzdisable-smart-shrinkingzprint-media-typer   )r   )optionsconfigurationzWebpage converted to PDF: z"Failed to convert webpage to PDF: zTroubleshooting tips:z+1. Ensure wkhtmltopdf is properly installedz*2. Check if the executable path is correctz83. Try running 'wkhtmltopdf --version' in command prompt)r   r   r   subnetlocr   r4   r   r}   r   pdfkitfrom_urlr   r   rV   )	r   rb   configr    
clean_namer   r   r   rk   r&   r&   r'   rY     sH    rY   )N)r8   Tr9   r:   )r8   rD   r9   )r9   )r[   r   r   urllib.parser   r   r   r   rR   requests.adaptersr   urllib3.util.retryr   r~   r(   r7   rm   rZ   rP   r   rY   r&   r&   r&   r'   <module>   s"    
:

p 
&