o
    ȳgc                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ e e!Z"dZ#dZ$dZ%dZ&g dZ'g dZ(dgZ)g dZ*e'e(e)e*dZ+G dd de,eZ-G dd deZ.G dd deZ/G dd deZ0G dd deZ1G d d! d!eZ2dDd%d&Z3dEd(d)Z4dFd+d,Z5dGd.d/Z6dHd0d1Z7	dIdJd7d8Z8dKd:d;Z9dLd=d>Z:dMd@dAZ;G dBdC dCeZ<dS )N    )annotationsN)Enum)
HTTPStatus)AnyDictListOptionalTuple)Document)get_runtime_environment)get_from_dict_or_env)	BaseModel)Responserequest)RequestException)
BaseLoaderz0.1.1zhttp://localhost:8000zhttps://api.daxa.ai  )
JSONLoaderS3FileLoaderUnstructuredMarkdownLoaderUnstructuredPDFLoaderUnstructuredFileLoaderUnstructuredJsonLoaderPyPDFLoaderGCSFileLoaderAmazonTextractPDFLoader	CSVLoaderUnstructuredExcelLoaderUnstructuredEmailLoader)DirectoryLoaderS3DirLoaderSlackDirectoryLoaderPyPDFDirectoryLoaderNotionDirectoryLoaderDataFrameLoader)NotionDBLoaderGoogleDriveLoaderSharePointLoader)filedir	in-memoryzcloud-folderc                   @  s   e Zd ZdZdZdZdS )Routesz2Routes available for the Pebblo API as enumerator.z/v1/loader/docz/v1/app/discoverN)__name__
__module____qualname____doc__
loader_docloader_app_discover r2   r2   `/var/www/html/chatdoc2/venv/lib/python3.10/site-packages/langchain_community/utilities/pebblo.pyr+   C   s    r+   c                   @  s   e Zd ZU dZded< dS )IndexedDocumentzPebblo Indexed Document.strpb_idNr,   r-   r.   r/   __annotations__r2   r2   r2   r3   r4   J   s   
 r4   c                   @  s   e Zd ZU dZdZded< 	 ded< 	 ded< 	 dZded	< 	 ded
< 	 ded< 	 ded< 	 ded< 	 ded< 	 dZded< dS )RuntimezPebblo Runtime.localr5   typehostpath Optional[str]ipplatformos
os_versionlanguagelanguage_versionruntimeN)r,   r-   r.   r/   r;   r8   r@   rF   r2   r2   r2   r3   r9   Q   s,   
 r9   c                   @  s$   e Zd ZU dZded< 	 ded< dS )	FrameworkzPebblo Framework instance.r5   nameversionNr7   r2   r2   r2   r3   rG   j   s   
 rG   c                   @  s`   e Zd ZU dZded< 	 ded< 	 ded< 	 ded< 	 ded	< 	 d
ed< 	 ded< 	 d
ed< dS )AppzPebblo AI application.r5   rH   ownerr?   descriptionload_idr9   rF   rG   	frameworkplugin_versionclient_versionNr7   r2   r2   r2   r3   rJ   s   s$   
 rJ   c                   @  st   e Zd ZU dZded< 	 ded< 	 ded< 	 ded< 	 ded< 	 d	ed
< 	 ded< 	 ded< 	 ded< 	 ded< dS )DoczPebblo document.r5   rH   rK   listdocsrO   rM   dictloader_detailsboolloading_endsource_ownerclassifier_locationanonymize_snippetsNr7   r2   r2   r2   r3   rQ      s,   
 rQ   r=   r5   returnc                 C  sF   | rd| v sd| d ks| dv r| S t | }| r| }t|S )zReturn an absolute local path for a local file/directory,
    for a network related path, return as is.

    Args:
        path (str): Relative path to be resolved.

    Returns:
        str: Resolved absolute path.
    z:///r   )unknown-r*   )pathlibPathexistsresolver5   )r=   	full_pathr2   r2   r3   get_full_path   s   
rd   loaderc                 C  s&   t  D ]\}}| |v r|  S qdS )zReturn loader type among, file, dir or in-memory.

    Args:
        loader (str): Name of the loader, whose type is to be resolved.

    Returns:
        str: One of the loader type among, file/dir/in-memory.
    unsupported)LOADER_TYPE_MAPPINGitems)re   loader_typeloadersr2   r2   r3   get_loader_type   s
   	rk   r   c                 C  s  ddl m}m}m}m} d}t| tstd |S | j	}zd|v rBt| |r2d| j
 d| j }nt| |rAd| j
 d| j }nd	|v r^|d	 }|r]d
|v r]|d
 }|r]| d| }nd|v rg|d }nxd|v rp|d }nod|v r|d }|rt|trt|dkr|d }nUt| |rd}nMt| |rd| j }nA| jjdkr|dr|d}	d|	 }n+|dr|dg }
ddd |
D }n|dr|dg }ddd |D }W n	 ty   Y nw tt|S )zReturn an absolute source path of source of loader based on the
    keys present in Document.

    Args:
        loader (BaseLoader): Langchain document loader, derived from Baseloader.
    r   )r$   r   r%   r   r^   zGloader is not derived from BaseLoader, source location will be unknown!bucketzgc://r\   zs3://sourcechannelr=   	file_path	web_pathsr*   znotiondb://r&   	folder_idz+https://drive.google.com/drive/u/2/folders/file_idsz, c                 S     g | ]}d | dqS )z https://drive.google.com/file/d/z/viewr2   ).0file_idr2   r2   r3   
<listcomp>       
z(get_loader_full_path.<locals>.<listcomp>document_idsc                 S  rs   )z#https://docs.google.com/document/d/z/editr2   )rt   doc_idr2   r2   r3   rv     rw   )$langchain_community.document_loadersr$   r   r%   r   
isinstancer   loggererror__dict__rl   blobkeyrR   lendatabase_id	__class__r,   getjoin	Exceptionrd   r5   )re   r$   r   r%   r   locationloader_dictrn   rp   rq   rr   rx   r2   r2   r3   get_loader_full_path   st   










r   Tuple[Framework, Runtime]c                  C  s   t  } td| ddd}t }t|jtjd | dd|j	|j
t | dd| d	dd
}d|jv r;d|_d|_td|  td|  ||fS )zFetch the current Framework and Runtime details.

    Returns:
        Tuple[Framework, Runtime]: Framework and Runtime for the current app instance.
    	langchainlibrary_versionN)rH   rI   PWDrA   r]   rF   runtime_version)r<   r=   rA   rB   rC   r@   rD   rE   DarwindesktopzMac OSXz
framework zruntime )r   rG   r   rA   unamer9   noderB   environsystemrI   get_ipr;   rF   r|   debug)runtime_envrN   r   rF   r2   r2   r3   get_runtime  s*   



r   c                  C  s@   ddl } |  }z| |}W |S  ty   | d}Y |S w )zJFetch local runtime ip address.

    Returns:
        str: IP address
    r   N	localhost)socketgethostnamegethostbynamer   )r   r<   	public_ipr2   r2   r3   r   .  s   r   rS   List[Document]max_batch_sizeintList[List[Document]]c                 C  s~   g }g }d}| D ]-}t |jd}||kr||g q|| |kr,|| g }d}|| ||7 }q|r=|| |S )a  
    Generate batches of documents based on page_content size.
    Args:
        docs: List of documents to be batched.
        max_batch_size: Maximum size of each batch in bytes. Defaults to 100*1024(100KB)
    Returns:
        List[List[Document]]: List of batches of documents
    r   utf-8)r   page_contentencodeappend)rS   r   batchescurrent_batchcurrent_batch_sizedocdoc_sizer2   r2   r3   generate_size_based_batches>  s    



r   ro   c                 C  s@   zddl }t| j}||j}W |S  ty   d}Y |S w )zFetch owner of local file path.

    Args:
        file_path (str): Local file path.

    Returns:
        str: Name of owner.
    r   Nr]   )pwdrB   statst_uidgetpwuidpw_namer   )ro   r   file_owner_uidfile_owner_namer2   r2   r3   get_file_owner_from_pathf  s   	r   source_pathc                 C  s   | sdS d}t j| rt j| }|S t j| rCd}t | D ]\}}}|D ]}t j||}t j|s?|t j|7 }q(q!|}|S )zFetch size of source path. Source can be a directory or a file.

    Args:
        source_path (str): Local path of data source.

    Returns:
        int: Source size in bytes.
    r   )rB   r=   isfilegetsizeisdirwalkr   islink)r   size
total_sizedirpath_	filenamesffpr2   r2   r3   get_source_sizey  s"   		r   datac                 C  s   |  d}t|}|S )zCalculate the content size in bytes:
    - Encode the string to bytes using a specific encoding (e.g., UTF-8)
    - Get the length of the encoded bytes.

    Args:
        data (str): Data string.

    Returns:
        int: Size of string in bytes.
    r   )r   r   )r   encoded_contentr   r2   r2   r3   calculate_content_size  s   
r   c                      s   e Zd ZU dZded< 	 dZded< 	 ded< 	 ded< 	 d	Zd
ed< 	 d; fddZd<ddZ		d=d>ddZ	d?ddZ
d=d@d!d"ZdAd(d)Ze	*	+dBdCd2d3ZedDd6d7ZedEd9d:Z  ZS )FPebbloLoaderAPIWrapperzWrapper for Pebblo Loader API.r?   api_keyr:   r5   rY   classifier_url	cloud_urlFrV   rZ   kwargsr   c                   sL   t |ddd|d< t |ddt|d< t |ddt|d< t jd	i | dS )
z%Validate that api key in environment.r   PEBBLO_API_KEYr>   r   PEBBLO_CLASSIFIER_URLr   PEBBLO_CLOUD_URLNr2   )r   _DEFAULT_CLASSIFIER_URL_DEFAULT_PEBBLO_CLOUD_URLsuper__init__)selfr   r   r2   r3   r     s   zPebbloLoaderAPIWrapper.__init__apprJ   r[   Nonec           	      C  s   d}|j dd}| jdkr"|  }| j tjj }| d|||}| jrW| jdd}|r=t	
|jd}|d|i |dti | j tjj }| d|||}dS dS )	z
        Send app discovery request to Pebblo server & cloud.

        Args:
            app (App): App instance to be discovered.
        NTexclude_unsetr:   POSTcloud_requestpebblo_server_versionpebblo_client_version)rT   rY   _make_headersr   r+   r1   valuemake_requestr   jsonloadstextr   updatePLUGIN_VERSIONr   )	r   r   pebblo_resppayloadheadersapp_discover_urlr   pebblo_cloud_urlr   r2   r2   r3   send_loader_discover  s$   
z+PebbloLoaderAPIWrapper.send_loader_discoverdocs_with_idList[IndexedDocument]rU   rT   rW   c              
   C  s4  | dd}t|}| |||\}}| ||||||}	i }
| jdkrm|  }| j tjj	 }z#| 
d|||	d}|rSt|j dg D ]}|
|d |i qGW n tyl } ztd| W Y d	}~nd	}~ww | jr| jdkr}| |	d |
 |	d
d	 | |	 |
S | jdkrtd td|
S )a  
        Send documents to Pebblo server for classification.
        Then send classified documents to Daxa cloud(If api_key is present).

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            app (App): App instance.
            loader_details (dict): Loader details.
            loading_end (bool): Boolean, indicating the halt of data loading by loader.
        r   r>   r:   r   i,  rS   r6   z3An Exception caught in classify_documents: local %sNrZ   zpebblo-cloudz4API key is missing for sending docs to Pebblo cloud.)r   r   prepare_docs_for_classificationbuild_classification_payloadrY   r   r   r+   r0   r   r   r   r   r   r   r   r|   warningr   update_doc_datapopsend_docs_to_pebblo_cloud	NameError)r   r   r   rU   rW   r   rX   rS   source_aggregate_sizer   classified_docsr   load_doc_urlr   classified_docer2   r2   r3   classify_documents  sH   






z)PebbloLoaderAPIWrapper.classify_documentsr   c              
   C  sh   | j dd}| j tjj }z| d|||}W dS  ty3 } ztd| W Y d}~dS d}~ww )z
        Send documents to Pebblo cloud.

        Args:
            payload (dict): The payload containing documents to be sent.
        Tr   r   z3An Exception caught in classify_documents: cloud %sN)	r   r   r+   r0   r   r   r   r|   r   )r   r   r   r   r   r   r2   r2   r3   r     s   z0PebbloLoaderAPIWrapper.send_docs_to_pebblo_cloudr   c                 C  s6   ddd}|r| j r|d| j i |S td |S )z
        Generate headers for the request.

        args:
            cloud_request (bool): flag indicating whether the request is for Pebblo
            cloud.
        returns:
            dict: Headers for the request.

        zapplication/json)AcceptzContent-Typez	x-api-keyz,API key is missing for Pebblo cloud request.)r   r   r|   r   )r   r   r   r2   r2   r3   r   (  s   
z$PebbloLoaderAPIWrapper._make_headersrS   
List[dict]rX   r   r   c                 C  sb   |j |j|t|j|d|| j| jd
}|du r$d|d< d|v r$||d d< td
i |jdd}|S )a  
        Build the payload for document classification.

        Args:
            app (App): App instance.
            docs (List[dict]): List of documents to be classified.
            loader_details (dict): Loader details.
            source_owner (str): Owner of the source.
            source_aggregate_size (int): Aggregate size of the source.
            loading_end (bool): Boolean indicating the halt of data loading by loader.

        Returns:
            dict: Payload for document classification.
        false)
rH   rK   rS   rO   rM   rU   rW   rX   rY   rZ   TtruerW   rU   r   r   Nr2   )rH   rK   r   rM   rY   rZ   rQ   rT   )r   r   rS   rU   rX   r   rW   r   r2   r2   r3   r   ?  s$   
z3PebbloLoaderAPIWrapper.build_classification_payloadN   methodurlr   Optional[dict]timeoutOptional[Response]c              
   C  s  zYt | ||||d}td| |j jtt|j jr|j jng t|j |jtj	kr6t
d|j  |W S |jtjkrHt
d|j  |W S |jtjkrWt
d|j  |W S  tyi   t
d| Y dS  ty } zt
d| W Y d}~dS d}~ww )	a  
        Make a request to the Pebblo API

        Args:
            method (str): HTTP method (GET, POST, PUT, DELETE, etc.).
            url (str): URL for the request.
            headers (dict): Headers for the request.
            payload (Optional[dict]): Payload for the request (for POST, PUT, etc.).
            timeout (int): Timeout for the request in seconds.

        Returns:
            Optional[Response]: Response object if the request is successful.
        )r   r   r   r   r  z5Request: method %s, url %s, len %s response status %szPebblo Server: Error z$Pebblo received an invalid payload: z-Pebblo returned an unexpected response code: zUnable to reach server %sz'An Exception caught in make_request: %sN)r   r|   r   r   r5   r   bodystatus_coder   INTERNAL_SERVER_ERRORr   BAD_REQUESTr   OKr   r   )r   r   r   r   r  responser   r2   r2   r3   r   k  s@   
	z#PebbloLoaderAPIWrapper.make_requestr   Tuple[List[dict], int]c              
   C  s6  g }d}dd | D }d}|D ]}| di }| dg }	|d dkr.t| d	|d
 }
nt| d| d	|}
| dt|
}| dt|
}t| d}t|}||7 }| ddp`d}|||
|| di  d|d|	rwd|	ini |durd|ini  |d dkr|s| d|d
< d}q||fS )a  
        Prepare documents for classification.

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            source_path (str): Source path of the documents.
            loader_details (dict): Contains loader info.

        Returns:
            Tuple[List[dict], int]: Documents and the aggregate size
            of the source.
        r   c                 S  s   g | ]}|  qS r2   )rT   )rt   r   r2   r2   r3   rv     s    zJPebbloLoaderAPIWrapper.prepare_docs_for_classification.<locals>.<listcomp>Fmetadataauthorized_identitiesre   r'   rm   r   rc   rK   r   r   r6   Nlast_modified)r   r   r6   r  
file_ownersource_path_sizesource_full_urlT)r   rd   r   r   r5   r   r   )r   r   rU   rS   r   doc_contentsource_path_updater   doc_metadatadoc_authorized_identitiesdoc_source_pathdoc_source_ownerdoc_source_sizer   page_content_sizery   r2   r2   r3   r     s`   

z6PebbloLoaderAPIWrapper.prepare_docs_for_classificationr   c              
   C  sX   | D ]'}| |d i }|| d| d| di | di d |d qdS )	z
        Update the document data with classified information.

        Args:
            docs (List[dict]): List of document data to be updated.
            classified_docs (dict): The dictionary containing classified documents.
        r6   pb_checksumloader_source_pathentitiestopics)r  r  r  r  r   N)r   r   r   )rS   r   doc_dataclassified_datar2   r2   r3   r     s   	

	z&PebbloLoaderAPIWrapper.update_doc_data)r   r   )r   rJ   r[   r   )F)
r   r   r   rJ   rU   rT   rW   rV   r[   rT   )r   rT   r[   r   )r   rV   r[   rT   )r   rJ   rS   r   rU   rT   rX   r5   r   r   rW   rV   r[   rT   )Nr   )r   r5   r   r5   r   rT   r   r   r  r   r[   r  )r   r   r   r5   rU   rT   r[   r	  )rS   r   r   rT   r[   r   )r,   r-   r.   r/   r8   rY   rZ   r   r   r   r   r   r   staticmethodr   r   r   __classcell__r2   r2   r   r3   r     s6   
 
$
>
,1Fr   )r=   r5   r[   r5   )re   r5   r[   r5   )re   r   r[   r5   )r[   r   )r[   r5   )r   )rS   r   r   r   r[   r   )ro   r5   r[   r5   )r   r5   r[   r   )r   r5   r[   r   )=
__future__r   r   loggingrB   r_   rA   enumr   httpr   typingr   r   r   r   r	   langchain_core.documentsr
   langchain_core.envr   langchain_core.utilsr   pydanticr   requestsr   r   requests.exceptionsr   )langchain_community.document_loaders.baser   	getLoggerr,   r|   r   r   r   BATCH_SIZE_BYTESfile_loader
dir_loader	in_memorycloud_folderrg   r5   r+   r4   r9   rG   rJ   rQ   rd   rk   r   r   r   r   r   r   r   r   r2   r2   r2   r3   <module>   s^    
	



H

(

