o
    ZŽhd  ã                   @   s˜   d Z ddlZddlmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZ dZe e¡Zd	ed
edejfdd„ZG dd„ dƒZG dd„ dƒZdgZdS )z%REALM Retriever model implementation.é    N)ÚOptionalÚUnion)Úhf_hub_download)ÚAutoTokenizeré   )ÚloggingÚ	strtoboolzblock_records.npyÚblock_records_pathÚnum_block_recordsÚreturnc                 C   sF   dd l m  m} |jj| dd}|j|dd}t| d¡ ¡ ƒ}|S )Nr   i    )Úbuffer_sizeT)Zdrop_remainderé   )	Ztensorflow.compat.v1ÚcompatÚv1ÚdataZTFRecordDatasetÚbatchÚnextÚtakeZas_numpy_iterator)r	   r
   ÚtfZblocks_datasetZ	np_record© r   úc/var/www/auris/lib/python3.10/site-packages/transformers/models/deprecated/realm/retrieval_realm.pyÚconvert_tfrecord_to_np"   s
   r   c                   @   s*   e Zd ZdZ				ddd„Zdd	„ Zd
S )ÚScaNNSearcherztNote that ScaNNSearcher cannot currently be used within the model. In future versions, it might however be included.é   éè  éd   é † c           	      C   sD   ddl m} |||dd}|j|||d}|j|d}| ¡ | _dS )zBuild scann searcher.r   )ÚbuilderZdot_product)ÚdbÚnum_neighborsZdistance_measure)Ú
num_leavesÚnum_leaves_to_searchÚtraining_sample_size)Údimensions_per_blockN)Z#scann.scann_ops.py.scann_ops_pybindr   ÚtreeZscore_ahÚbuildÚsearcher)	Úselfr   r   r#   r    r!   r"   ZBuilderr   r   r   r   Ú__init__/   s   ÿzScaNNSearcher.__init__c                 C   s"   | j  | ¡  ¡ ¡\}}| d¡S )NZint64)r&   Úsearch_batchedÚdetachÚcpuZastype)r'   Zquestion_projectionÚretrieved_block_idsÚ_r   r   r   r)   D   s   
zScaNNSearcher.search_batchedN)r   r   r   r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r(   r)   r   r   r   r   r   ,   s    
ùr   c                       sZ   e Zd ZdZ‡ fdd„Zddd„Zedeee	e
jf  fd	d
„ƒZdd„ Zdd„ Z‡  ZS )ÚRealmRetrieverah  The retriever of REALM outputting the retrieved evidence block and whether the block has answers as well as answer
    positions."

        Parameters:
            block_records (`np.ndarray`):
                A numpy array which contains evidence texts.
            tokenizer ([`RealmTokenizer`]):
                The tokenizer to encode retrieved texts.
    c                    s   t ƒ  ¡  || _|| _d S ©N)Úsuperr(   Úblock_recordsÚ	tokenizer)r'   r5   r6   ©Ú	__class__r   r   r(   T   s   

zRealmRetriever.__init__NÚptc                 C   s–   t j| j|dd}| jj|d dd}g }g }	|D ]}
| |¡ |	 |
 ¡ ¡ q| j||	ddd|d}| |¡}|d urE|  ||¡|f S d d d |fS )Nr   )ÚindicesZaxisT)Zskip_special_tokens)ÚpaddingZ
truncationZreturn_special_tokens_maskÚ
max_length)Únpr   r5   r6   ÚdecodeÚappendZconvert_to_tensorsÚblock_has_answer)r'   r,   Zquestion_input_idsÚ
answer_idsr<   Zreturn_tensorsZretrieved_blocksÚquestionÚtextZ	text_pairZretrieved_blockÚconcat_inputsZconcat_inputs_tensorsr   r   r   Ú__call__Y   s   
ÿ
zRealmRetriever.__call__Úpretrained_model_name_or_pathc                 O   sz   t j |¡rt j |t¡}n
td|tdœ|¤Ž}tt j dd¡ƒs%t	dƒ‚t
j|dd}tj|g|¢R i |¤Ž}| ||ƒS )N)Zrepo_idÚfilenameZTRUST_REMOTE_CODEÚFalseaz  This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially malicious. It's recommended to never unpickle data that could have come from an untrusted source, or that could have been tampered with. If you already verified the pickle data and decided to use it, you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it.T)Zallow_pickler   )ÚosÚpathÚisdirÚjoinÚ_REALM_BLOCK_RECORDS_FILENAMEr   r   ÚenvironÚgetÚ
ValueErrorr=   Úloadr   Úfrom_pretrained)ÚclsrF   Zinit_inputsÚkwargsr	   r5   r6   r   r   r   rR   n   s   ÿÿÿ
zRealmRetriever.from_pretrainedc                 C   s(   t  tj |t¡| j¡ | j |¡ d S r3   )	r=   ÚsaverI   rJ   rL   rM   r5   r6   Úsave_pretrained)r'   Zsave_directoryr   r   r   rV   „   s   zRealmRetriever.save_pretrainedc                 C   sd  g }g }g }d}|j D ]€}| ¡ }| | jj¡}	|	d ||	d d…  | jj¡ }
| g ¡ | g ¡ |D ]4}t|	d |
ƒD ]*}|d || krh|||t|ƒ … |krh|d  |¡ |d  |t|ƒ d ¡ q>q5t|d ƒdkrx| d¡ q| d¡ t|d ƒ|kr‹t|d ƒ}qt||ƒD ]\}}t|ƒ|k r¬dg|t|ƒ  }||7 }||7 }q‘|||fS )z&check if retrieved_blocks has answers.r   r   NéÿÿÿÿFT)	Z	input_idsÚtolistÚindexr6   Zsep_token_idr?   ÚrangeÚlenÚzip)r'   rD   rA   Zhas_answersÚ	start_posÚend_posZmax_answersZinput_idZinput_id_listZfirst_sep_idxZsecond_sep_idxZanswerÚidxZ
start_pos_Zend_pos_Úpaddedr   r   r   r@   Š   s>   
"

€ü
€€
zRealmRetriever.block_has_answer)Nr9   )r.   r/   r0   r1   r(   rE   Úclassmethodr   r   ÚstrrI   ÚPathLikerR   rV   r@   Ú__classcell__r   r   r7   r   r2   I   s    

r2   )r1   rI   Útypingr   r   Únumpyr=   Zhuggingface_hubr   Ztransformersr   Úutilsr   r   rM   Z
get_loggerr.   Úloggerrb   ÚintZndarrayr   r   r2   Ú__all__r   r   r   r   Ú<module>   s   


g