o
    Zh"/                     @   s   d dl Z d dlmZmZmZ d dlZd dlmZ	 d dlm
Z
mZmZmZmZ ddlmZ ddlmZ ddlmZ ed	d
G dd dejjZdgZdS )    N)ListOptionalUnion)BertTokenizer)FastBertTokenizerShrinkLongestTrimmercase_fold_utf8combine_segmentspad_model_inputs   )keras)requires   )tftensorflow_text)backendsc                       s   e Zd ZdZ										d!dededee d	ee d
ee dedededee dededef fddZ	e
d"ddZe
deeejf fddZdd Z							d#ddZdd  Z  ZS )$TFBertTokenizera  
    This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
    `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
    from an existing standard tokenizer object.

    In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
    when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
    than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
    straight from `tf.string` inputs to outputs.

    Args:
        vocab_list (`list`):
            List containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        cls_token_id (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        sep_token_id (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token_id (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        padding (`str`, defaults to `"longest"`):
            The type of padding to use. Can be either `"longest"`, to pad only up to the longest sample in the batch,
            or `"max_length", to pad all inputs to the maximum length supported by the tokenizer.
        truncation (`bool`, *optional*, defaults to `True`):
            Whether to truncate the sequence to the maximum length.
        max_length (`int`, *optional*, defaults to `512`):
            The maximum length of the sequence, used for padding (if `padding` is "max_length") and/or truncation (if
            `truncation` is `True`).
        pad_to_multiple_of (`int`, *optional*, defaults to `None`):
            If set, the sequence will be padded to a multiple of this value.
        return_token_type_ids (`bool`, *optional*, defaults to `True`):
            Whether to return token_type_ids.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether to return the attention_mask.
        use_fast_bert_tokenizer (`bool`, *optional*, defaults to `True`):
            If True, will use the FastBertTokenizer class from Tensorflow Text. If False, will use the BertTokenizer
            class instead. BertTokenizer supports some additional options, but is slower and cannot be exported to
            TFLite.
    NlongestT   
vocab_listdo_lower_casecls_token_idsep_token_idpad_token_idpadding
truncation
max_lengthpad_to_multiple_ofreturn_token_type_idsreturn_attention_maskuse_fast_bert_tokenizerc              	      s  t    |rt|ftj|d|| _n+tjjtjj|tj	tj
tj|tjdtjdtjddd}t|ftj|d|| _|| _|| _|d u rO|dn|| _|d u r[|d	n|| _|d u rg|d
n|| _t|d dd| _|| _|| _|| _|	| _|
| _|| _d S )N)token_out_typeZlower_case_nfd_strip_accents)Zout_type)Zdtype)keysZ	key_dtypevaluesZvalue_dtyper   )Znum_oov_buckets)r!   Z
lower_casez[CLS]z[SEP]z[PAD]r   Zaxis)super__init__r   r   Zint64tf_tokenizerlookupZStaticVocabularyTableZKeyValueTensorInitializerstringrangesizeBertTokenizerLayerr   r   indexr   r   r   r   paired_trimmerr   r   r   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r    Ztokenizer_kwargsZlookup_table	__class__ \/var/www/auris/lib/python3.10/site-packages/transformers/models/bert/tokenization_bert_tf.pyr&   ;   sJ   

	
zTFBertTokenizer.__init__	tokenizerPreTrainedTokenizerBasec           	      K   s   | dd}|du r|jn|}| dd}|du r|jn|}| dd}|du r+|jn|}| dd}|du r:|jn|}| }t| dd d}d	d
 |D }| d|||||d|S )a  
        Initialize a `TFBertTokenizer` from an existing `Tokenizer`.

        Args:
            tokenizer (`PreTrainedTokenizerBase`):
                The tokenizer to use to initialize the `TFBertTokenizer`.

        Examples:

        ```python
        from transformers import AutoTokenizer, TFBertTokenizer

        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
        ```
        r   Nr   r   r   c                 S   s   | d S )Nr   r2   )xr2   r2   r3   <lambda>   s    z0TFBertTokenizer.from_tokenizer.<locals>.<lambda>)keyc                 S   s   g | ]}|d  qS )r   r2   ).0entryr2   r2   r3   
<listcomp>   s    z2TFBertTokenizer.from_tokenizer.<locals>.<listcomp>r   r   r   r   r   r2   )popr   r   r   r   Z	get_vocabsorteditems)	clsr4   kwargsr   r   r   r   Zvocabr   r2   r2   r3   from_tokenizerk   s(   zTFBertTokenizer.from_tokenizerpretrained_model_name_or_pathc                 O   s\   zt j|g|R i |}W n   ddlm} |j|g|R i |}Y | j|fi |S )a  
        Instantiate a `TFBertTokenizer` from a pre-trained tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The name or path to the pre-trained tokenizer.

        Examples:

        ```python
        from transformers import TFBertTokenizer

        tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        ```
        r   )BertTokenizerFast)r   from_pretrainedZtokenization_bert_fastrD   rB   )r@   rC   Zinit_inputsrA   r4   rD   r2   r2   r3   rE      s   zTFBertTokenizer.from_pretrainedc                 C   s&   | j rt|}| j|}|ddS )Nr   )r   r   r'   tokenizeZ
merge_dims)r/   Ztextstokensr2   r2   r3   unpaired_tokenize   s   z!TFBertTokenizer.unpaired_tokenizec	                 C   s  |d u r| j }|dvrtd|d ur|d urtd|d u r"| j}|d u r)| j}|d u r0| j}|d u r7| j}|d u r>| j}t|tj	sIt
|}|d urXt|tj	sXt
|}|d urp|jjdkrftd|jjdkrptd|jjdkr|d d df |d d df }}| |}|d u r|r|d d d |d f }t|f| j| jd	\}	}
n| |}|r| j||g\}}t||f| j| jd	\}	}
|d
kr|	jdd}|d ur|tj| |  }n|}t|	|| jd\}	}d|	i}|r||d< |r
t|
|| jd\}
}|
|d< |S )N)r   r   z1Padding must be either 'longest' or 'max_length'!zJmax_length cannot be overridden at call time when truncating paired texts!r   zJtext argument should not be multidimensional when a text pair is supplied!z)text_pair should not be multidimensional!   r   )Zstart_of_sequence_idZend_of_segment_idr   r$   )Zmax_seq_lengthZ	pad_value	input_idsattention_masktoken_type_ids)r   
ValueErrorr   r   r   r   r   
isinstancer   ZTensorZconvert_to_tensorshapeZrankrI   r	   r   r   r.   ZtrimZbounding_shapemathfloordivr
   r   )r/   textZ	text_pairr   r   r   r   r   r   rK   rM   Z
pad_lengthrL   output_r2   r2   r3   call   sp   

"



zTFBertTokenizer.callc                 C   s   | j | j| j| j| jdS )Nr<   r<   )r/   r2   r2   r3   
get_config   s   zTFBertTokenizer.get_config)
NNNr   Tr   NTTT)r4   r5   )NNNNNNN)__name__
__module____qualname____doc__r   boolr   intstrr&   classmethodrB   r   osPathLikerE   rI   rV   rW   __classcell__r2   r2   r0   r3   r      sf    0	
0&	
Hr   )r`   typingr   r   r   Z
tensorflowr   r   r   r,   r   r   r   r	   r
   Zmodeling_tf_utilsr   Zutils.import_utilsr   Ztokenization_bertZlayersZLayerr   __all__r2   r2   r2   r3   <module>   s     
v