# coding=utf-8
# Copyright (c) 2020, VinAI Research and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
"""Tokenization classes for BERTweet"""

import html
import os
import re
from shutil import copyfile
from typing import List, Optional, Tuple

import regex

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.txt",
    "merges_file": "bpe.codes",
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char

    pairs = set(pairs)
    return pairs
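

# A quick illustration (not part of the original module): for the BPE-style
# word tuple ("l", "o", "w", "er</w>"),
#     get_pairs(("l", "o", "w", "er</w>"))
# returns {("l", "o"), ("o", "w"), ("w", "er</w>")}.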


class BertweetTokenizer(PreTrainedTokenizer):
    """
    Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        normalization (`bool`, *optional*, defaults to `False`):
            Whether or not to apply a normalization preprocess.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    F<s></s><unk><pad><mask>c                    sT  zddl m} || _W n ty   td d | _Y nw || _|| _i | _d| jt	|< d| jt	|	< d| jt	|< d| jt	|< | 
| dd | j D | _t|d	d
}| dd d }W d    n1 snw   Y  dd |D }tt|tt|| _i | _|| _t | _ddd| _t jd|||||||	|
d| d S )Nr   )demojizezsemoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0r      r   c                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>       z.BertweetTokenizer.__init__.<locals>.<dictcomp>utf-8encoding
c                 S   s    g | ]}t | d d qS )Nr%   )tuplesplit)r   merger   r   r   
<listcomp>   s     z.BertweetTokenizer.__init__.<locals>.<listcomp>'z...)u   ’u   …)normalization	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_tokenr   )emojir   	demojizerImportErrorloggerwarningr	   r
   encoderstradd_from_fileitemsdecoderopenreadr'   dictziprangelen	bpe_rankscacher+   TweetTokenizertweetPreprocessorspecial_punctssuper__init__)selfr	   r
   r+   r,   r-   r.   r/   r0   r1   r2   kwargsr   Zmerges_handleZmerges	__class__r   r   rI   k   sN   


	
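
    # Note (illustrative): after __init__, self.encoder maps the special tokens
    # to fixed ids -- {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3} -- followed
    # by the entries loaded from vocab_file via add_from_file below.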
zBertweetTokenizer.__init__Ntoken_ids_0token_ids_1returnc                 C   sD   |du r| j g| | jg S | j g}| jg}|| | | | | S )a  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A BERTweet sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
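
    # Illustrative layout (actual ids depend on the vocabulary): with
    # cls_token_id=0 and sep_token_id=2,
    # build_inputs_with_special_tokens([5, 6], [7]) returns
    # [0, 5, 6, 2, 2, 7, 2], i.e. `<s> A </s></s> B </s>`.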

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rN   rO   rV   Nr   r   )rH   get_special_tokens_maskrB   )rJ   rN   rO   rV   rL   r   r   rW      s   0z)BertweetTokenizer.get_special_tokens_maskc                 C   sP   | j g}| jg}|du rt|| | dg S t|| | | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = "@@ ".join(word)
        word = word[:-4]
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        if self.normalization:  # Perform Tweet normalization before performing BPE
            text = self.normalizeTweet(text)

        split_tokens = []
        words = re.findall(r"\S+\n?", text)
        for token in words:
            split_tokens.extend(list(self.bpe(token).split(" ")))
        return split_tokens
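
    # Illustrative behaviour (the actual splits depend on the loaded BPE
    # merges): self.bpe("hello") might return "hel@@ lo", which _tokenize then
    # splits into the subword tokens ["hel@@", "lo"].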
zBertweetTokenizer.bpec                 C   sH   | j r| |}g }td|}|D ]}|t| |d q|S )zTokenize a string.z\S+\n? )r+   normalizeTweetrefindallri   re   rq   r'   )rJ   textZsplit_tokenswordsrl   r   r   r   	_tokenize(  s   
zBertweetTokenizer._tokenizec                    s    j D ]}|| j | }q j|}d fdd|D }|ddddddd	d
dd}|dddddddddddd}|dddddddd }d| S )!z'
        Normalize a raw Tweet
        """
        for punct in self.special_puncts:
            tweet = tweet.replace(punct, self.special_puncts[punct])

        tokens = self.tweetPreprocessor.tokenize(tweet)
        normTweet = " ".join([self.normalizeToken(token) for token in tokens])

        normTweet = (
            normTweet.replace("cannot ", "can not ")
            .replace("n't ", " n't ")
            .replace("n 't ", " n't ")
            .replace("ca n't", "can't")
            .replace("ai n't", "ain't")
        )
        normTweet = (
            normTweet.replace("'m ", " 'm ")
            .replace("'re ", " 're ")
            .replace("'s ", " 's ")
            .replace("'ll ", " 'll ")
            .replace("'d ", " 'd ")
            .replace("'ve ", " 've ")
        )
        normTweet = (
            normTweet.replace(" p . m .", "  p.m.")
            .replace(" p . m ", " p.m ")
            .replace(" a . m .", " a.m.")
            .replace(" a . m ", " a.m ")
        )

        return " ".join(normTweet.split())
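
    # Illustrative example (output assumes the `emoji` package is available):
    # normalizeTweet("@bob check https://t.co/xyz :-)") would come out roughly
    # as "@USER check HTTPURL :-)", via normalizeToken below. The URL here is
    # hypothetical.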

    def normalizeToken(self, token):
        """
        Normalize tokens in a Tweet
        @z@USERhttpwwwZHTTPURLr   N)lower
startswithrB   rG   r4   )rJ   rl   Zlowercased_tokenr   r   r   ry   U  s   




z BertweetTokenizer.normalizeTokenc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r8   r^   r0   )rJ   rl   r   r   r   _convert_token_to_idh  s   z&BertweetTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r<   r^   r0   )rJ   rg   r   r   r   _convert_id_to_tokenl  s   z&BertweetTokenizer._convert_id_to_tokenc                 C   s   d |dd }|S )z:Converts a sequence of tokens (string) in a single string.rr   rc    )rk   rz   strip)rJ   r}   Z
out_stringr   r   r   convert_tokens_to_stringp  s   z*BertweetTokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s  t j|std| d d S t j||r|d ndtd  }t j||r,|d ndtd  }t j| jt j|krNt j	| jrNt
| j| n&t j	| jstt|d}| j }|| W d    n1 sow   Y  t j| jt j|krt
| j| ||fS )NzVocabulary path (z) should be a directory-r   r	   r
   wb)ospathisdirr6   errorrk   VOCAB_FILES_NAMESabspathr	   isfiler   r=   Zsp_modelZserialized_model_protowriter
   )rJ   r   r   Zout_vocab_fileZout_merge_filefiZcontent_spiece_modelr   r   r   save_vocabularyu  s&   (
z!BertweetTokenizer.save_vocabularyc           	   
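
    # Illustrative example for convert_tokens_to_string above:
    # convert_tokens_to_string(["hel@@", "lo", "world"]) returns "hello world"
    # -- the BPE continuation marker "@@ " is removed on join.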

    def add_from_file(self, f):
        """
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        """
        if isinstance(f, str):
            try:
                with open(f, "r", encoding="utf-8") as fd:
                    self.add_from_file(fd)
            except FileNotFoundError as fnfe:
                raise fnfe
            except UnicodeError:
                raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
            return

        lines = f.readlines()
        for lineTmp in lines:
            line = lineTmp.strip()
            idx = line.rfind(" ")
            if idx == -1:
                raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
            word = line[:idx]
            self.encoder[word] = len(self.encoder)


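# A minimal usage sketch (assumes the "vinai/bertweet-base" checkpoint is
# available; not part of the original module):
#
#     from transformers import BertweetTokenizer
#
#     tokenizer = BertweetTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
#     tokenizer.tokenize("SC has first two presumptive cases of coronavirus , DHEC confirms")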

######################################################################
#
# The casual tokenizer below is adapted from NLTK's Twitter-aware
# tokenizer (nltk.tokenize.casual) by Christopher Potts and Ewan Klein.
# The regular-expression components follow; the phone-number pattern
# must appear early in the combined regex (it can contain whitespace),
# and the catch-all "everything else" pattern must come last.
#
######################################################################

# This particular element is used in a couple ways, so we define it
# with a name:
# docstyle-ignore
EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )"""

# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715
# docstyle-ignore
URLS = r"""			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
    (?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
"""

# docstyle-ignore
# The components of the tokenizer:
REGEXPS = (
    URLS,
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )""",
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # docstyle-ignore
    # Remaining word types:
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    z(%s)|z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);strictc                 C   s&   |d u rd}t | tr| ||S | S )Nr!   )r   bytesdecode)rv   r#   errorsr   r   r   _str_to_unicode\  s
   
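
# Illustrative behaviour of the combined regex (not part of the original
# module):
# WORD_RE.findall("See https://example.com :-) #nlp")
# -> ['See', 'https://example.com', ':-)', '#nlp']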


def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
    """
    Remove entities from text by converting them to their corresponding unicode character.

    Args:
        text:
            A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
        keep (list):
            List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
            `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
        remove_illegal (bool):
            If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
            kept "as is".

    Returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

    Examples:

    ```python
    >>> from nltk.tokenize.casual import _replace_html_entities

    >>> _replace_html_entities(b"Price: &pound;100")
    'Price: \xa3100'

    >>> print(_replace_html_entities(b"Price: &pound;100"))
    Price: £100
    ```"""

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # by bytes 80-9F in the Windows-1252 encoding. For more info
                # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
                if 0x80 <= number <= 0x9F:
                    return bytes((number,)).decode("cp1252")
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            else:
                number = html.entities.name2codepoint.get(entity_body)

        if number is not None:
            try:
                return chr(number)
            except (ValueError, OverflowError):
                pass

        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


######################################################################


class TweetTokenizer:
    r"""
    Examples:

    ```python
    >>> # Tokenizer for tweets.
    >>> from nltk.tokenize import TweetTokenizer

    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    >>> # Examples using *strip_handles* and *reduce_len parameters*:
    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
    >>> tknzr.tokenize(s1)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    ```"""

    def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles

    def tokenize(self, text):
        """
        Args:
            text: str

        Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
        `preserve_case=False`
        \1\1\1c                 S   s"   g | ]}t |r|n| qS r   )EMOTICON_REsearchr   )r   xr   r   r   r)     s   " z+TweetTokenizer.tokenize.<locals>.<listcomp>)
r   r   remove_handlesr   reduce_lengtheningHANG_REr   WORD_REru   r   )rJ   rv   Z	safe_textrw   r   r   r   r{     s   	
zTweetTokenizer.tokenizeNTFF)r   r   r   r   rI   r{   r   r   r   r   rE     s    
rE   c                 C      t d}|d| S )za
    Replace repeated character sequences of length 3 or greater with sequences of length 3.
    """
    pattern = regex.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)
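
# Illustrative example: reduce_lengthening("waaaaayyyy") -> "waaayyy"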


def remove_handles(text):
    """
    Remove Twitter username handles from text.
    """
    pattern = regex.compile(
        r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
    )
    # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
    return pattern.sub(" ", text)


######################################################################
# Tokenization Function
######################################################################


def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(
        preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles
    ).tokenize(text)
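
# Illustrative example (not part of the original module):
# casual_tokenize("@remy: This is waaaaayyyy too much!", strip_handles=True, reduce_len=True)
# -> [':', 'This', 'is', 'waaayyy', 'too', 'much', '!']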


###############################################################################

__all__ = ["BertweetTokenizer"]