
    eTh 9                        S SK r S SKrS SKrS SKJr  S SKJrJrJrJ	r	J
r
JrJr  S SKJrJr  \R                   R#                  \5      r\R                   R'                  \S5      r\" \5       r\R/                  5       rSSS5         " S S5      r\	\   r\	\\      r " S S	\
5      r " S
 S5      r " S S5      r " S S5      r \4S\	\!   S\!4S jjr"g! , (       d  f       N^= f)    N)Template)AnyCallableDictList
NamedTupleOptionalTuple)Encoding	Tokenizerzvisualizer-styles.cssc                   J    \ rS rSr% \\S'   \\S'   \\S'   S\S\S\4S jrSrg)
Annotation   startendlabelc                 (    Xl         X l        X0l        g N)r   r   r   )selfr   r   r   s       S/var/www/auris/envauris/lib/python3.13/site-packages/tokenizers/tools/visualizer.py__init__Annotation.__init__   s    

    )r   r   r   N)	__name__
__module____qualname____firstlineno__int__annotations__strr   __static_attributes__ r   r   r   r      s+    J	HJc  C r   r   c                   6    \ rS rSr% \\   \S'   \\   \S'   Srg)CharStateKey   token_ixanno_ixr"   N)r   r   r   r   r	   r   r   r!   r"   r   r   r$   r$      s    smc]r   r$   c                   Z    \ rS rSr% \\   \S'   S r\S 5       r	\S 5       r
S\4S jrSrg	)
	CharState$   char_ixc                 ,    Xl         S U l        / U l        g r   )r+   r'   tokens)r   r+   s     r   r   CharState.__init__'   s    &*!#r   c                 V    [        U R                  5      S:  a  U R                  S   $ S $ )Nr   lenr-   r   s    r   r&   CharState.token_ix-   s%    !$T[[!1A!5t{{1~?4?r   c                 2    [        U R                  5      S:  $ )z:
BPE tokenizers can output more than one token for a char
   r0   r2   s    r   is_multitokenCharState.is_multitoken1   s    
 4;;!##r   returnc                 >    [        U R                  U R                  S9$ )N)r&   r'   )r$   r&   r'   r2   s    r   partition_keyCharState.partition_key8   s    ]]LL
 	
r   )r'   r+   r-   N)r   r   r   r   r	   r   r   r   propertyr&   r6   r$   r:   r!   r"   r   r   r)   r)   $   sG    c]$ @ @ $ $
| 
r   r)   c                       \ rS rSrSrg)Aligned?   r"   N)r   r   r   r   r!   r"   r   r   r>   r>   ?   s    r   r>   c            
       f   \ rS rSrSr\R                  " S\R                  S9r  SS\	S\
S\\\/\4      4S	 jjr/ S4S
\S\S\\
   S\\   4S jjr\S\S\\\4   4S j5       r\S\\   S
\S\4S j5       r\S
\S\S\S\4S j5       r\S
\S\S\4S j5       r\S
\S\S\S\\   4S j5       rSrg)EncodingVisualizerC   a  
Build an EncodingVisualizer

Args:

     tokenizer (:class:`~tokenizers.Tokenizer`):
        A tokenizer instance

     default_to_notebook (:obj:`bool`):
        Whether to render html output in a notebook by default

     annotation_converter (:obj:`Callable`, `optional`):
        An optional (lambda) function that takes an annotation in any format and returns
        an Annotation object
z(.{1})?(unk|oov)(.{1})?)flagsN	tokenizerdefault_to_notebookannotation_converterc                 z    U(       a	   SSK JnJn  Xl        X l        X0l        g ! [         a    [	        S5      ef = f)Nr   HTMLdisplayzWe couldn't import IPython utils for html display.
                        Are you running in a notebook?
                        You can also pass `default_to_notebook=False` to get back raw HTML
                    )IPython.core.displayrI   rJ   ImportError	ExceptionrD   rE   annotation_coverter)r   rD   rE   rF   rI   rJ   s         r   r   EncodingVisualizer.__init__V   sH     > ##6 #7    s   $ :textannotationsr8   c                 `   U R                   nUb  UnU(       a	   SSKJnJn  U R                  b  [        [        U R                  U5      5      nU R                  R                  U5      n[        R                  XU5      nU(       a  W" W" U5      5        gU$ ! [         a    [        S5      ef = f)aw  
Build a visualization of the given text

Args:
    text (:obj:`str`):
        The text to tokenize

    annotations (:obj:`List[Annotation]`, `optional`):
        An optional list of annotations of the text. The can either be an annotation class
        or anything else if you instantiated the visualizer with a converter function

    default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
        If True, will render the html in a notebook. Otherwise returns an html string.

Returns:
    The HTML string if default_to_notebook is False, otherwise (default) returns None and
    renders the HTML in the notebook

Nr   rH   zeWe couldn't import IPython utils for html display.
                    Are you running in a notebook?)rE   rK   rI   rJ   rL   rM   rN   listmaprD   encoderA   _EncodingVisualizer__make_html)	r   rP   rQ   rE   final_default_to_notebookrI   rJ   encodinghtmls	            r   __call__EncodingVisualizer.__call__l   s    2 %)$<$<!*(;%$> ##/s4#;#;[IJK>>((.!--dkJ$DJK  6 s   B B-c                     [        U 5      S:X  a  0 $ [        [        S U 5      5      n[        U5      n[        SU-  5      nUS:  a  SnSnSnSn0 n[	        U5       H  nSU S	U S
U S3Xx'   Xc-  nM     U$ )z
Generates a color palette for all the labels in a given set of annotations

Args:
  annotations (:obj:`Annotation`):
    A list of annotations

Returns:
    :obj:`dict`: A dictionary mapping labels to colors in HSL format
r   c                     U R                   $ r   )r   )xs    r   <lambda>;EncodingVisualizer.calculate_label_colors.<locals>.<lambda>   s    177r             @   
   zhsl(,z%,%)r1   setrT   r   sorted)	rQ   labels
num_labelsh_stepslhcolorsr   s	            r   calculate_label_colors)EncodingVisualizer.calculate_label_colors   s     {q IS*K89[
S:%&B;FF^E"1#QqcA3a0FMKA $ r   consecutive_chars_listrX   c                    U S   nUR                   c  UR                  UR                     nSU S3$ U S   nUR                   nUR                   S-   nXU n/ n	0 n
UR                  b  U	R                  S5        UR                  (       a  U	R                  S5        UR                  S-  (       a  U	R                  S	5        OU	R                  S
5        [
        R                  R                  UR                  UR                     5      b-  U	R                  S5        UR                  UR                     U
S'   OU	R                  S5        SSR                  U	5       S3nSnU
R                  5        H  u  pUSU SU S3-  nM     SU SU SU S3$ )as  
Converts a list of "consecutive chars" into a single HTML element.
Chars are consecutive if they fall under the same word, token and annotation.
The CharState class is a named tuple with a "partition_key" method that makes it easy to
compare if two chars are consecutive.

Args:
    consecutive_chars_list (:obj:`List[CharState]`):
        A list of CharStates that have been grouped together

    text (:obj:`str`):
        The original text being processed

    encoding (:class:`~tokenizers.Encoding`):
        The encoding returned from the tokenizer

Returns:
    :obj:`str`: The HTML span for a set of consecutive chars
r   z(<span class="special-token" data-stoken=z></span>r5   tokenzmulti-token   z	odd-tokenz
even-tokenzspecial-tokenstokz	non-tokenzclass=" " z data-z="z<span z ></span>)
r+   r-   r&   appendr6   rA   unk_token_regexsearchjoinitems)rs   rP   rX   firststokenlastr   r   	span_textcss_classes
data_itemscssdatakeyvals                  r   consecutive_chars_to_html,EncodingVisualizer.consecutive_chars_to_html   s   2 'q)== __U^^4F >fXXNN%b)llQsO	
>>%w'""""=1~~!
 "";/ ""<0!11889XYe""?3%-__U^^%D
6" {+#((;/04"((*HCfSEC5**D +uAdV2i[88r   c           	      4   [         R                  XU5      nUS   /nUS   R                  n/ n[         R                  U5      nUS   R                  nUb,  X(   n	U	R                  n
Xz   nUR                  SU SU
 S35        USS   H  nUR                  nX:w  aj  UR                  [         R                  UU US95        U/nUb  UR                  S5        Ub,  X(   n	U	R                  n
Xz   nUR                  SU SU
 S35        UnUR                  5       US   R                  5       :X  a  UR                  U5        M  UR                  [         R                  UU US95        U/nM     UR                  [         R                  UU US95        [        U5      nU$ )Nr   z&<span class="annotation" style="color:z" data-label="z">r5   )rP   rX   r|   )	rA   %_EncodingVisualizer__make_char_statesr'   rq   r   r}   r   r:   HTMLBody)rP   rX   rQ   char_statescurrent_consecutive_charsprev_anno_ixspanslabel_colors_dictcur_anno_ixannor   colorcsress                 r   __make_htmlEncodingVisualizer.__make_html   s   (;;DKX%0^$4!"1~--.EEkR!!n,,"+DJJE%,ELLA%W\V]]_`aab/B**K*&@@1!!) A  .0D)+LL+*&3D JJE-4ELL#I%P^_d^eeg!hi&L!%>q%A%O%O%QQ)004 &@@1!!) A  .0D)M "R 	88)! 9 	
 uo
r   c                     S/[        U 5      -  n[        U5       H0  u  p4[        UR                  UR                  5       H  nX2U'   M	     M2     U$ )a  
Args:
    text (:obj:`str`):
        The raw text we want to align to

    annotations (:obj:`AnnotationList`):
        A (possibly empty) list of annotations

Returns:
    A list of  length len(text) whose entry at index i is None if there is no annotation on
    character i or k, the index of the annotation that covers index i where k is with
    respect to the list of annotations
N)r1   	enumerateranger   r   )rP   rQ   annotation_mapr'   ais         r   __make_anno_map"EncodingVisualizer.__make_anno_map<  sN     #d)+#K0JG177AEE*$+q! + 1 r   c                    [         R                  X5      n[        [        U 5      5       Vs/ s H  n[	        U5      PM     nn[        UR                  5       HN  u  pgUR                  U5      nUc  M  Uu  p[        X5       H   nX[   R                  R                  U5        M"     MP     [        U5       H  u  pLXU   l	        M     U$ s  snf )aJ  
For each character in the original text, we emit a tuple representing it's "state":

    * which token_ix it corresponds to
    * which word_ix it corresponds to
    * which annotation_ix it corresponds to

Args:
    text (:obj:`str`):
        The raw text we want to align to

    annotations (:obj:`List[Annotation]`):
        A (possibly empty) list of annotations

    encoding: (:class:`~tokenizers.Encoding`):
        The encoding returned from the tokenizer

Returns:
    :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
    it's state is
)
rA   "_EncodingVisualizer__make_anno_mapr   r1   r)   r   r-   token_to_charsr}   r'   )rP   rX   rQ   r   r+   r   r&   rv   offsetsr   r   r   r'   s                r   __make_char_states%EncodingVisualizer.__make_char_statesQ  s    . ,;;DNJOPSTXPYJZ'[JZw	'(:JZ'[(9OH--h7G"$
u*AN))00: +	  : !*. 9G+2 ( !:  (\s   C	)rN   rE   rD   )TN)r   r   r   r   __doc__recompile
IGNORECASEr~   r   boolr	   r   r   r   r   r    AnnotationListrZ   staticmethodr   rq   r   r)   r   r   rV   PartialIntListr   r   r!   r"   r   r   rA   rA   C   s     jj!>bmmTO
 %)FJ	 " 'xz0A'BC	2 ').2	++ $+ &d^	+
 
#+Z N tCH~  8 A9 $YA9A9 A9 A9F ?# ? ? ?SV ? ?B c  >  ( " " "~ "Z^_hZi " "r   rA   childrenr8   c                 6    SR                  U 5      nSU SU S3$ )a7  
Generates the full html with css from a list of html spans

Args:
    children (:obj:`List[str]`):
        A list of strings, assumed to be html elements

    css_styles (:obj:`str`, `optional`):
        Optional alternative implementation of the css

Returns:
    :obj:`str`: An HTML string with style markup
r{   z?
    <html>
        <head>
            <style>
                zs
            </style>
        </head>
        <body>
            <div class="tokenized-text" dir=auto>
            z4
            </div>
        </body>
    </html>
    )r   )r   
css_styleschildren_texts      r   r   r   w  s9     GGH%M  
 O  r   )#	itertoolsosr   stringr   typingr   r   r   r   r   r	   r
   
tokenizersr   r   pathdirname__file__r   css_filenameopenfreadr   r   r   r   r   r$   r)   r>   rA   r    r   r"   r   r   <module>r      s     	 	  I I I * ''//(
#ww||G%<=	,1
&&(C   j!hsm$: 

 
6	 	q qh	 .1 tCy S W s   ,C
C#