
    eTh6                         S SK r S SKrS SKrS SKJr  S SKJrJrJr  SSKJ	r	J
r
Jr  \R                  " \5      r\ " S S5      5       r\" SS	9 " S
 S5      5       r " S S5      r " S S\5      rg)    N)	dataclass)ListOptionalUnion   )is_tf_availableis_torch_availableloggingc                   \    \ rS rSr% Sr\\S'   \\S'   Sr\\   \S'   Sr	\\   \S'   S r
S	rg)
InputExample   a  
A single training/test example for simple sequence classification.

Args:
    guid: Unique id for the example.
    text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
    text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
    label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
guidtext_aNtext_blabelc                 Z    [         R                  " [        R                  " U 5      SS9S-   $ )*Serializes this instance to a JSON string.   )indent
jsondumpsdataclassesasdictselfs    Z/var/www/auris/envauris/lib/python3.13/site-packages/transformers/data/processors/utils.pyto_json_stringInputExample.to_json_string1   s#    zz+,,T21=DD     )__name__
__module____qualname____firstlineno____doc__str__annotations__r   r   r   r   __static_attributes__r"   r!   r   r   r      s5     IK FHSM E8C=Er!   r   T)frozenc                       \ rS rSr% Sr\\   \S'   Sr\	\\      \S'   Sr
\	\\      \S'   Sr\	\\\4      \S'   S rS	rg)
InputFeatures6   a  
A single set of features of data. Property names are the same names as the corresponding inputs to a model.

Args:
    input_ids: Indices of input sequence tokens in the vocabulary.
    attention_mask: Mask to avoid performing attention on padding token indices.
        Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
        tokens.
    token_type_ids: (Optional) Segment token indices to indicate first and second
        portions of the inputs. Only some models use them.
    label: (Optional) Label corresponding to the input. Int for classification problems,
        float for regression problems.
	input_idsNattention_masktoken_type_idsr   c                 \    [         R                  " [        R                  " U 5      5      S-   $ )r   r   r   r   s    r   r   InputFeatures.to_json_stringK   s!    zz+,,T23d::r!   r"   )r#   r$   r%   r&   r'   r   intr)   r0   r   r1   r   r   floatr   r*   r"   r!   r   r-   r-   6   sV     Cy*.NHT#Y'.*.NHT#Y'.)-E8E#u*%&-;r!   r-   c                   P    \ rS rSrSrS rS rS rS rS r	S r
\SS
 j5       rSrg	)DataProcessorP   zEBase class for data converters for sequence classification data sets.c                     [        5       e)z
Gets an example from a dict with tensorflow tensors.

Args:
    tensor_dict: Keys and values should match the corresponding Glue
        tensorflow_dataset examples.
NotImplementedError)r   tensor_dicts     r   get_example_from_tensor_dict*DataProcessor.get_example_from_tensor_dictS   s     "##r!   c                     [        5       e)z8Gets a collection of [`InputExample`] for the train set.r:   r   data_dirs     r   get_train_examples DataProcessor.get_train_examples]       !##r!   c                     [        5       e)z6Gets a collection of [`InputExample`] for the dev set.r:   r@   s     r   get_dev_examplesDataProcessor.get_dev_examplesa   rD   r!   c                     [        5       e)z7Gets a collection of [`InputExample`] for the test set.r:   r@   s     r   get_test_examplesDataProcessor.get_test_examplese   rD   r!   c                     [        5       e)z*Gets the list of labels for this data set.r:   r   s    r   
get_labelsDataProcessor.get_labelsi   rD   r!   c                     [        U R                  5       5      S:  a+  U R                  5       [        UR                  5         Ul        U$ )z
Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
examples to the correct format.
   )lenrL   r4   r   )r   examples     r   tfds_mapDataProcessor.tfds_mapm   s9    
 t !A% OO-c'--.@AGMr!   Nc           	          [        USSS9 n[        [        R                  " USUS95      sSSS5        $ ! , (       d  f       g= f)z!Reads a tab separated value file.rz	utf-8-sig)encoding	)	delimiter	quotecharN)openlistcsvreader)cls
input_filerY   fs       r   	_read_tsvDataProcessor._read_tsvv   s3     *cK8A

1	JK 988s	   5
Ar"   N)r#   r$   r%   r&   r'   r=   rB   rF   rI   rL   rR   classmethodra   r*   r"   r!   r   r7   r7   P   s9    O$$$$$ L Lr!   r7   c                       \ rS rSrSrSS jrS rS r\ SS j5       r	\SS j5       r
       SS	 jr SS
 jr     SS jrSrg)%SingleSentenceClassificationProcessor}   z@Generic processor for a single sentence classification data set.Nc                 L    Uc  / OUU l         Uc  / OUU l        X0l        X@l        g rc   )labelsexamplesmodeverbose)r   ri   rj   rk   rl   s        r   __init__.SingleSentenceClassificationProcessor.__init__   s'    "Nb&.H	r!   c                 ,    [        U R                  5      $ rc   )rP   rj   r   s    r   __len__-SingleSentenceClassificationProcessor.__len__   s    4==!!r!   c                     [        U[        5      (       a!  [        U R                  U R                  U   S9$ U R                  U   $ )N)ri   rj   )
isinstanceslicerf   ri   rj   )r   idxs     r   __getitem__1SingleSentenceClassificationProcessor.__getitem__   s<    c5!!8VZVcVcdgVhii}}S!!r!   c                 B    U " S0 UD6nUR                  UUUUUUSSS9  U$ )NT)
split_namecolumn_labelcolumn_text	column_idskip_first_rowoverwrite_labelsoverwrite_examplesr"   )add_examples_from_csv)	r^   	file_namery   rz   r{   r|   r}   kwargs	processors	            r   create_from_csv5SingleSentenceClassificationProcessor.create_from_csv   sB     M&M	''!%#)!# 	( 		
 r!   c                 4    U " S0 UD6nUR                  XS9  U$ )N)ri   r"   )add_examples)r^   texts_or_text_and_labelsri   r   r   s        r   create_from_examples:SingleSentenceClassificationProcessor.create_from_examples   s%    M&M	7Gr!   c	                 b   U R                  U5      n	U(       a  U	SS  n	/ n
/ n/ n[        U	5       Hm  u  pU
R                  X   5        UR                  X   5        Ub  UR                  X   5        MC  U(       a  U SU 3O
[        U5      nUR                  U5        Mo     U R	                  XXUS9$ )NrO   -)r~   r   )ra   	enumerateappendr(   r   )r   r   ry   rz   r{   r|   r}   r~   r   linestextsri   idsiliner   s                   r   r   ;SingleSentenceClassificationProcessor.add_examples_from_csv   s     y)!"IE 'GALL*+MM$,-$

4?+.8*Qqc*c!f

4  (   3Vh ! 
 	
r!   c           
      :   Ub;  [        U5      [        U5      :w  a#  [        S[        U5       S[        U5       35      eUb;  [        U5      [        U5      :w  a#  [        S[        U5       S[        U5       35      eUc  S /[        U5      -  nUc  S /[        U5      -  n/ n[        5       n[        XU5       HV  u  pn
[	        U[
        [        45      (       a  U	c  Uu  pOUnUR                  U	5        UR                  [        XS U	S95        MX     U(       a  X`l
        OU R                  R                  U5        U(       a  [        U5      U l        U R                  $ [        [        U R                  5      R                  U5      5      U l        U R                  $ )Nz(Text and labels have mismatched lengths z and z%Text and ids have mismatched lengths )r   r   r   r   )rP   
ValueErrorsetziprs   tupler[   addr   r   rj   extendri   union)r   r   ri   r   r~   r   rj   added_labelstext_or_text_and_labelr   r   texts               r   r   2SingleSentenceClassificationProcessor.add_examples   s    #&>"?3v;"N:3?W;X:YY^_bci_j^kl  ?s#;<CHDSIaEbDcchilmpiqhrstt;&3788C>Vc":;;Fu367OY\3]/"405$-@@U]4e-U#OOLdTYZ[ 4^ $MMM  * |,DK }} s4;;/55lCDDK}}r!   c                 
  ^ Uc  UR                   n[        U R                  5       VVs0 s H  u  pxX_M	     n	nn/ n
[        U R                  5       He  u  pUS-  S:X  a  [        R                  SU 35        UR                  UR                  S[        X!R                   5      S9nU
R                  U5        Mg     [        S U
 5       5      n/ m[        [        XR                  5      5       GH]  u  nu  pUS-  S:X  a.  [        R                  SU S	[        U R                  5       35        U(       a  S
OS/[        U5      -  nU[        U5      -
  nU(       a  U/U-  U-   nU(       a  SOS
/U-  U-   nOX/U-  -   nX(       a  SOS
/U-  -   n[        U5      U:w  a  [        S[        U5       SU 35      e[        U5      U:w  a  [        S[        U5       SU 35      eU R                  S:X  a  XR                     nO;U R                  S:X  a  [!        UR                  5      nO[        U R                  5      eUS:  a  U R"                  (       a  [        R                  S5        [        R                  SUR$                   35        [        R                  SSR'                  U Vs/ s H  n[)        U5      PM     sn5       35        [        R                  SSR'                  U Vs/ s H  n[)        U5      PM     sn5       35        [        R                  SUR                   SU S35        TR                  [+        XUS95        GM`     Uc  T$ US:X  a  [-        5       (       d  [/        S5      eSSKnU4S jnUR2                  R4                  R7                  UUR8                  UR8                  S.UR:                  4UR=                  S/5      UR=                  S/5      S.UR=                  / 5      45      nU$ US:X  Ga%  [?        5       (       d  [/        S5      eSSK nSSK!J"n  URG                  T Vs/ s H  nURH                  PM     snURJ                  S 9n
URG                  T Vs/ s H  nURL                  PM     snURJ                  S 9nU R                  S:X  a6  URG                  T Vs/ s H  nUR                  PM     snURJ                  S 9nOEU R                  S:X  a5  URG                  T Vs/ s H  nUR                  PM     snUR                   S 9nU" U
UW5      nU$ [        S!5      es  snnf s  snf s  snf s  snf s  snf s  snf s  snf )"a  
Convert examples in a list of `InputFeatures`

Args:
    tokenizer: Instance of a tokenizer that will tokenize the examples
    max_length: Maximum example length
    pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
    pad_token: Padding token
    mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
        and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
        values)

Returns:
    If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
    task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
    `InputFeatures` which can be fed to the model.

Ni'  r   zTokenizing example T)add_special_tokens
max_lengthc              3   8   #    U  H  n[        U5      v   M     g 7frc   )rP   ).0r/   s     r   	<genexpr>ESingleSentenceClassificationProcessor.get_features.<locals>.<genexpr>  s     I=i3y>>=s   zWriting example /rO   zError with input length z vs classification
regression   z*** Example ***zguid: zinput_ids:  zattention_mask: zlabel: z (id = )r/   r0   r   tfz?return_tensors set to 'tf' but TensorFlow 2.0 can't be importedc               3   n   >#    T H*  n U R                   U R                  S.U R                  4v   M,     g 7f)Nr/   r0   r   )exfeaturess    r   gen?SingleSentenceClassificationProcessor.get_features.<locals>.genC  s1     "B)+IZIZ[]_]e]eff #s   25r   ptz8return_tensors set to 'pt' but PyTorch can't be imported)TensorDataset)dtypez,return_tensors should be one of 'tf' or 'pt')'max_lenr   ri   rj   loggerinfoencoder   minr   maxr   rP   r   rk   r   r5   rl   r   joinr(   r-   r   RuntimeError
tensorflowdataDatasetfrom_generatorint32int64TensorShaper	   torchtorch.utils.datar   tensorr/   longr0   )r   	tokenizerr   pad_on_left	pad_tokenmask_padding_with_zeroreturn_tensorsr   r   	label_mapall_input_idsex_indexrQ   r/   batch_lengthr0   padding_lengthxr   r   datasetr   r   r`   all_attention_mask
all_labelsr   s                             @r   get_features2SingleSentenceClassificationProcessor.get_features   s   6 "**J.7.DE.D(!UX.D	E!*4==!9H%1$1(<=!((#'z+<+<= ) I
   + ": I=II.7M==8Y.Z*H*y%1$.xj#dmm:L9MNO $:aqAC	NRN *C	N:N'[>9YF	(>1A"F"W[i!i%~)EF	!/9OAUV3WZh3h!i9~- #;C	N;K4P\~!^__>"l2 #;C<O;PPTUaTb!cddyy,,!--0l*gmm, ++!|-.fW\\N34k#((I3NIqCFI3N*O)PQR.sxx8XAQ8X/Y.Z[\ggmm_GE7!DEOOMIdijkG /[J !Ot#"$$"#dee#g ggoo44!xx288DbhhO!~~tf5Y]X^I_`bdbpbpqsbtuG
 Nt#%''"#]^^6!LLx)Hx!!++x)HPUPZPZL[M!&.RAq/?/?.RZ_ZdZd!eyy,,"\\H*EHq177H*EUZZ\X
l*"\\H*EHq177H*EU[[\Y
#M3EzRGNKLLo F` 4O8X8 *I.R*E*Es)   T4'T:(T?7U,U	1U7U)rj   ri   rk   rl   )NNr   F) r   rO   NFrc   )r   r   rO   NFFF)NNFF)NFr   TN)r#   r$   r%   r&   r'   rm   rp   rv   rd   r   r   r   r   r   r*   r"   r!   r   rf   rf   }   s    J""
 ej      
> kp#P #uMr!   rf   )r\   r   r   r   typingr   r   r   utilsr   r	   r
   
get_loggerr#   r   r   r-   r7   rf   r"   r!   r   <module>r      s   "    ! ( ( A A 
		H	% E E E0 $; ; ;2*L *LZ`MM `Mr!   