o
    Zh6                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZmZ ddlm	Z	m
Z
mZ eeZeG dd dZedd	G d
d dZG dd dZG dd deZdS )    N)	dataclass)ListOptionalUnion   )is_tf_availableis_torch_availableloggingc                   @   sJ   e Zd ZU dZeed< eed< dZee ed< dZee ed< dd Z	dS )	InputExamplea5  
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
    guidtext_aNtext_blabelc                 C   s   t jt| ddd S )*Serializes this instance to a JSON string.   )indent
jsondumpsdataclassesasdictself r   Q/var/www/auris/lib/python3.10/site-packages/transformers/data/processors/utils.pyto_json_string1   s   zInputExample.to_json_string)
__name__
__module____qualname____doc__str__annotations__r   r   r   r   r   r   r   r   r
      s   
 r
   T)frozenc                   @   sf   e Zd ZU dZee ed< dZeee  ed< dZ	eee  ed< dZ
eeeef  ed< dd ZdS )	InputFeaturesa  
    A single set of features of data. Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
            tokens.
        token_type_ids: (Optional) Segment token indices to indicate first and second
            portions of the inputs. Only some models use them.
        label: (Optional) Label corresponding to the input. Int for classification problems,
            float for regression problems.
    	input_idsNattention_masktoken_type_idsr   c                 C   s   t t| d S )r   r   r   r   r   r   r   r   K   s   zInputFeatures.to_json_string)r   r   r   r    r   intr"   r&   r   r'   r   r   floatr   r   r   r   r   r$   6   s   
 r$   c                   @   sN   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	e
dddZdS )DataProcessorzEBase class for data converters for sequence classification data sets.c                 C      t  )z
        Gets an example from a dict with tensorflow tensors.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        NotImplementedError)r   Ztensor_dictr   r   r   get_example_from_tensor_dictS   s   z*DataProcessor.get_example_from_tensor_dictc                 C   r+   )z8Gets a collection of [`InputExample`] for the train set.r,   r   data_dirr   r   r   get_train_examples]      z DataProcessor.get_train_examplesc                 C   r+   )z6Gets a collection of [`InputExample`] for the dev set.r,   r/   r   r   r   get_dev_examplesa   r2   zDataProcessor.get_dev_examplesc                 C   r+   )z7Gets a collection of [`InputExample`] for the test set.r,   r/   r   r   r   get_test_examplese   r2   zDataProcessor.get_test_examplesc                 C   r+   )z*Gets the list of labels for this data set.r,   r   r   r   r   
get_labelsi   r2   zDataProcessor.get_labelsc                 C   s(   t |  dkr|  t|j |_|S )z
        Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
        examples to the correct format.
           )lenr5   r(   r   )r   exampler   r   r   tfds_mapm   s   zDataProcessor.tfds_mapNc                 C   sF   t |ddd}ttj|d|dW  d   S 1 sw   Y  dS )z!Reads a tab separated value file.rz	utf-8-sig)encoding	)	delimiter	quotecharN)openlistcsvreader)clsZ
input_filer>   fr   r   r   	_read_tsvv   s   $zDataProcessor._read_tsvN)r   r   r   r    r.   r1   r3   r4   r5   r9   classmethodrE   r   r   r   r   r*   P   s    
	r*   c                   @   s   e Zd ZdZdddZdd Zd	d
 Ze	dddZedddZ								dddZ
	dddZ					dddZdS )%SingleSentenceClassificationProcessorz@Generic processor for a single sentence classification data set.NclassificationFc                 C   s4   |d u rg n|| _ |d u rg n|| _|| _|| _d S rF   )labelsexamplesmodeverbose)r   rJ   rK   rL   rM   r   r   r   __init__   s   
z.SingleSentenceClassificationProcessor.__init__c                 C   s
   t | jS rF   )r7   rK   r   r   r   r   __len__   s   
z-SingleSentenceClassificationProcessor.__len__c                 C   s(   t |trt| j| j| dS | j| S )N)rJ   rK   )
isinstanceslicerH   rJ   rK   )r   idxr   r   r   __getitem__   s   

z1SingleSentenceClassificationProcessor.__getitem__ r   r6   c           	   
   K   s,   | di |}|j ||||||ddd |S )NT)
split_namecolumn_labelcolumn_text	column_idskip_first_rowoverwrite_labelsoverwrite_examplesr   )add_examples_from_csv)	rC   	file_namerU   rV   rW   rX   rY   kwargs	processorr   r   r   create_from_csv   s   
z5SingleSentenceClassificationProcessor.create_from_csvc                 K   s    | di |}|j ||d |S )N)rJ   r   )add_examples)rC   texts_or_text_and_labelsrJ   r^   r_   r   r   r   create_from_examples   s   z:SingleSentenceClassificationProcessor.create_from_examplesc	                 C   s   |  |}	|r|	dd  }	g }
g }g }t|	D ]0\}}|
||  |||  |d ur5|||  q|r>| d| nt|}|| q| j|
||||dS )Nr6   -)rZ   r[   )rE   	enumerateappendr!   ra   )r   r]   rU   rV   rW   rX   rY   rZ   r[   linesZtextsrJ   idsiliner   r   r   r   r\      s    

z;SingleSentenceClassificationProcessor.add_examples_from_csvc              	   C   sB  |d urt |t |krtdt | dt | |d ur4t |t |kr4tdt | dt | |d u r?d gt | }|d u rJd gt | }g }t }t|||D ]'\}}	}
t|ttfrj|	d u rj|\}}	n|}||	 |t	|
|d |	d qU|r|| _
n| j
| |rt|| _| j
S tt| j|| _| j
S )Nz(Text and labels have mismatched lengths z and z%Text and ids have mismatched lengths )r   r   r   r   )r7   
ValueErrorsetziprP   tupler@   addrf   r
   rK   extendrJ   union)r   rb   rJ   rh   rZ   r[   rK   Zadded_labelsZtext_or_text_and_labelr   r   textr   r   r   ra      s4   


z2SingleSentenceClassificationProcessor.add_examplesTc                    s  |du r|j }dd t| jD }g }t| jD ]$\}	}
|	d dkr*td|	  |j|
jdt||j d}|	| qt
d	d
 |D }g  tt|| jD ]\}	\}}
|	d dkrjtd|	 dt| j  |rndndgt| }|t| }|r|g| | }|rdndg| | }n||g|  }||rdndg|  }t||krtdt| d| t||krtdt| d| | jdkr||
j }n| jdkrt|
j}nt| j|	dk r%| jr%td td|
j  tdddd |D   tdddd |D   td|
j d| d  	t|||d qP|du r7 S |dkrqt sDtd ddl} fd!d"}|jj||j|jd#|jf|dg|dgd#|g f}|S |d$krt s~td%ddl}dd&l m!} |j"d'd  D |j#d(}|j"d)d  D |j#d(}| jdkr|j"d*d  D |j#d(}n| jdkr|j"d+d  D |jd(}||||}|S td,)-a  
        Convert examples in a list of `InputFeatures`

        Args:
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length
            pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token
            mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
                and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
                values)

        Returns:
            If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
            task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
            `InputFeatures` which can be fed to the model.

        Nc                 S   s   i | ]\}}||qS r   r   ).0ri   r   r   r   r   
<dictcomp>  s    zFSingleSentenceClassificationProcessor.get_features.<locals>.<dictcomp>i'  r   zTokenizing example T)Zadd_special_tokens
max_lengthc                 s   s    | ]}t |V  qd S rF   )r7   )rs   r%   r   r   r   	<genexpr>  s    zESingleSentenceClassificationProcessor.get_features.<locals>.<genexpr>zWriting example /r6   zError with input length z vs rI   Z
regression   z*** Example ***zguid: zinput_ids:  c                 S      g | ]}t |qS r   r!   rs   xr   r   r   
<listcomp>6      zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>zattention_mask: c                 S   rz   r   r{   r|   r   r   r   r~   7  r   zlabel: z (id = )r%   r&   r   tfz?return_tensors set to 'tf' but TensorFlow 2.0 can't be importedc                  3   s&     D ]} | j | jd| jfV  qd S )Nr%   r&   r   )exfeaturesr   r   genC  s   z?SingleSentenceClassificationProcessor.get_features.<locals>.genr   ptz8return_tensors set to 'pt' but PyTorch can't be imported)TensorDatasetc                 S      g | ]}|j qS r   )r%   rs   rD   r   r   r   r~   S      )Zdtypec                 S   r   r   )r&   r   r   r   r   r~   T  r   c                 S   r   r   r   r   r   r   r   r~   V  r   c                 S   r   r   r   r   r   r   r   r~   X  r   z,return_tensors should be one of 'tf' or 'pt')$max_lenre   rJ   rK   loggerinfoencoder   minrf   maxrm   r7   rk   rL   r   r)   rM   r   joinr$   r   RuntimeErrorZ
tensorflowdataZDatasetZfrom_generatorZint32Zint64ZTensorShaper   torchZtorch.utils.datar   Ztensorlong)r   Z	tokenizerru   Zpad_on_leftZ	pad_tokenZmask_padding_with_zeroZreturn_tensorsZ	label_mapZall_input_idsZex_indexr8   r%   Zbatch_lengthr&   Zpadding_lengthr   r   r   Zdatasetr   r   Zall_attention_maskZ
all_labelsr   r   r   get_features   s   




  

"
z2SingleSentenceClassificationProcessor.get_features)NNrI   F)rT   r   r6   NFrF   )rT   r   r6   NFFF)NNFF)NFr   TN)r   r   r   r    rN   rO   rS   rG   r`   rc   r\   ra   r   r   r   r   r   rH   }   s4    


(rH   )rA   r   r   r   typingr   r   r   utilsr   r   r	   Z
get_loggerr   r   r
   r$   r*   rH   r   r   r   r   <module>   s   
-