o
    Zhk                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ d dlmZ ddlmZ dd	lmZmZ d
dlmZmZmZ d
dlmZ eeZeG dd dZG dd deZ G dd deZ!dS )    N)	dataclassfield)Enum)ListOptionalUnion)FileLock)Dataset   )PreTrainedTokenizerBase)check_torch_load_is_safelogging   )!glue_convert_examples_to_featuresglue_output_modesglue_processors)InputFeaturesc                   @   s   e Zd ZU dZeddde  idZe	e
d< eddidZe	e
d< ed	dd
idZee
d< edddidZee
d< dd ZdS )GlueDataTrainingArgumentsz
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
    line.
    helpz"The name of the task to train on: z, )metadata	task_namezUThe input data dir. Should contain the .tsv files (or other data files) for the task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.)defaultr   max_seq_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachec                 C   s   | j  | _ d S N)r   lowerself r    N/var/www/auris/lib/python3.10/site-packages/transformers/data/datasets/glue.py__post_init__=   s   z'GlueDataTrainingArguments.__post_init__N)__name__
__module____qualname____doc__r   joinr   keysr   str__annotations__r   r   intr   boolr"   r    r    r    r!   r   #   s    
 $	r   c                   @   s   e Zd ZdZdZdZdS )SplittraindevtestN)r#   r$   r%   r.   r/   r0   r    r    r    r!   r-   A   s    r-   c                   @   s   e Zd ZU dZeed< eed< ee ed< de	j
dfdededee deee	f d	ee f
d
dZdd ZdefddZdd ZdS )GlueDatasetzH
    This will be superseded by a framework-agnostic approach soon.
    argsoutput_modefeaturesN	tokenizerlimit_lengthmode	cache_dirc                 C   s&  t dt || _t|j  | _t|j | _t	|t
r-zt| }W n ty,   tdw tj|d ur6|n|jd|j d|jj d|j d|j }| j }|jdv ri|jjdv ri|d |d |d< |d< || _|d	 }t| tj|r|jst }	t  tj|d
d| _t d| dt |	  ndt d|j  |tj!kr| j"|j}
n|tj#kr| j$|j}
n| j%|j}
|d ur|
d | }
t&|
||j|| jd| _t }	t'| j| t d| dt |	 dd W d    d S W d    d S 1 sw   Y  d S )Nu  This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.pyzmode is not a valid split nameZcached__)Zmnlizmnli-mm)ZRobertaTokenizerZRobertaTokenizerFastZXLMRobertaTokenizerZBartTokenizerZBartTokenizerFastr      z.lockT)Zweights_onlyz"Loading features from cached file z [took %.3f s]z'Creating features from dataset file at )
max_length
label_listr3   z!Saving features into cached file z [took z.3fz s])(warningswarnFutureWarningr2   r   r   	processorr   r3   
isinstancer)   r-   KeyErrorospathr'   r   value	__class__r#   r   
get_labelsr<   r   existsr   timer   torchloadr4   loggerinfor/   Zget_dev_examplesr0   Zget_test_examplesZget_train_examplesr   save)r   r2   r5   r6   r7   r8   Zcached_features_filer<   Z	lock_pathstartZexamplesr    r    r!   __init__P   sj   
$



$zGlueDataset.__init__c                 C   s
   t | jS r   )lenr4   r   r    r    r!   __len__      
zGlueDataset.__len__returnc                 C   s
   | j | S r   )r4   )r   ir    r    r!   __getitem__   rS   zGlueDataset.__getitem__c                 C   s   | j S r   )r<   r   r    r    r!   rG      s   zGlueDataset.get_labels)r#   r$   r%   r&   r   r*   r)   r   r   r-   r.   r   r   r+   r   rP   rR   rV   rG   r    r    r    r!   r1   G   s,   
 

Kr1   )"rC   rI   r=   dataclassesr   r   enumr   typingr   r   r   rJ   Zfilelockr   Ztorch.utils.datar	   Ztokenization_utils_baser   utilsr   r   Zprocessors.gluer   r   r   Zprocessors.utilsr   Z
get_loggerr#   rL   r   r-   r1   r    r    r    r!   <module>   s$   
