
    eTh[$                     b   S SK r S SKrS SKJrJr  S SKJr  S SKJrJ	r	J
r
Jr  S SKrS SKJr  S SKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJrJrJrJr  \R:                  " \5      r\ " \RB                  " 5       5      r"\#" S \" 5       5      r$\ " S S5      5       r% " S S\5      r& " S S\5      r'g)    N)	dataclassfield)Enum)DictListOptionalUnion)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)check_torch_load_is_safelogging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc              #   8   #    U  H  oR                   v   M     g 7fN)
model_type).0confs     X/var/www/auris/envauris/lib/python3.13/site-packages/transformers/data/datasets/squad.py	<genexpr>r   "   s     E0DOO0Ds   c                      \ rS rSr% Sr\" SSSSR                  \5      -   0S9r\	\
S'   \" SSS	0S9r\	\
S
'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\
S'   \" S SS!0S9r\\
S"'   \" S#SS$0S9r\\
S%'   S&rg)'SquadDataTrainingArguments%   zZ
Arguments pertaining to what data we are going to input our model for training and eval.
Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads )__name__
__module____qualname____firstlineno____doc__r   joinMODEL_TYPESr   str__annotations__r#   r%   intr&   r(   r*   r+   boolr,   r-   floatr/   r0   r2   __static_attributes__r3       r   r   r   %   s    (KdiiXcNd(deJ  (pqHc   Q
NC  rsJ  "/
c  #J
s  ")\ ]OT  %*)o p%T  (-v'rs(u  f&qrK  C
GS  f6k-lmGSmrA   r   c                       \ rS rSrSrSrSrg)Splith   traindevr3   N)r4   r5   r6   r7   rE   rF   r@   r3   rA   r   rC   rC   h   s    E
CrA   rC   c                       \ rS rSr% Sr\\S'   \\   \S'   \	\S'   \
\S'   S\	R                  SSS	4S\S
\S\\   S\\\	4   S\\
   S\\   S\\   4S jjrS rS\\\R*                  4   4S jrSrg)SquadDatasetm   z@
This will be superseded by a framework-agnostic approach soon.
argsfeaturesmodeis_language_sensitiveNFpt	tokenizerlimit_length	cache_dirdataset_formatc                    Xl         XPl        UR                  (       a
  [        5       O	[	        5       U l        [        U[        5      (       a
   [        U   nX@l
        UR                  (       a  SOSn[        R                  R                  Ub  UOUR                  SUR                   SUR                   R"                   SUR$                   SU 35      n	U	S-   n
['        U
5         [        R                  R)                  U	5      (       Ga  UR*                  (       d  [,        R,                  " 5       n[/        5         [0        R2                  " U	SS9U l        U R4                  S	   U l        U R4                  R9                  S
S 5      U l        U R4                  R9                  SS 5      U l        [>        RA                  SU	 S3[,        R,                  " 5       U-
  5        U R:                  b  U R<                  c  [>        RC                  SU	 S35        GOJU[        RD                  :X  a+  U R
                  RG                  UR                  5      U l        O*U R
                  RI                  UR                  5      U l        [K        U R<                  UUR$                  URL                  URN                  U[        RP                  :H  URR                  US9u  U l        U l        [,        R,                  " 5       n[0        RT                  " U R6                  U R:                  U R<                  S.U	5        [>        RA                  SU	 S[,        R,                  " 5       U-
  S S35        S S S 5        g ! [         a    [        S5      ef = f! , (       d  f       g = f)Nzmode is not a valid split namev2v1cached__z.lockT)weights_onlyrK   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rZ   rO   r%   r&   r(   is_trainingr2   return_dataset)rK   rY   rZ   z!Saving features into cached file z [took z.3fz s])+rJ   rM   r,   r   r   	processor
isinstancer;   rC   KeyErrorrL   ospathr9   r#   value	__class__r4   r%   r
   existsr+   timer   torchloadold_featuresrK   getrY   rZ   loggerinfowarningrF   get_dev_examplesget_train_examplesr   r&   r(   rE   r2   save)selfrJ   rO   rP   rL   rM   rQ   rR   version_tagcached_features_file	lock_pathstarts               r   __init__SquadDataset.__init__w   s    	%:"/3/K/K)+QaQcdC  AT{ 	"::d!ww||".IDMMdjj\9#6#6#?#?"@$BUBUAVVWXcWde 
 )72	i ww~~233D<P<P		(*$)JJ/CRV$W! !% 1 1* =#0044YE $ 1 1 5 5j$ G89M8Nn]_c_h_h_jmr_r <<'4==+@NN/0D/E F& &
 599$$(NN$C$CDMM$RDM$(NN$E$Edmm$TDM.P!]]'#'#6#6#%)%:%: $ 3 LL#1	/+t| 		

!%4<<UYUbUbc(
 78L7MWUYU^U^U`chUhilTmmpqW !   A?@@A ! s   	M" $I5M;"M8;
N	c                 ,    [        U R                  5      $ r   )lenrK   )rp   s    r   __len__SquadDataset.__len__   s    4==!!rA   returnc                    U R                   U   n[        R                  " UR                  [        R                  S9n[        R                  " UR
                  [        R                  S9n[        R                  " UR                  [        R                  S9n[        R                  " UR                  [        R                  S9n[        R                  " UR                  [        R                  S9n[        R                  " UR                  [        R                  S9nUUUS.n	U R                  R                  S;   a  U	S	 U R                  R                  S;   a  U	R                  XgS.5        U R                  R                  (       a  U	R                  SU05        U R                  (       aU  U	R                  S[        R                   " UR"                  [        R$                  S9U R                  R&                  -  05        U R(                  [*        R,                  :X  am  [        R                  " UR.                  [        R                  S9n
[        R                  " UR0                  [        R                  S9nU	R                  XS	.5        U	$ )
N)dtype)	input_idsattention_masktoken_type_ids)xlmroberta
distilbert	camembertr   )xlnetr   )	cls_indexp_maskis_impossiblelangs)start_positionsend_positions)rK   rf   tensorr~   longr   r   r   r   r?   r   rJ   r   updater,   rM   onesshapeint64r0   rL   rC   rE   start_positionend_position)rp   ifeaturer~   r   r   r   r   r   inputsr   r   s               r   __getitem__SquadDataset.__getitem__   s   --"LL!2!2%**E	g&<&<EJJOg&<&<EJJOLL!2!2%**E	gnnEKK@W%:%:%++N #,,
 99#PP'(99#33MM	DEyy00>?))wIOO5;;)WZ^ZcZcZkZk)kmn99##ll7+A+ATO!LL)=)=UZZPMMMo^_rA   )rJ   rY   rZ   rK   rM   rL   rh   r]   )r4   r5   r6   r7   r8   r   r<   r   r   rC   r>   rE   r   r   r=   r	   r;   ru   ry   r   rf   Tensorr   r@   r3   rA   r   rH   rH   m   s     %$=!!
K '+"'++05#'(,J(J 'J sm	J
 CJJ  (~J C=J !JX" S%,,%6 7  rA   rH   )(r`   re   dataclassesr   r   enumr   typingr   r   r   r	   rf   filelockr
   torch.utils.datar   models.auto.modeling_autor   tokenization_utilsr   utilsr   r   processors.squadr   r   r   r   
get_loggerr4   rj   listkeysMODEL_CONFIG_CLASSEStupler:   r   rC   rH   r3   rA   r   <module>r      s    
  (  . .   $ M 5 6 t t 
		H	%@EEGH E0DEE ?n ?n ?nDD 
y7 yrA   