
    eTh\                        S SK r S SKrS SKrS SKrS SKrS SKrS SKJrJrJ	r	  S SK
r
S SKJr  S SKJr  SSKJr  SSKJr  \R&                  " \5      rSr " S	 S
\5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      rg)    N)DictListOptional)FileLock)Dataset   )PreTrainedTokenizer)loggingu   This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: {0}c            
       h    \ rS rSrSr  SS\S\S\S\\   4S jjr	S	 r
S
\R                  4S jrSrg)TextDataset(   @
This will be superseded by a framework-agnostic approach soon.
N	tokenizer	file_path
block_size	cache_dirc           
      (   [         R                  " [        R                  S5      [        5        [
        R                  R                  U5      SL a  [        SU S35      eX1R                  SS9-
  n[
        R                  R                  U5      u  pg[
        R                  R                  Ub  UOUSUR                  R                   SU SU 35      nUS-   n	[        U	5         [
        R                  R                  U5      (       a~  U(       dw  [         R                   " 5       n
[#        US	5       n[$        R&                  " U5      U l        S S S 5        [*        R-                  S
U S3[         R                   " 5       U
-
  5        GO>[*        R-                  SU 35        / U l        [#        USS9 nUR/                  5       nS S S 5        UR1                  UR3                  W5      5      n[5        S[7        U5      U-
  S-   U5       H1  nU R(                  R9                  UR;                  XX-    5      5        M3     [         R                   " 5       n
[#        US5       n[$        R<                  " U R(                  U[$        R>                  S9  S S S 5        [*        R-                  SU S[         R                   " 5       U
-
  S S35        S S S 5        g ! , (       d  f       GN= f! , (       d  f       GN&= f! , (       d  f       Nn= f! , (       d  f       g = f)Nchttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.pyFInput file path 
 not foundpair
cached_lm__.lockrb"Loading features from cached file  [took %.3f s]'Creating features from dataset file at utf-8encodingr      wbprotocol!Saving features into cached file  [took .3f s]) warningswarnDEPRECATION_WARNINGformatFutureWarningospathisfile
ValueErrornum_special_tokens_to_addsplitjoin	__class____name__r   existstimeopenpickleloadexamplesloggerinforeadconvert_tokens_to_idstokenizerangelenappend build_inputs_with_special_tokensdumpHIGHEST_PROTOCOL)selfr   r   r   overwrite_cacher   	directoryfilenamecached_features_file	lock_pathstarthandleftexttokenized_textis                   d/var/www/auris/envauris/lib/python3.13/site-packages/transformers/data/datasets/language_modeling.py__init__TextDataset.__init__-   s}    	&&u 		
 77>>)$-/	{*EFF"E"E5"E"QQ
 ggmmI6	!ww||".II,,556a
|1XJO 
 )72	i ww~~233O		.5$*KK$7DM 689M8Nn]_c_h_h_jmr_r
 Ei[QR ")g6!668D 7 "+!@!@ASASTXAY!Zq#n"5
"BQ"F
SAMM((!BB>VWVdCef T 		.5KKv@W@WX 678L7MWUYU^U^U`chUhilTmmpq; !  65 76 657 ! sW   $AL1KA#L0K BL/K2
;L
K	L 
K/	*L2
L 	<L
Lc                 ,    [        U R                  5      $ NrE   r>   rJ   s    rV   __len__TextDataset.__len__j       4==!!    returnc                 b    [         R                  " U R                  U   [         R                  S9$ )Ndtype)torchtensorr>   longrJ   rU   s     rV   __getitem__TextDataset.__getitem__m   s     ||DMM!,EJJ??r`   r>   )FN)r8   
__module____qualname____firstlineno____doc__r	   strintr   rW   r]   re   Tensorri   __static_attributes__ r`   rV   r   r   (   sV     #';&; ; 	; C=;z"@ @r`   r   c                   `    \ rS rSrSrS\S\S\4S jrS r	S\
\\R                  4   4S	 jrS
rg)LineByLineTextDatasetq   r   r   r   r   c           	         [         R                  " [        R                  S5      [        5        [
        R                  R                  U5      SL a  [        SU S35      e[        R                  SU 35        [        USS9 nUR                  5       R                  5        Vs/ s H-  n[        U5      S:  d  M  UR                  5       (       a  M+  UPM/     nnS S S 5        U" WS	S	US
9nUS   U l        U R                    Vs/ s H(  nS["        R$                  " U["        R&                  S90PM*     snU l        g s  snf ! , (       d  f       Nl= fs  snf )Nr   Fr   r   r   r    r!   r   Tadd_special_tokens
truncation
max_length	input_idsrc   )r+   r,   r-   r.   r/   r0   r1   r2   r3   r?   r@   r;   rA   
splitlinesrE   isspacer>   re   rf   rg   )	rJ   r   r   r   rR   linelinesbatch_encodinges	            rV   rW   LineByLineTextDataset.__init__v   s   &&u 		
 77>>)$-/	{*EFF 	=i[IJ)g.!&'ffh&9&9&;f&;dD	ATVZVbVbVdT&;Ef / #5Td_ij&{3SWS`S`aS`a+u||AUZZ'HIS`a	 g /.
 bs0   !D?#D::D:D:D?/E:D??
Ec                 ,    [        U R                  5      $ rZ   r[   r\   s    rV   r]   LineByLineTextDataset.__len__   r_   r`   ra   c                      U R                   U   $ rZ   rk   rh   s     rV   ri   !LineByLineTextDataset.__getitem__       }}Qr`   rk   Nr8   rl   rm   rn   ro   r	   rp   rq   rW   r]   r   re   rf   ri   rs   rt   r`   rV   rv   rv   q   sF    b"5 b# bSV b*" S%,,%6 7  r`   rv   c                   d    \ rS rSrSrS\S\S\S\4S jrS r	S	\
\\R                  4   4S
 jrSrg)LineByLineWithRefDataset   r   r   r   r   ref_pathc                    [         R                  " [        R                  S5      [        5        [
        R                  R                  U5      SL a  [        SU S35      e[
        R                  R                  U5      SL a  [        SU S35      e[        R                  SU 35        [        R                  SU 35        [        USS	9 nUR                  5       nS S S 5        W Vs/ s H;  n[        U5      S
:  d  M  UR                  5       (       a  M+  UR                  5       PM=     nn[        USS	9 nUR!                  5       R#                  5        Vs/ s HA  n[        U5      S
:  d  M  UR                  5       (       a  M+  [$        R&                  " U5      PMC     nnS S S 5        [        U5      [        W5      :w  a)  [        SU S[        U5       SU S[        U5       35      eU" USSUS9n	U	S   U l        U R(                   V
s/ s H(  n
S[*        R,                  " U
[*        R.                  S90PM*     sn
U l        [        U R(                  5      n[1        U5       H8  n[*        R,                  " X   [*        R.                  S9U R(                  U   S'   M:     g ! , (       d  f       GN= fs  snf s  snf ! , (       d  f       GN!= fs  sn
f )Nzghttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.pyFr   r   zRef file path r   zUse ref segment results at r    r!   r   zDLength of Input file should be equal to Ref file. But the length of z is z while length of Try   r}   rc   chinese_ref)r+   r,   r-   r.   r/   r0   r1   r2   r3   r?   r@   r;   	readlinesrE   r   striprA   r~   jsonloadsr>   re   rf   rg   rD   )rJ   r   r   r   r   rR   datar   refr   r   nrU   s                rV   rW   !LineByLineWithRefDataset.__init__   sJ   &&y 		
 77>>)$-/	{*EFF77>>(#u,~i[
CDD 	=i[IJ1(<=)g.!;;=D /)-VTQt||~

V(W-010C0C0Ep0E#d)VW-#`d`l`l`n#4::d#0ECp .t9C VW`Vaaefijnfoep q##+*DS
< 
 #4DT^hi&{3SWS`S`aS`a+u||AUZZ'HIS`aqA.3ll36.TDMM!]+ # /.V q .- bsN   J(J1?J1J16!J;J6.J6J6J;/K
J.6J;;
K
c                 ,    [        U R                  5      $ rZ   r[   r\   s    rV   r]    LineByLineWithRefDataset.__len__   r_   r`   ra   c                      U R                   U   $ rZ   rk   rh   s     rV   ri   $LineByLineWithRefDataset.__getitem__   r   r`   rk   Nr   rt   r`   rV   r   r      sP    "U"5 "U# "USV "Ube "UH" S%,,%6 7  r`   r   c                   j    \ rS rSrSrS\S\S\4S jrSS jr	S r
S	\\\R                  4   4S
 jrSrg)LineByLineWithSOPTextDataset   zQ
Dataset for sentence order prediction task, prepare sentence pairs for SOP task
r   file_dirr   c                    [         R                  " [        R                  S5      [        5        [
        R                  R                  U5      SL a  [        U S35      e[        R                  SU 35        / U l        [
        R                  " U5       GH;  n[
        R                  R                  X$5      n[
        R                  R                  U5      SL a  [        U S35      eSn[        USS9 nUR!                  5       n/ n	U H  n
SU
;   a  S	nM  S
U
;   a  SnU	SS   V
s/ s HK  n
[#        U
5      S:  d  M  U
R%                  5       (       a  M+  UR'                  UR)                  U
5      5      PMM     nn
U R+                  XU5      nU R                  R-                  U5        / n	M  U(       d  M  U	R/                  U
5        M     S S S 5        GM>     [        R                  S5        g s  sn
f ! , (       d  f       GMk  = f)Nr   Fz is not a directoryz.Creating features from dataset file folder at z is not a filer    r!   z<doc id=Tz</doc>r#   r   zDataset parse finished.)r+   r,   r-   r.   r/   r0   r1   isdirr3   r?   r@   r>   listdirr6   r2   r;   r   rE   r   rB   rC   create_examples_from_documentextendrF   )rJ   r   r   r   	file_namer   article_openrR   original_linesarticle_linesr   documentr>   s                rV   rW   %LineByLineWithSOPTextDataset.__init__   s   &&u 		
 77=="e+z)<=>>DXJOP H-IX9Iww~~i(E1 I;n!=>> Li'2a!" "*D!T)'+!T)', )6ab(9$(9 #D	A V6:lln VI;;I<N<Nt<TU(9 ! $ $(#E#Eh\e#f,,X6(*'<)006! + 32 .4 	-.$ 32s0   31G/$G*
;G*
$G*
68G/2G/*G//
G?	c                 ,   X#R                  SS9-
  nUn[        R                  " 5       U:  a  [        R                  " SU5      n/ n/ nSn	Sn
U
[        U5      :  Ga8  X   nU(       d  U
S-  n
M"  UR	                  U5        U	[        U5      -  n	U
[        U5      S-
  :X  d  X:  Ga  U(       Ga  Sn[        U5      S:  a#  [        R                  " S[        U5      S-
  5      n/ n[        U5       H  nUR                  X   5        M     / n[        U[        U5      5       H  nUR                  X   5        M     [        U5      S:X  d  [        U5      S:X  a  GM  [        R                  " 5       S:  a  SnXpOSnS nU" XU5        [        U5      S:  d  [        S	[        U5       S
35      e[        U5      S:  d  [        S[        U5       S
35      eUR                  X5      nUR                  X5      n[        R                  " U[        R                  S9[        R                  " U[        R                  S9[        R                  " U(       a  SOS[        R                  S9S.nUR	                  U5        / nSn	U
S-  n
U
[        U5      :  a  GM8  U$ )'Creates examples for a single document.Tr      r   r#         ?Fc                     [        U 5      [        U5      -   nX2::  a  g[        U 5      [        U5      :  a  U OUn[        U5      S:  d  [        S5      e[        R                  " 5       S:  a  US	 OUR                  5         M  )z;Truncates a pair of sequences to a maximum sequence length.r#   z8Sequence length to be truncated must be no less than oner   r   N)rE   r3   randompop)tokens_atokens_bmax_num_tokenstotal_lengthtrunc_tokenss        rV   truncate_seq_pairULineByLineWithSOPTextDataset.create_examples_from_document.<locals>.truncate_seq_pair-  sx    "+.x=3x=+HL+= %7:8}s8}7T8ZbL$'$5$:&01k&l l  &}}4$0O , 0 0 2 #r`   Length of sequence a is  which must be no less than 1Length of sequence b is rc   )r}   token_type_idssentence_order_label)r4   r   randintrE   rF   rD   r   r3   rG   $create_token_type_ids_from_sequencesre   rf   rg   )rJ   r   r   r   short_seq_probr   target_seq_lengthr>   current_chunkcurrent_lengthrU   segmenta_endr   jr   is_nextr   r}   r   examples                        rV   r   :LineByLineWithSOPTextDataset.create_examples_from_document   sa    $&I&It&I&TT +==?^+ &q. A #h-kGQ  )c'l*NCMA%%)L E=)Q. &q#m2Dq2H I!H"5\ (89 *  "H"5#m*<= (89 > 8})S]a-?  }},"'-5("&3  &h.IMQ.(+CCM?Ro)pqqMQ.(+CCM?Ro)pqq !* J J8 ^I%.%S%ST\%gN &+\\)5::%N*/,,~UZZ*X05'QqX]XbXb0cG
 OOG, "!"FAM #h-N r`   c                 ,    [        U R                  5      $ rZ   r[   r\   s    rV   r]   $LineByLineWithSOPTextDataset.__len__S  r_   r`   ra   c                      U R                   U   $ rZ   rk   rh   s     rV   ri   (LineByLineWithSOPTextDataset.__getitem__V  r   r`   rk   N)皙?)r8   rl   rm   rn   ro   r	   rp   rq   rW   r   r]   r   re   rf   ri   rs   rt   r`   rV   r   r      sJ    '/"5 '/ '/RU '/RaF" S%,,%6 7  r`   r   c                   f    \ rS rSrSr   SS\S\S\4S jjrS\	\	\      S\S\4S	 jr
S
 rS rSrg)$TextDatasetForNextSentencePredictioniZ  r   r   r   r   c           	      N   [         R                  " [        R                  S5      [        5        [
        R                  R                  U5      (       d  [        SU S35      eXPl	        X`l
        [
        R                  R                  U5      u  px[
        R                  R                  USUR                  R                   SU SU 35      n	Xl        U	S-   n
[!        U
5         [
        R                  R#                  U	5      (       a~  U(       dw  [$        R$                  " 5       n['        U	S5       n[(        R*                  " U5      U l        S S S 5        [.        R1                  SU	 S	3[$        R$                  " 5       U-
  5        GO[.        R1                  S
U 35        / /U l        ['        USS9 n UR5                  5       nU(       d  OUR7                  5       nU(       d7  [9        U R2                  S   5      S:w  a  U R2                  R;                  / 5        UR=                  U5      nUR?                  U5      nU(       a  U R2                  S   R;                  U5        M  S S S 5        [.        R1                  S[9        U R2                  5       S35        / U l        [A        U R2                  5       H  u  nnU RC                  UUU5        M     [$        R$                  " 5       n['        U	S5       n[(        RD                  " U R,                  U[(        RF                  S9  S S S 5        [.        R1                  SU	 S[$        R$                  " 5       U-
  S S35        S S S 5        g ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       Nn= f! , (       d  f       g = f)Nr   r   r   cached_nsp_r   r   r   r   r   r   r    r!   r   zCreating examples from z documents.r$   r%   r'   r(   r)   r*   )$r+   r,   r-   r.   r/   r0   r1   r2   r3   short_seq_probabilitynsp_probabilityr5   r6   r7   r8   r   r   r9   r:   r;   r<   r=   r>   r?   r@   	documentsreadliner   rE   rF   rC   rB   	enumerater   rH   rI   )rJ   r   r   r   rK   r   r   rL   rM   rN   rO   rP   rQ   rR   r   tokens	doc_indexr   s                     rV   rW   -TextDatasetForNextSentencePrediction.__init___  s    	&&u 		
 ww~~i((/	{*EFF%:". ggmmI6	!ww||)--667qAhZP 

 # )72	 i ww~~233O		.5$*KK$7DM 689M8Nn]_c_h_h_jmr_r Ei[QR"$)g6! zz|#!#zz|  $DNN2,>(?1(D NN11"5!*!3!3D!9!*!@!@!H! NN2.55f=  7 5c$..6I5J+VW "+4T^^+D'Ix66xJW ,E 		.5KKv@W@WX 678L7MWUYU^U^U`chUhilTmmpqG !  65 76* 65C ! sX   "AN/M!A$N/B1M3 BN./N;N!
M0	+N3
N	=N
N	N
N$r   r   c                    X0R                   R                  SS9-
  nUn[        R                  " 5       U R                  :  a  [        R                  " SU5      n/ nSnSnU[        U5      :  Ga  X   n	UR                  U	5        U[        U	5      -  nU[        U5      S-
  :X  d  Xu:  Ga  U(       Ga  Sn
[        U5      S:  a#  [        R                  " S[        U5      S-
  5      n
/ n[        U
5       H  nUR                  Xl   5        M     / n[        U5      S:X  d#  [        R                  " 5       U R                  :  a  SnU[        U5      -
  n[        S5       H8  n[        R                  " S[        U R                  5      S-
  5      nUU:w  d  M8    O   U R                  W   n[        R                  " S[        U5      S-
  5      n[        U[        U5      5       H(  nUR                  UU   5        [        U5      U:  d  M(    O   [        U5      U
-
  nUU-  nO1Sn[        U
[        U5      5       H  nUR                  Xl   5        M     [        U5      S:  d  [        S[        U5       S	35      e[        U5      S:  d  [        S
[        U5       S	35      eU R                   R                  X5      nU R                   R                  X5      n[        R                  " U[        R                   S9[        R                  " U[        R                   S9[        R                  " U(       a  SOS[        R                   S9S.nU R"                  R                  U5        / nSnUS-  nU[        U5      :  a  GM  gg)r   Tr   r   r   r#   
   Fr   r   r   rc   )r}   r   next_sentence_labelN)r   r4   r   r   r   rE   rF   rD   r   r   r   r3   rG   r   re   rf   rg   r>   )rJ   r   r   r   r   r   r   r   rU   r   r   r   r   r   is_random_nexttarget_b_lengthr   random_document_indexrandom_documentrandom_startnum_unused_segmentsr}   r   r   s                           rV   r   BTextDatasetForNextSentencePrediction.create_examples_from_document  s    $nn&N&NTX&N&YY +==?T777 &q. A#h-kG  )c'l*NCMA%%)L  E=)Q. &q#m2Dq2H I!H"5\ (89 *  "H=)Q.&--/DDXDX2X)-*;c(m*K "'rA4:NN1c$..FY\]F]4^14	A % "+
 +/..9N*O'-~~a_9MPQ9Q'R!&|S5I!JA$OOOA,>?"8}? % "K /2-.@5.H+00 */!&uc-.@!AA$OOM,<= "B  MQ.(+CCM?Ro)pqqMQ.(+CCM?Ro)pqq !% O OPX cI%)^^%X%XYa%lN &+\\)5::%N*/,,~UZZ*X/4||AUV^c^h^h/iG MM((1 "!"FAI #h-r`   c                 ,    [        U R                  5      $ rZ   r[   r\   s    rV   r]   ,TextDatasetForNextSentencePrediction.__len__  r_   r`   c                      U R                   U   $ rZ   rk   rh   s     rV   ri   0TextDatasetForNextSentencePrediction.__getitem__  r   r`   )r   r>   r   r   r   N)Fr   r   )r8   rl   rm   rn   ro   r	   rp   rq   rW   r   r   r]   ri   rs   rt   r`   rV   r   r   Z  sk     !S&S S 	SjXd49o XRU Xcf Xt" r`   r   )r   r0   r<   r   r:   r+   typingr   r   r   re   filelockr   torch.utils.datar   tokenization_utilsr	   utilsr
   
get_loggerr8   r?   r-   r   rv   r   r   r   rt   r`   rV   <module>r      s     	     ' '   $ 5  
		H	%L F@' F@R G  B- w - `U 7 U px 7 x r`   