o
    Zh\                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZm	Z	 d dl
Z
d dlmZ d dlmZ ddlmZ ddlmZ eeZdZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZdS )    N)DictListOptional)FileLock)Dataset   )PreTrainedTokenizer)loggingu   This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: {0}c                
   @   sL   e Zd ZdZ		ddedededee fdd	Zd
d Z	de
jfddZdS )TextDatasetH
    This will be superseded by a framework-agnostic approach soon.
    FN	tokenizer	file_path
block_size	cache_dirc              
   C   s4  t tdt tj|du rtd| d||j	dd }tj
|\}}tj|d ur2|n|d|jj d| d| }|d }	t|	 tj|r|st }
t|d	}t|| _W d    n1 slw   Y  td
| dt |
  ntd|  g | _t|dd}| }W d    n1 sw   Y  |||}tdt|| d |D ]}| j|||||   qt }
t|d}tj| j|tjd W d    n1 sw   Y  td| dt |
 dd W d    d S W d    d S 1 sw   Y  d S )Nchttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.pyFInput file path 
 not foundpairZ
cached_lm__.lockrb"Loading features from cached file  [took %.3f s]'Creating features from dataset file at utf-8encodingr      wbprotocol!Saving features into cached file  [took .3f s]) warningswarnDEPRECATION_WARNINGformatFutureWarningospathisfile
ValueErrornum_special_tokens_to_addsplitjoin	__class____name__r   existstimeopenpickleloadexamplesloggerinforeadconvert_tokens_to_idstokenizerangelenappend build_inputs_with_special_tokensdumpHIGHEST_PROTOCOL)selfr   r   r   overwrite_cacher   	directoryfilenamecached_features_file	lock_pathstarthandleftextZtokenized_texti rP   [/var/www/auris/lib/python3.10/site-packages/transformers/data/datasets/language_modeling.py__init__-   sX   

$zTextDataset.__init__c                 C   
   t | jS Nr@   r9   rE   rP   rP   rQ   __len__j      
zTextDataset.__len__returnc                 C   s   t j| j| t jdS )NZdtype)torchtensorr9   longrE   rO   rP   rP   rQ   __getitem__m   s   zTextDataset.__getitem__)FN)r3   
__module____qualname____doc__r   strintr   rR   rW   r[   ZTensorr_   rP   rP   rP   rQ   r
   (   s    	
=r
   c                   @   sF   e Zd ZdZdededefddZdd Zd	e	ee
jf fd
dZdS )LineByLineTextDatasetr   r   r   r   c                 C   s   t tdt tj|du rtd| dt	
d|  t|dd}dd	 |  D }W d    n1 s=w   Y  ||d
d
|d}|d | _dd	 | jD | _d S )Nr   Fr   r   r   r   r   c                 S   s$   g | ]}t |d kr| s|qS r   )r@   isspace.0linerP   rP   rQ   
<listcomp>   s   $ z2LineByLineTextDataset.__init__.<locals>.<listcomp>TZadd_special_tokensZ
truncation
max_length	input_idsc                 S       g | ]}d t j|t jdiqS rn   rZ   r[   r\   r]   ri   erP   rP   rQ   rk           )r&   r'   r(   r)   r*   r+   r,   r-   r.   r:   r;   r6   r<   
splitlinesr9   )rE   r   r   r   rM   linesbatch_encodingrP   rP   rQ   rR   v   s   
zLineByLineTextDataset.__init__c                 C   rS   rT   rU   rV   rP   rP   rQ   rW      rX   zLineByLineTextDataset.__len__rY   c                 C   
   | j | S rT   r9   r^   rP   rP   rQ   r_      rX   z!LineByLineTextDataset.__getitem__Nr3   r`   ra   rb   r   rc   rd   rR   rW   r   r[   r\   r_   rP   rP   rP   rQ   re   q   s
    re   c                   @   sJ   e Zd ZdZdedededefddZdd	 Zd
e	ee
jf fddZdS )LineByLineWithRefDatasetr   r   r   r   ref_pathc              
   C   s  t tdt tj|du rtd| dtj|du r)td| dt	
d|  t	
d|  t|dd	}| }W d    n1 sNw   Y  d
d |D }t|dd	}dd |  D }W d    n1 svw   Y  t|t|krtd| dt| d| dt| ||dd|d}|d | _dd | jD | _t| j}	t|	D ]}
tj||
 tjd| j|
 d< qd S )Nzghttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.pyFr   r   zRef file path r   zUse ref segment results at r   r   c                 S   s(   g | ]}t |d kr| s| qS rf   )r@   rg   striprh   rP   rP   rQ   rk      s   ( z5LineByLineWithRefDataset.__init__.<locals>.<listcomp>c                 S   s*   g | ]}t |d kr| st|qS rf   )r@   rg   jsonloadsrh   rP   rP   rQ   rk      s   * zDLength of Input file should be equal to Ref file. But the length of z is z while length of Trl   rn   c                 S   ro   rp   rq   rr   rP   rP   rQ   rk      rt   rZ   Zchinese_ref)r&   r'   r(   r)   r*   r+   r,   r-   r.   r:   r;   r6   	readlinesr<   ru   r@   r9   r?   r[   r\   r]   )rE   r   r   r   r|   rM   datarefrw   nrO   rP   rP   rQ   rR      sD   


 z!LineByLineWithRefDataset.__init__c                 C   rS   rT   rU   rV   rP   rP   rQ   rW      rX   z LineByLineWithRefDataset.__len__rY   c                 C   rx   rT   ry   r^   rP   rP   rQ   r_      rX   z$LineByLineWithRefDataset.__getitem__Nrz   rP   rP   rP   rQ   r{      s
    $r{   c                   @   sP   e Zd ZdZdededefddZddd	Zd
d Z	de
eejf fddZdS )LineByLineWithSOPTextDatasetzY
    Dataset for sentence order prediction task, prepare sentence pairs for SOP task
    r   file_dirr   c              	      s8  t tdt tj|du rt| dt	
d|  g | _t|D ]l}tj||}tj|du r@t| dd}t|ddD}| }g }	|D ]3}
d|
v rZd	}qQd
|
v r}d} fdd|	dd  D }| || }| j| g }	qQ|r|	|
 qQW d    n1 sw   Y  q(t	
d d S )Nr   Fz is not a directoryz.Creating features from dataset file folder at z is not a filer   r   z<doc id=Tz</doc>c                    s0   g | ]}t |d kr| s  |qS rf   )r@   rg   r=   r>   rh   r   rP   rQ   rk      s
    z9LineByLineWithSOPTextDataset.__init__.<locals>.<listcomp>r   zDataset parse finished.)r&   r'   r(   r)   r*   r+   r,   isdirr.   r:   r;   r9   listdirr1   r-   r6   r   create_examples_from_documentextendrA   )rE   r   r   r   	file_namer   Zarticle_openrM   Zoriginal_linesZarticle_linesrj   documentr9   rP   r   rQ   rR      sH   


z%LineByLineWithSOPTextDataset.__init__皙?c                 C   s  ||j dd }|}t |k rtd|}g }g }d}	d}
|
t|k r||
 }|s/|
d7 }
q|| |	t|7 }	|
t|d ksF|	|kr|rd}t|dkrZtdt|d }g }t|D ]	}|||  q`g }t|t|D ]	}|||  qst|dkst|dkrqt dk rd}||}}nd}dd	 }|||| t|dkstd
t| dt|dkstdt| d|||}|	||}t
j|t
jdt
j|t
jdt
j|rdndt
jdd}|| g }d}	|
d7 }
|
t|k s$|S )'Creates examples for a single document.Tr      r   r         ?Fc                 S   sh   	 t | t | }||krdS t | t |kr| n|}t |dks%tdt dk r/|d= n|  q)z;Truncates a pair of sequences to a maximum sequence length.Tr   z8Sequence length to be truncated must be no less than oner   r   N)r@   r.   randompop)tokens_atokens_bmax_num_tokenstotal_lengthZtrunc_tokensrP   rP   rQ   truncate_seq_pair-  s   zULineByLineWithSOPTextDataset.create_examples_from_document.<locals>.truncate_seq_pairLength of sequence a is  which must be no less than 1Length of sequence b is rZ   )rn   token_type_idsZsentence_order_label)r/   r   randintr@   rA   r?   r   r.   rB   $create_token_type_ids_from_sequencesr[   r\   r]   )rE   r   r   r   Zshort_seq_probr   target_seq_lengthr9   current_chunkcurrent_lengthrO   segmenta_endr   jr   Zis_nextr   rn   r   examplerP   rP   rQ   r      sd   	

Gz:LineByLineWithSOPTextDataset.create_examples_from_documentc                 C   rS   rT   rU   rV   rP   rP   rQ   rW   S  rX   z$LineByLineWithSOPTextDataset.__len__rY   c                 C   rx   rT   ry   r^   rP   rP   rQ   r_   V  rX   z(LineByLineWithSOPTextDataset.__getitem__N)r   )r3   r`   ra   rb   r   rc   rd   rR   r   rW   r   r[   r\   r_   rP   rP   rP   rQ   r      s    
)cr   c                   @   s\   e Zd ZdZ			ddededefdd	Zd
eee  dedefddZ	dd Z
dd ZdS )$TextDatasetForNextSentencePredictionr   Fr   r   r   r   r   c              	   C   sz  t tdt tj|std| d|| _	|| _
tj|\}}tj|d|jj d| d| }	|| _|	d }
t|
 tj|	ry|syt }t|	d}t|| _W d    n1 sew   Y  td|	 d	t |  ntd
|  g g| _t|dd:}	 | }|sn*| }|st| jd dkr| jg  ||}||}|r| jd | qW d    n1 sw   Y  tdt| j d g | _t | jD ]\}}| !||| qt }t|	d}tj"| j|tj#d W d    n	1 sw   Y  td|	 dt | dd W d    d S W d    d S 1 s6w   Y  d S )Nr   r   r   Zcached_nsp_r   r   r   r   r   r   r   r   Tr   zCreating examples from z documents.r   r    r"   r#   r$   r%   )$r&   r'   r(   r)   r*   r+   r,   r-   r.   short_seq_probabilitynsp_probabilityr0   r1   r2   r3   r   r   r4   r5   r6   r7   r8   r9   r:   r;   	documentsreadliner}   r@   rA   r>   r=   	enumerater   rC   rD   )rE   r   r   r   rF   r   r   rG   rH   rI   rJ   rK   rL   rM   rj   tokens	doc_indexr   rP   rP   rQ   rR   _  sr   	


$z-TextDatasetForNextSentencePrediction.__init__r   r   c                 C   s|  || j jdd }|}t | jk rtd|}g }d}d}|t|k r<|| }	||	 |t|	7 }|t|d ksA||kr0|r,d}
t|dkrVtdt|d }
g }t|
D ]	}|||  q\g }t|dksut | j	k rd}|t| }tdD ]}tdt| j
d }||kr nq| j
| }tdt|d }t|t|D ]}|||  t||kr nqt||
 }||8 }nd}t|
t|D ]	}|||  qt|dkstdt| d	t|dkstd
t| d	| j ||}| j ||}tj|tjdtj|tjdtj|rdndtjdd}| j| g }d}|d7 }|t|k s%dS dS )r   Tr   r   r   r   
   Fr   r   r   rZ   )rn   r   Znext_sentence_labelN)r   r/   r   r   r   r@   rA   r?   r   r   r   r.   rB   r   r[   r\   r]   r9   )rE   r   r   r   r   r   r   r   rO   r   r   r   r   r   Zis_random_nextZtarget_b_lengthr   Zrandom_document_indexZrandom_documentZrandom_startZnum_unused_segmentsrn   r   r   rP   rP   rQ   r     sn   	


zBTextDatasetForNextSentencePrediction.create_examples_from_documentc                 C   rS   rT   rU   rV   rP   rP   rQ   rW     rX   z,TextDatasetForNextSentencePrediction.__len__c                 C   rx   rT   ry   r^   rP   rP   rQ   r_     rX   z0TextDatasetForNextSentencePrediction.__getitem__N)Fr   r   )r3   r`   ra   rb   r   rc   rd   rR   r   r   rW   r_   rP   rP   rP   rQ   r   Z  s    	
UZr   )r~   r+   r7   r   r5   r&   typingr   r   r   r[   Zfilelockr   Ztorch.utils.datar   Ztokenization_utilsr   utilsr	   Z
get_loggerr3   r:   r(   r
   re   r{   r   r   rP   rP   rP   rQ   <module>   s*   
I!0 