o
    rZh.                     @   s   d dl T d dlmZ edZedZedZedZedZ	edZ
ed	ZG d
d deZG dd deeZdS )    )*)XMLCorpusReaderz<p(?: [^>]*){0,1}>(.*?)</p>z<s(?: [^>]*){0,1}>(.*?)</s>z#<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>z!<[wc](?: [^>]*){0,1}>(.*?)</[wc]>ztype="(.*?)"zana="(.*?)"ztext id="(.*?)"c                   @   s0   e Zd Z			d
ddZdZdd Zdd	 ZdS )TEICorpusViewNr   c                 C   s,   || _ || _|| _|| _tj| ||d d S )N)startpos)_tagged_textids_group_by_sent_group_by_paraStreamBackedCorpusView__init__)selfZcorpus_fileZtaggedZgroup_by_sentZgroup_by_paraZtagsethead_lentextids r   H/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/pl196x.pyr      s
   
zTEICorpusView.__init__i   c                 C   sv  | | j}t|}|d|dks|ddkr;| }t|dkr&n||7 }|d|dks|ddks|dd}t|}| j	ru|D ])}|| j	vrt|
|d }||d  
dtd }|d | ||| d   }qKg }t|D ]<}	g }
t|	D ]$}| jst|}ntt| jt|}| jr|
| q|
| q| jr||
 q|||
 q||S )Nz<text idz</text>r   
    )	readlines	_pagesizeconcatcountreadlinelenreplaceTEXTIDfindallr   findPARASENTr   WORDlistmap
_parse_tag
TAGGEDWORDr   appendextendr	   )r   streamblocktmpr   tidbegendoutputZpara_strparaZsent_strsentr   r   r   
read_block,   sN   

zTEICorpusView.read_blockc                 C   sB   |\}}| drt|d}||fS t|d}||fS )Nwr   )
startswithANAsearchgroupTYPE)r   Ztag_word_tupletagwordr   r   r   r#   S   s   
zTEICorpusView._parse_tag)Nr   N)__name__
__module____qualname__r   r   r0   r#   r   r   r   r   r      s    
'r   c                   @   s   e Zd ZdZdd Zdd Zdd Zdd	d
Zdd ZdddZ	dddZ
dddZdddZdddZdddZdddZdddZdS ) Pl196xCorpusReaderi
  c                 O   sD   d|v r
|d | _ nd | _ tj| g|R   t| | |   d S )NZtextid_file)r   r   r   CategorizedCorpusReader_init_textids)r   argskwargsr   r   r   r   _   s   zPl196xCorpusReader.__init__c                 C   s   t t| _t t| _| jd urVt| j8}|D ],}| }|dd\}}||  vr4t	d| j|f || j
D ]}| || q:qW d    d S 1 sOw   Y  d S d S )N r   z(In text_id mapping file %s: %s not found)defaultdictr!   _f2t_t2fr   openstripsplitfileids
ValueErrorZ
_delimiter_add_textids)r   fplinefile_idZtext_idstext_idr   r   r   r>   j   s&   


"z Pl196xCorpusReader._init_textidsc                 C   s$   | j | | | j| | d S N)rC   r%   rD   )r   rM   rN   r   r   r   rJ   z   s   zPl196xCorpusReader._add_textidsNc                    s   d }t ttdd |||fdkrtd|d ur|d fS |d ur) |d fS |d urZt|tr5|g}t fdd|D g }t }|D ]}t	 j
| t	|@ ||< qF||fS d S )Nc                 S   s   | d u S rO   r   )accessorr   r   r   <lambda>   s    z-Pl196xCorpusReader._resolve.<locals>.<lambda>r   z6Specify exactly one of: fileids, categories or textidsc                 3       | ]} j | V  qd S rO   )rD   ).0tr   r   r   	<genexpr>       z.Pl196xCorpusReader._resolve.<locals>.<genexpr>)r   r!   filterrI   rH   
isinstancestrsumdictsetrC   )r   rH   
categoriesr   r)   filesZtdictfr   rU   r   _resolve~   s6   

zPl196xCorpusReader._resolvec                 C   s   |S rO   r   )r   r7   r   r   r   
decode_tag   s   zPl196xCorpusReader.decode_tagc                    sN     ||\}}|du rt jS t|tr|g}tt fdd|D g S )an  
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.
        Nc                 3   rR   rO   )rC   )rS   drU   r   r   rV      rW   z-Pl196xCorpusReader.textids.<locals>.<genexpr>)ra   sortedrD   rY   rZ   r[   r   rH   r^   _r   rU   r   r      s   

zPl196xCorpusReader.textidsc                    d     ||\}|d u r j}nt|tr|g}r't fdd|D S t fdd|D S )Nc              
      ,   g | ]}t  |d d d  j| dqS )Fr   r   r   abspathr   rS   Zfileidr   r   r   r   
<listcomp>       	z,Pl196xCorpusReader.words.<locals>.<listcomp>c              	      &   g | ]}t  |d d d  jdqS )Fr   rj   rl   rU   r   r   rn      s    ra   Z_fileidsrY   rZ   r   r   rH   r^   r   r   rm   r   words   s    
	
zPl196xCorpusReader.wordsc                    rg   )Nc              
      s,   g | ]}t  |d dd  j| dqS FTri   rj   rl   rm   r   r   rn      ro   z,Pl196xCorpusReader.sents.<locals>.<listcomp>c              	      s&   g | ]}t  |d dd  jdqS FTrq   rj   rl   rU   r   r   rn          rr   rs   r   rm   r   sents       
	
zPl196xCorpusReader.sentsc                    rg   )Nc              
      ,   g | ]}t  |d dd j| dqS ru   rj   rl   rm   r   r   rn      ro   z,Pl196xCorpusReader.paras.<locals>.<listcomp>c              	      &   g | ]}t  |d dd jdqS rv   rj   rl   rU   r   r   rn     rw   rr   rs   r   rm   r   paras   ry   zPl196xCorpusReader.parasc                    rg   )Nc              
      rz   TFri   rj   rl   rm   r   r   rn     ro   z3Pl196xCorpusReader.tagged_words.<locals>.<listcomp>c              	      r{   TFrq   rj   rl   rU   r   r   rn   *  rw   rr   rs   r   rm   r   tagged_words  ry   zPl196xCorpusReader.tagged_wordsc                    rg   )Nc              
      s,   g | ]}t  |d d d j| dqS r}   rj   rl   rm   r   r   rn   ;  ro   z3Pl196xCorpusReader.tagged_sents.<locals>.<listcomp>c              	      s&   g | ]}t  |d d d jdqS r~   rj   rl   rU   r   r   rn   I  rw   rr   rs   r   rm   r   tagged_sents2  ry   zPl196xCorpusReader.tagged_sentsc                    rg   )Nc              
      rh   )Tri   rj   rl   rm   r   r   rn   Z  ro   z3Pl196xCorpusReader.tagged_paras.<locals>.<listcomp>c              	      rp   )Trq   rj   rl   rU   r   r   rn   h  rw   rr   rs   r   rm   r   tagged_parasQ  ry   zPl196xCorpusReader.tagged_parasc                 C   s4   |  ||\}}t|dkrt| |d S td)Nr   r   zExpected a single file)ra   r   r   xml	TypeErrorre   r   r   r   r   p  s   zPl196xCorpusReader.xmlrO   )NN)NNN)r9   r:   r;   r   r   r>   rJ   ra   rb   r   rt   rx   r|   r   r   r   r   r   r   r   r   r<   \   s    
 


#



r<   N)Znltk.corpus.reader.apiZnltk.corpus.reader.xmldocsr   recompiler   r   r$   r    r6   r3   r   r
   r   r=   r<   r   r   r   r   <module>   s   






E