o
    rZh|                      @   s@   d dl Z d dlmZ d dlmZ d dlmZ G dd deZdS )    N)warn)ElementTree)CorpusReaderc                       sh   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd Z  ZS )BCP47CorpusReaderu~  
    Parse BCP-47 composite language tags

    Supports all the main subtags, and the 'u-sd' extension:

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('oc-gascon-u-sd-fr64')
    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

    Can load a conversion table to Wikidata Q-codes:
    >>> bcp47.load_wiki_q()
    >>> bcp47.wiki_q['en-GI-spanglis']
    'Q79388'

    c                    s   t  || i | _| d}| | d| _W d   n1 s%w   Y  | d}| t	
|d| _W d   n1 sFw   Y  |   dS )zRead the BCP-47 databasez!iana/language-subtag-registry.txtz%%
Nzcldr/common-subdivisions-en.xmlz+localeDisplayNames/subdivisions/subdivision)super__init__langcodeopen	data_dictreadsplitdbsubdiv_dictetparseZiterfindsubdiv
morphology)selfrootZfileidsfp	__class__ G/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/bcp47.pyr       s   zBCP47CorpusReader.__init__c                 C   sR   |  d}| |  ddd | _W d   dS 1 s"w   Y  dS )z:Load conversion table to Wikidata Q-codes (only if needed)z-cldr/tools-cldr-rdf-external-entityToCode.tsv
   N)r	   	wiki_dictr   stripr   Zwiki_q)r   r   r   r   r   load_wiki_q,   s   $"zBCP47CorpusReader.load_wiki_qc                 C   s   dd dd |D D S )z7Convert Wikidata list of Q-codes to a BCP-47 dictionaryc                 S   s$   i | ]}|d  |d  dd qS )r   r   /r   ).0pairr   r   r   
<dictcomp>3   s    z/BCP47CorpusReader.wiki_dict.<locals>.<dictcomp>c                 S   s   g | ]	}|  d qS )	)r   r   )r"   liner   r   r   
<listcomp>5       z/BCP47CorpusReader.wiki_dict.<locals>.<listcomp>r   )r   linesr   r   r   r   1   s   zBCP47CorpusReader.wiki_dictc                 C   s   dd |D S )z2Convert the CLDR subdivisions list to a dictionaryc                 S   s   i | ]	}|j d  |jqS )type)Zattribtext)r"   subr   r   r   r$   :   r(   z1BCP47CorpusReader.subdiv_dict.<locals>.<dictcomp>r   )r   Zsubdivsr   r   r   r   8   s   zBCP47CorpusReader.subdiv_dictc              
   C   s   t jt jt jt jt jd| _d}d}d}d}t|d  dt|d  t| |d  td|d	  d
|d  dt|d  |d d  t| d| _d S )N)languageextlangscriptregionvariantz[0-9]z[a-z]z[A-Z]z[a-zA-Z0-9]   ?(   z)|()   )r-   r.   r/   r0   r1   	singleton)strlowertitleuppercasingrecompileformat)r   diglowupZalnumr   r   r   r   <   s"   
zBCP47CorpusReader.morphologyc                 C   sv  |d  dd | _i }i |d< dD ]}i |d |< q|dd D ]}dd	 | d
D }|d d }|d d }||vrDi ||< i }|dd D ]C}	t|	dkrj|	\}
}|
|vrb|g||
< n||
 | n||
 d  d|	d   7  < d|vr|dkr|
dkr|| j||
 d < qL|D ]}
t||
 dkr||
 d ||
< qd|v r||d | |< q"||| |< q"|S )z;Convert the BCP-47 language subtag registry to a dictionaryr   z
File-Date: 
deprecated)r-   r.   r/   r0   r1   	redundantgrandfatheredr   Nc                 S   s   g | ]}| d qS ): r!   )r"   fieldr   r   r   r'   a   s    z/BCP47CorpusReader.data_dict.<locals>.<listcomp>r   r5   r     Z
Deprecatedr-   Description)replacer   versionr   lenappendr   )r   recordsZdiclabelrecordfieldstyptagZ	subfieldsrI   keyvalr   r   r   r
   Q   s@   	 zBCP47CorpusReader.data_dictc                 C   s   t |tkr
|d }|S )zReturn only first valuer   )r*   list)r   rW   r   r   r   val2str   s   zBCP47CorpusReader.val2strc                 C   s2   |d  }dD ]}||v r|d||  7 }q|S )zConcatenate subtag valuesr-   )r.   r/   r0   r1   	extensionrH   r   )r   Z	lg_recordnamerQ   r   r   r   lang2str   s   
zBCP47CorpusReader.lang2strc                 C   s  | d}i }g d}|r|r|d}d}|r|d}| j| |}| j| |r|| j| v rYd}| | j| | d }|dkrT||v rT||  d| 7  < n|||< nL|| jd	 | v rd}d
|d| d}	d| jd	 | | v r| jd	 | | d }
|	d| |
 d7 }	| | jd	 | | d ||< t|	 n|s|s|dkr|d dkr|d }|| jv r| j| }n)d| d}n"| d	dd |D  
 }| jd |sd| d}t| ||d< g }|r|s|S )z8Convert a BCP-47 tag to a dictionary of labelled subtags-)r-   r.   r/   r0   r1   r1   r   FTrK   r1   rH   rE   The rJ   z code is deprecatedPreferred-Valuez', prefer ''usdr   z<Unknown subdivision: >rD   c                 S   s   g | ]}d | qS )r]   r   )r"   extr   r   r   r'      s    z/BCP47CorpusReader.parse_tag.<locals>.<listcomp>r8   z<Invalid extension: rZ   )r   popr=   r@   	fullmatchr   rY   r   r   joinr:   )r   rU   ZsubtagslanglabelsZsubtagfoundrQ   Zvalstrnotepreferrb   rd   r   r   r   	parse_tag   sZ   



 *zBCP47CorpusReader.parse_tagc                 C   s  dD ]f}d}|| j | v r | j | | d  }d|d| }n>|| j d | v r^| j d | | d  }d|d| d}d| j d | | v r^| j d | | d }|d	| |7 }|rht| |  S qz	| | |W S    td
|d Y dS )z
        Convert a BCP-47 tag to a colon-separated string of subtag names

        >>> from nltk.corpus import bcp47
        >>> bcp47.name('ca-Latn-ES-valencia')
        'Catalan: Latin: Spain: Valencian'

        )rF   rG   NrK   r^   z	 code is rE   z and deprecatedr_   z	, prefer zTag z was not recognized)r   rY   r   r\   rm   )r   rU   rQ   rW   rk   rl   r   r   r   r[      s(   	zBCP47CorpusReader.name)__name__
__module____qualname____doc__r   r   r   r   r   r
   rY   r\   rm   r[   __classcell__r   r   r   r   r      s    .1r   )	r>   warningsr   Z	xml.etreer   r   Znltk.corpus.readerr   r   r   r   r   r   <module>   s
   