o
    rZh                     @   sl   d Z ddlmZ ddlmZ zddlZW n ey   dZY nw G dd dZdd Z	e
d	kr4e	  dS dS )
a  
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and text-yet to
be identified-then compares using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
https://borel.slu.edu/crubadan/index.html
    )maxsize)trigramsNc                   @   sP   e Zd ZdZi ZdZdZi Zdd Zdd Z	dd	 Z
d
d Zdd Zdd ZdS )TextCatN<>c                 C   s>   t stdddlm} || _| j D ]}| j| qd S )Nzclassify.textcat requires the regex module that supports unicode. Try '$ pip install regex' and see https://pypi.python.org/pypi/regex for further details.r   )crubadan)reOSErrornltk.corpusr   _corpuslangs	lang_freq)selfr   lang r   D/var/www/auris/lib/python3.10/site-packages/nltk/classify/textcat.py__init__7   s   zTextCat.__init__c                 C   s   t dd|S )z)Get rid of punctuation except apostrophesz[^\P{P}\']+ )r   subr   textr   r   r   remove_punctuationG   s   zTextCat.remove_punctuationc                 C   s   ddl m}m} | |}||}| }|D ])}t| j| | j }dd |D }	|	D ]}
|
|v r:||
  d7  < q+d||
< q+q|S )z'Create FreqDist of trigrams within textr   )FreqDistword_tokenizec                 S   s   g | ]}d  |qS )r   )join).0Ztrir   r   r   
<listcomp>U   s    z#TextCat.profile.<locals>.<listcomp>   )Znltkr   r   r   r   _START_CHAR	_END_CHAR)r   r   r   r   Z
clean_texttokensfingerprinttZtoken_trigram_tuplesZtoken_trigramsZcur_trigramr   r   r   profileK   s   

zTextCat.profilec                 C   sT   | j |}d}||v r&t| |}t| |}t|| }|S t}|S )zgCalculate the "out-of-place" measure between the
        text and language profile for a single trigramr   )r   r   listkeysindexabsr   )r   r   trigramZtext_profileZlang_fddistZidx_lang_profileZidx_textr   r   r   	calc_dist_   s   zTextCat.calc_distc                 C   sL   i }|  |}| jj D ]}d}|D ]}|| |||7 }q|||< q|S )zOCalculate the "out-of-place" measure between
        the text and all languagesr   )r#   r   Z_all_lang_freqr%   r*   )r   r   Z	distancesr#   r   Z	lang_distr(   r   r   r   
lang_distst   s   

zTextCat.lang_distsc                 C   s   |  || _t| j| jjdS )zYFind the language with the min distance
        to the text and return its ISO 639-3 code)key)r+   last_distancesmingetr   r   r   r   guess_language   s   zTextCat.guess_language)__name__
__module____qualname__r   Zfingerprintsr   r   r-   r   r   r#   r*   r+   r0   r   r   r   r   r   /   s    r   c            
   
      s   ddl m}  g d}dddddd	d
ddd	}t }|D ]X}| |td }ttt}d}td|D ] dd fddtd|  D  }||7 }q6t	d|dd  d  |
|}	t	d|	 d||	  d t	d qd S )Nr   )udhr)	zKurdish-UTF8zAbkhaz-UTF8zFarsi_Persian-UTF8z
Hindi-UTF8zHawaiian-UTF8zRussian-UTF8zVietnamese-UTF8zSerbian_Srpski-UTF8zEsperanto-UTF8zNorthern KurdishZ	AbkhazianzIranian PersianZHindiZHawaiianRussianZ
VietnameseZSerbianZ	Esperanto)	ZkmrZabkZpesZhinZhawZrusZvieZsrpZepor   r    c                    s   g | ]}  | qS r   r   )r   jiZraw_sentencesr   r   r      s    zdemo.<locals>.<listcomp>zLanguage snippet:    z...zLanguage detection: z ()z############################################################################################################################################)r
   r4   r   Zsentslenr$   mapranger   printr0   )
r4   r   friendlyZtcZcur_langrowscolssampleZcur_sentguessr   r8   r   demo   s4   
(


rE   __main__)__doc__sysr   Z	nltk.utilr   regexr   ImportErrorr   rE   r1   r   r   r   r   <module>   s   `1
