
    /h                     |    S r SSKJr  SSKJr   SSKr " S S5      rS r	\
S:X  a  \	" 5         gg! \ a    Sr N&f = f)	a  
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and text-yet to
be identified-then compares using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
https://borel.slu.edu/crubadan/index.html
    )maxsize)trigramsNc                   L    \ rS rSrSr0 rSrSr0 rS r	S r
S rS rS	 rS
 rSrg)TextCat/   N<>c                     [         (       d  [        S5      eSSKJn  Xl        U R                  R                  5        H  nU R                  R                  U5        M      g )Nzclassify.textcat requires the regex module that supports unicode. Try '$ pip install regex' and see https://pypi.python.org/pypi/regex for further details.r   )crubadan)reOSErrornltk.corpusr   _corpuslangs	lang_freq)selfr   langs      M/var/www/auris/envauris/lib/python3.13/site-packages/nltk/classify/textcat.py__init__TextCat.__init__7   sL    r#  	)LL&&(DLL""4( )    c                 2    [         R                  " SSU5      $ )z)Get rid of punctuation except apostrophesz[^\P{P}\']+ )r   subr   texts     r   remove_punctuationTextCat.remove_punctuationG   s    vvnb$//r   c                 >   SSK JnJn  U R                  U5      nU" U5      nU" 5       nU Hi  n[	        U R
                  U-   U R                  -   5      nU V	s/ s H  n	SR                  U	5      PM     n
n	U
 H  nX;   a  Xk==   S-  ss'   M  SXk'   M     Mk     U$ s  sn	f )z'Create FreqDist of trigrams within textr   )FreqDistword_tokenizer      )nltkr    r!   r   r   _START_CHAR	_END_CHARjoin)r   r   r    r!   
clean_texttokensfingerprintttoken_trigram_tuplestritoken_trigramscur_trigrams               r   profileTextCat.profileK   s    0,,T2
z*jA#+D,<,<q,@4>>,Q#R 6JK6Jsbggcl6JNK--,1,/0K,	  .	   Ls   Bc                    U R                   R                  U5      nSnX$;   a_  [        UR                  5       5      R	                  U5      n[        UR                  5       5      R	                  U5      n[        Xg-
  5      nU$ [        nU$ )z_Calculate the "out-of-place" measure between the
text and language profile for a single trigramr   )r   r   listkeysindexabsr   )r   r   trigramtext_profilelang_fddistidx_lang_profileidx_texts           r   	calc_distTextCat.calc_dist_   s     ,,((.#GLLN399'BL--/066w?H '23D  Dr   c                     0 nU R                  U5      nU R                  R                  R                  5        H&  nSnU H  nXPR	                  XFU5      -  nM     XRU'   M(     U$ )zGCalculate the "out-of-place" measure between
the text and all languagesr   )r/   r   _all_lang_freqr3   r<   )r   r   	distancesr/   r   	lang_distr6   s          r   
lang_distsTextCat.lang_distst   si     	,,t$LL//446D I"^^D7CC	 # (dO 7 r   c                 ~    U R                  U5      U l        [        U R                  U R                  R                  S9$ )zQFind the language with the min distance
to the text and return its ISO 639-3 code)key)rB   last_distancesmingetr   s     r   guess_languageTextCat.guess_language   s4     #ood34&&D,?,?,C,CDDr   )r   rF   )__name__
__module____qualname____firstlineno__r   fingerprintsr$   r%   rF   r   r   r/   r<   rB   rI   __static_attributes__ r   r   r   r   /   s:    GLKIN) 0(*$Er   r   c            
         SSK Jn   / SQnSSSSSS	S
SSS.	n[        5       nU H  nU R                  U5      n[	        U5      S-
  n[        [        [        U5      5      nSn[        SU5       H=  n	SSR                  [        SXy   5       V
s/ s H
  oU	   U
   PM     sn
5      -   nX-  nM?     [        SUSS -   S-   5        UR                  U5      n[        SU SX,    S35        [        S5        M     g s  sn
f )Nr   )udhr)	zKurdish-UTF8zAbkhaz-UTF8zFarsi_Persian-UTF8z
Hindi-UTF8zHawaiian-UTF8zRussian-UTF8zVietnamese-UTF8zSerbian_Srpski-UTF8zEsperanto-UTF8zNorthern Kurdish	AbkhazianzIranian PersianHindiHawaiianRussian
VietnameseSerbian	Esperanto)	kmrabkpeshinhawrusviesrpepor"   r    zLanguage snippet:    z...zLanguage detection: z ()z############################################################################################################################################)r   rS   r   sentslenr2   mapranger&   printrI   )rS   r   friendlytccur_langraw_sentencesrowscolssampleijcur_sentguesss                r   demorw      s    
E " 
H 
B

8,=!A%C]+, q$ASXXE!TWDU&VDUqQ'7':DU&VWWHF  
 	"VAc]2U:;!!&)$UG2ho->a@Ai#  'Ws   C6__main__)__doc__sysr   	nltk.utilr   regexr   ImportErrorr   rw   rK   rQ   r   r   <module>r~      sZ   *  \E \E@.b zF q  	Bs   0 ;;