
    /h8                        S r SSKrSSKJrJrJrJr  SSKJ	r	J
r
  SSKJr  SSKJr   " S S5      r " S	 S
\5      r " S S\5      r " S S\5      rSS jr\S:X  aE  SSKrSSKJr   \" S\R.                  S   -   5      r \" S\R.                  S   -   5      r\" \\5        / SQrg! \ a    Sr N1f = f! \ a    Sr N&f = f)a  
Tools to identify collocations --- words that often appear consecutively
--- within corpora. They may also be used to find other associations between
word occurrences.
See Manning and Schutze ch. 5 at https://nlp.stanford.edu/fsnlp/promo/colloc.pdf
and the Text::NSP Perl package at http://ngram.sourceforge.net

Finding collocations requires first calculating the frequencies of words and
their appearance in the context of other words. Often the collection of words
will then requiring filtering to only retain useful content terms. Each ngram
of words may then be scored according to some association measure, in order
to determine the relative likelihood of each ngram being a collocation.

The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide
these functionalities, dependent on being provided a function which scores a
ngram given appropriate frequency counts. A number of standard association
measures are provided in bigram_measures and trigram_measures.
    N)BigramAssocMeasuresContingencyMeasuresQuadgramAssocMeasuresTrigramAssocMeasures)ranks_from_scoresspearman_correlation)FreqDist)ngramsc                       \ rS rSrSrS r\ SS j5       r\S 5       r\	S 5       r
S 4S	 jrS
 rS rS rS rS rS rS rSrg)AbstractCollocationFinder/   ad  
An abstract base class for collocation finders whose purpose is to
collect collocation candidate frequencies, filter and rank them.

As a minimum, collocation finders require the frequencies of each
word in a corpus, and the joint frequency of word tuples. This data
should be provided through nltk.probability.FreqDist objects or an
identical interface.
c                 F    Xl         UR                  5       U l        X l        g N)word_fdNngram_fd)selfr   r   s      I/var/www/auris/envauris/lib/python3.13/site-packages/nltk/collocations.py__init__"AbstractCollocationFinder.__init__:   s         Nc                    ^ U4US-
  -  mU(       a)  [         R                  R                  U4S jU 5       5      $ U(       a)  [         R                  R                  U4S jU 5       5      $ g)zE
Pad the document with the place holder according to the window_size
   c              3   R   >#    U  H  n[         R                  " UT5      v   M     g 7fr   
_itertoolschain.0docpaddings     r   	<genexpr>AAbstractCollocationFinder._build_new_documents.<locals>.<genexpr>H   s$      2:C3
  g..)   $'c              3   R   >#    U  H  n[         R                  " TU5      v   M     g 7fr   r   r   s     r   r"   r#   L   s$      2:C3
  #..)r$   N)r   r   from_iterable)cls	documentswindow_sizepad_left	pad_right
pad_symbolr!   s         @r   _build_new_documents.AbstractCollocationFinder._build_new_documents?   sn     -;?3##11 2:C2   ##11 2:C2   r   c                 T    U R                  U R                  XR                  SS95      $ )zrConstructs a collocation finder given a collection of documents,
each of which is a list (or iterable) of tokens.
Tr+   )
from_wordsr-   
default_ws)r'   r(   s     r   from_documents(AbstractCollocationFinder.from_documentsP   s,     ~~$$Y$$O
 	
r   c                 \   ^ ^ [        UU 4S j[        [        T 5      S-
  5       5       5      $ )Nc              3   D   >#    U  H  n[        TXT-    5      v   M     g 7fr   )tuple)r   inwordss     r   r"   <AbstractCollocationFinder._ngram_freqdist.<locals>.<genexpr>\   s$     O9NAeAA.//9Ns    r   )r	   rangelen)r:   r9   s   ``r   _ngram_freqdist)AbstractCollocationFinder._ngram_freqdistZ   s!    Os5zA~9NOOOr   c                     g)NF )ngramfreqs     r   <lambda>"AbstractCollocationFinder.<lambda>^   s    5r   c                     [        5       nU R                  R                  5        H  u  p4U" X45      (       a  M  XBU'   M     X l        g)zwGeneric filter removes ngrams from the frequency distribution
if the function returns True when passed an ngram tuple.
N)r	   r   items)r   fn	tmp_ngramrB   rC   s        r   _apply_filter'AbstractCollocationFinder._apply_filter^   s<     J	==..0KEe??#'%  1 "r   c                 0   ^ U R                  U4S j5        g)zARemoves candidate ngrams which have frequency less than min_freq.c                    > UT:  $ r   rA   )ngrC   min_freqs     r   rD   =AbstractCollocationFinder.apply_freq_filter.<locals>.<lambda>j   s	    D8Or   NrJ   )r   rO   s    `r   apply_freq_filter+AbstractCollocationFinder.apply_freq_filterh   s    ;<r   c                 0   ^ U R                  U4S j5        g)zPRemoves candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
evaluates to True.
c                    > T" U 6 $ r   rA   rN   frH   s     r   rD   >AbstractCollocationFinder.apply_ngram_filter.<locals>.<lambda>p   s	    Rr   NrQ   r   rH   s    `r   apply_ngram_filter,AbstractCollocationFinder.apply_ngram_filterl   s     	01r   c                 0   ^ U R                  U4S j5        g)z]Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
...) evaluates to True.
c                 .   > [        U4S jU  5       5      $ )Nc              3   4   >#    U  H  nT" U5      v   M     g 7fr   rA   )r   wrH   s     r   r"   PAbstractCollocationFinder.apply_word_filter.<locals>.<lambda>.<locals>.<genexpr>v   s     ,?BqRUUBs   )anyrV   s     r   rD   =AbstractCollocationFinder.apply_word_filter.<locals>.<lambda>v   s    ,?B,?)?r   NrQ   rY   s    `r   apply_word_filter+AbstractCollocationFinder.apply_word_filterr   s     	?@r   c              #   l   #    U R                    H   nU R                  " U/UQ76 nUc  M  X#4v   M"     g7f)zRGenerates of (ngram, score) pairs as determined by the scoring
function provided.
N)r   score_ngram)r   score_fntupscores       r   _score_ngrams'AbstractCollocationFinder._score_ngramsx   s8      ==C$$X44E j  !s   $4
4c                 6    [        U R                  U5      S S9$ )zReturns a sequence of (ngram, score) pairs ordered from highest to
lowest score, as determined by the scoring function provided.
c                     U S   * U S   4$ )Nr   r   rA   )ts    r   rD   8AbstractCollocationFinder.score_ngrams.<locals>.<lambda>   s    AaD5!A$-r   )key)sortedrj   )r   rg   s     r   score_ngrams&AbstractCollocationFinder.score_ngrams   s     d((28OPPr   c                 `    U R                  U5      SU  VVs/ s H  u  p4UPM	     snn$ s  snnf )z;Returns the top n ngrams when scored by the given function.Nrr   )r   rg   r9   pss        r   nbestAbstractCollocationFinder.nbest   s0    "//9"1=>=da=>>>s   *c              #   V   #    U R                  U5       H  u  p4XB:  a  Uv   M    g   g7f)zmReturns a sequence of ngrams, ordered by decreasing score, whose
scores each exceed the given minimum score.
Nru   )r   rg   	min_scorerB   ri   s        r   above_score%AbstractCollocationFinder.above_score   s,      !--h7LE 	 8s   '))r   r   r   )FFN)__name__
__module____qualname____firstlineno____doc__r   classmethodr-   r3   staticmethodr>   rJ   rR   rZ   rc   rj   rr   rx   r|   __static_attributes__rA   r   r   r   r   /   s|    !
 QU   
 
 P P  9 "=2A!Q?r   r   c                   @    \ rS rSrSrSrS	S jr\S	S j5       rS r	Sr
g)
BigramCollocationFinder   zA tool for the finding and ranking of bigram collocations or other
association measures. It is often useful to use from_words() rather than
constructing an instance directly.
   c                 <    [         R                  XU5        X0l        g)zuConstruct a BigramCollocationFinder, given FreqDists for
appearances of words and (possibly non-contiguous) bigrams.
N)r   r   r)   )r   r   	bigram_fdr)   s       r   r    BigramCollocationFinder.__init__   s     	"**4)D&r   c                     [        5       n[        5       nUS:  a  [        S5      e[        XSS9 H8  nUS   nUc  M  X6==   S-  ss'   USS  H  nUc  M  XFU4==   S-  ss'   M     M:     U " X4US9$ )	zConstruct a BigramCollocationFinder for all bigrams in the given
sequence.  When window_size > 2, count non-contiguous bigrams, in the
style of Church and Hanks's (1990) association ratio.
r   zSpecify window_size at least 2Tr0   r   Nr   r)   )r	   
ValueErrorr
   )r'   r:   r)   wfdbfdwindoww1w2s           r   r1   "BigramCollocationFinder.from_words   s     jj?=>>U4@FBzGqLGQRj>RMQ&M ! A 355r   c                     U R                   nU R                  X#4   U R                  S-
  -  nU(       d  gU R                  U   nU R                  U   nU" XVU4U5      $ )zReturns the score for a given bigram using the given scoring
function.  Following Church and Hanks (1990), counts are scaled by
a factor of 1/(window_size - 1).
g      ?N)r   r   r)   r   )r   rg   r   r   n_alln_iin_ixn_xis           r   rf   #BigramCollocationFinder.score_ngram   s`    
 }}bX&$*:*:S*@A||B||BTlE22r   r   N)r   r~   r   r   r   r   r2   r   r   r1   rf   r   rA   r   r   r   r      s,    
 J' 6 6*3r   r   c                   B    \ rS rSrSrSrS r\S
S j5       rS r	S r
Srg	)TrigramCollocationFinder   zA tool for the finding and ranking of trigram collocations or other
association measures. It is often useful to use from_words() rather than
constructing an instance directly.
   c                 H    [         R                  XU5        X0l        X l        g)zConstruct a TrigramCollocationFinder, given FreqDists for
appearances of words, bigrams, two words with any word between them,
and trigrams.
N)r   r   wildcard_fdr   )r   r   r   r   
trigram_fds        r   r   !TrigramCollocationFinder.__init__   s    
 	"**4*E&"r   c                    US:  a  [        S5      e[        5       n[        5       n[        5       n[        5       n[        XSS9 Hp  nUS   nUc  M  [        R                  " USS S5       HE  u  pX8==   S-  ss'   U	c  M  XXU	4==   S-  ss'   U
c  M)  XHU
4==   S-  ss'   XhX4==   S-  ss'   MG     Mr     U " X5XF5      $ )	zMConstruct a TrigramCollocationFinder for all trigrams in the given
sequence.
r   zSpecify window_size at least 3Tr0   r   Nr   r   r   r	   r
   r   combinations)r'   r:   r)   r   wildfdr   tfdr   r   r   w3s              r   r1   #TrigramCollocationFinder.from_words   s    
 ?=>>jjjU4@FBz$11&*a@1:H":Bx A% L!Q&! A	 A 3V))r   c                 B    [        U R                  U R                  5      $ )zConstructs a bigram collocation finder with the bigram and unigram
data from this finder. Note that this does not include any filtering
applied to this finder.
)r   r   r   )r   s    r   bigram_finder&TrigramCollocationFinder.bigram_finder   s    
 't||T^^DDr   c                 "   U R                   nU R                  X#U4   nU(       d  gU R                  X#4   nU R                  X$4   nU R                  X44   n	U R                  U   n
U R                  U   nU R                  U   nU" XgX4XU4U5      $ )zHReturns the score for a given trigram using the given scoring
function.
N)r   r   r   r   r   )r   rg   r   r   r   r   n_iiin_iixn_ixin_xiin_ixxn_xixn_xxis                r   rf   $TrigramCollocationFinder.score_ngram   s     rrl+x(  "*x(R R R u4uU6KUSSr   )r   r   N)r   )r~   r   r   r   r   r2   r   r   r1   r   rf   r   rA   r   r   r   r      s3    
 J# * *4ETr   r   c                   <    \ rS rSrSrSrS r\S	S j5       rS r	Sr
g)
QuadgramCollocationFinderi  zA tool for the finding and ranking of quadgram collocations or other association measures.
It is often useful to use from_words() rather than constructing an instance directly.
   c	                 x    [         R                  XU5        X@l        X0l        XPl        X`l        Xpl        Xl        g)zConstruct a QuadgramCollocationFinder, given FreqDists for appearances of words,
bigrams, trigrams, two words with one word and two words between them, three words
with a word between them in both variations.
N)r   r   iiiiiixiixxiiixiixii)	r   r   quadgram_fdr   r   r   r   r   r   s	            r   r   "QuadgramCollocationFinder.__init__  s2    
 	"**4+F			r   c           
      T   US:  a  [        S5      e[        5       n[        5       n[        5       n[        5       n[        5       n[        5       n[        5       n	[        5       n
[        XSS9 H  nUS   nUc  M  [        R                  " USS  S5       H  u  pnX<==   S-  ss'   Uc  M  X\U4==   S-  ss'   Uc  M*  XlX4==   S-  ss'   X|U4==   S-  ss'   Uc  MK  XLXU4==   S-  ss'   XU4==   S-  ss'   XX4==   S-  ss'   XX4==   S-  ss'   M     M     U " X4XVXxX5      $ )Nr   zSpecify window_size at least 4Tr0   r   r   r   r   )r'   r:   r)   ixxxiiiir   r   r   r   r   r   r   r   r   r   w4s                   r   r1   $QuadgramCollocationFinder.from_words!  sC   ?=>>zzZjjzzzU4@FBz(55fQRj!D
A:8!:L!Q&!H":""%&!+&"X!#"\"a'""\"a'" E	 A( 4r4>>r   c           
      8   U R                   nU R                  X#XE4   nU(       d  g U R                  X#U4   nU R                  X4U4   n	U R                  X#U4   n
U R                  X$U4   nU R
                  X#4   nU R
                  XE4   nU R
                  X44   nU R                  X$4   nU R                  X%4   nU R                  X54   nU R                  U   nU R                  U   nU R                  U   nU R                  U   nU" UXX4XUUX4UUUU4U5      $ r   )	r   r   r   r   r   r   r   r   r   )r   rg   r   r   r   r   r   n_iiiin_iiixn_xiiin_iixin_ixiin_iixxn_xxiin_xiixn_ixixn_ixxin_xixin_ixxxn_xixxn_xxixn_xxxis                         r   rf   %QuadgramCollocationFinder.score_ngramD  s7   /022,'22,'BB<(BB<(""""""2(#B8$2(#b!b!b!b!V,VVV<VVV,
 	
r   )r   r   r   r   r   r   N)r   r   rA   r   r   r   r     s-     J  ?  ?D
r   r   c           
        ^ SSK JnJnJn  U c  UR                  n Uc  UR
                  nSSKJnJn  UR                  S5      mU4S jnUR                  5        H  nUR                  U5       V	s/ s H  oR                  5       PM     n
n	[        R                  U
5      nUR                  S5        UR                  U5        U" U" UR!                  U 5      5      U" UR!                  U5      5      5      n[#        U5        [#        SUR%                  U S	5       Vs/ s H  nS
R'                  U5      PM     sn5        [#        SUR(                   SUS 35        GM     gs  sn	f s  snf )z=Finds bigram collocations in the files of the WebText corpus.r   )r   r   r   N)	stopwordswebtextenglishc                 T   > [        U 5      S:  =(       d    U R                  5       T;   $ )Nr   )r=   lower)r_   ignored_wordss    r   rD   demo.<locals>.<lambda>r  s     CFQJD!'')}*DDr   r   	    z	 Correlation to z: z0.4f)nltk.metricsr   r   r   likelihood_ratioraw_freqnltk.corpusr   r   r:   fileidsr   r   r1   rR   rc   rr   printrx   joinr~   )scorercompare_scorerr   r   r   r   r   word_filterfilewordr:   cfcorrrh   r   s                 @r   demor   b  s1     ~$55,55.OOI.MDK!*1--*=>*=$*=>$//6
Q
[)#boof56boon=>
 	ddbhhvr.BC.BsSXXc].BCD">#:#:";2d4[IJ "> Ds   /EE"__main__)r   zBigramAssocMeasures.r   r   )r   r   r   )NN)r   	itertoolsr   r   r   r   r   r   nltk.metrics.spearmanr   r   nltk.probabilityr	   	nltk.utilr
   r   r   r   r   r   r~   sysevalargvr   
IndexErrorr   __all__rA   r   r   <module>r      s   2   J % d dN037 03fAT8 ATHR
 9 R
jKL z0,sxx{:;4sxx{BC 	     s$   (B' B5 'B21B25C ?C 