
    /hn                     \   S r SSKrSSKrSSKrSSKJrJrJr  SSKJ	r	  SSK
Jr  SSKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJr  \" S/ SQ5      r " S S5      r  " S S5      r! " S S5      r" " S S5      r# " S S\#5      r$S r%\&S:X  a  \%" 5         / SQr'g)a  
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
    N)Counterdefaultdict
namedtuple)reduce)log)BigramCollocationFinder)MLE)padded_everygram_pipeline)BigramAssocMeasures	f_measure)ConditionalFreqDist)FreqDist)sent_tokenize)LazyConcatenation
cut_string	tokenwrapConcordanceLine)leftqueryrightoffset
left_printright_printlinec                   Z    \ rS rSrSr\S 5       rSSS 4S jrS rS r	SS	 jr
SS
 jrSrg)ContextIndex'   z
A bidirectional index between words and their 'contexts' in a text.
The context of a word is usually defined to be the words that occur
in a fixed window around the word; but other definitions may also
be used by providing a custom context function.
c                     US:w  a  XS-
     R                  5       OSnU[        U 5      S-
  :w  a  XS-      R                  5       OSnX#4$ )z;One left token and one right token, normalized to lowercaser      *START**END*)lowerlen)tokensir   r   s       A/var/www/auris/envauris/lib/python3.13/site-packages/nltk/text.py_default_contextContextIndex._default_context/   sM     )*Qv!e}""$I)*c&kAo)=1u##%7}    Nc                     U $ N xs    r&   <lambda>ContextIndex.<lambda>6   s    Qr)   c                 N  ^ ^ UT l         TT l        U(       a  UT l        OT R                  T l        U(       a!  T Vs/ s H  oS" U5      (       d  M  UPM     snm[	        U U4S j[        T5       5       5      T l        [	        U U4S j[        T5       5       5      T l        g s  snf )Nc              3   n   >#    U  H*  u  pTR                  U5      TR                  TU5      4v   M,     g 7fr+   )_key_context_func.0r%   wselfr$   s      r&   	<genexpr>(ContextIndex.__init__.<locals>.<genexpr>?   s1      %
FWdaTYYq\4--fa89FW   25c              3   n   >#    U  H*  u  pTR                  TU5      TR                  U5      4v   M,     g 7fr+   )r4   r3   r5   s      r&   r9   r:   B   s1      %
FWdaT*DIIaL9FWr;   )r3   _tokensr4   r'   CFD	enumerate_word_to_contexts_context_to_words)r8   r$   context_funcfilterkeyts   ``    r&   __init__ContextIndex.__init__6   s    	!-D!%!6!6D!'5A6!9a5F!$ %
FOPVFW%
 "
 "% %
FOPVFW%
 "
	 6s   B"B"c                     U R                   $ )zW
:rtype: list(str)
:return: The document that this context index was
    created from.
r=   r8   s    r&   r$   ContextIndex.tokensF        ||r)   c                     U R                  U5      n[        U R                  U   5      n0 nU R                  R                  5        H  u  pE[	        U[        U5      5      X4'   M     U$ )z
Return a dictionary mapping from words to 'similarity scores,'
indicating how often these two words occur in the same
context.
)r3   setr@   itemsr   )r8   wordword_contextsscoresr7   
w_contextss         r&   word_similarity_dict!ContextIndex.word_similarity_dictN   s_     yyD22489!3399;MA!-ZAFI < r)   c                 0   [        [        5      nU R                  U R                  U5          HO  nU R                  U    H9  nXQ:w  d  M
  X5==   U R                  U   U   U R                  U   U   -  -  ss'   M;     MQ     [        X3R                  SS9S U $ )NT)rD   reverse)r   intr@   r3   rA   sortedget)r8   rP   nrR   cr7   s         r&   similar_wordsContextIndex.similar_words]   s    S!''		$8A++A.9I..q1$7$:P:PQR:STU:VVI / 9 f**d;BQ??r)   c                   ^ ^ U Vs/ s H  nT R                  U5      PM     nnU Vs/ s H  n[        T R                  U   5      PM     nn[        [	        U5      5       Vs/ s H  oTU   (       a  M  X   PM     nn[        [        R                  U5      mU(       a"  U(       a  [        SSR                  U5      5      eT(       d
  [        5       $ [        UU 4S jU 5       5      nU$ s  snf s  snf s  snf )ag  
Find contexts where the specified words can all appear; and
return a frequency distribution mapping each context to the
number of times that context was used.

:param words: The words used to seed the similarity search
:type words: str
:param fail_on_unknown: If true, then raise a value error if
    any of the given words do not occur at all in the index.
z%The following word(s) were not found: c              3   d   >#    U  H%  nTR                   U     H  o"T;   d  M
  Uv   M     M'     g 7fr+   )r@   )r6   r7   r\   commonr8   s      r&   r9   /ContextIndex.common_contexts.<locals>.<genexpr>|   s.       a$*@*@*CQF{*C5s   00)
r3   rN   r@   ranger#   r   intersection
ValueErrorjoinr   )	r8   wordsfail_on_unknownr7   contextsr%   emptyfdrb   s	   `       @r&   common_contextsContextIndex.common_contextsg   s     (--u!1u-<ABEqC..q12EB#(U#4H#4aQK#4H(((3_DchhuoVV:   B I .BHs   C)"C."C33C3)r4   rA   r3   r=   r@      )F)__name__
__module____qualname____firstlineno____doc__staticmethodr'   rF   r$   rT   r]   rm   __static_attributes__r,   r)   r&   r   r   '   s>       -1; 
 @r)   r   c                   L    \ rS rSrSrS 4S jrS rS rS rSS jr	SS	 jr
S
rg)ConcordanceIndex   zg
An index that can be used to look up the offset locations at which
a given word occurs in a document.
c                     U $ r+   r,   r-   s    r&   r/   ConcordanceIndex.<lambda>   s    Qr)   c                     Xl          X l         [        [        5      U l         [        U5       H4  u  p4U R                  U5      nU R                  U   R                  U5        M6     g)a  
Construct a new concordance index.

:param tokens: The document (list of tokens) that this
    concordance index was created from.  This list can be used
    to access the context of a given word occurrence.
:param key: A function that maps each token to a normalized
    version that will be used as a key in the index.  E.g., if
    you use ``key=lambda s:s.lower()``, then the index will be
    case-insensitive.
N)r=   r3   r   list_offsetsr?   append)r8   r$   rD   indexrP   s        r&   rF   ConcordanceIndex.__init__   s\     	  	D#D)L$V,KE99T?DMM$&&u- -r)   c                     U R                   $ )z[
:rtype: list(str)
:return: The document that this concordance index was
    created from.
rI   rJ   s    r&   r$   ConcordanceIndex.tokens   rL   r)   c                 B    U R                  U5      nU R                  U   $ )z
:rtype: list(int)
:return: A list of the offset positions at which the given
    word occurs.  If a key function was specified for the
    index, then given word's key will be looked up.
)r3   r   r8   rP   s     r&   offsetsConcordanceIndex.offsets   s      yy}}T""r)   c                 \    S[        U R                  5      [        U R                  5      4-  $ )Nz+<ConcordanceIndex for %d tokens (%d types)>)r#   r=   r   rJ   s    r&   __repr__ConcordanceIndex.__repr__   s-    <@
 
 	
r)   c           
      ^   [        U[        5      (       a  UnOU/nSR                  U5      n[        S U 5       5      nX%-
  S-
  S-  nUS-  n/ nU R	                  US   5      n	[        USS 5       HE  u  pU R	                  U5       Vs1 s H
  oU
-
  S-
  iM     nn[        UR                  U	5      5      n	MG     U	(       a  U	 H  n
SR                  U R                  X[        U5      -    5      nU R                  [        SX-
  5      U
 nU R                  U
[        U5      -   X-    n[        SR                  U5      U* 5      R                  U5      n[        SR                  U5      U5      nSR                  UUU/5      n[        UUUU
UUU5      nUR                  U5        M     U$ s  snf )zs
Find all concordance lines given the query word.

Provided with a list of words, these will be found as a phrase.
r`   c              3   `   #    U  H$  n[         R                  " U5      (       a  M   S v   M&     g7f)r   N)unicodedata	combining)r6   chars     r&   r9   4ConcordanceIndex.find_concordance.<locals>.<genexpr>   s     Uzt9N9Nt9Tzs   .	.      r   r   N)
isinstancer~   rg   sumr   r?   rY   re   r=   r#   maxr   rjustr   r   )r8   rP   widthphrase
phrase_str
phrase_len
half_widthcontextconcordance_listr   r%   r   word_offsets
query_wordleft_contextright_contextr   r   
line_printconcordance_lines                       r&   find_concordance!ConcordanceIndex.find_concordance   s    dD!!FVFXXf%
UzUU
(1,2
1* ,,vay) ,GA9=d9KL9KvQJN9KLL\66w?@G -  XXdll13v;&GH
#||C1;,?!D $QV_q{ K'(>LRR
 )-)@*M XXz:{&KL
#2 !$  !''(89- .  5 Ms   
F*c                     U R                  XS9nU(       d  [        S5        g[        U[        U5      5      n[        SU S[        U5       S35        [	        USU 5       H  u  pV[        UR
                  5        M     g)aa  
Print concordance lines given the query word.
:param word: The target word or phrase (a list of strings)
:type word: str or list
:param lines: The number of lines to display (default=25)
:type lines: int
:param width: The width of each line, in characters (default=80)
:type width: int
:param save: The option to save the concordance.
:type save: bool
)r   z
no matcheszDisplaying z of z	 matches:N)r   printminr#   r?   r   )r8   rP   r   linesr   r%   r   s          r&   print_concordance"ConcordanceIndex.print_concordance   s}      000C,s#345EKwd3/?+@*AKL'01A&51I'J#&++, (Kr)   )r3   r   r=   N)P   )r      )rq   rr   rs   rt   ru   rF   r$   r   r   r   r   rw   r,   r)   r&   ry   ry      s+    
 $/ .4#
. `-r)   ry   c                   $    \ rS rSrSrS rS rSrg)TokenSearcheri  a  
A class that makes it easier to use regular expressions to search
over tokenized strings.  The tokenized string is converted to a
string where tokens are marked with angle brackets -- e.g.,
``'<the><window><is><still><open>'``.  The regular expression
passed to the ``findall()`` method is modified to treat angle
brackets as non-capturing parentheses, in addition to matching the
token boundaries; and to have ``'.'`` not match the angle brackets.
c                 >    SR                  S U 5       5      U l        g )N c              3   2   #    U  H  nS U-   S-   v   M     g7f)<>Nr,   )r6   r7   s     r&   r9   )TokenSearcher.__init__.<locals>.<genexpr>  s     :6aC!GcM6s   )rg   _raw)r8   r$   s     r&   rF   TokenSearcher.__init__  s    GG:6::	r)   c                    [         R                  " SSU5      n[         R                  " SSU5      n[         R                  " SSU5      n[         R                  " SSU5      n[         R                  " XR                  5      nU H<  nUR	                  S5      (       a  M  UR                  S5      (       d  M3  [        S	5      e   U Vs/ s H  o3S
S R                  S5      PM     nnU$ s  snf )ak  
Find instances of the regular expression in the text.
The text is a list of tokens, and a regexp pattern to match
a single token must be surrounded by angle brackets.  E.g.

>>> from nltk.text import TokenSearcher
>>> from nltk.book import text1, text5, text9
>>> text5.findall("<.*><.*><bro>")
you rule bro; telling you bro; u twizted bro
>>> text1.findall("<a>(<.*>)<man>")
monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave
>>> text9.findall("<th.*>{3,}")
thread through those; the thought that; that the thing; the thing
that; that that thing; through these than through; them that the;
through the thick; them that they; thought that the

:param regexp: A regular expression
:type regexp: str
z\sr   r   z(?:<(?:r   z)>)z	(?<!\\)\.z[^>]z$Bad regexp for TokenSearcher.findallr   z><)resubfindallr   
startswithendswithrf   splitr8   regexphitshs       r&   r   TokenSearcher.findall  s    0 r6*i0eV,ff5 zz&)), A<<$$C !GHH 
 .22T!Bd#T2 3s   C()r   N)rq   rr   rs   rt   ru   rF   r   rw   r,   r)   r&   r   r     s    ;'r)   r   c                       \ rS rSrSrSrSS jrS rS rSS jr	SS	 jr
SS
 jrSS jrS rS rS rSS jrSS jrS rS S jrS!S jrS rS rS r\R2                  " S5      rS rS rS rSrg)"Texti9  a  
A wrapper around a sequence of simple (string) tokens, which is
intended to support initial exploration of texts (via the
interactive console).  Its methods perform a variety of analyses
on the text's contexts (e.g., counting, concordancing, collocation
discovery), and display the results.  If you wish to write a
program which makes use of these analyses, then you should bypass
the ``Text`` class, and use the appropriate analysis function or
class directly instead.

A ``Text`` is typically initialized from a given document or
corpus.  E.g.:

>>> import nltk.corpus
>>> from nltk.text import Text
>>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

TNc                 &   U R                   (       a  [        U5      nXl        U(       a  X l        gSUSS ;   a5  USS R	                  S5      nSR                  S USU  5       5      U l        gSR                  S USS  5       5      S	-   U l        g)
zV
Create a Text object.

:param tokens: The source text.
:type tokens: sequence of str
]Nrp   r`   c              3   8   #    U  H  n[        U5      v   M     g 7fr+   strr6   toks     r&   r9    Text.__init__.<locals>.<genexpr>b  s      C]cS]   r   c              3   8   #    U  H  n[        U5      v   M     g 7fr+   r   r   s     r&   r9   r   d  s      @ZcSZr      z...)_COPY_TOKENSr~   r$   namer   rg   )r8   r$   r   ends       r&   rF   Text.__init__S  s     &\FIF3BK"+##C(C CVAc] CCDI @VBQZ @@5HDIr)   c                      U R                   U   $ r+   )r$   )r8   r%   s     r&   __getitem__Text.__getitem__j  s    {{1~r)   c                 ,    [        U R                  5      $ r+   )r#   r$   rJ   s    r&   __len__Text.__len__m  s    4;;r)   c                     SU R                   ;  a  [        U R                  S S9U l        U R                  R	                  XU5      $ )a|  
Prints a concordance for ``word`` with the specified context window.
Word matching is not case-sensitive.

:param word: The target word or phrase (a list of strings)
:type word: str or list
:param width: The width of each line, in characters (default=80)
:type width: int
:param lines: The number of lines to display (default=25)
:type lines: int

:seealso: ``ConcordanceIndex``
_concordance_indexc                 "    U R                  5       $ r+   r"   ss    r&   r/   "Text.concordance.<locals>.<lambda>  
    1779r)   rD   )__dict__ry   r$   r   r   r8   rP   r   r   s       r&   concordanceText.concordancet  sB      t}}4&6!4'D# &&88eLLr)   c                     SU R                   ;  a  [        U R                  S S9U l        U R                  R	                  X5      SU $ )a~  
Generate a concordance for ``word`` with the specified context window.
Word matching is not case-sensitive.

:param word: The target word or phrase (a list of strings)
:type word: str or list
:param width: The width of each line, in characters (default=80)
:type width: int
:param lines: The number of lines to display (default=25)
:type lines: int

:seealso: ``ConcordanceIndex``
r   c                 "    U R                  5       $ r+   r   r   s    r&   r/   'Text.concordance_list.<locals>.<lambda>  r   r)   r   N)r   ry   r$   r   r   r   s       r&   r   Text.concordance_list  sG      t}}4&6!4'D# &&77DVeLLr)   c                   ^ SU R                   ;   a   U R                  U:X  a  U R                  U:X  d  Xl        X l        SSKJn  UR                  S5      m[        R                  " U R                  U5      nUR                  S5        UR                  U4S j5        [        5       n[        UR                  UR                  U5      5      U l        U R                  $ )a  
Return collocations derived from the text, ignoring stopwords.

    >>> from nltk.book import text4
    >>> text4.collocation_list()[:2]
    [('United', 'States'), ('fellow', 'citizens')]

:param num: The maximum number of collocations to return.
:type num: int
:param window_size: The number of tokens spanned by a collocation (default=2)
:type window_size: int
:rtype: list(tuple(str, str))
_collocationsr   )	stopwordsenglishr   c                 T   > [        U 5      S:  =(       d    U R                  5       T;   $ )N   )r#   r"   )r7   ignored_wordss    r&   r/   'Text.collocation_list.<locals>.<lambda>  s     s1vz/WQWWY-=W/Wr)   )r   _num_window_sizenltk.corpusr   rh   r   
from_wordsr$   apply_freq_filterapply_word_filterr   r~   nbestlikelihood_ratior   )r8   numwindow_sizer   finderbigram_measuresr   s         @r&   collocation_listText.collocation_list  s     t}},		S !![0I + .%OOI6M,77[QF$$Q'$$%WX13O!%_==sC"D !!!r)   c                     U R                  X5       VVs/ s H  u  p4US-   U-   PM     nnn[        [        USS95        gs  snnf )a  
Print collocations derived from the text, ignoring stopwords.

    >>> from nltk.book import text4
    >>> text4.collocations() # doctest: +NORMALIZE_WHITESPACE
    United States; fellow citizens; years ago; four years; Federal
    Government; General Government; American people; Vice President; God
    bless; Chief Justice; one another; fellow Americans; Old World;
    Almighty God; Fellow citizens; Chief Magistrate; every citizen; Indian
    tribes; public debt; foreign nations


:param num: The maximum number of collocations to print.
:type num: int
:param window_size: The number of tokens spanned by a collocation (default=2)
:type window_size: int
r`   ; )	separatorN)r  r   r   )r8   r   r   w1w2collocation_stringss         r&   collocationsText.collocations  sM    ( )-(=(=c(O
(OfbBHrM(O 	 
 	i+t<=
s   A c                 8    U R                   R                  U5      $ )z:
Count the number of times this word appears in the text.
)r$   countr   s     r&   r  
Text.count       {{  &&r)   c                 8    U R                   R                  U5      $ )zA
Find the index of the first occurrence of the word in the text.
)r$   r   r   s     r&   r   
Text.index  r  r)   c                     [         er+   )NotImplementedError)r8   methods     r&   readabilityText.readability  s    !!r)   c                   ^^^ SU R                   ;  a  [        U R                  S S S9U l        TR	                  5       mU R                  R
                  mTTR                  5       ;   am  [        TT   5      m[        UUU4S jTR                  5        5       5      nUR                  U5       VVs/ s H  u  pEUPM	     nnn[        [        U5      5        g[        S5        gs  snnf )a>  
Distributional similarity: find other words which appear in the
same contexts as the specified word; list most similar words first.

:param word: The word used to seed the similarity search
:type word: str
:param num: The number of words to generate (default=20)
:type num: int
:seealso: ContextIndex.similar_words()
_word_context_indexc                 "    U R                  5       $ r+   )isalphar-   s    r&   r/   Text.similar.<locals>.<lambda>  s
    aiikr)   c                 "    U R                  5       $ r+   r   r   s    r&   r/   r    s
    r)   )rC   rD   c              3   b   >#    U  H$  nTU     H  nUT;   d  M  UT:X  a  M  Uv   M     M&     g 7fr+   r,   )r6   r7   r\   rj   wcirP   s      r&   r9   Text.similar.<locals>.<genexpr>  s@      )AQA=  *+d  )s   ///z
No matchesN)r   r   r$   r  r"   r@   
conditionsrN   r   most_commonr   r   )	r8   rP   r   rl   r7   _rh   rj   r  s	    `     @@r&   similarText.similar  s     !5'3$9?R(D$ zz|&&883>>##3t9~H ) B $&>>##67#641Q#6E7)E"#, 8s   1C#c                    SU R                   ;  a  [        U R                  S S9U l         U R                  R	                  US5      nU(       d  [        S5        gUR                  U5       VVs/ s H  u  pEUPM	     nnn[        [        S U 5       5      5        gs  snnf ! [         a  n[        U5         SnAgSnAff = f)a  
Find contexts where the specified words appear; list
most frequent common contexts first.

:param words: The words used to seed the similarity search
:type words: str
:param num: The number of words to generate (default=20)
:type num: int
:seealso: ContextIndex.common_contexts()
r  c                 "    U R                  5       $ r+   r   r   s    r&   r/   &Text.common_contexts.<locals>.<lambda>  r   r)   r   TzNo common contexts were foundc              3   6   #    U  H  u  pUS -   U-   v   M     g7f)r"  Nr,   )r6   r  r  s      r&   r9   'Text.common_contexts.<locals>.<genexpr>!  s     LO&"S2Os   N)	r   r   r$   r  rm   r   r!  r   rf   )r8   rh   r   rl   r7   r"  ranked_contextses           r&   rm   Text.common_contexts
  s     !5'3!4(D$		))99%FB56131D"E1D11D"EiLOLLM #F  	!HH	s/   .B" B" /B=B" B" "
C,B<<Cc                      SSK Jn  U" X5        g)z
Produce a plot showing the distribution of the words through the text.
Requires pylab to be installed.

:param words: The words to be plotted
:type words: list(str)
:seealso: nltk.draw.dispersion_plot()
r   )dispersion_plotN)	nltk.drawr.  )r8   rh   r.  s      r&   r.  Text.dispersion_plot&  s     	.$r)   c                 T    [        X!5      u  p4[        US9nUR                  X45        U$ )N)order)r
   r	   fit)r8   tokenized_sentsr[   
train_datapadded_sentsmodels         r&   _train_default_ngram_lmText._train_default_ngram_lm3  s)    #<Q#P 
!		*+r)   c                    [        SR                  U R                  5      5       Vs/ s H  oDR                  S5      PM     snU l        [        U S5      (       d7  [        S[        R                  S9  U R                  U R                  SS9U l
        / nUS:  d   S5       e[        U5      U:  ac  [        U R                  R                  XUS	95       H&  u  pgUS
:X  a  M  US:X  a    OUR                  U5        M(     US-  n[        U5      U:  a  Mc  U(       a  SR                  U5      S-   OSnU[        USU 5      -   n	[        U	5        U	$ s  snf )a  
Print random text, generated using a trigram language model.
See also `help(nltk.lm)`.

:param length: The length of text to generate (default=100)
:type length: int

:param text_seed: Generation can be conditioned on preceding context.
:type text_seed: list(str)

:param random_seed: A random seed or an instance of `random.Random`. If provided,
    makes the random sampling part of generation reproducible. (default=42)
:type random_seed: int
r`   _trigram_modelzBuilding ngram index...)filer   )r[   r   z!The `length` must be more than 0.)	text_seedrandom_seedz<s>z</s>r   r   N)r   rg   r$   r   _tokenized_sentshasattrr   sysstderrr8  r;  r#   r?   generater   r   )
r8   lengthr=  r>  sentgenerated_tokensidxtokenprefix
output_strs
             r&   rC  Text.generate9  sU   " )6chht{{6K(L!
(LJJsO(L!
 t-..+#**="&">">%% #? #D z>>>z"#f,'##,,[ - 

 E>F? ''. 1K "#f, /8)$s*Ri(8&(ABB
j9!
s   Ec                 <    U R                  5       R                  " U6 $ )zK
See documentation for FreqDist.plot()
:seealso: nltk.prob.FreqDist.plot()
)vocabplot)r8   argss     r&   rN  	Text.plotg  s    
 zz|  $''r)   c                 Z    SU R                   ;  a  [        U 5      U l        U R                  $ )z
:seealso: nltk.prob.FreqDist
_vocab)r   r   rR  rJ   s    r&   rM  
Text.vocabn  s%     4==("4.DK{{r)   c                     SU R                   ;  a  [        U 5      U l        U R                  R                  U5      nU Vs/ s H  nSR	                  U5      PM     nn[        [        US5      5        gs  snf )aC  
Find instances of the regular expression in the text.
The text is a list of tokens, and a regexp pattern to match
a single token must be surrounded by angle brackets.  E.g.

>>> from nltk.book import text1, text5, text9
>>> text5.findall("<.*><.*><bro>")
you rule bro; telling you bro; u twizted bro
>>> text1.findall("<a>(<.*>)<man>")
monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave
>>> text9.findall("<th.*>{3,}")
thread through those; the thought that; that the thing; the thing
that; that that thing; through these than through; them that the;
through the thick; them that they; thought that the

:param regexp: A regular expression
:type regexp: str
_token_searcherr`   r  N)r   r   rU  r   rg   r   r   r   s       r&   r   Text.findallw  sc    . DMM1#0#6D ##++F3%)*TT*id#$ +s    A3z\w+|[\.\!\?]c                    US-
  nUS:  aQ  U R                   R                  X   5      (       d/  US-  nUS:  a$  U R                   R                  X   5      (       d  M/  US:w  a  X   OSnUS-   nU[        U5      :  aZ  U R                   R                  X   5      (       d8  US-  nU[        U5      :  a$  U R                   R                  X   5      (       d  M8  U[        U5      :w  a  X   OSnXE4$ )z
One left & one right token, both case-normalized.  Skip over
non-sentence-final punctuation.  Used by the ``ContextIndex``
that is created for ``similar()`` and ``common_contexts()``.
r   r   r    r!   )_CONTEXT_REmatchr#   )r8   r$   r%   jr   r   s         r&   _contextText._context  s     E1fT--33FI>>FA 1fT--33FI>>Fvy	 E#f+od&6&6&<&<VY&G&GFA #f+od&6&6&<&<VY&G&G#f+-	7}r)   c                      SU R                   -  $ Nz
<Text: %s>r   rJ   s    r&   __str__Text.__str__      dii''r)   c                      SU R                   -  $ r^  r_  rJ   s    r&   r   Text.__repr__  rb  r)   )r   r   r   rU  r?  r;  rR  r   r  r   r$   r+   )O   r   )rp   r   ro   )r   )d   N*   )rq   rr   rs   rt   ru   r   rF   r   r   r   r   r  r
  r  r   r  r#  rm   r.  r8  rC  rN  rM  r   r   compilerX  r[  r`  r   rw   r,   r)   r&   r   r   9  s    . LI. M*M(!"F>0''"  D8%,\(%D **_-K0((r)   r   c                   0    \ rS rSrSrS rS rS rS rSr	g)	TextCollectioni  a  A collection of texts, which can be loaded with list of texts, or
with a corpus consisting of one or more texts, and which supports
counting, concordancing, collocation discovery, etc.  Initialize a
TextCollection as follows:

>>> import nltk.corpus
>>> from nltk.text import TextCollection
>>> from nltk.book import text1, text2, text3
>>> gutenberg = TextCollection(nltk.corpus.gutenberg)
>>> mytexts = TextCollection([text1, text2, text3])

Iterating over a TextCollection produces all the tokens of all the
texts in order.
c                     [        US5      (       a.  UR                  5        Vs/ s H  o!R                  U5      PM     nnXl        [        R                  U [        U5      5        0 U l        g s  snf )Nrh   )r@  fileidsrh   _textsr   rF   r   
_idf_cache)r8   sourcefs      r&   rF   TextCollection.__init__  sX    67##/5~~/?@/?!ll1o/?F@d-f56	 As   A-c                 <    UR                  U5      [        U5      -  $ )z"The frequency of the term in text.)r  r#   r8   termtexts      r&   tfTextCollection.tf  s    zz$#d)++r)   c                 ^   U R                   R                  U5      nUc  [        U R                   Vs/ s H  o1U;   d  M
  SPM     sn5      n[        U R                  5      S:X  a  [	        S5      eU(       a!  [        [        U R                  5      U-  5      OSnX R                   U'   U$ s  snf )zThe number of texts in the corpus divided by the
number of texts that the term appears in.
If a term does not appear in the corpus, 0.0 is returned.Tr   z+IDF undefined for empty document collectiong        )rn  rZ   r#   rm  rf   r   )r8   rt  idfru  matchess        r&   ry  TextCollection.idf  s    
 oo!!$';DKKHKD4<4KHIG4;;1$ !NOO5<#c$++&01#C$'OOD!
 Is
   	B*B*c                 H    U R                  X5      U R                  U5      -  $ r+   )rv  ry  rs  s      r&   tf_idfTextCollection.tf_idf  s    wwt"TXXd^33r)   )rn  rm  N)
rq   rr   rs   rt   ru   rF   rv  ry  r}  rw   r,   r)   r&   rj  rj    s    ,4r)   rj  c                  t   SSK Jn   [        U R                  SS95      n[	        U5        [	        5         [	        S5        UR                  S5        [	        5         [	        S5        UR                  S5        [	        5         [	        S5        UR                  5         [	        5         [	        S5        UR                  / S	Q5        [	        5         [	        S
5        UR                  S5        [	        5         [	        S5        [	        SUS   5        [	        SUSS 5        [	        SUR                  5       S   5        g )Nr   )brownnews)
categorieszConcordance:zDistributionally similar words:zCollocations:zDispersion plot:)r  reportsaid	announcedzVocabulary plot:2   z	Indexing:ztext[3]:r   z
text[3:5]:   ztext.vocab()['news']:)r   r  r   rh   r   r   r#  r
  r.  rN  rM  )r  ru  s     r&   demor    s    !v./D	$K	G	.V	G	
+,LL	G	/	G 

@A	G	
IIbM	G	+	*d1g	,Qq	"	
!4::<#78r)   __main__)r   ry   r   r   rj  )(ru   r   rA  r   collectionsr   r   r   	functoolsr   mathr   nltk.collocationsr   nltk.lmr	   nltk.lm.preprocessingr
   nltk.metricsr   r   nltk.probabilityr   r>   r   nltk.tokenizer   	nltk.utilr   r   r   r   r   ry   r   r   rj  r  rq   __all__r,   r)   r&   <module>r     s    
 
  8 8   5  ; 7 7 % ' > >MX Xv|- |-~5 5p~( ~(D+4T +4\9< zFr)   