o
    rZhn                     @   s,  d Z ddlZddlZddlZddlmZmZmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ edg dZG dd dZ G dd dZ!G dd dZ"G dd dZ#G dd de#Z$dd Z%e&dkre%  g dZ'dS )a  
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
    N)Counterdefaultdict
namedtuple)reduce)log)BigramCollocationFinder)MLE)padded_everygram_pipeline)BigramAssocMeasures	f_measure)ConditionalFreqDist)FreqDist)sent_tokenize)LazyConcatenation
cut_string	tokenwrapConcordanceLine)leftqueryrightoffset
left_printright_printlinec                   @   sT   e Zd ZdZedd Zdddd fddZd	d
 Zdd ZdddZ	dddZ
dS )ContextIndexa  
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    c                 C   sH   |dkr| |d    nd}|t| d kr| |d    nd}||fS )z;One left token and one right token, normalized to lowercaser      *START**END*)lowerlen)tokensir   r    r"   8/var/www/auris/lib/python3.10/site-packages/nltk/text.py_default_context/   s   $zContextIndex._default_contextNc                 C      | S Nr"   xr"   r"   r#   <lambda>6       zContextIndex.<lambda>c                    sv   |_ _|r|_nj_ r fddD tfddtD _tfddtD _d S )Nc                    s   g | ]} |r|qS r"   r"   ).0t)filterr"   r#   
<listcomp>>       z)ContextIndex.__init__.<locals>.<listcomp>c                 3   s*    | ]\}}  | |fV  qd S r&   )_key_context_funcr+   r!   wselfr    r"   r#   	<genexpr>?       
z(ContextIndex.__init__.<locals>.<genexpr>c                 3   s*    | ]\}}  | |fV  qd S r&   )r1   r0   r2   r4   r"   r#   r6   B   r7   )r0   _tokensr1   r$   CFD	enumerate_word_to_contexts_context_to_words)r5   r    Zcontext_funcr-   keyr"   )r-   r5   r    r#   __init__6   s   
zContextIndex.__init__c                 C      | j S )zw
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        r8   r5   r"   r"   r#   r    F      zContextIndex.tokensc                 C   sF   |  |}t| j| }i }| j D ]\}}t|t|||< q|S )z
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        )r0   setr;   itemsr   )r5   wordZword_contextsscoresr3   Z
w_contextsr"   r"   r#   word_similarity_dictN   s   
z!ContextIndex.word_similarity_dict   c                 C   sv   t t}| j| | D ]"}| j| D ]}||kr-||  | j| | | j| |  7  < qqt||jddd | S )NT)r=   reverse)r   intr;   r0   r<   sortedget)r5   rE   nrF   cr3   r"   r"   r#   similar_words]   s   zContextIndex.similar_wordsFc                    s   fddD fddD fddt tD }ttj |r2|r2tdd s7t S t fddD }|S )	a  
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        c                       g | ]}  |qS r"   )r0   r+   r3   rA   r"   r#   r.   r       z0ContextIndex.common_contexts.<locals>.<listcomp>c                    s   g | ]	}t  j| qS r"   )rC   r;   rQ   rA   r"   r#   r.   s   s    c                    s   g | ]
} | s| qS r"   r"   )r+   r!   )contextswordsr"   r#   r.   t   s    z%The following word(s) were not found: c                 3   s,    | ]}j | D ]	}| v r	|V  q	qd S r&   )r;   r+   r3   rN   )commonr5   r"   r#   r6   |   s    
z/ContextIndex.common_contexts.<locals>.<genexpr>)ranger   r   rC   intersection
ValueErrorjoinr   )r5   rT   Zfail_on_unknownemptyfdr"   )rW   rS   r5   rT   r#   common_contextsg   s   zContextIndex.common_contextsrH   )F)__name__
__module____qualname____doc__staticmethodr$   r>   r    rG   rO   r^   r"   r"   r"   r#   r   '   s    


r   c                   @   sL   e Zd ZdZdd fddZdd Zdd	 Zd
d ZdddZdddZ	dS )ConcordanceIndexzs
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
    c                 C   r%   r&   r"   r'   r"   r"   r#   r)      r*   zConcordanceIndex.<lambda>c                 C   sL   || _ 	 || _	 tt| _	 t|D ]\}}| |}| j| | qdS )a  
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        N)r8   r0   r   list_offsetsr:   append)r5   r    r=   indexrE   r"   r"   r#   r>      s   

zConcordanceIndex.__init__c                 C   r?   )z{
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        r@   rA   r"   r"   r#   r       rB   zConcordanceIndex.tokensc                 C   s   |  |}| j| S )z
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then given word's key will be looked up.
        )r0   rg   r5   rE   r"   r"   r#   offsets   s   

zConcordanceIndex.offsetsc                 C   s   dt | jt | jf S )Nz+<ConcordanceIndex for %d tokens (%d types)>)r   r8   rg   rA   r"   r"   r#   __repr__   s   zConcordanceIndex.__repr__P   c              	      sR  t |tr|}n|g}d|}tdd |D }|| d d }|d }g }| |d }	t|dd D ]\ } fd	d
| |D }
t|
|	}	q6|	r|	D ]T d| j  t	|  }| jt
d |   }| j t	|  |  }td|| |}td||}d|||g}t||| |||}|| qR|S )z
        Find all concordance lines given the query word.

        Provided with a list of words, these will be found as a phrase.
        rU   c                 s   s    | ]
}t |sd V  qdS )r   N)unicodedata	combining)r+   charr"   r"   r#   r6      s    z4ConcordanceIndex.find_concordance.<locals>.<genexpr>      r   r   Nc                    s   h | ]}|  d  qS )r   r"   )r+   r   r!   r"   r#   	<setcomp>   r/   z4ConcordanceIndex.find_concordance.<locals>.<setcomp>)
isinstancerf   r[   sumrk   r:   rK   rY   r8   r   maxr   rjustr   rh   )r5   rE   widthphraseZ
phrase_strZ
phrase_lenZ
half_widthcontextconcordance_listrk   Zword_offsetsZ
query_wordZleft_contextZright_contextr   r   Z
line_printconcordance_liner"   rs   r#   find_concordance   sB   

	z!ConcordanceIndex.find_concordance   c                 C   sn   | j ||d}|std dS t|t|}td| dt| d t|d| D ]	\}}t|j q+dS )a  
        Print concordance lines given the query word.
        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param lines: The number of lines to display (default=25)
        :type lines: int
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param save: The option to save the concordance.
        :type save: bool
        )ry   z
no matcheszDisplaying z of z	 matches:N)r~   printminr   r:   r   )r5   rE   ry   linesr|   r!   r}   r"   r"   r#   print_concordance   s   z"ConcordanceIndex.print_concordanceN)rm   )rm   r   )
r`   ra   rb   rc   r>   r    rk   rl   r~   r   r"   r"   r"   r#   re      s    

0re   c                   @   s    e Zd ZdZdd Zdd ZdS )TokenSearchera  
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    c                 C   s   d dd |D | _d S )N c                 s   s    | ]	}d | d V  qdS )<>Nr"   rQ   r"   r"   r#   r6     s    z)TokenSearcher.__init__.<locals>.<genexpr>)r[   _rawr4   r"   r"   r#   r>     s   zTokenSearcher.__init__c                 C   s~   t dd|}t dd|}t dd|}t dd|}t || j}|D ]}|ds5|dr5td	q%d
d |D }|S )a  
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> from nltk.book import text1, text5, text9
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        z\sr   r   z(?:<(?:r   z)>)z	(?<!\\)\.z[^>]z$Bad regexp for TokenSearcher.findallc                 S   s   g | ]}|d d  dqS )r   z><splitr+   hr"   r"   r#   r.   5  s    z)TokenSearcher.findall.<locals>.<listcomp>)resubfindallr   
startswithendswithrZ   )r5   regexphitsr   r"   r"   r#   r     s   zTokenSearcher.findallN)r`   ra   rb   rc   r>   r   r"   r"   r"   r#   r     s    
r   c                   @   s   e Zd ZdZdZd6ddZdd Zdd	 Zd7ddZd7ddZ	d8ddZ
d8ddZdd Zdd Zdd Zd9ddZd9ddZd d! Zd:d#d$Zd;d'd(Zd)d* Zd+d, Zd-d. Zed/Zd0d1 Zd2d3 Zd4d5 ZdS )<Texta  
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    TNc                 C   s   | j rt|}|| _|r|| _dS d|dd v r3|dd d}ddd |d| D | _dS ddd |dd	 D d
 | _dS )zv
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        ]NrH   rU   c                 s       | ]}t |V  qd S r&   strr+   tokr"   r"   r#   r6   b      z Text.__init__.<locals>.<genexpr>r   c                 s   r   r&   r   r   r"   r"   r#   r6   d  r      z...)_COPY_TOKENSrf   r    nameri   r[   )r5   r    r   endr"   r"   r#   r>   S  s   
"&zText.__init__c                 C   s
   | j | S r&   )r    )r5   r!   r"   r"   r#   __getitem__j     
zText.__getitem__c                 C   s
   t | jS r&   )r   r    rA   r"   r"   r#   __len__m  r   zText.__len__O   r   c                 C   s.   d| j vrt| jdd d| _| j|||S )a  
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        _concordance_indexc                 S      |   S r&   r   sr"   r"   r#   r)         z"Text.concordance.<locals>.<lambda>r=   )__dict__re   r    r   r   r5   rE   ry   r   r"   r"   r#   concordancet  s
   

zText.concordancec                 C   s4   d| j vrt| jdd d| _| j||d| S )a  
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        r   c                 S   r   r&   r   r   r"   r"   r#   r)     r   z'Text.concordance_list.<locals>.<lambda>r   N)r   re   r    r   r~   r   r"   r"   r#   r|     s
   

zText.concordance_listrH   rq   c                    s   d| j v r| j|kr| j|ksB|| _|| _ddlm} |d t| j|}|	d |
 fdd t }t||j|| _| jS )a  
        Return collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocation_list()[:2]
            [('United', 'States'), ('fellow', 'citizens')]

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        :rtype: list(tuple(str, str))
        _collocationsr   )	stopwordsenglishrq   c                    s   t | dk p|   v S )N   )r   r   )r3   Zignored_wordsr"   r#   r)     r/   z'Text.collocation_list.<locals>.<lambda>)r   Z_numZ_window_sizenltk.corpusr   rT   r   Z
from_wordsr    Zapply_freq_filterZapply_word_filterr
   rf   ZnbestZlikelihood_ratior   )r5   numwindow_sizer   finderZbigram_measuresr"   r   r#   collocation_list  s   




zText.collocation_listc                 C   s*   dd |  ||D }tt|dd dS )a  
        Print collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocations() # doctest: +NORMALIZE_WHITESPACE
            United States; fellow citizens; years ago; four years; Federal
            Government; General Government; American people; Vice President; God
            bless; Chief Justice; one another; fellow Americans; Old World;
            Almighty God; Fellow citizens; Chief Magistrate; every citizen; Indian
            tribes; public debt; foreign nations


        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        c                 S   s   g | ]
\}}|d  | qS rU   r"   r+   Zw1Zw2r"   r"   r#   r.     s    z%Text.collocations.<locals>.<listcomp>; )	separatorN)r   r   r   )r5   r   r   Zcollocation_stringsr"   r"   r#   collocations  s   
zText.collocationsc                 C      | j |S )zJ
        Count the number of times this word appears in the text.
        )r    countrj   r"   r"   r#   r        z
Text.countc                 C   r   )zQ
        Find the index of the first occurrence of the word in the text.
        )r    ri   rj   r"   r"   r#   ri     r   z
Text.indexc                 C   s   t r&   )NotImplementedError)r5   methodr"   r"   r#   readability  s   zText.readabilityc                    s   d| j vrt| jdd dd d| _ | jj v rGt  t fdd D }dd	 |	|D }t
t| dS t
d
 dS )a~  
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        _word_context_indexc                 S   r   r&   )isalphar'   r"   r"   r#   r)     r   zText.similar.<locals>.<lambda>c                 S   r   r&   r   r   r"   r"   r#   r)     r   )r-   r=   c                 3   s2    | ]}| D ]}| v r|ks|V  qqd S r&   r"   rV   rS   ZwcirE   r"   r#   r6     s    zText.similar.<locals>.<genexpr>c                 S      g | ]\}}|qS r"   r"   r+   r3   _r"   r"   r#   r.         z Text.similar.<locals>.<listcomp>z
No matchesN)r   r   r    r   r   r;   Z
conditionsrC   r   most_commonr   r   )r5   rE   r   r]   rT   r"   r   r#   similar  s   
zText.similarc              
   C   s   d| j vrt| jdd d| _z(| j|d}|s td W dS dd ||D }ttd	d
 |D  W dS  tyN } zt| W Y d}~dS d}~ww )aY  
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        r   c                 S   r   r&   r   r   r"   r"   r#   r)     r   z&Text.common_contexts.<locals>.<lambda>r   TzNo common contexts were foundc                 S   r   r"   r"   r   r"   r"   r#   r.      r   z(Text.common_contexts.<locals>.<listcomp>c                 s   s     | ]\}}|d  | V  qdS )r   Nr"   r   r"   r"   r#   r6   !  s    z'Text.common_contexts.<locals>.<genexpr>N)	r   r   r    r   r^   r   r   r   rZ   )r5   rT   r   r]   Zranked_contextser"   r"   r#   r^   
  s   

zText.common_contextsc                 C   s   ddl m} || | dS )z
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        r   )dispersion_plotN)Z	nltk.drawr   )r5   rT   r   r"   r"   r#   r   &  s   	zText.dispersion_plotr   c                 C   s(   t ||\}}t|d}||| |S )N)order)r	   r   fit)r5   Ztokenized_sentsrM   Z
train_dataZpadded_sentsmodelr"   r"   r#   _train_default_ngram_lm3  s   
zText._train_default_ngram_lmd   *   c           	      C   s   dd t d| jD | _t| ds#tdtjd | j| jdd| _	g }|d	ks-J d
t
||k r]t| j	j|||dD ]\}}|dkrGq>|dkrM n|| q>|d7 }t
||k s3|rfd|d nd}|t|d|  }t| |S )a  
        Print random text, generated using a trigram language model.
        See also `help(nltk.lm)`.

        :param length: The length of text to generate (default=100)
        :type length: int

        :param text_seed: Generation can be conditioned on preceding context.
        :type text_seed: list(str)

        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible. (default=42)
        :type random_seed: int
        c                 S   s   g | ]}| d qS r   r   )r+   sentr"   r"   r#   r.   I  s    
z!Text.generate.<locals>.<listcomp>rU   _trigram_modelzBuilding ngram index...)filer   )rM   r   z!The `length` must be more than 0.)	text_seedrandom_seedz<s>z</s>r   r   N)r   r[   r    Z_tokenized_sentshasattrr   sysstderrr   r   r   r:   generaterh   r   )	r5   lengthr   r   Zgenerated_tokensidxtokenprefixZ
output_strr"   r"   r#   r   9  s6   
zText.generatec                 G   s   |   j| S )zc
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        )vocabplot)r5   argsr"   r"   r#   r   g  s   z	Text.plotc                 C   s   d| j vr
t| | _| jS )z.
        :seealso: nltk.prob.FreqDist
        _vocab)r   r   r   rA   r"   r"   r#   r   n  s   

z
Text.vocabc                 C   s@   d| j vr
t| | _| j|}dd |D }tt|d dS )a  
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.book import text1, text5, text9
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        _token_searcherc                 S   s   g | ]}d  |qS r   )r[   r   r"   r"   r#   r.     rR   z Text.findall.<locals>.<listcomp>r   N)r   r   r   r   r   r   )r5   r   r   r"   r"   r#   r   w  s
   

zText.findallz\w+|[\.\!\?]c                 C   s   |d }|dkr | j || s |d8 }|dkr | j || r|dkr(|| nd}|d }|t|k rN| j || sN|d7 }|t|k rN| j || r<|t|krX|| nd}||fS )z
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        r   r   r   r   )_CONTEXT_REmatchr   )r5   r    r!   jr   r   r"   r"   r#   _context  s   zText._contextc                 C   
   d| j  S Nz
<Text: %s>r   rA   r"   r"   r#   __str__  r   zText.__str__c                 C   r   r   r   rA   r"   r"   r#   rl     r   zText.__repr__r&   )r   r   )rH   rq   r_   )r   )r   Nr   )r`   ra   rb   rc   r   r>   r   r   r   r|   r   r   r   ri   r   r   r^   r   r   r   r   r   r   r   compiler   r   r   rl   r"   r"   r"   r#   r   9  s2    




#

"

.	
"r   c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )TextCollectiona;  A collection of texts, which can be loaded with list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> from nltk.book import text1, text2, text3
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
    c                    s@   t  dr fdd  D   | _t| t  i | _d S )NrT   c                    rP   r"   )rT   )r+   fsourcer"   r#   r.     rR   z+TextCollection.__init__.<locals>.<listcomp>)r   Zfileids_textsr   r>   r   
_idf_cache)r5   r   r"   r   r#   r>     s
   

zTextCollection.__init__c                 C   s   | |t| S )z"The frequency of the term in text.)r   r   r5   termtextr"   r"   r#   tf  s   zTextCollection.tfc                    sj   | j  }|du r3t fdd| jD }t| jdkr!td|r,tt| j| nd}|| j  < |S )zThe number of texts in the corpus divided by the
        number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned.Nc                    s   g | ]} |v rd qS )Tr"   )r+   r   r   r"   r#   r.     r/   z&TextCollection.idf.<locals>.<listcomp>r   z+IDF undefined for empty document collectiong        )r   rL   r   r   rZ   r   )r5   r   idfmatchesr"   r   r#   r     s   
zTextCollection.idfc                 C   s   |  ||| | S r&   )r   r   r   r"   r"   r#   tf_idf  s   zTextCollection.tf_idfN)r`   ra   rb   rc   r>   r   r   r   r"   r"   r"   r#   r     s    r   c                  C   s   ddl m}  t| jdd}t| t  td |d t  td |d t  td |  t  td |g d	 t  td
 |	d t  td td|d  td|dd  td|
 d  d S )Nr   )brownnews)
categorieszConcordance:zDistributionally similar words:zCollocations:zDispersion plot:)r   reportZsaidZ	announcedzVocabulary plot:2   z	Indexing:ztext[3]:r   z
text[3:5]:   ztext.vocab()['news']:)r   r   r   rT   r   r   r   r   r   r   r   )r   r   r"   r"   r#   demo  s.   


r  __main__)r   re   r   r   r   )(rc   r   r   rn   collectionsr   r   r   	functoolsr   mathr   Znltk.collocationsr   Znltk.lmr   Znltk.lm.preprocessingr	   Znltk.metricsr
   r   Znltk.probabilityr   r9   r   Znltk.tokenizer   Z	nltk.utilr   r   r   r   r   re   r   r   r   r  r`   __all__r"   r"   r"   r#   <module>   s>   [8   .