o
    rZh                      @   s   d Z ddlZddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ G d	d
 d
edZdd Zdd ZdddZG dd dedZdS )zLanguage Model Interface.    N)ABCMetaabstractmethod)bisect)
accumulate)NgramCounter)	log_base2)
Vocabularyc                   @   s0   e Zd ZdZdd Zedd Zedd ZdS )		SmoothingzNgram Smoothing Interface

    Implements Chen & Goodman 1995's idea that all smoothing algorithms have
    certain features in common. This should ideally allow smoothing algorithms to
    work both with Backoff and Interpolation.
    c                 C   s   || _ || _dS )z
        :param vocabulary: The Ngram vocabulary object.
        :type vocabulary: nltk.lm.vocab.Vocabulary
        :param counter: The counts of the vocabulary items.
        :type counter: nltk.lm.counter.NgramCounter
        N)vocabcounts)self
vocabularycounter r   :/var/www/auris/lib/python3.10/site-packages/nltk/lm/api.py__init__   s   
zSmoothing.__init__c                 C      t  NNotImplementedError)r   wordr   r   r   unigram_score&      zSmoothing.unigram_scorec                 C   r   r   r   r   r   contextr   r   r   alpha_gamma*   r   zSmoothing.alpha_gammaN)__name__
__module____qualname____doc__r   r   r   r   r   r   r   r   r	      s    

r	   )	metaclassc                 C   s   t | t|  S )z0Return average (aka mean) for sequence of items.)sumlen)itemsr   r   r   _mean/   s   r$   c                 C   s   t | tjr| S t| S r   )
isinstancerandomRandom)Zseed_or_generatorr   r   r   _random_generator4   s   
r(   c                 C   sR   | st dt| t|krt dtt|}|d }| }| t|||  S )z`Like random.choice, but with weights.

    Heavily inspired by python 3.6 `random.choices`.
    z"Can't choose from empty populationz3The number of weights does not match the population)
ValueErrorr"   listr   r&   r   )
populationweightsrandom_generatorcum_weightstotal	thresholdr   r   r   _weighted_choice:   s   r2   c                   @   sh   e Zd ZdZdddZdddZdddZedd	d
ZdddZ	dd Z
dd Zdd ZdddZdS )LanguageModelzKABC for Language Models.

    Cannot be directly instantiated itself.

    Nc                 C   s`   || _ |rt|tstjd| jjddd |du rt n|| _|du r+t | _	dS || _	dS )ap  Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
            of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text are turned to ngram
            sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how sentences in training text are padded.
        :type pad_fn: function or None
        z$The `vocabulary` argument passed to z- must be an instance of `nltk.lm.Vocabulary`.   )
stacklevelN)
orderr%   r   warningswarn	__class__r   r
   r   r   )r   r6   r   r   r   r   r   r   P   s   zLanguageModel.__init__c                    s@    j s|du rtd j |  j fdd|D  dS )zeTrains the model on a text.

        :param text: Training text as a sequence of sentences.

        Nz:Cannot fit without a vocabulary or text to create it from.c                 3   s    | ]	} j |V  qd S r   )r
   lookup).0sentr   r   r   	<genexpr>t       z$LanguageModel.fit.<locals>.<genexpr>)r
   r*   updater   )r   textZvocabulary_textr   r=   r   fith   s   zLanguageModel.fitc                 C   s&   |  | j||r| j|S dS )zMasks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        N)unmasked_scorer
   r:   r   r   r   r   scorev   s
   zLanguageModel.scorec                 C   r   )a  Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param tuple(str) context: Context the word is in.
            If `None`, compute unigram score.
        :param context: tuple(str) or None
        :rtype: float
        r   r   r   r   r   rC      s   zLanguageModel.unmasked_scorec                 C   s   t | ||S )zEvaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        )r   rD   r   r   r   r   logscore      zLanguageModel.logscorec                 C   s"   |r| j t|d  | S | j jS )zHelper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

           )r   r"   Zunigrams)r   r   r   r   r   context_counts   s   zLanguageModel.context_countsc                    s   dt  fdd|D  S )a?  Calculate cross-entropy of model for given evaluation text.

        This implementation is based on the Shannon-McMillan-Breiman theorem,
        as used and referenced by Dan Jurafsky and Jordan Boyd-Graber.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        r)   c                    s$   g | ]}  |d  |dd  qS )r)   N)rE   )r;   Zngramr=   r   r   
<listcomp>   s   $ z)LanguageModel.entropy.<locals>.<listcomp>)r$   r   Ztext_ngramsr   r=   r   entropy   s   
zLanguageModel.entropyc                 C   s   t d| |S )zCalculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        g       @)powrK   rJ   r   r   r   
perplexity   rF   zLanguageModel.perplexityrG   c                    s   |du rg nt |}t|}|dkrat|jkr#|j d d n| j } rM|sMt dkr> dd ng  j } rM|r2t|}t|t	 fdd|D |S g }t
|D ]}|jd|| |d qg|S )a  Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'

        NrG   c                 3   s    | ]	} | V  qd S r   )rD   )r;   wr   r   r   r   r>      r?   z)LanguageModel.generate.<locals>.<genexpr>)	num_words	text_seedrandom_seed)r+   r(   r"   r6   rH   r
   r:   sortedr2   tuplerangeappendgenerate)r   rP   rQ   rR   r.   Zsamples	generated_r   rO   r   rW      s8   zLanguageModel.generate)NNr   )rG   NN)r   r   r   r   r   rB   rD   r   rC   rE   rH   rK   rM   rW   r   r   r   r   r3   I   s    




r3   r   )r   r&   r7   abcr   r   r   	itertoolsr   Znltk.lm.counterr   Znltk.lm.utilr   Znltk.lm.vocabularyr   r	   r$   r(   r2   r3   r   r   r   r   <module>   s   
