"""
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):

|                       P(label) * P(features|label)
|  P(label|features) = ------------------------------
|                              P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                                         P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                        SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
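
For example, if there are two labels A and B and the numerators work out
to 0.030 for A and 0.010 for B, the classifier reports
P(A|features) = 0.030/0.040 = 0.75 and P(B|features) = 0.010/0.040 = 0.25.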
"""

from collections import defaultdict

from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs


class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    parameterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given that the input
        has the label (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
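
    A small usage sketch (the toy featuresets below are purely
    illustrative)::

        train_set = [({'last_letter': 'a'}, 'female'),
                     ({'last_letter': 'k'}, 'male'),
                     ({'last_letter': 'a'}, 'female')]
        classifier = NaiveBayesClassifier.train(train_set)
        label = classifier.classify({'last_letter': 'a'})      # -> 'female'
        dist = classifier.prob_classify({'last_letter': 'k'})  # a DictionaryProbDist
        p_male = dist.prob('male')                              # P(male|features)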
    """

    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
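
            For example (with purely illustrative names),
            ``feature_probdist[('spam', 'contains_link')]`` would be a
            ``ProbDistI`` over the values that the feature
            ``contains_link`` can take for inputs labeled ``spam``.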
        N)_label_probdist_feature_probdistlistsamples_labels)selflabel_probdistfeature_probdist r   G/var/www/auris/lib/python3.10/site-packages/nltk/classify/naivebayes.py__init__@   s   zNaiveBayesClassifier.__init__c                 C   s   | j S N)r   )r   r   r   r   labelsU   s   zNaiveBayesClassifier.labelsc                 C   s   |  | S r   )prob_classifymax)r   
featuresetr   r   r   classifyX   s   zNaiveBayesClassifier.classifyc                 C   s   |  }t| D ]}| jD ]}||f| jv r nq||= q
i }| jD ]
}| j|||< q$| jD ]/}| D ](\}}||f| jv rV| j||f }||  ||7  < q8||  tg 7  < q8q2t	|dddS )NT)	normalizelog)
copyr   keysr   r
   r	   logprobitemsr   r   )r   r   fnamelabelr   fvalZfeature_probsr   r   r   r   [   s&   



z"NaiveBayesClassifier.prob_classify
   c              	      s   | j  td | |D ]i\ fddt fdd| jD fdddd	}t|d
kr4q|d }|d } |f dkrJd}nd |f  |f   }tdd| d d d| d d |f  qd S )NzMost Informative Featuresc                    s    | f  S r   )prob)lcpdistr    r"   r   r   	labelprob   s   zFNaiveBayesClassifier.show_most_informative_features.<locals>.labelprobc                 3   s(    | ]} |f   v r|V  qd S r   )r   ).0r%   r&   r   r   	<genexpr>   s   & zFNaiveBayesClassifier.show_most_informative_features.<locals>.<genexpr>c                    s    |  | fS r   r   )element)r(   r   r   <lambda>   s    zENaiveBayesClassifier.show_most_informative_features.<locals>.<lambda>T)keyreverse   r   ZINFz%8.1fz"%24s = %-14r %6s : %-6s = %s : 1.0z%s   )r
   printmost_informative_featuressortedr   lenr$   )r   nr   Zl0l1ratior   )r'   r    r"   r(   r   show_most_informative_features|   s0   
"$z3NaiveBayesClassifier.show_most_informative_featuresd   c           	         s   t | dr| jd| S t }tt tdd | j D ]8\\}}}| D ]-}||f}|| |	|}t
| |  |< t|| |< | dkrU|| q(qt| fddd| _| jd| S )a  
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
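
        For example, if P(fname=fval|label) is 0.60 under one label and
        0.02 under another, and these are the extremes over all labels,
        then the informativeness of ``(fname, fval)`` is 0.60 / 0.02 = 30.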
        """
        if hasattr(self, "_most_informative_features"):
            return self._most_informative_features[:n]
        else:
            # The set of (fname, fval) pairs used by this classifier.
            features = set()
            # The max & min probability associated with each (fname, fval)
            # pair.  Maps (fname, fval) -> float.
            maxprob = defaultdict(lambda: 0.0)
            minprob = defaultdict(lambda: 1.0)

            for (label, fname), probdist in self._feature_probdist.items():
                for fval in probdist.samples():
                    feature = (fname, fval)
                    features.add(feature)
                    p = probdist.prob(fval)
                    maxprob[feature] = max(p, maxprob[feature])
                    minprob[feature] = min(p, minprob[feature])
                    if minprob[feature] == 0:
                        features.discard(feature)

            # Convert features to a list, and sort it by how informative
            # the features are.
            self._most_informative_features = sorted(
                features,
                key=lambda feature_: (
                    minprob[feature_] / maxprob[feature_],
                    feature_[0],
                    feature_[1] in [None, False, True],
                    str(feature_[1]).lower(),
                ),
            )
        return self._most_informative_features[:n]

    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
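
            For example (illustrative values only)::

                [({'length': 4, 'last_letter': 'n'}, 'male'),
                 ({'length': 5, 'last_letter': 'a'}, 'female')]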
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count how many times each feature value occurred, given the
        # label and the feature name.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None'.  This loop
        # counts up the number of 'missing' feature values for each
        # (label, fname) pair, and increments the count of the value
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)

        # Create the P(label) distribution.
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution.
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return cls(label_probdist, feature_probdist)


def demo():
    from nltk.classify.util import names_demo

    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()


if __name__ == "__main__":
    demo()