o
    rZh>                     @   s   d Z ddlZddlmZmZ ddlmZ dd Zej	Z
dd ZdZzdd	lmZ W n ey6   d
d ZY nw dZ	 dZ	 dZ	 G dd dedZG dd deZG dd deZG dd deZG dd dZdS )z
Provides scoring functions for a number of association measures through a
generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
    N)ABCMetaabstractmethodreducec                 C   s
   t | S N)_mathlog2)x r
   G/var/www/auris/lib/python3.10/site-packages/nltk/metrics/association.py<lambda>   s   
 r   c                 C   s   t dd | S )Nc                 S   s   | | S r   r
   )r	   yr
   r
   r   r      s    z<lambda>.<locals>.<lambda>r   )sr
   r
   r   r      s    g#B;)fisher_exactc                  O   s   t r   NotImplementedError)_args_kwargsr
   r
   r   r      s   r   c                   @   s   e Zd ZdZdZeedd Zeedd Ze	dd Z
ed	d
 Ze	dd Ze	dd Zedd Ze	dd Ze	dd Ze	dd Ze	dd ZdS )NgramAssocMeasuresa  
    An abstract class defining a collection of generic association measures.
    Each public method returns a score, taking the following arguments::

        score_fn(count_of_ngram,
                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
                 ...,
                 (count_of_1gram_1, ..., count_of_1gram_n),
                 count_of_total_words)

    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``

    Inheriting classes should define a property _n, and a method _contingency
    which calculates contingency values from marginals in order for all
    association measures defined here to be usable.
    r   c                  G      t d)z>Calculates values of a contingency table from marginal values.?The contingency table is not availablein the general ngram caser   	marginalsr
   r
   r   _contingencyB      zNgramAssocMeasures._contingencyc                  G   r   )ACalculates values of contingency table marginals from its values.r   r   contingencyr
   r
   r   
_marginalsJ   r   zNgramAssocMeasures._marginalsc                 #   s^    t }dd t jD }ttD ]t fdd|D | jd   V  qdS )3Calculates expected values for a contingency table.c                 S   s   g | ]}d |> qS )   r
   ).0ir
   r
   r   
<listcomp>V   s    z7NgramAssocMeasures._expected_values.<locals>.<listcomp>c                 3   s4    | ] t  fd dtdj D V  qdS )c                 3   s(    | ]}|@ @ kr | V  qd S r   r
   )r#   r	   )contr$   jr
   r   	<genexpr>]      & z@NgramAssocMeasures._expected_values.<locals>.<genexpr>.<genexpr>   N)sumrange_n)r#   clsr&   r$   )r'   r   r(   \   s
    "
z6NgramAssocMeasures._expected_values.<locals>.<genexpr>r"   N)r+   r,   r-   len_product)r/   r&   n_allbitsr
   r.   r   _expected_valuesR   s   z#NgramAssocMeasures._expected_valuesc                  G   s   | t  | t  S )z Scores ngrams by their frequency)NGRAMTOTALr   r
   r
   r   raw_freqc   s   zNgramAssocMeasures.raw_freqc                 G   s6   |t  t|t |t | jd    |t  t d  S )zScores ngrams using Student's t test with independence hypothesis
        for unigrams, as in Manning and Schutze 5.3.1.
        r"   g      ?)r5   r1   UNIGRAMSr6   r-   _SMALLr/   r   r
   r
   r   	student_th   s
   zNgramAssocMeasures.student_tc                 G   s,   | j | }| |}tdd t||D S )zZScores ngrams using Pearson's chi-square as in Manning and Schutze
        5.3.3.
        c                 s   s(    | ]\}}|| d  |t   V  qdS )r*   N)r9   r#   Zobsexpr
   r
   r   r(   y   r)   z,NgramAssocMeasures.chi_sq.<locals>.<genexpr>)r   r4   r+   zip)r/   r   r&   Zexpsr
   r
   r   chi_sqr   s   

zNgramAssocMeasures.chi_sqc                  O   s    | t  |dd t| t  S )zScores ngrams using a variant of mutual information. The keyword
        argument power sets an exponent (default 3) for the numerator. No
        logarithm of the result is calculated.
        power   )r5   getr1   r8   )r   kwargsr
   r
   r   mi_like{   s   zNgramAssocMeasures.mi_likec                 G   s.   t |t |t | jd   t t|t  S )z^Scores ngrams by pointwise mutual information, as in Manning and
        Schutze 5.4.
        r"   )_log2r5   r6   r-   r1   r8   r:   r
   r
   r   pmi   s   
zNgramAssocMeasures.pmic                 G   s,   | j | }dtdd t|| |D  S )zFScores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.r*   c                 s   s,    | ]\}}|t ||t  t  V  qd S r   )_lnr9   r<   r
   r
   r   r(      s
    
z6NgramAssocMeasures.likelihood_ratio.<locals>.<genexpr>)r   r+   r>   r4   r/   r   r&   r
   r
   r   likelihood_ratio   s   


z#NgramAssocMeasures.likelihood_ratioc                 G   s:   t |t |t | jd   }|t t|t | d  S )z1Scores ngrams using the Poisson-Stirling measure.r"   )r1   r8   r6   r-   r5   rE   )r/   r   r=   r
   r
   r   poisson_stirling   s   z#NgramAssocMeasures.poisson_stirlingc                 G   s"   | j | }|d t|dd  S )z&Scores ngrams using the Jaccard index.r   Nr   )r   r+   rH   r
   r
   r   jaccard   s   
zNgramAssocMeasures.jaccardN)__name__
__module____qualname____doc__r-   staticmethodr   r   r    classmethodr4   r7   r;   r?   rD   rF   rI   rJ   rK   r
   r
   r
   r   r   -   s6    


	

	


r   )	metaclassc                   @   sh   e Zd ZdZdZedd Zedd Zedd Ze	d	d
 Z
e	dd Ze	dd Zedd ZdS )BigramAssocMeasuresa  
    A collection of bigram association measures. Each association measure
    is provided as a function with three arguments::

        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_ii counts ``(w1, w2)``, i.e. the bigram being scored
    - n_ix counts ``(w1, *)``
    - n_xi counts ``(*, w2)``
    - n_xx counts ``(*, *)``, i.e. any bigram

    This may be shown with respect to a contingency table::

                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
    r*   c                 C   s0   |\}}||  }||  }| ||||  | | fS )zECalculates values of a bigram contingency table from marginal values.r
   )n_iin_ix_xi_tuplen_xxn_ixn_xin_oin_ior
   r
   r   r      s   z BigramAssocMeasures._contingencyc                 C   s"   | ||  ||  f|| | |  fS )r   r
   )rT   rY   rZ   n_oor
   r
   r   r       s   "zBigramAssocMeasures._marginalsc                 c   sJ    t | }tdD ]}| | | |dA   | | | |dA    | V  q	dS )r!      r"   r*   N)r+   r,   )r&   rV   r$   r
   r
   r   r4      s
   0z$BigramAssocMeasures._expected_valuesc                 G   sF   | j | \}}}}|| ||  d || ||  ||  ||   S )zdScores bigrams using phi-square, the square of the Pearson correlation
        coefficient.
        r*   )r   )r/   r   rT   rZ   rY   r[   r
   r
   r   phi_sq   s   zBigramAssocMeasures.phi_sqc                 C   s   |\}}||  |||f| S )zScores bigrams using chi-square, i.e. phi-sq multiplied by the number
        of bigrams, as in Manning and Schutze 5.3.3.
        )r]   )r/   rT   rU   rV   rW   rX   r
   r
   r   r?      s   zBigramAssocMeasures.chi_sqc                 G   s2   | j | \}}}}t||g||ggdd\}}|S )zScores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
        sensitive to small counts than PMI or Chi Sq, but also more expensive
        to compute. Requires scipy.
        less)alternative)r   r   )r/   r   rT   rZ   rY   r[   ZoddsZpvaluer
   r
   r   fisher   s   zBigramAssocMeasures.fisherc                 C   s   |\}}d|  ||  S )z(Scores bigrams using Dice's coefficient.r*   r
   )rT   rU   rV   rW   rX   r
   r
   r   dice   s   zBigramAssocMeasures.diceN)rL   rM   rN   rO   r-   rP   r   r    r4   rQ   r]   r?   r`   ra   r
   r
   r
   r   rS      s"    






rS   c                   @   ,   e Zd ZdZdZedd Zedd ZdS )TrigramAssocMeasuresa  
    A collection of trigram association measures. Each association measure
    is provided as a function with four arguments::

        trigram_score_fn(n_iii,
                         (n_iix, n_ixi, n_xii),
                         (n_ixx, n_xix, n_xxi),
                         n_xxx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored
    - n_ixx counts ``(w1, *, *)``
    - n_xxx counts ``(*, *, *)``, i.e. any trigram
    rA   c                 C   s   |\}}}|\}}}	||  }
||  }||  }|	|  |
 | }||  |
 | }||  | | }||  |
 | | | | | }| |
||||||fS )zCalculates values of a trigram contingency table (or cube) from
        marginal values.
        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
        (1, 0, 0, 0, 0, 72, 0, 1927)
        r
   )n_iiiZn_iix_tupleZn_ixx_tupleZn_xxxZn_iixZn_ixiZn_xiiZn_ixxZn_xixZn_xxin_oiin_ioin_iion_ooin_oion_ioon_ooor
   r
   r   r     s   

 z!TrigramAssocMeasures._contingencyc            	      G   s`   | \}}}}}}}}||| || || f|| | | || | | || | | ft | fS )zCalculates values of contingency table marginals from its values.
        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
        (1, (1, 1, 1), (1, 73, 1), 2000)
        r+   )	r   rd   re   rf   rh   rg   ri   rj   rk   r
   r
   r   r    &  s   zTrigramAssocMeasures._marginalsNrL   rM   rN   rO   r-   rP   r   r    r
   r
   r
   r   rc      s    
rc   c                   @   rb   )QuadgramAssocMeasuresaF  
    A collection of quadgram association measures. Each association measure
    is provided as a function with five arguments::

        trigram_score_fn(n_iiii,
                        (n_iiix, n_iixi, n_ixii, n_xiii),
                        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
                        (n_ixxx, n_xixx, n_xxix, n_xxxi),
                        n_all)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored
    - n_ixxi counts ``(w1, *, *, w4)``
    - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram
    r\   c           "      C   s  |\}}}}|\}	}
}}}}|\}}}}||  }||  }||  }||  | | }||  | | }||  | | }||  | | | | | | }||  }||  | | }|
|  | | }||  | | | | | | }|	|  | | }||  | | | | | | }||  | | | | | | } ||  | | | | | | | | | | | | | |  }!| |||||||||||||| |!fS )zXCalculates values of a quadgram contingency table from
        marginal values.
        r
   )"n_iiiiZn_iiix_tupleZn_iixx_tupleZn_ixxx_tupleZn_xxxxn_iiixn_iixin_ixiin_xiiin_iixxn_ixixn_ixxin_xixin_xxiin_xiixn_ixxxn_xixxn_xxixn_xxxin_oiiin_ioiin_iioin_ooiin_oioin_iooin_oooin_iiion_oiion_ioion_ooion_iioon_oioon_iooon_oooor
   r
   r   r   P  s       	
z"QuadgramAssocMeasures._contingencyc                   G   sV  | \}}}}}}}}}	}
}}}}}}||	 }|| }|| }|| }|| |	 | }|| |	 | }|| | | }|| | | }|| | | }|| |	 |
 }|| | |	 | | | | }|| | |	 | |
 | | }|| | |	 | | |
 | }|| | | | | | | }t | }|||||f||||||f||||f|fS )a  Calculates values of contingency table marginals from its values.
        QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
        rl   ) r   ro   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r2   r
   r
   r   r      sN       

z QuadgramAssocMeasures._marginalsNrm   r
   r
   r
   r   rn   9  s    
;rn   c                   @   s$   e Zd ZdZdd Zedd ZdS )ContingencyMeasureszWraps NgramAssocMeasures classes such that the arguments of association
    measures are contingency table values rather than marginals.
    c                 C   sZ   d|j j | j _t|D ]}|drqt||}|ds$| ||}t| || qdS )zAConstructs a ContingencyMeasures given a NgramAssocMeasures classZContingency___N)	__class__rL   dir
startswithgetattr_make_contingency_fnsetattr)selfmeasureskvr
   r
   r   __init__  s   


zContingencyMeasures.__init__c                    s"    fdd}j |_ j|_|S )zFrom an association measure function, produces a new function which
        accepts contingency table values as its arguments.
        c                     s    j |   S r   )r    r   r   old_fnr
   r   res  s   z5ContingencyMeasures._make_contingency_fn.<locals>.res)rO   rL   )r   r   r   r
   r   r   r     s   z(ContingencyMeasures._make_contingency_fnN)rL   rM   rN   rO   r   rP   r   r
   r
   r
   r   r     s
    r   )rO   mathr   abcr   r   	functoolsr   rE   logrG   r1   r9   Zscipy.statsr   ImportErrorr5   r8   r6   r   rS   rc   rn   r   r
   r
   r
   r   <module>   s2   wY< 	