
    /h^                     T    S r  SSKrSS jrS rS rS	S jrS
S jrg! \ a     Nf = f)a  
Text Segmentation Metrics

1. Windowdiff

Pevzner, L., and Hearst, M., A Critique and Improvement of
  an Evaluation Metric for Text Segmentation,
  Computational Linguistics 28, 19-36


2. Generalized Hamming Distance

Bookstein A., Kulyukin V.A., Raita T.
Generalized Hamming Distance
Information Retrieval 5, 2002, pp 353-375

Baseline implementation in C++
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html

Study describing benefits of Generalized Hamming Distance Versus
WindowDiff for evaluating text segmentation tasks
Begsten, Y.  Quel indice pour mesurer l'efficacite en segmentation de textes ?
TALN 2009


3. Pk text segmentation metric

Beeferman D., Berger A., Lafferty J. (1999)
Statistical Models for Text Segmentation
Machine Learning, 34, 177-210
    Nc                    [        U 5      [        U5      :w  a  [        S5      eU[        U 5      :  a  [        S5      eSn[        [        U 5      U-
  S-   5       HS  n[        XXb-    R	                  U5      XXb-    R	                  U5      -
  5      nU(       a  XW-  nMD  U[        SU5      -  nMU     U[        U 5      U-
  S-   -  $ )a  
Compute the windowdiff score for a pair of segmentations.  A
segmentation is any sequence over a vocabulary of two items
(e.g. "0", "1"), where the specified boundary value is used to
mark the edge of a segmentation.

    >>> s1 = "000100000010"
    >>> s2 = "000010000100"
    >>> s3 = "100000010000"
    >>> '%.2f' % windowdiff(s1, s1, 3)
    '0.00'
    >>> '%.2f' % windowdiff(s1, s2, 3)
    '0.30'
    >>> '%.2f' % windowdiff(s2, s3, 3)
    '0.80'

:param seg1: a segmentation
:type seg1: str or list
:param seg2: a segmentation
:type seg2: str or list
:param k: window width
:type k: int
:param boundary: boundary value
:type boundary: str or int or bool
:param weighted: use the weighted variant of windowdiff
:type weighted: boolean
:rtype: float
z!Segmentations have unequal lengthzCWindow width k should be smaller or equal than segmentation lengthsr            ?)len
ValueErrorrangeabscountmin)seg1seg2kboundaryweightedwdindiffs           Q/var/www/auris/envauris/lib/python3.13/site-packages/nltk/metrics/segmentation.py
windowdiffr   1   s    < 4yCI<==3t9}Q
 	
 
B3t9q=1$%DQUO))(3dquo6K6KH6UUVKB#a-B & TQ$%%    c                     [         R                  " X45      nU[         R                  " U5      -  USS S 24'   U[         R                  " U 5      -  US S 2S4'   U$ )Nr   )npemptyarange)nrowsncolsins_costdel_costmats        r   	_init_matr    b   sM    
((E>
"C299U++C1I299U++C1IJr   c                     [        U5       Hn  u  pg[        U5       HZ  u  pU[        Xy-
  5      -  XU4   -   n
Xy:X  a  XU4   nOXy:  a  X@XhS-   4   -   nOX0US-   U4   -   n[        X5      XS-   US-   4'   M\     Mp     g )Nr   )	enumerater	   r   )r   rowvcolvr   r   shift_cost_coeffr   rowijcolj
shift_costtcosts               r   _ghd_auxr+   i   s    T? GA)C,<<sa4yHJ|qD	 qa%x=0 !q1uax=0 #E 6CAq1u ' #r   c                    [        U 5       VVs/ s H  u  pgXu:X  d  M  UPM     nnn[        U5       VVs/ s H  u  pgXu:X  d  M  UPM     n	nn[        U5      n
[        U	5      nU
S:X  a  US:X  a  gU
S:  a
  US:X  a  X-  $ U
S:X  a
  US:  a  X-  $ [        US-   U
S-   X#5      n[        XXX45        [	        US   5      $ s  snnf s  snnf )a  
Compute the Generalized Hamming Distance for a reference and a hypothetical
segmentation, corresponding to the cost related to the transformation
of the hypothetical segmentation into the reference segmentation
through boundary insertion, deletion and shift operations.

A segmentation is any sequence over a vocabulary of two items
(e.g. "0", "1"), where the specified boundary value is used to
mark the edge of a segmentation.

Recommended parameter values are a shift_cost_coeff of 2.
Associated with a ins_cost, and del_cost equal to the mean segment
length in the reference segmentation.

    >>> # Same examples as Kulyukin C++ implementation
    >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
    0.5
    >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
    2.0
    >>> ghd('011', '110', 1.0, 1.0, 0.5)
    1.0
    >>> ghd('1', '0', 1.0, 1.0, 0.5)
    1.0
    >>> ghd('111', '000', 1.0, 1.0, 0.5)
    3.0
    >>> ghd('000', '111', 1.0, 2.0, 0.5)
    6.0

:param ref: the reference segmentation
:type ref: str or list
:param hyp: the hypothetical segmentation
:type hyp: str or list
:param ins_cost: insertion cost
:type ins_cost: float
:param del_cost: deletion cost
:type del_cost: float
:param shift_cost_coeff: constant used to compute the cost of a shift.
    ``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j``
    are the positions indicating the shift
:type shift_cost_coeff: float
:param boundary: boundary value
:type boundary: str or int or bool
:rtype: float
r   g        r   )r-   )r"   r   r    r+   float)refhypr   r   r%   r   r   valref_idxhyp_idx
nref_bound
nhyp_boundr   s                r   ghdr6   y   s    \ "+3CXa3?qGC!*3CXa3?qGCWJWJQ:?	aJ!O$$	qZ!^$$
JNJNH
GCS7hIV DCs   B>B>CCc                 R   Uc2  [        [        [        U 5      U R                  U5      S-  -  5      5      nSn[	        [        U 5      U-
  S-   5       H?  nXXR-    R                  U5      S:  nXXR-    R                  U5      S:  nXg:w  d  M:  US-  nMA     U[        U 5      U-
  S-   -  $ )a  
Compute the Pk metric for a pair of segmentations A segmentation
is any sequence over a vocabulary of two items (e.g. "0", "1"),
where the specified boundary value is used to mark the edge of a
segmentation.

>>> '%.2f' % pk('0100'*100, '1'*400, 2)
'0.50'
>>> '%.2f' % pk('0100'*100, '0'*400, 2)
'0.50'
>>> '%.2f' % pk('0100'*100, '0100'*100, 2)
'0.00'

:param ref: the reference segmentation
:type ref: str or list
:param hyp: the segmentation to evaluate
:type hyp: str or list
:param k: window size, if None, set to half of the average reference segment length
:type boundary: str or int or bool
:param boundary: boundary value
:type boundary: str or int or bool
:rtype: float
       @r   r   r   )introundr   r
   r   )r/   r0   r   r   errr   rhs           r   pkr>      s    2 	yc#h#))H"5";<=>
C3s8a<!#$AEN  *Q.AEN  *Q.61HC	 %
 #c(Q,$%%r   )1F)r8   r8   r   r?   )Nr?   )	__doc__numpyr   ImportErrorr   r    r+   r6   r>    r   r   <module>rD      sC   @	
+&b7 =F"&_  		s    ''