o
    rZh!                     @   s   d Z ddlZzddlmZ ddlmZ W n ey)   dd Zdd Zd	d
 ZY nw e	dZ
G dd dZdd Zdd ZefddZefddZdd Zdd ZdS )z

A port of the Gale-Church Aligner.

Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
https://aclweb.org/anthology/J93-1004.pdf

    N)logsf)normc                 C   s   t | }ddd|   }|t| | d |d|d|d|d|d|d	|d
|d|d                   }| dkrB|S d| S )zComplementary error function.         ?gś??g5 ?g`yg?gƸ?gꪂIǿg#v?g9)gS?gޅ1Ogv(?g        g       @)absmathexp)xztr r   I/var/www/auris/lib/python3.10/site-packages/nltk/translate/gale_church.pyerfcc   sL   r   c                 C   s   ddt | td   S )u>   Return the area under the normal distribution from M{-∞..x}.r   r      )r   r   sqrtr	   r   r   r   norm_cdf@   s   r   c                 C   s0   z
t dt|  W S  ty   td Y S w )Nr   -inf)r   logr   
ValueErrorfloatr   r   r   r   
norm_logsfD   s
   r   r   c                   @   s&   e Zd ZdddddddZdZdZdS )	LanguageIndependentgׁsF?g{Gz?gbX9ȶ?gI+?))r   r   )r   r   )r   r   )r   r   )r   r   )r   r   r   g333333@N)__name__
__module____qualname__PRIORSAVERAGE_CHARACTERSVARIANCE_CHARACTERSr   r   r   r   r   N   s    	r   c           	      C   s   g }t |t |f}|dkrotdd |D roz| | \}}W n ty4   |d d |d d f}Y q
w t|D ]}t|D ]}||d | d |d | d f q?q9|d | |d | f}|dkrotdd |D s|ddd S )a  
    Traverse the alignment cost from the tracebacks and retrieves
    appropriate sentence pairs.

    :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS)
    :type backlinks: dict
    :param source_sents_lens: A list of target sentences' lengths
    :type source_sents_lens: list(int)
    :param target_sents_lens: A list of target sentences' lengths
    :type target_sents_lens: list(int)
    )r   r   c                 s   s    | ]}|d kV  qdS )r   Nr   ).0pr   r   r   	<genexpr>n       ztrace.<locals>.<genexpr>r   r   N)lenall	TypeErrorrangeappend)		backlinkssource_sents_lenstarget_sents_lenslinkspositionsr   ijr   r   r   trace`   s    (r2   c           
         s   t  fddt|d D }t fddt|d D }z|||j  d }||j | t||j  }	W n tyG   td Y S w tt	t
|	 t|j|   S )aP  Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
    being aligned with a specific C{alignment}.

    @param i: The offset of the source sentence.
    @param j: The offset of the target sentence.
    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param alignment: The alignment type, a tuple of two integers.
    @param params: The sentence alignment parameters.

    @returns: The log probability of a specific alignment between the two sentences, given the parameters.
    c                 3        | ]} | d   V  qdS r   Nr   r    offset)r0   source_sentsr   r   r"          z!align_log_prob.<locals>.<genexpr>r   c                 3   r3   r4   r   r5   )r1   target_sentsr   r   r"      r8   r   r   r   )sumr(   r   r   r   r   ZeroDivisionErrorr   LOG2r   r   r   r   )
r0   r1   r7   r9   Z	alignmentparamsZl_sZl_tmdeltar   )r0   r1   r7   r9   r   align_log_prob|   s     
"r@   c                 C   s  t |j }g g}i }tt| d D ]l}tt|d D ]Q}td}d}	|D ]1}
d|
d  }||
d  }|t| k sA|dk rBq(|| | t||| ||
| }||k rY|}|
}	q(|tdkrbd}|	|||f< |d | qt|dkr{|d |g  qt	|| |S )a  Return the sentence alignment of two text blocks (usually paragraphs).

        >>> align_blocks([5,5,5], [7,7,7])
        [(0, 0), (1, 1), (2, 2)]
        >>> align_blocks([10,5,5], [12,20])
        [(0, 0), (1, 1), (2, 1)]
        >>> align_blocks([12,20], [10,5,5])
        [(0, 0), (1, 1), (1, 2)]
        >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
        [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    r   infNr$   r   r   )
listr   keysr(   r%   r   r@   r)   popr2   )r+   r,   r=   Zalignment_typesDr*   r0   r1   Zmin_distZ	min_alignaZprev_iZprev_jr!   r   r   r   align_blocks   s6   
rG   c                    s0   t | t |krtd fddt| |D S )a  Creates the sentence alignment of two texts.

    Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
    alignment links.

    Each block consists of a list that contains the lengths (in characters) of the sentences
    in this block.

    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.

    @returns: A list of sentence alignment lists
    z>Source and target texts do not have the same number of blocks.c                    s   g | ]
\}}t || qS r   )rG   )r    Zsource_blockZtarget_blockr=   r   r   
<listcomp>   s    
zalign_texts.<locals>.<listcomp>)r%   r   zip)Zsource_blocksZtarget_blocksr=   r   rH   r   align_texts   s   
rK   c                 #   s"     fdd}	 |   V  q	)zSplits an iterator C{it} at values of C{split_value}.

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used.
    c                 3   s,    | }|kr|V     }|ksd S d S Nnext)firstvitsplit_valuer   r   _chunk_iterator   s   z!split_at.<locals>._chunk_iteratorrM   )rR   rS   rT   r   rQ   r   split_at   s
   rU   c                    s    fddt | |D S )zParses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
    c                    s    g | ]}d d t | D qS )c                 S   s   g | ]}t d d |D qS )c                 s   s    | ]}t |V  qd S rL   )r%   )r    tokenr   r   r   r"     r#   z;parse_token_stream.<locals>.<listcomp>.<listcomp>.<genexpr>)r:   )r    Zsentence_itr   r   r   rI     s    z1parse_token_stream.<locals>.<listcomp>.<listcomp>rU   )r    Zblock_itsoft_delimiterr   r   rI     s    z&parse_token_stream.<locals>.<listcomp>rW   )streamrY   Zhard_delimiterr   rX   r   parse_token_stream   s   
r[   )__doc__r   r   r   r   Zscipy.statsImportErrorr   r   r   r<   r   r2   r@   rG   rK   rU   r[   r   r   r   r   <module>   s$   	'
46