o
    Zh"                     @   s   d Z ddlZG dd dZdS )z"English Normalizer class for CLVP.    Nc                   @   s   e Zd Zdd ZdedefddZdedefdd	Zd
edefddZd
edefddZ	d
edefddZ
dedefddZd
edefddZdedefddZdedefddZdedefddZdd ZdS )EnglishNormalizerc                 C   s2   dd dD | _ g d| _g d| _g d| _d S )Nc                 S   s*   g | ]}t d |d  t j|d fqS )z\b%s\.r      )recompile
IGNORECASE).0x r	   Y/var/www/auris/lib/python3.10/site-packages/transformers/models/clvp/number_normalizer.py
<listcomp>   s    z.EnglishNormalizer.__init__.<locals>.<listcomp>))ZmrsZmisess)mrZmister)ZdrZdoctor)stZsaint)coZcompany)ZjrZjunior)Zmajmajor)genZgeneral)ZdrsZdoctors)revZreverend)ltZ
lieutenant)ZhonZ	honorable)ZsgtZsergeant)ZcaptZcaptain)ZesqZesquire)ZltdZlimited)colZcolonel)ftZfort)
 onetwothreefourfivesixseveneightnine)
tenZelevenZtwelveZthirteenZfourteenZfifteenZsixteenZ	seventeenZeighteenZnineteen)
r   r   ZtwentyZthirtyZfortyZfiftyZsixtyZseventyZeightyZninety)_abbreviationsonesteenstens)selfr	   r	   r
   __init__   s   

zEnglishNormalizer.__init__numreturnc                 C   s   |dkrdS |dk rd|  t| S |dk r| j| S |dk r'| j|d  S |dk rD| j|d  |d dkrAd|  |d   S d S |d	k rc| j|d  d
 |d dkr`d|  |d   S d S |dk r|  |d	 d |d	 dkrd|  |d	   S d S |dk r|  |d d |d dkrd|  |d   S d S |dk r|  |d d |d dkrd|  |d   S d S |dk r|  |d d |d dkrd|  |d   S d S |dk r|  |d d |d dkrd|  |d   S d S dS )ax  
        Converts numbers(`int`) to words(`str`).

        Please note that it only supports upto - "'nine hundred ninety-nine quadrillion, nine hundred ninety-nine
        trillion, nine hundred ninety-nine billion, nine hundred ninety-nine million, nine hundred ninety-nine
        thousand, nine hundred ninety-nine'" or `number_to_words(999_999_999_999_999_999)`.
        r   zerozminus 
      d   -r      hundred i@B z	 thousandz, i ʚ;z millionl    J)z billionl     I5 z	 trillionl     NZoz quadrillionznumber out of range)number_to_wordsabsr!   r"   r#   )r$   r&   r	   r	   r
   r0   ?   s   
2.
z!EnglishNormalizer.number_to_wordstextc                 C   s   | dddS )z+
        Converts unicode to ascii
        asciiignorezutf-8)encodedecoder$   r2   r	   r	   r
   convert_to_asciiz      z"EnglishNormalizer.convert_to_asciimc                 C   s   | d}|d}t|dkr|d S |d rt|d nd}t|dkr0|d r0t|d nd}|rN|rN|dkr<dnd}|dkrDdnd	}d
||||f S |r^|dkrVdnd}d||f S |rn|dkrfdnd	}d||f S dS )zZ
        This method is used to expand numerical dollar values into spoken words.
        r   .   z dollarsr   dollardollarscentcentsz%s %s, %s %sz%s %szzero dollars)groupsplitlenint)r$   r:   matchpartsr>   r@   Zdollar_unitZ	cent_unitr	   r	   r
   _expand_dollars   s"   

$z!EnglishNormalizer._expand_dollarsc                 C      | dddS )zF
        This method is used to remove commas from sentences.
        r   ,r   rA   replacer$   r:   r	   r	   r
   _remove_commas   r9   z EnglishNormalizer._remove_commasc                 C   rH   )zO
        This method is used to expand '.' into spoken word ' point '.
        r   r;   z point rJ   rL   r	   r	   r
   _expand_decimal_point   r9   z'EnglishNormalizer._expand_decimal_pointc                 C   s^   dddd}t |ddd }d|d	 kr |d	 d
kr d}n||d d}| || S )z`
        This method is used to expand ordinals such as '1st', '2nd' into spoken words.
        r   ndrd)r   r<      r   Nr)   r+   r*   th)rD   rA   getr0   )r$   r&   Zordinal_suffixessuffixr	   r	   r
   _expand_ordinal   s   z!EnglishNormalizer._expand_ordinalc                 C   s~   t |d}|dkr:|dk r:|dkrdS |dkr&|dk r&d| |d  S |d dkr5| |d d	 S | |S | |S )
a  
        This method acts as a preprocessing step for numbers between 1000 and 3000 (same as the original repository,
        link :
        https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/utils/tokenizer.py#L86)
        r   r-   i  i  ztwo thousandi  ztwo thousand r+   r.   )rD   rA   r0   )r$   r:   r&   r	   r	   r
   _expand_number   s   

z EnglishNormalizer._expand_numberc                 C   s   t t d| j|}t t dd|}t t d| j|}t t d| j|}t t d| j|}t t d| j|}|S )z
        This method is used to normalize numbers within a text such as converting the numbers to words, removing
        commas, etc.
        z([0-9][0-9\,]+[0-9])u   £([0-9\,]*[0-9]+)z	\1 poundsz\$([0-9\.\,]*[0-9]+)z([0-9]+\.[0-9]+)z[0-9]+(st|nd|rd|th)z[0-9]+)r   subr   rM   rG   rN   rV   rW   r7   r	   r	   r
   normalize_numbers   s   z#EnglishNormalizer.normalize_numbersc                 C   s"   | j D ]\}}t|||}q|S )z/
        Expands the abbreviate words.
        )r    r   rX   )r$   r2   regexreplacementr	   r	   r
   expand_abbreviations   s   z&EnglishNormalizer.expand_abbreviationsc                 C   s   t t dd|S )z.
        Removes multiple whitespaces
        z\s+r/   )r   rX   r   r7   r	   r	   r
   collapse_whitespace   s   z%EnglishNormalizer.collapse_whitespacec                 C   s@   |  |}| }| |}| |}| |}|dd}|S )z
        Converts text to ascii, numbers / number-like quantities to their spelt-out counterparts and expands
        abbreviations
        "r   )r8   lowerrY   r\   r]   rK   r7   r	   r	   r
   __call__   s   



zEnglishNormalizer.__call__N)__name__
__module____qualname__r%   rD   strr0   r8   rG   rM   rN   rV   rW   rY   r\   r]   r`   r	   r	   r	   r
   r      s    );r   )__doc__r   r   r	   r	   r	   r
   <module>   s   