o
    Zh&Y                     @   s   d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	 d dl
Z
i dddddd	d
ddddddddddddddddddddddddd Zd/d"efd#d$Zd"efd%d&ZG d'd( d(ZG d)d* d*ZG d+d, d,ZG d-d. d.ZdS )0    N)Fraction)IteratorListMatchOptionalUnionu   œZoeu   ŒZOE   øo   ØO   æZae   ÆZAE   ßssu   ẞZSSu   đdu   ĐD   ð   Ð   þth   Þu   łlu   ŁL sc                    s,    fddd fddtd| D S )z
    Replace any other markers, symbols, and punctuations with a space, and drop any diacritics (category 'Mn' and some
    manual mappings)
    c                    sH   |  v r| S | t v rt |  S t| dkrdS t| d dv r"dS | S )NZMnr   r   MSP )ADDITIONAL_DIACRITICSunicodedatacategory)charkeep ]/var/www/auris/lib/python3.10/site-packages/transformers/models/whisper/english_normalizer.pyreplace_character3   s   z8remove_symbols_and_diacritics.<locals>.replace_characterr   c                 3   s    | ]} |V  qd S Nr#   .0c)r%   r#   r$   	<genexpr>A   s    z0remove_symbols_and_diacritics.<locals>.<genexpr>ZNFKDjoinr   	normalize)r   r"   r#   )r"   r%   r$   remove_symbols_and_diacritics-   s    r.   c                 C   s   d dd td| D S )z[
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    r   c                 s   s*    | ]}t |d  dv rdn|V  qdS )r   r   r   N)r   r   r'   r#   r#   r$   r*   H   s   ( z!remove_symbols.<locals>.<genexpr>NFKCr+   r   r#   r#   r$   remove_symbolsD   s   r1   c                   @   s.   e Zd Zd
dedefddZdefddZd	S )BasicTextNormalizerFremove_diacriticssplit_lettersc                 C   s   |rt nt| _|| _d S r&   )r.   r1   cleanr4   )selfr3   r4   r#   r#   r$   __init__L   s   
zBasicTextNormalizer.__init__r   c                 C   s`   |  }tdd|}tdd|}| |  }| jr'dtd|tj}tdd|}|S )N[<\[][^>\]]*[>\]]r   \(([^)]+?)\)r   z\X\s+)	lowerresubr5   r4   r,   regexfindallUr6   r   r#   r#   r$   __call__P   s   zBasicTextNormalizer.__call__N)FF)__name__
__module____qualname__boolr7   strrB   r#   r#   r#   r$   r2   K   s    r2   c                       sd   e Zd ZdZ fddZdee dee fddZdefd	d
Z	defddZ
defddZ  ZS )EnglishNumberNormalizerav  
    Convert any spelled-out numbers into arabic numbers, while handling:

    - remove any commas
    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    - spell out `one` and `ones`
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    c                    s  t    h d| _dd tg dddD | _dd | j D | _dd	d
dddddd | j D | _i | j| j| _ddddddddd| _	dd | j	 D | _
dd | j	 D | _i | j
| j| _dddddd d!d"d#d$d%d&d'| _d(d | j D | _d)d | j D | _i | j| j| _h | j| j	| j| _d*d*d+d+d,| _d-d-d.d.d/d/d0d0d1| _tt| j t| j  | _d2d3id3d4| _h d5| _d6d7 | j| j| j| j	| j| j| j| j| j| j| jfD | _d8d9h| _d S ):N>   zeror	   Zohc                 S   s   i | ]\}}||qS r#   r#   )r(   inamer#   r#   r$   
<dictcomp>n   s    z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>)onetwothreefourfivesixseveneightninetenZelevenZtwelveZthirteenZfourteenZfifteenZsixteenZ	seventeenZeighteenZnineteen   )startc                 S   s*   i | ]\}}|d krdn|d |dfqS )rR   Zsixesr   r#   r(   rK   valuer#   r#   r$   rL   v   s    )r   r   )rW   st)   nd)   rd)   r   )   r   )ZzerothfirstsecondthirdZfifthZtwelfthc                 S   sD   i | ]\}}|d kr|dkr|dkr|| drdnd |dfqS )r^   r`   ra   thr   )endswithrY   r#   r#   r$   rL      s
          (   2   <   F   P   Z   )ZtwentyZthirtyZfortyZfiftyZsixtyZseventyZeightyZninetyc                 S   "   i | ]\}}| d d|dfqS )yZiesr   replacerY   r#   r#   r$   rL         " c                 S   rp   )rq   Ziethr   rr   rY   r#   r#   r$   rL      rt   d     i@B i ʚ;l    J)l     I5 l     NZol     @=7M.cl      B3v^!< l      P ~cegl       73Me'l       (l
F3YHqS )ZhundredZthousandZmillionZbillionZtrillionZquadrillionZquintillionZ
sextillionZ
septillionZ	octillionZ	nonillionZ	decillionc                 S      i | ]\}}|d  |d fqS r0   r#   rY   r#   r#   r$   rL          c                 S   rw   )r   r#   rY   r#   r#   r$   rL      rx   -+)minusnegativeplusZpositive   £u   €$   ¢)poundZpoundseuroZeurosdollarZdollarscentcentsr   %)Zperpercent>   tripleanddoublepointc                 S   s   h | ]	}|D ]}|qqS r#   r#   )r(   mappingkeyr#   r#   r$   	<setcomp>   s    z3EnglishNumberNormalizer.__init__.<locals>.<setcomp>rM   ones)superr7   zeros	enumerater   itemsZones_pluralZones_ordinalones_suffixedtensZtens_pluralZtens_ordinaltens_suffixedmultipliersZmultipliers_pluralZmultipliers_ordinalmultipliers_suffixeddecimalspreceding_prefixersfollowing_prefixerssetlistvaluesprefixes	suffixersspecialswordsZliteral_wordsr6   	__class__r#   r$   r7   i   s   


"

z EnglishNumberNormalizer.__init__r   returnc                 #   s   d  d d}dt fdd}dtt tf f fdd}t|dkr$d S t|D ] \}}|dkr7||d	  nd }|t|d	 krG||d	  nd }|rNd}q(|d uoWtd
|}	|d | jv }
|
rg|d	d  n|}td
|r||}|d u r{tdd urt	t r
drt t | q(|V  |
r|d n  |jd	kr|jq(|q(|| jvrd ur|V  ||V  q(|| jv rt pdd q(|| jv r6| j| }d u r|q(t	t s|| jv r|| jv r|dk rd d t | q(t t | q(|dk r!d dkr|7 q(t t | q(d dkr-|7 q(t t | q(|| jv r| j| \}}d u rR|t || V  n{t	t s^|| jv r|| jv ry|dk ry|d d t | | V  nT|t t | | V  nF|dk rd dkr|t | | V  n.|t t | | V  n d dkr|t | | V  n|t t | | V  d q(|| jv r| j| }d u r|q(t	t rt t | q(d dkr|7 q(t t | q(|| jv rX| j| \}}d u r#|t || V  q(t	t r7|t t | | V  q(d dkrJ|t | | V  q(|t t | | V  q(|| jv r| j| }d u rk|q(t	t svdkr|}|d ur|| nd }|d ur|jd	kr|jq(|V  |q(d d }d }|||  q(|| jv r| j| \}}d u r|t || V  nSt	t r|}|d ur|| nd }|d ur|jd	kr|t |j| V  n(|V  |t || V  nd d }d }|||  |t | V  d q(|| jv rDd ur/|V  || jv s8|	r>| j|  q(||V  q(|| jv r`d urZ| j|  |V  q(||V  q(|| jv rd ur| j| }t	|tr||v r|t ||  V  d}q(|V  ||V  q(|t | V  q(||V  q(|| jv rC|| jvr|	sÈd ur|V  ||V  q(|dkr|| jvr݈d ur|V  ||V  q(|dks|dkr$|| jv s|| jv r|dkrdnd}| j|d}t p	dt ||  d}q(d ur|V  ||V  q(|dkr<|| jv s2|	r;t p7dd q(td| td| d urV|V  d S d S )NFr   c                 S   s    zt | W S  ty   Y d S w r&   )r   
ValueErrorr0   r#   r#   r$   to_fraction   s
   
z:EnglishNumberNormalizer.process_words.<locals>.to_fractionresultc                    s$   t | }  d ur |  } d d  | S r&   )rG   )r   prefixrZ   r#   r$   output   s   z5EnglishNumberNormalizer.process_words.<locals>.outputr   rW   z^\d+(\.\d+)?$zConverting the fraction failed.r   0
   ru   rv   Tr   r   r   r\   r^   r   zUnexpected token: )rG   r   intlenr   r<   matchr   r   
isinstancerg   denominator	numeratorr   r   r   r   r   r   r   r   r   r   r   dictr   getr   )r6   r   skipr   r   rJ   currentprevnextZnext_is_numericZ
has_prefixZcurrent_without_prefixfr   suffixr   
multiplierpbeforeZresidualZrepeatsr#   r   r$   process_words   sL  	 









 



























z%EnglishNumberNormalizer.process_wordsr   c                 C   s   g }t d|}t|D ]=\}}t| dkrq|t|d kr'|| q|| |jddd }|| jv s>|| jv rD|d q|d qd		|}t 
d
d|}t 
dd|}t 
dd|}|S )Nz\band\s+a\s+half\br   rW   r\   )maxsplitr   z
point fivez
and a halfr   z([a-z])([0-9])z\1 \2z([0-9])([a-z])z([0-9])\s+(st|nd|rd|th|s)\b\1\2)r<   splitr   r   stripappendrsplitr   r   r,   r=   )r6   r   resultssegmentsrJ   segment	last_wordr#   r#   r$   
preprocess  s"   

z"EnglishNumberNormalizer.preprocessc                 C   sJ   dt fdd}dt fdd}td||}td||}tdd	|}|S )
Nmc                 S   sR   z|  d}|  d}t|  d}| | d|dW S  ty(   | j Y S w )NrW   r\   r^   r   02d)groupr   r   string)r   currencyintegerr   r#   r#   r$   combine_cents  s   


z:EnglishNumberNormalizer.postprocess.<locals>.combine_centsc                 S   s0   zdt | d W S  ty   | j Y S w )Nr   rW   )r   r   r   r   )r   r#   r#   r$   extract_cents  s
   
z:EnglishNumberNormalizer.postprocess.<locals>.extract_centsu,   ([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\bu   [€£$]0.([0-9]{1,2})\bz	\b1(s?)\bzone\1)r   r<   r=   )r6   r   r   r   r#   r#   r$   postprocess  s   	z#EnglishNumberNormalizer.postprocessc                 C   s6   |  |}ddd | | D }| |}|S )Nr   c                 s   s    | ]	}|d ur|V  qd S r&   r#   r(   wordr#   r#   r$   r*     s    z3EnglishNumberNormalizer.__call__.<locals>.<genexpr>)r   r,   r   r   r   rA   r#   r#   r$   rB     s   

z EnglishNumberNormalizer.__call__)rC   rD   rE   __doc__r7   r   rG   r   r   r   r   rB   __classcell__r#   r#   r   r$   rH   ^   s    
j `rH   c                   @   s&   e Zd ZdZdd ZdefddZdS )EnglishSpellingNormalizerz~
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    c                 C   s
   || _ d S r&   )r   r6   Zenglish_spelling_mappingr#   r#   r$   r7     s   
z"EnglishSpellingNormalizer.__init__r   c                    s   d  fdd| D S )Nr   c                 3   s    | ]
} j ||V  qd S r&   )r   r   r   r   r#   r$   r*     s    z5EnglishSpellingNormalizer.__call__.<locals>.<genexpr>)r,   r   rA   r#   r   r$   rB     s   z"EnglishSpellingNormalizer.__call__N)rC   rD   rE   r   r7   rG   rB   r#   r#   r#   r$   r     s    r   c                   @   s"   e Zd Zdd ZdefddZdS )EnglishTextNormalizerc                 C   s  d| _ i dddddddd	d
ddddddddddddddddddddddd d!d"i d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdLdQdRdS| _t | _t|| _d S )TNz\b(hmm|mm|mhm|mmm|uh|um)\bz	\bwon't\bzwill notz	\bcan't\bzcan notz	\blet's\bzlet usz	\bain't\bZaintz	\by'all\bzyou allz	\bwanna\bzwant toz	\bgotta\bzgot toz	\bgonna\bzgoing toz\bi'ma\bzi am going toz\bimma\bz
\bwoulda\bz
would havez
\bcoulda\bz
could havez\bshoulda\bzshould havez	\bma'am\bZmadamz\bmr\bzmister z\bmrs\bzmissus z\bst\bzsaint z\bdr\bzdoctor z\bprof\bz
professor z\bcapt\bzcaptain z\bgov\bz	governor z\bald\bz	alderman z\bgen\bzgeneral z\bsen\bzsenator z\brep\bzrepresentative z\bpres\bz
president z\brev\bz	reverend z\bhon\bz
honorable z\basst\bz
assistant z	\bassoc\bz
associate z\blt\bzlieutenant z\bcol\bzcolonel z\bjr\bzjunior z\bsr\bzsenior zesquire z	 had beenz	 has beenz	 had gonez	 has gonez	 had donez has gotz notz arez isz wouldz willz havez am)z\besq\bz	'd been\bz	's been\bz	'd gone\bz	's gone\bz	'd done\bz's got\bzn't\bz're\bz's\bz'd\bz'll\bz't\bz've\bz'm\b)ignore_patterns	replacersrH   standardize_numbersr   standardize_spellingsr   r#   r#   r$   r7     s   	
 !"#$%
7zEnglishTextNormalizer.__init__r   c                 C   s   |  }tdd|}tdd|}t| jd|}tdd|}| j D ]\}}t|||}q&tdd|}tdd	|}t|d
d}| |}| |}tdd	|}tdd|}tdd|}|S )Nr8   r   r9   z\s+''z	(\d),(\d)r   z\.([^0-9]|$)z \1u
   .%$¢€£r!   u   [.$¢€£]([^0-9])z	([^0-9])%z\1 r:   r   )	r;   r<   r=   r   r   r   r.   r   r   )r6   r   patternreplacementr#   r#   r$   rB   ;  s    

zEnglishTextNormalizer.__call__N)rC   rD   rE   r7   rG   rB   r#   r#   r#   r$   r     s    <r   )r   )r<   r   	fractionsr   typingr   r   r   r   r   r>   r   rG   r.   r1   r2   rH   r   r   r#   r#   r#   r$   <module>   s^   	
   