
import re


class RegexBuilder:
    r"""Builds regex using arguments passed into a pattern template.

    Builds a regex object for which the pattern is made from an argument
    passed into a template. If more than one argument is passed (iterable),
    each pattern is joined by "|" (regex alternation 'or') to create a
    single pattern.

    Args:
        pattern_args (iterable): String element(s) to be each passed to
            ``pattern_func`` to create a regex pattern. Each element is
            ``re.escape``'d before being passed.
        pattern_func (callable): A 'template' function that should take a
            string and return a string. It should take an element of
            ``pattern_args`` and return a valid regex pattern group string.
        flags: ``re`` flag(s) to compile with the regex.

    Example:
        To create a simple regex that matches on the characters "a", "b",
        or "c", followed by a period::

            >>> rb = RegexBuilder('abc', lambda x: r"{}\.".format(x))

        Looking at ``rb.regex`` we get the following compiled regex::

            >>> print(rb.regex)
            'a\.|b\.|c\.'

        The above is fairly simple, but this class can help in writing more
        complex repetitive regexes, making them more readable and easier to
        create by using existing data structures (see the aside following
        this class).

    Example:
        To match the character following the words "lorem", "ipsum", "meili"
        or "koda"::

            >>> words = ['lorem', 'ipsum', 'meili', 'koda']
            >>> rb = RegexBuilder(words, lambda x: "(?<={}).".format(x))

        Looking at ``rb.regex`` we get the following compiled regex::

            >>> print(rb.regex)
            '(?<=lorem).|(?<=ipsum).|(?<=meili).|(?<=koda).'

    """

    def __init__(self, pattern_args, pattern_func, flags=0):
        self.pattern_args = pattern_args
        self.pattern_func = pattern_func
        self.flags = flags

        # Compile the joined pattern once, up front
        self.regex = self._compile()

    def _compile(self):
        # Escape each argument, run it through the pattern template, then
        # join the results with "|" into a single alternation pattern
        alts = []
        for arg in self.pattern_args:
            arg = re.escape(arg)
            alt = self.pattern_func(arg)
            alts.append(alt)

        pattern = "|".join(alts)
        return re.compile(pattern, self.flags)

    def __repr__(self):
        return str(self.regex)
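
# An editorial aside (illustrative, not part of the gTTS API): the "existing
# data structures" point above in practice. Reusing a tuple of known
# abbreviations builds one alternation regex instead of eight hand-written
# lookbehinds:
#
#     ABBREVIATIONS = ("dr", "jr", "mr", "mrs", "ms", "msgr", "prof", "sr")
#     abbrev_period = RegexBuilder(
#         ABBREVIATIONS, lambda x: r"(?<={})\.".format(x), re.IGNORECASE
#     ).regex
#
# ``abbrev_period`` then matches a period only when it directly follows one
# of the known abbreviations.
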

class PreProcessorRegex:
    r"""Regex-based substitution text pre-processor.

    Runs a series of regex substitutions (``re.sub``) from each ``regex`` of a
    :class:`gtts.tokenizer.core.RegexBuilder` with an extra ``repl``
    replacement parameter.

    Args:
        search_args (iterable): String element(s) to be each passed to
            ``search_func`` to create a regex pattern. Each element is
            ``re.escape``'d before being passed.
        search_func (callable): A 'template' function that should take a
            string and return a string. It should take an element of
            ``search_args`` and return a valid regex search pattern string.
        repl (string): The common replacement passed to the ``sub`` method for
            each ``regex``. Can be a raw string (the case of a regex
            backreference, for example).
        flags: ``re`` flag(s) to compile with each ``regex``.

    Example:
        Add "!" after the words "lorem" or "ipsum", while ignoring case::

            >>> import re
            >>> words = ['lorem', 'ipsum']
            >>> pp = PreProcessorRegex(words,
            ...                        lambda x: "({})".format(x), r'\\1!',
            ...                        re.IGNORECASE)

        In this case, the regex is a group and the replacement uses its
        backreference ``\\1`` (as a raw string). Looking at ``pp`` we get the
        following list of search/replacement pairs::

            >>> print(pp)
            (re.compile('(lorem)', re.IGNORECASE), repl='\1!'),
            (re.compile('(ipsum)', re.IGNORECASE), repl='\1!')

        It can then be run on any string of text::

            >>> pp.run("LOREM ipSuM")
            "LOREM! ipSuM!"

    See :mod:`gtts.tokenizer.pre_processors` for more examples, and the
    aside following this class for a replacement that rewrites its match.

    """

    def __init__(self, search_args, search_func, repl, flags=0):
        self.repl = repl

        # Build one compiled regex per search argument
        self.regexes = []
        for arg in search_args:
            rb = RegexBuilder([arg], search_func, flags)
            self.regexes.append(rb.regex)

    def run(self, text):
        """Run each regex substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
            applied.

        """
        for regex in self.regexes:
            text = regex.sub(self.repl, text)
        return text

    def __repr__(self):
        subs_strs = []
        for r in self.regexes:
            subs_strs.append("({}, repl='{}')".format(r, self.repl))
        return ", ".join(subs_strs)
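
# An editorial aside (illustrative, not part of the gTTS API): a raw-string
# backreference in ``repl`` can also rewrite the match rather than append to
# it -- here, doubling a word for emphasis:
#
#     emphasize = PreProcessorRegex(
#         ["very"], lambda x: "({})".format(x), r"\1 \1", re.IGNORECASE
#     )
#     emphasize.run("It is very nice.")  # -> "It is very very nice."
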

class PreProcessorSub:
    r"""Simple substitution text pre-processor.

    Performs string-for-string substitution from a list of find/replace pairs.
    It abstracts :class:`gtts.tokenizer.core.PreProcessorRegex` with a default
    simple substitution regex.

    Args:
        sub_pairs (list): A list of tuples of the style
            ``(<search str>, <replace str>)``
        ignore_case (bool): Ignore case during search. Defaults to ``True``.

    Example:
        Replace all occurrences of "Mac" with "PC" and "Firefox" with
        "Chrome"::

            >>> sub_pairs = [('Mac', 'PC'), ('Firefox', 'Chrome')]
            >>> pp = PreProcessorSub(sub_pairs)

        Looking at ``pp``, we get the following list of
        search (regex)/replacement pairs::

            >>> print(pp)
            (re.compile('Mac', re.IGNORECASE), repl='PC'),
            (re.compile('Firefox', re.IGNORECASE), repl='Chrome')

        It can then be run on any string of text::

            >>> pp.run("I use firefox on my mac")
            "I use Chrome on my PC"

    See :mod:`gtts.tokenizer.pre_processors` for more examples, and the
    aside following this class for a note on substitution order.

    """

    def __init__(self, sub_pairs, ignore_case=True):
        def search_func(x):
            return "{}".format(x)

        flags = re.I if ignore_case else 0

        # Build a PreProcessorRegex for each (search, replace) pair
        self.pre_processors = []
        for sub_pair in sub_pairs:
            pattern, repl = sub_pair
            pp = PreProcessorRegex([pattern], search_func, repl, flags)
            self.pre_processors.append(pp)

    def run(self, text):
        """Run each substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
            applied.

        """
        for pp in self.pre_processors:
            text = pp.run(text)
        return text

    def __repr__(self):
        return ", ".join([str(pp) for pp in self.pre_processors])

class Tokenizer:
    r"""An extensible but simple generic rule-based tokenizer.

    A generic and simple string tokenizer that takes a list of functions
    (called `tokenizer cases`) returning ``regex`` objects and joins them by
    "|" (regex alternation 'or') to create a single regex to use with the
    standard ``regex.split()`` function.

    ``regex_funcs`` is a list of any function that can return a ``regex``
    (from ``re.compile()``) object, such as a
    :class:`gtts.tokenizer.core.RegexBuilder` instance (and its ``regex``
    attribute).

    See the :mod:`gtts.tokenizer.tokenizer_cases` module for examples.

    Args:
        regex_funcs (list): List of functions that return compiled ``regex``
            objects. Each function's pattern will be joined into a single
            pattern and compiled.
        flags: ``re`` flag(s) to compile with the final regex. Defaults to
            ``re.IGNORECASE``.

    Note:
        When the ``regex`` objects obtained from ``regex_funcs`` are joined,
        their individual ``re`` flags are ignored in favour of ``flags``.

    Raises:
        TypeError: When an element of ``regex_funcs`` is not a function, or
            a function that does not return a compiled ``regex`` object.

    Warning:
        Joined ``regex`` patterns can easily interfere with one another in
        unexpected ways. It is recommended that each tokenizer case operate
        on distinct or non-overlapping characters/sets of characters (for
        example, a tokenizer case for the period (".") should also handle
        not matching/cutting on decimals, instead of making that a separate
        tokenizer case). See the usage sketch at the end of this module for
        a decimal-aware period case.

    Example:
        A tokenizer with two simple cases (*Note: these are bad cases to
        tokenize on, this is simply a usage example*)::

            >>> import re
            >>> from gtts.tokenizer.core import RegexBuilder, Tokenizer
            >>>
            >>> def case1():
            ...     return re.compile(r"\,")
            >>>
            >>> def case2():
            ...     return RegexBuilder('abc', lambda x: r"{}\.".format(x)).regex
            >>>
            >>> t = Tokenizer([case1, case2])

        Looking at ``case1().pattern``, we get::

            >>> print(case1().pattern)
            '\\,'

        Looking at ``case2().pattern``, we get::

            >>> print(case2().pattern)
            'a\\.|b\\.|c\\.'

        Finally, looking at ``t``, we get them combined::

            >>> print(t)
            're.compile('\\,|a\\.|b\\.|c\\.', re.IGNORECASE)
             from: [<function case1 at 0x10bbcdd08>, <function case2 at 0x10b5c5e18>]'

        It can then be run on any string of text::

            >>> t.run("Hello, my name is Linda a. Call me Lin, b. I'm your friend")
            ['Hello', ' my name is Linda ', ' Call me Lin', ' ', " I'm your friend"]

    """

    def __init__(self, regex_funcs, flags=re.IGNORECASE):
        self.regex_funcs = regex_funcs
        self.flags = flags

        try:
            # Combine the pattern of every tokenizer case into one regex
            self.total_regex = self._combine_regex()
        except (TypeError, AttributeError) as e:
            raise TypeError(
                "Tokenizer() expects a list of functions returning "
                "regular expression objects (i.e. re.compile). " + str(e)
            )

    def _combine_regex(self):
        alts = []
        for func in self.regex_funcs:
            alts.append(func())

        pattern = "|".join(alt.pattern for alt in alts)
        return re.compile(pattern, self.flags)

    def run(self, text):
        """Tokenize `text`.

        Args:
            text (string): the input text to tokenize.

        Returns:
            list: A list of strings (tokens) split according to the
            tokenizer cases.

        """
        return self.total_regex.split(text)

    def __repr__(self):
        return str(self.total_regex) + " from: " + str(self.regex_funcs)