import re


class RegexBuilder:
    r"""Builds regex using arguments passed into a pattern template.

    Builds a regex object for which the pattern is made from an argument
    passed into a template. If more than one argument is passed (iterable),
    each pattern is joined by "|" (regex alternation 'or') to create a
    single pattern.

    Args:
        pattern_args (iterable): String element(s) to be each passed to
            ``pattern_func`` to create a regex pattern. Each element is
            ``re.escape``'d before being passed.
        pattern_func (callable): A 'template' function that should take a
            string and return a string. It should take an element of
            ``pattern_args`` and return a valid regex pattern group string.
        flags: ``re`` flag(s) to compile with the regex.

    Example:
        To create a simple regex that matches on the characters "a", "b",
        or "c", followed by a period::

            >>> rb = RegexBuilder('abc', lambda x: "{}\.".format(x))

        Looking at ``rb.regex`` we get the following compiled regex::

            >>> print(rb.regex)
            'a\.|b\.|c\.'

        The above is fairly simple, but this class can help in writing more
        complex repetitive regex, making them more readable and easier to
        create by using existing data structures.

    Example:
        To match the character following the words "lorem", "ipsum", "meili"
        or "koda"::

            >>> words = ['lorem', 'ipsum', 'meili', 'koda']
            >>> rb = RegexBuilder(words, lambda x: "(?<={}).".format(x))

        Looking at ``rb.regex`` we get the following compiled regex::

            >>> print(rb.regex)
            '(?<=lorem).|(?<=ipsum).|(?<=meili).|(?<=koda).'

    """

    def __init__(self, pattern_args, pattern_func, flags=0):
        self.pattern_args = pattern_args
        self.pattern_func = pattern_func
        self.flags = flags

        # Compile the combined regex once, up front
        self.regex = self._compile()

    def _compile(self):
        # Escape each argument, pass it through the pattern template,
        # then join all resulting patterns with "|" (regex alternation)
        alts = []
        for arg in self.pattern_args:
            arg = re.escape(arg)
            alt = self.pattern_func(arg)
            alts.append(alt)

        pattern = "|".join(alts)
        return re.compile(pattern, self.flags)

    def __repr__(self):  # pragma: no cover
        return str(self.regex)


class PreProcessorRegex:
    r"""Regex-based substitution text pre-processor.

    Runs a series of regex substitutions (``re.sub``) from each ``regex`` of a
    :class:`gtts.tokenizer.core.RegexBuilder` with an extra ``repl``
    replacement parameter.

    Args:
        search_args (iterable): String element(s) to be each passed to
            ``search_func`` to create a regex pattern. Each element is
            ``re.escape``'d before being passed.
        search_func (callable): A 'template' function that should take a
            string and return a string. It should take an element of
            ``search_args`` and return a valid regex search pattern string.
        repl (string): The common replacement passed to the ``sub`` method for
            each ``regex``. Can be a raw string (e.g. in the case of a regex
            backreference).
        flags: ``re`` flag(s) to compile with each `regex`.

    Example:
        Add "!" after the words "lorem" or "ipsum", while ignoring case::

            >>> import re
            >>> words = ['lorem', 'ipsum']
            >>> pp = PreProcessorRegex(words,
            ...                        lambda x: "({})".format(x), r'\\1!',
            ...                        re.IGNORECASE)

        In this case, the regex is a group and the replacement uses its
        backreference ``\\1`` (as a raw string). Looking at ``pp`` we get the
        following list of search/replacement pairs::

            >>> print(pp)
            (re.compile('(lorem)', re.IGNORECASE), repl='\1!'),
            (re.compile('(ipsum)', re.IGNORECASE), repl='\1!')

        It can then be run on any string of text::

            >>> pp.run("LOREM ipSuM")
            "LOREM! ipSuM!"

    See :mod:`gtts.tokenizer.pre_processors` for more examples.

    """

    def __init__(self, search_args, search_func, repl, flags=0):
        self.repl = repl

        # Build one single-pattern regex per search argument
        self.regexes = []
        for arg in search_args:
            rb = RegexBuilder([arg], search_func, flags)
            self.regexes.append(rb.regex)

    def run(self, text):
        """Run each regex substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
            applied.

        """
        for regex in self.regexes:
            text = regex.sub(self.repl, text)
        return text

    def __repr__(self):  # pragma: no cover
        subs_strs = []
        for r in self.regexes:
            subs_strs.append("({}, repl='{}')".format(r, self.repl))
        return ", ".join(subs_strs)


class PreProcessorSub:
    r"""Simple substitution text preprocessor.

    Performs string-for-string substitution from a list of find/replace pairs.
    It abstracts :class:`gtts.tokenizer.core.PreProcessorRegex` with a default
    simple substitution regex.

    Args:
        sub_pairs (list): A list of tuples of the style
            ``(<search str>, <replace str>)``
        ignore_case (bool): Ignore case during search. Defaults to ``True``.

    Example:
        Replace all occurrences of "Mac" with "PC" and "Firefox" with "Chrome"::

            >>> sub_pairs = [('Mac', 'PC'), ('Firefox', 'Chrome')]
            >>> pp = PreProcessorSub(sub_pairs)

        Looking at ``pp``, we get the following list of
        search (regex)/replacement pairs::

            >>> print(pp)
            (re.compile('Mac', re.IGNORECASE), repl='PC'),
            (re.compile('Firefox', re.IGNORECASE), repl='Chrome')

        It can then be run on any string of text::

            >>> pp.run("I use firefox on my mac")
            "I use Chrome on my PC"

    See :mod:`gtts.tokenizer.pre_processors` for more examples.

    """

    def __init__(self, sub_pairs, ignore_case=True):
        # Identity template: each search string is its own pattern
        def search_func(x):
            return "{}".format(x)

        flags = re.I if ignore_case else 0

        # Create one PreProcessorRegex per search/replace pair
        self.pre_processors = []
        for sub_pair in sub_pairs:
            pattern, repl = sub_pair
            pp = PreProcessorRegex([pattern], search_func, repl, flags)
            self.pre_processors.append(pp)

    def run(self, text):
        """Run each substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
            applied.

        """
        for pp in self.pre_processors:
            text = pp.run(text)
        return text

    def __repr__(self):  # pragma: no cover
        return ", ".join([str(pp) for pp in self.pre_processors])


class Tokenizer:
    r"""An extensible but simple generic rule-based tokenizer.

    A generic and simple string tokenizer that takes a list of functions
    (called `tokenizer cases`) returning ``regex`` objects and joins them by
    "|" (regex alternation 'or') to create a single regex to use with the
    standard ``regex.split()`` function.

    ``regex_funcs`` is a list of any function that can return a ``regex``
    (from ``re.compile()``) object, such as a
    :class:`gtts.tokenizer.core.RegexBuilder` instance (and its ``regex``
    attribute).

    See the :mod:`gtts.tokenizer.tokenizer_cases` module for examples.

    Args:
        regex_funcs (list): List of functions, each returning a compiled
            ``regex`` object. Each function's pattern will be joined into a
            single pattern and compiled.
        flags: ``re`` flag(s) to compile with the final regex. Defaults to
            ``re.IGNORECASE``

    Note:
        When the ``regex`` objects obtained from ``regex_funcs`` are joined,
        their individual ``re`` flags are ignored in favour of ``flags``.

    Raises:
        TypeError: When an element of ``regex_funcs`` is not a function, or
            a function that does not return a compiled ``regex`` object.

    Warning:
        Joined ``regex`` patterns can easily interfere with one another in
        unexpected ways. It is recommended that each tokenizer case operate
        on distinct or non-overlapping characters/sets of characters
        (For example, a tokenizer case for the period (".") should also
        handle not matching/cutting on decimals, instead of making that
        a separate tokenizer case).
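
    Example:
        A minimal sketch of such a decimal-aware period case (hypothetical,
        for illustration only; see :mod:`gtts.tokenizer.tokenizer_cases`
        for the library's real cases)::

            >>> import re
            >>> def period_case():
            ...     # Match "." only when not followed by a digit,
            ...     # so decimals such as "3.14" are never split
            ...     return re.compile(r"\.(?!\d)")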

    Example:
        A tokenizer with two simple cases (*Note: these are bad cases to
        tokenize on; this is simply a usage example*)::

            >>> import re
            >>> from gtts.tokenizer.core import RegexBuilder
            >>>
            >>> def case1():
            ...     return re.compile("\,")
            >>>
            >>> def case2():
            ...     return RegexBuilder('abc', lambda x: "{}\.".format(x)).regex
            >>>
            >>> t = Tokenizer([case1, case2])

        Looking at ``case1().pattern``, we get::

            >>> print(case1().pattern)
            '\\,'

        Looking at ``case2().pattern``, we get::

            >>> print(case2().pattern)
            'a\\.|b\\.|c\\.'

        Finally, looking at ``t``, we get them combined::

            >>> print(t)
            're.compile('\\,|a\\.|b\\.|c\\.', re.IGNORECASE)
             from: [<function case1 at 0x10bbcdd08>, <function case2 at 0x10b5c5e18>]'

        It can then be run on any string of text::

            >>> t.run("Hello, my name is Linda a. Call me Lin, b. I'm your friend")
            ['Hello', ' my name is Linda ', ' Call me Lin', ' ', " I'm your friend"]

    """

    def __init__(self, regex_funcs, flags=re.IGNORECASE):
        self.regex_funcs = regex_funcs
        self.flags = flags

        try:
            # Combine the cases into one regex
            self.total_regex = self._combine_regex()
        except (TypeError, AttributeError) as e:
            raise TypeError(
                "Tokenizer() expects a list of functions returning "
                "regular expression objects (i.e. re.compile). " + str(e)
            )

    def _combine_regex(self):
        # Call each case function and join the resulting patterns
        # with "|" (regex alternation)
        alts = []
        for func in self.regex_funcs:
            alts.append(func())

        pattern = "|".join(alt.pattern for alt in alts)
        return re.compile(pattern, self.flags)

    def run(self, text):
        """Tokenize `text`.

        Args:
            text (string): the input text to tokenize.

        Returns:
            list: A list of strings (tokens) split according to the tokenizer cases.

        """
        return self.total_regex.split(text)

    def __repr__(self):  # pragma: no cover
        return str(self.total_regex) + " from: " + str(self.regex_funcs)