
    fTh$                         S r SSKrSSKrSSKJr  SSKJrJrJr  SSK	J
r
  SSKJrJrJr  SSKJrJr  SS	KJr   " S
 S\SS9r\R*                  " \5      r " S S\5      rS/rg)z
Processor class for Donut.
    N)contextmanager)ListOptionalUnion   )
ImageInput)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)loggingc                       \ rS rSr0 rSrg)DonutProcessorKwargs    N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/donut/processing_donut.pyr   r      s    Ir   r   F)totalc            
          ^  \ rS rSrSrSS/rSrSrSU 4S jjr    SS\	S	\
\\\\   \\4      S
\\   4S jjrS rS r\S 5       rSS jr\S 5       r\S 5       rSrU =r$ )DonutProcessor%   a  
Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
processor.

[`DonutProcessor`] offers all the functionalities of [`DonutImageProcessor`] and
[`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and
[`~DonutProcessor.decode`] for more information.

Args:
    image_processor ([`DonutImageProcessor`], *optional*):
        An instance of [`DonutImageProcessor`]. The image processor is a required input.
    tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*):
        An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
image_processor	tokenizerAutoImageProcessorAutoTokenizerc                   > S nSU;   a,  [         R                  " S[        5        UR                  S5      nUb  UOUnUc  [	        S5      eUc  [	        S5      e[
        TU ]  X5        U R                  U l        SU l	        g )Nfeature_extractorzhThe `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.z)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.F)
warningswarnFutureWarningpop
ValueErrorsuper__init__r   current_processor_in_target_context_manager)selfr   r    kwargsr$   	__class__s        r   r+   DonutProcessor.__init__9   s     &(MM
 !'

+> ?-<-H/N_"HIIABB4!%!5!5*/'r   imagestextr/   c                    U R                   (       a  U R                  " X40 UD6$ Uc  Uc  [        S5      eU R                  " [        4SU R
                  R                  0UD6nUb  U R                  " U40 US   D6nUb.  Ub  US   R                  SS5        U R
                  " U40 US   D6nUc  W$ Uc  W$ WS   WS'   US   US'   U$ )	ax  
When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
[`~AutoImageProcessor.__call__`] and returns its output. If used in the context
[`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's
[`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
zBYou need to specify either an `images` or `text` input to process.tokenizer_init_kwargsimages_kwargstext_kwargsadd_special_tokensF	input_idslabels)	r-   r,   r)   _merge_kwargsr   r    init_kwargsr   
setdefault)	r.   r2   r3   audiovideosr/   output_kwargsinputs	encodingss	            r   __call__DonutProcessor.__call__M   s     **))&A&AA>dlabb** 
"&.."<"<
 
 ))&SM/4RSF!m,778LeTtL}]/KLI<M^(5F8"+K"8F;Mr   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
to the docstring of this method for more information.
)r    batch_decoder.   argsr/   s      r   rF   DonutProcessor.batch_decodew   s    
 ~~**D;F;;r   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
)r    decoderG   s      r   rK   DonutProcessor.decode~   s    
 ~~$$d5f55r   c              #      #    [         R                  " S5        SU l        U R                  U l        Sv   U R
                  U l        SU l        g7f)zq
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR.
z`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your labels by using the argument `text` of the regular `__call__` method (either in the same call as your images inputs, or in a separate call.TNF)r%   r&   r-   r    r,   r   r.   s    r   as_target_processor"DonutProcessor.as_target_processor   sG     
 	9	

 +/'!%!%!5!5*/'s   AAc                    Uc  U R                   R                  5       n0 nU(       Gao  [        R                  " SU[        R                  5      nUc  GODXR                  5       S nSU;  a  GO*USUR                  S5      S-    nU[        S5      [        S5      *  n[        R                  " U5      n[        R                  " SU S3U[        R                  5      n	U	c  UR                  US5      nGOU	R                  5       n	[        R                  " U5      n
[        R                  " U	5      n[        R                  " U
 SU 3U[        R                  [        R                  -  5      nUb  UR                  S5      R                  5       nSU;   a7  SU;   a1  U R                  USUS	9nU(       a  [        U5      S:X  a  US
   nXU'   Ou/ XG'   UR                  S5       HB  nUR                  5       nX;   a  US
   S:X  a  USS S:X  a  USS nXG   R                  U5        MD     [        XG   5      S:X  a	  XG   S
   XG'   XR!                  U	5      [        U	5      -   S R                  5       nUSS S:X  a  U/U R                  USS SUS	9-   $ U(       a  GMo  [        U5      (       a  U(       a  U/$ U$ U(       a  / $ SU0$ )zC
Convert a (generated) token sequence into an ordered JSON format.
Nz<s_>   z</s_ z(.*?)T)is_inner_valueadded_vocabr   z<sep/><z/>   text_sequence)r    get_added_vocabresearch
IGNORECASEstartindexlenescapereplacegroupDOTALLstrip
token2jsonsplitappendfind)r.   tokensrU   rV   outputpotential_startstart_tokenkeykey_escaped	end_tokenstart_token_escapedend_token_escapedcontentvalueleafs                  r   rg   DonutProcessor.token2json   s    ..88:K iiFO& !6!6!8!:;K+%%&B(9(9#(>(BCKc%jCH95C))C.K		T+a"8&"--PI R8%OO-	&(ii&<#$&IIi$8!))*+51B0CDfbmm^`^g^gNg &%mmA.446G(W-? $Ze f "5zQ(-a*/3K&($+MM)$<D#'::<D#2tAw#~$rs)W[J['+Abz"K..t4	 %=
 v{+q0*0+a.FKI 6Y G IJPPR"1:*"8doofQRjQUcno&oooU fX v;;-F8969'2Fov-FFr   c                 P    [         R                  " S[        5        U R                  $ )Nzg`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.)r%   r&   r'   image_processor_classrN   s    r   feature_extractor_class&DonutProcessor.feature_extractor_class   s"    u	
 )))r   c                 P    [         R                  " S[        5        U R                  $ )Nz[`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.)r%   r&   r'   r   rN   s    r   r$    DonutProcessor.feature_extractor   s"    i	
 ###r   )r-   r,   )NN)NNNN)FN)r   r   r   r   __doc__
attributesry   tokenizer_classr+   r   r   r   strr   r   r   r   r   rC   rF   rK   r   rO   rg   propertyrz   r$   r   __classcell__)r0   s   @r   r   r   %   s     $[1J0%O0, "NR(( uS$s)Y8IIJK( -.(T<6 0 08Gt * * $ $r   r   )r~   r\   r%   
contextlibr   typingr   r   r   image_utilsr   processing_utilsr	   r
   r   tokenization_utils_baser   r   utilsr   r   
get_loggerr   loggerr   __all__r   r   r   <module>r      sd    
  % ( ( % H H C +5  
		H	%x$^ x$v 
r   