
    fTh7Q                       S r SSKrSSKJr  SSKJrJrJr  SSKrSSK	J
r
  SSKJr  SSKJr  SSKJrJrJr  SS	KJrJr  SS
KJr  SSKJr  \R4                  " \5      r " S S\
R:                  5      r " S S\
R:                  5      r " S S\
R:                  5      r  " S S\
R:                  5      r! " S S\
R:                  5      r" " S S\
R:                  5      r# " S S\
R:                  5      r$ " S S\
R:                  5      r% " S S\
R:                  5      r& " S S \
R:                  5      r' " S! S"\
R:                  5      r( " S# S$\
R:                  5      r) " S% S&\
R:                  5      r*\ " S' S(\5      5       r+ " S) S*\
R:                  5      r,   SeS+\RZ                  S,\.S-\\/   S.\0S/\14
S0 jjr2  SfS+\RZ                  S1\\/\14   S-\\/   S/\14S2 jjr3 " S3 S4\
R:                  5      r4 " S5 S6\
R:                  5      r5 " S7 S8\
R:                  5      r6 " S9 S:\
R:                  5      r7 " S; S<\
R:                  5      r8\ " S= S>\5      5       r9 " S? S@\+5      r:\ " SA SB\5      5       r;\" SCSD9 " SE SF\+5      5       r<\ " SG SH\5      5       r= " SI SJ\+5      r>\ " SK SL\5      5       r?\ " SM SN\5      5       r@\ " SO SP\5      5       rASQ\R                  R                  SR\RZ                  SS\RZ                  4ST jrDSgSU\RZ                  SV\\RZ                     SS\RZ                  4SW jjrE " SX SY\+5      rF\ " SZ S[\5      5       rG " S\ S]\+5      rH\ " S^ S_\5      5       rI " S` Sa\
R:                  5      rJ " Sb Sc\+5      rK/ SdQrLg)hzPyTorch PatchTSMixer model.    N)	dataclass)OptionalTupleUnion)PreTrainedModel)ModelOutput   )NegativeBinomialOutputNormalOutputStudentTOutput)auto_docstringlogging)deprecate_kwarg   )PatchTSMixerConfigc                   >   ^  \ rS rSrSrS\S\4U 4S jjrS rSrU =r	$ )PatchTSMixerGatedAttention$   z
Module that applies gated attention to input data.

Args:
    in_size (`int`): The input size.
    out_size (`int`): The output size.
in_sizeout_sizec                    > [         TU ]  5         [        R                  " X5      U l        [        R
                  " SS9U l        g )Ndim)super__init__nnLinear
attn_layerSoftmaxattn_softmax)selfr   r   	__class__s      n/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/patchtsmixer/modeling_patchtsmixer.pyr   #PatchTSMixerGatedAttention.__init__-   s/    ))G6JJ2.    c                 N    U R                  U R                  U5      5      nX-  nU$ N)r!   r   )r"   inputsattn_weights      r$   forward"PatchTSMixerGatedAttention.forward2   s(    ''(?@%r&   )r   r!   )
__name__
__module____qualname____firstlineno____doc__intr   r+   __static_attributes____classcell__r#   s   @r$   r   r   $   s%    / /s /
 r&   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerBatchNorm9   zH
Compute batch normalization over the sequence length (time) dimension.
configc                 ~   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        g )Neps)r   r   r   BatchNorm1dd_modelnorm_eps	batchnormr"   r9   r#   s     r$   r   PatchTSMixerBatchNorm.__init__>   s(    FOOLr&   r)   c                 l    UR                  SS5      nU R                  U5      nUR                  SS5      $ )z
Parameters:
    inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
        input for Batch norm calculation
Returns:
    `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
r      )	transposer@   )r"   r)   outputs      r$   r+   PatchTSMixerBatchNorm.forwardB   s7     !!!Q''1%%r&   )r@   r-   r.   r/   r0   r1   r   r   torchTensorr+   r3   r4   r5   s   @r$   r7   r7   9   s,    M1 M
&ell 
& 
&r&   r7   c                      ^  \ rS rSrSrS\4U 4S jjr\S\S\R                  4S j5       r
S\R                  4S jrS	rU =r$ )
PatchTSMixerPositionalEncodingO   z
Class for positional encoding
r9   c                    > [         TU ]  5         UR                  (       a  U R                  U5      U l        g [
        R                  " [        R                  " UR                  UR                  5      5      U l        g r(   )r   r   use_positional_encoding_init_peposition_encr   	ParameterrI   zerosnum_patchesr>   rA   s     r$   r   'PatchTSMixerPositionalEncoding.__init__T   sN    )) $f 5D "U[[9K9KV^^-\ ]Dr&   returnc                 d   U R                   S:X  a@  [        R                  " [        R                  " U R
                  U R                  5      SS9nU$ U R                   S:X  Ga8  [        R                  " U R
                  U R                  5      n[        R                  " SU R
                  5      R                  S5      n[        R                  " [        R                  " SU R                  S5      [        R                  " S5      U R                  -  * -  5      n[        R                  " X#-  5      US S 2SS S24'   [        R                  " X#-  5      US S 2SS S24'   XR                  5       -
  nXR!                  5       S	-  -  n[        R                  " US
S9nU$ [#        U R                    S35      e)NrandomTrequires_gradsincosr   r   rD   g     @
   FzN is not a valid positional encoder. Available types are 'random' and 'sincos'.)positional_encoding_typer   rR   rI   randnrT   r>   rS   arange	unsqueezeexpmathlogsincosmeanstd
ValueError)r9   rQ   positiondiv_terms       r$   rP   'PatchTSMixerPositionalEncoding._init_pe\   sn    **h6<<F4F4F(WgklL  ,,8 ;;v'9'96>>JL||Av'9'9:DDQGHyya!CQXHY\b\j\jHjFk!klH$)IIh.A$BLADqD!$)IIh.A$BLADqD!'*;*;*==L'+;+;+=+BCL<<EJL
  223  4B  C r&   patch_inputc                 "    XR                   -   nU$ r(   rQ   )r"   rl   hidden_states      r$   r+   &PatchTSMixerPositionalEncoding.forwardp   s    "%6%66r&   rn   )r-   r.   r/   r0   r1   r   r   staticmethodr   rR   rP   rI   rJ   r+   r3   r4   r5   s   @r$   rL   rL   O   sS    ^1 ^ +   &5<<  r&   rL   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerNormLayerv   zUNormalization block

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r9   c                    > [         TU ]  5         UR                  U l        SUR                  R                  5       ;   a  [	        U5      U l        g [        R                  " UR                  UR                  S9U l        g )Nbatchr;   )
r   r   norm_mlplowerr7   normr   	LayerNormr>   r?   rA   s     r$   r   PatchTSMixerNormLayer.__init__~   sT    foo++---f5DIV^^IDIr&   r)   c                 l   SU R                   R                  5       ;   a  [        R                  " UUR                  S   UR                  S   -  UR                  S   UR                  S   45      nU R                  U5      n[        R                  " X!R                  5      nU$ U R                  U5      nU$ )z
Args:
    inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
        Input to the normalization layer.
Returns:
    `torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`
rv   r   r   rD   r	   )rw   rx   rI   reshapeshapery   )r"   r)   inputs_reshapeds      r$   r+   PatchTSMixerNormLayer.forward   s     dmm))++#mmLLOfll1o5LLOLLOO #ii8O ]]?LLAF
  YYv&Fr&   )ry   rw   rH   r5   s   @r$   rs   rs   v   s,    J1 Jell  r&   rs   c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )PatchTSMixerMLP   c                 >  > [         TU ]  5         XR                  -  n[        R                  " X5      U l        [        R                  " UR                  5      U l        [        R                  " XB5      U l	        [        R                  " UR                  5      U l
        g r(   )r   r   expansion_factorr   r   fc1Dropoutdropoutdropout1fc2dropout2)r"   in_featuresout_featuresr9   
num_hiddenr#   s        r$   r   PatchTSMixerMLP.__init__   sd     #:#::
99[5

6>>299Z6

6>>2r&   r)   c                     U R                  [        R                  R                  U R	                  U5      5      5      nU R                  U5      nU R                  U5      nU$ )z
Args:
    inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
        Input to the MLP layer.
Returns:
    `torch.Tensor` of the same shape as `inputs`
)r   r   
functionalgelur   r   r   )r"   r)   s     r$   r+   PatchTSMixerMLP.forward   sK     r}}11$((62BCD&!v&r&   )r   r   r   r   )
r-   r.   r/   r0   r   rI   rJ   r+   r3   r4   r5   s   @r$   r   r      s    3ell  r&   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )$PatchTSMixerChannelFeatureMixerBlock   zzThis module mixes the features in the channel dimension.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r9   c                   > [         TU ]  5         [        U5      U l        UR                  U l        [        UR                  UR                  US9U l        UR                  (       a$  [        UR                  UR                  S9U l	        g g Nr   r   r9   r   r   )
r   r   rs   ry   
gated_attnr   num_input_channelsmlpr   gating_blockrA   s     r$   r   -PatchTSMixerChannelFeatureMixerBlock.__init__   sv    )&1	 ++"1122
  :11F<U<U!D r&   r)   c                     UnU R                  U5      nUR                  SSSS5      nU R                  (       a  U R                  U5      nU R	                  U5      nUR                  SSSS5      nX-   nU$ )z
Args:
    inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
        input to the MLP layer
Returns:
    `torch.Tensor` of the same shape as `inputs`
r   r	   rD   r   )ry   permuter   r   r   )r"   r)   residualouts       r$   r+   ,PatchTSMixerChannelFeatureMixerBlock.forward   sq     6"1a+??&&v.F&!1a+
r&   r   r   r   ry   rH   r5   s   @r$   r   r      s*    1  ell  r&   r   c                     ^  \ rS rSrSr      SS\S\S\S\S\S\S	\\	   S
\\   4U 4S jjjr
\" SSS9\" SSS9\" SSS9      SS\R                  S\\R                     S\\\R                        S\\R                     S\\R                     S\S\\R                     S\\R                  \\R                     \\\R                        4   4S jj5       5       5       rSrU =r$ )PatchTSMixerAttention   z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr   
is_decoderbias	is_causalr9   	layer_idxc	                 t  > [         T	U ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        Xl        Uc>  U R                  (       a-  [        R                  SU R                  R                   S35        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).g      zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.)r   )r   r   r   r   r   head_dimr9   rh   scalingr   r   r   loggerwarning_oncer#   r-   r   r   k_projv_projq_projout_proj)
r"   r   r   r   r   r   r   r9   r   r#   s
            r$   r   PatchTSMixerAttention.__init__   s    	""!.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	4@ii	4@ii	4@		)TBr&   key_value_statesz4.55)versionpast_key_valuecache_positionhidden_statesattention_masklayer_head_maskoutput_attentionsrV   c                 n   UR                  5       u  pn
U R                  U5      R                  USU R                  U R                  5      R                  SS5      nXR                  -  nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nXR                  -  SU R                  4nUR                  " U6 nUR                  " U6 nUR                  " U6 nUR                  S5      n[        R                  " XR                  SS5      5      nUR                  5       XR                  -  X4:w  a-  [        SXR                  -  X4 SUR                  5        35      eUb[  USS2SS2SS2SUR                  S   24   nUR                  XR                  X5      U-   nUR                  XR                  -  X5      n[        R                  R!                  USS9nUb  UR                  5       U R                  4:w  a*  [        S	U R                  4 SUR                  5        35      eUR                  SSSS5      UR                  XR                  X5      -  nUR                  XR                  -  X5      nU(       a;  UR                  XR                  X5      nUR                  XR                  -  X5      nOSn[        R                  R#                  UU R"                  U R$                  S
9n[        R                  " UU5      nUR                  5       XR                  -  XR                  4:w  a7  [        SXR                  -  XR                  4 SUR                  5        35      eUR                  XR                  XR                  5      nUR                  SS5      nUR                  XU R&                  5      nU R)                  U5      nUUS4$ )z#Input shape: Batch x Time x Channelr   r   rD   z$Attention weights should be of size z	, but is Nr   z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )sizer   viewr   r   rE   r   r   r   r}   rI   bmmrh   r~   r   r   softmaxr   r   r   r   )r"   r   r   r   r   r   r   r   bsztgt_len_query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                       r$   r+   PatchTSMixerAttention.forward  s    (,,.a {{=166sBPTP]P]^hhijlmn#ll2[[/44S"dnndmm\ffghjkl
{{=166sBPTP]P]^hhijlmnNN*B>
#++Z8''4
#++Z8//!$yy/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(* 
 %+Aq!5Kz7G7G7K5K,KLN',,S..'SVddL',,S>>-A7TL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfm?wwL',,S>>-A7TL
 %1$5$5c>>7$\!055cNN6JG]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2C..4H'S`S`3a2b c$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK01477r&   )r9   r   r   r   r   r   r   r   r   r   r   r   r   )        FTFNN)NNNNFN)r-   r.   r/   r0   r1   r2   floatboolr   r   r   r   rI   rJ   r   r+   r3   r4   r5   s   @r$   r   r      s   G  /3#'%C%C %C 	%C
 %C %C %C +,%C C=%C %CP '8%v6%v6 488<1526"'15P8||P8 #5<<0P8 !u||!45	P8
 !.P8 "%,,/P8  P8 !.P8 
u||Xell3XeELL>Q5RR	SP8 7 7 9P8r&   r   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )PatchMixerBlockip  zhThis module mixes the patch dimension.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r9   c                   > [         TU ]  5         [        U5      U l        UR                  U l        UR
                  U l        [        UR                  UR                  US9U l        UR
                  (       a#  [        UR                  UR                  S9U l
        UR                  (       a?  [        UR                  UR                  UR                  S9U l        [        U5      U l        g g )Nr   r   )r   r   r   )r   r   rs   ry   	self_attnr   r   rT   r   r   r   r   r>   self_attn_headsr   self_attn_layer	norm_attnrA   s     r$   r   PatchMixerBlock.__init__x  s    )&1	)) ++"**++
  :6CUCU`f`r`r sD#8 .. 00$D 
 36:DN r&   c                    UnU R                  U5      nU R                  (       aI  UR                  u  p4pVUR                  X4-  XV5      nU R	                  USS9u  n  n	UR                  X4XV5      nUR                  SS5      nU R                  U5      nU R                  (       a  U R                  U5      nUR                  SS5      nU R                  (       a  U R                  UW-   5      nX-   n
U
$ )zj
Args:
    hidden_state (`torch.Tensor`): Input tensor.

Returns:
    `torch.Tensor`: Transformed tensor.
F)r   rD   r	   )
ry   r   r~   r}   r   rE   r   r   r   r   )r"   ro   r   
batch_sizen_varsrT   r>   hidden_state_reshapedx_attnr   r   s              r$   r+   PatchMixerBlock.forward  s      yy.>>7C7I7I4J$0$8$89Lk$c!//0EY^/_LFAq^^JMF $--a3xx-??,,\:L $--a3>>>>,*?@L%
r&   )r   r   r   ry   r   r   r   
r-   r.   r/   r0   r1   r   r   r+   r3   r4   r5   s   @r$   r   r   p  s    ;1 ;2! !r&   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )FeatureMixerBlocki  zrThis module mixes the hidden feature dimension.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

r9   c                   > [         TU ]  5         [        U5      U l        UR                  U l        [        UR                  UR                  US9U l        UR                  (       a$  [        UR                  UR                  S9U l	        g g r   )
r   r   rs   ry   r   r   r>   r   r   r   rA   s     r$   r   FeatureMixerBlock.__init__  sn    )&1	 ++"
  :6>>\b\j\j kD r&   hiddenc                     UnU R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX-   nU$ )
Args:
    hidden (`torch.Tensor` of shape `(batch_size, num_patches, d_model)`):
        Input tensor to the layer.

Returns:
    `torch.Tensor`: Transformed tensor.
)ry   r   r   r   )r"   r   r   r   s       r$   r+   FeatureMixerBlock.forward  sI     6"&!??&&v.F
r&   r   rH   r5   s   @r$   r   r     s,    l1 l ell  r&   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerLayeri  z
The `PatchTSMixer` layer that does all three kinds of mixing.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

r9   c                    > [         TU ]  5         [        US9U l        [	        US9U l        UR                  U l        UR                  S:X  a  [        US9U l        g g )Nr9   mix_channel)	r   r   r   patch_mixerr   feature_mixermoder   channel_feature_mixerrA   s     r$   r   PatchTSMixerLayer.__init__  sR    *&9.f=KK	;;-')MU[)\D& (r&   r   c                     U R                   S:X  a  U R                  U5      nU R                  U5      nU R                  U5      nU$ )r   r   )r   r   r   r   )r"   r   s     r$   r+   PatchTSMixerLayer.forward  sE     99%//7F!!&)##F+r&   )r   r   r   r   rH   r5   s   @r$   r   r     s,    	]1 	]ell  r&   r   c                   F   ^  \ rS rSrSrS\4U 4S jjrSS\4S jjrSr	U =r
$ )	PatchTSMixerBlocki  z{The main computing framework of the `PatchTSMixer` model.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r9   c           	         > [         TU ]  5         UR                  n[        R                  " [        U5       Vs/ s H  n[        US9PM     sn5      U l        g s  snf Nr   )r   r   
num_layersr   
ModuleListranger   mixers)r"   r9   r  r   r#   s       r$   r   PatchTSMixerBlock.__init__  sI    &&
mmuU_O`$aO`!%6f%EO`$ab$as   Aoutput_hidden_statesc                     / nUnU R                    H%  nU" U5      nU(       d  M  UR                  U5        M'     U(       a  XC4$ US4$ )a3  
Args:
    hidden_state (`torch.Tensor`): The input tensor.
    output_hidden_states (`bool`, *optional*, defaults to False.):
        Whether to output the hidden states as well.

Returns:
    `torch.Tensor`: The embedding. `list`: List of all hidden states if `output_hidden_states` is set to
    `True`.
N)r  append)r"   ro   r  all_hidden_states	embeddingmods         r$   r+   PatchTSMixerBlock.forward  sR      	;;CII##!((3 
  //d?"r&   )r  F)r-   r.   r/   r0   r1   r   r   r   r+   r3   r4   r5   s   @r$   r   r     s(    c1 c#$ # #r&   r   c                   >   ^  \ rS rSrSrSS\4U 4S jjjrS rSrU =r	$ )PatchTSMixerForPredictionHeadi1  zaPrediction Head for Forecasting

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r9   c                   > [         TU ]  5         UR                  U l        U R                  b  U R                  R                  5         [        R
                  " UR                  5      U l        Uc>  [        R                  " UR                  UR                  -  UR                  5      U l        O-UR                  UR                  UR                  -  5      U l        [        R                  " SS9U l        g )Nr   	start_dim)r   r   prediction_channel_indicessortr   r   head_dropoutdropout_layerr   rT   r>   prediction_lengthbase_forecast_blockget_parameter_projectionFlattenflatten)r"   r9   distribution_outputr#   s      r$   r   &PatchTSMixerForPredictionHead.__init__9  s    *0*K*K'**6++002ZZ(;(;<&')yy&2D2Dv~~2UX^XpXp'qD$':'S'S""V^^3(D$ zzB/r&   c                 v  ^  T R                  U5      nT R                  U5      nT R                  U5      n[        U[        5      (       a  [	        S U 5       5      nOUR                  SS5      nT R                  b=  [        U[        5      (       a  [	        U 4S jU 5       5      nU$ UST R                  4   nU$ )a:  

Args:
    hidden_features (`torch.Tensor` of shape `(batch_size, num_patch, d_model)` in `flatten` mode
        or `(batch_size, n_vars, num_patch, d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
        features.

Returns:
    `torch.Tensor` of shape `(batch_size, prediction_length, nvars)`.

c              3   D   #    U  H  oR                  S S5      v   M     g7f)r   r   N)rE   ).0zs     r$   	<genexpr>8PatchTSMixerForPredictionHead.forward.<locals>.<genexpr>\  s     C(Q[[R00(s    r   r   c              3   D   >#    U  H  oS TR                   4   v   M     g7f).N)r  )r   r!  r"   s     r$   r"  r#  b  s!      [RZQ3(G(G#G!HRZs    .)r  r  r  
isinstancetuplerE   r  r"   hidden_featuresforecasts   `  r$   r+   %PatchTSMixerForPredictionHead.forwardK  s     ,,7,,_=++O<h&&C(CCH))"b1H**6(E**  [RZ [[  $C)H)H$HIr&   )r  r  r  r  r(   r   r5   s   @r$   r  r  1  s$    01 0 0$ r&   r  c                   >   ^  \ rS rSrSrSS\4U 4S jjjrS rSrU =r	$ )PatchTSMixerLinearHeadii  zpLinear head for Classification and Regression.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r9   c                 \  > [         TU ]  5         UR                  U l        UR                  U l        UR                  c  UR                  nOSnX l        UcA  [        R                  " UR                  UR                  -  U-  UR                  5      U l        O0UR                  UR                  UR                  -  U-  5      U l        UR                  c  [        R                  " SS9U l        O[        R                  " SS9U l        [        R                  " UR                   5      U l        g )Nr   r  r   )r   r   head_aggregationoutput_rangerT   r  r   r   r>   r   num_targets
projectionr  r  r  r   r  r   )r"   r9   r  
mul_factorr#   s       r$   r   PatchTSMixerLinearHead.__init__q  s     & 7 7"//""*++JJ#6 & ii!:!::ZG""DO
 2JJ!:!::ZGDO ""*::3DL::3DLzz&"5"56r&   c                 0   UR                  SS5      nU R                  S:X  a  US   nOIU R                  S:X  a  UR                  SS9R                  nOU R                  S:X  a  UR	                  SS9nU R
                  (       a  U R                  U5      nU R                  U5      nU R                  U5      nU R                  cS  U R                  bF  [        R                  " U5      U R                  S   U R                  S	   -
  -  U R                  S	   -   nU$ )
a1  
Args:
    hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
        or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
        features.

Returns:
    `torch.Tensor` of shape `(batch_size x num_targets)`.
r   r   use_last).r   max_poolr   avg_poolr   r   )rE   r/  maxvaluesrf   r  r   r2  r  r0  rI   sigmoid)r"   r(  s     r$   r+   PatchTSMixerLinearHead.forward  s
    *33B;  J.-g6O""j0-11b19@@O""j0-22r2:O<<"ll?;O,,7///:$$,43D3D3Po.$2C2CA2FIZIZ[\I]2]^aeararstauu  r&   )r  r   r  r/  r0  r2  r(   r   r5   s   @r$   r,  r,  i  s$    71 7 78   r&   r,  c                   *    \ rS rSr\rSrSrSrS r	Sr
g)PatchTSMixerPreTrainedModeli  modelpast_valuesFc                    [        U[        5      (       aE  U R                  R                  S:X  a*  [        R
                  R                  UR                  SSS9  gg[        U[        R                  [        R                  45      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       a^  UR                   R                  R                  R                  5         UR                   R                  R                  R                  S5        g[        U[        R"                  5      (       ak  UR                  R                  R                  SU R                  R$                  S9  UR                  b%  UR                  R                  R                  5         ggg)zInitialize weightsrX   r   g?)rf   rg         ?N)r%  rL   r9   r]   r   initnormal_rQ   rz   r=   r   datazero_weightfill_r7   r@   r   init_std)r"   modules     r$   _init_weights)PatchTSMixerPreTrainedModel._init_weights  s:   f<=={{33x? 3 3#3G @r~~ >??KK""$MM$$S) 566!!&&,,.##((..s3		**MM&&CT[[5I5I&J{{&  &&( ' +r&    N)r-   r.   r/   r0   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingrK  r3   rM  r&   r$   r>  r>    s     &L#O&+#)r&   r>  c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )PatchTSMixerPretrainHeadi  zSPretraining head.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r9   c                    > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l	        g r(   )
r   r   r   r   r  r  r   r>   patch_lengthbase_pt_blockrA   s     r$   r   !PatchTSMixerPretrainHead.__init__  sB    ZZ(;(;<YYv~~v7J7JKr&   c                 J    U R                  U5      nU R                  U5      nU$ )aG  
Args:
    hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
        or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
        features.

Returns:
    `torch.Tensor` of shape `(batch_size x n_vars x num_patch x patch_length)`.
)r  rV  r'  s      r$   r+    PatchTSMixerPretrainHead.forward  s)     ,,_=%%o6r&   )rV  r  r   r5   s   @r$   rS  rS    s!    L1 L r&   rS  r)   
mask_ratiounmasked_channel_indiceschannel_consistent_masking
mask_valuec                    US:  d  US:  a  [        SU S35      eU R                  u  pVpxU R                  n	[        USU-
  -  5      n
U(       a*  [        R
                  " USXyS9nUR                  SUS5      nO[        R
                  " XVXyS9n[        R                  " XVXyS9nSUSS2SS2SU
24'   [        R                  " USS9n[        R                  " USS9n[        R                  " USUS	9nUR                  S5      R                  SSSU5      nUb  SUSS2USS2SS24'   U R                  UR                  5       U5      nXS
   4$ )a  random_masking: Mask the input considering the control variables.

Args:
    inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
        The input tensor to mask.
    mask_ratio (`float`):
        Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
    unmasked_channel_indices (list, *optional*):
        Indices of channels that will not be masked.
    channel_consistent_masking (bool, *optional*, defaults to `False`):
        When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
        across channels.
    mask_value (int, *optional*, defaults to 0):
        Define the value of masked patches for pretraining.

Returns:
    `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
    n]
r   r   zMask ratio z has to be between 0 and 1.deviceNr   r   )r   index.r   )rh   r~   r`  r2   rI   randrepeatonesargsortgatherr`   masked_fillr   )r)   rZ  r[  r\  r]  r   num_channelssequence_lengthnum_featuresr`  len_keepnoisemaskids_shuffleids_restoreinputs_masks                   r$   random_maskingrr    sA   4 A~q;zl2MNOO>Dll;Jo]]F?a*n56H!

:q/IQa0 

:_T ::jODDAyy --2.K--4K<<"K8D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$r&   num_forecast_mask_patchesc                 P   [        U[        5      (       a  U/nU Vs/ s H  nSPM     nnU R                  u  pgp[        R                  " XgXR
                  S9n
/ nSn[        U5      n[        X5       HG  u  pUS::  d  X:  a  [        SU S35      e[        Xo-  U-  5      nUR                  XU/5        UU-  nMI     [        US S9nX:  a  US   S   Xl-
  -   US   S'   OX:  a  US	   S   X-
  -   US	   S'   SnU H  u  nnnUU-   nSU
UU2S
S
2U* S
24'   UnM     [        R                  " U
R                  S   5      nU
U   n
U
R                  S	5      R                  SSSU	5      n
Ub  SU
S
S
2US
S
2S
S
24'   U R                  U
R                  5       U5      nUU
S   4$ s  snf )ai  Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

Parameters:
    inputs (`torch.Tensor`):
        Input of shape `(bs, num_channels, num_patch, patch_length)`
    num_forecast_mask_patches (`list`):
        Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
    unmasked_channel_indices (`list`, *optional*):
        Indices of channels that are not masked.
    mask_value (`int`, *optional*, defaults to 0):
        Values in the masked patches will be filled by `mask_value`.

Returns:
    `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
    num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
r   r_  r   znum_forecast_mask_patches z6 should be greater than 0 and less than total patches.c                     U S   $ NrD   rM  )xs    r$   <lambda>"forecast_masking.<locals>.<lambda>P  s    !A$r&   )keyrD   r   Nrb  )r%  r2   r~   rI   rS   r`  sumziprh   r  sortedrandpermr`   rd  rh  r   )r)   rs  r[  r]  r   forecast_mask_ratiosr   ri  rj  rk  rn  t_listtotal_lengthtotal_ratiorU  ratiotemp_lenbatch1	patch_lenbatch2permrq  s                         r$   forecast_maskingr  $  s   0 +S11%>$?!'@A'@!A'@A>Dll;Jo;;zWDFL*+K"#<S1 ?,\N:pq  z)K78|H56   T F/F ay|z'@Aq	!		"r
1)BCr
1F"(	1h("./VF]A	z{*+ #)
 >>$**Q-(D:D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$O Bs   F#c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerPatchifyii  z
A class to patchify the time series sequence into different patches

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
r9   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  ::  a&  [        SU R                   SU R                   S35      e[        U R                  U R                  5      U R                  -
  U R
                  -  S-   U l        U R                  U R
                  U R                  S-
  -  -   nU R                  U-
  U l	        g )NzSequence length (z+) has to be greater than the patch length ()r   )
r   r   context_lengthrj  rU  patch_striderh   r9  rT   sequence_start)r"   r9   new_sequence_lengthr#   s      r$   r   PatchTSMixerPatchify.__init__q  s    %44"//"//4#4#44#D$8$8#99deievevdwwxy 
   4 4d6G6GH4K\K\\aeararruvv"//$2C2CtGWGWZ[G[2\\"225HHr&   r@  c                 4   UR                   S   nX R                  :w  a  [        SU SU R                   S35      eUSS2U R                  S2SS24   nUR	                  SU R
                  U R                  S9nUR                  SS5      R                  5       nU$ )z
Parameters:
    past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
        Input for patchification

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
r   zInput sequence length (z%) doesn't match model configuration (r   N)	dimensionr   stepr.  )	r~   rj  rh   r  unfoldrU  r  rE   
contiguous)r"   r@  rj  rF   s       r$   r+   PatchTSMixerPatchify.forward  s     &++B/222)/)::_`d`t`t_uuwx  Q 3 3 5q89$2C2C$J[J[\!!"b)446r&   )rT   rU  r  rj  r  rH   r5   s   @r$   r  r  i  s,    I1 I"5<<  r&   r  c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerMaskingi  ap  
Class to perform random or forecast masking.

Parameters:
    config (`PatchTSMixerConfig`): model config
Returns:
    x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
        Masked patched input
    mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
        Bool tensor indicating True on masked points
r9   c                 >  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        U R                  b  [        U R                  5      U l        g g r(   )	r   r   random_mask_ratior\  	mask_typers  r[  r]  r}  rA   s     r$   r   PatchTSMixerMasking.__init__  s    !'!9!9*0*K*K')))/)I)I&(.(G(G% ++((4,243P3P,QD) 5r&   rl   c                 d   U R                   S:X  a8  [        UU R                  U R                  U R                  U R
                  S9u  p#OVU R                   S:X  a-  [        UU R                  U R                  U R
                  S9u  p#O[        SU R                    S35      eUR                  5       nX#4$ )a  
Parameters:
    patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
        Patch input

Return:
    masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
        Masked patched input
    mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
        Bool tensor indicating True on masked points

rX   )r)   rZ  r[  r\  r]  r)  )r)   rs  r[  r]  zInvalid mask type .)
r  rr  r  r[  r\  r]  r  rs  rh   r   )r"   rl   masked_inputrn  s       r$   r+   PatchTSMixerMasking.forward  s     >>X%!/"11)-)F)F+/+J+J??"L$ ^^z)!1"*.*H*H)-)F)F??	"L$ 1$..1ACDD yy{!!r&   )r\  r  r]  rs  r  r[  rH   r5   s   @r$   r  r    s,    
	R1 	R!"5<< !" !"r&   r  c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	\R                  \R                  \R                  4   4S jr
S	rU =r$ )
PatchTSMixerStdScaleri  z
Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
subtracting from the mean and dividing by the standard deviation.
r9   c                   > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  OSU l        [        US5      (       a  UR                  U l        g SU l        g )Nscaling_dimr   keepdimTminimum_scalegh㈵>)r   r   hasattrr  r   r  r  rA   s     r$   r   PatchTSMixerStdScaler.__init__  sd    )0)G)G6%%Q)0)C)Cv~~5<V_5U5UV11[_r&   rE  observed_indicatorrV   c                 r   UR                  U R                  U R                  S9nUR                  S5      nX-  R                  U R                  U R                  S9U-  nX-
  U-  S-  R                  U R                  U R                  S9U-  n[        R
                  " XPR                  -   5      nX-
  U-  XF4$ )  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        input for Batch norm calculation
    observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Calculating the scale on the observed indicator.
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, num_input_channels)`)
r  rB  rD   )r{  r   r  	clamp_minrI   sqrtr  )r"   rE  r  denominatorlocvariancescales          r$   r+   PatchTSMixerStdScaler.forward  s     ),,TXXt||,L!++C0(--dhh-MP[[j$661<AA$((TXT`T`Aadoo

8&8&889
e#S//r&   )r   r  r  r-   r.   r/   r0   r1   r   r   rI   rJ   r   r+   r3   r4   r5   s   @r$   r  r    sY    
`1 `0LL06;ll0	u||U\\5<<7	80 0r&   r  c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	\R                  \R                  \R                  4   4S jr
S	rU =r$ )
PatchTSMixerMeanScaleri  z~
Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
accordingly.
r9   c                 N  > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  OSU l        [        US5      (       a  UR                  OSU l        [        US5      (       a  UR                  U l        g S U l        g )Nr  r   r  Tr  绽|=default_scale)r   r   r  r  r   r  r  r  rA   s     r$   r   PatchTSMixerMeanScaler.__init__  s    )0)G)G6%%Q)0)C)Cv~~5<V_5U5UV11[`5<V_5U5UV11[_r&   rE  r  rV   c                    X-  R                  5       R                  U R                  SS9nUR                  U R                  SS9nU[        R                  " USS9-  nU R
                  cL  UR                  SS9n[        R                  " UR                  S5      SS9n[        R                  " Xg-  5      nO#U R
                  [        R                  " U5      -  n[        R                  " US:  XX5      n[        R                  " XPR                  S9nX-  n	U R                  (       d  UR                  U R                  S9nU	[        R                  " U5      U4$ )r  Tr  r   minr   r   )absr{  r   rI   clampr  squeeze	ones_likewherer  r  
zeros_like)
r"   rE  r  ts_sumnum_observedr  	batch_sumbatch_observationsr  scaled_datas
             r$   r+   PatchTSMixerMeanScaler.forward  s"    +00266txx6N)--dhh-E\q99 %

q
)I!&\-=-=a-@a!H!MM)*HIM ..1GGM L1,eC E'9'9:l||MMdhhM/EE,,U3U::r&   )r  r   r  r  r  r5   s   @r$   r  r    sY    
`1 `&;LL&;6;ll&;	u||U\\5<<7	8&; &;r&   r  c            
          ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\
\R                  \R                  \R                  4   4S jjrS	rU =r$ )PatchTSMixerNOPScaleri2  zt
Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
r9   c                    > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  U l        g SU l        g )Nr  r   r  T)r   r   r  r  r   r  rA   s     r$   r   PatchTSMixerNOPScaler.__init__7  sF    )0)G)G6%%Q)0)C)Cv~~r&   rE  r  rV   c                     [         R                  " USS9R                  U R                  U R                  S9n[         R
                  " USS9R                  U R                  U R                  S9nXU4$ )aP  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        input for Batch norm calculation
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, num_input_channels)`)
FrY   r   r  )rI   r  rf   r   r  r  )r"   rE  r  r  r  s        r$   r+   PatchTSMixerNOPScaler.forward<  sg     E:??DHHVZVbVb?ct59>>488UYUaUa>b%r&   r  r(   )r-   r.   r/   r0   r1   r   r   rI   rJ   r   r   r+   r3   r4   r5   s   @r$   r  r  2  se    N1 N PT LL 6>u||6L 	u||U\\5<<7	8   r&   r  c                   p    \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                        \	S'   Srg)PatchTSMixerEncoderOutputiM  az  
Base class for `PatchTSMixerEncoderOutput`, with potential hidden states.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, d_model)`):
        Hidden-state at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer.
Nlast_hidden_stater   rM  )r-   r.   r/   r0   r1   r  r   rI   FloatTensor__annotations__r   r   r3   rM  r&   r$   r  r  M  s9     6:x 1 1298<M8E%"3"345<r&   r  c                      ^  \ rS rSrSrS\4U 4S jjr\  SS\R                  S\
\   S\
\   S\\\4   4S	 jj5       rS
rU =r$ )PatchTSMixerEncoderi]  z
Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r9   c                 T  > [         TU ]  U5        UR                  U l        [        R                  " UR
                  UR                  5      U l        UR                  (       a  [        US9U l
        OS U l
        [        US9U l        UR                  (       a  U R                  5         g g r   )r   r   use_return_dictr   r   rU  r>   patcherrO   rL   positional_encoderr   mlp_mixer_encoder	post_initrA   s     r$   r   PatchTSMixerEncoder.__init__f  s     %55yy!4!4fnnE))&DF&SD#&*D#!2&!A NN r&   r@  r  return_dictrV   c                     Ub  UOU R                   nU R                  U5      nU R                  b  U R                  U5      nU R                  XBS9u  pVU(       d  [	        S UU4 5       5      $ [        XVS9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to
    predict the masked portion. For a forecasting task, this denotes the history/past time series values.
    Similarly, for classification or regression tasks, it denotes the appropriate context values of the
    time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series,
    it is greater than 1.

Returns:
    `torch.FloatTensor` of shape `(batch_size, n_vars, num_patches, d_model)`
)r  c              3   &   #    U  H  nUv   M	     g 7fr(   rM  r   vs     r$   r"  .PatchTSMixerEncoder.forward.<locals>.<genexpr>  s      A    )r  r   )r  r  r  r  r&  r  )r"   r@  r  r  patchesr  r   s          r$   r+   PatchTSMixerEncoder.forwardv  s    * &1%<k$BVBV ,,{+ "".--g6G+/+A+A'+A+u(  &!   );Ljjr&   )r  r  r  r  )FN)r-   r.   r/   r0   r1   r   r   r   rI   rJ   r   r   r   r   r  r+   r3   r4   r5   s   @r$   r  r  ]  st    1    05&*	(k\\(k 'tn(k d^	(k
 
u//	0(k (kr&   r  c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                        \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   S
rg)PatchTSMixerModelOutputi  aQ  
Base class for model's outputs, with potential hidden states.

Args:
    last_hidden_state (`torch.FloatTensor`  of shape `(batch_size, num_channels, num_patches, d_model)`):
        Hidden-state at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer.
    patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
        Patched input data to the model.
    mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*):
        Bool Tensor indicating True in masked patches and False otherwise.
    loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*):
        Gives the mean of the context window per channel. Used for revin denorm outside the model, if revin
        enabled.
    scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*):
        Gives the std dev of the context window per channel. Used for revin denorm outside the model, if revin
        enabled.
Nr  r   rl   rn  r  r  rM  )r-   r.   r/   r0   r1   r  r   rI   r  r  r   r   rl   rn  r  r  r3   rM  r&   r$   r  r    s    ( 6:x 1 1298<M8E%"3"345</3K%++,3(,D(5$$
%,'+C%##	$+)-E8E%%&-r&   r  z=
    The PatchTSMixer Model for time-series forecasting.
    )custom_introc                      ^  \ rS rSrSS\S\4U 4S jjjr\   SS\R                  S\
\R                     S\
\   S\
\   S	\4
S
 jj5       rSrU =r$ )PatchTSMixerModeli  r9   
mask_inputc                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        USL a  [        U5      U l        OSU l        UR                  S:X  a  [        U5      U l        O@UR                  S:X  d  UR                  SL a  [        U5      U l        O[        U5      U l        UR                  (       a  U R                  5         gg)z}
mask_input (bool, *optional*, defaults to `False`):
    Whether to mask the input using the [`PatchTSMixerMasking`] module.
TNrf   rg   )r   r   r  r  encoderr  patchingr  maskingr   r  scalerr  r  r  )r"   r9   r  r#   s      r$   r   PatchTSMixerModel.__init__  s    
 	 %55*62,V4.v6DLDL>>V#08DK^^u$$(>/7DK/7DK NN r&   r@  observed_maskr  r  rV   c           	         Ub  UOU R                   nSnUc  [        R                  " U5      nU R                  X5      u  pgnU R	                  U5      n	U	n
U R
                  b  U R                  U	5      u  pU R                  U
UUS9n[        U[        5      (       a  [        U6 nU(       d,  [        S UR                  UR                  U	UUU4 5       5      $ [        UR                  UR                  U	UUUS9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:
    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
Nr  r  c              3   &   #    U  H  nUv   M	     g 7fr(   rM  r  s     r$   r"  ,PatchTSMixerModel.forward.<locals>.<genexpr>        
A r  )r  r   rl   rn  r  r  )r  rI   r  r  r  r  r  r%  r&  r  r  r   r  )r"   r@  r  r  r  rn  scaled_past_valuesr  r  	patched_x	enc_inputencoder_outputs               r$   r+   PatchTSMixerModel.forward  s   , &1%<k$BVBV !OOK8M)-[)P&MM"45		<<#"ll95OI !5# & 
 ne,,6GN 
 #44"00
 
 
 ',>>(66!
 	
r&   )r  r  r  r  r  r  )NFN)r-   r.   r/   r0   r   r   r   r   rI   rJ   r   r  r+   r3   r4   r5   s   @r$   r  r    s    1 t  6  15/4&*A
\\A
  -A
 'tn	A

 d^A
 
!A
 A
r&   r  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Srg)	 PatchTSMixerForPreTrainingOutputi&  ax  
Output type of [`PatchTSMixerForPreTrainingOutput`].

Args:
    prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, patch_length)`):
        Prediction output from the pretrain head.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
        Backbone embeddings before passing through the head.
    loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
        Total loss
Nlossprediction_outputsr  r   rM  r-   r.   r/   r0   r1   r  r   rI   r  r  r  r  r   r   r3   rM  r&   r$   r  r  &  d     )-D(5$$
%,6:!2!23:59x 1 1298<M8E%"3"345<r&   r  c                      ^  \ rS rSrSrS\4U 4S jjr\    SS\R                  S\
\R                     S\
\   S\S	\
\   S
\4S jj5       rSrU =r$ )PatchTSMixerForPretrainingi<  z}
`PatchTSMixer` for mask pretraining.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

Returns:
    `None`.
r9   c                    > [         TU ]  U5        [        USS9U l        [	        US9U l        UR                  U l        UR                  U l        UR                  (       a  U R                  5         g g )NT)r  r   )	r   r   r  r?  rS  headmasked_lossr  r  rA   s     r$   r   #PatchTSMixerForPretraining.__init__H  s`     &v$?
,F;	!--%55 NN r&   r@  r  r  return_lossr  rV   c                    Ub  UOU R                   nU R                  SL a  [        R                  R	                  SS9nO[        R                  R	                  SS9nU R                  UUUUS9n[        U[        5      (       a  [        U6 nU R                  UR                  5      nUSL a  U" XR                  5      n	OSn	U R                  SL aK  U	bH  U	R                  SS9UR                  -  R                  5       UR                  R                  5       S	-   -  n	U(       d*  [        S
 U	UUR                  UR                  4 5       5      $ [!        U	UUR                  UR                  S9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:
    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
return_loss (`bool`,  *optional*):
    Whether to return the loss in the `forward` call.
NTnone	reductionrf   r  r  r  r   r   r  c              3   &   #    U  H  nUv   M	     g 7fr(   rM  r  s     r$   r"  5PatchTSMixerForPretraining.forward.<locals>.<genexpr>        A r  r  r  r  r   )r  r  rI   r   MSELossr?  r%  r&  r  r
  r  rl   rf   rn  r{  r   r  )
r"   r@  r  r  r  r  r  model_outputx_hatloss_vals
             r$   r+   "PatchTSMixerForPretraining.forwardS  sp   2 &1%<k$BVBVt#88##f#5D88##f#5D zz'!5#	 " 
 lE**2LAL		,889$E#;#;<HH t#(< "-0A0AAFFHLL]L]LaLaLcfkLklH   22 ..	   0$*<<&44	
 	
r&   )r
  r  r?  r  NFTN)r-   r.   r/   r0   r1   r   r   r   rI   rJ   r   r   r  r+   r3   r4   r5   s   @r$   r  r  <  s    		1 	  15/4 &*D
\\D
  -D
 'tn	D

 D
 d^D
 
*D
 D
r&   r  c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   S
rg)PatchTSMixerForPredictionOutputi  a|  
Output type of [`PatchTSMixerForPredictionOutput`].

Args:
    prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_input_channels)`):
        Prediction output from the forecast head.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
        Backbone embeddings before passing through the head.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
        Total loss.
    loc (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
        Input mean
    scale (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
        Input std dev

Nr  r  r  r   r  r  rM  )r-   r.   r/   r0   r1   r  r   rI   r  r  r  r  r   r   r  r  r3   rM  r&   r$   r  r    s    & )-D(5$$
%,6:!2!23:59x 1 1298<M8E%"3"345<'+C%##	$+)-E8E%%&-r&   r  c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)"SamplePatchTSMixerPredictionOutputi  a!  
Base class for time series model's predictions outputs that contains the sampled values from the chosen
distribution.

Args:
    sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length, number_channels)`):
        Sampled values from the chosen distribution.
N	sequencesrM  r-   r.   r/   r0   r1   r!  r   rI   r  r  r3   rM  r&   r$   r   r          .2Ix))*1r&   r   c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)"SamplePatchTSMixerRegressionOutputi  a  
Base class for time series model's predictions outputs that contains the sampled values from the chosen
distribution.

Args:
    sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_targets)`
            Sampled values from the chosen distribution.
Nr!  rM  r"  rM  r&   r$   r%  r%    r#  r&   r%  inputtargetrV   c                 &    U R                  U5      * $ )z[
Computes the negative log likelihood loss from input distribution with respect to target.
)log_prob)r&  r'  s     r$   nllr*    s     NN6"""r&   input_tensorweightsc                 R   Ub  [         R                  " US:g  X-  [         R                  " U 5      5      n[         R                  " U(       a  UR	                  US9OUR	                  5       SS9nU(       a  UR	                  US9U-  $ UR	                  5       U-  $ U R                  US9$ )a:  
Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

Args:
    input_tensor (`torch.FloatTensor`):
        Input tensor, of which the average must be computed.
    weights (`torch.FloatTensor`, *optional*):
        Weights tensor, of the same shape as `input_tensor`.
    dim (`int`, *optional*):
        The dim along which to average `input_tensor`.

Returns:
    `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
r   r   rB  r  )rI   r  r  r  r{  rf   )r+  r,  r   weighted_tensorsum_weightss        r$   weighted_averager0    s      ++glL4JEL\L\]iLjkkk#'++#+"67;;=VYZ03###,R]]]9L9L9NR]]]  S ))r&   c                     ^  \ rS rSrSrS\4U 4S jjr\     SS\R                  S\
\R                     S\
\R                     S\
\   S	\S
\
\   S\4S jj5       r SS\R                  S\
\R                     S\4S jjrSrU =r$ )PatchTSMixerForPredictioni  z
`PatchTSMixer` for forecasting application.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

Returns:
    `None`.
r9   c                 4  > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  S:X  a  S U l        OaUR                  n[        [        [        S.nUR                  UR                  S 5      nUb  U" US9U l        O[        SUR                   35      e[        U5      U l        [        UU R                  S9U l        UR"                  (       a  U R#                  5         g g )Nmse	student_tnormalnegative_binomialr   Unknown distribution output r9   r  )r   r   r  r  r  num_parallel_samplesr  r  r   r   r
   getrh   r  r?  r  r
  r  )r"   r9   r   distribution_output_mapoutput_classr#   s        r$   r   "PatchTSMixerForPrediction.__init__  s     KK	%55*0*K*K'$*$?$?!;;%'+D$**C+&%;'#
 366v7Q7QSWXL'+7C+@( #?@Z@Z?[!\]]&v.
1 $ 8 8
	 NN r&   r@  r  future_valuesr  r  r  rV   c           	         U R                   S:X  a  [        R                  " SS9nO"U R                   S:X  a  [        nO[	        S5      eUb  UOU R
                  nU R                  UUUUS9n[        U[        5      (       a  [        U6 nU R                  UR                  5      n	Sn
U R                  b  U R                  (       ay  U R                  R                  U	UR                  SU R                  4   UR                   SU R                  4   S	9nUb(  US
L a#  U" UUSU R                  4   5      n
[#        U
5      n
OXR                   SU R                  4   -  UR                  SU R                  4   -   n	Ub  US
L a  U" XSU R                  4   5      n
OU R                  (       aJ  U R                  R                  XR                  UR                   S	9nUb  US
L a  U" X5      n
[#        U
5      n
O+XR                   -  UR                  -   n	Ub  US
L a  U" X5      n
U R                  b7  UR                  SU R                  4   nUR                   SU R                  4   nOUR                  nUR                   nU(       d,  [        S U
U	UR                  UR$                  UU4 5       5      $ ['        U
U	UR                  UR$                  UUS9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:
    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
future_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,:
    `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
    Target values of the time series, that serve as labels for the model. The `future_values` is what the
    Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
    required for a pretraining task.

    For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
    to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
    pass the target data with all channels, as channel Filtering for both prediction and target will be
    manually applied before the loss computation.
return_loss (`bool`,  *optional*):
    Whether to return the loss in the `forward` call.
r4  rf   r  r*  2Invalid loss function: Allowed values: mse and nllNr  .r  r  Tc              3   &   #    U  H  nUv   M	     g 7fr(   rM  r  s     r$   r"  4PatchTSMixerForPrediction.forward.<locals>.<genexpr>  r  r  )r  r  r  r   r  r  )r  r   r  r*  rh   r  r?  r%  r&  r  r
  r  r  r  distributionr  r  r0  r   r  )r"   r@  r  r@  r  r  r  r  r  y_hatr  rF  r  r  s                 r$   r+   !PatchTSMixerForPrediction.forward!  s   H 99::/DYY%DQRR%0%<k$BVBV zz'!5#	 " 
 lE**2LAL 		,889**6''#77DD$((d.M.M)MN&,,S$2Q2Q-QR  E  
 !,1D#$%c4+J+J&JK H
  09H ..sD4S4S/STT"&&sD,K,K'KLM  !,1D#Ed>]>]9]+^_H''#77DD//|7I7I  E   !,1D#L@H/9H 2 22\5E5EE ,1D#E9H**6""3(G(G#GHC &&sD,K,K'KLE""C &&E 
  22 ..
 
 
 /$*<<&44
 	
r&   c                 4   U R                   nU " USUSS9nU R                  R                  UR                  UR                  UR
                  S9n[        U5       Vs/ s H  oeR                  5       PM     nn[        R                  " USS9n[        US9$ s  snf )aX  
Generate sequences of sample predictions from a model with a probability distribution head.

Args:
    past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Past values of the time series that serves as context in order to predict the future.

    observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

Return:
    [`SamplePatchTSMixerPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
    number of samples, prediction_length, num_input_channels)`.
NF)r@  r@  r  r  rC  r   r   r!  )r;  r  rF  r  r  r  r  samplerI   stackr   )r"   r@  r  r;  outputsrF  r   sampless           r$   generate"PatchTSMixerForPrediction.generate  s    0  $88 #'!&	
 //<<&&GKKw}} = 

 388L2MN2MQ&&(2MN ++g1-1GDD	 Os   B)r  r
  r  r?  r;  r  r  )NNFTNr(   )r-   r.   r/   r0   r1   r   r   r   rI   rJ   r   r   r  r+   r   rO  r3   r4   r5   s   @r$   r2  r2    s    	1 @  1504/4 &*w
\\w
  -w
  -	w

 'tnw
 w
 d^w
 
)w
 w
x 15-E\\-E  --E 
,	-E -Er&   r2  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Srg)	-PatchTSMixerForTimeSeriesClassificationOutputi  a  
Output type of [`PatchTSMixerForTimeSeriesClassificationOutput`].

Args:
    prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
        Prediction output from the classification head.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
        Backbone embeddings before passing through the head.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
        Total loss.
Nr  r  r  r   rM  r  rM  r&   r$   rR  rR    r  r&   rR  c                      ^  \ rS rSrSrS\4U 4S jjr\    SS\R                  S\
\R                     S\
\   S\S	\
\   S
\4S jj5       rSrU =r$ )'PatchTSMixerForTimeSeriesClassificationi  z
`PatchTSMixer` for classification application.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

Returns:
    `None`.
r9   c                 <  > [         TU ]  U5        [        U5      U l        [	        US9U l        UR                  U l        UR                  S;   a$  [        UR                  UR                  S9U l        OS U l        UR                  (       a  U R                  5         g g )Nr   rg   rf   Tr>   rT   )r   r   r  r?  r,  r
  r  r   InjectScalerStatistics4Dr>   rT   inject_scaler  rA   s     r$   r   0PatchTSMixerForTimeSeriesClassification.__init__  s     &v.
*
	  &55>>22 8]c]o]o pD $D NN r&   r@  target_valuesr  r  r  rV   c                 <   [         R                  R                  5       nUb  UOU R                  nU R	                  UUUS9n[        U[        5      (       a  [        U6 nU R                  b4  U R                  UR                  UR                  UR                  S9Ul	        U R                  UR                  5      nUb  USL a	  U" X5      n	OSn	U(       d*  [        S U	UUR                  UR                  4 5       5      $ [        U	UUR                  UR                  S9$ )aH  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
target_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
    `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
    Target
    values of the time series, that serve as labels for the model. The `target_values` is what the
    Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
    required for a pretraining task.

    For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
    to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
    pass the target data with all channels, as channel Filtering for both prediction and target will be
    manually applied before the loss computation.

    For a classification task, it has a shape of `(batch_size,)`.

    For a regression task, it has a shape of `(batch_size, num_targets)`.
return_loss (`bool`, *optional*):
    Whether to return the loss in the `forward` call.
Nr  rC  Tc              3   &   #    U  H  nUv   M	     g 7fr(   rM  r  s     r$   r"  BPatchTSMixerForTimeSeriesClassification.forward.<locals>.<genexpr>=  r  r  r  )rI   r   CrossEntropyLossr  r?  r%  r&  r  rY  r  r  r  r
  r   rR  )
r"   r@  r[  r  r  r  r  r  rG  r  s
             r$   r+   /PatchTSMixerForTimeSeriesClassification.forward  s/   H xx((*%0%<k$BVBVzz!5# " 

 lE**2LAL(-1->->.. $$"(( .? .L* 		,889$)<E1HH   22 ..	   =$*<<&44	
 	
r&   )r
  rY  r?  r  r  )r-   r.   r/   r0   r1   r   r   r   rI   rJ   r   r   rR  r+   r3   r4   r5   s   @r$   rT  rT    s    	1 "  15/4 &*M
\\M
  -M
 'tn	M

 M
 d^M
 
7M
 M
r&   rT  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Srg)	PatchTSMixerForRegressionOutputiO  a  
Output type of [`PatchTSMixerForRegressionOutput`].

Args:
    regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
        Prediction output from the regression head.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
        Backbone embeddings before passing through the head.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
        Total loss.
Nr  regression_outputsr  r   rM  )r-   r.   r/   r0   r1   r  r   rI   r  r  rc  r  r   r   r3   rM  r&   r$   rb  rb  O  r  r&   rb  c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS\R                  S\R                  S\R                  4S	 jrS
r	U =r
$ )rX  ie  r>   rT   	expansionc                 (  > [         TU ]  5         [        R                  " US-   X1-  5      U l        [        R                  " X1-  U5      U l        [        R                  " SSU-  5      U l        [        R                  " SU-  S5      U l        X l        g rv  )	r   r   r   r   inverse_trans_expansioninverse_trans_compressionmap_scale_expansionmap_scale_compressionrT   )r"   r>   rT   re  r#   s       r$   r   !InjectScalerStatistics4D.__init__f  sr    ')yy1i>Q'R$)+93F)P&#%99QI#> %'YYq9}a%@"&r&   r)   r  r  c                    UR                  SS5      nUR                  S5      nUR                  SSU R                  S5      nUR                  SS5      nUR                  S5      nUR                  SSU R                  S5      n[        R
                  " XE/SS9nU R                  U5      nU R                  U5      n[        R
                  " X/SS9nU R                  U5      nU R                  U5      nU$ )aQ  
Args:
    inputs (`torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`)
    loc (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
    scale (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
Returns:
    `torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`
r   r   r   r   )
rE   r`   rd  rT   rI   catri  rj  rg  rh  )r"   r)   r  r  rf   stdevconcat_statss          r$   r+    InjectScalerStatistics4D.forwardo  s     }}R$~~b!{{1a!1!115B'#Q4#3#3Q7yy$B7//=11,?F1r:--f5//7r&   )rh  rg  rj  ri  rT   )rD   )r-   r.   r/   r0   r2   r   rI   rJ   r+   r3   r4   r5   s   @r$   rX  rX  e  sM    ' '# '# ' 'ell  ell  r&   rX  c                      ^  \ rS rSrSrS\4U 4S jjr\    SS\R                  S\
\R                     S\
\   S\S	\
\   S
\4S jj5       rS\R                  S
\4S jrSrU =r$ )PatchTSMixerForRegressioni  z
`PatchTSMixer` for regression application.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

Returns:
    `None`.
r9   c                   > [         TU ]  U5        [        U5      U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  S:X  a  S U l        O^[        [        [        S.nUR                  UR
                  5      nUb  U" UR                  S9U l        O[        SUR
                   35      eUR                  S;   a$  [        UR                   UR"                  S9U l        OS U l        ['        UU R
                  S9U l        UR*                  (       a  U R+                  5         g g )Nr4  r5  r   r9  rV  rW  r:  )r   r   r  r?  r  r  r  r;  r   r   r
   r<  r1  rh   r   rX  r>   rT   rY  r,  r
  r  )r"   r9   r=  r>  r#   s       r$   r   "PatchTSMixerForRegression.__init__  s$    &v.
KK	#)#=#= %55$*$?$?!;;%'+D$ ,&%;'#
 366v7Q7QRL'+7F<N<N+O( #?@Z@Z?[!\]]>>22 8]c]o]o pD $D* $ 8 8
	 NN r&   r@  r[  r  r  r  rV   c           	         U R                   S:X  a  [        R                  " SS9nO"U R                   S:X  a  [        nO[	        S5      eUb  UOU R
                  nU R                  UUUS9n[        U[        5      (       a  [        U6 nU R                  b4  U R                  UR                  UR                  UR                  S9Ul        U R                  UR                  5      nUb  US	L a  U R                  (       a  U R                  S
:X  a)  [         R"                  " US:  5      (       a  [%        S5      eU R                  R'                  U5      n	[        U V
s/ s H(  oR)                  SU R*                  R,                  5      PM*     sn
5      nU" X5      n[/        U5      nOU" X5      nOSnU(       d*  [        S UUUR                  UR0                  4 5       5      $ [3        UUUR                  UR0                  S9$ s  sn
f )aD  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
target_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
    `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
    Target values of the time series, that serve as labels for the model. The `target_values` is what the
    Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
    required for a pretraining task.

    For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
    to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
    pass the target data with all channels, as channel Filtering for both prediction and target will be
    manually applied before the loss computation.

    For a classification task, it has a shape of `(batch_size,)`.

    For a regression task, it has a shape of `(batch_size, num_targets)`.
return_loss (`bool`, *optional*):
    Whether to return the loss in the `forward` call.
r4  rf   r  r*  rB  Nr  rC  Tr8  r   zDtarget_values cannot be negative for negative_binomial distribution.r   c              3   &   #    U  H  nUv   M	     g 7fr(   rM  r  s     r$   r"  4PatchTSMixerForRegression.forward.<locals>.<genexpr>  r  r  )r  rc  r  r   )r  r   r  r*  rh   r  r?  r%  r&  r  rY  r  r  r  r
  r  rI   any	ExceptionrF  r   r9   r1  r0  r   rb  )r"   r@  r[  r  r  r  r  r  rG  rF  itemr  s               r$   r+   !PatchTSMixerForRegression.forward  s   F 99::/DYY%DQRR%0%<k$BVBVzz!5# " 

 lE**2LAL(-1->->.. $$"(( .? .L* 		,889$)<''++/BBuyyQ^abQbGcGc#$jkk#77DDUKRWXRW$yyT[[-D-DERWXY<+H55H   22 ..	   /$*<<&44	
 	
) Ys   /G?c                 R   U R                   nU " USSS9nU R                  R                  UR                  5      n[	        U5       Vs/ s H  oTR                  5       PM     nn[        R                  " USS9R                  SX R                  R                  5      n[        US9$ s  snf )a  
Generate sequences of sample predictions from a model with a probability distribution head.

Args:
    past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Past values of the time series that serves as context in order to predict the target values.

Return:
    [`SamplePatchTSMixerRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
    number of samples, num_targets)`.
NF)r@  r[  r  r   r   r   rJ  )r;  r  rF  rc  r  rK  rI   rL  r   r9   r1  r%  )r"   r@  r;  rM  rF  r   rN  s          r$   rO  "PatchTSMixerForRegression.generate  s       $88 #!&
 //<<W=W=WX ,11E+F
+Fa!+F 	 

 ++g1-2227K[[MdMde1GDD
s   B$)r  r
  rY  r  r?  r;  r  r  )r-   r.   r/   r0   r1   r   r   r   rI   rJ   r   r   rb  r+   r%  rO  r3   r4   r5   s   @r$   rr  rr    s    	%1 %N  15/4 &*Z
\\Z
  -Z
 'tn	Z

 Z
 d^Z
 
)Z
 Z
x#E\\#E 
,#E #Er&   rr  )r>  r  r  r2  rT  rr  )NFr   )Nr   )NN)Mr1   rb   dataclassesr   typingr   r   r   rI   torch.nnr   transformers.modeling_utilsr   transformers.utilsr   time_series_utilsr
   r   r   utilsr   r   utils.deprecationr   configuration_patchtsmixerr   
get_loggerr-   r   Moduler   r7   rL   rs   r   r   r   r   r   r   r   r  r,  r>  rS  rJ   r   listr   r2   rr  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r%  distributionsDistributionr*  r0  r2  rR  rT  rb  rX  rr  __all__rM  r&   r$   <module>r     s   "  ! ) )   7 * U U , 0 : 
		H	% *&BII &,$RYY $N.BII .bbii .-299 -b~8BII ~8BBbii BJ*		 *Z#		 #L&#		 &#R5BII 5pDRYY DN )/ ) )2ryy D 04',7%LL7%7% 'tn7% !%	7%
 7%| 04	A%LLA%$T3Y/A% 'tnA% 	A%J-299 -b9"")) 9"z 0BII  0H3;RYY 3;n BII  6 = = =Bk5 BkJ .k . .: 
^
3 ^

^
B ={ = =*\
!< \
~ .k . .8 
2 
2 
2 
2 
2 
2#u""// # #%,, #*5<< *(5<<:P *fkfrfr *0SE ; SEl =K = =*k
.I k
\ =k = =*%ryy %PsE ; sElr&   