
    fThs`                    P   S r SSKrSSKJr  SSKJrJrJr  SSKrSSKJ	r	  SSK
Jr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJrJrJr  SSKJr  \R4                  " \5      r " S S\	R:                  5      r " S S\	R:                  5      r   S\S\R@                  S\!S\\"   S\#S\$4
S jjr%  S]S\R@                  S\\"\$4   S\\"   S\$4S jjr& " S S\	R:                  5      r' " S S\	R:                  5      r( " S S\	R:                  5      r)\ " S  S!\5      5       r* " S" S#\	R:                  5      r+ " S$ S%\	R:                  5      r, " S& S'\*5      r-\ " S( S)\5      5       r.\ " S* S+\5      5       r/\ " S, S-\5      5       r0\ " S. S/\5      5       r1\ " S0 S1\5      5       r2\ " S2 S3\5      5       r3S4\Rh                  Rj                  S5\R@                  S6\R@                  4S7 jr6S^S8\R@                  S9\\R@                     S6\R@                  4S: jjr7 " S; S<\	R:                  5      r8 " S= S>\	R:                  5      r9 " S? S@\	R:                  5      r: " SA SB\	R:                  5      r;\ " SC SD\*5      5       r< " SE SF\	R:                  5      r=\" SGSH9 " SI SJ\*5      5       r> " SK SL\	R:                  5      r?\" SMSH9 " SN SO\*5      5       r@\" SPSH9 " SQ SR\	R:                  5      5       rA\" SSSH9 " ST SU\*5      5       rB " SV SW\	R:                  5      rC\" SXSH9 " SY SZ\*5      5       rD/ S[QrEg)_zPyTorch PatchTST model.    N)	dataclass)OptionalTupleUnion)nn   )ACT2CLS)BaseModelOutput)PreTrainedModel)NegativeBinomialOutputNormalOutputStudentTOutput)ModelOutputauto_docstringlogging   )PatchTSTConfigc                     ^  \ rS rSrSr     SS\S\S\S\S\S\S	\\	   4U 4S
 jjjr
S\R                  S\S\4S jr     SS\R                  S\\R                     S\\\R                        S\\R                     S\\R                     S\S\\R                  \\R                     \\\R                        4   4S jjrSrU =r$ )PatchTSTAttention$   z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropout
is_decoderbias	is_causalconfigc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).g      ࿩r   )super__init__r   r   r   head_dimr   
ValueErrorscalingr   r   r   Lineark_projv_projq_projout_proj)	selfr   r   r   r   r   r   r   	__class__s	           f/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/patchtst/modeling_patchtst.pyr"   PatchTSTAttention.__init__'   s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TB    tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr      )viewr   r#   	transpose
contiguous)r+   r0   r1   r2   s       r-   _shapePatchTSTAttention._shapeF   s5    {{3GQQRSUVWbbddr/   hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsreturnc                 	   USLnUR                  5       u  pn
U R                  U5      U R                  -  nU(       a2  Ub/  US   R                  S   UR                  S   :X  a  US   nUS   nGOU(       aE  U R	                  U R                  U5      SU5      nU R	                  U R                  U5      SU5      nOUby  U R	                  U R                  U5      SU5      nU R	                  U R                  U5      SU5      n[        R                  " US   U/SS9n[        R                  " US   U/SS9nODU R	                  U R                  U5      SU5      nU R	                  U R                  U5      SU5      nU R                  (       a  X4nXR                  -  SU R                  4nU R	                  XU5      R                  " U6 nUR                  " U6 nUR                  " U6 nUR                  S5      n[        R                  " XR                  SS5      5      nUR                  5       XR                  -  X4:w  a-  [!        SXR                  -  X4 SUR                  5        35      eUbv  UR                  5       USX4:w  a"  [!        S	USX4 SUR                  5        35      eUR                  XR                  X5      U-   nUR                  XR                  -  X5      n["        R$                  R'                  USS9nUb  UR                  5       U R                  4:w  a*  [!        S
U R                  4 SUR                  5        35      eUR                  SSSS5      UR                  XR                  X5      -  nUR                  XR                  -  X5      nU(       a;  UR                  XR                  X5      nUR                  XR                  -  X5      nOSn["        R$                  R)                  UU R(                  U R*                  S9n[        R                  " UU5      nUR                  5       XR                  -  XR                  4:w  a7  [!        SXR                  -  XR                  4 SUR                  5        35      eUR                  XR                  XR                  5      nUR                  SS5      nUR                  XU R,                  5      nU R/                  U5      nUUU4$ )z#Input shape: Batch x Time x ChannelNr   r4   r   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )sizer)   r%   shaper8   r'   r(   torchcatr   r   r#   r5   reshapebmmr6   r$   r   
functionalsoftmaxr   rF   r   r*   )r+   r:   r;   r<   r=   r>   r?   is_cross_attentionr2   tgt_len_query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                       r-   forwardPatchTSTAttention.forwardI   s    .T9',,.a {{=1DLL@ *q!''*.>.D.DQ.GG (*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BJJ 99nQ&7%FANL T[[%?SIJ;;t{{='A2sKL?? )7NNN*B>
{{<#>CCZP''4
#++Z8//!$yy/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(* 
 %""$a(BB 7a8R7SS\]k]p]p]r\st  (,,S..'SVddL',,S>>-A7TL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfm?wwL',,S>>-A7TL
 %1$5$5c>>7$\!055cNN6JG]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2C..4H'S`S`3a2b c$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK01>AAr/   )r   r   r   r#   r   r   r'   r   r*   r)   r%   r(   )        FTFN)NNNNF)__name__
__module____qualname____firstlineno____doc__intfloatboolr   r   r"   rI   Tensorr8   r   r[   __static_attributes____classcell__r,   s   @r-   r   r   $   sZ   G  +/CC C 	C
 C C C (C C>eU\\ eC ec e 488<1526"'vB||vB #5<<0vB !u||!45	vB
 !.vB "%,,/vB  vB 
u||Xell3XeELL>Q5RR	SvB vBr/   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSTBatchNorm   zH
Compute batch normalization over the sequence length (time) dimension.
r   c                 ~   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        g )Neps)r!   r"   r   BatchNorm1dd_modelnorm_eps	batchnormr+   r   r,   s     r-   r"   PatchTSTBatchNorm.__init__   s(    FOOLr/   inputsc                 l    UR                  SS5      nU R                  U5      nUR                  SS5      $ )z
Parameters:
    inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
        input for Batch norm calculation
Returns:
    `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
r   r4   )r6   rs   )r+   rv   outputs      r-   r[   PatchTSTBatchNorm.forward   s7     !!!Q''1%%r/   )rs   r^   r_   r`   ra   rb   r   r"   rI   rf   r[   rg   rh   ri   s   @r-   rk   rk      s+    M~ M
&ell 
& 
&r/   rk   rv   
mask_ratiounmasked_channel_indiceschannel_consistent_masking
mask_valuec                    US:  d  US:  a  [        SU S35      eU R                  u  pVpxU R                  n	[        USU-
  -  5      n
U(       a*  [        R
                  " USXyS9nUR                  SUS5      nO[        R
                  " XVXyS9n[        R                  " XVXyS9nSUSS2SS2SU
24'   [        R                  " USS9n[        R                  " USS9n[        R                  " USUS	9nUR                  S5      R                  SSSU5      nUb  SUSS2USS2SS24'   U R                  UR                  5       U5      nXS
   4$ )a  random_masking: Mask the input considering the control variables.

Args:
    inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
        The input tensor to mask.
    mask_ratio (`float`):
        Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
    unmasked_channel_indices (list, *optional*):
        Indices of channels that will not be masked.
    channel_consistent_masking (bool, *optional*, defaults to `False`):
        When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
        across channels.
    mask_value (int, *optional*, defaults to 0):
        Define the value of masked patches for pretraining.

Returns:
    `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
    n]
r   r   zMask ratio z has to be between 0 and 1.deviceNrB   rC   )rD   index.r   )r$   rH   r   rc   rI   randrepeatonesargsortgather	unsqueezemasked_fillre   )rv   r{   r|   r}   r~   
batch_sizenum_channelssequence_lengthnum_featuresr   len_keepnoisemaskids_shuffleids_restoreinputs_masks                   r-   random_maskingr      sA   4 A~q;zl2MNOO>Dll;Jo]]F?a*n56H!

:q/IQa0 

:_T ::jODDAyy --2.K--4K<<"K8D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$r/   num_forecast_mask_patchesc                 P   [        U[        5      (       a  U/nU Vs/ s H  nSPM     nnU R                  u  pgp[        R                  " XgXR
                  S9n
/ nSn[        U5      n[        X5       HG  u  pUS::  d  X:  a  [        SU S35      e[        Xo-  U-  5      nUR                  XU/5        UU-  nMI     [        US S9nX:  a  US   S   Xl-
  -   US   S'   OX:  a  US	   S   X-
  -   US	   S'   SnU H  u  nnnUU-   nSU
UU2S
S
2U* S
24'   UnM     [        R                  " U
R                  S   5      nU
U   n
U
R                  S	5      R                  SSSU	5      n
Ub  SU
S
S
2US
S
2S
S
24'   U R                  U
R                  5       U5      nUU
S   4$ s  snf )ai  Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

Parameters:
    inputs (`torch.Tensor`):
        Input of shape `(bs, num_channels, num_patch, patch_length)`
    num_forecast_mask_patches (`list`):
        Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
    unmasked_channel_indices (`list`, *optional*):
        Indices of channels that are not masked.
    mask_value (`int`, *optional*, defaults to 0):
        Values in the masked patches will be filled by `mask_value`.

Returns:
    `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
    num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
r   r   r   znum_forecast_mask_patches z6 should be greater than 0 and less than total patches.c                     U S   $ )Nr4    )xs    r-   <lambda>"forecast_masking.<locals>.<lambda>>  s    !A$r/   )keyr4   rB   Nr   )
isinstancerc   rH   rI   zerosr   sumzipr$   appendsortedrandpermr   r   r   re   )rv   r   r|   r~   rQ   forecast_mask_ratiosr   r   r   r   r   t_listtotal_lengthtotal_ratiopatch_lengthratiotemp_lenbatch1	patch_lenbatch2permr   s                         r-   forecast_maskingr     s   0 +S11%>$?!'@A'@!A'@A>Dll;Jo;;zWDFL*+K"#<S1 ?,\N:pq  z)K78|H56   T F/F ay|z'@Aq	!		"r
1)BCr
1F"(	1h("./VF]A	z{*+ #)
 >>$**Q-(D:D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$O Bs   F#c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSTPatchifyiV  z
A class to patchify the time series sequence into different patches

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  ::  a&  [        SU R                   SU R                   S35      e[        U R                  U R                  5      U R                  -
  U R
                  -  S-   U l        U R                  U R
                  U R                  S-
  -  -   nU R                  U-
  U l	        g )NzSequence length (z+) has to be greater than the patch length ()r   )
r!   r"   context_lengthr   r   patch_strider$   maxnum_patchessequence_start)r+   r   new_sequence_lengthr,   s      r-   r"   PatchTSTPatchify.__init__^  s    %44"//"//4#4#44#D$8$8#99deievevdwwxy 
   4 4d6G6GH4K\K\\aeararruvv"//$2C2CtGWGWZ[G[2\\"225HHr/   past_valuesc                 4   UR                   S   nX R                  :w  a  [        SU SU R                   S35      eUSS2U R                  S2SS24   nUR	                  SU R
                  U R                  S9nUR                  SS5      R                  5       nU$ )z
Parameters:
    past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
        Input for patchification

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
zInput sequence length (z%) doesn't match model configuration (r   N)	dimensionrG   step)	rH   r   r$   r   unfoldr   r   r6   r7   )r+   r   r   rx   s       r-   r[   PatchTSTPatchify.forwardo  s     &++B/222)/)::_`d`t`t_uuwx  Q 3 3 5q89$2C2C$J[J[\!!"b)446r/   )r   r   r   r   r   rz   ri   s   @r-   r   r   V  s+    I~ I"5<<  r/   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSTMaskingi  al  
Class to perform random or forecast masking.

Parameters:
    config (`PatchTSTConfig`): model config
Returns:
    x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
        Masked patched input
    mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
        Bool tensor indicating True on masked points
r   c                 >  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        U R                  b  [        U R                  5      U l        g g N)	r!   r"   random_mask_ratior}   	mask_typer   r|   r~   r   rt   s     r-   r"   PatchTSTMasking.__init__  s    !'!9!9*0*K*K')))/)I)I&(.(G(G% ++((4,243P3P,QD) 5r/   patch_inputc                 d   U R                   S:X  a8  [        UU R                  U R                  U R                  U R
                  S9u  p#OVU R                   S:X  a-  [        UU R                  U R                  U R
                  S9u  p#O[        SU R                    S35      eUR                  5       nX#4$ )a  
Parameters:
    patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
        Patch input

Return:
    masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
        Masked patched input
    mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
        Bool tensor indicating True on masked points

random)rv   r{   r|   r}   r~   forecast)rv   r   r|   r~   zInvalid mask type .)
r   r   r   r|   r}   r~   r   r   r$   re   )r+   r   masked_inputr   s       r-   r[   PatchTSTMasking.forward  s     >>X%!/"11)-)F)F+/+J+J??"L$ ^^z)!1"*.*H*H)-)F)F??	"L$ 1$..1ACDD yy{!!r/   )r}   r   r~   r   r   r|   rz   ri   s   @r-   r   r     s+    
	R~ 	R!"5<< !" !"r/   r   c                   d   ^  \ rS rSrSrS\4U 4S jjrS	S\R                  S\	\
   4S jjrSrU =r$ )
PatchTSTEncoderLayeri  z
PatchTST encoder layer
r   c                 &  > [         TU ]  5         UR                  U l        [        UR                  UR
                  UR                  S9U l        UR                  S:  a   [        R                  " UR                  5      O[        R                  " 5       U l        UR                  S:X  a  [        U5      U l        OWUR                  S:X  a/  [        R                   " UR                  UR"                  S9U l        O[%        UR                   S35      eU R                  (       a  UR                  S:  a   [        R                  " UR                  5      O[        R                  " 5       U l        UR                  S:X  a  [        U5      U l        OWUR                  S:X  a/  [        R                   " UR                  UR"                  S9U l        O[%        UR                   S35      e[        R*                  " [        R,                  " UR                  UR.                  UR0                  S9[2        UR4                     " 5       UR6                  S:  a   [        R                  " UR6                  5      O[        R                  " 5       [        R,                  " UR.                  UR                  UR0                  S95      U l        UR                  S:  a   [        R                  " UR                  5      O[        R                  " 5       U l        UR                  S:X  a  [        U5      U l        OWUR                  S:X  a/  [        R                   " UR                  UR"                  S9U l        O[%        UR                   S35      eUR>                  U l        g )N)r   r   r   r   rs   	layernormrn   z$ is not a supported norm layer type.r    ) r!   r"   channel_attentionr   rq   num_attention_headsattention_dropout	self_attnpath_dropoutr   DropoutIdentitydropout_path1	norm_typerk   norm_sublayer1	LayerNormrr   r$   dropout_path2norm_sublayer2
Sequentialr&   ffn_dimr   r	   activation_function
ff_dropoutffdropout_path3norm_sublayer3pre_normrt   s     r-   r"   PatchTSTEncoderLayer.__init__  s   !'!9!9*nn00,,
 AG@S@SVW@WRZZ(;(;<]_]h]h]j{*"3F";D,"$,,v~~6??"SD 0 011UVWW !!DJDWDWZ[D[F,?,?!@acalalanD;.&7&?#!![0&(ll6>>v&W# F$4$4#55Y!Z[[ --IIfnnfnn6;;GF../1-3->->-BBJJv(()IIfnnfnn6;;G	
 AG@S@SVW@WRZZ(;(;<]_]h]h]j{*"3F";D,"$,,v~~6??"SD 0 011UVWWr/   hidden_stater?   c                    UR                   u  p4pVUR                  X4-  XV5      nU R                  (       a6  U R                  U R	                  U5      US9u  pxn	XR                  U5      -   nO4U R                  XS9u  pxn	U R	                  XR                  U5      -   5      nUR                  X4XV5      nU R                  (       a  UR                  SS5      R                  5       nUR                  X5-  XF5      nU R                  (       a6  U R                  U R                  U5      US9u  pzn	XR                  U5      -   nO4U R                  XS9u  pzn	U R                  XR                  U5      -   5      nUR                  X5XF5      nUR                  SS5      R                  5       nUR                  X4-  XV5      nU R                  (       a2  XR                  U R                  U R                  U5      5      5      -   nO1U R                  XR                  U R                  U5      5      -   5      nUR                  X4XV5      nU4nU(       a  XR                  (       a  UW
4OU4-  nU$ )ao  
Parameters:
    hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*):
        Past values of the time series
    output_attentions (`bool`, *optional*):
        Whether or not to return the output attention of all layers
Return:
    `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`

)r:   r?   r4   r   )rH   r5   r   r   r   r   rK   r   r6   r7   r   r   r   r   r   )r+   r   r?   r   num_input_channelsr   rq   rZ   rW   rQ   channel_attn_weightsoutputss               r-   r[   PatchTSTEncoderLayer.forward  si    DPCUCU@
 $(()H/c==+/>>"11,?Sd ,: ,(Kq (*<*<[*IIL ,0>>* ,: ,(Kq  ..|>P>PQ\>]/]^L $++JOe !!'11!Q7BBDL',,Z-IK]gL}}7;~~"&"5"5l"CWh 8F 841  ,.@.@.MM 8<~~". 8F 841  $22<BTBTU`Ba3ab (//
M_iL'11!Q7BBDL $(()H/c== (*<*<TWWTEXEXYeEf=g*hhL  ..|>P>PQUQXQXYeQf>g/ghL $++JOe/?U?U&:;\h[jjGr/   )
r   r   r   r   r   r   r   r   r   r   r   )r^   r_   r`   ra   rb   r   r"   rI   rf   r   re   r[   rg   rh   ri   s   @r-   r   r     s9    /(~ /(bQELL QXd^ Q Qr/   r   c                   4    \ rS rSr\rSrSrSrS r	S	S jr
Srg)
PatchTSTPreTrainedModeliL  modelr   Fc                 *   [        U[        5      (       a  U R                  R                  (       a(  [        R
                  R                  UR                  SS9  U R                  R                  S:X  a*  [        R
                  R                  UR                  SSS9  gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[         5      (       a^  UR"                  R                  R                  R                  5         UR"                  R                  R                  R                  S5        g[        U[        R$                  [        R&                  45      (       ak  UR                  R                  R                  SU R                  R(                  S9  UR                  b%  UR                  R                  R                  5         ggg)	z
Initialize weights
g{Gz?)stdr   r]   g?)meanr         ?N)r   PatchTSTPositionalEncodingr   use_cls_tokenr   initnormal_	cls_tokenpositional_encoding_typeposition_encr   r   datazero_weightfill_rk   rs   r&   Conv1dinit_std)r+   modules     r-   _init_weights%PatchTSTPreTrainedModel._init_weightsS  sc    f899{{(( 0 0d;{{33x? 3 3#3G @--KK""$MM$$S) 122!!&&,,.##((..s3BII 677MM&&CT[[5I5I&J{{&  &&( ' 8r/   c                 <    [        U[        5      (       a  X!l        g g r   )r   PatchTSTEncodergradient_checkpointing)r+   r  values      r-   _set_gradient_checkpointing3PatchTSTPreTrainedModel._set_gradient_checkpointingi  s    f00,1) 1r/   r   N)F)r^   r_   r`   ra   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr  r  rg   r   r/   r-   r   r   L  s"    !L#O&+#),2r/   r   c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )PatchTSTEmbeddingin  r   c                   > [         TU ]  5         UR                  U l        UR                  U l        U R                  (       a1  [        R
                  " UR                  UR                  5      U l        g [        R                  " 5       U l        [        UR                  5       HG  nU R                  R                  [        R
                  " UR                  UR                  5      5        MI     g r   )r!   r"   r   share_embeddingr   r&   r   rq   input_embedding
ModuleListranger   )r+   r   rQ   r,   s      r-   r"   PatchTSTEmbedding.__init__o  s    "(";";%55#%99V-@-@&..#QD #%==?D 6445$$++BIIf6I6I6>>,Z[ 6r/   r   c                 j   UR                   S   nX R                  :w  a  [        SU R                   SU S35      eU R                  (       a  U R	                  U5      nU$ [        U5       Vs/ s H$  o@R                  U   " USS2USS2SS24   5      PM&     nn[        R                  " USS9nU$ s  snf )z
Parameters:
    patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
        Patch input for embedding
return:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)`
r   z&The defined number of input channels (zQ) in the config has to be the same as the number of channels in the batch input (r   NrC   )rH   r   r$   r  r  r  rI   stack)r+   r   r   
embeddingsis        r-   r[   PatchTSTEmbedding.forward{  s     )..q1!8!8889P9P8Q RTTfSgghj  --k:J  UZZlTmnTmq..q1+aAqj2IJTmJnZQ7J os   ,+B0)r  r   r  r^   r_   r`   ra   r   r"   rI   rf   r[   rg   rh   ri   s   @r-   r  r  n  s&    
\~ 
\5<<  r/   r  c                      ^  \ rS rSrSrS\S\4U 4S jjr\S\S\S\	R                  4S j5       rS\R                  4S	 jrS
rU =r$ )r   i  z
Class for positional encoding
r   r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  (       aA  [        R
                  " [        R                  " SSSUR                  5      5      U l	        US-  nU R                  X5      U l        UR                  S:  a&  [        R                  " UR                  5      U l        g [        R                  " 5       U l        g )Nr   r   )r!   r"   r   r   r   	ParameterrI   r   rq   r   _init_per   positional_dropoutr   r   r+   r   r   r,   s      r-   r"   #PatchTSTPositionalEncoding.__init__  s    #11"(";";\\%++aAv~~*NODN1K MM&> 6<5N5NQR5RBJJv001 	XZXcXcXe 	r/   r@   c                 $   U R                   S:X  a5  [        R                  " [        R                  " XR
                  5      SS9nU$ U R                   S:X  Ga#  [        R                  " XR
                  5      n[        R                  " SU5      R                  S5      n[        R                  " [        R                  " SU R
                  S5      [        R                  " S5      U R
                  -  * -  5      n[        R                  " X4-  5      US S 2SS S24'   [        R                  " X4-  5      US S 2SS S24'   X"R                  5       -
  nX"R                  5       S	-  -  n[        R                  " US
S9nU$ [!        U R                    S35      e)Nr   Trequires_gradsincosr   r   r4   g     @
   FzN is not a valid positional encoder. Available types are 'random' and 'sincos'.)r   r   r#  rI   randnrq   r   aranger   expmathlogsincosr   r   r$   )r   r   r   positiondiv_terms        r-   r$  #PatchTSTPositionalEncoding._init_pe  sX    **h6<<K(P`deL  ,,8 ;;{NNCL||A{3==a@Hyya!CQXHY\b\j\jHjFk!klH$)IIh.A$BLADqD!$)IIh.A$BLADqD!'*;*;*==L'+;+;+=+BCL<<EJL
  223  4B  C r/   r   c                 x   U R                   (       a  U R                  XR                  SS 2S S 24   -   5      nU R                  U R                  S S2S S 24   -   nUR	                  UR
                  S   U R                  SS5      n[        R                  " X14SS9nU$ U R                  XR                  -   5      nU$ )Nr   r   rB   r4   rC   )	r   r%  r   r   expandrH   r   rI   rJ   )r+   r   r   
cls_tokensr   s        r-   r[   "PatchTSTPositionalEncoding.forward  s    11+@Q@QRSRTVWRW@X2XYK):):2A2q5)AAI"))+*;*;A*>@W@WY[]_`J 99j%>AFL   22;ARAR3RSLr/   )r   r   r   r%  r   )r^   r_   r`   ra   rb   r   rc   r"   staticmethodr   r#  r$  rI   rf   r[   rg   rh   ri   s   @r-   r   r     s]    
~ 
C 
  c bll  &5<<  r/   r   c            	       z   ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\
\   S\
\   S	\4S
 jjrSrU =r$ )r
  i  z
PatchTST Encoder
r   r   c                 ,  > [         TU ]  U5        SU l        [        U5      U l        [        X5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        U R                  5         g s  snf )NF)r!   r"   r  r  embedderr   positional_encoderr   r  r  num_hidden_layersr   layers	post_init)r+   r   r   r  r,   s       r-   r"   PatchTSTEncoder.__init__  sx     &+# *&1"<V"Qmm5QWQiQiKj$kKja%9&%AKj$kl 	 %ls   Br   output_hidden_statesr?   r@   c                 h   Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  U5      nU R	                  U5      nU(       a  SOSnU(       a  SOSnU R
                   H+  nU(       a  XT4-   nU" XCS9nUS   nU(       d  M#  XhS   4-   nM-     [        XEUS9$ )ar  
Parameters:
    patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
        Past values of the time series
    output_hidden_states (bool, optional): Indicates if hidden states should be outputted.
    output_attentions (bool, optional): Indicates if attentions should be outputted.

return:
    `BaseModelOutput`
Nr   )r   r?   r   r   )last_hidden_stater:   
attentions)r   r?   rD  r>  r?  rA  r
   )	r+   r   rD  r?   r   encoder_statesall_attentionsencoder_layerlayer_outputss	            r-   r[   PatchTSTEncoder.forward  s      2C1N-TXT_T_TqTq$8$D $++JjJj 	
 mmK0..{;30d![[M#!//!A)|iM )+L  !/3C2E!E ) hvwwr/   )r>  r  rA  r?  NN)r^   r_   r`   ra   rb   r   rc   r"   rI   rf   r   re   r
   r[   rg   rh   ri   s   @r-   r
  r
    se    ~ C " 04,0	(x\\(x 'tn(x $D>	(x
 
(x (xr/   r
  c                   >   \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   Sr\\R                     \	S
'   Srg)PatchTSTModelOutputi  a  
Base class for model's outputs, with potential hidden states.

Parameters:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of
        the model at the output of each layer plus the optional initial embedding outputs.
    mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`, *optional*)
        Bool masked tensor indicating which patches are masked
    loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
        Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
        Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
        Patched input to the Transformer
NrF  r:   rG  r   locscaler   r   )r^   r_   r`   ra   rb   rF  r   rI   FloatTensor__annotations__r:   r   rG  r   rP  rQ  r   rg   r   r/   r-   rO  rO    s    ( 6:x 1 1298<M8E%"3"345<59Ju00129(,D(5$$
%,'+C%##	$+)-E8E%%&-/3K%++,3r/   rO  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	PatchTSTForPretrainingOutputi'  a  
Output type of [`PatchTSTForPretraining`].

Parameters:
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        MSE loss.
    prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction outputs of the time series modeling heads.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nlossprediction_outputr:   rG  r   )r^   r_   r`   ra   rb   rV  r   rI   rR  rS  rW  r:   r   rG  rg   r   r/   r-   rU  rU  '  sh    * )-D(5$$
%,59x 1 1298<M8E%"3"345<59Ju00129r/   rU  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	PatchTSTForRegressionOutputiD  a  
Output type of [`PatchTSTForRegression`].

Parameters:
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        MSE loss.
    regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
        Regression outputs of the time series modeling heads.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
NrV  regression_outputsr:   rG  r   )r^   r_   r`   ra   rb   rV  r   rI   rR  rS  rZ  r:   r   rG  rg   r   r/   r-   rY  rY  D  sh    * )-D(5$$
%,6:!2!23:8<M8E%"3"345<59Ju00129r/   rY  c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   S
rg)PatchTSTForPredictionOutputia  a  
Output type of [`PatchTSTForPrediction`].

Parameters:
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        MSE loss.
    prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, -1)`):
        Prediction outputs of the time series modeling heads.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
        Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
        Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
NrV  prediction_outputsr:   rG  rP  rQ  r   )r^   r_   r`   ra   rb   rV  r   rI   rR  rS  r]  r:   r   rG  rP  rQ  rg   r   r/   r-   r\  r\  a  s    2 )-D(5$$
%,6:!2!23:8<M8E%"3"345<59Ju00129'+C%##	$+)-E8E%%&-r/   r\  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	PatchTSTForClassificationOutputi  a  
Output type of [`PatchTSTForClassification`].

Parameters:
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
        Prediction scores of the PatchTST modeling head (scores before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
NrV  prediction_logitsr:   rG  r   )r^   r_   r`   ra   rb   rV  r   rI   rR  rS  r`  r:   r   rG  rg   r   r/   r-   r_  r_    sh    , )-D(5$$
%,59x 1 1298<M8E%"3"345<59Ju00129r/   r_  c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)SamplePatchTSTOutputi  a	  
Base class for time series model's predictions outputs that contains the sampled values from the chosen
distribution.

Parameters:
    sequences `(batch_size, num_samples, prediction_length, num_targets)`):
            Sampled values from the chosen distribution.
N	sequencesr   )r^   r_   r`   ra   rb   rc  r   rI   rR  rS  rg   r   r/   r-   rb  rb    s     .2Ix))*1r/   rb  inputtargetr@   c                 &    U R                  U5      * $ )z[
Computes the negative log likelihood loss from input distribution with respect to target.
)log_prob)rd  re  s     r-   nllrh    s     NN6"""r/   input_tensorweightsc                 R   Ub  [         R                  " US:g  X-  [         R                  " U 5      5      n[         R                  " U(       a  UR	                  US9OUR	                  5       SS9nU(       a  UR	                  US9U-  $ UR	                  5       U-  $ U R                  US9$ )a:  
Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

Args:
    input_tensor (`torch.FloatTensor`):
        Input tensor, of which the average must be computed.
    weights (`torch.FloatTensor`, *optional*):
        Weights tensor, of the same shape as `input_tensor`.
    dim (`int`, *optional*):
        The dim along which to average `input_tensor`.

Returns:
    `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
r   rC   r   min)rI   where
zeros_likeclampr   r   )ri  rj  rD   weighted_tensorsum_weightss        r-   weighted_averagers    s      ++glL4JEL\L\]iLjkkk#'++#+"67;;=VYZ03###,R]]]9L9L9NR]]]  S ))r/   c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	\R                  \R                  \R                  4   4S jr
S	rU =r$ )
PatchTSTStdScaleri  z
Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
subtracting from the mean and dividing by the standard deviation.
r   c                   > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  OSU l        [        US5      (       a  UR                  U l        g SU l        g )Nscaling_dimr   keepdimTminimum_scalegh㈵>)r!   r"   hasattrrw  rD   rx  ry  rt   s     r-   r"   PatchTSTStdScaler.__init__  sd    )0)G)G6%%Q)0)C)Cv~~5<V_5U5UV11[_r/   r   observed_indicatorr@   c                 r   UR                  U R                  U R                  S9nUR                  S5      nX-  R                  U R                  U R                  S9U-  nX-
  U-  S-  R                  U R                  U R                  S9U-  n[        R
                  " XPR                  -   5      nX-
  U-  XF4$ )  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        input for Batch norm calculation
    observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Calculating the scale on the observed indicator.
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, num_input_channels)`)
rx  r   r4   )r   rD   rx  	clamp_minrI   sqrtry  )r+   r   r|  denominatorrP  variancerQ  s          r-   r[   PatchTSTStdScaler.forward  s     ),,TXXt||,L!++C0(--dhh-MP[[j$661<AA$((TXT`T`Aadoo

8&8&889
e#S//r/   )rD   rx  ry  r^   r_   r`   ra   rb   r   r"   rI   rf   r   r[   rg   rh   ri   s   @r-   ru  ru    sX    
`~ `0LL06;ll0	u||U\\5<<7	80 0r/   ru  c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	\R                  \R                  \R                  4   4S jr
S	rU =r$ )
PatchTSTMeanScaleri  z~
Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
accordingly.
r   c                 N  > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  OSU l        [        US5      (       a  UR                  OSU l        [        US5      (       a  UR                  U l        g S U l        g )Nrw  r   rx  Try  绽|=default_scale)r!   r"   rz  rw  rD   rx  ry  r  rt   s     r-   r"   PatchTSTMeanScaler.__init__  s    )0)G)G6%%Q)0)C)Cv~~5<V_5U5UV11[`5<V_5U5UV11[_r/   r   r|  r@   c                    X-  R                  5       R                  U R                  SS9nUR                  U R                  SS9nU[        R                  " USS9-  nU R
                  cL  UR                  SS9n[        R                  " UR                  S5      SS9n[        R                  " Xg-  5      nO#U R
                  [        R                  " U5      -  n[        R                  " US:  XX5      n[        R                  " XPR                  S9nX-  n	U R                  (       d  UR                  U R                  S9nU	[        R                  " U5      U4$ )r~  Tr  r   rl  r   rC   )absr   rD   rI   rp  r  squeeze	ones_likern  ry  rx  ro  )
r+   r   r|  ts_sumnum_observedrQ  	batch_sumbatch_observationsr  scaled_datas
             r-   r[   PatchTSTMeanScaler.forward  s"    +00266txx6N)--dhh-E\q99 %

q
)I!&\-=-=a-@a!H!MM)*HIM ..1GGM L1,eC E'9'9:l||MMdhhM/EE,,U3U::r/   )r  rD   rx  ry  r  ri   s   @r-   r  r    sX    
`~ `&;LL&;6;ll&;	u||U\\5<<7	8&; &;r/   r  c            
          ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\
\R                  \R                  \R                  4   4S jjrS	rU =r$ )PatchTSTNOPScaleri-  zt
Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
r   c                    > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  U l        g SU l        g )Nrw  r   rx  T)r!   r"   rz  rw  rD   rx  rt   s     r-   r"   PatchTSTNOPScaler.__init__2  sF    )0)G)G6%%Q)0)C)Cv~~r/   r   r|  r@   c                     [         R                  " USS9R                  U R                  U R                  S9n[         R
                  " USS9R                  U R                  U R                  S9nXU4$ )aP  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        input for Batch norm calculation
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, num_input_channels)`)
Fr)  rD   rx  )rI   r  r   rD   rx  ro  )r+   r   r|  rQ  rP  s        r-   r[   PatchTSTNOPScaler.forward7  sg     E:??DHHVZVbVb?ct59>>488UYUaUa>b%r/   r  r   )r^   r_   r`   ra   rb   r   r"   rI   rf   r   r   r[   rg   rh   ri   s   @r-   r  r  -  sd    N~ N PT LL 6>u||6L 	u||U\\5<<7	8   r/   r  c            	          ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\\R                  \R                  \R                  4   4S jr	Sr
U =r$ )	PatchTSTScaleriH  r   c                    > [         TU ]  5         UR                  S:X  d  UR                  SL a  [        U5      U l        g UR                  S:X  a  [        U5      U l        g [        U5      U l        g )Nr   Tr   )r!   r"   r%   r  scalerru  r  rt   s     r-   r"   PatchTSTScaler.__init__I  sU    >>V#v~~'=,V4DK^^u$+F3DK+F3DKr/   r   r|  r@   c                 2    U R                  X5      u  pnXU4$ )a  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Input for scaler calculation
    observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Calculating the scale on the observed indicator.
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, um_input_channels)`)
r  )r+   r   r|  rP  rQ  s        r-   r[   PatchTSTScaler.forwardR  s"      ;;t@5%r/   r  )r^   r_   r`   ra   r   r"   rI   rf   r   r[   rg   rh   ri   s   @r-   r  r  H  sQ    4~ 4 LL 6;ll 	u||U\\5<<7	8   r/   r  c                      ^  \ rS rSrS\4U 4S jjr     SS\R                  S\\R                     S\\R                     S\\	   S\\	   S	\\	   S
\
\\4   4S jjrSrU =r$ )PatchTSTModelid  r   c                 f  > [         TU ]  U5        [        U5      U l        [	        U5      U l        UR                  U l        U R
                  R                  nU R                  (       a  [        U5      U l	        O[        R                  " 5       U l	        [        XS9U l        U R                  5         g )N)r   )r!   r"   r  r  r   
patchifierdo_mask_inputr   r   maskingr   r   r
  encoderrB  r&  s      r-   r"   PatchTSTModel.__init__f  s     $V,*62#11oo11*62DL;;=DL&vG 	r/   r   past_observed_maskfuture_valuesrD  r?   return_dictr@   c           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [        R
                  " U5      nU R                  X5      u  pxn	U R                  U5      n
U R                  (       a  U R                  U
5      u  pOU R                  U
5      SpU R                  XUS9nU(       d<  UR                  UR                  UR                  4nXXU
4-   n[        S U 5       5      $ [        UR                  UR                  UR                  UUU	U
S9$ )a  
Parameters:
    past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
        Input sequence to the model
    past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
    future_values (`torch.BoolTensor` of shape `(batch_size, prediction_length, num_input_channels)`, *optional*):
        Future target values associated with the `past_values`
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers
    output_attentions (`bool`, *optional*):
        Whether or not to return the output attention of all layers
    return_dict (`bool`, *optional*):
        Whether or not to return a `ModelOutput` instead of a plain tuple.

Returns:
    `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False)

Examples:

```python
>>> from huggingface_hub import hf_hub_download
>>> import torch
>>> from transformers import PatchTSTModel

>>> file = hf_hub_download(
...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
... )
>>> batch = torch.load(file)

>>> model = PatchTSTModel.from_pretrained("namctin/patchtst_etth1_pretrain")

>>> # during training, one provides both past and future values
>>> outputs = model(
...     past_values=batch["past_values"],
...     future_values=batch["future_values"],
... )

>>> last_hidden_state = outputs.last_hidden_state
```N)r   rD  r?   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   ).0vs     r-   	<genexpr>(PatchTSTModel.forward.<locals>.<genexpr>  s     =GqGs   	)rF  r:   rG  r   rP  rQ  r   )r   use_return_dictr?   rD  rI   r  r  r  r  r  r  rF  r:   rG  tuplerO  )r+   r   r  r  rD  r?   r  scaled_past_valuesrP  rQ  patched_valuesmasked_valuesr   encoder_outputr   s                  r-   r[   PatchTSTModel.forwardx  sK   l &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 %!&!= *.[)U& );<"&,,~">M4"&,,~">4%du & 
 %779U9UWeWpWpqGs> BBG=G===",>>(66%00&
 	
r/   )r  r  r  r  r  NNNNN)r^   r_   r`   ra   r   r"   rI   rf   r   re   r   r   rO  r[   rg   rh   ri   s   @r-   r  r  d  s    ~ * 6:04/3,0&*Z
\\Z
 %U\\2Z
  -	Z

 'tnZ
 $D>Z
 d^Z
 
u))	*Z
 Z
r/   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	PatchTSTMaskPretrainHeadi  z%
Pretraining head for mask modelling
r   c                 8  > [         TU ]  5         UR                  S:  a   [        R                  " UR                  5      O[        R
                  " 5       U l        [        R                  " UR                  UR                  5      U l
        UR                  U l        g Nr   )r!   r"   head_dropoutr   r   r   r   r&   rq   r   linearr   rt   s     r-   r"   !PatchTSTMaskPretrainHead.__init__  sh    :@:M:MPQ:Qrzz&"5"56WYWbWbWdii0C0CD#11r/   	embeddingr@   c                     U R                  U R                  U5      5      nU R                  (       a  USS2SS2SS2SS24   nU$ )a  
Parameters:
    embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
            `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
        Embedding from the model
Returns:
    `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True

Nr   )r  r   r   )r+   r  s     r-   r[    PatchTSTMaskPretrainHead.forward  s>     KKY 78	!!QA+.Ir/   )r   r  r   rz   ri   s   @r-   r  r    s4    2~ 2 %,,  r/   r  z*
    The PatchTST for pretrain model.
    )custom_introc                      ^  \ rS rSrS\4U 4S jjr    SS\R                  S\\R                     S\\	   S\\	   S\\	   S	\
\\4   4S
 jjrSrU =r$ )PatchTSTForPretrainingi  r   c                    > [         TU ]  U5        SUl        [        US9U l        [        U5      U l        U R                  5         g )NT)r   )r!   r"   r  r  r   r  headrB  rt   s     r-   r"   PatchTSTForPretraining.__init__  s<     #"&1
,V4	 	r/   r   r  rD  r?   r  r@   c                    Ub  UOU R                   R                  nU R                  UUUUSS9nU R                  UR                  5      n[
        R                  " SS9nU" XvR                  5      n	U	R                  SS9UR                  -  R                  5       UR                  R                  5       S-   -  n
UR                  nU(       d  U4USS	 -   nU
b  U
4U-   nU$ UnU$ [        XXR                  S
9$ )a  
Parameters:
    past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
        Input sequence to the model
    past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers
    output_attentions (`bool`, *optional*):
        Whether or not to return the output attention of all layers
    return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple.

Returns:
    `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
    `config.return_dict`=False)

Examples:

```python
>>> from huggingface_hub import hf_hub_download
>>> import torch
>>> from transformers import PatchTSTConfig, PatchTSTForPretraining

>>> file = hf_hub_download(
...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
... )
>>> batch = torch.load(file)

>>> # Config for random mask pretraining
>>> config = PatchTSTConfig(
...     num_input_channels=7,
...     context_length=512,
...     patch_length=12,
...     stride=12,
...     mask_type='random',
...     random_mask_ratio=0.4,
...     use_cls_token=True,
... )
>>> # Config for forecast mask pretraining
>>> config = PatchTSTConfig(
...     num_input_channels=7,
...     context_length=512,
...     patch_length=12,
...     stride=12,
...     mask_type='forecast',
...     num_forecast_mask_patches=5,
...     use_cls_token=True,
... )
>>> model = PatchTSTForPretraining(config)

>>> # during training, one provides both past and future values
>>> outputs = model(past_values=batch["past_values"])

>>> loss = outputs.loss
>>> loss.backward()
```Tr   r  rD  r?   r  none	reductionrB   rC   r  r   )rV  rW  r:   rG  )r   r  r   r  rF  r   MSELossr   r   r   r   r:   rU  rG  )r+   r   r  rD  r?   r  model_outputx_hatrV  loss_valmasked_lossrH  r   s                r-   r[   PatchTSTForPretraining.forward  s   J &1%<k$++B]B] zz#1!5/ " 
 		,889 zzF+778}}},|/@/@@EEG<K\K\K`K`KbejKjk%33ha!33G2=2I{nw.GN PWGN+^`w`w
 	
r/   r  r   )NNNN)r^   r_   r`   ra   r   r"   rI   rf   r   re   r   r   rU  r[   rg   rh   ri   s   @r-   r  r    s    ~  6:/3,0&*a
\\a
 %U\\2a
 'tn	a

 $D>a
 d^a
 
u22	3a
 a
r/   r  c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )PatchTSTClassificationHeadie  r   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        R
                  " SS9U l        UR                  S:  a   [        R                  " UR                  5      O[        R                  " 5       U l
        [        R                  " UR                  UR                  -  UR                  5      U l        g Nr   	start_dimr   )r!   r"   r   pooling_typer   Flattenflattenr  r   r   r   r&   r   rq   num_targetsr  rt   s     r-   r"   #PatchTSTClassificationHead.__init__f  s    #11"//zzA.:@:M:MPQ:Qrzz&"5"56WYWbWbWdii 9 9FNN JFL^L^_r/   r  c                 p   U R                   (       a  USS2SS2SSS24   nOcU R                  S:X  a  UR                  SS9nOCU R                  S:X  a  UR                  SS9R                  nO[        SU R                   S35      eU R                  U5      nU R                  U R                  U5      5      nU$ )	a#  
Parameters:
    embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
             `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
        Embedding from the model
Returns:
    `torch.Tensor` of shape `(bs, num_targets)`

Nr   r   r4   rC   r   pooling operator  is not implemented yet)	r   r  r   r   valuesr$   r  r  r   r+   r  pooled_embeddingrx   s       r-   r[   "PatchTSTClassificationHead.forwardn  s     (Aq!4&((~~!~4%'(}}}3::01B1B0CCZ[\\<<(89T\\*:;<r/   )r   r  r  r  r   r   ri   s   @r-   r  r  e  s&    `~ `  r/   r  z0
    The PatchTST for classification model.
    c                      ^  \ rS rSrS\4U 4S jjr\     SS\R                  S\	\R                     S\	\
   S\	\
   S\	\
   S	\	\
   S
\\\4   4S jj5       rSrU =r$ )PatchTSTForClassificationi  r   c                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        SUl        [        U5      U l        [        U5      U l        U R                  5         g )N+Setting `do_mask_input` parameter to False.F)
r!   r"   r  loggerwarningr  r   r  r  rB  rt   s     r-   r"   "PatchTSTForClassification.__init__  sT      NNHI#(F "6*
.v6	 	r/   r   target_valuesr  rD  r?   r  r@   c                 V   Ub  UOU R                   R                  nU R                  UUUUSS9nU R                  UR                  5      nSn	Ub  [
        R                  " 5       n
U
" X5      n	U(       d  U4USS -   nU	b  U	4U-   nU$ UnU$ [        U	UUR                  UR                  S9$ )a  
past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
    Input sequence to the model
target_values (`torch.Tensor`, *optional*):
    Labels associates with the `past_values`
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:

    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

Examples:

```python
>>> from transformers import PatchTSTConfig, PatchTSTForClassification

>>> # classification task with two input channel2 and 3 classes
>>> config = PatchTSTConfig(
...     num_input_channels=2,
...     num_targets=3,
...     context_length=512,
...     patch_length=12,
...     stride=12,
...     use_cls_token=True,
... )
>>> model = PatchTSTForClassification(config=config)

>>> # during inference, one only provides past values
>>> past_values = torch.randn(20, 512, 2)
>>> outputs = model(past_values=past_values)
>>> labels = outputs.prediction_logits
```NTr  r   r   )rV  r`  r:   rG  )
r   r  r   r  rF  r   CrossEntropyLossr_  r:   rG  )r+   r   r  r  rD  r?   r  r  y_hatr  rV  r   s               r-   r[   !PatchTSTForClassification.forward  s    X &1%<k$++B]B]zz#1!5/ " 
 		,889$&&(DE1Hha!33G/7/CxkG+GN JQGN.#&44#..	
 	
r/   r  r  )r^   r_   r`   ra   r   r"   r   rI   rf   r   re   r   r  r_  r[   rg   rh   ri   s   @r-   r  r    s    ~   15-1/3,0&*D
\\D
  -D
 %TN	D

 'tnD
 $D>D
 d^D
 
u55	6D
 D
r/   r  z,
    The PatchTST for regression Model.
    c                   Z   ^  \ rS rSrSS\S\4U 4S jjjrS\R                  4S jr	Sr
U =r$ )	PatchTSTPredictionHeadi  r   r   c                 H  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R
                  (       d  U R                  (       a  UR                  nOUR                  U-  nU R                  (       Gd]  [        R                  " 5       U l	        [        R                  " 5       U l
        [        R                  " 5       U l        [        U R                  5       H  nU R                  R                  [        R                  " SS95        Uc:  U R                  R                  [        R                  " XAR                   5      5        O*U R                  R                  UR#                  U5      5        U R                  R                  UR$                  S:  a   [        R&                  " UR$                  5      O[        R(                  " 5       5        M     g[        R                  " SS9U l        Uc&  [        R                  " XAR                   5      U l        OUR#                  U5      U l        UR$                  S:  a   [        R&                  " UR$                  5      O[        R(                  " 5       U l        g)z
num_patches (`int`):
    The number of patches in the input sequence.
distribution_output (`DistributionOutput`, *optional*):
    The distribution output layer for probabilistic forecasting. If None, a linear output layer is used.
r4   r  Nr   )r!   r"   share_projectionr   r   r  rq   r   r  projectionsdropoutsflattensr  r   r  r&   prediction_lengthget_parameter_projectionr  r   r   r  
projectionr   )r+   r   r   distribution_outputr#   r  r,   s         r-   r"   PatchTSTPredictionHead.__init__  s    	 & 7 7"(";";#11"// 2 2~~H~~3H$$$!}}DMMODMMMODM4223$$RZZ!%<=&.$$++BIIh@X@X,YZ $$++,?,X,XYa,bc$$H[H[^_H_RZZ0C0C%Degepepers 4 ::2DL"*"$))H6N6N"O #6"N"Nx"X>D>Q>QTU>U2::f&9&9:[][f[f[hDLr/   r  c                    U R                   (       a  USS2SS2SSS24   nOLU R                  S:X  a  UR                  SS9nO,U R                  S:X  a  UR                  SS9R                  nOUnU R
                  (       d  / n[        U R                  5       H]  nU R                  U   " USS2USS24   5      nU R                  U   " U5      nU R                  U   " U5      nUR                  U5        M_     [        R                  " USS9nO3U R                  U5      nU R                  U5      nU R!                  U5      n[#        U[$        5      (       a  [%        S U 5       5      nU$ UR'                  SS5      nU$ )	a2  
Parameters:
    embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
             `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
        Embedding from the model
Returns:
    `torch.Tensor` of shape `(bs, forecast_len, num_channels)`

Nr   r   r4   rC   r   r   c              3   D   #    U  H  oR                  S S5      v   M     g7f)r4   r   N)r6   )r  zs     r-   r  1PatchTSTPredictionHead.forward.<locals>.<genexpr>E  s     =f;;q!,,fs    )r   r  r   r   r  r  r  r   r  r  r  r   rI   r  r  r   r  r   r  r6   )r+   r  r  rx   r  s        r-   r[   PatchTSTPredictionHead.forward  sl    (Aq!4  F*#,>>a>#8 ""e+#,==Q=#7#>#>  $- $$F4223#'==#34DQ1W4M#N #'==#34D#E  $(#3#3A#67G#H ./ 4 [[Q/F  $||,<=#||,<= __%56Ffe$$=f==F  %%a+Fr/   )
r   r  r  r  r   r  r  r  r  r   r   )r^   r_   r`   ra   r   rc   r"   rI   rf   r[   rg   rh   ri   s   @r-   r  r    s5    )i~ )iC )i )iV1 1 1r/   r  z,
    The PatchTST for prediction model.
    c                     ^  \ rS rSrS\4U 4S jjr     SS\R                  S\\R                     S\\R                     S\\	   S\\	   S	\\	   S
\
\\4   4S jjr SS\R                  S\\R                     S
\4S jjrSrU =r$ )PatchTSTForPredictioniK  r   c                   > [         TU ]  U5        UR                  (       a  [        R	                  S5        SUl        [        U5      U l        UR                  S:X  a  S U l        OUR                  S:X  a  [        UR                  S9U l        OjUR                  S:X  a  [        UR                  S9U l        OAUR                  S:X  a  [        UR                  S9U l        O[        SUR                   35      e[        XR                  R                  R                   U R                  S	9U l        U R%                  5         g )
Nr  Fmse	student_trC   normalnegative_binomialUnknown distribution output )r  )r!   r"   r  r  r  r  r   rV  r  r   r   r   r   r$   r  r  r   r  rB  rt   s     r-   r"   PatchTSTForPrediction.__init__Q  s     NNHI#(F "6*
;;%'+D$))[8+9f>V>V+W(++x7+7F<T<T+U(++/BB+AfF^F^+_( #?@Z@Z?[!\]]*JJ))554KcKc
	
 	r/   r   r  r  rD  r?   r  r@   c           	         Ub  UOU R                   R                  nU R                  UUUUSS9nU R                  UR                  5      nSn	U R
                  (       a  Un
OXR                  -  UR                  -   n
Ubr  U R
                  (       aE  U R
                  R                  XR                  UR                  S9n[        X5      n	[        U	5      n	O[        R                  " SS9nU" X5      n	UR                  nUR                  nU(       d  U
4USS -   nU	b  U	4U-   nU$ UnU$ [        U	U
UR                  UR                  UUS	9$ )
a  
Parameters:
    past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
        Input sequence to the model
    past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
    future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*):
        Future target values associated with the `past_values`
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers
    output_attentions (`bool`, *optional*):
        Whether or not to return the output attention of all layers
    return_dict (`bool`, *optional*):
        Whether or not to return a `ModelOutput` instead of a plain tuple.

Returns:
    `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
    `config.return_dict`=False)

Examples:

```python
>>> from huggingface_hub import hf_hub_download
>>> import torch
>>> from transformers import PatchTSTConfig, PatchTSTForPrediction

>>> file = hf_hub_download(
...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
... )
>>> batch = torch.load(file)

>>> # Prediction task with 7 input channels and prediction length is 96
>>> model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast")

>>> # during training, one provides both past and future values
>>> outputs = model(
...     past_values=batch["past_values"],
...     future_values=batch["future_values"],
... )

>>> loss = outputs.loss
>>> loss.backward()

>>> # during inference, one only provides past values, the model outputs future values
>>> outputs = model(past_values=batch["past_values"])
>>> prediction_outputs = outputs.prediction_outputs
```NTr  rP  rQ  r   r  r   rB   )rV  r]  r:   rG  rP  rQ  )r   r  r   r  rF  r  rQ  rP  distributionrh  rs  r   r  r\  r:   rG  )r+   r   r  r  rD  r?   r  r  r  r  	y_hat_outr  rV  rP  rQ  r   s                   r-   r[   PatchTSTForPrediction.forwardn  sf   z &1%<k$++B]B] zz#1!5/ " 
 		,889##I 2 22\5E5EEI$''#77DD//|7I7I  E   |;+H5zzF3	9"" l\!B%77G/7/CxkG+GN JQGN*(&44#..
 	
r/   c                    U R                   R                  nU " USUSS9nU R                  (       aw  U R                  R                  UR                  UR
                  UR                  S9n[        U5       Vs/ s H  oeR                  5       PM     nn[        R                  " USS9nOUR                  R                  S5      n[        US9$ s  snf )a  
Generate sequences of sample predictions from a model with a probability distribution head.

Parameters:
    past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Past values of the time series that serves as context in order to predict the future.
    past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

Return:
    [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
    samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)`
    for multivariate predictions.
NF)r   r  r  rD  r  r   rC   rc  )r   num_parallel_samplesr  r  r]  rP  rQ  r  samplerI   r  r   rb  r+   r   r  r  r   r  rQ   sampless           r-   generatePatchTSTForPrediction.generate  s    0  ${{?? #1!&	
 ##33@@**7== A L 7<<P6QR6Q**,6QGRkk'q1G00::1=G#g66 Ss   7Cr  r  r   r  r   )r^   r_   r`   ra   r   r"   rI   rf   r   re   r   r   r\  r[   rb  r  rg   rh   ri   s   @r-   r  r  K  s    ~ @ 6:04/3,0&*k
\\k
 %U\\2k
  -	k

 'tnk
 $D>k
 d^k
 
u11	2k
` 6:-7\\-7 %U\\2-7 
	-7 -7r/   r  c                   Z   ^  \ rS rSrSrSS\4U 4S jjjrS\R                  4S jr	Sr
U =r$ )	PatchTSTRegressionHeadi  z
Regression head
r   c                 
  > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        X l        UR                  UR                  -  n[        R                  " SS9U l        UR                  S:  a   [        R                  " UR                  5      O[        R                  " 5       U l        Uc&  [        R                   " X1R"                  5      U l        g UR'                  U5      U l        g r  )r!   r"   output_rangey_ranger   r  r  r   rq   r   r  r  r  r   r   r   r&   r  r  r  )r+   r   r  r#   r,   s       r-   r"   PatchTSTRegressionHead.__init__  s    **#11"//#6 ,,v~~=zzA.:@:M:MPQ:Qrzz&"5"56WYWbWbWd& ii2D2DEDO1JJ8TDOr/   r  c                 @   U R                   (       a  USS2SS2SSS24   nOcU R                  S:X  a  UR                  SS9nOCU R                  S:X  a  UR                  SS9R                  nO[        SU R                   S35      eU R                  U R                  U5      5      nU R                  U5      nU R                  SL U R                  SL-  (       aF  [        R                  " U5      U R                  S	   U R                  S   -
  -  U R                  S   -   nU$ )
a!  
Parameters:
    embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
            `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
        Embedding from the model
Returns:
    `torch.Tensor` of shape `(bs, output_dim)`

Nr   r   r4   rC   r   r  r  r   )r   r  r   r   r  r$   r   r  r  r  r%  rI   sigmoidr  s       r-   r[   PatchTSTRegressionHead.forward!  s    (Aq!4&((~~!~4%'(}}}3::01B1B0CCZ[\\  <<5E(FG !12$$,T1IJ]]6*dll1oQ.OPSWS_S_`aSbbFr/   )r  r   r  r  r  r   r%  r   rz   ri   s   @r-   r"  r"    s1    U~ U U"  r/   r"  z,
    The PatchTST for regression model.
    c                     ^  \ rS rSrS\4U 4S jjr\     SS\R                  S\	\R                     S\	\R                     S\	\
   S\	\
   S	\	\
   S
\\\4   4S jj5       r SS\R                  S\	\R                     S
\4S jjrSrU =r$ )PatchTSTForRegressioniB  r   c                 H  > [         TU ]  U5        UR                  (       a  [        R	                  S5        SUl        [        U5      U l        UR                  S:X  a  S U l        OUR                  S:X  a  [        UR                  S9U l        OjUR                  S:X  a  [        UR                  S9U l        OAUR                  S:X  a  [        UR                  S9U l        O[        SUR                   35      e[        XR                  5      U l        U R!                  5         g )	Nr  Fr  r  rC   r  r  r  )r!   r"   r  r  r  r  r   rV  r  r   r  r   r   r$   r"  r  rB  rt   s     r-   r"   PatchTSTForRegression.__init__H  s      NNHI#(F "6*
;;%'+D$))[8+9f>P>P+Q(++x7+7F<N<N+O(++/BB+AfFXFX+Y( #?@Z@Z?[!\]]*63K3KL	 	r/   r   r  r  rD  r?   r  r@   c           	      b   Ub  UOU R                   R                  nU R                  UUUUSS9nU R                  UR                  5      nSn	Ub  U R
                  (       aq  U R
                  R                  U5      n
[        U Vs/ s H(  oR                  SU R                   R                  5      PM*     sn5      n[        X5      n	[        U	5      n	O[        R                  " SS9n	U	" X5      n	U(       d  U4USS -   nU	b  U	4U-   nU$ UnU$ [        U	UUR                  UR                   S	9$ s  snf )
a  
past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
    Input sequence to the model
target_values (`torch.Tensor` of shape `(bs, num_input_channels)`):
    Target values associates with the `past_values`
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:

    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
    Whether or not to return a `ModelOutput` instead of a plain tuple.

Examples:

```python
>>> from transformers import PatchTSTConfig, PatchTSTForRegression

>>> # Regression task with 6 input channels and regress 2 targets
>>> model = PatchTSTForRegression.from_pretrained("namctin/patchtst_etth1_regression")

>>> # during inference, one only provides past values, the model outputs future values
>>> past_values = torch.randn(20, 512, 6)
>>> outputs = model(past_values=past_values)
>>> regression_outputs = outputs.regression_outputs
```NTr  rB   r   r  r   r   )rV  rZ  r:   rG  )r   r  r   r  rF  r  r  r  r5   r  rh  rs  r   r  rY  r:   rG  )r+   r   r  r  rD  r?   r  r  r  rV  r  itemr   s                r-   r[   PatchTSTForRegression.forwardb  s=   J &1%<k$++B]B]zz#1!5/ " 
 		,889$''#77DDUKRWXRW$yyT[[-D-DERWXY<7'-zzF3E1ha!33G+/+;tg'GN BIGN*$&44#..	
 	
 Ys   /D,c                 h   U R                   R                  nU " USUSS9nU R                  R                  UR                  5      n[        U5       Vs/ s H  oeR                  5       PM     nn[        R                  " USS9R                  SX0R                   R                  5      n[        US9$ s  snf )a:  
Generate sequences of sample predictions from a model with a probability distribution head.

Parameters:
    past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Past values of the time series that serves as context in order to predict the future.
    past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

Return:
    [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
    samples, num_targets)`.
NF)r   r  r  rD  r   rC   rB   r  )r   r  r  r  rZ  r  r  rI   r  r5   r  rb  r  s           r-   r  PatchTSTForRegression.generate  s    .  ${{?? #1!&	
 //<<W=W=WX278L2MN2MQ&&(2MN++g1-2227K[[MdMde#g66 Os   B/r   r  r   )r^   r_   r`   ra   r   r"   r   rI   rf   r   re   r   r  rY  r[   rb  r  rg   rh   ri   s   @r-   r+  r+  B  s    ~ 4  1559/3,0&*G
\\G
  -G
 %U\\2	G

 'tnG
 $D>G
 d^G
 
u11	2G
 G
X 6:'7\\'7 %U\\2'7 
	'7 '7r/   r+  )r  r   r  r  r+  r  )NFr   r  rM  )Frb   r0  dataclassesr   typingr   r   r   rI   r   activationsr	   modeling_outputsr
   modeling_utilsr   time_series_utilsr   r   r   utilsr   r   r   configuration_patchtstr   
get_loggerr^   r  Moduler   rk   rf   rd   listre   rc   r   r   r   r   r   r   r  r   r
  rO  rU  rY  r\  r_  rb  distributionsDistributionrh  rs  ru  r  r  r  r  r  r  r  r  r  r  r"  r+  __all__r   r/   r-   <module>rA     s     ! ) )   " / - U U 9 9 2 
		H	%[B		 [B|&		 &2 04',7%LL7%7% 'tn7% !%	7%
 7%z 04	A%LLA%$T3Y/A% 'tnA% 	A%H-ryy -`9"bii 9"xG299 GT 2o 2 2B!		 !H5 5p;x- ;x| 4+ 4 4< :; : :8 :+ : :8 .+ . .D :k : :: 
2; 
2 
2#u""// # #%,, #*5<< *(5<<:P *fkfrfr *2 0		  0H3; 3;n 		  6 RYY  8 m
+ m
 m
`ryy 8 
l
4 l

l
^" "J 
T
 7 T

T
n 
]RYY ]
]@ 
x73 x7
x7v4RYY 4n 
L73 L7
L7^r/   