o
    Zhs`                    @   sx  d Z ddlZddlmZ ddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZmZ ddlmZ eeZG dd de	jZG dd de	jZ			dadej de!dee" de#de$f
ddZ%		dbdej dee"e$f dee" de$fddZ&G dd de	jZ'G dd  d e	jZ(G d!d" d"e	jZ)eG d#d$ d$eZ*G d%d& d&e	jZ+G d'd( d(e	jZ,G d)d* d*e*Z-eG d+d, d,eZ.eG d-d. d.eZ/eG d/d0 d0eZ0eG d1d2 d2eZ1eG d3d4 d4eZ2eG d5d6 d6eZ3d7ej4j5d8ej d9ej fd:d;Z6dcd<ej d=eej  d9ej fd>d?Z7G d@dA dAe	jZ8G dBdC dCe	jZ9G dDdE dEe	jZ:G dFdG dGe	jZ;eG dHdI dIe*Z<G dJdK dKe	jZ=edLdMG dNdO dOe*Z>G dPdQ dQe	jZ?edRdMG dSdT dTe*Z@edUdMG dVdW dWe	jZAedXdMG dYdZ dZe*ZBG d[d\ d\e	jZCed]dMG d^d_ d_e*ZDg d`ZEdS )dzPyTorch PatchTST model.    N)	dataclass)OptionalTupleUnion)nn   )ACT2CLS)BaseModelOutput)PreTrainedModel)NegativeBinomialOutputNormalOutputStudentTOutput)ModelOutputauto_docstringlogging   )PatchTSTConfigc                       s   e Zd ZdZ					ddededed	ed
ededee f fddZ	de
jdedefddZ					dde
jdee
j deee
j  dee
j dee
j dedee
jee
j eee
j  f fddZ  ZS )PatchTSTAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FTN	embed_dim	num_headsdropout
is_decoderbias	is_causalconfigc                    s   t    || _|| _|| _|| | _|| _| j| | jkr*td| j d| d| jd | _|| _	|| _
tj|||d| _tj|||d| _tj|||d| _tj|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).g      ࿩r   )super__init__r   r   r   head_dimr   
ValueErrorscalingr   r   r   Lineark_projv_projq_projout_proj)selfr   r   r   r   r   r   r   	__class__ ]/var/www/auris/lib/python3.10/site-packages/transformers/models/patchtst/modeling_patchtst.pyr   '   s&   



zPatchTSTAttention.__init__tensorseq_lenbszc                 C   s    | ||| j| jdd S )Nr      )viewr   r    	transpose
contiguous)r(   r-   r.   r/   r+   r+   r,   _shapeF   s    zPatchTSTAttention._shapehidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsreturnc                 C   sr  |du}|  \}}	}
| || j }|r.|dur.|d jd |jd kr.|d }|d }nZ|rE| | |d|}| | |d|}nC|durt| | |d|}| | |d|}tj|d |gdd}tj|d |gdd}n| | |d|}| | |d|}| j	r||f}|| j
 d| jf}| ||	|j| }|j| }|j| }| d}t||dd}|  || j
 |	|fkrtd|| j
 |	|f d|   |dur|  |d|	|fkrtd	|d|	|f d|   ||| j
|	|| }||| j
 |	|}tjj|dd}|durL|  | j
fkr1td
| j
f d|   |dddd||| j
|	| }||| j
 |	|}|rc||| j
|	|}||| j
 |	|}nd}tjj|| j| jd}t||}|  || j
 |	| jfkrtd|| j
 |	| jf d|   ||| j
|	| j}|dd}|||	| j}| |}|||fS )z#Input shape: Batch x Time x ChannelNr   r0   r   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )sizer&   r"   shaper4   r$   r%   torchcatr   r   r    r1   reshapeZbmmr2   r!   r   Z
functionalZsoftmaxr   r@   r   r'   )r(   r5   r6   r7   r8   r9   r:   Zis_cross_attentionr/   Ztgt_len_Zquery_statesZ
key_statesZvalue_statesZ
proj_shapeZsrc_lenattn_weightsZattn_weights_reshapedZ
attn_probsattn_outputr+   r+   r,   forwardI   s   





"

zPatchTSTAttention.forward)r   FTFN)NNNNF)__name__
__module____qualname____doc__intfloatboolr   r   r   rC   Tensorr4   r   rI   __classcell__r+   r+   r)   r,   r   $   sV    r   c                       6   e Zd ZdZdef fddZdejfddZ  Z	S )PatchTSTBatchNormzP
    Compute batch normalization over the sequence length (time) dimension.
    r   c                    s"   t    tj|j|jd| _d S )NZeps)r   r   r   ZBatchNorm1dd_modelnorm_eps	batchnormr(   r   r)   r+   r,   r      s   
zPatchTSTBatchNorm.__init__inputsc                 C   s"   | dd}| |}| ddS )a  
        Parameters:
            inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
                input for Batch norm calculation
        Returns:
            `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
        r   r0   )r2   rX   )r(   rZ   outputr+   r+   r,   rI      s   
zPatchTSTBatchNorm.forward
rJ   rK   rL   rM   r   r   rC   rQ   rI   rR   r+   r+   r)   r,   rT      s    rT   FrZ   
mask_ratiounmasked_channel_indiceschannel_consistent_masking
mask_valuec                 C   s*  |dk s|dkrt d| d| j\}}}}| j}	t|d|  }
|r5tj|d||	d}|d|d}n	tj||||	d}tj||||	d}d|ddddd|
f< tj|dd}tj|dd}tj	|d|d	}|
dddd|}|durd|dd|ddddf< | | |}||d
 fS )a  random_masking: Mask the input considering the control variables.

    Args:
        inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
            The input tensor to mask.
        mask_ratio (`float`):
            Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
        unmasked_channel_indices (list, *optional*):
            Indices of channels that will not be masked.
        channel_consistent_masking (bool, *optional*, defaults to `False`):
            When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
            across channels.
        mask_value (int, *optional*, defaults to 0):
            Define the value of masked patches for pretraining.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
        n]
    r   r   zMask ratio z has to be between 0 and 1.deviceNr<   r=   )r>   index.r   )r!   rB   rb   rN   rC   ZrandrepeatZonesZargsortgather	unsqueezemasked_fillrP   )rZ   r]   r^   r_   r`   
batch_sizenum_channelssequence_lengthnum_featuresrb   Zlen_keepnoisemaskZids_shuffleZids_restoreinputs_maskr+   r+   r,   random_masking   s&   rp   num_forecast_mask_patchesc                 C   s  t |tr|g}dd |D }| j\}}}}tj|||| jd}	g }
d}t|}t||D ](\}}|dks9||krAtd| dt|| | }|
	|||g ||7 }q-t
|
dd d	}
||k rq|
d d
 ||  |
d d
< n||kr|
d d
 ||  |
d d
< d}|
D ]\}}}|| }d|	||dd| df< |}qt|	jd }|	| }	|	dddd|}	|durd|	dd|ddddf< | |	 |}||	d fS )a  Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
    If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

    Parameters:
        inputs (`torch.Tensor`):
            Input of shape `(bs, num_channels, num_patch, patch_length)`
        num_forecast_mask_patches (`list`):
            Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
        unmasked_channel_indices (`list`, *optional*):
            Indices of channels that are not masked.
        mask_value (`int`, *optional*, defaults to 0):
            Values in the masked patches will be filled by `mask_value`.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
        num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
    c                 S   s   g | ]}d qS )r   r+   .0rF   r+   r+   r,   
<listcomp>,  s    z$forecast_masking.<locals>.<listcomp>ra   r   znum_forecast_mask_patches z6 should be greater than 0 and less than total patches.c                 S   s   | d S )Nr0   r+   )xr+   r+   r,   <lambda>>  s    z"forecast_masking.<locals>.<lambda>)keyr0   r<   r   Nrd   )
isinstancerN   rB   rC   zerosrb   sumzipr!   appendsortedZrandpermrg   re   rh   rP   )rZ   rq   r^   r`   Zforecast_mask_ratiosri   rj   rk   rl   rn   Zt_listtotal_lengthtotal_ratiopatch_lengthratioZtemp_lenZbatch1Z	patch_lenrF   Zbatch2permro   r+   r+   r,   forecast_masking  sB   


r   c                       rS   )PatchTSTPatchifyz
    A class to patchify the time series sequence into different patches

    Returns:
        `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
    r   c                    s   t    |j| _|j| _|j| _| j| jkr$td| j d| j dt| j| j| j | j d | _| j| j| jd   }| j| | _	d S )NzSequence length (z+) has to be greater than the patch length ()r   )
r   r   Zcontext_lengthrk   r   patch_strider!   maxnum_patchessequence_start)r(   r   Znew_sequence_lengthr)   r+   r,   r   ^  s   
 zPatchTSTPatchify.__init__past_valuesc                 C   sp   |j d }|| jkrtd| d| j d|dd| jdddf }|jd| j| jd}|dd }|S )a!  
        Parameters:
            past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
                Input for patchification

        Returns:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
        zInput sequence length (z%) doesn't match model configuration (r   N)	dimensionrA   step)	rB   rk   r!   r   Zunfoldr   r   r2   r3   )r(   r   rk   r[   r+   r+   r,   rI   o  s   
	
zPatchTSTPatchify.forwardr\   r+   r+   r)   r,   r   V  s    r   c                       rS   )PatchTSTMaskinga  
    Class to perform random or forecast masking.

    Parameters:
        config (`PatchTSTConfig`): model config
    Returns:
        x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
            Masked patched input
        mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
            Bool tensor indicating True on masked points
    r   c                    sX   t    |j| _|j| _|j| _|j| _|j| _|j| _| jd ur*t| j| _d S d S N)	r   r   random_mask_ratior_   	mask_typerq   r^   r`   r}   rY   r)   r+   r,   r     s   

zPatchTSTMasking.__init__patch_inputc                 C   sr   | j dkrt|| j| j| j| jd\}}n| j dkr(t|| j| j| jd\}}n	td| j  d|	 }||fS )a  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input

        Return:
            masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
                Masked patched input
            mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
                Bool tensor indicating True on masked points

        random)rZ   r]   r^   r_   r`   Zforecast)rZ   rq   r^   r`   zInvalid mask type .)
r   rp   r   r^   r_   r`   r   rq   r!   rP   )r(   r   Zmasked_inputrn   r+   r+   r,   rI     s$   

zPatchTSTMasking.forwardr\   r+   r+   r)   r,   r     s    r   c                       s@   e Zd ZdZdef fddZd
dejdee	 fdd	Z
  ZS )PatchTSTEncoderLayerz 
    PatchTST encoder layer
    r   c              
      s  t    |j| _t|j|j|jd| _|jdkrt	
|jnt	 | _|jdkr/t|| _n|jdkr?t	j|j|jd| _nt|j d| jr}|jdkrUt	
|jnt	 | _|jdkret|| _n|jdkrut	j|j|jd| _nt|j dt	t	j|j|j|jdt|j  |jdkrt	
|jnt	 t	j|j|j|jd| _|jdkrt	
|jnt	 | _|jdkrt|| _n|jdkrt	j|j|jd| _nt|j d|j| _d S )N)r   r   r   r   rX   Z	layernormrU   z$ is not a supported norm layer type.r   ) r   r   channel_attentionr   rV   Znum_attention_headsZattention_dropout	self_attnZpath_dropoutr   DropoutIdentitydropout_path1Z	norm_typerT   norm_sublayer1	LayerNormrW   r!   dropout_path2norm_sublayer2Z
Sequentialr#   Zffn_dimr   r   Zactivation_functionZ
ff_dropoutffdropout_path3norm_sublayer3pre_normrY   r)   r+   r,   r     sB   
 

 


 

zPatchTSTEncoderLayer.__init__Nhidden_stater:   c                 C   s  |j \}}}}||| ||}| jr(| j| ||d\}}}	|| | }n| j||d\}}}	| || | }|||||}| jr|dd	 }||| ||}| jrp| j| 
||d\}}
}	|| | }n| j||d\}}
}	| 
|| | }|||||}|dd	 }||| ||}| jr|| | | | }n| || | | }|||||}|f}|r|| jr||
fn|f7 }|S )a  
        Parameters:
            hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*):
                Past values of the time series
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
        Return:
            `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`

        )r5   r:   r0   r   )rB   r1   r   r   r   r   rE   r   r2   r3   r   r   r   r   r   )r(   r   r:   ri   num_input_channelsrk   rV   rH   rG   rF   Zchannel_attn_weightsoutputsr+   r+   r,   rI     sF   

zPatchTSTEncoderLayer.forwardr   )rJ   rK   rL   rM   r   r   rC   rQ   r   rP   rI   rR   r+   r+   r)   r,   r     s    "1r   c                   @   s.   e Zd ZeZdZdZdZdd Zd	ddZ	dS )
PatchTSTPreTrainedModelmodelr   Fc                 C   s   t |tr&| jjrtjj|jdd | jjdkr$tjj|j	ddd dS dS t |tj
r;|jj  |jjd dS t |trQ|jjj  |jjjd dS t |tjtjfrr|jjjd| jjd |jdurt|jj  dS dS dS )	z$
        Initialize weights
        g{Gz?)stdr   r   g?)meanr         ?N)rx   PatchTSTPositionalEncodingr   use_cls_tokenr   initZnormal_	cls_tokenpositional_encoding_typeposition_encr   r   dataZzero_weightZfill_rT   rX   r#   ZConv1dZinit_std)r(   moduler+   r+   r,   _init_weightsS  s$   


z%PatchTSTPreTrainedModel._init_weightsc                 C   s   t |tr
||_d S d S r   )rx   PatchTSTEncodergradient_checkpointing)r(   r   valuer+   r+   r,   _set_gradient_checkpointingi  s   

z3PatchTSTPreTrainedModel._set_gradient_checkpointingN)F)
rJ   rK   rL   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingr   r   r+   r+   r+   r,   r   L  s    r   c                       2   e Zd Zdef fddZdejfddZ  ZS )PatchTSTEmbeddingr   c                    sl   t    |j| _|j| _| jrt|j|j| _d S t	 | _t
|jD ]}| jt|j|j q%d S r   )r   r   r   share_embeddingr   r#   r   rV   input_embedding
ModuleListranger|   )r(   r   rF   r)   r+   r,   r   o  s   

zPatchTSTEmbedding.__init__r   c                    sj    j d }|jkrtdj d| djr  }|S  fddt|D }tj|dd}|S )a%  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input for embedding
        return:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)`
        r   z&The defined number of input channels (zQ) in the config has to be the same as the number of channels in the batch input (r   c              	      s2   g | ]}j |  d d |d d d d f qS r   )r   rs   ir   r(   r+   r,   rt     s   2 z-PatchTSTEmbedding.forward.<locals>.<listcomp>r=   )rB   r   r!   r   r   r   rC   stack)r(   r   r   Z
embeddingsr+   r   r,   rI   {  s   
	


zPatchTSTEmbedding.forward	rJ   rK   rL   r   r   rC   rQ   rI   rR   r+   r+   r)   r,   r   n  s    r   c                       sV   e Zd ZdZdedef fddZedededej	fddZ
d	ejfd
dZ  ZS )r   z'
    Class for positional encoding
    r   r   c                    sz   t    |j| _|j| _|jr!ttddd|j| _	|d7 }| 
||| _|jdkr6t|j| _d S t | _d S )Nr   r   )r   r   r   r   r   	ParameterrC   ry   rV   r   _init_per   positional_dropoutr   r   r(   r   r   r)   r+   r,   r     s   
z#PatchTSTPositionalEncoding.__init__r;   c                 C   s   | j dkrtjt|| jdd}|S | j dkrst|| j}td|d}t	td| jdt
d| j   }t|| |d d dd df< t|| |d d dd df< ||  }|| d	  }tj|d
d}|S t| j  d)Nr   TZrequires_gradZsincosr   r   r0   g     @
   FzN is not a valid positional encoder. Available types are 'random' and 'sincos'.)r   r   r   rC   ZrandnrV   ry   Zarangerg   expmathlogsincosr   r   r!   )r   r   r   positionZdiv_termr+   r+   r,   r     s    

(  
z#PatchTSTPositionalEncoding._init_per   c                 C   s   | j r8| || jdd d d f  }| j| jd dd d f  }||jd | jdd}tj||fdd}|S | || j }|S )Nr   r   r<   r0   r=   )	r   r   r   r   expandrB   r   rC   rD   )r(   r   r   Z
cls_tokensr   r+   r+   r,   rI     s    z"PatchTSTPositionalEncoding.forward)rJ   rK   rL   rM   r   rN   r   staticmethodr   r   r   rC   rQ   rI   rR   r+   r+   r)   r,   r     s    r   c                	       sT   e Zd ZdZdedef fddZ		ddejde	e
 d	e	e
 d
efddZ  ZS )r   z
    PatchTST Encoder
    r   r   c                    sT   t    d| _t | _t || _t fddt	 j
D | _|   d S )NFc                    s   g | ]}t  qS r+   )r   r   r   r+   r,   rt         z,PatchTSTEncoder.__init__.<locals>.<listcomp>)r   r   r   r   embedderr   positional_encoderr   r   r   Znum_hidden_layerslayers	post_initr   r)   r   r,   r     s   
 zPatchTSTEncoder.__init__Nr   output_hidden_statesr:   r;   c           	      C   s   |dur|n| j j}|dur|n| j j}| |}| |}|r"dnd}|r(dnd}| jD ]}|r6||f }|||d}|d }|rI||d f }q-t|||dS )a  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Past values of the time series
            output_hidden_states (bool, optional): Indicates if hidden states should be outputted.
            output_attentions (bool, optional): Indicates if attentions should be outputted.

        return:
            `BaseModelOutput`
        Nr+   )r   r:   r   r   )last_hidden_stater5   
attentions)r   r:   r   r   r   r   r	   )	r(   r   r   r:   r   encoder_statesZall_attentionsZencoder_layerZlayer_outputsr+   r+   r,   rI     s    



zPatchTSTEncoder.forwardNN)rJ   rK   rL   rM   r   rN   r   rC   rQ   r   rP   r	   rI   rR   r+   r+   r)   r,   r     s    r   c                   @   s   e Zd ZU dZdZeej ed< dZ	ee
ej  ed< dZee
ej  ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed	< dS )
PatchTSTModelOutputa  
    Base class for model's outputs, with potential hidden states.

    Parameters:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`, *optional*)
            Bool masked tensor indicating which patches are masked
        loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
            Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
        scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
            Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
        patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
            Patched input to the Transformer
    Nr   r5   r   rn   locscaler   )rJ   rK   rL   rM   r   r   rC   FloatTensor__annotations__r5   r   r   rn   r   r   r   r+   r+   r+   r,   r     s   
 r   c                   @   b   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )PatchTSTForPretrainingOutputa  
    Output type of [`PatchTSTForPretraining`].

    Parameters:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            MSE loss.
        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction outputs of the time series modeling heads.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossprediction_outputr5   r   )rJ   rK   rL   rM   r   r   rC   r   r   r   r5   r   r   r+   r+   r+   r,   r   '     
 r   c                   @   r   )PatchTSTForRegressionOutputa  
    Output type of [`PatchTSTForRegression`].

    Parameters:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            MSE loss.
        regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
            Regression outputs of the time series modeling heads.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nr   regression_outputsr5   r   )rJ   rK   rL   rM   r   r   rC   r   r   r   r5   r   r   r+   r+   r+   r,   r   D  r   r   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dZeej ed< dZeej ed< dS )	PatchTSTForPredictionOutputaR  
    Output type of [`PatchTSTForPrediction`].

    Parameters:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            MSE loss.
        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, -1)`):
            Prediction outputs of the time series modeling heads.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
            Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
        scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
            Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    Nr   prediction_outputsr5   r   r   r   )rJ   rK   rL   rM   r   r   rC   r   r   r   r5   r   r   r   r   r+   r+   r+   r,   r   a  s   
 r   c                   @   r   )PatchTSTForClassificationOutputaR  
    Output type of [`PatchTSTForClassification`].

    Parameters:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
            Prediction scores of the PatchTST modeling head (scores before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nr   prediction_logitsr5   r   )rJ   rK   rL   rM   r   r   rC   r   r   r   r5   r   r   r+   r+   r+   r,   r     s   
 r   c                   @   s$   e Zd ZU dZdZeej ed< dS )SamplePatchTSTOutputa!  
    Base class for time series model's predictions outputs that contains the sampled values from the chosen
    distribution.

    Parameters:
        sequences `(batch_size, num_samples, prediction_length, num_targets)`):
                Sampled values from the chosen distribution.
    N	sequences)	rJ   rK   rL   rM   r   r   rC   r   r   r+   r+   r+   r,   r     s   
 	r   inputtargetr;   c                 C   s   |  | S )zc
    Computes the negative log likelihood loss from input distribution with respect to target.
    )Zlog_prob)r   r   r+   r+   r,   nll  s   r   input_tensorweightsc                 C   sr   |dur3t |dk| | t | }t j|r|j|dn| dd}|r-|j|d| S | | S | j|dS )aj  
    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

    Args:
        input_tensor (`torch.FloatTensor`):
            Input tensor, of which the average must be computed.
        weights (`torch.FloatTensor`, *optional*):
            Weights tensor, of the same shape as `input_tensor`.
        dim (`int`, *optional*):
            The dim along which to average `input_tensor`.

    Returns:
        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
    Nr   r=   r   min)rC   where
zeros_likeclamprz   r   )r   r   r>   Zweighted_tensorZsum_weightsr+   r+   r,   weighted_average  s
   " r   c                	       P   e Zd ZdZdef fddZdejdejdeejejejf fdd	Z	  Z
S )
PatchTSTStdScalerz
    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
    subtracting from the mean and dividing by the standard deviation.
    r   c                    sV   t    t|dr|jnd| _t|dr|jnd| _t|dr&|j| _d S d| _d S )Nscaling_dimr   keepdimTminimum_scalegh㈵>)r   r   hasattrr   r>   r   r   rY   r)   r+   r,   r     s   
 zPatchTSTStdScaler.__init__r   observed_indicatorr;   c                 C   sz   |j | j| jd}|d}|| j | j| jd| }|| | d j | j| jd| }t|| j }|| | ||fS )C  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        r   r   r0   )rz   r>   r   Z	clamp_minrC   sqrtr   )r(   r   r  denominatorr   Zvariancer   r+   r+   r,   rI     s   
"zPatchTSTStdScaler.forwardrJ   rK   rL   rM   r   r   rC   rQ   r   rI   rR   r+   r+   r)   r,   r     s    r   c                	       r   )
PatchTSTMeanScalerz
    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
    accordingly.
    r   c                    sl   t    t|dr|jnd| _t|dr|jnd| _t|dr#|jnd| _t|dr1|j| _d S d | _d S )Nr   r   r   Tr   绽|=default_scale)r   r   r   r   r>   r   r   r	  rY   r)   r+   r,   r     s
   
 zPatchTSTMeanScaler.__init__r   r  r;   c           
      C   s   ||   j| jdd}|j| jdd}|tj|dd }| jdu r:|jdd}tj|ddd}t|| }n| jt| }t|dk||}tj|| j	d}|| }	| j
sa|j| jd}|	t||fS )r  Tr  r   r   Nr   r=   )absrz   r>   rC   r   r	  Zsqueeze	ones_liker   r   r   r   )
r(   r   r  Zts_sumZnum_observedr   Z	batch_sumZbatch_observationsr	  Zscaled_datar+   r+   r,   rI     s   
zPatchTSTMeanScaler.forwardr  r+   r+   r)   r,   r    s    r  c                
       sX   e Zd ZdZdef fddZ	ddejdeej de	ejejejf fd	d
Z
  ZS )PatchTSTNOPScalerz|
    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
    r   c                    s@   t    t|dr|jnd| _t|dr|j| _d S d| _d S )Nr   r   r   T)r   r   r   r   r>   r   rY   r)   r+   r,   r   2  s   
 zPatchTSTNOPScaler.__init__Nr   r  r;   c                 C   sB   t j|ddj| j| jd}t j|ddj| j| jd}|||fS )a  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        Fr   )r>   r   )rC   r  r   r>   r   r   )r(   r   r  r   r   r+   r+   r,   rI   7  s   
zPatchTSTNOPScaler.forwardr   )rJ   rK   rL   rM   r   r   rC   rQ   r   r   rI   rR   r+   r+   r)   r,   r  -  s    r  c                	       sL   e Zd Zdef fddZdejdejdeejejejf fddZ  Z	S )	PatchTSTScalerr   c                    sR   t    |jdks|jdu rt|| _d S |jdkr"t|| _d S t|| _d S )Nr   Tr   )r   r   r"   r  scalerr   r  rY   r)   r+   r,   r   I  s   

zPatchTSTScaler.__init__r   r  r;   c                 C   s   |  ||\}}}|||fS )a>  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Input for scaler calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, um_input_channels)`)
        )r  )r(   r   r  r   r   r+   r+   r,   rI   R  s   
zPatchTSTScaler.forward)
rJ   rK   rL   r   r   rC   rQ   r   rI   rR   r+   r+   r)   r,   r  H  s    	r  c                       sv   e Zd Zdef fddZ					ddejdeej deej dee d	ee d
ee de	e
ef fddZ  ZS )PatchTSTModelr   c                    sf   t  | t|| _t|| _|j| _| jj}| jr!t|| _	nt
 | _	t||d| _|   d S )N)r   )r   r   r  r  r   
patchifierdo_mask_inputr   r   maskingr   r   r   encoderr   r   r)   r+   r,   r   f  s   


zPatchTSTModel.__init__Nr   past_observed_maskfuture_valuesr   r:   return_dictr;   c              	   C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r't|}| ||\}}}	| |}
| jr@| 	|
\}}n| 	|
d}}| j
|||d}|sk|j|j|jf}||||	|
f }tdd |D S t|j|j|j|||	|
dS )a  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            future_values (`torch.BoolTensor` of shape `(batch_size, prediction_length, num_input_channels)`, *optional*):
                Future target values associated with the `past_values`
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTModel

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> model = PatchTSTModel.from_pretrained("namctin/patchtst_etth1_pretrain")

        >>> # during training, one provides both past and future values
        >>> outputs = model(
        ...     past_values=batch["past_values"],
        ...     future_values=batch["future_values"],
        ... )

        >>> last_hidden_state = outputs.last_hidden_state
        ```N)r   r   r:   c                 s   s    | ]	}|d ur|V  qd S r   r+   )rs   vr+   r+   r,   	<genexpr>      z(PatchTSTModel.forward.<locals>.<genexpr>)r   r5   r   rn   r   r   r   )r   use_return_dictr:   r   rC   r  r  r  r  r  r  r   r5   r   tupler   )r(   r   r  r  r   r:   r  Zscaled_past_valuesr   r   Zpatched_valuesZmasked_valuesrn   Zencoder_outputr   r+   r+   r,   rI   x  s6   6

zPatchTSTModel.forwardNNNNN)rJ   rK   rL   r   r   rC   rQ   r   rP   r   r   r   rI   rR   r+   r+   r)   r,   r  d  s,    
r  c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	PatchTSTMaskPretrainHeadz-
    Pretraining head for mask modelling
    r   c                    sH   t    |jdkrt|jnt | _t|j|j	| _
|j| _d S Nr   )r   r   head_dropoutr   r   r   r   r#   rV   r   linearr   rY   r)   r+   r,   r     s   
 z!PatchTSTMaskPretrainHead.__init__	embeddingr;   c                 C   s:   |  | |}| jr|ddddddddf }|S )a  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                            `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True

        Nr   )r   r   r   )r(   r!  r+   r+   r,   rI     s    z PatchTSTMaskPretrainHead.forwardr\   r+   r+   r)   r,   r    s    r  z*
    The PatchTST for pretrain model.
    )Zcustom_introc                       sj   e Zd Zdef fddZ				ddejdeej dee dee d	ee d
e	e
ef fddZ  ZS )PatchTSTForPretrainingr   c                    s4   t  | d|_t|d| _t|| _|   d S )NTr   )r   r   r  r  r   r  headr   rY   r)   r+   r,   r     s
   
zPatchTSTForPretraining.__init__Nr   r  r   r:   r  r;   c                 C   s   |dur|n| j j}| j||||dd}| |j}tjdd}|||j}	|	jdd|j	 
 |j	
 d  }
|j}|sU|f|d	d
  }|
durQ|
f| }|S |}|S t|
|||jdS )a	  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
            `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTConfig, PatchTSTForPretraining

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> # Config for random mask pretraining
        >>> config = PatchTSTConfig(
        ...     num_input_channels=7,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     mask_type='random',
        ...     random_mask_ratio=0.4,
        ...     use_cls_token=True,
        ... )
        >>> # Config for forecast mask pretraining
        >>> config = PatchTSTConfig(
        ...     num_input_channels=7,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     mask_type='forecast',
        ...     num_forecast_mask_patches=5,
        ...     use_cls_token=True,
        ... )
        >>> model = PatchTSTForPretraining(config)

        >>> # during training, one provides both past and future values
        >>> outputs = model(past_values=batch["past_values"])

        >>> loss = outputs.loss
        >>> loss.backward()
        ```NTr   r  r   r:   r  noneZ	reductionr<   r=   r  r   )r   r   r5   r   )r   r  r   r#  r   r   MSELossr   r   rn   rz   r5   r   r   )r(   r   r  r   r:   r  model_outputZx_hatr   loss_valZmasked_lossr   r   r+   r+   r,   rI     s,   E
$
zPatchTSTForPretraining.forward)NNNN)rJ   rK   rL   r   r   rC   rQ   r   rP   r   r   r   rI   rR   r+   r+   r)   r,   r"    s&    
r"  c                       r   )PatchTSTClassificationHeadr   c                    sd   t    |j| _|j| _tjdd| _|jdkrt|jnt	 | _
t|j|j |j| _d S Nr   Z	start_dimr   )r   r   r   pooling_typer   Flattenflattenr  r   r   r   r#   r   rV   num_targetsr   rY   r)   r+   r,   r   f  s   
 z#PatchTSTClassificationHead.__init__r!  c                 C   s   | j r|dddddddf }n"| jdkr|jdd}n| jdkr+|jddj}n	td| j d| |}| | |}|S )	a[  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                     `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, num_targets)`

        Nr   r   r0   r=   r   pooling operator  is not implemented yet)	r   r.  r   r   valuesr!   r0  r   r   r(   r!  pooled_embeddingr[   r+   r+   r,   rI   n  s   



z"PatchTSTClassificationHead.forwardr   r+   r+   r)   r,   r+  e  s    r+  z0
    The PatchTST for classification model.
    c                       sx   e Zd Zdef fddZe					ddejdeej dee	 dee	 d	ee	 d
ee	 de
eef fddZ  ZS )PatchTSTForClassificationr   c                    sB   t  | |jrtd d|_t|| _t|| _| 	  d S )N+Setting `do_mask_input` parameter to False.F)
r   r   r  loggerwarningr  r   r+  r#  r   rY   r)   r+   r,   r     s   


z"PatchTSTForClassification.__init__Nr   target_valuesr  r   r:   r  r;   c                 C   s   |dur|n| j j}| j||||dd}| |j}d}	|dur)t }
|
||}	|sC|f|dd  }|	dur?|	f| }|S |}|S t|	||j|j	dS )ac  
        past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
            Input sequence to the model
        target_values (`torch.Tensor`, *optional*):
            Labels associates with the `past_values`
        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:

            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Examples:

        ```python
        >>> from transformers import PatchTSTConfig, PatchTSTForClassification

        >>> # classification task with two input channel2 and 3 classes
        >>> config = PatchTSTConfig(
        ...     num_input_channels=2,
        ...     num_targets=3,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     use_cls_token=True,
        ... )
        >>> model = PatchTSTForClassification(config=config)

        >>> # during inference, one only provides past values
        >>> past_values = torch.randn(20, 512, 2)
        >>> outputs = model(past_values=past_values)
        >>> labels = outputs.prediction_logits
        ```NTr$  r   r   )r   r   r5   r   )
r   r  r   r#  r   r   ZCrossEntropyLossr   r5   r   )r(   r   r;  r  r   r:   r  r)  y_hatr*  r   r   r+   r+   r,   rI     s2   ,
z!PatchTSTForClassification.forwardr  )rJ   rK   rL   r   r   r   rC   rQ   r   rP   r   r  r   rI   rR   r+   r+   r)   r,   r7    s.    
r7  z,
    The PatchTST for regression Model.
    c                       s8   e Zd Zd	dedef fddZdejfddZ  Z	S )
PatchTSTPredictionHeadNr   r   c                    sD  t    |j| _|j| _|j| _|j| _| js| jr|j}n|j| }| jsvt | _	t | _
t | _t| jD ]8}| jtjdd |du rW| j	t||j n	| j	|| | j
|jdkrnt|jnt  q;dS tjdd| _|du rt||j| _n||| _|jdkrt|jnt | _dS )a  
        num_patches (`int`):
            The number of patches in the input sequence.
        distribution_output (`DistributionOutput`, *optional*):
            The distribution output layer for probabilistic forecasting. If None, a linear output layer is used.
        r0   r-  Nr   )r   r   share_projectionr   r   r.  rV   r   r   projectionsdropoutsflattensr   r|   r/  r#   prediction_lengthget_parameter_projectionr  r   r   r0  
projectionr   )r(   r   r   distribution_outputr    r   r)   r+   r,   r     s0   




($zPatchTSTPredictionHead.__init__r!  c                 C   s  | j r|dddddddf }n| jdkr|jdd}n| jdkr+|jddj}n|}| jseg }t| jD ]%}| j| |dd|ddf }| j	| |}| j
| |}|| q7tj|dd}n| |}| |}| |}t|trtdd	 |D }|S |dd}|S )
aj  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                     `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, forecast_len, num_channels)`

        Nr   r   r0   r=   r   r   c                 s   s    | ]	}| d dV  qdS )r0   r   N)r2   )rs   zr+   r+   r,   r  E  r  z1PatchTSTPredictionHead.forward.<locals>.<genexpr>)r   r.  r   r   r4  r>  r   r   rA  r@  r?  r|   rC   r   r0  r   rD  rx   r  r2   )r(   r!  r6  r[   r   r+   r+   r,   rI     s.   


 



zPatchTSTPredictionHead.forwardr   )
rJ   rK   rL   r   rN   r   rC   rQ   rI   rR   r+   r+   r)   r,   r=    s    +r=  z,
    The PatchTST for prediction model.
    c                       s   e Zd Zdef fddZ					ddejdeej deej dee d	ee d
ee de	e
ef fddZ	ddejdeej defddZ  ZS )PatchTSTForPredictionr   c                    s   t  | |jrtd d|_t|| _|jdkrd | _n/|jdkr,t	|j
d| _n"|jdkr9t|j
d| _n|jdkrFt|j
d| _ntd|j t|| jjj| jd	| _|   d S )
Nr8  Fmse	student_tr=   normalnegative_binomialUnknown distribution output )rE  )r   r   r  r9  r:  r  r   r   rE  r   rB  r   r   r!   r=  r  r   r#  r   rY   r)   r+   r,   r   Q  s$   





zPatchTSTForPrediction.__init__Nr   r  r  r   r:   r  r;   c                 C   s   |dur|n| j j}| j||||dd}| |j}d}	| jr"|}
n||j |j }
|durQ| jrF| jj||j|jd}t	||}	t
|	}	ntjdd}||
|}	|j}|j}|sq|
f|dd  }|	durm|	f| }|S |}|S t|	|
|j|j||d	S )
aV	  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*):
                Future target values associated with the `past_values`
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
            `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTConfig, PatchTSTForPrediction

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> # Prediction task with 7 input channels and prediction length is 96
        >>> model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast")

        >>> # during training, one provides both past and future values
        >>> outputs = model(
        ...     past_values=batch["past_values"],
        ...     future_values=batch["future_values"],
        ... )

        >>> loss = outputs.loss
        >>> loss.backward()

        >>> # during inference, one only provides past values, the model outputs future values
        >>> outputs = model(past_values=batch["past_values"])
        >>> prediction_outputs = outputs.prediction_outputs
        ```NTr$  r   r   r   r&  r   r<   )r   r   r5   r   r   r   )r   r  r   r#  r   rE  r   r   distributionr   r   r   r(  r   r5   r   )r(   r   r  r  r   r:   r  r)  r<  r*  Z	y_hat_outrN  r   r   r   r   r+   r+   r,   rI   n  sL   =



zPatchTSTForPrediction.forwardc                    sr   | j j}| |d|dd}| jr.| jj|j|j|jd  fddt|D }tj	|dd}n|j
d}t|d	S )
a   
        Generate sequences of sample predictions from a model with a probability distribution head.

        Parameters:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the future.
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Return:
            [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
            samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)`
            for multivariate predictions.
        NF)r   r  r  r   rM  c                       g | ]}   qS r+   samplerr   rN  r+   r,   rt     r   z2PatchTSTForPrediction.generate.<locals>.<listcomp>r   r=   r   )r   num_parallel_samplesrE  rN  r   r   r   r   rC   r   rg   r   r(   r   r  rT  r   Zsamplesr+   rR  r,   generate  s   
zPatchTSTForPrediction.generater  r   )rJ   rK   rL   r   r   rC   rQ   r   rP   r   r   r   rI   r   rV  rR   r+   r+   r)   r,   rG  K  s<     

prG  c                       s8   e Zd ZdZd	def fddZdejfddZ  Z	S )
PatchTSTRegressionHeadz
    Regression head
    Nr   c                    s   t    |j| _|j| _|j| _|| _|j|j }t	j
dd| _|jdkr,t	|jnt	 | _|d u r?t	||j| _d S ||| _d S r,  )r   r   Zoutput_rangey_ranger   r.  rE  r   rV   r   r/  r0  r  r   r   r   r#   r1  rD  rC  )r(   r   rE  r    r)   r+   r,   r     s   
 zPatchTSTRegressionHead.__init__r!  c                 C   s   | j r|dddddddf }n"| jdkr|jdd}n| jdkr+|jddj}n	td| j d| | |}| |}| j	du | j
du@ r_t|| j
d	 | j
d   | j
d  }|S )
aY  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, output_dim)`

        Nr   r   r0   r=   r   r2  r3  r   )r   r.  r   r   r4  r!   r   r0  rD  rE  rX  rC   Zsigmoidr5  r+   r+   r,   rI   !  s   



(zPatchTSTRegressionHead.forwardr   r\   r+   r+   r)   r,   rW    s    rW  z,
    The PatchTST for regression model.
    c                       s   e Zd Zdef fddZe					ddejdeej deej dee	 d	ee	 d
ee	 de
eef fddZ	ddejdeej defddZ  ZS )PatchTSTForRegressionr   c                    s   t  | |jrtd d|_t|| _|jdkrd | _n/|jdkr,t	|j
d| _n"|jdkr9t|j
d| _n|jdkrFt|j
d| _ntd|j t|| j| _|   d S )	Nr8  FrH  rI  r=   rJ  rK  rL  )r   r   r  r9  r:  r  r   r   rE  r   r1  r   r   r!   rW  r#  r   rY   r)   r+   r,   r   H  s    





zPatchTSTForRegression.__init__Nr   r;  r  r   r:   r  r;   c                    s   |dur|n j j} j||||dd} |j}d}	|durI jr> j|}
t fdd|D }t|
|}	t	|	}	nt
jdd}	|	||}	|sc|f|dd	  }|	dur_|	f| }|S |}|S t|	||j|jd
S )a#  
        past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
            Input sequence to the model
        target_values (`torch.Tensor` of shape `(bs, num_input_channels)`):
            Target values associates with the `past_values`
        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:

            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            Whether or not to return a `ModelOutput` instead of a plain tuple.

        Examples:

        ```python
        >>> from transformers import PatchTSTConfig, PatchTSTForRegression

        >>> # Regression task with 6 input channels and regress 2 targets
        >>> model = PatchTSTForRegression.from_pretrained("namctin/patchtst_etth1_regression")

        >>> # during inference, one only provides past values, the model outputs future values
        >>> past_values = torch.randn(20, 512, 6)
        >>> outputs = model(past_values=past_values)
        >>> regression_outputs = outputs.regression_outputs
        ```NTr$  c                    s   g | ]
}| d  jjqS )r<   )r1   r   r1  )rs   itemr(   r+   r,   rt     s    z1PatchTSTForRegression.forward.<locals>.<listcomp>r   r&  r   r   )r   r   r5   r   )r   r  r   r#  r   rE  rN  r  r   r   r   r(  r   r5   r   )r(   r   r;  r  r   r:   r  r)  r<  r   rN  r   r+   r[  r,   rI   b  s<   %


zPatchTSTForRegression.forwardc                    sb   | j j}| |d|dd}| j|j  fddt|D }tj|ddd|| j j	}t
|d	S )
a  
        Generate sequences of sample predictions from a model with a probability distribution head.

        Parameters:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the future.
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Return:
            [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
            samples, num_targets)`.
        NF)r   r;  r  r   c                    rO  r+   rP  rr   rR  r+   r,   rt     r   z2PatchTSTForRegression.generate.<locals>.<listcomp>r   r=   r<   rS  )r   rT  rE  rN  r   r   rC   r   r1   r1  r   rU  r+   rR  r,   rV    s   
zPatchTSTForRegression.generater  r   )rJ   rK   rL   r   r   r   rC   rQ   r   rP   r   r  r   rI   r   rV  rR   r+   r+   r)   r,   rY  B  s>    
LrY  )r  r   rG  r"  rY  r7  )NFr   r  r   )FrM   r   dataclassesr   typingr   r   r   rC   r   Zactivationsr   Zmodeling_outputsr	   Zmodeling_utilsr
   Ztime_series_utilsr   r   r   utilsr   r   r   Zconfiguration_patchtstr   Z
get_loggerrJ   r9  Moduler   rT   rQ   rO   listrP   rN   rp   r   r   r   r   r   r   r   r   r   r   r   r   r   r   distributionsDistributionr   r   r   r  r  r  r  r  r"  r+  r7  r=  rG  rW  rY  __all__r+   r+   r+   r,   <module>   s   
 
=

D0< !$8>""$7po%W` <7 