from typing import Callable, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint

from ...cache_utils import Cache
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...utils import logging
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaMLP,
    LlamaModel,
    LlamaPreTrainedModel,
    LlamaRotaryEmbedding,
    eager_attention_forward,
    rotate_half,
)
from .configuration_olmo import OlmoConfig

logger = logging.get_logger(__name__)

class OlmoLayerNorm(nn.Module):
    """LayerNorm but with no learnable weight or bias."""

    def __init__(self, hidden_size: int) -> None:
        super().__init__()
        self.normalized_shape = (hidden_size,)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        orig_dtype = hidden_states.dtype
        # Normalize in float32 for numerical stability, then cast back to the input dtype.
        return F.layer_norm(hidden_states.to(dtype=torch.float32), self.normalized_shape, None, None, eps=1e-5).to(
            orig_dtype
        )
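
def _olmo_layer_norm_sketch() -> None:
    # Illustrative sketch, not part of the original file: shows that OlmoLayerNorm
    # carries no learnable parameters and restores the input dtype after computing
    # the normalization in float32. The sizes (2, 4, 8) are arbitrary assumptions.
    ln = OlmoLayerNorm(8)
    x = torch.randn(2, 4, 8, dtype=torch.float16)
    y = ln(x)
    assert y.dtype == torch.float16  # cast back from the float32 compute dtype
    assert list(ln.parameters()) == []  # no weight or bias to train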

class OlmoMLP(LlamaMLP):
    def __init__(self, config):
        super().__init__(config)
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    q_type, k_type = q.dtype, k.dtype
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed.to(q_type), k_embed.to(k_type)
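
def _rope_shape_sketch() -> None:
    # Illustrative sketch, not part of the original file: shows the shapes
    # apply_rotary_pos_emb expects with the default unsqueeze_dim=1, and that the
    # input dtype survives the float32 cos/sin math. All sizes are arbitrary.
    batch, heads, seq_len, head_dim = 2, 4, 5, 8
    q = torch.randn(batch, heads, seq_len, head_dim, dtype=torch.float16)
    k = torch.randn(batch, heads, seq_len, head_dim, dtype=torch.float16)
    # cos/sin arrive as [batch, seq_len, head_dim]; unsqueeze_dim=1 broadcasts
    # them across the heads dimension of q and k.
    cos = torch.randn(batch, seq_len, head_dim)
    sin = torch.randn(batch, seq_len, head_dim)
    q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
    assert q_embed.shape == q.shape and k_embed.dtype == torch.float16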
&Cw;q>C/0Gw;q>C/0G::fwzz&111r$   c                      \ rS rSr  SS\R
                  S\\R
                  \R
                  4   S\\R
                     S\\   S\\R                     S\\R
                  \\R
                     \\\R
                        4   4S	 jjr
S
rg)OlmoAttentionN   Nr%   position_embeddingsattention_maskpast_key_valuecache_positionr   c                 R   UR                   S S n/ UQSPU R                  P7nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R
                  R                  b  U	R                  U R
                  R                  * U R
                  R                  S9  U
R                  U R
                  R                  * U R
                  R                  S9  UR                  U R
                  R                  * U R
                  R                  S9  U	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
UR                  U5      R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R
                  R                  S:w  ad  U R
                  R                  S:X  a-  UR                  SS	5      (       a  [         R#                  S
5        O[$        U R
                  R                     nU" U U	U
UU4U R&                  (       d  SOU R(                  U R*                  S.UD6u  nnUR,                  " / UQSP76 R/                  5       nU R1                  U5      nUU4$ )N)minmaxr   r
   )rM   rL   r[   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )dropoutscaling)shapehead_dimq_projk_projv_projrF   clip_qkvclamp_view	transposerT   update	layer_idxr   _attn_implementationgetloggerwarning_oncer   trainingattention_dropoutre   reshape
contiguouso_proj)r    r%   rX   rY   rZ   r[   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrL   rM   cache_kwargsattention_interfaceattn_outputattn_weightss                     r"   r/   OlmoAttention.forwardO   sk    $))#2.88b8$--8{{=1[[/
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST#((6@@AF__\2<<QB
#((6@@AF&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r$    )NN)r1   r2   r3   r4   r,   r7   r   r   r   
LongTensorr/   r8   r   r$   r"   rV   rV   N   s     +/598)||8) #5<<#=>8) !.	8)
 !8) !!1!128) 
u||Xell3XeELL>Q5RR	S8) 8)r$   rV   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )OlmoDecoderLayer   rF   rp   c                    > [         TU ]  X5        [        UR                  5      U l        [        UR                  5      U l        [        XS9U l        g )N)rF   rp   )r   r   r   r   input_layernormpost_attention_layernormrV   	self_attnr    rF   rp   r!   s      r"   r   OlmoDecoderLayer.__init__   sB    +,V-?-?@(5f6H6H(I%&fJr$   )r   r   r   )	r1   r2   r3   r4   r   r6   r   r8   r9   r:   s   @r"   r   r      s    Kz Kc K Kr$   r   c                       \ rS rSrS rSrg)OlmoRotaryEmbedding   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	X4sS S S 5        $ ! , (       d  f       g = f)
Nr   r]   r   mpscpuF)device_typeenabledr
   )dim)inv_freqfloatexpandrf   r+   device
isinstancetypestrr,   autocastrn   catrL   attention_scalingrM   )
r    xrN   inv_freq_expandedposition_ids_expandedr   freqsembrL   rM   s
             r"   r/   OlmoRotaryEmbedding.forward   s'    MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C8 DCCs   $BE22
F r   N)r1   r2   r3   r4   r/   r8   r   r$   r"   r   r      s    
r$   r   c                       \ rS rSrS rSrg)OlmoPreTrainedModel   c                 "   U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g g )Nrc   )meanstd)rF   initializer_ranger   r@   rA   weightdatanormal_r?   zero_	Embeddingpadding_idx)r    moduler   s      r"   _init_weights!OlmoPreTrainedModel._init_weights   s    kk++fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .r$   r   N)r1   r2   r3   r4   r   r8   r   r$   r"   r   r      s    	?r$   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )	OlmoModel   rF   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  5      U l
        g s  snf r   )r   r   r@   
ModuleListrangenum_hidden_layersr   layersr   r   normr   s      r"   r   OlmoModel.__init__   s_     mmBGH`H`BabBaYf0Bab
 "&"4"45	 cs   A4)r   r   )r1   r2   r3   r4   r   r   r8   r9   r:   s   @r"   r   r      s    6z 6 6r$   r   c                       \ rS rSrSrg)OlmoForCausalLM   r   N)r1   r2   r3   r4   r8   r   r$   r"   r   r      s    r$   r   )r   r   r   )Nr   )+typingr   r   r   r,   torch.nnr@   torch.nn.functional
functionalr)   torch.utils.checkpointcache_utilsr   modeling_utilsr   utilsr	   llama.modeling_llamar   r   r   r   r   r   r   r   r   configuration_olmor   
get_loggerr1   rs   Moduler   r<   rT   rV   r   r   r   r   r   __all__r   r$   r"   <module>r      s    , ,        5 
 
 
 + 
		H	%
BII 
Yh Y289)N 9)xK( K. 
?. 
?6
 6	& 	 Br$   