
    fThB                         S r SSKJr  SSKJrJrJrJrJr  SSK	J
r
  SSKJrJr  SSKJr  \(       a  SSKJr  SS	KJr  SS
KJr  \R*                  " \5      r/ SQr/ SQr " S S\
5      r " S S\5      rSS/rg)zWhisper model configuration    )OrderedDict)TYPE_CHECKINGAnyMappingOptionalUnion   )PretrainedConfig)
OnnxConfigOnnxSeq2SeqConfigWithPast)logging)FeatureExtractionMixin)PreTrainedTokenizerBase)
TensorType)X            	   
                        :   ;   <   =   >   ?   Z   [   \   ]   ie  in  i  i  i  i  i  i  i"  i  i  i  i  i?  ia  io  ic  i  iS  ir  i9	  i	  i  i  is  i  i  i  i  i  i#  i%  i&  iC)  i"*  i,  i-  i.  ik3  i5  i5  i9  i;  i@  iA  iHF  iK  i6L  iP  i!W  iY  ii  iu  iv  i  i  i[  i-  ie  i  i  Q  i      )Vr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   ig  i  i
  i  ii  i}  i  i  i  i  iF  i=  i  i	  iC
  i  i  i  i  i  iy  iW  i;  i  i  ii  ie#  i$  i(  i*  i.  i/  i+0  i1  i5  iM7  i+9  i;  i=  i@  i@  iG  iJ  ikN  iT  iW  if  i1f  iCg  iwn  is  i{  i.~  i~  i  io  iA  i  iN  iR  r)   r*   i  c            %          ^  \ rS rSrSrSrS/rSSSS.rSS	S
SS
SSSSSSSSSSSSSSSSSSSSSSS/SSSSSSSSSS4%U 4S jjrS r	U =r
$ )!WhisperConfigr   a7!  
This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate a
Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Whisper
[openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 51865):
        Vocabulary size of the Whisper model. Defines the number of different tokens that can be represented by the
        `decoder_input_ids` passed when calling [`WhisperModel`]
    num_mel_bins (`int`, *optional*, defaults to 80):
        Number of mel features used per input features. Should correspond to the value used in the
        `WhisperProcessor` class.
    encoder_layers (`int`, *optional*, defaults to 4):
        Number of encoder layers.
    decoder_layers (`int`, *optional*, defaults to 4):
        Number of decoder layers.
    encoder_attention_heads (`int`, *optional*, defaults to 6):
        Number of attention heads for each attention layer in the Transformer encoder.
    decoder_attention_heads (`int`, *optional*, defaults to 6):
        Number of attention heads for each attention layer in the Transformer decoder.
    encoder_ffn_dim (`int`, *optional*, defaults to 1536):
        Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
    decoder_ffn_dim (`int`, *optional*, defaults to 1536):
        Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
    encoder_layerdrop (`float`, *optional*, defaults to 0.0):
        The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
        for more details.
    decoder_layerdrop (`float`, *optional*, defaults to 0.0):
        The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
        for more details.
    decoder_start_token_id (`int`, *optional*, defaults to 50257):
        Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
        are provided to the `generate` function. It is used to guide the model`s generation process depending on
        the task.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).
    is_encoder_decoder (`bool`, *optional*, defaults to `True`):
        Whether the model is used as an encoder/decoder or not.
    activation_function (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"` and `"gelu_new"` are supported.
    d_model (`int`, *optional*, defaults to 384):
        Dimensionality of the layers.
    dropout (`float`, *optional*, defaults to 0.1):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    activation_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for activations inside the fully connected layer.
    init_std (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    scale_embedding (`bool`, *optional*, defaults to False):
        Scale embeddings by diving by sqrt(d_model).
    max_source_positions (`int`, *optional*, defaults to 1500):
        The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
    max_target_positions (`int`, *optional*, defaults to 448):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    pad_token_id (`int`, *optional*, defaults to 50256):
        Padding token id.
    bos_token_id (`int`, *optional*, defaults to 50256):
        Begin of stream token id.
    eos_token_id (`int`, *optional*, defaults to 50256):
        End of stream token id.
    suppress_tokens (`List[int]`, *optional*):
        A list containing the non-speech tokens that will be used by the logit processor in the `generate`
        function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI each correspond to the `english-only` and the
        `multilingual` model.
    begin_suppress_tokens (`List[int]`, *optional*, defaults to `[220,50256]`):
        A list containing tokens that will be suppressed at the beginning of the sampling process. Initialized as
        the token for `" "` (`blank_token_id`) and the `eos_token_id`
    use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
        Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
        instance of [`WhisperForAudioClassification`].
    classifier_proj_size (`int`, *optional*, defaults to 256):
        Dimensionality of the projection before token mean-pooling for classification. Only relevant when using an
        instance of [`WhisperForAudioClassification`].
    apply_spec_augment (`bool`, *optional*, defaults to `False`):
        Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
        [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
        Recognition](https://arxiv.org/abs/1904.08779).
    mask_time_prob (`float`, *optional*, defaults to 0.05):
        Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
        procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
        reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
        masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
        actual percentage of masked vectors. This is only relevant if `apply_spec_augment == True`.
    mask_time_length (`int`, *optional*, defaults to 10):
        Length of vector span along the time axis.
    mask_time_min_masks (`int`, *optional*, defaults to 2),:
        The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
        irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
        mask_time_min_masks''
    mask_feature_prob (`float`, *optional*, defaults to 0.0):
        Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
        masking procedure generates `mask_feature_prob*len(feature_axis)/mask_time_length` independent masks over
        the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
        span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
        may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
        True`.
    mask_feature_length (`int`, *optional*, defaults to 10):
        Length of vector span along the feature axis.
    mask_feature_min_masks (`int`, *optional*, defaults to 0),:
        The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
        step, irrespectively of `mask_feature_prob`. Only relevant if
        `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
    median_filter_width (`int`, *optional*, defaults to 7):
        Width of the median filter used to smoothen to cross-attention outputs when computing token timestamps.
        Should be an odd number.

Example:

```python
>>> from transformers import WhisperConfig, WhisperModel

>>> # Initializing a Whisper tiny style configuration
>>> configuration = WhisperConfig()

>>> # Initializing a model (with random weights) from the tiny style configuration
>>> model = WhisperModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```whisperpast_key_valuesencoder_attention_headsd_model)num_key_value_headsnum_attention_headshidden_sizei  P         i   g        r(   Tgelui  g{Gz?Fi  i  iP  N      g?r   r   r   r   c&                   > Xl         X l        Xl        X0l        X@l        XPl        X`l        Xpl        Xl        UU l	        UU l
        UU l        Xl        UU l        Xl        Xl        Xl        X0l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        U U l        U!U l        U"U l        U#U l        U$U l        U%U l        [>        T'U ]  " SUUUUUUUS.U&D6  g )N)pad_token_idbos_token_ideos_token_idis_encoder_decoderdecoder_start_token_idsuppress_tokensbegin_suppress_tokens )!
vocab_sizenum_mel_binsr0   encoder_layersr/   decoder_layersdecoder_attention_headsdecoder_ffn_dimencoder_ffn_dimdropoutattention_dropoutactivation_dropoutactivation_functioninit_stdencoder_layerdropdecoder_layerdrop	use_cachenum_hidden_layersscale_embeddingmax_source_positionsmax_target_positionsclassifier_proj_sizeuse_weighted_layer_sumapply_spec_augmentmask_time_probmask_time_lengthmask_time_min_masksmask_feature_probmask_feature_lengthmask_feature_min_masksmedian_filter_widthsuper__init__)(selfrC   rD   rE   r/   rF   rG   rH   rI   rO   rP   r?   rQ   r>   rM   r0   rJ   rK   rL   rN   rS   rT   rU   r;   r<   r=   r@   rA   rW   rV   rX   rY   rZ   r[   r\   r]   r^   r_   kwargs	__class__s(                                          i/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/whisper/configuration_whisper.pyra   WhisperConfig.__init__   s   R %(,'>$,'>$..!2"4#6  !2!2"!/.$8!$8! %9!&<# #5, 0#6 !2#6 &<##6  		
%%%1#9+"7		
 		
    )rL   rM   rX   rK   rV   r0   rG   rH   rP   rF   rJ   r/   rI   rO   rE   rN   r]   r^   r\   rZ   r[   rY   rT   rU   r_   rR   rD   rS   rQ   rW   rC   )__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapra   __static_attributes____classcell__rd   s   @re   r,   r,   ;   s    @D J#4"588 M  ! !$"! "El$   MW
 W
rg   r,   c                      ^  \ rS rSr\S\\\\\4   4   4S j5       r       SS\	S   S\S\S\
S	\S
   S\S\S\S\\\4   4U 4S jjjr\S\4S j5       rSrU =r$ )WhisperOnnxConfigi   returnc                     [        SSSSS.4/5      nU R                  (       a  SS0US'   OSSS	.US'   U R                  (       a  U R                  US
S9  U$ )Ninput_featuresbatchfeature_sizeencoder_sequence)r   r   r   r   decoder_input_idsdecoder_sequence)r   r   inputs)	direction)r   use_pastfill_with_past_key_values_)rb   common_inputss     re   r}   WhisperOnnxConfig.inputs!  sj    #!w>FX#YZ

 ==23WM-.5<AS1TM-.==++MX+Nrg   preprocessor)r   r   
batch_size
seq_lengthis_pair	frameworkr   sampling_ratetime_duration	frequencyc	           
      r  > [        5       n	[        R                  " U UR                  UUUUUS9n
U
S   R                  S   nU R
                  (       a  US-  OUn[        TU ]	  UR                  X#XE5      nU
R                  S5      U	S'   UR                  S5      U	S'   SU;   a  UR                  S5      U	S'   U	$ )N)r   r   r   r   r   r   rw   r   r{   r.   )	r   r   generate_dummy_inputsfeature_extractorshaper   r`   	tokenizerpop)rb   r   r   r   r   r   r   r   r   dummy_inputsencoder_inputsencoder_sequence_lengthdecoder_inputsrd   s                re   r   'WhisperOnnxConfig.generate_dummy_inputs2  s     #}#99%77!''
 #11A"B"H"H"K59]],1

6""JG
 *8););<L)M%&,:,>,>?R,S()..<.@.@AR.SL*+rg   c                     g)NgMbP?rB   )rb   s    re   atol_for_validation%WhisperOnnxConfig.atol_for_validationV  s    rg   rB   )r   FNi"V  g      @r8   )rh   ri   rj   rk   propertyr   strintr}   r   boolr   floatr   r   r   rp   rq   rr   s   @re   rt   rt      s    WS#X%6 67  & ,0"""OP" " 	"
 " L)" " " " 
c	" "H U  rg   rt   N)rl   collectionsr   typingr   r   r   r   r   configuration_utilsr
   onnxr   r   utilsr   feature_extraction_utilsr   tokenization_utils_baser   r   
get_loggerrh   loggerNON_SPEECH_TOKENSNON_SPEECH_TOKENS_MULTIr,   rt   __all__rB   rg   re   <module>r      su    " # ? ? 3 9  BB#			H	%
 
 b
$ b
J81 8v /
0rg   