# coding=utf-8
# Copyright 2020 Google AI and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# PyTorch MobileBERT model.

import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_mobilebert import MobileBertConfig


logger = logging.get_logger(__name__)
j)ZDe!dEd>G dFdG dGe?ZEe!dHd>G dIdJ dJe?ZFe!G dKdL dLe?ZGe!G dMdN dNe?ZHe!G dOdP dPe?ZIg dQZJdS )R    N)	dataclass)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging   )MobileBertConfigc                 C   s  zddl }ddl}ddl}W n ty   td  w tj|}t	d|  |j
|}g }g }	|D ] \}
}t	d|
 d|  |j
||
}||
 |	| q6t||	D ]\}
}|
dd}
|
d	d
}
|
dd}
|
dd}
|
d}
tdd |
D rt	dd|
  q\| }|
D ]~}|d|r|d|}n|g}|d dks|d dkrt|d}nI|d dks|d dkrt|d}n7|d dkrt|d}n+|d dkrt|d}nz	t||d }W n ty   t	dd|
  Y qw t|dkrt|d }|| }q|d d d!kr%t|d}n
|dkr/||}z|j|jksDJ d"|j d#|j d$W n ty^ } z| j|j|jf7  _ d}~ww t	d%|
  t||_q\| S )&z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape Z	ffn_layerffnZFakeLayerNorm	LayerNormZextra_output_weightszdense/kernelZbert
mobilebert/c                 s   s    | ]}|d v V  qdS ))Zadam_vZadam_mZAdamWeightDecayOptimizerZAdamWeightDecayOptimizer_1Zglobal_stepN ).0nr    r    a/var/www/auris/lib/python3.10/site-packages/transformers/models/mobilebert/modeling_mobilebert.py	<genexpr>V   s
    
z0load_tf_weights_in_mobilebert.<locals>.<genexpr>z	Skipping z[A-Za-z]+_\d+z_(\d+)ZkernelgammaweightZoutput_biasbetabiasZoutput_weightsZsquad
classifier   r   iZ_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpyZ
tensorflowImportErrorloggererrorospathabspathinfotrainZlist_variablesZload_variableappendzipreplacesplitanyjoin	fullmatchgetattrAttributeErrorlenint	transposeshapeAssertionErrorargstorchZ
from_numpydata)modelconfigZtf_checkpoint_pathr+   nptfZtf_pathZ	init_varsnamesZarraysnamerA   arrayZpointerZm_nameZscope_namesnumer    r    r#   load_tf_weights_in_mobilebert5   s   



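# Hedged usage sketch (not part of the original module): this mirrors how a conversion
# script would typically drive `load_tf_weights_in_mobilebert`. All three paths are
# placeholders, and `MobileBertForPreTraining` is defined further down in this file,
# so the name resolves at call time.
def _convert_tf_checkpoint_sketch(tf_checkpoint_path: str, config_file: str, pytorch_dump_path: str):
    config = MobileBertConfig.from_json_file(config_file)  # must describe the TF checkpoint's architecture
    model = MobileBertForPreTraining(config)  # randomly initialised shell; weights are overwritten below
    model = load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path)
    torch.save(model.state_dict(), pytorch_dump_path)  # plain PyTorch serialisation

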
class NoNorm(nn.Module):
    def __init__(self, feat_size, eps=None):
        super().__init__()
        self.bias = nn.Parameter(torch.zeros(feat_size))
        self.weight = nn.Parameter(torch.ones(feat_size))

    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
        return input_tensor * self.weight + self.bias


NORM2FN = {"layer_norm": nn.LayerNorm, "no_norm": NoNorm}
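

# Hedged aside (not in the original file): `no_norm` trades LayerNorm's mean/variance
# statistics for a plain element-wise affine transform, which is cheaper at inference
# time on mobile hardware. A quick shape sanity check over both registry entries:
def _norm_choice_sketch():
    hidden = torch.randn(2, 8, 512)  # (batch, seq_len, hidden_size)
    for kind in ("layer_norm", "no_norm"):
        norm = NORM2FN[kind](512)
        assert norm(hidden).shape == hidden.shape  # both are shape-preserving drop-ins
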

class MobileBertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.trigram_input = config.trigram_input
        self.embedding_size = config.embedding_size
        self.hidden_size = config.hidden_size

        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        embed_dim_multiplier = 3 if self.trigram_input else 1
        embedded_input_size = self.embedding_size * embed_dim_multiplier
        self.embedding_transformation = nn.Linear(embedded_input_size, config.hidden_size)

        self.LayerNorm = NORM2FN[config.normalization_type](config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        if self.trigram_input:
            # From the paper MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited
            # Devices (https://arxiv.org/abs/2004.02984): the embedding table is compressed
            # to 128 dimensions, then a kernel-size-3 1D convolution (realised here as a
            # concatenation of the shifted neighbours) produces a 512-dimensional output.
            inputs_embeds = torch.cat(
                [
                    nn.functional.pad(inputs_embeds[:, 1:], [0, 0, 0, 1, 0, 0], value=0.0),
                    inputs_embeds,
                    nn.functional.pad(inputs_embeds[:, :-1], [0, 0, 1, 0, 0, 0], value=0.0),
                ],
                dim=2,
            )
        if self.trigram_input or self.embedding_size != self.hidden_size:
            inputs_embeds = self.embedding_transformation(inputs_embeds)

        # Add positional and token type embeddings, then layer normalize and perform dropout.
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


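# Hedged aside (not in the original file): a standalone shape check for the trigram
# concatenation used above. Each position sees [emb[i+1]; emb[i]; emb[i-1]], with zero
# padding at the sequence boundaries, so the feature width triples before
# `embedding_transformation` projects it back to hidden_size.
def _trigram_shape_sketch():
    emb = torch.randn(1, 6, 128)  # (batch, seq_len, embedding_size)
    stacked = torch.cat(
        [
            nn.functional.pad(emb[:, 1:], [0, 0, 0, 1, 0, 0], value=0.0),  # shift left, pad end
            emb,
            nn.functional.pad(emb[:, :-1], [0, 0, 1, 0, 0, 0], value=0.0),  # shift right, pad start
        ],
        dim=2,
    )
    assert stacked.shape == (1, 6, 3 * 128)

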
$$




zMobileBertEmbeddings.forward)NNNN)r_   r`   ra   __doc__rS   r   rD   
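
# Hedged aside (not in the original file): head-size arithmetic for the bottlenecked
# attention above. The numbers assume the released google/mobilebert-uncased config
# (num_attention_heads=4, true_hidden_size=128, hidden_size=512).
def _attention_size_sketch():
    num_attention_heads, true_hidden_size = 4, 128
    attention_head_size = true_hidden_size // num_attention_heads  # 32
    all_head_size = num_attention_heads * attention_head_size
    assert all_head_size == true_hidden_size  # queries/keys stay in the 128-wide bottleneck
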

class MobileBertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.use_bottleneck = config.use_bottleneck
        self.dense = nn.Linear(config.true_hidden_size, config.true_hidden_size)
        self.LayerNorm = NORM2FN[config.normalization_type](config.true_hidden_size, eps=config.layer_norm_eps)
        if not self.use_bottleneck:
            self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, residual_tensor: torch.Tensor) -> torch.Tensor:
        layer_outputs = self.dense(hidden_states)
        if not self.use_bottleneck:
            layer_outputs = self.dropout(layer_outputs)
        layer_outputs = self.LayerNorm(layer_outputs + residual_tensor)
        return layer_outputs


class MobileBertAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = MobileBertSelfAttention(config)
        self.output = MobileBertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        query_tensor: torch.Tensor,
        key_tensor: torch.Tensor,
        value_tensor: torch.Tensor,
        layer_input: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(
            query_tensor,
            key_tensor,
            value_tensor,
            attention_mask,
            head_mask,
            output_attentions,
        )
        # Run a linear projection of `hidden_size` then add a residual with `layer_input`.
        attention_output = self.output(self_outputs[0], layer_input)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class MobileBertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.true_hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class OutputBottleneck(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.true_hidden_size, config.hidden_size)
        self.LayerNorm = NORM2FN[config.normalization_type](config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, residual_tensor: torch.Tensor) -> torch.Tensor:
        layer_outputs = self.dense(hidden_states)
        layer_outputs = self.dropout(layer_outputs)
        layer_outputs = self.LayerNorm(layer_outputs + residual_tensor)
        return layer_outputs


class MobileBertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.use_bottleneck = config.use_bottleneck
        self.dense = nn.Linear(config.intermediate_size, config.true_hidden_size)
        self.LayerNorm = NORM2FN[config.normalization_type](config.true_hidden_size)
        if not self.use_bottleneck:
            self.dropout = nn.Dropout(config.hidden_dropout_prob)
        else:
            self.bottleneck = OutputBottleneck(config)

    def forward(
        self, intermediate_states: torch.Tensor, residual_tensor_1: torch.Tensor, residual_tensor_2: torch.Tensor
    ) -> torch.Tensor:
        layer_output = self.dense(intermediate_states)
        if not self.use_bottleneck:
            layer_output = self.dropout(layer_output)
            layer_output = self.LayerNorm(layer_output + residual_tensor_1)
        else:
            layer_output = self.LayerNorm(layer_output + residual_tensor_1)
            layer_output = self.bottleneck(layer_output, residual_tensor_2)
        return layer_output


class BottleneckLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intra_bottleneck_size)
        self.LayerNorm = NORM2FN[config.normalization_type](config.intra_bottleneck_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        layer_input = self.dense(hidden_states)
        layer_input = self.LayerNorm(layer_input)
        return layer_input


class Bottleneck(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.key_query_shared_bottleneck = config.key_query_shared_bottleneck
        self.use_bottleneck_attention = config.use_bottleneck_attention
        self.input = BottleneckLayer(config)
        if self.key_query_shared_bottleneck:
            self.attention = BottleneckLayer(config)

    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:
        # This method can return three different tuples of values. These different values make use of bottlenecks,
        # which are linear layers used to project the hidden states to a lower-dimensional vector, reducing memory
        # usage. These linear layers have weights that are learned during training.
        #
        # If `config.use_bottleneck_attention`, it will return the result of the bottleneck layer four times for the
        # key, query, value, and "layer input" to be used by the attention layer. This last layer input will be used
        # as a residual tensor in the attention self output, after the attention scores have been computed.
        #
        # If not `config.use_bottleneck_attention` and `config.key_query_shared_bottleneck`, this will return four
        # values: the query and key passed through the same (shared) bottleneck, the hidden states as values, and a
        # bottlenecked residual to be applied in the attention self output.
        #
        # Finally, in the last case, the values for the query, key and values are the hidden states without
        # bottleneck, and the residual layer will be this value passed through a bottleneck.
        bottlenecked_hidden_states = self.input(hidden_states)
        if self.use_bottleneck_attention:
            return (bottlenecked_hidden_states,) * 4
        elif self.key_query_shared_bottleneck:
            shared_attention_input = self.attention(hidden_states)
            return (shared_attention_input, shared_attention_input, hidden_states, bottlenecked_hidden_states)
        else:
            return (hidden_states, hidden_states, hidden_states, bottlenecked_hidden_states)


class FFNOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.true_hidden_size)
        self.LayerNorm = NORM2FN[config.normalization_type](config.true_hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor, residual_tensor: torch.Tensor) -> torch.Tensor:
        layer_outputs = self.dense(hidden_states)
        layer_outputs = self.LayerNorm(layer_outputs + residual_tensor)
        return layer_outputs


class FFNLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.intermediate = MobileBertIntermediate(config)
        self.output = FFNOutput(config)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        intermediate_output = self.intermediate(hidden_states)
        layer_outputs = self.output(intermediate_output, hidden_states)
        return layer_outputs


class MobileBertLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.use_bottleneck = config.use_bottleneck
        self.num_feedforward_networks = config.num_feedforward_networks

        self.attention = MobileBertAttention(config)
        self.intermediate = MobileBertIntermediate(config)
        self.output = MobileBertOutput(config)
        if self.use_bottleneck:
            self.bottleneck = Bottleneck(config)
        if config.num_feedforward_networks > 1:
            self.ffn = nn.ModuleList([FFNLayer(config) for _ in range(config.num_feedforward_networks - 1)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
    ) -> Tuple[torch.Tensor]:
        if self.use_bottleneck:
            query_tensor, key_tensor, value_tensor, layer_input = self.bottleneck(hidden_states)
        else:
            query_tensor, key_tensor, value_tensor, layer_input = [hidden_states] * 4

        self_attention_outputs = self.attention(
            query_tensor,
            key_tensor,
            value_tensor,
            layer_input,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        s = (attention_output,)
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        if self.num_feedforward_networks != 1:
            for i, ffn_module in enumerate(self.ffn):
                attention_output = ffn_module(attention_output)
                s += (attention_output,)

        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output, hidden_states)
        outputs = (
            (layer_output,)
            + outputs
            + (
                torch.tensor(1000),
                query_tensor,
                key_tensor,
                value_tensor,
                layer_input,
                attention_output,
                intermediate_output,
            )
            + s
        )
        return outputs


class MobileBertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer = nn.ModuleList([MobileBertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], output_attentions)
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )


class MobileBertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.do_activate = config.classifier_activation
        if self.do_activate:
            self.dense = nn.Linear(config.hidden_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        if not self.do_activate:
            return first_token_tensor
        else:
            pooled_output = self.dense(first_token_tensor)
            pooled_output = torch.tanh(pooled_output)
            return pooled_output


class MobileBertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = NORM2FN["layer_norm"](config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class MobileBertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = MobileBertPredictionHeadTransform(config)
        # The output weights are tied with the input embeddings; the extra `dense` block
        # covers the hidden_size - embedding_size remainder of the projection.
        self.dense = nn.Linear(config.vocab_size, config.hidden_size - config.embedding_size, bias=False)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self) -> None:
        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.transform(hidden_states)
        hidden_states = hidden_states.matmul(torch.cat([self.decoder.weight.t(), self.dense.weight], dim=0))
        hidden_states += self.decoder.bias
        return hidden_states


class MobileBertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = MobileBertLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class MobileBertPreTrainingHeads(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = MobileBertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output: torch.Tensor, pooled_output: torch.Tensor) -> Tuple[torch.Tensor]:
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score


@auto_docstring
class MobileBertPreTrainedModel(PreTrainedModel):
    config_class = MobileBertConfig
    load_tf_weights = load_tf_weights_in_mobilebert
    base_model_prefix = "mobilebert"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, (nn.LayerNorm, NoNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, MobileBertLMPredictionHead):
            module.bias.data.zero_()


@dataclass
class MobileBertForPreTrainingOutput(ModelOutput):
    r"""
    Output type of [`MobileBertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossprediction_logitsseq_relationship_logitsr   r   )r_   r`   ra   r   r  r   rD   r   __annotations__r  r  r   r   r   r    r    r    r#   r    s   
 r  c                       s   e Zd ZdZd fdd	Zdd Zdd Zd	d
 Ze									dde	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e de	e de	e deeef fddZ  ZS )MobileBertModelz.
    https://arxiv.org/pdf/2004.02984.pdf
    Tc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rR   rS   rG   re   r   r   encoderr   pooler	post_init)rW   rG   add_pooling_layerrY   r    r#   rS     s   

zMobileBertModel.__init__c                 C   s   | j jS rQ   r   ro   r   r    r    r#   get_input_embeddings  s   z$MobileBertModel.get_input_embeddingsc                 C   s   || j _d S rQ   r  )rW   r   r    r    r#   set_input_embeddings  s   z$MobileBertModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )rW   Zheads_to_pruner   r   r    r    r#   _prune_heads  s   zMobileBertModel._prune_headsNrz   r   r{   rg   r   r|   r   r   r   r\   c
                 C   sh  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|d urQ|jn|j}|d u r_tj	|
|d}|d u rltj
|
tj|d}| ||
}| || j j}| j||||d}| j||||||	d}|d }| jd ur| |nd }|	s||f|d	d   S t|||j|jd
S )NzDYou cannot specify both input_ids and inputs_embeds at the same timerh   z5You have to specify either input_ids or inputs_embeds)r   r}   )rz   rg   r{   r|   )r   r   r   r   r   r   r   )r   Zpooler_outputr   r   )rG   r   r   use_return_dict
ValueErrorZ%warn_if_padding_and_no_attention_maskr   r   rD   rV   rU   r   Zget_extended_attention_maskZget_head_maskr   r   r  r  r   r   r   )rW   rz   r   r{   rg   r   r|   r   r   r   r   r   Zextended_attention_maskZembedding_outputZencoder_outputsr  r   r    r    r#   r]     sP   
zMobileBertModel.forward)T)	NNNNNNNNN)r_   r`   ra   r   rS   r  r  r  r   r   rD   r   r   r   r   r   r   r]   rc   r    r    rY   r#   r    sJ    	

r  z
    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `next sentence prediction (classification)` head.
    )Zcustom_introc                       s   e Zd ZddgZ fddZdd Zdd Zdd
ee de	j
f fddZe																						ddeej deej deej deej deej deej deej deej deej deej deej deeef fddZ  ZS )MobileBertForPreTrainingcls.predictions.decoder.weightcls.predictions.decoder.biasc                    ,   t  | t|| _t|| _|   d S rQ   )rR   rS   r  r   r  clsr  r   rY   r    r#   rS   >  s   

z!MobileBertForPreTraining.__init__c                 C   
   | j jjS rQ   r#  r   r   r   r    r    r#   get_output_embeddingsF     
z.MobileBertForPreTraining.get_output_embeddingsc                 C      || j j_|j| j j_d S rQ   r#  r   r   r(   rW   Znew_embeddingsr    r    r#   set_output_embeddingsI     
z.MobileBertForPreTraining.set_output_embeddingsNnew_num_tokensr\   c                    *   | j | jjj|dd| jj_t j|dS NT)r-  Z
transposed)r-  Z_get_resized_lm_headr#  r   r   rR   resize_token_embeddingsrW   r-  rY   r    r#   r1  M  s   z0MobileBertForPreTraining.resize_token_embeddingsrz   r   r{   rg   r   r|   labelsnext_sentence_labelr   r   r   c                 C   s   |dur|n| j j}| j|||||||	|
|d	}|dd \}}| ||\}}d}|durS|durSt }||d| j j|d}||dd|d}|| }|sj||f|dd  }|durh|f| S |S t||||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return MobileBertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class MobileBertForMaskedLM(MobileBertPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)
        self.mobilebert = MobileBertModel(config, add_pooling_layer=False)
        self.cls = MobileBertOnlyMLMHead(config)
        self.config = config

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
        # resize dense output embeddings at first
        self.cls.predictions.dense = self._get_resized_lm_head(
            self.cls.predictions.dense, new_num_tokens=new_num_tokens, transposed=True
        )
        return super().resize_token_embeddings(new_num_tokens=new_num_tokens)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class MobileBertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


@auto_docstring(
    custom_intro="""
    MobileBert Model with a `next sentence prediction (classification)` head on top.
    """
)
class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.mobilebert = MobileBertModel(config)
        self.cls = MobileBertOnlyNSPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, NextSentencePredictorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`.

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForNextSentencePrediction.from_pretrained("google/mobilebert-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```"""
        if "next_sentence_label" in kwargs:
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
                " `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        seq_relationship_score = self.cls(pooled_output)

        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), labels.view(-1))

        if not return_dict:
            output = (seq_relationship_score,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """
)
class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.mobilebert = MobileBertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.mobilebert = MobileBertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.mobilebert = MobileBertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class MobileBertForTokenClassification(MobileBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.mobilebert = MobileBertModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "MobileBertForMaskedLM",
    "MobileBertForMultipleChoice",
    "MobileBertForNextSentencePrediction",
    "MobileBertForPreTraining",
    "MobileBertForQuestionAnswering",
    "MobileBertForSequenceClassification",
    "MobileBertForTokenClassification",
    "MobileBertLayer",
    "MobileBertModel",
    "MobileBertPreTrainedModel",
    "load_tf_weights_in_mobilebert",
]