
    sh-                     Z   S SK r S SKrS SKJr  S SKJr  S SKJrJrJ	r	J
r
  S SKrS SKrS SKJs  Jr  S SKJrJr  SSKJr  SSKJr  SS	KJr   S S
KJr  Sr\ " S S5      5       r " S S\R@                  5      r  " S S\RB                  5      r! " S S\RD                  5      r"S!S jr#\S 5       r$ " S S\RJ                  5      r& " S S\RJ                  5      r' " S S\RJ                  5      r( " S S\RJ                  5      r) " S S \RJ                  5      r*g! \\\4 a    SrSr Nf = f)"    N)contextmanager)	dataclass)DictIterableOptionalTuple)Tensornn   )decode)detect_language)
transcribe)scaled_dot_product_attentionTFc                   z    \ rS rSr% \\S'   \\S'   \\S'   \\S'   \\S'   \\S'   \\S'   \\S	'   \\S
'   \\S'   Srg)ModelDimensions   n_melsn_audio_ctxn_audio_staten_audio_headn_audio_layern_vocab
n_text_ctxn_text_staten_text_headn_text_layer N)__name__
__module____qualname____firstlineno__int__annotations____static_attributes__r       a/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/whisper/model.pyr   r      s;    KLOr%   r   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )	LayerNorm'   xreturnc                 p   > [         TU ]  UR                  5       5      R                  UR                  5      $ N)superforwardfloattypedtype)selfr*   	__class__s     r&   r/   LayerNorm.forward(   s'    wqwwy)..qww77r%   r   )r   r   r    r!   r	   r/   r$   __classcell__r4   s   @r&   r(   r(   '   s    8 8F 8 8r%   r(   c                   &    \ rS rSrS\S\4S jrSrg)Linear,   r*   r+   c                     [         R                  " UU R                  R                  UR                  5      U R
                  c  S 5      $ U R
                  R                  UR                  5      5      $ r-   )Flinearweighttor2   bias)r3   r*   s     r&   r/   Linear.forward-   sV    xxKKNN177#II%D
 	
 ,099<<+@
 	
r%   r   N)r   r   r    r!   r	   r/   r$   r   r%   r&   r9   r9   ,   s    
 
F 
r%   r9   c                   B   ^  \ rS rSrS\S\S\\   S\4U 4S jjrSrU =r$ )Conv1d5   r*   r>   r@   r+   c                    > [         TU ]  XR                  UR                  5      Uc  S 5      $ UR                  UR                  5      5      $ r-   )r.   _conv_forwardr?   r2   )r3   r*   r>   r@   r4   s       r&   rF   Conv1d._conv_forward6   sG     w$yy!4<4
 	
=AWWQWW=M
 	
r%   r   )	r   r   r    r!   r	   r   rF   r$   r6   r7   s   @r&   rC   rC   5   s2    

!'
/7/?
	
 
r%   rC   c                    US-  S:X  d   e[         R                  " U5      US-  S-
  -  n[        R                  " U* [        R                  " US-  5      -  5      n[        R                  " U 5      SS2[         R
                  4   U[         R
                  SS24   -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9$ )z*Returns sinusoids for positional embedding   r   r   Ndim)	nplogtorchexparangenewaxiscatsincos)lengthchannelsmax_timescalelog_timescale_incrementinv_timescalesscaled_times         r&   	sinusoidsr[   >   s    a<1 ff]3x1}q7HIYY 77%,,xST}:UUVN,,v&q"**}5rzzST}8UUK99eii,eii.DE1MMr%   c               #      #    [         R                  n  S[         l        S v   U [         l        g ! U [         l        f = f7f)NF)MultiHeadAttentionuse_sdpa)
prev_states    r&   disable_sdpar`   G   s3     #,,J1&+#&0#j#s   ?/ ?<?c                      ^  \ rS rSrSrS\S\4U 4S jjr   SS\S\\   S\\   S	\\	   4S
 jjr
 SS\S\S\S\\   S\\R                  \\R                     4   4
S jjrSrU =r$ )r]   Q   Tn_staten_headc                    > [         TU ]  5         X l        [        X5      U l        [        XSS9U l        [        X5      U l        [        X5      U l        g )NF)r@   )r.   __init__rd   r9   querykeyvalueout)r3   rc   rd   r4   s      r&   rf   MultiHeadAttention.__init__T   sE    G-
'7G-
'+r%   r*   xamaskkv_cachec                 0   U R                  U5      nUb  Ub  U R                  U;  a-  U R                  Uc  UOU5      nU R                  Uc  UOU5      nOX@R                     nX@R                     nU R                  XVXs5      u  pU R	                  U5      U	4$ r-   )rg   rh   ri   qkv_attentionrj   )
r3   r*   rl   rm   rn   qkvwvqks
             r&   r/   MultiHeadAttention.forward\   s     JJqMrzTXXX-E bjb1A


13A "A$A##A!2xx|Rr%   rq   rr   rs   r+   c                    UR                   u  pVnXpR                  -  S-  nUR                  " / UR                   S S QU R                  PSP76 R                  SSSS5      nUR                  " / UR                   S S QU R                  PSP76 R                  SSSS5      nUR                  " / UR                   S S QU R                  PSP76 R                  SSSS5      n[        (       aS  [
        R                  (       a>  [        XX4S L=(       a    US:  S9n	U	R                  SSSS5      R                  SS9n
S nX4$ X-  X(-  R                  SS	5      -  nUb  XS U2S U24   -   nUR                  5       n[        R                  " USS
9R                  UR                  5      nX-  R                  SSSS5      R                  SS9n
UR                  5       nX4$ )Ng      пrI   r   r      )	is_causal)	start_dimrJ   )shaperd   viewpermuteSDPA_AVAILABLEr]   r^   r   flatten	transposer0   r<   softmaxr?   r2   detach)r3   rq   rr   rs   rm   n_batchn_ctxrc   scalearj   ru   ws                r&   rp    MultiHeadAttention.qkv_attentionr   s    #$''KK'E1FF1AGGBQK11b199!Q1EFF1AGGBQK11b199!Q1EFF1AGGBQK11b199!Q1E>099,at#3#A	A ))Aq!Q'//!/<CB w )	44R<<Bvvvv~..B		""%((1A5//!Q1-555BCBwr%   )rh   rd   rj   rg   ri   NNNr-   )r   r   r    r!   r^   r"   rf   r	   r   dictr/   r   rN   rp   r$   r6   r7   s   @r&   r]   r]   Q   s    H, ,S ,  $!%#'   V  v	 
 4. . IM"'-5=f5E	u||Xell33	4 r%   r]   c            
       r   ^  \ rS rSrSS\S\S\4U 4S jjjr   SS\S\\   S\\   S	\\	   4S
 jjr
SrU =r$ )ResidualAttentionBlock   rc   rd   cross_attentionc                 t  > [         TU ]  5         [        X5      U l        [	        U5      U l        U(       a  [        X5      OS U l        U(       a  [	        U5      OS U l        US-  n[        R                  " [        X5      [        R                  " 5       [        XA5      5      U l        [	        U5      U l        g )N   )r.   rf   r]   attnr(   attn_ln
cross_attncross_attn_lnr
   
Sequentialr9   GELUmlpmlp_ln)r3   rc   rd   r   n_mlpr4   s        r&   rf   ResidualAttentionBlock.__init__   s    &w7	 ) 4Cw/ 	 4CYw/!==7"BGGIve/E
  (r%   r*   rl   rm   rn   c                     XR                  U R                  U5      X4S9S   -   nU R                  (       a$  XR                  U R                  U5      X$S9S   -   nXR	                  U R                  U5      5      -   nU$ )Nrm   rn   r   )rn   r   r   r   r   r   r   )r3   r*   rl   rm   rn   s        r&   r/   ResidualAttentionBlock.forward   sq     		$,,q/	HKK??OOD$6$6q$92OQRSTTAQ((r%   r   )Fr   )r   r   r    r!   r"   boolrf   r	   r   r   r/   r$   r6   r7   s   @r&   r   r      sk    ) )S )4 ) )(  $!%#' V v	
 4. r%   r   c            
       N   ^  \ rS rSrS\S\S\S\S\4
U 4S jjrS\4S	 jrS
rU =r	$ )AudioEncoder   r   r   rc   rd   n_layerc           	      @  > [         TU ]  5         [        XSSS9U l        [        X3SSSS9U l        U R                  S[        X#5      5        [        R                  " [        U5       Vs/ s H  n[        X45      PM     sn5      U l        [        U5      U l        g s  snf )Nry   r   )kernel_sizepaddingrI   )r   strider   positional_embedding)r.   rf   rC   conv1conv2register_bufferr[   r
   
ModuleListranger   blocksr(   ln_post)r3   r   r   rc   rd   r   _r4   s          r&   rf   AudioEncoder.__init__   s     	FAF
G!AqQ
3Yu5NO8:>CGnMn#G4nM9
 !) Ns   )Br*   c                    [         R                  " U R                  U5      5      n[         R                  " U R                  U5      5      nUR	                  SSS5      nUR
                  SS U R                  R
                  :X  d   S5       eXR                  -   R                  UR                  5      nU R                   H  nU" U5      nM     U R                  U5      nU$ )z\
x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
    the mel spectrogram of the audio
r   rI   r   Nzincorrect audio shape)r<   gelur   r   r   r}   r   r?   r2   r   r   )r3   r*   blocks      r&   r/   AudioEncoder.forward   s    
 FF4::a=!FF4::a=!IIaAwwqr{d77===V?VV=***..qww7[[EaA ! LLOr%   )r   r   r   r   )
r   r   r    r!   r"   rf   r	   r/   r$   r6   r7   s   @r&   r   r      sA    **"%*03*=@*KN*  r%   r   c            
       `   ^  \ rS rSrS\S\S\S\S\4
U 4S jjrSS\S	\S
\\   4S jjr	Sr
U =r$ )TextDecoder   r   r   rc   rd   r   c                   > [         TU ]  5         [        R                  " X5      U l        [        R
                  " [        R                  " X#5      5      U l        [        R                  " [        U5       Vs/ s H  n[        X4SS9PM     sn5      U l        [        U5      U l        [        R                  " X"5      R                  [         R"                  * 5      R%                  S5      nU R'                  SUSS9  g s  snf )NT)r   r   rm   F
persistent)r.   rf   r
   	Embeddingtoken_embedding	ParameterrN   emptyr   r   r   r   r   r(   lnfill_rL   inftriu_r   )	r3   r   r   rc   rd   r   r   rm   r4   s	           r&   rf   TextDecoder.__init__   s     	!||G=$&LLU1L$M!8: w'A 'wM'9
 G${{5(..w7==a@VTe<s   7C<r*   rl   rn   c                 $   U(       a/  [        [        UR                  5       5      5      R                  S   OSnU R	                  U5      U R
                  XDUR                  S   -    -   nUR                  UR                  5      nU R                   H  nU" XU R                  US9nM     U R                  U5      nU[        R                  " U R                  R                  R                  UR                  5      SS5      -  R                  5       nU$ )z
x : torch.LongTensor, shape = (batch_size, <= n_ctx)
    the text tokens
xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
    the encoded audio features to be attended on
r   r   rx   r   )nextitervaluesr}   r   r   r?   r2   r   rm   r   rN   r   r>   r0   )r3   r*   rl   rn   offsetr   logitss          r&   r/   TextDecoder.forward   s     <Dd8??,-.44Q7  #''!''"+1EFG 	
 DDN[[Ea$))h?A ! GGAJ 4 4 ; ; > >qww GANN
%' 	 r%   )r   r   r   r   r-   )r   r   r    r!   r"   rf   r	   r   r   r/   r$   r6   r7   s   @r&   r   r      sS    ==#&=14=>A=LO=& V x~  r%   r   c                   J  ^  \ rS rSrS\4U 4S jjrS\4S jrS\R                  4S jr
S\R                  S	\R                  4S
 jrS\R                  S\R                  S\\\R                  4   4S jr\S 5       r\S 5       r\S 5       rSS\\   4S jjr\r\r\rSrU =r$ )Whisper   dimsc                   > [         TU ]  5         Xl        [        U R                  R                  U R                  R
                  U R                  R                  U R                  R                  U R                  R                  5      U l	        [        U R                  R                  U R                  R                  U R                  R                  U R                  R                  U R                  R                  5      U l        ["        R$                  " U R                  R                  U R                  R                  ["        R&                  S9nSX R                  R                  S-  S & U R)                  SUR+                  5       SS9  g )Nr2   TrI   alignment_headsFr   )r.   rf   r   r   r   r   r   r   r   encoderr   r   r   r   r   r   decoderrN   zerosr   r   	to_sparse)r3   r   	all_headsr4   s      r&   rf   Whisper.__init__   s   	#IIII!!II##II""II##
 #IIII  II""II!!II""
 KKII""DII$9$9
	 48	))((A-/0.	0C0C0ERWXr%   dumpc                 |   [         R                  " [        R                  " [        R
                  " U5      5      [        S9R                  5       n[        R                  " U5      R                  U R                  R                  U R                  R                  5      nU R                  SUR                  5       SS9  g )Nr   r   Fr   )rL   
frombuffergzip
decompressbase64	b85decoder   copyrN   
from_numpyreshaper   r   r   r   r   )r3   r   arrayrm   s       r&   set_alignment_headsWhisper.set_alignment_heads  s    OOF,,T234

$& 	 &..II""DII$9$9
 	.0@USr%   melc                 $    U R                  U5      $ r-   )r   )r3   r   s     r&   embed_audioWhisper.embed_audio  s    ||C  r%   tokensaudio_featuresc                 $    U R                  X5      $ r-   )r   )r3   r   r   s      r&   r   Whisper.logits"  s    ||F33r%   r+   c                 B    U R                  X R                  U5      5      $ r-   )r   r   )r3   r   r   s      r&   r/   Whisper.forward%  s     ||FLL$566r%   c                 H    [        U R                  5       5      R                  $ r-   )r   
parametersdevicer3   s    r&   r   Whisper.device*  s    DOO%&---r%   c                 4    U R                   R                  S:  $ )Ni  )r   r   r   s    r&   is_multilingualWhisper.is_multilingual.  s    yy  E))r%   c                 `    U R                   R                  S-
  [        U R                  5      -
  $ )Ni5  )r   r   r"   r   r   s    r&   num_languagesWhisper.num_languages2  s'    yy  5(3t/C/C+DDDr%   cachec                    ^ ^^^ Tb  0 TEO0 m/ mUU 4S jmS[         R                  4UU4S jjnT R                  R                  U5        TT4$ )a@  
The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
tensors calculated for the previous positions. This method returns a dictionary that stores
all caches, and the necessary hooks for the key and value projection modules that save the
intermediate tensors to be reused during later calculations.

Returns
-------
cache : Dict[nn.Module, torch.Tensor]
    A dictionary object mapping the key/value projection modules to its cache
hooks : List[RemovableHandle]
    List of PyTorch RemovableHandle objects to stop the hooks to be called
c                    > U T;  d'  UR                   S   TR                  R                  :  a
  UTU '   TU    $ [        R                  " TU    U/SS9R                  5       TU '   TU    $ )Nr   rJ   )r}   r   r   rN   rR   r   )moduler   outputr   r3   s      r&   save_to_cache5Whisper.install_kv_cache_hooks.<locals>.save_to_cacheG  sk    U"fll1o		8L8L&L &f =  !&		5=&*Aq I P P Rf= r%   layerc                    > [        U [        5      (       aU  TR                  U R                  R	                  T5      5        TR                  U R
                  R	                  T5      5        g g r-   )
isinstancer]   appendrh   register_forward_hookri   )r  hooksr  s    r&   install_hooks5Whisper.install_kv_cache_hooks.<locals>.install_hooksO  sL    %!344UYY<<]KLU[[>>}MN 5r%   )r
   Moduler   apply)r3   r   r
  r	  r  s   `` @@r&   install_kv_cache_hooksWhisper.install_kv_cache_hooks6  sU     #.	5	B	!	O 	O 	O
 	=)e|r%   )r   r   r   r-   )r   r   r    r!   r   rf   bytesr   rN   r	   r   r   r   strr/   propertyr   r   r   r   r   r  detect_language_functionr   transcribe_functionr   decode_functionr   r$   r6   r7   s   @r&   r   r      s    Y_ Y2T T!u|| !4U\\ 45<< 47<<7).7	c5<<	 7
 . . * * E EHTN B /O$JFr%   r   )i'  )+r   r   
contextlibr   dataclassesr   typingr   r   r   r   numpyrL   rN   torch.nn.functionalr
   
functionalr<   r	   decodingr   r  r   r  r   r  r   r   ImportErrorRuntimeErrorOSErrorr   r(   r9   rC   r[   r`   r  r]   r   r   r   r   r   r%   r&   <module>r      s     % ! 2 2      / A 9@N 
 
 
8 8

RYY 

RYY 
N 1 1: :zRYY @299 B*")) *Z]bii ]Q 	\7+ #' Ns   D D*)D*