
    sh20                        S SK r S SKrS SKrS SKJrJr  S SKJrJr  S SK	J
r
JrJrJr  S SKr0 SS_SS_S	S
_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS _S!S"_S#S$_S%S&_0 S'S(_S)S*_S+S,_S-S._S/S0_S1S2_S3S4_S5S6_S7S8_S9S:_S;S<_S=S>_S?S@_SASB_SCSD_SESF_SGSH_E0 SISJ_SKSL_SMSN_SOSP_SQSR_SSST_SUSV_SWSX_SYSZ_S[S\_S]S^_S_S`_SaSb_ScSd_SeSf_SgSh_SiSj_E0 SkSl_SmSn_SoSp_SqSr_SsSt_SuSv_SwSx_SySz_S{S|_S}S~_SS_SS_SS_SS_SS_SS_SS_E0 SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_ESSSSSSSSSSSSSSSS.Er0 \R!                  5        V Vs0 s H  u  pX_M	     snn ESSSSSSSS7S7SSSS.Er\ " S S5      5       r\" SS9SS\S\4S jj5       r\" SS9SSSS.S\S\S\\   S\\   S\4
S jj5       rgs  snn f )    N)	dataclassfield)cached_property	lru_cache)DictListOptionalTupleenenglishzhchinesedegermanesspanishrurussiankokoreanfrfrenchjajapanesept
portuguesetrturkishplpolishcacatalannldutchararabicsvswedishititalianid
indonesianhihindififinnishvi
vietnamesehehebrewuk	ukrainianelgreekmsmalaycsczechroromaniandadanishhu	hungariantatamilno	norwegianththaiururduhrcroatianbg	bulgarianlt
lithuanianlalatinmimaoriml	malayalamcywelshskslovaktetelugufapersianlvlatvianbnbengalisrserbianazazerbaijanisl	slovenianknkannadaetestonianmk
macedonianbrbretoneubasqueis	icelandichyarmeniannenepalimn	mongolianbsbosniankkkazakhsqalbanianswswahiliglgalicianmrmarathipapunjabisisinhalakmkhmersnshonayoyorubasosomaliaf	afrikaansococcitankageorgianbe
belarusiantgtajiksdsindhigugujaratiamamharicyiyiddishlolaouzuzbekfofaroesehtzhaitian creolepspashtotkturkmennnnynorskmtmaltesesanskritluxembourgishmyanmartibetantagalogmalagasyassamesetatarhawaiianlingalahausabashkirjavanese	sundanese	cantonese)salbmybotlmgastthawlnhabajwsuyuer   r   )burmese	valencianflemishhaitianletzeburgeschpushtopanjabi	moldavianmoldovan	sinhalese	castilianmandarinc                      \ rS rSr% Sr\R                  \S'   \\S'   Sr	\
\   \S'   Sr\
\   \S'   Sr\\   \S	'   \" \S
9r\\\4   \S'   S rS rS\\   S\4S jrS\\   S\4S jr\S\4S j5       r\S\4S j5       r\S\4S j5       r\S\4S j5       r\S\4S j5       r\S\4S j5       r\S\4S j5       r\S\4S j5       r \S\4S j5       r!\S\4S j5       r"S r#\S\\   4S j5       r$\S\\   4S j5       r%\S\\   4S j5       r&\S\\   4S  j5       r'S!\\   4S" jr(S!\\   4S# jr)S!\\   4S$ jr*S%r+g)&	Tokenizer   zIA thin wrapper around `tiktoken` providing quick access to special tokensencodingnum_languagesNlanguagetask sot_sequence)default_factoryspecial_tokensc                 B   U R                   R                   H,  nU R                   R                  U5      nX R                  U'   M.     U R                  S   nU R                  S   nU R                  S   n[	        [
        R                  5       5      S U R                   nU/nU R                  b0  UR                  US-   UR                  U R                  5      -   5        U R                  b%  U R                  S:X  a  UOUnUR                  U5        [	        U5      U l        g )N<|startoftranscript|><|translate|><|transcribe|>   
transcribe)r   special_tokens_setencode_single_tokenr   tuple	LANGUAGESkeysr   r   appendindexr   r   )	selfspecialspecial_tokensot	translater   langsr   
task_tokens	            e/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/whisper/tokenizer.py__post_init__Tokenizer.__post_init__   s    }}77G MM==gFM+8( 8 &&'>?,,_=	--.>?
inn&'(<$*<*<=u==$a%++dmm*D DE99 ,0II,Ej9J
+!,/    c                 <    U R                   R                  " U40 UD6$ N)r   encode)r   textkwargss      r   r  Tokenizer.encode   s    }}##D3F33r   	token_idsreturnc                     U Vs/ s H  o3U R                   :  d  M  UPM     nnU R                  R                  " U40 UD6$ s  snf r  )timestamp_beginr   decode)r   r  r  ts       r   r  Tokenizer.decode   sA     )F	11E1E-EQ		F}}##I888 Gs
   AAc                 <    U R                   R                  " U40 UD6$ )z
Timestamp tokens are above other special tokens' id range and are ignored by `decode()`.
This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
)r   r  )r   r  r  s      r   decode_with_timestamps Tokenizer.decode_with_timestamps   s    
 }}##I888r   c                 .    U R                   R                  $ r  )r   	eot_tokenr   s    r   eotTokenizer.eot   s    }}&&&r   c                      U R                   S   $ )Nr   r   r  s    r   r   Tokenizer.transcribe   s    ""#344r   c                      U R                   S   $ )Nr   r  r  s    r   r   Tokenizer.translate       ""?33r   c                      U R                   S   $ )Nr   r  r  s    r   r   Tokenizer.sot   s    ""#:;;r   c                      U R                   S   $ )N<|startoflm|>r  r  s    r   sot_lmTokenizer.sot_lm   r  r   c                      U R                   S   $ )N<|startofprev|>r  r  s    r   sot_prevTokenizer.sot_prev   s    ""#455r   c                      U R                   S   $ )N<|nospeech|>r  r  s    r   	no_speechTokenizer.no_speech   s    "">22r   c                      U R                   S   $ )N<|notimestamps|>r  r  s    r   no_timestampsTokenizer.no_timestamps   s    ""#566r   c                      U R                   S   $ )Nz<|0.00|>r  r  s    r   r
  Tokenizer.timestamp_begin   s    "":..r   c                 h    U R                   c  [        S5      eU R                  U R                   5      $ )zGReturns the token id corresponding to the value of the `language` fieldz6This tokenizer does not have language token configured)r   
ValueErrorto_language_tokenr  s    r   language_tokenTokenizer.language_token   s/     == UVV%%dmm44r   c                 r    U R                   R                  SU S3S 5      =n(       a  U$ [        SU S35      e)N<||>z	Language z not found in tokenizer.)r   getKeyError)r   r   tokens      r   r2  Tokenizer.to_language_token   sC    ''++b
",=tDD5DL8*,DEFFr   c                     / nU R                   R                  5        H1  u  p#UR                  S5      [        ;   d  M   UR	                  U5        M3     [        U5      S U R                   $ )N<|>)r   itemsstripr   r   r   r   )r   resultr:  token_ids       r   all_language_tokensTokenizer.all_language_tokens   sY    #2288:OE{{5!Y.h'  ; V}1t1122r   c                 B   ^  [        U 4S jT R                   5       5      $ )Nc              3   f   >#    U  H&  nTR                  U/5      R                  S 5      v   M(     g7f)r=  N)r  r?  ).0_lr   s     r   	<genexpr>/Tokenizer.all_language_codes.<locals>.<genexpr>   s,     W>VT[["&,,U33>Vs   .1)r   rB  r  s   `r   all_language_codesTokenizer.all_language_codes   s    Wd>V>VWWWr   c                 Z    [        [        U R                  5      U R                  /-   5      $ r  )r   listr   r,  r  s    r   #sot_sequence_including_notimestamps-Tokenizer.sot_sequence_including_notimestamps   s&    T$++,0B0B/CCDDr   c                 $   [        S5      nUSR                  5       -  n[        S5      n[        S U 5       5      (       d   eU R                  R                  S5      S   U R                  R                  S5      S   1nU[        U5      -    Hm  nU R                  R                  U5      U R                  R                  SU-   5      4 H-  n[        U5      S	:X  d  XB;   d  M  UR                  US   5        M/     Mo     [        [        U5      5      $ )
uM  
Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

- ♪♪♪
- ( SPEAKING FOREIGN LANGUAGE )
- [DAVID] Hey there,

keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c              3   ^   #    U  H#  nS [        U5      s=:*  =(       a    S:*  Os  v   M%     g7f)i@&  i&  N)ord)rF  cs     r   rH  .Tokenizer.non_speech_tokens.<locals>.<genexpr>  s$     E}!6SV--v--}s   +-z -r   z ' r   )
rM  splitsetallr   r  lenaddr   sorted)r   symbolsmiscellaneousr@  symboltokenss         r   non_speech_tokensTokenizer.non_speech_tokens   s     =>Z``b	
 34E}EEEEE --&&t,Q/1E1Ed1KA1NO] 33F$$V,$$S6\2 v;!#v'>JJvay) 4 VF^$$r   r_  c                 f    U R                   S;   a  U R                  U5      $ U R                  U5      $ )N>   r   r   r   rG   r   r   )r   split_tokens_on_unicodesplit_tokens_on_spaces)r   r_  s     r   split_to_word_tokensTokenizer.split_to_word_tokens  s3    ==AA //77**622r   c                 :   U R                  U5      nSn/ n/ n/ nSnU Hx  nUR                  U5        U R                  U5      n	X9;  d  X'U	R                  U5      -      U:X  d  MF  UR                  U	5        UR                  U5        / nU[        U	5      -  nMz     XE4$ )Nu   �r   )r  r   r   rY  )
r   r_  decoded_fullreplacement_charwordsword_tokenscurrent_tokensunicode_offsetr:  decodeds
             r   rc  !Tokenizer.split_tokens_on_unicode  s    226:#E!!%(11.AG !/?O1P PQ#$ W%"">2!##g,.  !!r   c                    U R                  U5      u  p#/ n/ n[        X#5       H  u  pgUS   U R                  :  nUR                  S5      n	UR	                  5       [
        R                  ;   n
U(       d  U	(       d  U
(       d  [        U5      S:X  a$  UR                  U5        UR                  U5        M  US   U-   US'   US   R                  U5        M     XE4$ )Nr   rU  )
rc  zipr  
startswithr?  stringpunctuationrY  r   extend)r   r_  subwordssubword_tokens_listrj  rk  subwordsubword_tokensr   
with_spaceru  s              r   rd   Tokenizer.split_tokens_on_spaces7  s    (,(D(DV(L%'*8'I#G$Q'4883G ++C0J!--/V-?-??K*s5zQW%"">2!"I/b	B&&~6 (J !!r   )r   ),__name__
__module____qualname____firstlineno____doc__tiktokenEncoding__annotations__intr   r	   strr   r   r
   r   dictr   r   r   r  r   r  r  r   r  r   r   r   r   r$  r(  r,  r
  r3  r2  rB  rJ  rN  r`  re  rc  rd  __static_attributes__r   r   r   r   r      sW   S"Hhsm"D(3-!L%*!%*4%@NDcN@0&49S	 9 99S	 9 9 'S ' ' 5C 5 5 43 4 4 <S < < 4 4 4 6# 6 6 33 3 3 7s 7 7 / / / 5 5 5G 3U3Z 3 3 XE#J X X EU3Z E E !%5: !% !%F349 3"d3i "2"T#Y "r   r   )maxsizec   namer   c                    [         R                  R                  [         R                  R                  [        5      SU  S35      nS [        U5       5        VVs0 s H%  u  p4[        R                  " U5      [        U5      _M'     nnn[        U5      n0 nSS/[        [        R                  5       5      S U  Vs/ s H	  nSU S3PM     snQSPS	PS
PSPSPSP[        S5       V	s/ s H  n	SU	S-  S S3PM     sn	Qn
U
 H  nXgU'   US-  nM     [        R                  " [         R                  R!                  U5      USUUS9$ s  snnf s  snf s  sn	f )Nassetsz	.tiktokenc              3   R   #    U  H  o(       d  M  UR                  5       v   M     g 7fr  )rV  )rF  lines     r   rH  get_encoding.<locals>.<genexpr>O  s     N5ETLDJJLL5Es   
''z<|endoftext|>r   r6  r7  r   r   r  r#  r'  r+  i  g{Gz?z.2fr   zJ's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)r  explicit_n_vocabpat_strmergeable_ranksr   )ospathjoindirname__file__openbase64	b64decoder  rY  rM  r   r   ranger  r  basename)r  r   
vocab_pathr:  rankranksn_vocabr   langispecialss              r   get_encodingr  J  s   bggooh7dV9CUVJ OT*5ENNKE 	T*N 
  %jGN 	 %))9$:>M$J	K$JDBtfB-$J	K 		
 	 	 	 	 	 */t	5ABq4xnB
	5H  'u1  WWj) a% 1 
L 
6s   ,E<E(E)r   r   r   multilingualr   r   r  c                    Ub<  UR                  5       nU[        ;  a"  U[        ;   a
  [        U   nO[        SU 35      eU (       a  SnU=(       d    SnU=(       d    SnOSnS nS n[	        XAS9n[        XQX#S9$ )NzUnsupported language: r  r   r   gpt2)r  r   )r   r   r   r   )lowerr   TO_LANGUAGE_CODEr1  r  r   )r  r   r   r   encoding_namer   s         r   get_tokenizerr  n  s     >>#9$+++H5 #9(!DEE&#t#|LH r   )r  r  )r  r  rt  dataclassesr   r   	functoolsr   r   typingr   r   r	   r
   r  r   r>  r  r   r  r  r  boolr  )coder   s   00r   <module>r     s    	  ( 0 . . e)e)e 	(e 	)	e
 	)e 	(e 	(e 	*e 	,e 	)e 	(e 	)e 	'e 	(e 	)e  	)!e" 	,#e$ 	'%e& 	)'e( 	,)e* 	(+e, 	+-e. 	'/e0 	'1e2 	'3e4 	*5e6 	(7e8 	+9e: 	';e< 	+=e> 	&?e@ 	&AeB 	*CeD 	+EeF 	,GeH 	'IeJ 	'KeL 	+MeN 	'OeP 	(QeR 	(SeT 	)UeV 	)WeX 	)YeZ 	)[e\ 	-]e^ 	+_e` 	)aeb 	*ced 	,eef 	(geh 	(iej 	+kel 	*men 	(oep 	+qer 	)set 	(uev 	*wex 	)yez 	*{e| 	)}e~ 	)e@ 	)AeB 	'CeD 	'EeF 	(GeH 	(IeJ 	+KeL 	)MeN 	*OeP 	,QeR 	'SeT 	(UeV 	*WeX 	)YeZ 	)[e\ 	%]e^ 	'_e` 	)aeb 	
ced 	(eef 	)geh 	)iej 	)kel 











Ie	P,5OO,=>,=.$x~,=> " C" C" C"L 4 s  C    F 4 "  sm	
 3-  y ?s   G