
    sh%                         S r SSKJr  SSKrSSKrSSKr " S S5      r S
       SS jjr S
         SS jjrSS jr	S	 r
g)zJThis is an educational implementation of the byte pair encoding algorithm.    )annotationsNc                  n    \ rS rSrSS jrSSS jjrSS jrSS jrSS jr\	SS j5       r
\	S 5       rS	rg
)SimpleBytePairEncoding   c                   Xl         X l        UR                  5        VVs0 s H  u  p4XC_M	     snnU l        [        R
                  " U5      U l        gs  snnf )zCreates an Encoding object.N)pat_strmergeable_ranksitems_decoderregexcompile_pat)selfr   r	   token_bytestokens        i/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/tiktoken/_educational.py__init__SimpleBytePairEncoding.__init__   sL     .FUF[F[F]^F]0B+F]^MM'*	 _s   Ac                    U R                   R                  U5      n/ nU H9  nUR                  S5      n[        U R                  XbS9nUR                  U5        M;     U$ )zHEncodes a string into tokens.

>>> enc.encode("hello world")
[388, 372]
utf-8)	visualise)r   findallencode
bpe_encoder	   extend)r   textr   wordstokensword
word_bytesword_tokenss           r   r   SimpleBytePairEncoding.encode   sZ     		!!$'DW-J$T%9%9:[KMM+&	 
     c                :   ^  SR                  U 4S jU 5       5      $ )zVDecodes a list of tokens into bytes.

>>> enc.decode_bytes([388, 372])
b'hello world'
r#   c              3  B   >#    U  H  nTR                   U   v   M     g 7fNr   ).0r   r   s     r   	<genexpr>6SimpleBytePairEncoding.decode_bytes.<locals>.<genexpr>-   s     A&e,&s   )joinr   r   s   ` r   decode_bytes#SimpleBytePairEncoding.decode_bytes'   s     xxA&AAAr#   c                @    U R                  U5      R                  SSS9$ )u   Decodes a list of tokens into a string.

Decoded bytes are not guaranteed to be valid UTF-8. In that case, we replace
the invalid bytes with the replacement character "�".

>>> enc.decode([388, 372])
'hello world'
r   replaceerrors)r-   decoder,   s     r   r3   SimpleBytePairEncoding.decode/   s%       (//	/JJr#   c                J    U Vs/ s H  o R                   U   PM     sn$ s  snf )zDecodes a list of tokens into a list of bytes.

Useful for visualising how a string is tokenised.

>>> enc.decode_tokens_bytes([388, 372])
[b'hello', b' world']
r'   )r   r   r   s      r   decode_tokens_bytes*SimpleBytePairEncoding.decode_tokens_bytes:   s#     399&e$&999s    c                (    [        XUS9n[        X#S9$ )z#Train a BPE tokeniser on some data!)data
vocab_sizer   r   r	   )	bpe_trainr   )training_datar:   r   r	   s       r   trainSimpleBytePairEncoding.trainD   s     $W^_%gWWr#   c                    [        U [        5      (       a  [        R                  " U 5      n [	        U R
                  U R                  S9$ )Nr;   )
isinstancestrtiktokenget_encodingr   _pat_str_mergeable_ranks)encodings    r   from_tiktoken$SimpleBytePairEncoding.from_tiktokenJ   s=    h$$,,X6H%%%x7P7P
 	
r#   )r   r   r	   r   N)r   rB   r	   dict[bytes, int]returnNonecolour)r   rB   r   
str | NonerK   	list[int])r   rP   rK   bytes)r   rP   rK   rB   )r   rP   rK   list[bytes])r=   rB   r:   intr   rB   )__name__
__module____qualname____firstlineno__r   r   r-   r3   r6   staticmethodr>   rH   __static_attributes__ r#   r   r   r      sG    + B	K: X X
 
 
r#   r   c                   U Vs/ s H  n[        U/5      PM     nn U(       a#  US;   a  [        U5        OUS:X  a  [        U5        S nS n[        [	        US S USS  5      5       H2  u  pxU R                  US   US   -   5      n	U	c  M$  Ub  X:  d  M.  UnU	nM4     Uc  O"Uc   eUS U XE   XES-      -   /-   XES-   S  -   nM  U(       a
  [        5         U V
s/ s H  oU
   PM	     nn
U$ s  snf s  sn
f )NrN   colorsimple   r      )rQ   visualise_tokensprint	enumeratezipget)r	   inputr   bpartsmin_idxmin_rankipairrankpartr   s               r   r   r   S   s*    "''AUA3ZE'
// 'h&e  U3BZqr!;<GA"&&tAwa'89DX%5	 = """ hw5>EA+4F#F"GG%Z[P[P]J^^/ 2 056d#F6M= (: 7s   C*C/c           
       ^ US:  a  [        S5      e0 n[        S5       H  nXT[        U/5      '   M     [        R                  " X 5       VVs/ s H.  ofR                  S5       Vs/ s H  n[        U/5      PM     snPM0     nnn[        U5      U:  Ga  [        R                  " 5       mU H)  n	[        U	S S U	SS  5       H  n
TU
==   S-  ss'   M     M+     [        TU4S jS9nUS   US   -   n[        U5      nXU'   / nU H  n/ nSnU[        U5      S-
  :  aR  Xe   XeS-      4U:X  a  UR                  U5        US	-  nOUR                  Xe   5        US-  nU[        U5      S-
  :  a  MR  U[        U5      S-
  :X  a  UR                  Xe   5        UR                  U5        M     UnU(       a  [        S
US    SUS    35        [        SU S[        U5       S35        US;   a5  [        S5        [        US S  VVs/ s H  of  H  oPM     M     snn5        O(US:X  a"  [        S5        US S  H  n[        U5        M     [        S5        [        U5      U:  a  GM  U$ s  snf s  snnf s  snnf )N   z;vocab_size must be at least 256, so we can encode all bytesr   r_   r`   c                   > TU    $ r&   rZ   )xstatss    r   <lambda>bpe_train.<locals>.<lambda>   s	    E!Hr#   )keyr   ra   z The current most common pair is z + zSo we made z our zth tokenr\   z9Now the first fifty words in our training data look like:2   r^   z:Now the first twenty words in our training data look like:   
)
ValueErrorrangerQ   r   r   r   lencollectionsCounterre   maxappendrc   rb   )r9   r:   r   r   ranksrl   r   rh   r   piecerm   most_common_pairr   r   	new_wordsnew_wordrt   s                   @r   r<   r<   w   s    DVWWE4[eQCj  @E}}W?[ ?[t[[121s12?[ 
  
 e*z
!##%EE#2Jab	2dq  3  u*<=&q),<Q,??E
"k 	DHAc$i!m#GTa%[)-==OOK0FAOODG,FA c$i!m# CIM!(X&   45Ea5H4IM]^_M`LabcK}E#e*XFG//QR E#2J!QJDD5%D%J!QRh&RS!#2JD$K '$KW e*z
!Z Lc 	3 V "Rs   I-&I(<I-I3
(I-c                H   S Vs/ s H	  nSU S3PM     nnU  Vs/ s H  o3R                  SSS9PM     nnSnS nU HN  nX%[        U5      -     nX:X  a  X%S-   [        U5      -     nX:w  d   eUnU[        U5      -  n[        X-   S	S
9  MP     [        S5        g s  snf s  snf )N)         M   P   D      z[48;5;mr   r0   r1   r   r`    )endz[0m)r3   r}   rc   )	token_valuesrl   
backgroundrs   unicode_token_valuesrunning_length
last_colorr   r]   s	            r   rb   rb      s    /OP/O!L1%/OJP JVVAHHWYH?VNJ%C
O;< 2c*oEFE&&&
#e*$em$ & 
+! Q Ws
   BBc                 l   Sn [        [        5       nUR                  5       nS S S 5        [        R	                  WSU S9n[        S5        UR                  S5      nUR                  U5      S:X  d   eUR                  U5      S:X  d   eUR                  U5      SS/:X  d   eU$ ! , (       d  f       N= f)	NzN's|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+iX  )r:   r   zJThis is the sequence of merges performed in order to encode 'hello world':zhello worlds   hello worlds   hellos    world)
open__file__readr   r>   rc   r   r3   r-   r6   )gpt2_patternfr9   encr   s        r   train_simple_encodingr      s    ]  
h1vvx 
 !
&
&t\
&
RC	
VWZZ&F::f...F#~555""6*x.CCCCJ 
s   B%%
B3rM   )r	   rJ   rg   rQ   r   rO   rK   rP   )
r9   rB   r:   rS   r   rB   r   rO   rK   rJ   )r   rR   rK   rL   )__doc__
__future__r   r~   r   rC   r   r   r<   rb   r   rZ   r#   r   <module>r      s    P "   D
 D
P NV!%!.3!@J!!J GOB
BB),B9CBBJ(r#   