
    0;jiK                     Z   d dl Z d dlmZmZmZ  e j        e          Zd dlmZ d dl	Z	d dl
mc mZ d dl	mZmZ d dlmZmZmZmZ d dlmZmZmZmZmZmZ ddlmZ dd	lmZmZ dd
l m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)  e j        e          ZdefdZ* G d dej+                  Z,dS )    N)UnionOptionalList)tqdm)nnTensor)
LlamaModelLlamaConfig
GPT2Config	GPT2Model)LogitsProcessorList RepetitionPenaltyLogitsProcessorTemperatureLogitsWarperTopKLogitsWarperTopPLogitsWarperMinPLogitsWarper   )LearnedPositionEmbeddings)	T3CondEncT3Cond)T3Config)LLAMA_CONFIGS)T3HuggingfaceBackend)AlignmentStreamAnalyzer   )AttrDicttext_tokensc                 $   |                      d          }| |j        k                                                                    |k    s
J d            | |j        k                                                                    |k    s
J d            d S )Nr   zmissing start_text_tokenzmissing stop_text_token)sizestart_text_tokenintsumstop_text_token)r   hpBs      Q/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/t3/t3.py_ensure_BOT_EOTr'   "   s    A2..335599;;q@@@B\@@@2--224488::a???AZ?????    c                       e Zd ZdZd! fd	Zed             ZdefdZddded	e	j
        d
e	j
        defdZddded	e	j
        de	j
        d
e	j
        de	j
        f
dZded	e	j
        de	j
        d
e	j
        de	j
        f
dZ e	j                    dddddddddddddded	edee         dee         fd            Z e	j                    	 	 d"d             Z xZS )#T3a  
    Token-To-Token (T3) TTS model using huggingface transformer models as backbones,
        * tokenization, including start / stop tokens are always added externally to this class
        * conditioning data like CLAP, emotion, etc are all in a separate file for more modularity
        * careful! this class assumes relative positional encoding -- with absolute PE, we would at
            least want to reset the position to 0 when speech tokens begin, and optionally use a
            different PE embedding space for speech.
    Nc                    |t          j                    }t                                                       || _        t
          |j                 }|                    d          dk    | _        | j        r+t          di || _
        t          | j
                  | _        n*t          di || _
        t          | j
                  | _        | j
        j        | _        d| _        t%          |          | _        t)          j        |j        | j                  | _        t)          j        |j        | j                  | _        d | _        d | _        |j        dk    rK|j        dz   }t=          || j                  | _        |j        dz   dz   }t=          || j                  | _        t)          j         | j
        j        |j        d          | _!        t)          j         | j
        j        |j        | j                  | _"        d| _#        d S )N
model_typegpt2Flearnedr   )bias )$r   english_onlysuper__init__r$   r   llama_config_namegetis_gptr   cfgr   tfmrr
   r	   hidden_sizedimdeepspeed_patch_appliedr   cond_encr   	Embeddingtext_tokens_dict_sizetext_embspeech_tokens_dict_size
speech_embtext_pos_embspeech_pos_embinput_pos_embmax_text_tokensr   max_speech_tokensLinear	text_headspeech_headcompiled)selfr$   config_dictmax_text_seq_lenmax_mel_seq_len	__class__s        r&   r3   zT3.__init__2   s   :&((B#B$89!ool33v=; 	-!00K00DH!$(++DII"11[11DH"48,,DI8'',$ ""R%=txHH,r'A48LL !"y((!1A5 9:JDH U UD 2Q6:O";OTX"V"VD 48#79QX]^^^9TX%92;U\`\ghhhr(   c                 $    | j         j        j        S N)rI   weightdevice)rK   s    r&   rS   z	T3.deviceY   s    &--r(   t3_condc                     |j         U|j        N|                     |j                   |_        | j        s(|xj        |                     |j                   z  c_        |                     |          S )zk
        Token cond data needs to be embedded, so that needs to be here instead of in `T3CondEnc`.
        )cond_prompt_speech_tokenscond_prompt_speech_embrA   r6   rC   r<   )rK   rT   s     r&   prepare_conditioningzT3.prepare_conditioning]   sn     ,8W=[=c-1__W=^-_-_G*; i..$2E2EgFg2h2hh..}}W%%%r(           )
cfg_weightr   speech_tokensrZ   c                   |                      |          }|                     |          }|dk    r!| j        s|d                                          |                     |          }| j        j        dk    r0||                     |          z   }||                     |          z   }|	                    d          }|	                    d          |	                    d          k    r*|
                    |	                    d          dd          }t          j        d t          |||          D                       }	|	|fS )NrY   r   r.   r   c                 D    g | ]\  }}}t          j        |||f          S r0   )torchcat).0ceteses       r&   
<listcomp>z+T3.prepare_input_embeds.<locals>.<listcomp>   s>     
 
 
B Ir2rl##
 
 
r(   )rX   r?   r6   zero_rA   r$   rD   rB   rC   r   expandr_   stackzip)
rK   rT   r   r[   rZ   cond_embr?   rA   len_condembedss
             r&   prepare_input_embedszT3.prepare_input_embedsg   s>    ,,W55==--DKQK__]33
7 I--$"3"3K"@"@@H#d&9&9-&H&HHJ==##==x}}Q//// a(8(8"bAAX  
 
!(HjAA
 
 
   xr(   F)trainingtext_token_lensspeech_token_lensc                V   t          || j                   |                     |||          \  }}| j                            d |dd|           }	|	j        d         }
|                    d          }|                    d          }|
j        \  }}}|
j        |
j	        }}t          j        |||||          }t          j        |||||          }||}}t          |          D ]}|||                                         z   }||                    d          z   }|||                                         z   }|
|||f         ||d ||         f<   |
|||f         ||d ||         f<   |                     |          }|                     |          }t!          |||||
          S )N)rT   r   r[   T)	input_idsinputs_embedsoutput_hidden_statesreturn_dict	use_cacher]   r   dtyperS   )text_logitstext_latentsspeech_logitsspeech_latentshidden_states)r'   r$   rm   r8   forwardr}   r   shaperS   rx   r_   zerosrangeitemrH   rI   r   )rK   rT   r   ro   r[   rp   rn   rl   rk   tfmr_outr}   len_text
len_speechr%   _r:   rS   rx   rz   r|   ttlstlitext_endspeech_start
speech_endry   r{   s                               r&   r~   z
T3.forward   s    	TW---  44#' 5 
 
 9$$ !%#| % 
 
 !.r2 ##A&&"''**
!'	1c%,m.A{1h5PPPQ
CuVTTT"$5Sq 	S 	SA#a&++--/H#k&6&6q&9&99L%A5J'4Q8I5I'JLGSVG$)6q,z:Q7Q)RN1gs1vg:&& nn\22((88#%')'
 
 
 	
r(   c                   |                     d          }|                     d          }||                                k    sJ ||                                k    sJ |                     |||||d          }d}	|j        j        }
t          j        ||
          d         |dddf         k    }t          j        ||
          d         |dddf         k    }|                    ||	          }|                    ||	          }t          j	        |j        ||	          }t          j	        |j
        ||	          }||fS )ztraining methodr   T)rT   r   ro   r[   rp   rn   i)rS   N)ignore_index)r   maxr~   ry   rS   r_   arangemasked_fillFcross_entropyr{   )rK   rT   r   ro   r[   rp   r   r   out	IGNORE_IDrS   	mask_textmask_speechmasked_textmasked_speech	loss_textloss_speechs                    r&   losszT3.loss   sh    ##A&&"''**
?..000000.22444444ll#+'/  
 
 	'L&999$??STSTSTVZSZC[[	l:f===dCGXYZYZYZ\`Y`Gaa!--iCC%11+yIIOCO[yYYY	oc&7U^___+%%r(   r   T皙?ffffff?g?      ?333333?g      ?)initial_speech_tokensprepend_prompt_speech_tokensnum_return_sequencesmax_new_tokensstop_on_eos	do_sampletemperaturetop_pmin_plength_penaltyrepetition_penaltyrZ   r   r   c          	      	   |
J d            t          || j                   t          j        |                              t          j        | j                  }|-| j        j        t          j        |ddddf                   z  }| 	                    ||||          \  }}d| _
        | j
        sd}| j        j        rRt          | j        d|||                    d          z   fd| j        j        	          }|j        | j        j        k    sJ t#          | j        | j        | j        | j        |
          }|| _        d| _
        |j        }t          j        | j        j        ggt          j        |          }|                     |          }|| j                            d          z   }t          j        ||g          }t          j        ||gd          }|                                }g }t7          |
          }t9          |          }t7          |
          }t;          t=          |                    }|                     |ddddd          }|j        }tA          tC          |          dd          D ]3}|j"        dddddf         }|ddddf         } |ddddf         }!t          j#        || j        | j$                  }"| |"| |!z
  z  z   }#| j        j%        |#&                                dk    r|#'                    d          }#tQ          |d                   dk    r|d         )                                nd}$| j        j%        *                    |#|$          }#|dddf         }% ||%|#          }#|	dk    r|#|	z  }# ||%|#          }# ||%|#          }#t          j+        |#d          }&t          j,        |&d          }'|-                    |'           t          j        ||'gd          }|'.                    d          | j        j        k    r"t^          0                    d|dz                nn|                     |'          }(|(| j                            |dz             z   }(t          j        |(|(g          }(|                     |(|ddd          }|j        }5t          j        |d          })|)S )zY
        Args:
            text_tokens: a 1D (unbatched) or 2D (batched) tensor.
        Nznot implementedrw   r   rT   r   r[   rZ   Fr]   	   )text_tokens_slicealignment_layer_idxeos_idx)configllama
speech_encrI   alignment_stream_analyzerTr   r:   )r   )r   )penalty)rs   past_key_valuesrv   output_attentionsrt   ru   Sampling)descdynamic_ncolsr   )rS   rx   r   r]   )
next_token.r   num_samplesu4   ✅ EOS token detected! Stopping generation at step )rs   r   r   rt   ru   )1r'   r$   r_   
atleast_2dtolongrS   start_speech_token	ones_likerm   rJ   is_multilingualr   r8   r   stop_speech_tokenr   r   r7   rA   rI   patched_modeltensorrC   get_fixed_embeddingr`   cloner   r   r   floatr   r   r   logits	as_tensorrx   r   r:   	unsqueezelenr   stepsoftmaxmultinomialappendviewloggerinfo)*rK   rT   r   r   r   r   r   r   r   r   r   r   r   r   rZ   rl   rk   r   r   rS   	bos_token	bos_embedrs   generated_ids	predictedtop_p_warpermin_p_warperrepetition_penalty_processoroutputpastr   logits_stepconduncondr7   r   
last_tokenids_for_procprobsr   next_token_embedpredicted_tokenss*                                             r&   	inferencezT3.inference   si   8 ,335F333TW---&{3366UZPTP[6\\ !($(G$>Q\]^]^]^`bab`b]bQcAdAd$d!  44#/!	 5 
 
  } 	!(,%w& V,CI'/K<L<LR<P<P1P&Q() G5- - -) 18DG<UUUUU0xi? ,*C  M "/D DM& L47#=">!?uzZ`aaa	OOI..	 3 G G J JJ	 Iy)455	 	69"51=== "))	 (e444'e444'e444'GPUVhPiPi'j'j'j$ ##' "!% $ 
 
 % eN++*DQQQ 8	* 8	*A -2qqq1K 1aaa(D 1aaa(F/*T[
SSSCC4&=11F !;G::<<1$$#--a00F<?a@P<Q<QTU<U<U]5166888[_
+EJJ6^hJii )!S1L11,GGF c!!+- "\,77F!\,77F M&b111E*5a@@@JZ(((!I}j&AqIIIM r""dg&???XSTUVSVXXYYY  $z::/$2E2Y2YZ[^_Z_2`2``  %y*:<L)MNN ''. $"&%)  (  F )DD !9YA666r(     c           	      r   t                      }|dk    r(|dk    r"|                    t          |                     |dk    r"|                    t          |                     |dk     r"|                    t	          |                     |dk    r"|                    t          |                     | j        j        t          j	        |d d d df                   z  }	| 
                    |||	d          \  }
}g }|                     |
d          }|d         }|j        }|d d dd f         }|                     |          } ||	|d d dd d f                   }t          j        |d	          }t          j        |d
          }|                    |           |}t#          t%          |                    D ]!}|                     |          }|                     ||d          }|d         }|j        }|                     |          }t          j        |d	          } |||d d dd d f                   }t          j        |t-          d           k              rt/          d            nit          j        |d	          }t          j        |d
          }|                    |           |}t          j        || j        j        k              r n#t          j        |d	          }|                    d          dk    r$|d         | j        j        k    r|d d d df         }|S )Nr   r   r   rY   r   T)rs   rv   r]   r   r   )rs   r   rv   infzWarning: All logits are -infr   )r   r   r   r   r   r   r$   r   r_   r   rm   r8   r   rI   r   r   r   r   r   rA   r`   allr   printr   r   )rK   rT   r   r   top_kr   r   max_gen_lenlogits_processorsspeech_start_tokenrl   r   generated_speech_tokensllm_outputsr}   r   speech_hiddenr{   processed_logitsr   next_speech_tokencurrent_speech_tokencurrent_speech_embedrr   
all_tokenss                            r&   inference_turbozT3.inference_turbo  s    011??{c11$$%<[%I%IJJJ199$$%5e%<%<===3;;$$%5e%<%<===$$$$%EFX%Y%YZZZ "W7%/+VWVWVWY[Z[Y[V[J\:]:]]--#,	 . 
 
	 #%ii    
 

 $A%5%aaaf-((77,,-?qqqRTVWVWVWxAXYY	*333!-eCCC&&'89990eK(()) 	 	A#'??3G#H#H ))2 / $  K (NM)9O ,,];;M	"9qAAAI00M!!!RQRQRQR(<STTy)eEll]:;; 4555I.B777E % 1%Q G G G#**+<===#4 y*dg.GGHH  Y6A>>>
 ??1!!j&747;T&T&T#AAAssF+Jr(   rQ   )r   r   r   r   r   )__name__
__module____qualname____doc__r3   propertyrS   r   rX   r_   
LongTensorr   rm   r~   r   inference_moder   r   r   r   __classcell__)rO   s   @r&   r*   r*   (   s\        % % % % % %N . . X.&F & & & &           %	 
 '         L 7
 7
 7
 7
 %	7

 )7
 '7
 !+7
 7
 7
 7
r"& "& %	"&
 )"& '"& !+"& "& "& "&H U 15 8< )y  y  y  y  	y 
  (/y  '/v&6y  y  y  y v Ups$(K K K K K K K Kr(   r*   )-loggingtypingr   r   r   	getLoggerr   r   r   r_   torch.nn.functionalr   
functionalr   r   transformersr	   r
   r   r   &transformers.generation.logits_processr   r   r   r   r   r   modules.learned_pos_embr   modules.cond_encr   r   modules.t3_configr   llama_configsr   inference.t3_hf_backendr   #inference.alignment_stream_analyzerr   utilsr   r'   Moduler*   r0   r(   r&   <module>r     s    ( ( ( ( ( ( ( ( ( (		8	$	$                         G G G G G G G G G G G G                ? > > > > > / / / / / / / / ' ' ' ' ' ' ( ( ( ( ( ( 9 9 9 9 9 9 H H H H H H       
	8	$	$[ [ [ [ [B B B B B B B B B Br(   