
    0;ji                     h    d dl mZ d dlZd dlmZ d dlmZmZmZmZ d dl	m
Z
  G d dee          ZdS )    )OptionalN)nn)LlamaConfig
LlamaModelLlamaPreTrainedModelGenerationMixin)!CausalLMOutputWithCrossAttentionsc                        e Zd ZdZdddddededdf fdZ ej                    	 	 dd	ej	        d
ej	        de
fd            Z ej                    	 	 	 	 	 ddej	        deej	                 fd            Z xZS )T3HuggingfaceBackendz
    Override some HuggingFace interface methods so we can use the standard `generate` method with our
    custom embedding / logit layers.

    NOTE: need to extend "*PreTrainedModel" to avoid re-initializing weights!
    N)latents_queuelogits_queuealignment_stream_analyzerconfigllamar   AlignmentStreamAnalyzerc                    t                                          |           || _        || _        || _        d| _        || _        d S )NF)super__init__model
speech_encspeech_head_added_condr   )	selfr   r   r   r   r   r   r   	__class__s	           f/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/t3/inference/t3_hf_backend.pyr   zT3HuggingfaceBackend.__init__   sI     	   
$& )B&&&    	input_idsdecoder_cond	use_cachec                 `   |sd}||ddddf         }|                      |          }| j        sy|J |                    d          |                    d          k    r*|                    |                    d          dd          }t	          j        ||gd          }d| _        |||dS )a9  
        This is a method used by huggingface's generate() method.
        Overridden here to apply our custom speech token embedding layer.

        :param input_ids: (B, S) int64 tensors of input tokens.
        :param decoder_cond: (B, T, C) float32 tensor of conditioning (prefixed to <input_embeds>)
        Nr      )dimT)inputs_embedspast_key_valuesr   )r   r   sizeexpandtorchcat)r   r   r   r   r%   cache_positionr$   s          r   prepare_inputs_for_generationz2T3HuggingfaceBackend.prepare_inputs_for_generation#   s      	#"O&!!!!RSS&)I 	22  	$"...  ##}'9'9!'<'<<<+22=3E3Ea3H3H"bQQ!I|]&CKKKM#D +."
 
 	
r   TFr$   r%   c                 <   |                     d          dk    }|duot          |          dk    }|r|rJ |sJ |sJ |                     |||||d          }	|	j        d         }
|                     |
          }t          ||	j        |	j        |	j                  S )a+  
        This is a method used by huggingface's generate() method.
        Overridden here to apply our custom layer norm and speech logit projection layers.

        :param inputs_embeds: (B, S, C) float32 tensor of conditioning inputs. If past key values are given,
        S should be 1.
        r"   Nr   T)r$   r%   r   output_attentionsoutput_hidden_statesreturn_dictr!   )logitsr%   hidden_states
attentions)r&   lenr   r1   r   r	   r%   r2   )r   r$   r%   r   r-   r.   r/   is_large_input	has_cachetfmr_outr1   r0   s               r   forwardzT3HuggingfaceBackend.forwardH   s    " '++A..!3#4/LC4H4H14L	"1y111{####::'+/!5  
 
 !.r2!!-00 1$4"0*	
 
 
 	
r   )NN)NTFTT)__name__
__module____qualname____doc__r   r   r   r(   inference_modeTensorboolr+   r   r7   __classcell__)r   s   @r   r   r   	   s4         =AC C CC C $=C C C C C C$ Udh"
 "
"
5:\"
NR"
 "
 "
 "
H U 15!+
 +
|+
 "%,/+
 +
 +
 +
 +
 +
 +
 +
r   r   )typingr   r(   r   transformersr   r   r   r   transformers.modeling_outputsr	   r    r   r   <module>rD      s                 W W W W W W W W W W W W K K K K K Kk
 k
 k
 k
 k
/ k
 k
 k
 k
 k
r   