
    0;ji                         d dl mZ d dlmZ d dlZd dlmZmZ ddlmZ ddl	m
Z
 e G d d	                      Z G d
 dej                  ZdS )    )	dataclass)OptionalN)nnTensor   )	Perceiver)T3Configc                       e Zd ZU dZeed<   dZee         ed<   dZee         ed<   dZ	ee         ed<   dZ
ee         ed<   ddd	d
Zd Zedd            ZdS )T3Condz
    Dataclass container for most / all conditioning info.
    TODO: serialization methods aren't used, keeping them around for convenience
    speaker_embNclap_embcond_prompt_speech_tokenscond_prompt_speech_embg      ?emotion_advdevicedtypec          	      B   | j                                         D ]\  }}t          j        |          rkt	          |                    d          d                                                   t          u}t          | ||	                    ||r|nd                     | S )zJCast to a device and dtype. Dtype casting is ignored for long/int tensors.r   Nr   )
__dict__itemstorch	is_tensortypeviewitemintsetattrto)selfr   r   kvis_fps         _/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/t3/modules/cond_enc.pyr   z	T3Cond.to   s    M'')) 	V 	VDAqq!! VQVVBZZ]//1122#=aVE;S55t!T!TUUU    c                 :    t          j        | j        |           d S )N)r   saver   )r    fpaths     r$   r'   zT3Cond.save    s    
4=%(((((r%   cpuc                 H    t          j        | |d          }t          di |S )NT)map_locationweights_only )r   loadr   )r(   r+   kwargss      r$   r.   zT3Cond.load#   s,    E4PPPr%   )r)   )__name__
__module____qualname____doc__r   __annotations__r   r   r   r   r   r   r'   staticmethodr.   r-   r%   r$   r   r      s          
 !%Hhv%%%26x/666/3HV,333$'K&!'''t     ) ) )       \     r%   r   c                   4     e Zd ZdZdef fdZdefdZ xZS )	T3CondEnczb
    Handle all non-text conditioning, like speaker embeddings / prompts, CLAP, emotion, etc.
    hpc                    t                                                       || _        |j        dk    r%t	          j        |j        |j                  | _        n!t          t          |j                            d | _        |j        r!t	          j        d|j        d          | _        d | _        |j        rt                      | _        d S d S )Nvoice_encoderr   F)bias)super__init__r8   encoder_typer   Linearspeaker_embed_size
n_channelsspkr_encNotImplementedErrorstremotion_adv_fcr   	perceiveruse_perceiver_resamplerr   )r    r8   	__class__s     r$   r=   zT3CondEnc.__init__.   s    ?o--Ib&;R]KKDMM%c"/&:&:;;; #> 	J"$)Ar}5"I"I"ID % 	)&[[DNNN	) 	)r%   condc                 F   |j         d u |j        d u k    s
J d            |                     |j                            d| j        j                            d d d f         }t          j        |d d d df                   }|j	        
J d            |}|j        }||}n!| j        j
        r|                     |          }|}| j        j        r8|j        J |                     |j                            ddd                    }t          j        ||||fd          }|S )Nz+no embeddings for cond_prompt_speech_tokensr   r   zclap_embed not implementedr   )dim)r   r   rB   r   r   r8   r@   r   
zeros_liker   rG   rF   r   rE   cat)r    rI   	cond_spkrempty	cond_clapr   cond_emotion_advcond_embedss           r$   forwardzT3CondEnc.forward@   sl   .$6D<W[_<_```9 a`` MM$"2"7"7DG<V"W"WXXYZYZYZ\`Y`a	 111bqb5!122 }$$&B$$$	 "&!<!)%*""W, 	L%)^^4J%K%K" !7 	T#///#2243C3H3HQPQ3R3RSS i"	!

    r%   )	r0   r1   r2   r3   r	   r=   r   rS   __classcell__)rH   s   @r$   r7   r7   )   si         )8 ) ) ) ) ) )$!F ! ! ! ! ! ! ! !r%   r7   )dataclassesr   typingr   r   r   r   rF   r   	t3_configr	   r   Moduler7   r-   r%   r$   <module>rY      s    ! ! ! ! ! !                                                 :8 8 8 8 8	 8 8 8 8 8r%   