
    /;ji                        d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
c mZ d dlm
Z
 ddlmZ ddlmZmZ ddlmZ 	 	 	 	 dZdej        dededededefdZ	 d[dZd Zd Z G d de
j                  Zd\dZd]dZd^dedeej         ef         defd Z!d!ej        d"eej        eej                 f         d#eej        ej        f         fd$Z" G d% d&e
j                  Z# G d' d(e
j                  Z$ G d) d*e
j                  Z% G d+ d,e
j                  Z& G d- d.e
j                  Z' G d/ d0e
j                  Z( G d1 d2e
j                  Z) G d3 d4e
j                  Z* G d5 d6e
j                  Z+ G d7 d8e
j                  Z, G d9 d:e
j                  Z- G d; d<e
j                  Z. G d= d>e
j                  Z/ G d? d@e
j                  Z0 G dA dBe
j                  Z1 G dC dDe
j                  Z2 G dE dFe
j                  Z3 G dG dHe
j                  Z4 G dI dJe
j                  Z5dK Z6 G dL dMe
j                  Z7 G dN dOe
j                  Z8 G dP dQe
j                  Z9 G dR dSe
j                  Z: G dT dUe
j                  Z; G dV dWe
j                  Z< G dX dYe
j                  Z=dS )_    N)ListOptionalTupleUnion)nn   )	deprecate   )FP32SiLUget_activation)	AttentionF'  	timestepsembedding_dimflip_sin_to_cosdownscale_freq_shiftscale
max_periodc                    t          | j                  dk    s
J d            |dz  }t          j        |           t	          j        d|t          j        | j                  z  }|||z
  z  }t	          j        |          }| dddf         	                                |dddf         z  }||z  }t	          j
        t	          j        |          t	          j        |          gd          }|r0t	          j
        |dd|df         |ddd|f         gd          }|dz  dk    r%t          j        j                            |d	          }|S )
a  
    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
    embeddings. :return: an [N x dim] Tensor of positional embeddings.
    r
   zTimesteps should be a 1d-arrayr   r   )startenddtypedeviceNdim)r   r
   r   r   )lenshapemathlogtorcharangefloat32r   expfloatcatsincosr   
functionalpad)	r   r   r   r   r   r   half_dimexponentembs	            U/root/voice-cloning/.venv/lib/python3.11/site-packages/diffusers/models/embeddings.pyget_timestep_embeddingr/      sg     y1$$$&F$$$!H$$$u|XU]9;K( ( ( H 8&::;H
)H

C
AAAtG

"
"
$
$s47|
3C #+C )UYs^^UYs^^4"
=
=
=C  HiQQQ		\*C9H9,=>BGGG qAh!%%c<88J          ?   c                 4   t          |t                    r||f}t          j        |d         t          j                  |d         |z  z  |z  }t          j        |d         t          j                  |d         |z  z  |z  }t          j        ||          }t          j        |d          }|                    dd|d         |d         g          }t          | |          }	|r2|dk    r,t          j	        t          j
        || g          |	gd          }	|	S )z
    grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
    [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    r   r   r
   axisr   )
isinstanceintnpr"   r#   meshgridstackreshape!get_2d_sincos_pos_embed_from_gridconcatenatezeros)
	embed_dim	grid_size	cls_tokenextra_tokensinterpolation_scale	base_sizegrid_hgrid_wgrid	pos_embeds
             r.   get_2d_sincos_pos_embedrJ   F   s    )S!! +	*	Yy|2:666)A,:RSViiFYy|2:666)A,:RSViiF;vv&&D8Dq!!!D<<Ay|Yq\:;;D1)TBBI ]\A%%NBHlI-F$G$G#SZ[\\\	r0   c                     | dz  dk    rt          d          t          | dz  |d                   }t          | dz  |d                   }t          j        ||gd          }|S )Nr   r    embed_dim must be divisible by 2r
   r5   )
ValueError!get_1d_sincos_pos_embed_from_gridr9   r>   )r@   rH   emb_hemb_wr-   s        r.   r=   r=   \   sm    1};<<< .i1nd1gFFE-i1nd1gFFE
.%a
0
0
0CJr0   c                 r   | dz  dk    rt          d          t          j        | dz  t          j                  }|| dz  z  }dd|z  z  }|                    d          }t          j        d	||          }t          j        |          }t          j        |          }t          j        ||gd
          }|S )zu
    embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
    r   r   rL   r4   g       @r1   r   r   zm,d->mdr
   r5   )	rM   r9   r"   float64r<   einsumr'   r(   r>   )r@   posomegaoutemb_sinemb_cosr-   s          r.   rN   rN   h   s     1};<<<Ii1nBJ777E	Y_E%,E
++b//C
)IsE
*
*CfSkkGfSkkG
.'7+!
4
4
4CJr0   c                   F     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 d fd	Zd Zd Z xZS )
PatchEmbedz:2D Image to Patch Embedding with support for SD3 cropping.   r2         FTr
   sincosNc                    t                                                       ||z  ||z  z  }|| _        || _        || _        t          j        ||||f||          | _        |rt          j        |dd          | _	        nd | _	        || _
        ||z  ||z  c| _        | _        ||z  | _        |	| _        |r|}nt          |dz            }|
	d | _        d S |
dk    rtt#          ||| j        | j                  }|rdnd}|                     d	t'          j        |                                                              d
          |           d S t/          d|
           )N)kernel_sizestridebiasFgư>)elementwise_affineeps      ?r^   )rE   rD   TrI   r   )
persistentzUnsupported pos_embed_type: )super__init__flatten
layer_normpos_embed_max_sizer   Conv2dproj	LayerNormnorm
patch_sizeheightwidthrE   rD   r8   rI   rJ   register_bufferr!   
from_numpyr%   	unsqueezerM   )selfrq   rr   rp   in_channelsr@   rj   ri   rb   rD   pos_embed_typerk   num_patchesrA   rI   rf   	__class__s                   r.   rh   zPatchEmbed.__init__   s    	+0CD$"4IZ0HQ[bf
 
 
	  	Y5dSSSDIIDI$"(J"68KTZ:-#6   	.*IIK,--I!!DNNNx''/9TXTl  I "4>J  e.>y.I.I.O.O.Q.Q.[.[\].^.^ku vvvvvLNLLMMMr0   c                    | j         t          d          || j        z  }|| j        z  }|| j         k    rt          d| d| j          d          || j         k    rt          d| d| j          d          | j         |z
  dz  }| j         |z
  dz  }| j                            d| j         | j         d	          }|dd|||z   |||z   ddf         }|                    dd	|j        d	                   }|S )
z2Crops positional embeddings for SD3 compatibility.Nz.`pos_embed_max_size` must be set for cropping.zHeight (z/) cannot be greater than `pos_embed_max_size`: .zWidth (r   r
   r   )rk   rM   rp   rI   r<   r   )rv   rq   rr   topleftspatial_pos_embeds         r.   cropped_pos_embedzPatchEmbed.cropped_pos_embed   sK   "*MNNN4?*(D+++l6llRVRilll   4***j%jjPTPgjjj   &/A5'%/A5 N221d6MtOfhjkk-aaasV|1CTDSXLEXZ[Z[Z[.[\-55a=N=TUW=XYY  r0   c                 @   | j         |j        dd          \  }}n*|j        d         | j        z  |j        d         | j        z  }}|                     |          }| j        r)|                    d                              dd          }| j        r|                     |          }| j        |	                    |j
                  S | j         r|                     ||          }n| j        |k    s| j        |k    rt          | j        j        d         ||f| j        | j                  }t#          j        |                                                              d          	                    |j                  }n| j        }||z   	                    |j
                  S )Nr   r   r
   )r@   rA   rE   rD   r   )rk   r   rp   rm   ri   	transposerj   ro   rI   tor   r   rq   rr   rJ   rE   rD   r!   rt   r%   ru   r   )rv   latentrq   rr   rI   s        r.   forwardzPatchEmbed.forward   s   "."L-MFEE"L,?bAQUYUdAdEF6""< 	7^^A&&00A66F? 	'YYv&&F>!99V\***" 	+..vu==II{f$$
e(;(;3"n226%uo"n(,(@	  	 ",Y77==??II!LLOOPVP]^^		 N	"&&v|444r0   )r[   r[   r2   r\   r]   FTTr
   r^   N)__name__
__module____qualname____doc__rh   r   r   __classcell__rz   s   @r.   rZ   rZ   }   s        DD 1N 1N 1N 1N 1N 1Nf! ! !.5 5 5 5 5 5 5r0   rZ   Tc                    |\  }}t          j        |d         |d         |d         dt           j                  }t          j        |d         |d         |d         dt           j                  }t          j        ||          }t          j        |d          }|                    ddg|j        dd                   }t          | ||          }	|	S )	a  
    RoPE for image tokens with 2d structure.

    Args:
    embed_dim: (`int`):
        The embedding dimension size
    crops_coords (`Tuple[int]`)
        The top-left and bottom-right coordinates of the crop.
    grid_size (`Tuple[int]`):
        The grid size of the positional embedding.
    use_real (`bool`):
        If True, return real part and imaginary part separately. Otherwise, return complex numbers.

    Returns:
        `torch.Tensor`: positional embdding with shape `( grid_size * grid_size, embed_dim/2)`.
    r   F)endpointr   r
   r5   r   Nuse_real)r9   linspacer#   r:   r;   r<   r   !get_2d_rotary_pos_embed_from_grid)
r@   crops_coordsrA   r   r   stoprF   rG   rH   rI   s
             r.   get_2d_rotary_pos_embedr      s    " KE4[q47IaL5PRPZ[[[F[q47IaL5PRPZ[[[F;vv&&D8Dq!!!D<<A/
122/00D1)THUUUIr0   c                    | dz  dk    sJ t          | dz  |d                             d          |          }t          | dz  |d                             d          |          }|rLt          j        |d         |d         gd          }t          j        |d         |d         gd          }||fS t          j        ||gd          }|S )N   r   r   r   r   r
   r   )get_1d_rotary_pos_embedr<   r!   r&   )r@   rH   r   rO   rP   r(   r'   r-   s           r.   r   r     s    q=A $INDGOOB4G4GRZ[[[E#INDGOOB4G4GRZ[[[E iq58,!444iq58,!444CxiA...
r0        @r   rT   thetac                 t   t          |t                    rt          j        |          }d|t	          j        d| d          d| dz                                           | z  z  z  }t	          j        |                              |j                  }t	          j	        ||                                          }|rV|
                                                    dd          }|                                                    dd          }||fS t	          j        t	          j        |          |          }|S )ay  
    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end
    index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64
    data type.

    Args:
        dim (`int`): Dimension of the frequency tensor.
        pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar
        theta (`float`, *optional*, defaults to 10000.0):
            Scaling factor for frequency computation. Defaults to 10000.0.
        use_real (`bool`, *optional*):
            If True, return real part and imaginary part separately. Otherwise, return complex numbers.

    Returns:
        `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
    r1   r   r   Nr
   r   )r7   r8   r9   r"   r!   r%   rt   r   r   outerr(   repeat_interleaver'   polar	ones_like)	r   rT   r   r   freqst	freqs_cos	freqs_sin	freqs_ciss	            r.   r   r     s   & #s inn5U\!S!44\q\BHHJJSPQRE  ..AK5!!''))E IIKK11!1;;	IIKK11!1;;	)##K 6 6>>	r0   xr   returnc                    |\  }}|d         }|d         }|                     | j                  |                     | j                  }} | j        g | j        dd         ddR                      d          \  }}t          j        | |gd                              d          }|                                 |z  |                                |z  z                        | j	                  }|S )a3  
    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
    tensors contain rotary embeddings and are returned as real tensors.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
    )NNNr   r   r   r\   )
r   r   r<   r   unbindr!   r;   ri   r%   r   )r   r   r(   r'   x_realx_imag	x_rotatedrV   s           r.   apply_rotary_embr   6  s    $ HC
j/C
j/Cvvah!1!1CQY44b4!444;;B??NFFfWf-2666>>qAAI7799s?Y__..44
8
8
A
ACJr0   c                   T     e Zd Z	 	 	 	 	 ddededededee         f
 fd	Zdd
Z xZS )TimestepEmbeddingsiluNTrw   time_embed_dimact_fnout_dimpost_act_fnc                 t   t                                                       t          j        |||          | _        |t          j        ||d          | _        nd | _        t          |          | _        ||}n|}t          j        |||          | _        |	d | _	        d S t          |          | _	        d S )NF)rb   )
rg   rh   r   Linearlinear_1	cond_projr   actlinear_2post_act)
rv   rw   r   r   r   r   cond_proj_dimsample_proj_biastime_embed_dim_outrz   s
            r.   rh   zTimestepEmbedding.__init__U  s     		+~?OPP$Y}kNNNDNN!DN!&))!(!/	.2DFVWW DMMM*;77DMMMr0   c                     |||                      |          z   }|                     |          }| j        |                     |          }|                     |          }| j        |                     |          }|S N)r   r   r   r   r   )rv   sample	conditions      r.   r   zTimestepEmbedding.forwardu  sw     dnnY777Fv&&8XXf%%Fv&&=$]]6**Fr0   )r   NNNTr   )	r   r   r   r8   strr   rh   r   r   r   s   @r.   r   r   T  s        
 %)8 88 8 	8
 8 c]8 8 8 8 8 8@       r0   r   c                   2     e Zd Zdededef fdZd Z xZS )	Timestepsnum_channelsr   r   c                 r    t                                                       || _        || _        || _        d S r   )rg   rh   r   r   r   )rv   r   r   r   rz   s       r.   rh   zTimesteps.__init__  s7    (.$8!!!r0   c                 J    t          || j        | j        | j                  }|S )N)r   r   )r/   r   r   r   )rv   r   t_embs      r.   r   zTimesteps.forward  s2    & 0!%!:	
 
 
 r0   )	r   r   r   r8   boolr%   rh   r   r   r   s   @r.   r   r     sb        9S 94 9W\ 9 9 9 9 9 9      r0   r   c                   6     e Zd ZdZ	 d
dedef fdZd	 Z xZS )GaussianFourierProjectionz-Gaussian Fourier embeddings for noise levels.   r1   TFembedding_sizer   c                 D   t                                                       t          j        t	          j        |          |z  d          | _        || _        || _        |r>t          j        t	          j        |          |z  d          | _	        | j	        | _        d S d S )NF)requires_grad)
rg   rh   r   	Parameterr!   randnweightr    r   W)rv   r   r   set_W_to_weightr    r   rz   s         r.   rh   z"GaussianFourierProjection.__init__  s     	l5;~#>#>#FV[\\\. 	!\%+n"="="EUZ[[[DF&DKKK		! 	!r0   c                    | j         rt          j         |          }|d d d f         | j        d d d f         z  dz  t          j        z  }| j        r=t          j        t          j        |          t          j        |          gd          }n<t          j        t          j        |          t          j        |          gd          }|S )Nr   r   r   )	r    r!   r   r9   pir   r&   r(   r'   )rv   r   x_projrV   s       r.   r   z!GaussianFourierProjection.forward  s    8 		!A111d7dk$'22Q6> 	L)UYv..	&0A0ABKKKCC)UYv..	&0A0ABKKKC
r0   )r   r1   TTF)	r   r   r   r   r8   r%   rh   r   r   r   s   @r.   r   r     sn        77 ns! !!!05! ! ! ! ! !
 
 
 
 
 
 
r0   r   c                   4     e Zd ZdZddedef fdZd Z xZS )SinusoidalPositionalEmbeddinga[  Apply positional information to a sequence of embeddings.

    Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to
    them

    Args:
        embed_dim: (int): Dimension of the positional embedding.
        max_seq_length: Maximum sequence length to apply positional embeddings

        r@   max_seq_lengthc                    t                                                       t          j        |                              d          }t          j        t          j        d|d          t          j        d           |z  z            }t          j        d||          }t          j	        ||z            |dd d dd df<   t          j
        ||z            |dd d dd df<   |                     d|           d S )Nr
   r   r   r   pe)rg   rh   r!   r"   ru   r$   r   r    r?   r'   r(   rs   )rv   r@   r   positiondiv_termr   rz   s         r.   rh   z&SinusoidalPositionalEmbedding.__init__  s    <//99!<<9U\!Y::tx?P?P>PS\>\]^^[NI668h#6771aaaA:8h#6771aaaA:T2&&&&&r0   c                 H    |j         \  }}}|| j        d d d |f         z   }|S r   )r   r   )rv   r   _
seq_lengths       r.   r   z%SinusoidalPositionalEmbedding.forward  s2    7:q;J;''r0   )r   r   r   r   r   r8   rh   r   r   r   s   @r.   r   r     si        	 	' '# 's ' ' ' ' ' '      r0   r   c                   :     e Zd ZdZdedededef fdZd Z xZS )ImagePositionalEmbeddingsa  
    Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
    height and width of the latent space.

    For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092

    For VQ-diffusion:

    Output vector embeddings are used as input for the transformer.

    Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE.

    Args:
        num_embed (`int`):
            Number of embeddings for the latent pixels embeddings.
        height (`int`):
            Height of the latent image i.e. the number of height embeddings.
        width (`int`):
            Width of the latent image i.e. the number of width embeddings.
        embed_dim (`int`):
            Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings.
    	num_embedrq   rr   r@   c                 :   t                                                       || _        || _        || _        || _        t          j        | j        |          | _        t          j        | j        |          | _	        t          j        | j        |          | _
        d S r   )rg   rh   rq   rr   r   r@   r   	Embeddingr-   
height_emb	width_emb)rv   r   rq   rr   r@   rz   s        r.   rh   z"ImagePositionalEmbeddings.__init__  sz     	
""<	::,t{I>>dj)<<r0   c                 D   |                      |          }|                     t          j        | j        |j                                      d| j                            }|                    d          }|                     t          j        | j	        |j                                      d| j	                            }|                    d          }||z   }|                    d| j        | j	        z  d          }||d d d |j
        d         d d f         z   }|S )Nr   r
   r   r   )r-   r   r!   r"   rq   r   viewru   r   rr   r   )rv   indexr-   r   r   pos_embs         r.   r   z!ImagePositionalEmbeddings.forward  s   hhuoo__U\$+el%S%S%S%X%XYZ\`\g%h%hii
  ))!,,
NN5<
5<#P#P#P#U#UVWY]Yc#d#dee	 ''**	y( ,,q$+
":B??GAAA~1~qqq011
r0   r   r   s   @r.   r   r     s|         .== = 	=
 = = = = = =$      r0   r   c                   B     e Zd ZdZ fdZddZddej        fdZ xZ	S )LabelEmbeddinga7  
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.

    Args:
        num_classes (`int`): The number of classes.
        hidden_size (`int`): The size of the vector embeddings.
        dropout_prob (`float`): The probability of dropping a label.
    c                     t                                                       |dk    }t          j        ||z   |          | _        || _        || _        d S Nr   )rg   rh   r   r   embedding_tablenum_classesdropout_prob)rv   r   hidden_sizer   use_cfg_embeddingrz   s        r.   rh   zLabelEmbedding.__init__  sT    (1,!|K:K,K[YY&(r0   Nc                     |0t          j        |j        d         |j                  | j        k     }nt          j        |dk              }t          j        || j        |          }|S )zB
        Drops labels to enable classifier-free guidance.
        Nr   r   r
   )r!   randr   r   r   tensorwherer   )rv   labelsforce_drop_idsdrop_idss       r.   
token_dropzLabelEmbedding.token_drop   s_     !z&,q/&-HHH4K\\HH|Na$788HXt'7@@r0   r   c                     | j         dk    }| j        r|s||                     ||          }|                     |          }|S r   )r   trainingr   r   )rv   r   r   use_dropout
embeddingss        r.   r   zLabelEmbedding.forward+  sQ    '!+M 	=k 	=~/I__V^<<F))&11
r0   r   )
r   r   r   r   rh   r   r!   
LongTensorr   r   r   s   @r.   r   r     sy         ) ) ) ) )	 	 	 	 e.        r0   r   c            	       ^     e Zd Z	 	 	 	 ddedededef fdZd	ej        d
ej        fdZ xZS )TextImageProjection   r]   
   text_embed_dimimage_embed_dimcross_attention_dimnum_image_text_embedsc                     t                                                       || _        t          j        || j        |z            | _        t          j        ||          | _        d S r   )rg   rh   r  r   r   image_embeds	text_proj)rv   r
  r  r  r  rz   s        r.   rh   zTextImageProjection.__init__4  sX     	%:"Iot7QTg7ghh>3FGGr0   text_embedsr  c                     |j         d         }|                     |          }|                    || j        d          }|                     |          }t          j        ||gd          S )Nr   r   r
   r   )r   r  r<   r  r  r!   r&   )rv   r  r  
batch_sizeimage_text_embedss        r.   r   zTextImageProjection.forwardA  so     &q)
 !--l;;-55j$B\^`aa nn[11y+[9qAAAAr0   )r  r]   r]   r	  	r   r   r   r8   rh   r!   Tensorr   r   r   s   @r.   r  r  3  s         #"#&%'H HH H !	H
  #H H H H H H
B5< 
Bu| 
B 
B 
B 
B 
B 
B 
B 
Br0   r  c                   J     e Zd Z	 	 	 d	dededef fdZdej        fdZ xZS )
ImageProjectionr]   r   r  r  r  c                     t                                                       || _        t          j        || j        |z            | _        t          j        |          | _        d S r   )rg   rh   r  r   r   r  rn   ro   )rv   r  r  r  rz   s       r.   rh   zImageProjection.__init__O  sV     	%:"Iot7QTg7ghhL!455			r0   r  c                     |j         d         }|                     |          }|                    || j        d          }|                     |          }|S )Nr   r   )r   r  r<   r  ro   )rv   r  r  s      r.   r   zImageProjection.forward[  sV    !'*
 ((66#++J8RTVWWyy..r0   )r]   r]   r   r  r   s   @r.   r  r  N  s          ##&%'	
6 
6
6 !
6  #	
6 
6 
6 
6 
6 
6EL        r0   r  c                   6     e Zd Zd fd	Zdej        fdZ xZS )IPAdapterFullImageProjectionr  c                     t                                                       ddlm}  |||dd          | _        t          j        |          | _        d S Nr
   FeedForwardgelu)multactivation_fn)rg   rh   	attentionr   ffr   rn   ro   )rv   r  r  r   rz   s       r.   rh   z%IPAdapterFullImageProjection.__init__f  s]    ******+o/BZ`aaaL!455			r0   r  c                 R    |                      |                     |                    S r   )ro   r%  )rv   r  s     r.   r   z$IPAdapterFullImageProjection.forwardm  s     yy..///r0   )r  r  r   r   r   rh   r!   r  r   r   r   s   @r.   r  r  e  sZ        6 6 6 6 6 60EL 0 0 0 0 0 0 0 0r0   r  c                   6     e Zd Zd fd	Zdej        fdZ xZS )IPAdapterFaceIDImageProjectionr  r
   c                     t                                                       ddlm} || _        || _         ||||z  |d          | _        t          j        |          | _	        d S r  )
rg   rh   r$  r   
num_tokensr  r%  r   rn   ro   )rv   r  r  r"  r+  r   rz   s         r.   rh   z'IPAdapterFaceIDImageProjection.__init__r  sr    ******$#6 +o/BZ/OVZjpqqqL!455			r0   r  c                     |                      |          }|                    d| j        | j                  }|                     |          S )Nr   )r%  r<   r+  r  ro   )rv   r  r   s      r.   r   z&IPAdapterFaceIDImageProjection.forward{  s=    GGL!!IIb$/4+CDDyy||r0   )r  r  r
   r
   r'  r   s   @r.   r)  r)  q  sZ        6 6 6 6 6 6EL        r0   r)  c                   (     e Zd Zd fd	ZddZ xZS )CombinedTimestepLabelEmbeddings皙?c                     t                                                       t          ddd          | _        t	          d|          | _        t          |||          | _        d S )Nr   Tr
   r   r   r   rw   r   )rg   rh   r   	time_projr   timestep_embedderr   class_embedder)rv   r   r   class_dropout_probrz   s       r.   rh   z(CombinedTimestepLabelEmbeddings.__init__  sa    "T`abbb!2sS`!a!a!a,[-I[\\r0   Nc                     |                      |          }|                     |                    |                    }|                     |          }||z   }|S Nr4   )r3  r4  r   r5  )rv   timestepclass_labelshidden_dtypetimesteps_projtimesteps_embconditionings          r.   r   z'CombinedTimestepLabelEmbeddings.forward  sZ    11..~/@/@|/@/T/TUU**<88$|3r0   )r/  r   r   r   r   rh   r   r   r   s   @r.   r.  r.    sW        ] ] ] ] ] ]       r0   r.  c                   $     e Zd Z fdZd Z xZS )"CombinedTimestepTextProjEmbeddingsc                     t                                                       t          ddd          | _        t	          d|          | _        t          ||d          | _        d S )Nr   Tr   r1  r2  r   )r   )rg   rh   r   r3  r   r4  PixArtAlphaTextProjectiontext_embedder)rv   r   pooled_projection_dimrz   s      r.   rh   z+CombinedTimestepTextProjEmbeddings.__init__  se    "T`abbb!2sS`!a!a!a67Lmdjkkkr0   c                     |                      |          }|                     |                    |j                            }|                     |          }||z   }|S r8  )r3  r4  r   r   rD  )rv   r9  pooled_projectionr<  r=  pooled_projectionsr>  s          r.   r   z*CombinedTimestepTextProjEmbeddings.forward  sa    11..~/@/@GXG^/@/_/_``!//0ABB$'99r0   r?  r   s   @r.   rA  rA    sL        l l l l l      r0   rA  c            	       8     e Zd Zddedededef fdZd Z xZS )	HunyuanDiTAttentionPoolNspacial_dimr@   	num_heads
output_dimc                    t                                                       t          j        t	          j        |dz   |          |dz  z            | _        t          j        ||          | _        t          j        ||          | _	        t          j        ||          | _
        t          j        ||p|          | _        || _        d S )Nr
   re   )rg   rh   r   r   r!   r   positional_embeddingr   k_projq_projv_projc_projrL  )rv   rK  r@   rL  rM  rz   s        r.   rh   z HunyuanDiTAttentionPool.__init__  s    $&L[1_i1X1X[dfi[i1i$j$j!i	955i	955i	955i	:+BCC"r0   c           
         |                     ddd          }t          j        |                    dd          |gd          }|| j        d d d d d f                             |j                  z   }t          j        di d|d d         d|d	|d
|j	        d         d| j
        d| j        j        d| j        j        d| j        j        dd dt          j        | j        j        | j        j        | j        j        g          dd dd ddddd| j        j        d| j        j        ddd| j        dd\  }}|                    d          S )Nr
   r   r   Tr   keepdimr   querykeyvalueembed_dim_to_checkr   rL  q_proj_weightk_proj_weightv_proj_weightin_proj_weightin_proj_biasbias_kbias_vadd_zero_attnF	dropout_pout_proj_weightout_proj_biasuse_separate_proj_weightr  need_weights )permuter!   r&   meanrO  r   r   Fmulti_head_attention_forwardr   rL  rQ  r   rP  rR  rb   rS  r  squeeze)rv   r   r   s      r.   r   zHunyuanDiTAttentionPool.forward  s   IIaAIqvv!Tv22A6A>>>)!!!T111*588AAA- 
 
 
BQB%%

 !
  !wr{{	

 nn
 +,,
 +,,
 +,,
  4
 DK$4dk6FHX#YZZZ
 4
 4
  %
 a
 !K..
  +**!
" &*T#
$ ]]%
& '
1* yy||r0   r   r   r   r   r8   rh   r   r   r   s   @r.   rJ  rJ    sn        # #C #C #C #UX # # # # # #      r0   rJ  c                   (     e Zd Zd fd	ZddZ xZS )	-HunyuanCombinedTimestepTextSizeStyleEmbeddingr  r      c                 L   t                                                       t          ddd          | _        t	          d|          | _        t          ||d|          | _        t          j	        d|          | _
        d	|z   |z   }t          ||d
z  |d          | _        d S )Nr   Tr   r1  r2     )rL  rM  r
      r   	silu_fp32)in_featuresr   out_featuresr   )rg   rh   r   r3  r   r4  rJ  poolerr   r   style_embedderrC  extra_embedder)rv   r   rE  seq_lenr  extra_in_dimrz   s         r.   rh   z6HunyuanCombinedTimestepTextSizeStyleEmbedding.__init__  s    "T`abbb!2sS`!a!a!a-(ABW
 
 
 !l1m<<.1FF7$%)&	
 
 
r0   Nc                    |                      |          }|                     |                    |                    }|                     |          }t	          |                    d          ddd          }|                    |          }|                    dd          }|                     |          }	t          j        |||	gd          }
|| 	                    |
          z   }|S )	Nr4   r   r   Tr   rt  r
   r   )
r3  r4  r   rx  r/   r   ry  r!   r&   rz  )rv   r9  encoder_hidden_statesimage_meta_sizestyler;  r<  r=  rH  style_embedding
extra_condr>  s               r.   r   z5HunyuanCombinedTimestepTextSizeStyleEmbedding.forward  s    11..~/@/@|/@/T/TUU "[[)>?? 11E1Eb1I1I3PTVWXX),,<,@@)..r7;; --e44 Y 2O_U[\]]]
$t':'::'F'FFr0   )r  r   rq  r   r?  r   s   @r.   rp  rp    sQ        
 
 
 
 
 
&       r0   rp  c                   4     e Zd Zddededef fdZd Z xZS )TextTimeEmbedding@   encoder_dimr   rL  c                 
   t                                                       t          j        |          | _        t          ||          | _        t          j        ||          | _        t          j        |          | _	        d S r   )
rg   rh   r   rn   norm1AttentionPoolingpoolr   rm   norm2)rv   r  r   rL  rz   s       r.   rh   zTextTimeEmbedding.__init__  sc    \+..
$Y<<	Ik>::	\.11


r0   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S r   )r  r  rm   r  )rv   hidden_statess     r.   r   zTextTimeEmbedding.forward  sL    

=11		-00		-00

=11r0   )r  rn  r   s   @r.   r  r    sf        2 2C 2 2 2 2 2 2 2 2      r0   r  c                   R     e Zd Zd
dededef fdZdej        dej        fd	Z xZS )TextImageTimeEmbeddingr]   rt  r
  r  r   c                     t                                                       t          j        ||          | _        t          j        |          | _        t          j        ||          | _        d S r   )rg   rh   r   r   r  rn   	text_norm
image_proj)rv   r
  r  r   rz   s       r.   rh   zTextImageTimeEmbedding.__init__
  sT    >>BBn55)O^DDr0   r  r  c                     |                      |          }|                     |          }|                     |          }||z   S r   )r  r  r  )rv   r  r  time_text_embedstime_image_embedss        r.   r   zTextImageTimeEmbedding.forward  sG    >>+66>>*:;; !OOL99 #333r0   )r]   r]   rt  r  r   s   @r.   r  r  	  s        E Es E3 E^a E E E E E E45< 4u| 4 4 4 4 4 4 4 4r0   r  c                   @     e Zd Zddedef fdZdej        fdZ xZS )	ImageTimeEmbeddingr]   rt  r  r   c                     t                                                       t          j        ||          | _        t          j        |          | _        d S r   )rg   rh   r   r   r  rn   
image_normrv   r  r   rz   s      r.   rh   zImageTimeEmbedding.__init__  sA    )O^DD,~66r0   r  c                 Z    |                      |          }|                     |          }|S r   )r  r  )rv   r  r  s      r.   r   zImageTimeEmbedding.forward!  s-     OOL99 OO,=>>  r0   r]   rt  r  r   s   @r.   r  r    sm        7 7 73 7 7 7 7 7 7
!EL ! ! ! ! ! ! ! !r0   r  c                   N     e Zd Zd	dedef fdZdej        dej        fdZ xZS )
ImageHintTimeEmbeddingr]   rt  r  r   c                 P   t                                                       t          j        ||          | _        t          j        |          | _        t          j        t          j        dddd          t          j	                    t          j        dddd          t          j	                    t          j        ddddd          t          j	                    t          j        dddd          t          j	                    t          j        ddddd          t          j	                    t          j        dddd          t          j	                    t          j        dd	ddd          t          j	                    t          j        d	d
dd                    | _
        d S )Nr\   r2   r
   )paddingr   r   )r  ra   `   r   r   )rg   rh   r   r   r  rn   r  
Sequentialrl   SiLUinput_hint_blockr  s      r.   rh   zImageHintTimeEmbedding.__init__)  sL   )O^DD,~66 "IaQ***GIIIb"a+++GIIIb"a1555GIIIb"a+++GIIIb"a1555GIIIb"a+++GIIIb#q!A666GIIIc1a+++!
 !
r0   r  hintc                     |                      |          }|                     |          }|                     |          }||fS r   )r  r  r  )rv   r  r  r  s       r.   r   zImageHintTimeEmbedding.forward?  sE     OOL99 OO,=>>$$T** $&&r0   r  r  r   s   @r.   r  r  (  sv        
 
 
3 
 
 
 
 
 
,'EL ' ' ' ' ' ' ' ' 'r0   r  c                   &     e Zd Zd fd	Zd Z xZS )r  Nc                    t                                                       || _        t          j        t          j        d|          |dz  z            | _        t          j        ||| j                  | _	        t          j        ||| j                  | _
        t          j        ||| j                  | _        || _        || j        z  | _        d S )Nr
   re   r4   )rg   rh   r   r   r   r!   r   rO  r   rP  rQ  rR  rL  dim_per_head)rv   rL  r@   r   rz   s       r.   rh   zAttentionPooling.__init__J  s    
$&LQ	1J1JYX[^1[$\$\!i	9DJGGGi	9DJGGGi	9DJGGG"%7r0   c                 H    |                                 \  }} fd}|                    dd           j                            |j                  z   }t          j        ||gd          } |                     |                    } |                     |                    } | 	                    |                    }dt          j        t          j         j                            z  }	t          j        d||	z  ||	z            }
t          j        |
                                d                              |
j                  }
t          j        d|
|          }|                    dd                              dd	          }|d d d
d d f         S )Nc                     |                      dj        j                  } |                     dd          } |                     j        z  dj                  } |                     dd          } | S )Nr   r
   r   )r   rL  r  r   r<   )r   bsrv   s    r.   r   z'AttentionPooling.forward.<locals>.shapeW  si    r2t~t/@AAAAq!!A		"t~-r43DEEAAq!!AHr0   r
   TrU  r   zbct,bcs->btsr   zbts,bcs->bctr   r   )sizerj  rO  r   r   r!   r&   rQ  rP  rR  r   sqrtr  rS   softmaxr%   typer<   r   )rv   r   lengthrr   r   class_tokenqkvr   r   ar  s   `           @r.   r   zAttentionPooling.forwardT  s   FFHHFE		 		 		 		 		 		 ffDf11D4M4P4PQRQX4Y4YYI{A&A... E$++k**++E$++a..!!E$++a..!! DIdi(9::;;;na%iUCCv||~~2666;;FLII L33 IIb"a  **1a00Aqqqzr0   r   r?  r   s   @r.   r  r  G  sL        8 8 8 8 8 8" " " " " " "r0   r  c                    |j         dd         \  }}dt          j        |           | z  z  }|d                             |j        |j                  }||                    d          z  }t          j        |                                |	                                fd          }|
                    dd	d
dd                              ||| dz  dz            }|S )z
    Args:
        embed_dim: int
        box: a 3-D tensor [B x N x 4] representing the bounding boxes for GLIGEN pipeline
    Returns:
        [B x N x embed_dim] tensor of positional embeddings
    Nr   d   )NNN)r   r   r   r   r   r
   r\   r   )r   r!   r"   r   r   r   ru   r;   r'   r(   ri  r<   )r@   boxr  	num_boxesr-   s        r.   #get_fourier_embeds_from_boundingboxr  y  s      IbqbMJ	
%,y))I5
6C


"
"#*CI
"
F
FC
b!!
!C
+swwyy#'')),"
5
5
5C
++aAq!
$
$
,
,ZIPQMTUDU
V
VCJr0   c                   2     e Zd Zd fd	Z	 	 	 	 	 ddZ xZS )GLIGENTextBoundingboxProjection	text-onlyrs  c           
         t                                                       || _        || _        || _        |dz  dz  | _        t          |t                    r|d         }|dk    rt          j	        t          j
        | j        | j        z   d          t          j                    t          j
        dd          t          j                    t          j
        d|                    | _        t          j                            t          j        | j        g                    | _        n|dk    rt          j	        t          j
        | j        | j        z   d          t          j                    t          j
        dd          t          j                    t          j
        d|                    | _        t          j	        t          j
        | j        | j        z   d          t          j                    t          j
        dd          t          j                    t          j
        d|                    | _        t          j                            t          j        | j        g                    | _        t          j                            t          j        | j        g                    | _        t          j                            t          j        | j        g                    | _        d S )Nr   r   r   r     z
text-image)rg   rh   positive_lenr   fourier_embedder_dimposition_dimr7   tupler   r  r   r  linearsr!   r   r?   null_positive_featurelinears_textlinears_imagenull_text_featurenull_image_featurenull_position_feature)rv   r  r   feature_typefourier_freqsrz   s        r.   rh   z(GLIGENTextBoundingboxProjection.__init__  s;   ($1!)A-1gu%% 	!ajG;&&=	$+d.??EE			#s##			#w'' DL */););EKIZH[<\<\)])]D&&\)) "	$+d.??EE			#s##			#w''! !D "$	$+d.??EE			#s##			#w''" "D &+X%7%7TEVDW8X8X%Y%YD"&+h&8&8dFWEX9Y9Y&Z&ZD#%*X%7%7TEVDW8X8X%Y%Y"""r0   Nc                 6   |                     d          }t          | j        |          }| j                            ddd          }	||z  d|z
  |	z  z   }|V| j                            ddd          }
||z  d|z
  |
z  z   }|                     t          j        ||gd                    }n|                     d          }|                     d          }| j	                            ddd          }| j
                            ddd          }||z  d|z
  |z  z   }||z  d|z
  |z  z   }|                     t          j        ||gd                    }|                     t          j        ||gd                    }t          j        ||gd          }|S )Nr   r
   r   )ru   r  r  r  r   r  r  r!   r&   r  r  r  r  )rv   boxesmaskspositive_embeddingsphrases_masksimage_masksphrases_embeddingsimage_embeddingsxyxy_embedding	xyxy_nullpositive_nullobjs	text_null
image_null	objs_text
objs_images                   r.   r   z'GLIGENTextBoundingboxProjection.forward  s    ## =T=VX]^^ .33Aq"==	 (%/1u9	2II * 6;;Aq"EEM #6"=Um@["[<<	+>*OUW X X XYYDD *33B77M%//33K .33Aq"==I055aB??J "4m!Cq=GX\eFe!e/+=[T^@^^))%)5G4X^`*a*a*abbI++EI7G6X^`,a,a,abbJ9i4!<<<Dr0   )r  rs  )NNNNNr?  r   s   @r.   r  r    si        'Z 'Z 'Z 'Z 'Z 'ZZ !0 0 0 0 0 0 0 0r0   r  c                   0     e Zd ZdZddef fdZd Z xZS ))PixArtAlphaCombinedTimestepSizeEmbeddingsz
    For PixArt-Alpha.

    Reference:
    https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
    Fuse_additional_conditionsc                 L   t                                                       || _        t          ddd          | _        t          d|          | _        || _        |rEt          ddd          | _        t          d|          | _	        t          d|          | _
        d S d S )Nr   Tr   r1  r2  )rg   rh   outdimr   r3  r   r4  r  additional_condition_projresolution_embedderaspect_ratio_embedder)rv   r   size_emb_dimr  rz   s       r.   rh   z2PixArtAlphaCombinedTimestepSizeEmbeddings.__init__  s    ""T`abbb!2sS`!a!a!a)B&$ 	i-6CY]tu-v-v-vD*'8SYe'f'f'fD$):s[g)h)h)hD&&&	i 	ir0   c                 X   |                      |          }|                     |                    |                    }| j        r|                     |                                                              |          }|                     |                              |d          }|                     |                                                              |          }	|                     |	                              |d          }	|t          j
        ||	gd          z   }
n|}
|
S )Nr4   r   r
   r   )r3  r4  r   r  r  ri   r  r<   r  r!   r&   )rv   r9  
resolutionaspect_ratior  r;  r<  r=  resolution_embaspect_ratio_embr>  s              r.   r   z1PixArtAlphaCombinedTimestepSizeEmbeddings.forward   s   11..~/@/@|/@/T/TUU) 	)!;;J<N<N<P<PQQTTUabbN!55nEEMMjZ\]]N#==l>R>R>T>TUUXXYeff#99:JKKSST^`bcc(59nFV5W]^+_+_+__LL(Lr0   F)r   r   r   r   r   rh   r   r   r   s   @r.   r  r    sj         i it i i i i i i      r0   r  c                   *     e Zd ZdZd fd	Zd Z xZS )rC  z
    Projects caption embeddings. Also handles dropout for classifier-free guidance.

    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
    N	gelu_tanhc                    t                                                       ||}t          j        ||d          | _        |dk    rt          j        d          | _        nK|dk    rt          j                    | _        n,|dk    rt                      | _        nt          d|           t          j        ||d          | _
        d S )	NT)rv  rw  rb   r  tanh)approximater   ru  zUnknown activation function: )rg   rh   r   r   r   GELUact_1r  r   rM   r   )rv   rv  r   rw  r   rz   s        r.   rh   z"PixArtAlphaTextProjection.__init__  s    &L	kZ^___[  V444DJJvDJJ{""!DJJEVEEFFF	k[_```r0   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r  r   )rv   captionr  s      r.   r   z!PixArtAlphaTextProjection.forward&  s;    g..

=11m44r0   )Nr  )r   r   r   r   rh   r   r   r   s   @r.   rC  rC    s\         a a a a a a      r0   rC  c                   D     e Zd Z	 	 	 	 ddedededed	d
f
 fdZd Z xZS )!IPAdapterPlusImageProjectionBlockr]   r  r2   r   
embed_dimsdim_headheads	ffn_ratior   Nc           
      \   t                                                       ddlm} t	          j        |          | _        t	          j        |          | _        t          |||d          | _	        t	          j
        t	          j        |           |||d|d                    | _        d S )Nr
   r  F)	query_dimr  r  out_biasr!  )r#  r"  rb   )rg   rh   r$  r   r   rn   ln0ln1r   attnr  r%  )rv   r  r  r  r  r   rz   s         r.   rh   z*IPAdapterPlusImageProjectionBlock.__init__.  s     	******<
++<
++ 	
 
 
	 -L$$K
Jf9[`aaa
 
r0   c                     |                      |          }|                     |          }t          j        ||gd          }|                     ||          |z   }|                     |          |z   }|S )Nr   r   )r  r   r!   r&   r  r%  )rv   r   latentsresidualr~  s        r.   r   z)IPAdapterPlusImageProjectionBlock.forwardE  ss     $((7## %	+@'*JPR S S S))G%:;;hF'''""W,r0   )r]   r  r2   r   )r   r   r   r8   r%   rh   r   r   r   s   @r.   r  r  -  s         
 

 
 	

 
 

 
 
 
 
 
.      r0   r  c                   ~     e Zd ZdZ	 	 	 	 	 	 	 	 dd	ed
edededededededdf fdZdej        dej        fdZ	 xZ
S )IPAdapterPlusImageProjectiona  Resampler of IP-Adapter Plus.

    Args:
        embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
        that is the same
            number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
        hidden_dims (int):
            The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
        to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
        Defaults to 16. num_queries (int):
            The number of queries. Defaults to 8. ffn_ratio (float): The expansion ratio
        of feedforward network hidden
            layer channels. Defaults to 4.
    r]   r     r   r  r2   rs  r  output_dimshidden_dimsdepthr  r  num_queriesr  r   Nc	                    t                                                       t          j        t	          j        d|          dz  z            | _        t          j        |          | _        t          j        |          | _	        t          j
        |          | _        t          j        fdt          |          D                       | _        d S )Nr
   re   c                 4    g | ]}t                    S rh  r  ).0r   r  r  r  r	  s     r.   
<listcomp>z9IPAdapterPlusImageProjection.__init__.<locals>.<listcomp>r  s)    nnn\].{HeYWWnnnr0   )rg   rh   r   r   r!   r   r  r   proj_inproj_outrn   norm_out
ModuleListrangelayers)
rv   r  r  r	  r
  r  r  r  r  rz   s
      ` `` `r.   rh   z%IPAdapterPlusImageProjection.__init__^  s     	|EK;$L$L{\_O_$_``y[99	+{;;[11mnnnnnnnafglamamnnn
 
r0   r   c                    | j                             |                    d          dd          }|                     |          }| j        D ]}|} ||||          }|                     |          }|                     |          S )zForward pass.

        Args:
            x (torch.Tensor): Input Tensor.
        Returns:
            torch.Tensor: Output Tensor.
        r   r
   )r  repeatr  r  r  r  r  )rv   r   r  blockr  s        r.   r   z$IPAdapterPlusImageProjection.forwardu  s     ,%%affQiiA66LLOO[ 	2 	2EHeAw11GG--((}}W%%%r0   )r]   r  r  r   r  r2   rs  r   r   r   r   r   r8   r%   rh   r!   r  r   r   r   s   @r.   r  r  N  s         " 
 

 
 	

 
 
 
 
 
 

 
 
 
 
 
.& &%, & & & & & & & &r0   r  c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 dd
edededededededededededdf fdZdej        dej        fdZ	 xZ
S )"IPAdapterFaceIDPlusImageProjectiona  FacePerceiverResampler of IP-Adapter Plus.

    Args:
        embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
        that is the same
            number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
        hidden_dims (int):
            The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
        to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
        Defaults to 16. num_tokens (int): Number of tokens num_queries (int): The number of queries. Defaults to 8.
        ffn_ratio (float): The expansion ratio of feedforward network hidden
            layer channels. Defaults to 4.
        ffproj_ratio (float): The expansion ratio of feedforward network hidden
            layer channels (for ID embeddings). Defaults to 4.
    r]   r  r  r   r  r2   rs  r   r  r  r	  id_embeddings_dimr
  r  r  r+  r  r  ffproj_ratior   Nc                   
 t                                                       ddlm} || _        | _        d | _        d| _        d| _         |||z  d|          | _	        t          j                  | _        t          j        |          | _        t          j        |          | _        t          j        |          | _        t          j        
fdt%          |          D                       | _        d S )Nr
   r  Fr1   r!  )r#  r"  c                 4    g | ]}t                    S rh  r  )r  r   r  r  r  r  s     r.   r  z?IPAdapterFaceIDPlusImageProjection.__init__.<locals>.<listcomp>  s)    mmm[\.z8UIVVmmmr0   )rg   rh   r$  r   r+  r@   clip_embedsshortcutshortcut_scalerm   r   rn   ro   r   r  r  r  r  r  r  )rv   r  r  r	  r  r
  r  r  r+  r  r  r  r   rz   s    `    ``  `  r.   rh   z+IPAdapterFaceIDPlusImageProjection.__init__  s    	******$#!K 1:
3JZ`gsttt	L,,	yj99	*k::[11mmmmmmmm`efk`l`lmmm
 
r0   	id_embedsc                    |                     | j        j                  }|                     |          }|                    d| j        | j                  }|                     |          }|}|                     | j                  }|                    d|j	        d         |j	        d                   }| j
        D ]}|} ||||          }|                     |          }|                     |          }| j        r|| j        |z  z   }|S )zForward pass.

        Args:
            id_embeds (torch.Tensor): Input Tensor (ID embeds).
        Returns:
            torch.Tensor: Output Tensor.
        r   r   r\   )r   r!  r   rm   r<   r+  r@   ro   r  r   r  r  r  r"  r#  )rv   r$  r  r!  r   r  r  rV   s           r.   r   z*IPAdapterFaceIDPlusImageProjection.forward  s    LL!1!788	IIi((	%%b$/4>JJ	IIi((	ll4#344K$5a$8+:KA:NOO[ 	2 	2EHeAw11GG--((mmG$$= 	8d1C77C
r0   )r]   r]   r  r  r   r  r2   r   rs  r   r   r  r   s   @r.   r  r    s        $ !$!
 !
!
 !
 	!

 !
 !
 !
 !
 !
 !
 !
 !
 
!
 !
 !
 !
 !
 !
F %,        r0   r  c                        e Zd Zdeeej                 eej                 f         f fdZdee	j
                 fdZ xZS )MultiIPAdapterImageProjectionIPAdapterImageProjectionLayersc                 z    t                                                       t          j        |          | _        d S r   )rg   rh   r   r  image_projection_layers)rv   r(  rz   s     r.   rh   z&MultiIPAdapterImageProjection.__init__  s2    ')}5S'T'T$$$r0   r  c                    g }t          |t                    s+d}t          dd|d           |                    d          g}t	          |          t	          | j                  k    r4t          dt	          |           dt	          | j                             t          || j                  D ]\  }}|j        d	         |j        d         }}|	                    ||z  f|j        d
d          z             } ||          }|	                    ||f|j        dd          z             }|
                    |           |S )NzYou have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release. Please make sure to update your script to pass `image_embeds` as a list of tensors to supress this warning.zimage_embeds not a listz1.0.0F)standard_warnr
   zGimage_embeds must have the same length as image_projection_layers, got z and r   r   )r7   listr	   ru   r   r*  rM   zipr   r<   append)rv   r  projected_image_embedsdeprecation_messageimage_embedimage_projection_layerr  
num_imagess           r.   r   z%MultiIPAdapterImageProjection.forward  s   !#
 ,-- 	7   /:M]bcccc(221556L|D$@ A AAA VZ]^jZkZk  V  Vruvz  wS  sT  sT  V  V   47|TEa3b3b 	7 	7/K/%0%6q%9;;LQ;O
J%--zJ/F.H;K\]^]_]_K`.`aaK00==K%--z:.FIZ[\[][]I^.^__K"))+6666%%r0   )r   r   r   r   r   r   Moduler   rh   r!   r  r   r   r   s   @r.   r'  r'    s        UuT")_eTVT]N^=^7_ U U U U U U&D$6 & & & & & & & &r0   r'  )Fr
   r
   r   )Fr   r1   r2   )Tr  )r   F)>r   typingr   r   r   r   numpyr9   r!   torch.nn.functionalr   r)   rk  utilsr	   activationsr   r   attention_processorr   r  r8   r   r%   r/   rJ   r=   rN   r5  rZ   r   r   ndarrayr   r   r   r   r   r   r   r   r  r  r  r)  r.  rA  rJ  rp  r  r  r  r  r  r  r  r  rC  r  r  r  r'  rh  r0   r.   <module>r=     s    / / / / / / / / / / / /                           1 1 1 1 1 1 1 1 * * * * * * ""#( (|(( (  	(
 ( ( ( ( (X _a   ,	 	 	  *i5 i5 i5 i5 i5 i5 i5 i5X   8      5S+A %    B|U\5#667 5<%&   <- - - - -	 - - -`    	   "    	   <    BI   6> > > > >	 > > >B! ! ! ! !RY ! ! !HB B B B B") B B B6    bi   .	0 	0 	0 	0 	029 	0 	0 	0    RY        bi   &       &% % % % %bi % % %P' ' ' ' 'BI ' ' 'T    	    4 4 4 4 4RY 4 4 4$
! 
! 
! 
! 
! 
! 
! 
!' ' ' ' 'RY ' ' '>/ / / / /ry / / /d  *Z Z Z Z Zbi Z Z Zz" " " " "	 " " "J    	   :    	   B8& 8& 8& 8& 8&29 8& 8& 8&vM M M M M M M M` &  &  &  &  &BI  &  &  &  &  &r0   