
    &Vjid5                        d dl Z d dlmZmZmZ d dlZd dlmZ d dl	mc m
c mZ d dlmc mZ d dlmZmZmZmZ d dl	mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
l m!Z! d dl"m#Z#m$Z$m%Z%mZ& d dl'm(Z(m)Z) dgZ*de$de+ej,        ej,        f         fdZ-de$de.de+ej,        ej,        f         fdZ/de$de+ej,        ej,        f         fdZ0de$de.defdZ1de$dej2        defdZ3de$dej2        fdZ4dej5        dej6        de.dej5        fdZ7dej6        de.de.de.dej2        dej6        fd Z8dej6        de.d!e#de$fd"Z9dej6        de+ej6        e:e         f         fd#Z;de$d$ee#         dej6        fd%Z< G d& de          Z=dS )'    N)AnycastOptional)ShardShardedTensorShardedTensorMetadataTensorProperties)ShardMetadata)ChunkShardingSpec)_mesh_resources)_set_fsdp_flattened)FSDPExtensions)_create_chunk_sharded_tensor)_remote_device)
DeviceMeshDTensor	Replicater   )_flatten_tensor_unflatten_tensorDTensorExtensionstensorreturnc                    | j         }|j        dk    s
J d            | j        d         }dgt          |                                           z  }|                    d          }| j        d                                         r7t          t          |          j        }|                     |          |z  }|||<   t          j
        |          | j                                        fS )N   &Only 1D DeviceMeshes currently handledr   )mesh_dim)device_meshndim
placementslensizeis_shardr   DSharddimtorchSize_local_tensor)r   r   	placementoffsets
num_chunks	shard_dim
chunk_sizes          `/root/voice-cloning/.venv/lib/python3.11/site-packages/torch/distributed/tensor/parallel/fsdp.py_get_boxr.       s    $Kq   "J   !!$IcC&&&G!!1!--J$$&& (++/	[[++z9
'	Jw!5!:!:!<!<==    idxc                 l    t          |           \  }}t          j        fd|D                       |fS )Nc                     g | ]}|z  S  r3   ).0valr0   s     r-   
<listcomp>z _get_box_for.<locals>.<listcomp>2   s    555cc	555r/   )r.   r%   r&   )r   r0   r)   r!   s    `  r-   _get_box_forr7   0   s>    V$$MGTJ5555W55566==r/   c                 l    | j         }|                                }|J t          | |d                   S )Nr   )r   get_coordinater7   )r   r   coords      r-   _get_local_boxr;   5   s;    $K&&((Ea)))r/   dtcurrent_rankc                     | j         }|j        dk    s
J d            t          |           \  }}t          t	          |          t	          |          d| d| j        j                   S )Nr   r   rank:/shard_offsetsshard_sizesr(   )r   r   r;   r
   listr'   device)r<   r=   meshr)   sizess        r-   _create_shard_md_from_dtrH   <   sr    >D9>>>C>>>#B''NGU7mmKKB,BB)9)@BB   r/   dt_pgc                 ,   g }t          j        |          }|dk    rdnd}| j        d                                         r|                                }nd}t          |          D ]l}t          | |          \  }}|                    t          t          |          t          |          d|dk    r|n| d| j
        j                              mt          ||                                 t          | j        | j        | j                            S )Nr   r   r?   r@   rA   )dtypelayoutrequires_grad)shards_metadatar!   tensor_properties)distget_rankr   r"   r!   ranger7   appendr
   rD   r'   rE   r   r	   rK   rL   rM   )	r<   rI   	shards_mdmy_rankscapegoat_rankshard_countir)   rG   s	            r-   !_create_sharded_tensor_md_from_dtrY   H   s1    ImE""G!A++QQ1N	}Q  "" jjll; 

 

%b!,,"7mm KK\a!eeNN\\2CSCZ\\	  	
 	
 	
 	
 !!WWYY*(9*
 
 
	 	 	 	r/   c                 b    | j         }|j        dk    s
J d            |                                S )Nr   r   )r   r   	get_group)r<   rF   s     r-   
_get_dt_pgr\   o   s/    >D9>>>C>>>>>r/   specrankc                     t          | t                    s| S d}| j        D ]P}t          t          |          }|                                |k    r!|                                |j        k    rd} nQ|rt          j        |           } t          | j                  D ]q\  }}t          t          |          }|                                |k    r?|                                |j        k    r"t	          d| d|j                   | j        |<   r| S )z
    Rewrite ``spec`` to match the device of ``tensor``.

    FSDP.sharded_optim_state_dict sneakly ships optimizer state to CPU so if the original ShardingSpec
    produces CUDA metadata, ST construction bombs.
    FTr?   r@   )

isinstancer   r   r   r   r^   rE   copydeepcopy	enumerate)r]   r   r^   rewriteprX   r(   s          r-   _rewrite_spec_if_neededrf   u   s    d-..  G_  ##6688t

fm ; ;GE T}T""%do66 	T 	TLAy^Y77I~~4''I,<,<,>,>&-,O,O%34RD4R4R6=4R4R%S%S"Kr/   
world_sizenum_devices_per_nodepgc           	         t          |           t          u rt          |                                           dk    sJ |                                 }t          |||||          }|                                 d         }t          |t          j        |j	                            g}t          j        | 	                                          }	d|	j
        _        t          j        ||	| j        d          }
|
S t          |           t          u r| j        }|j        dk    s
J d            | j        }t          |||t$          j                                        |          }t+          |           }t          |t-          | t/          j        |                              g}t3          | |          }	d|	j
        _        t          j        ||	|d          }
|
S t          | ||||          S )Nr   r   F)sharded_tensor_metadataprocess_group
init_rrefsr   )typer   r    local_shardslocal_tensorr   r   ra   rb   metadatarO   rM   +_init_from_local_shards_and_global_metadata_process_groupr   r   r   r'   r%   acceleratordevice_countr\   rH   rP   rQ   rY   )r   r^   rg   rh   ri   inner_paraminner_stouter_local_shardshardsst_metast_outerr   rI   s                r-   _chunk_tensorr|      s    F||}$$6&&(())Q....))++/ 
 
 #//11!4(DM*;*DEEFF
 - 1 12227!/ L$+ /	
 
 
 	f	 	 (1$$$&N$$$*/**,,
 
 6"" (4VT]5=Q=QRRSS
 4FEBB27!/ L$+	
 
 
 + 
 
 	
r/   r   c                    t          j        |          }|t          d          |j        dk     rt          d|j         dd          |                                                                 } t          | t          j                  rt          | t                    s{d t          |j                  D             }d t          |j                  D             }t          d	          |d	<   t          j        | ||d
                              ||          S | j        }|d	         }|                                 } d t          |j                  D             }||d<   d t          |j                  D             }t          d	          |d<   ||d<   t          j        | ||d
                              ||          S )z
    Shard a tensor to chunks along the first dimension.

    The local rank will gets its corresponding chunk as the local tensor to create a DTensor.
    Nz4No parent device_mesh is found for FSDP device_mesh.   z!Found parent device_mesh of ndim=,zbut meshes must be at least 2D.c                 *    g | ]}t                      S r3   r   r4   _s     r-   r6   z"_chunk_dtensor.<locals>.<listcomp>       KKK	KKKr/   c                 *    g | ]}t                      S r3   r   r   s     r-   r6   z"_chunk_dtensor.<locals>.<listcomp>       GGGAIKKGGGr/   r   F)	run_checkr   r   c                 *    g | ]}t                      S r3   r   r   s     r-   r6   z"_chunk_dtensor.<locals>.<listcomp>  r   r/   c                 *    g | ]}t                      S r3   r   )r4   rX   s     r-   r6   z"_chunk_dtensor.<locals>.<listcomp>  r   r/   )r   get_root_meshRuntimeErrorr   detachcloner`   r%   Tensorr   rR   r#   
from_localredistributer   to_local)r   r^   r   	root_meshreplicate_placementsshard_placementstp_placementstp_placements           r-   _chunk_dtensorr      s     -k::IQRRR~A	AAA-
 
 	
 ]]__""$$F
 &%,'' %

670K0K %
  LKU9>5J5JKKKGGy~1F1FGGG$Qii!I3u
 
 

,!'  
 
	
 )$Q'""  LKU9>5J5JKKK#/R GGy~1F1FGGG%ayy+!I3u
 
 

,!'  
 
	
r/   c                 0   t          t          |                                           }t          |          dk    rDt	          |d         j                  t          u r#|d         j        }|                                }|} | t          |          dk    r|ng fS )Nr   r   )r   r   ro   r    rn   r   )r   ry   inner_tensors      r-   _pre_load_state_dictr     s     -((5577F
6{{aD!122mCCay'**,,c&kkAooFF266r/   parent_meshc                 0   || j         k    sJ t          t          j        | j                            }t          dt          |          dz
            D ]}t                      ||<   |                     | j         |          } | 	                                S )zGAll gather a DTensor in its FSDP dimension and return the local tensor.r   r   r   )
r   rD   ra   rb   r   rR   r    r   r   r   )r   r   r   rX   s       r-   _all_gather_dtensorr   )  s    
 &,,,,,dmF$56677J 1c*oo)** $ $!
1  & !  F
 ??r/   c                   ~    e Zd ZdZd fdZdej        deej        ee	         f         fdZ
dej        de	dej        fdZ	 ddej        d	ed
ededej        deej                 dej        fdZdej        d	ededej        fdZdej        deej        ee         f         fdZdedee         dej        fdZ xZS )r   z
    DTensorExtension is the TensorFlattener extension needed for 2D FSDP + TP.

    This is the implementation for FSDPExtensions defined in
    https://github.com/pytorch/pytorch/blob/main/torch/distributed/fsdp/_fsdp_extensions.py
    r   Nc                     t                                                       d | _        || _        t          j                            | j                  | _        d S N)super__init__compute_streamdevice_handler%   _dynamodisablepost_unflatten_transform)selfr   	__class__s     r-   r   zDTensorExtensions.__init__E  sQ    "* ).(=(=))
 )
%%%r/   r   c                      t          |          S r   )r   r   r   s     r-   pre_flatten_transformz'DTensorExtensions.pre_flatten_transformO  s     v&&&r/   param_extensionc                    | j         p| j                                        }| j                            |          5  t	          ||| j        | j                   }t          |           |cd d d            S # 1 swxY w Y   d S )N)r   r   )r   r   current_streamstreamr   r   )r   r   r   r   results        r-   r   z*DTensorExtensions.post_unflatten_transformU  s     $K(:(I(I(K(K&&v.. 	 	 '"0#2	  F  '''	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   .A66A:=A:r^   rg   rh   ri   rE   c                 (    t          |||||          S r   )r|   )r   r   r^   rg   rh   ri   rE   s          r-   chunk_tensorzDTensorExtensions.chunk_tensorh  s     VT:7KRPPPr/   r   c                 $    t          |||          S r   )r   )r   r   r^   r   s       r-   chunk_dtensorzDTensorExtensions.chunk_dtensors  s     fdK888r/   c                      t          |          S r   )r   r   s     r-   pre_load_state_dict_transformz/DTensorExtensions.pre_load_state_dict_transform{  s     $F+++r/   r   c                 "    t          ||          S r   )r   )r   r   r   s      r-   all_gather_dtensorz$DTensorExtensions.all_gather_dtensor  s    
 #6;777r/   )r   Nr   )__name__
__module____qualname____doc__r   r%   r   tupler   r   r   r   intrP   ProcessGrouprE   r   r   r   rD   r   r   r   r   __classcell__)r   s   @r-   r   r   =  s        
 
 
 
 
 
'' 
u|Xc]*	+' ' ' 'l58	   4 *.	Q 	Q	Q 	Q 		Q
 "	Q 	Q &	Q 
	Q 	Q 	Q 	Q99 9  	9
 
9 9 9 9,, 
u|T%[(	), , , ,88 j)8 
	8 8 8 8 8 8 8 8r/   )>ra   typingr   r   r   r%   torch.distributeddistributedrP   &torch.distributed._shard.sharding_spec_shardsharding_spec
shard_spec"torch.distributed.distributed_c10ddistributed_c10dc10d'torch.distributed._shard.sharded_tensorr   r   r   r	   r
   :torch.distributed._shard.sharding_spec.chunk_sharding_specr   torch.distributed.device_meshr   $torch.distributed.fsdp._common_utilsr   'torch.distributed.fsdp._fsdp_extensionsr   #torch.distributed.fsdp._shard_utilsr   torch.distributed.remote_devicer   torch.distributed.tensorr   r   r   r#   6torch.distributed.tensor.parallel._data_parallel_utilsr   r   __all__r   r&   r.   r   r7   r;   rH   r   rY   r\   ShardingSpecr   rf   r|   r   rD   r   r   r   r3   r/   r-   <module>r      s,    & & & & & & & & & &              ; ; ; ; ; ; ; ; ; ; ; ; 1 1 1 1 1 1 1 1 1            A @ @ @ @ @ X X X X X X 9 9 9 9 9 9 D D D D D D B B B B B B L L L L L L : : : : : : T T T T T T T T T T T T        
>W >uz5:'=!> > > > > > >s >uUZ5K/L > > > >
*7 *uUZ-C'D * * * *	 	 	 	 	 	 	$$)$$ $ $ $N7 t0    

!+0<?B   :G
LG

G
 G
 	G

 	G
 \G
 G
 G
 G
T>
L>

>
 >
 	>
 >
 >
 >
B	7L	7
5<e$%	7 	7 	7 	7*% \   (I8 I8 I8 I8 I8 I8 I8 I8 I8 I8r/   