
    Vji&M                     6   d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	 d dl
Z
d dl
mZmZ d dlmZmZmZ d dlmZmZmZ  G d d          Zdd	d
edee         deej        j        eeeef         fdZde	eee j        e j         f         de!eeef         fdZ"dS )    N)Path)LiteralOptionalTupleUnion)deviceTensor)_coreFrame
FrameBatch)_get_cuda_backendcreate_decoderERROR_REPORTING_INSTRUCTIONSc                      e Zd ZdZddddddddeeeej        ej	        e
ef         dee         d	ed
         dedeeeef                  ded         deeee
ej        ej	        f                  fdZdefdZdedefdZdedefdZdeej        ef         defdZdee         fdZdedefdZdeej        ee         f         defdZd&dedededefdZd e defd!Z!d eej        ee          f         defd"Z"d#e d$e defd%Z#dS )'VideoDecodera  A single-stream video decoder.

    Args:
        source (str, ``Pathlib.path``, bytes, ``torch.Tensor`` or file-like object): The source of the video:

            - If ``str``: a local path or a URL to a video file.
            - If ``Pathlib.path``: a path to a local video file.
            - If ``bytes`` object or ``torch.Tensor``: the raw encoded video data.
            - If file-like object: we read video data from the object on demand. The object must
              expose the methods `read(self, size: int) -> bytes` and
              `seek(self, offset: int, whence: int) -> int`. Read more in:
              :ref:`sphx_glr_generated_examples_decoding_file_like.py`.
        stream_index (int, optional): Specifies which stream in the video to decode frames from.
            Note that this index is absolute across all media types. If left unspecified, then
            the :term:`best stream` is used.
        dimension_order(str, optional): The dimension order of the decoded frames.
            This can be either "NCHW" (default) or "NHWC", where N is the batch
            size, C is the number of channels, H is the height, and W is the
            width of the frames.

            .. note::

                Frames are natively decoded in NHWC format by the underlying
                FFmpeg implementation. Converting those into NCHW format is a
                cheap no-copy operation that allows these frames to be
                transformed using the `torchvision transforms
                <https://pytorch.org/vision/stable/transforms.html>`_.
        num_ffmpeg_threads (int, optional): The number of threads to use for decoding.
            Use 1 for single-threaded decoding which may be best if you are running multiple
            instances of ``VideoDecoder`` in parallel. Use a higher number for multi-threaded
            decoding which is best if you are running a single instance of ``VideoDecoder``.
            Passing 0 lets FFmpeg decide on the number of threads.
            Default: 1.
        device (str or torch.device, optional): The device to use for decoding.
            If ``None`` (default), uses the current default device.
            If you pass a CUDA device, we recommend trying the "beta" CUDA
            backend which is faster! See :func:`~torchcodec.decoders.set_cuda_backend`.
        seek_mode (str, optional): Determines if frame access will be "exact" or
            "approximate". Exact guarantees that requesting frame i will always
            return frame i, but doing so requires an initial :term:`scan` of the
            file. Approximate is faster as it avoids scanning the file, but less
            accurate as it uses the file's metadata to calculate where i
            probably is. Default: "exact".
            Read more about this parameter in:
            :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py`
        custom_frame_mappings (str, bytes, or file-like object, optional):
            Mapping of frames to their metadata, typically generated via ffprobe.
            This enables accurate frame seeking without requiring a full video scan.
            Do not set seek_mode when custom_frame_mappings is provided.
            Expected JSON format:

            .. code-block:: json

                {
                    "frames": [
                        {
                            "pts": 0,
                            "duration": 1001,
                            "key_frame": 1
                        }
                    ]
                }

            Alternative field names "pkt_pts" and "pkt_duration" are also supported.
            Read more about this parameter in:
            :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py`

    Attributes:
        metadata (VideoStreamMetadata): Metadata of the video stream.
        stream_index (int): The stream index that this decoder is retrieving frames from. If a
            stream index was provided at initialization, this is the same value. If it was left
            unspecified, this is the :term:`best stream`.
    NNCHW   exact)stream_indexdimension_ordernum_ffmpeg_threadsr   	seek_modecustom_frame_mappingssourcer   r   r   NHWCr   r   r   r   approximater   c          	         t           j                            d           d}||vr)t          d| dd                    |           d          ||dk    rt          d          d }	|d	}t          |          }	t          ||
          | _        t          | j        |          \  | _	        | _
        | _        | _        | _        d}
||
vr)t          d| dd                    |
           d          |t          d|d          |!t          t          j                              }n$t!          |t"                    rt          |          }t%                      }t'          j        | j        | j
        |||||	           d S )Nz torchcodec.decoders.VideoDecoderr   zInvalid seek mode (z). Supported values are z, .r   zcustom_frame_mappings is incompatible with seek_mode='approximate'. Use seek_mode='custom_frame_mappings' or leave it unspecified to automatically use custom frame mappings.r   )r   r   )decoderr   r   zInvalid dimension order (znum_ffmpeg_threads = z should be an int.)r   r   num_threadsr   device_variantr   )torch_C_log_api_usage_once
ValueErrorjoin_read_custom_frame_mappingsr   _decoder!_get_and_validate_stream_metadatametadatar   _begin_stream_seconds_end_stream_seconds_num_framesstrget_default_device
isinstancetorch_devicer   coreadd_video_stream)selfr   r   r   r   r   r   r   allowed_seek_modescustom_frame_mappings_dataallowed_dimension_ordersr#   s               \/root/voice-cloning/.venv/lib/python3.11/site-packages/torchcodec/decoders/_video_decoder.py__init__zVideoDecoder.__init__c   s    	$$%GHHH5...Ii I I(,		2D(E(EI I I   !,m1K1K|   &*" ,/I)D%* *& 'f	JJJ .M
 
 
	
M&$
 $4 ":::OO O O(,		2J(K(KO O O  
 %I 2IIIJJJ>13344FF-- 	![[F*,,M*+*)"<	
 	
 	
 	
 	
 	
    returnc                     | j         S N)r/   r6   s    r:   __len__zVideoDecoder.__len__   s    r<   keyc                 n    t          |t                    sJ t          j        | j        |          ^}}|S )Nframe_index)r2   intr4   get_frame_at_indexr*   )r6   rB   
frame_data_s       r:   _getitem_intzVideoDecoder._getitem_int   s9    #s#####0CPPP
Qr<   c                     t          |t                    sJ |                    t          |                     \  }}}t	          j        | j        |||          ^}}|S )Nstartstopstep)r2   sliceindiceslenr4   get_frames_in_ranger*   )r6   rB   rM   rN   rO   rH   rI   s          r:   _getitem_slicezVideoDecoder._getitem_slice   sg    #u%%%%%KKD		22tT1M	
 
 

Q r<   c                    t          |t          j                  r"|                     t	          |                    S t          |t
                    r|                     |          S t          dt          |           d          )a  Return frame or frames as tensors, at the given index or range.

        .. note::

            If you need to decode multiple frames, we recommend using the batch
            methods instead, since they are faster:
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at`, and
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range`.

        Args:
            key(int or slice): The index or range of frame(s) to retrieve.

        Returns:
            torch.Tensor: The frame or frames at the given index or range.
        zUnsupported key type: z$. Supported types are int and slice.)	r2   numbersIntegralrJ   rF   rP   rT   	TypeErrortype)r6   rB   s     r:   __getitem__zVideoDecoder.__getitem__   s    $ c7+,, 	,$$SXX...U## 	,&&s+++TT#YYTTT
 
 	
r<   c                 4    t          j        | j                  S r?   )r4   _get_key_frame_indicesr*   r@   s    r:   r\   z#VideoDecoder._get_key_frame_indices   s    *4=999r<   indexc                     t          j        | j        |          \  }}}t          ||                                |                                          S )a  Return a single frame at the given index.

        .. note::

            If you need to decode multiple frames, we recommend using the batch
            methods instead, since they are faster:
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range`.

        Args:
            index (int): The index of the frame to retrieve.

        Returns:
            Frame: The frame at the given index.
        rD   datapts_secondsduration_seconds)r4   rG   r*   r   item)r6   r]   r`   ra   rb   s        r:   get_frame_atzVideoDecoder.get_frame_at   sd    $ /3.EMu/
 /
 /
+k+ #((**-2244
 
 
 	
r<   rQ   c                 d    t          j        | j        |          \  }}}t          |||          S )zReturn frames at the given indices.

        Args:
            indices (torch.Tensor or list of int): The indices of the frames to retrieve.

        Returns:
            FrameBatch: The frames at the given indices.
        )frame_indicesr_   )r4   get_frames_at_indicesr*   r   )r6   rQ   r`   ra   rb   s        r:   get_frames_atzVideoDecoder.get_frames_at   sL     /3.HM/
 /
 /
+k+ #-
 
 
 	
r<   rM   rN   rO   c                     t          |||                              | j                  \  }}}t          j        | j        |||          }t          | S )a  Return multiple frames at the given index range.

        Frames are in [start, stop).

        Args:
            start (int): Index of the first frame to retrieve.
            stop (int): End of indexing range (exclusive, as per Python
                conventions).
            step (int, optional): Step size between frames. Default: 1.

        Returns:
            FrameBatch: The frames within the specified range.
        rL   )rP   rQ   r/   r4   rS   r*   r   )r6   rM   rN   rO   framess        r:   rS   z VideoDecoder.get_frames_in_range  s_     "%t44<<T=MNNtT)M	
 
 
 6""r<   secondsc           	      "   | j         |cxk    r| j        k     s%n t          d| d| j          d| j         d          t          j        | j        |          \  }}}t          ||                                |                                          S )a  Return a single frame played at the given timestamp in seconds.

        .. note::

            If you need to decode multiple frames, we recommend using the batch
            methods instead, since they are faster:
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range`.

        Args:
            seconds (float): The time stamp in seconds when the frame is played.

        Returns:
            Frame: The frame that is played at ``seconds``.
        zInvalid pts in seconds: &. It must be greater than or equal to z and less than r    r_   )r-   r.   
IndexErrorr4   get_frame_at_ptsr*   r   rc   r6   rk   r`   ra   rb   s        r:   get_frame_played_atz VideoDecoder.get_frame_played_at(  s    $ )WOOOOt7OOOOO=7 = =7;7Q= =!%!9= = =  
 /3.CM7/
 /
+k+ #((**-2244
 
 
 	
r<   c                 d    t          j        | j        |          \  }}}t          |||          S )a  Return frames played at the given timestamps in seconds.

        Args:
            seconds (torch.Tensor or list of float): The timestamps in seconds when the frames are played.

        Returns:
            FrameBatch: The frames that are played at ``seconds``.
        )
timestampsr_   )r4   get_frames_by_ptsr*   r   rp   s        r:   get_frames_played_atz!VideoDecoder.get_frames_played_atI  sL     /3.DMg/
 /
 /
+k+ #-
 
 
 	
r<   start_secondsstop_secondsc           	      H   ||k    st          d| d| d          | j        |cxk    r| j        k     s%n t          d| d| j         d| j         d          || j        k    st          d| d| j         d          t          j        | j        ||	          }t          | S )
a'  Returns multiple frames in the given range.

        Frames are in the half open range [start_seconds, stop_seconds). Each
        returned frame's :term:`pts`, in seconds, is inside of the half open
        range.

        Args:
            start_seconds (float): Time, in seconds, of the start of the
                range.
            stop_seconds (float): Time, in seconds, of the end of the
                range. As a half open range, the end is excluded.

        Returns:
            FrameBatch: The frames within the specified range.
        zInvalid start seconds: z1. It must be less than or equal to stop seconds (z).rm   z and less than or equal to r    zInvalid stop seconds: z#. It must be less than or equal to )rv   rw   )r'   r-   r.   r4   get_frames_by_pts_in_ranger*   r   )r6   rv   rw   rj   s       r:   get_frames_played_in_rangez'VideoDecoder.get_frames_played_in_range^  s.   $ ,,z-zzjvzzz   )]UUUUT=UUUUUI- I I7;7QI I-1-EI I I  
 t777P P P484LP P P   0M'%
 
 

 6""r<   )r   )$__name__
__module____qualname____doc__r   r0   r   io	RawIOBaseBufferedReaderbytesr	   r   rF   r   r3   r;   rA   rJ   rP   rT   rV   rW   rZ   listr\   r   rd   r$   r   rh   rS   floatrq   ru   rz    r<   r:   r   r      s       H H\ '+39"#595< I
 I
 I
c4r/@%OPI
 sm	I

 !0I
  I
 sL012I
 12I
  (#ublB,==> 
I
 I
 I
 I
V              
% 
F 
 
 
 

uW%5u%<= 
& 
 
 
 
6:S	 : : : :
# 
% 
 
 
 
6
U5<c+B%C 

 
 
 
 
(# # #C #s #: # # # #0
5 
U 
 
 
 
B
U\4;67
	
 
 
 
*&#"&#27&#	&# &# &# &# &# &#r<   r   )r   r!   r   r=   c                 .   t          j        |           }| |j        x}t          dt          z             |t          |j                  k    rt          d| d          |j        |         }t          |t           j        j	                  st          d| d          |j
        t          dt          z             |j
        }|j        t          dt          z             |j        }|j        t          dt          z             |j        }|||||fS )	NzCThe best video stream is unknown and there is no specified stream. zThe stream index z is not a valid stream.zThe stream at index z is not a video stream. z-The minimum pts value in seconds is unknown. z-The maximum pts value in seconds is unknown. z!The number of frames is unknown. )r4   get_container_metadatabest_video_stream_indexr'   r   rR   streamsr2   	_metadataVideoStreamMetadatabegin_stream_secondsend_stream_seconds
num_frames)r!   r   container_metadatar,   r   r   r   s          r:   r+   r+     sc    4W==.FFLOU./  
 s-56666R\RRRSSS!),7Hh BCC XVVVVWWW$,;*+
 
 	
 $8"*;*+
 
 	
 "4"/2NN
 
 	
 $J 	 r<   r   c                 v  	 	 t          | d          rt          j        |           nt          j        |           }n+# t          j        $ r}t          d| d          |d}~ww xY w|rd|vrt          d          |d         d         t          fdd	D             d          	t          fd
dD             d          dv }	rr|st          d          t          j        	fd|d         D             t          j	                  }t          j        d |d         D             t          j
                  }t          j        fd|d         D             t          j	                  }t          |          t          |          cxk    rt          |          k    sn t          d          |||fS )a  Parse custom frame mappings from JSON data and extract frame metadata.

    Args:
        custom_frame_mappings: JSON data containing frame metadata, provided as:
            - A JSON string (str, bytes)
            - A file-like object with a read() method

    Returns:
        A tuple of three tensors:
        - all_frames (Tensor): Presentation timestamps (PTS) for each frame
        - is_key_frame (Tensor): Boolean tensor indicating which frames are key frames
        - duration (Tensor): Duration of each frame
    readzInvalid custom frame mappings: z9. It should be a valid JSON string or a file-like object.Nrj   zWInvalid custom frame mappings. The input is empty or missing the required 'frames' key.r   c              3   $   K   | ]
}|v |V  d S r?   r   .0rB   first_frames     r:   	<genexpr>z._read_custom_frame_mappings.<locals>.<genexpr>  s-      LLC9K9KC9K9K9K9KLLr<   )ptspkt_ptsc              3   $   K   | ]
}|v |V  d S r?   r   r   s     r:   r   z._read_custom_frame_mappings.<locals>.<genexpr>  s-      KK{8J8J8J8J8J8JKKr<   )durationpkt_duration	key_framezInvalid custom frame mappings. The 'pts'/'pkt_pts', 'duration'/'pkt_duration', and 'key_frame' keys are required in the frame metadata.c                 :    g | ]}t          |                   S r   rF   )r   framepts_keys     r:   
<listcomp>z/_read_custom_frame_mappings.<locals>.<listcomp>  s%    ???U7^		???r<   )dtypec                 8    g | ]}t          |d                    S )r   r   )r   r   s     r:   r   z/_read_custom_frame_mappings.<locals>.<listcomp>  s%    CCCUU;	 	 CCCr<   c                 :    g | ]}t          |                   S r   r   )r   r   duration_keys     r:   r   z/_read_custom_frame_mappings.<locals>.<listcomp>  s&    DDDeU< 	!	!DDDr<   z&Mismatched lengths in frame index data)hasattrjsonloadloadsJSONDecodeErrorr'   nextr$   tensorint64boolrR   )
r   
input_dataekey_frame_present
all_framesis_key_framer   r   r   r   s
          @@@r:   r)   r)     sB    	 ,f553DI+,,,122 	

    jajjj
 
	
  
33e
 
 	
 X&q)KLLLL#5LLLdSSGKKKK4KKKT L ${2 
, 
.? 
 V
 
 	
 ????*X*>???u{  J <CCj.BCCC5:  L |DDDDz(/CDDDEK  H 
OOs<00AAAACMMAAAAABBB|X--s   8> A&A!!A&)#r   r   rV   pathlibr   typingr   r   r   r   r$   r   r3   r	   
torchcodecr
   r4   r   r   "torchcodec.decoders._decoder_utilsr   r   r   r   rF   r   r   r   r+   r0   r   r   r   tupler)   r   r<   r:   <module>r      s   
			         2 2 2 2 2 2 2 2 2 2 2 2  0 0 0 0 0 0 0 0 7 7 7 7 7 7 7 7 7 7         l# l# l# l# l# l# l# l#d #'0 0 00 3-0 4>-sE5#EF	0 0 0 0f7. eR\2;L!LM7.
666!"7. 7. 7. 7. 7. 7.r<   