
Video models (take 2) #890

Draft · wants to merge 62 commits into main from video-models

Commits (62, all changes shown):
75877d1
Add video models + functions
dreadatour Jan 13, 2025
031b9df
Code review update
dreadatour Jan 14, 2025
548bbd5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 14, 2025
b55149a
Code review update
dreadatour Jan 14, 2025
2cd6d62
Code review update
dreadatour Jan 15, 2025
5892ab9
Small fixes due to work on usage examples
dreadatour Jan 15, 2025
f3dc66a
Examples fixes
dreadatour Jan 20, 2025
65529f3
docs(merge): add examples with Func object (#811)
shcheklein Jan 13, 2025
b044082
fix(tqdm): import tqdm to support jupyter (#812)
shcheklein Jan 13, 2025
2a77047
[pre-commit.ci] pre-commit autoupdate (#815)
pre-commit-ci[bot] Jan 13, 2025
89ee2f0
progress: remove unused logging/tqdm lock (#817)
skshetry Jan 14, 2025
5f522ad
build(deps): bump ultralytics from 8.3.58 to 8.3.61 (#816)
dependabot[bot] Jan 14, 2025
e2f5a3a
Review help/usage for cli commands (#802)
amritghimire Jan 15, 2025
67beb9f
file: raise error (#820)
skshetry Jan 15, 2025
60c5848
README - mistral fix (#821)
dmpetrov Jan 16, 2025
d3b1619
file: support exporting files as a symlink (#819)
skshetry Jan 16, 2025
e31210c
prefetching: remove prefetched item after use in udf (#818)
skshetry Jan 16, 2025
bcd95b1
ReferenceFileSystem: use fs.open instead of fs._open (#823)
skshetry Jan 16, 2025
08edd27
Second iteration of cli command help (#826)
amritghimire Jan 18, 2025
dbefa5f
Fix list of tuples. Closes #827 (#828)
dmpetrov Jan 19, 2025
258454e
Added full outer join (#822)
ilongin Jan 20, 2025
328c1a7
memoize usearch.sqlite_path() (#833)
skshetry Jan 20, 2025
a1a47b2
Added `isnone()` function (#801)
ilongin Jan 20, 2025
5b2f45b
tests: reduce pytorch functional tests' runtime (#834)
skshetry Jan 20, 2025
14caa08
improve runtime of diff unit tests (#831)
mattseddon Jan 20, 2025
746fd73
move functional tests out of unit test suite (#832)
mattseddon Jan 20, 2025
0fe47dd
import Int into test_datachain_merge (fix tests broken on bad merge) …
mattseddon Jan 20, 2025
1598c4c
[pre-commit.ci] pre-commit autoupdate (#836)
pre-commit-ci[bot] Jan 20, 2025
0c3f3b4
build(deps): bump ultralytics from 8.3.61 to 8.3.64 (#839)
dependabot[bot] Jan 21, 2025
bf824af
build(deps): bump mkdocs-material from 9.5.22 to 9.5.50 (#838)
dependabot[bot] Jan 21, 2025
428d865
Revert "build(deps): bump mkdocs-material from 9.5.22 to 9.5.50 (#838…
yathomasi Jan 21, 2025
b7549b1
Add CSV parsing options (#813)
skirdey Jan 21, 2025
8639246
e2e tests: limit name_len_slow to 3, split e2e tests from other tests…
skshetry Jan 21, 2025
3376449
ci: switch trigger from `pull_request_target` to `pull_request` (#843)
skshetry Jan 21, 2025
5b2e437
rename DataChainCache to Cache (#847)
skshetry Jan 21, 2025
213b1d8
feat: add apollo integration, drop reo.dev (#835)
yathomasi Jan 22, 2025
43389f7
append e2e tests coverage instead of overwriting (#851)
mattseddon Jan 22, 2025
5a20c4e
drop unstructured examples (#854)
mattseddon Jan 24, 2025
b72c440
add upload classmethod to File (#850)
mattseddon Jan 24, 2025
55cd044
drop .edatachain support (#853)
skshetry Jan 24, 2025
69a4385
pull _is_file checks to get_listing (#846)
skshetry Jan 24, 2025
7859e16
use posixpath in upload methods (#855)
mattseddon Jan 24, 2025
3f47d12
Handle permission error properly when checking for file (#856)
amritghimire Jan 27, 2025
17118d1
catch (HfHub)HTTPError in hf-dataset-llm-eval example (#848)
mattseddon Jan 27, 2025
cc05da9
Code review updates
dreadatour Jan 27, 2025
8d9f6c2
Merge branch 'main' into video-models
dreadatour Jan 27, 2025
23514f7
Update video requirements
dreadatour Jan 28, 2025
8a8dd64
Code review updates
dreadatour Jan 28, 2025
1a04dd0
Merge branch 'main' into video-models
dreadatour Jan 28, 2025
0c95c3d
Merge branch 'main' into video-models
dreadatour Jan 29, 2025
e55405d
Code review updates + tests
dreadatour Jan 29, 2025
8e2a673
Set up ffmpeg in tests
dreadatour Jan 29, 2025
9c910ec
Set up ffmpeg in tests
dreadatour Jan 29, 2025
a2b8c9a
Set up ffmpeg in tests
dreadatour Jan 29, 2025
63448d9
Update 'ensure_cached' test
dreadatour Jan 29, 2025
abe39f5
Revert 'ensure_cached' test
dreadatour Jan 29, 2025
3b7b829
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 29, 2025
55f0478
Fix tests
dreadatour Jan 30, 2025
99b9490
Fix tests
dreadatour Jan 30, 2025
c28cd66
Update video models
dreadatour Jan 30, 2025
4098e8b
Merge branch 'main' into video-models
dreadatour Feb 3, 2025
0f2e12c
Update video models
dreadatour Feb 3, 2025
3 changes: 3 additions & 0 deletions .github/workflows/tests-studio.yml
@@ -75,6 +75,9 @@ jobs:
path: './backend/datachain'
fetch-depth: 0

- name: Set up FFmpeg
uses: AnimMouse/setup-ffmpeg@v1

- name: Set up Python ${{ matrix.pyv }}
uses: actions/setup-python@v5
with:
3 changes: 3 additions & 0 deletions .github/workflows/tests.yml
@@ -78,6 +78,9 @@ jobs:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha || github.ref }}

- name: Set up FFmpeg
uses: AnimMouse/setup-ffmpeg@v1

- name: Set up Python ${{ matrix.pyv }}
uses: actions/setup-python@v5
with:
10 changes: 9 additions & 1 deletion pyproject.toml
@@ -79,8 +79,16 @@ hf = [
"numba>=0.60.0",
"datasets[audio,vision]>=2.21.0"
]
video = [
# Use 'av<14' because of incompatibility with imageio
# See https://github.com/PyAV-Org/PyAV/discussions/1700
"av<14",
"ffmpeg-python",
"imageio[ffmpeg]",
"opencv-python"
]
tests = [
"datachain[torch,remote,vector,hf]",
"datachain[torch,remote,vector,hf,video]",
"pytest>=8,<9",
"pytest-sugar>=0.9.6",
"pytest-cov>=4.1.0",
10 changes: 10 additions & 0 deletions src/datachain/__init__.py
@@ -4,9 +4,14 @@
ArrowRow,
File,
FileError,
Image,
ImageFile,
TarVFile,
TextFile,
Video,
VideoFile,
VideoFragment,
VideoFrame,
)
from datachain.lib.model_store import ModelStore
from datachain.lib.udf import Aggregator, Generator, Mapper
@@ -27,13 +32,18 @@
"File",
"FileError",
"Generator",
"Image",
"ImageFile",
"Mapper",
"ModelStore",
"Session",
"Sys",
"TarVFile",
"TextFile",
"Video",
"VideoFile",
"VideoFragment",
"VideoFrame",
"is_chain_type",
"metrics",
"param",
205 changes: 201 additions & 4 deletions src/datachain/lib/file.py
@@ -17,7 +17,7 @@
from urllib.request import url2pathname

from fsspec.callbacks import DEFAULT_CALLBACK, Callback
from PIL import Image
from PIL import Image as PilImage
from pydantic import Field, field_validator

from datachain.client.fileslice import FileSlice
@@ -27,6 +27,7 @@
from datachain.utils import TIME_ZERO

if TYPE_CHECKING:
from numpy import ndarray
from typing_extensions import Self

from datachain.catalog import Catalog
@@ -40,7 +41,7 @@
# how to create file path when exporting
ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]

FileType = Literal["binary", "text", "image"]
FileType = Literal["binary", "text", "image", "video"]


class VFileError(DataChainError):
@@ -193,7 +194,7 @@
@classmethod
def upload(
cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
) -> "File":
) -> "Self":
if catalog is None:
from datachain.catalog.loader import get_catalog

@@ -203,6 +204,8 @@

client = catalog.get_client(parent)
file = client.upload(data, name)
if not isinstance(file, cls):
file = cls(**file.model_dump())
file._set_stream(catalog)
return file
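The `upload` change above re-wraps the result into the calling subclass via `model_dump`, so `VideoFile.upload(...)` yields a `VideoFile` rather than a plain `File`. The pattern can be sketched standalone (hypothetical stand-in classes, not the DataChain API; plain dataclasses and `asdict` stand in for pydantic models and `model_dump`):

```python
from dataclasses import dataclass, asdict


@dataclass
class BaseFile:
    path: str
    size: int = 0

    @classmethod
    def upload(cls, data: bytes, path: str) -> "BaseFile":
        # Pretend a client always returns the base type...
        file = BaseFile(path=path, size=len(data))
        # ...then re-wrap into the calling subclass if needed, so the
        # classmethod honors the type it was invoked on.
        if not isinstance(file, cls):
            file = cls(**asdict(file))
        return file


@dataclass
class VideoLikeFile(BaseFile):
    pass


f = VideoLikeFile.upload(b"abc", "clip.mp4")
print(type(f).__name__, f.size)  # VideoLikeFile 3
```

The `isinstance` guard keeps the common case (called on the base class) allocation-free while still promoting the result for subclasses.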

@@ -486,13 +489,205 @@
def read(self):
"""Returns `PIL.Image.Image` object."""
fobj = super().read()
return Image.open(BytesIO(fobj))
return PilImage.open(BytesIO(fobj))

def save(self, destination: str):
"""Writes its content to destination."""
self.read().save(destination)


class Image(DataModel):
"""`DataModel` for image file meta information."""

width: int = Field(default=-1)
height: int = Field(default=-1)
format: str = Field(default="")


class VideoFile(File):
"""`DataModel` for reading video files."""

def get_info(self) -> "Video":
"""Returns video file information."""
from .video import video_info

return video_info(self)

def get_frame(self, frame: int) -> "VideoFrame":
Review thread:

Member: Minor, but should these be to_ methods to match the DataChain class?

Contributor (author): Looks reasonable 🤔 Although it is not a direct conversion ("to"), but rather getting a part of the file into another file. "Get frame from video" reads fine to me, but "video to frame" looks odd. What do you think? I don't have a strict opinion on this 🤔

"""
Returns VideoFrame model for a video frame.

Args:
frame (int): Frame number to read.

Returns:
VideoFrame: Video frame model.
"""
if frame < 0:
raise ValueError("frame must be a non-negative integer")

frame_file = VideoFrame(**self.model_dump(), frame=frame)
frame_file._set_stream(self._catalog)
return frame_file

def get_frames(
self,
start: int = 0,
end: Optional[int] = None,
step: int = 1,
) -> "Iterator[VideoFrame]":
"""
Returns VideoFrame models for video frames.

Args:
start (int): Frame number to start reading from (default: 0).
end (Optional[int]): Frame number to stop reading at, non-inclusive
(default: None, read until the end).
step (int): Step size for reading frames (default: 1).

Returns:
Iterator[VideoFrame]: Iterator of video frame models.

Note:
If end is not specified, the number of frames is taken from the video file.
"""
if start < 0:
raise ValueError("start must be a non-negative integer.")

if end is None:
end = self.get_info().frames

if end < 0:
raise ValueError("end must be a non-negative integer.")
if start > end:
raise ValueError("start must be less than or equal to end.")

if step < 1:
raise ValueError("step must be a positive integer.")

for frame in range(start, end, step):
yield self.get_frame(frame)
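The frame-range validation above boils down to a plain generator over `range(start, end, step)`. A minimal standalone sketch (hypothetical helper, not the library API; `total_frames` stands in for `self.get_info().frames`):

```python
def frame_numbers(start=0, end=None, step=1, total_frames=0):
    """Yield frame indices with the same validation as get_frames."""
    if start < 0:
        raise ValueError("start must be a non-negative integer.")
    if end is None:
        end = total_frames  # fall back to the video's frame count
    if end < 0:
        raise ValueError("end must be a non-negative integer.")
    if start > end:
        raise ValueError("start must be less than or equal to end.")
    if step < 1:
        raise ValueError("step must be a positive integer.")
    # end is non-inclusive, matching range() semantics
    yield from range(start, end, step)


print(list(frame_numbers(0, 10, 3)))  # [0, 3, 6, 9]
```

Because this is a generator, validation only runs once iteration starts, which mirrors how `get_frames` behaves as written.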

def get_fragment(self, start: float, end: float) -> "VideoFragment":
"""
Returns VideoFragment model for a video interval.

Args:
start (float): Start time in seconds.
end (float): End time in seconds.

Returns:
VideoFragment: Video fragment model.
"""
if start < 0 or end < 0 or start >= end:
raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")

fragment_file = VideoFragment(**self.model_dump(), start=start, end=end)
fragment_file._set_stream(self._catalog)
return fragment_file

def get_fragments(
self,
intervals: list[tuple[float, float]],
) -> "Iterator[VideoFragment]":
"""
Returns VideoFragment models for video intervals.

Args:
intervals (list[tuple[float, float]]): List of start and end times
in seconds.

Returns:
Iterator[VideoFragment]: Iterator of video fragment models.
"""
for start, end in intervals:
yield self.get_fragment(start, end)
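The time-range check in `get_fragment` rejects negative bounds and empty or inverted intervals. A standalone sketch of that validation applied over a list of intervals (hypothetical helper, not the library API):

```python
def validate_intervals(intervals):
    """Reject intervals the same way get_fragment does."""
    for start, end in intervals:
        # Bounds must be non-negative and the interval non-empty.
        if start < 0 or end < 0 or start >= end:
            raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
    return intervals


print(validate_intervals([(0.0, 1.5), (2.0, 3.0)]))  # [(0.0, 1.5), (2.0, 3.0)]
```

Note that `start >= end` also rules out zero-length fragments, so `(1.0, 1.0)` is invalid by design.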


class VideoFrame(VideoFile):
"""`DataModel` for reading video frames."""

frame: int = Field(default=-1)

def get_np(self) -> "ndarray":
"""
Reads video frame from a video file and returns as numpy array.

Returns:
ndarray: Video frame.
"""
from .video import video_frame_np

return video_frame_np(self)

def read_bytes(self, format: str = "jpg") -> bytes:
"""
Reads video frame from a video file and returns as image bytes.

Args:
format (str): Image format (default: 'jpg').

Returns:
bytes: Video frame image as bytes.
"""
from .video import video_frame_bytes

return video_frame_bytes(self, format)

def save(self, output: str, format: str = "jpg") -> "ImageFile":
"""
Saves video frame as a new image file. If output is a remote path,
the image file will be uploaded to the remote storage.

Args:
output (str): Output path, can be a local path or a remote path.
format (str): Image format (default: 'jpg').

Returns:
ImageFile: Image file model.
"""
from .video import save_video_frame

return save_video_frame(self, output, format)


class VideoFragment(VideoFile):
"""`DataModel` for reading video fragments."""

start: float = Field(default=-1.0)
end: float = Field(default=-1.0)

def save(self, output: str, format: Optional[str] = None) -> "VideoFile":
"""
Saves video interval as a new video file. If output is a remote path,
the video file will be uploaded to the remote storage.

Args:
output (str): Output path, can be a local path or a remote path.
format (Optional[str]): Output format (default: None). If not provided,
the format will be inferred from the video fragment
file extension.

Returns:
VideoFile: Video fragment model.
"""
from .video import save_video_fragment

return save_video_fragment(self, output, format)


class Video(DataModel):
"""`DataModel` for video file meta information."""

width: int = Field(default=-1)
height: int = Field(default=-1)
fps: float = Field(default=-1.0)
duration: float = Field(default=-1.0)
frames: int = Field(default=-1)
format: str = Field(default="")
codec: str = Field(default="")


class ArrowRow(DataModel):
"""`DataModel` for reading row from Arrow-supported file."""

@@ -528,5 +723,7 @@
file = TextFile
elif type_ == "image":
file = ImageFile # type: ignore[assignment]
elif type_ == "video":
file = VideoFile

Codecov / codecov/patch check warning on line 727 in src/datachain/lib/file.py: added line #L727 was not covered by tests.

return file
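The hunk above extends the `FileType` dispatch with a `video` branch. A self-contained sketch of the mapping (stand-in classes mirroring, not importing, the real ones):

```python
from typing import Literal

FileType = Literal["binary", "text", "image", "video"]


# Hypothetical stand-ins for the real File classes.
class File: ...
class TextFile(File): ...
class ImageFile(File): ...
class VideoFile(File): ...


def get_file_type(type_: FileType = "binary") -> type:
    # "binary" (the default) maps to the base File class.
    file: type = File
    if type_ == "text":
        file = TextFile
    elif type_ == "image":
        file = ImageFile
    elif type_ == "video":
        file = VideoFile
    return file


print(get_file_type("video").__name__)  # VideoFile
```

A dict lookup (`{"text": TextFile, ...}.get(type_, File)`) would be an equivalent design; the if/elif chain simply matches the diff's structure.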
2 changes: 1 addition & 1 deletion src/datachain/lib/hf.py
@@ -20,7 +20,7 @@

except ImportError as exc:
raise ImportError(
"Missing dependencies for huggingface datasets:\n"
"Missing dependencies for huggingface datasets.\n"
"To install run:\n\n"
" pip install 'datachain[hf]'\n"
) from exc
Empty file removed: src/datachain/lib/vfile.py