Skip to content

Commit

Permalink
Subset worker refactor (#287)
Browse files Browse the repository at this point in the history
* ClippingSubsampler rewrite and bug fixes

* More refactoring of ClippingSubsampler, plus a fix to _get_clip_intervals

* Finished refactoring ClippingSubsampler

* Final code changes

* Added docstrings

* Passed tests and linting

* Made type annotations consistent with Python 3.8

* More annotation fixes

* The Python 3.8 annotation needs a lot of hand-holding, it seems

* Pylint has to cut it out, I swear to God

* No real change, just relaunching unit tests which failed due to connection timeouts

* Linting issue

* Another linting issue

* Separated per-shard code from code that should only be executed once

* Pulled ShardStatus parameters into their own data type

* Cleaned up shard processing error handling

* Cleaned up code

* Bug fixes

* Formatting

* Fixed linting issues

* Fixing more damn linting

* Added a missing docstring

* Removed git worktree folder (ugh)

---------

Co-authored-by: iejMac <[email protected]>
Co-authored-by: Romain Beaumont <[email protected]>
  • Loading branch information
3 people authored Jan 25, 2024
1 parent e7a4591 commit ab510d0
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 161 deletions.
17 changes: 9 additions & 8 deletions video2dataset/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,26 @@
from typing import List, Optional, Any
import numpy as np # pylint: disable=unused-import

from .logger import LoggerProcess
from .data_writer import (
from video2dataset.logger import LoggerProcess
from video2dataset.data_writer import (
WebDatasetSampleWriter,
FilesSampleWriter,
ParquetSampleWriter,
TFRecordSampleWriter,
DummySampleWriter,
)
from .input_sharder import InputSharder
from .output_sharder import OutputSharder
from .distributor import (
from video2dataset.input_sharder import InputSharder
from video2dataset.output_sharder import OutputSharder
from video2dataset.distributor import (
no_distributor,
multiprocessing_distributor,
pyspark_distributor,
SlurmDistributor,
SlurmShardSampler,
)
from .workers import DownloadWorker, SubsetWorker, OpticalFlowWorker, CaptionWorker, WhisperWorker
from .configs import CONFIGS
from video2dataset.workers import DownloadWorker, SubsetWorker, OpticalFlowWorker, CaptionWorker, WhisperWorker
from video2dataset.configs import CONFIGS
from video2dataset.types import EncodeFormats


def identity(x):
Expand All @@ -42,7 +43,7 @@ def video2dataset(
output_folder: str = "dataset",
output_format: str = "files",
input_format: str = "csv",
encode_formats: Optional[dict] = None,
encode_formats: Optional[EncodeFormats] = None,
stage: str = "download",
url_col: str = "url",
caption_col: Optional[str] = None,
Expand Down
17 changes: 4 additions & 13 deletions video2dataset/subsamplers/clipping_subsampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,21 @@
clipping subsampler turns full videos into clips of videos according to clip_col
"""
from collections.abc import Iterable
from typing import Any, Union, List, Tuple, Dict, TypedDict, Literal, cast
import copy
import datetime
import ffmpeg
import glob
import os
import tempfile
from typing import Any, Union, List, Tuple, Dict, Literal, cast

import datetime
from .subsampler import Subsampler
from video2dataset.subsamplers.subsampler import Subsampler
from video2dataset.types import EncodeFormats, Streams


# Type alias for one clip interval: a list of floats laid out as [start, end].
# NOTE(review): the units are presumably seconds (cf. _get_seconds) — confirm.
ClipSpan = List[float] # [start, end]


# Typed dict with one string entry per modality, keyed "video" and "audio".
# Declared with TypedDict's default totality, so type checkers treat both
# keys as required.
# NOTE(review): each value presumably names the target encoding for that
# modality — confirm against the code that consumes encode_formats.
class EncodeFormats(TypedDict):
video: str
audio: str


# Typed dict holding, per modality ("video" / "audio"), a list of bytes objects.
# Declared with TypedDict's default totality, so type checkers treat both
# keys as required.
# NOTE(review): each bytes item is presumably one encoded media stream —
# confirm against the subsamplers that produce and consume it.
class Streams(TypedDict):
video: List[bytes]
audio: List[bytes]


def _get_seconds(t: Union[str, float]) -> float:
"""Converts time to seconds"""
if not isinstance(t, str):
Expand Down
12 changes: 12 additions & 0 deletions video2dataset/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Type definitions for video2dataset."""
from typing import List, TypedDict


# Typed dict with one string entry per modality, keyed "video" and "audio".
# total=False makes every key optional for type checkers, so a dict carrying
# only "video" or only "audio" (or neither) is still a valid EncodeFormats.
# NOTE(review): each value presumably names the target encoding for that
# modality — confirm against the code that consumes encode_formats.
class EncodeFormats(TypedDict, total=False):
video: str
audio: str


# Typed dict holding, per modality ("video" / "audio"), a list of bytes objects.
# total=False makes every key optional for type checkers, so a Streams value
# may carry only one modality.
# NOTE(review): each bytes item is presumably one encoded media stream —
# confirm against the subsamplers that produce and consume it.
class Streams(TypedDict, total=False):
video: List[bytes]
audio: List[bytes]
Loading

0 comments on commit ab510d0

Please sign in to comment.