Subset worker refactor (#287)

* ClippingSubsampler rewrite and bug fixes * More refactoring of ClippingSubsampler, plus a fix to _get_clip_intervals * Finished refactoring ClippingSubsampler * Final code changes * Added docstrings * Passed tests and linting * Made type annotations consistent with Python 3.8 * More annotation fixes * The Python 3.8 annotation needs a lot of hand-holding, it seems * Pylint has to cut it out, I swear to God * No real change, just relaunching unit tests which failed due to connection timeouts * Linting issue * Another linting issue * Separated per-shard code from code that should only be executed once * Pulled ShardStatus parameters into their own data type * Cleaned up shard processing error handling * Cleaned up code * Bug fixes * Formatting * Fixed linting issues * Fixing more damn linting * Added a missing docstring * Removed git worktree folder (ugh) --------- Co-authored-by: iejMac <[email protected]> Co-authored-by: Romain Beaumont <[email protected]>
iejMac · Jan 25, 2024 · ab510d0 · ab510d0
1 parent e7a4591
commit ab510d0
Show file tree

Hide file tree

Showing 4 changed files with 174 additions and 161 deletions.
diff --git a/video2dataset/main.py b/video2dataset/main.py
@@ -9,25 +9,26 @@
 from typing import List, Optional, Any
 import numpy as np  # pylint: disable=unused-import
 
-from .logger import LoggerProcess
-from .data_writer import (
+from video2dataset.logger import LoggerProcess
+from video2dataset.data_writer import (
     WebDatasetSampleWriter,
     FilesSampleWriter,
     ParquetSampleWriter,
     TFRecordSampleWriter,
     DummySampleWriter,
 )
-from .input_sharder import InputSharder
-from .output_sharder import OutputSharder
-from .distributor import (
+from video2dataset.input_sharder import InputSharder
+from video2dataset.output_sharder import OutputSharder
+from video2dataset.distributor import (
     no_distributor,
     multiprocessing_distributor,
     pyspark_distributor,
     SlurmDistributor,
     SlurmShardSampler,
 )
-from .workers import DownloadWorker, SubsetWorker, OpticalFlowWorker, CaptionWorker, WhisperWorker
-from .configs import CONFIGS
+from video2dataset.workers import DownloadWorker, SubsetWorker, OpticalFlowWorker, CaptionWorker, WhisperWorker
+from video2dataset.configs import CONFIGS
+from video2dataset.types import EncodeFormats
 
 
 def identity(x):
@@ -42,7 +43,7 @@ def video2dataset(
     output_folder: str = "dataset",
     output_format: str = "files",
     input_format: str = "csv",
-    encode_formats: Optional[dict] = None,
+    encode_formats: Optional[EncodeFormats] = None,
     stage: str = "download",
     url_col: str = "url",
     caption_col: Optional[str] = None,

diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py
@@ -2,30 +2,21 @@
 clipping subsampler turns full videos into clips of videos according to clip_col
 """
 from collections.abc import Iterable
-from typing import Any, Union, List, Tuple, Dict, TypedDict, Literal, cast
 import copy
+import datetime
 import ffmpeg
 import glob
 import os
 import tempfile
+from typing import Any, Union, List, Tuple, Dict, Literal, cast
 
-import datetime
-from .subsampler import Subsampler
+from video2dataset.subsamplers.subsampler import Subsampler
+from video2dataset.types import EncodeFormats, Streams
 
 
 ClipSpan = List[float]  # [start, end]
 
 
-class EncodeFormats(TypedDict):
-    video: str
-    audio: str
-
-
-class Streams(TypedDict):
-    video: List[bytes]
-    audio: List[bytes]
-
-
 def _get_seconds(t: Union[str, float]) -> float:
     """Converts time to seconds"""
     if not isinstance(t, str):

diff --git a/video2dataset/types.py b/video2dataset/types.py
@@ -0,0 +1,12 @@
+"""Type definitions for video2dataset."""
+from typing import List, TypedDict
+
+
+class EncodeFormats(TypedDict, total=False):
+    video: str
+    audio: str
+
+
+class Streams(TypedDict, total=False):
+    video: List[bytes]
+    audio: List[bytes]