-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoco_handler.py
88 lines (70 loc) · 3.26 KB
/
coco_handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from pathlib import Path
import requests
import zipfile
from tqdm import tqdm
import shutil
class COCOHandler:
    """Download a COCO 2017 image archive and keep a subset in a directory.

    The handler owns a single target directory. ``setup_dataset`` fills it
    with ``*.jpg`` files taken from the official ``val2017`` or ``test2017``
    archive, unless the directory already contains something.
    """

    def __init__(self, target_dir: Path):
        """Remember the target directory and make sure it exists.

        Args:
            target_dir: Directory that will hold the images. A plain string
                is accepted as well and converted to ``Path``.
        """
        self.target_dir = Path(target_dir)
        # parents=True so a nested, not-yet-existing path does not raise.
        self.target_dir.mkdir(parents=True, exist_ok=True)

    def download_file(self, url: str, dest_path: Path):
        """Stream ``url`` into ``dest_path`` while showing a progress bar.

        Args:
            url: Address of the file to download.
            dest_path: Local file that receives the payload (overwritten).

        Raises:
            requests.HTTPError: The server answered with an error status.
            requests.RequestException: Connection problems or timeouts.
        """
        dest_path = Path(dest_path)
        # The context manager releases the connection even if writing fails;
        # the timeout bounds connect time and the wait between two chunks.
        with requests.get(url, stream=True, timeout=30) as response:
            # Fail early instead of saving an HTML error page as the archive.
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))
            with open(dest_path, "wb") as file, tqdm(
                desc=dest_path.name,
                total=total_size,
                unit="iB",
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                for data in response.iter_content(chunk_size=64 * 1024):
                    pbar.update(file.write(data))

    def _count_images(self) -> int:
        """Return the number of ``*.jpg`` files in the target directory."""
        return sum(1 for _ in self.target_dir.glob("*.jpg"))

    def setup_dataset(self, dataset_type: str = "val", num_images: int = 5000):
        """Populate the target directory with COCO images if it is empty.

        Args:
            dataset_type: ``"test"`` selects ``test2017``; any other value
                selects ``val2017``.
            num_images: Maximum number of images to keep. Negative values
                are treated as 0.

        Returns:
            The number of ``*.jpg`` files in the target directory afterwards.
        """
        if self.target_dir.exists() and any(self.target_dir.iterdir()):
            print("Skipping COCO download.")
            return self._count_images()

        # Scratch space in the current working directory; removed in finally.
        temp_dir = Path("temp_coco")
        temp_dir.mkdir(exist_ok=True)
        try:
            if dataset_type == "test":
                image_url = "http://images.cocodataset.org/zips/test2017.zip"
                subfolder = "test2017"
            else:
                image_url = "http://images.cocodataset.org/zips/val2017.zip"
                subfolder = "val2017"

            image_zip = temp_dir / f"{subfolder}.zip"
            print(f"Downloading COCO {dataset_type} images...")
            self.download_file(image_url, image_zip)

            print("Extracting files...")
            with zipfile.ZipFile(image_zip, "r") as zip_ref:
                zip_ref.extractall(temp_dir)

            dataset_dir = temp_dir / subfolder
            if dataset_dir.exists():
                print(f"Moving {num_images} images to {self.target_dir}...")
                # glob() order is unspecified; sort so the same subset is
                # chosen on every run and on every platform.
                image_files = sorted(dataset_dir.glob("*.jpg"))[: max(num_images, 0)]
                # Every selected file must survive the space-freeing step
                # below, including those in batches not copied yet.
                selected = set(image_files)

                for file in self.target_dir.glob("*.jpg"):
                    file.unlink()

                chunk_size = 1000
                for i in range(0, len(image_files), chunk_size):
                    chunk = image_files[i : i + chunk_size]
                    for img_file in tqdm(
                        chunk, desc=f"Moving images batch {i//chunk_size + 1}"
                    ):
                        destination = self.target_dir / img_file.name
                        try:
                            shutil.copy2(img_file, destination)
                        except OSError as e:
                            print(f"Error copying {img_file}: {e}")
                            # Free disk space by dropping extracted images
                            # that were not selected, then retry once; a
                            # second failure propagates to the caller.
                            for temp_file in dataset_dir.glob("*.jpg"):
                                if temp_file not in selected:
                                    temp_file.unlink()
                            shutil.copy2(img_file, destination)
                print(f"Successfully moved {len(image_files)} images")
            else:
                print(f"Expected folder {dataset_dir} not found after extraction.")
        finally:
            if temp_dir.exists():
                shutil.rmtree(temp_dir)

        print("COCO dataset preparation complete!")
        return self._count_images()
def prepare_coco(target_dir: Path, num_images: int = 5000, dataset_type: str = "val"):
    """Convenience wrapper: build a ``COCOHandler`` and run its dataset setup.

    Args:
        target_dir: Directory handed to ``COCOHandler``.
        num_images: Forwarded to ``setup_dataset`` as ``num_images``.
        dataset_type: Forwarded to ``setup_dataset`` as ``dataset_type``.

    Returns:
        Whatever ``COCOHandler.setup_dataset`` returns.
    """
    return COCOHandler(target_dir).setup_dataset(
        dataset_type=dataset_type,
        num_images=num_images,
    )