-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStatistics.py
109 lines (92 loc) · 4.01 KB
/
Statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from Select import Select
import os
from tqdm import tqdm
from Common import Common
class Statistics:
    """ Computes summary statistics from the dataset's CSV metadata files

    File names are resolved through a ``Common`` instance and the CSV files are read with the
    ``Common`` and ``Select`` helpers; every statistic walks the train, validation, and test subsets.
    """

    # Subsets read by every statistic, in processing order.
    _SUBSETS = ("train", "validation", "test")

    def __init__(self, image_level=None):
        """ Builds the object

        Parameters
        ----------
        image_level : bool
            Whether using image-level dataset, set to False if using bounding boxes, by default is True
        """
        self.image_level = True if image_level is None else image_level
        self.common = Common(self.image_level)

    def get_class_counts(self, root_dir, human_readable=None):
        """ Returns the counts of each class for the training, validation, and test sets

        Parameters
        ----------
        root_dir : str
            Root directory containing csv files and new folder
        human_readable : bool
            Whether the dictionary should use the class labels as keys or human descriptions, default is True

        Returns
        -------
        dict of str -> dict of str -> int
            Top level dictionary is keyed by labels or human descriptions, second level is train, validation, or test,
            the value of the second level dictionary is the counts for that labels subset. A subset key is only
            present for a class when at least one row of that subset carries the class.
        """
        human_readable = True if human_readable is None else human_readable
        class_counts = {}
        for subset in self._SUBSETS:
            print("Loading CSVs for {}".format(subset))
            image_labels_file = self.common.get_image_labels_file(subset)
            image_labels_path = os.path.join(root_dir, image_labels_file)
            rows = Common.load_csv_as_dict(image_labels_path)
            for row in tqdm(rows):
                subset_counts = class_counts.setdefault(row["LabelName"], {})
                subset_counts[subset] = subset_counts.get(subset, 0) + 1
        if human_readable:
            label_to_human_label = Select.class_names_to_human_names(
                os.path.join(root_dir, self.common.get_classes_description_file()))
            class_counts = self._relabel_counts(class_counts, label_to_human_label)
        return class_counts

    @staticmethod
    def _relabel_counts(class_counts, label_to_human_label):
        """ Re-keys per-class counts from class labels to human descriptions

        Builds a new dictionary instead of renaming keys in place, so that:
        labels listed in the mapping but absent from the counts are ignored (no KeyError),
        labels missing from the mapping keep their original key, and
        labels sharing one human description have their counts summed rather than overwritten.

        Parameters
        ----------
        class_counts : dict of str -> dict of str -> int
            Counts keyed by class label, then by subset
        label_to_human_label : dict of str -> str
            Mapping from class label to human description

        Returns
        -------
        dict of str -> dict of str -> int
            New dictionary keyed by human description where one is known; the inputs are not modified
        """
        relabelled = {}
        for label, subset_counts in class_counts.items():
            human_label = label_to_human_label.get(label, label)
            merged = relabelled.setdefault(human_label, {})
            for subset, count in subset_counts.items():
                merged[subset] = merged.get(subset, 0) + count
        return relabelled

    def number_of_images(self, root_dir, per_subset=None):
        """ Returns the count of all images

        Parameters
        ----------
        root_dir : str
            Root directory containing csv files and new folder
        per_subset : bool
            Whether to give stats per subset or for the whole dataset, by default the whole dataset

        Returns
        -------
        int or dict of str -> int
            Count of all images when per_subset is False, otherwise a dictionary keyed by
            train, validation, and test holding the count for each subset
        """
        per_subset = False if per_subset is None else per_subset
        counts = {}
        for subset in self._SUBSETS:
            print("Loading CSVs for {}".format(subset))
            image_ids_file = self.common.get_image_ids_file(subset)
            image_ids_path = os.path.join(root_dir, image_ids_file)
            counts[subset] = len(Select.get_image_names(image_ids_path))
        if per_subset:
            return counts
        return sum(counts.values())

    def download_space_required(self, root_dir):
        """ Returns the space required in bytes to download all training, validation, and test files

        Parameters
        ----------
        root_dir : str
            Root directory containing csv files and new folder

        Returns
        -------
        int
            Bytes required to download all training, validation, and test files
        """
        size = 0
        for subset in self._SUBSETS:
            image_ids_file = self.common.get_image_ids_file(subset)
            print("Reading {}".format(subset))
            rows = Common.load_csv_as_dict(os.path.join(root_dir, image_ids_file))
            for row in tqdm(rows):
                # Every row is expected to carry an integer OriginalSize value; int() raises otherwise.
                size += int(row["OriginalSize"])
        return size