From 2633c60d757688ab21f856f6e43ad38a46cb4daf Mon Sep 17 00:00:00 2001 From: Shankhanil Ghosh Date: Fri, 19 Mar 2021 22:08:19 +0530 Subject: [PATCH 01/17] created morphological preprocessing file --- klar_eda/preprocess/csv_preprocess.py | 2 +- klar_eda/preprocess/image_preprocess.py | 166 +++++++++--------- .../preprocess/image_preprocess/__init__.py | 3 + .../image_preprocess/morphological.py | 114 ++++++++++++ klar_eda/preprocessing.py | 23 +-- 5 files changed, 214 insertions(+), 94 deletions(-) create mode 100644 klar_eda/preprocess/image_preprocess/__init__.py create mode 100644 klar_eda/preprocess/image_preprocess/morphological.py diff --git a/klar_eda/preprocess/csv_preprocess.py b/klar_eda/preprocess/csv_preprocess.py index 43e9264..bbcd0f7 100644 --- a/klar_eda/preprocess/csv_preprocess.py +++ b/klar_eda/preprocess/csv_preprocess.py @@ -68,7 +68,7 @@ def fill_numerical_na(self, ret = False): self.df[col] = y except Exception as e: pass - if ret == True: + if ret == True: return self.df def fill_categorical_na(self, ret = False): diff --git a/klar_eda/preprocess/image_preprocess.py b/klar_eda/preprocess/image_preprocess.py index 8c4eeeb..50afaa9 100644 --- a/klar_eda/preprocess/image_preprocess.py +++ b/klar_eda/preprocess/image_preprocess.py @@ -96,93 +96,93 @@ def contrast_control(self, alpha = 1.25, beta = 0, save=True, show=False): print('Error while changing contast for image ',image_index, e) self.cv2_image_list = contrast_image_list - def thresholding(self, technique = 'mean', threshold = cv2.THRESH_BINARY, save=True, show=False): - binarized_image_list = [] - image_index = 0 - #study the parameters - for image in self.cv2_image_list: - try: - if technique == 'simple': - res , img = cv2.threshold(image, 120, 255, threshold) - binarized_image_list.append(img) - self.save_or_show_image(img,image_index,'threshold',save=save,show=show) - image_index += 1 - elif technique == 'mean': - img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, 
threshold, 199, 5) - binarized_image_list.append(img) - self.save_or_show_image(img,image_index,'threshold',save=save,show=show) - image_index += 1 - else: - img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, threshold, 199, 5) - binarized_image_list.append(img) - self.save_or_show_image(img,image_index,'threshold',save=save,show=show) - image_index += 1 - except Exception as e: - print('Error during binarization of image ', image_index, e) - self.cv2_image_list = binarized_image_list + # def thresholding(self, technique = 'mean', threshold = cv2.THRESH_BINARY, save=True, show=False): + # binarized_image_list = [] + # image_index = 0 + # #study the parameters + # for image in self.cv2_image_list: + # try: + # if technique == 'simple': + # res , img = cv2.threshold(image, 120, 255, threshold) + # binarized_image_list.append(img) + # self.save_or_show_image(img,image_index,'threshold',save=save,show=show) + # image_index += 1 + # elif technique == 'mean': + # img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, threshold, 199, 5) + # binarized_image_list.append(img) + # self.save_or_show_image(img,image_index,'threshold',save=save,show=show) + # image_index += 1 + # else: + # img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, threshold, 199, 5) + # binarized_image_list.append(img) + # self.save_or_show_image(img,image_index,'threshold',save=save,show=show) + # image_index += 1 + # except Exception as e: + # print('Error during binarization of image ', image_index, e) + # self.cv2_image_list = binarized_image_list - def denoise(self, is_gray = True, save=True, show=False): - denoised_image_list = [] - image_index = 0 - for image in self.cv2_image_list: - try: - if not is_gray: - img = cv2.fastNlMeansDenoisingColored(image,None,10,10,7,21) - else: - img = cv2.fastNlMeansDenoising(image,None,3,7,21) - denoised_image_list.append(img) - self.save_or_show_image(img,image_index,'denoise',save=save,show=show) - 
image_index += 1 - except Exception as e: - print('Error during denoising image ', image_index, e) - self.cv2_image_list = denoised_image_list + # def denoise(self, is_gray = True, save=True, show=False): + # denoised_image_list = [] + # image_index = 0 + # for image in self.cv2_image_list: + # try: + # if not is_gray: + # img = cv2.fastNlMeansDenoisingColored(image,None,10,10,7,21) + # else: + # img = cv2.fastNlMeansDenoising(image,None,3,7,21) + # denoised_image_list.append(img) + # self.save_or_show_image(img,image_index,'denoise',save=save,show=show) + # image_index += 1 + # except Exception as e: + # print('Error during denoising image ', image_index, e) + # self.cv2_image_list = denoised_image_list - def erode(self, dim = None, save=True, show=False): - eroded_image_list = [] - image_index = 0 - if dim == None: - dim = (2,2) - for image in self.cv2_image_list: - try: - kernel = np.ones(dim,np.uint8) - img = cv2.erode(image,kernel,iterations = 1) - self.save_or_show_image(img,image_index,'erode',save=save,show=show) - image_index += 1 - eroded_image_list.append(img) - except Exception as e: - print('Error during eroding image ', image_index, e) - self.cv2_image_list = eroded_image_list + # def erode(self, dim = None, save=True, show=False): + # eroded_image_list = [] + # image_index = 0 + # if dim == None: + # dim = (2,2) + # for image in self.cv2_image_list: + # try: + # kernel = np.ones(dim,np.uint8) + # img = cv2.erode(image,kernel,iterations = 1) + # self.save_or_show_image(img,image_index,'erode',save=save,show=show) + # image_index += 1 + # eroded_image_list.append(img) + # except Exception as e: + # print('Error during eroding image ', image_index, e) + # self.cv2_image_list = eroded_image_list - def dilation(self, dim = None, save=True, show=False): - dilated_image_list = [] - image_index = 0 - if dim == None: - dim = (2,2) - for image in self.cv2_image_list: - try: - kernel = np.ones(dim,np.uint8) - img = cv2.dilate(image,kernel,iterations = 1) - 
self.save_or_show_image(img,image_index,'dilation',save=save,show=show) - image_index += 1 - dilated_image_list.append(img) - except Exception as e: - print('Error while dilating image ', image_index, e) - self.cv2_image_list = dilated_image_list + # def dilation(self, dim = None, save=True, show=False): + # dilated_image_list = [] + # image_index = 0 + # if dim == None: + # dim = (2,2) + # for image in self.cv2_image_list: + # try: + # kernel = np.ones(dim,np.uint8) + # img = cv2.dilate(image,kernel,iterations = 1) + # self.save_or_show_image(img,image_index,'dilation',save=save,show=show) + # image_index += 1 + # dilated_image_list.append(img) + # except Exception as e: + # print('Error while dilating image ', image_index, e) + # self.cv2_image_list = dilated_image_list - def normalize(self, dim = None, save=True, show=False): - normalized_image_list = [] - image_index = 0 - if dim == None: - dim = (512,512) - for image in self.cv2_image_list: - try: - kernel = np.zeros(dim) - img = cv2.normalize(image,kernel,0,255,cv2.NORM_MINMAX) - normalized_image_list.append(img) - self.save_or_show_image(img,image_index,'normalize',save=save,show=show) - image_index += 1 - except Exception as e: - print('Error while normalizing image ', image_index, e) + # def normalize(self, dim = None, save=True, show=False): + # normalized_image_list = [] + # image_index = 0 + # if dim == None: + # dim = (512,512) + # for image in self.cv2_image_list: + # try: + # kernel = np.zeros(dim) + # img = cv2.normalize(image,kernel,0,255,cv2.NORM_MINMAX) + # normalized_image_list.append(img) + # self.save_or_show_image(img,image_index,'normalize',save=save,show=show) + # image_index += 1 + # except Exception as e: + # print('Error while normalizing image ', image_index, e) def print_variables(self): for img in self.cv2_image_list: diff --git a/klar_eda/preprocess/image_preprocess/__init__.py b/klar_eda/preprocess/image_preprocess/__init__.py new file mode 100644 index 0000000..f51efc3 --- 
/dev/null +++ b/klar_eda/preprocess/image_preprocess/__init__.py @@ -0,0 +1,3 @@ +from . import morphological +import pkg_resources +pkg_resources.declare_namespace(__name__) \ No newline at end of file diff --git a/klar_eda/preprocess/image_preprocess/morphological.py b/klar_eda/preprocess/image_preprocess/morphological.py new file mode 100644 index 0000000..569986e --- /dev/null +++ b/klar_eda/preprocess/image_preprocess/morphological.py @@ -0,0 +1,114 @@ +import os +from os import makedirs +from os.path import join, exists +import numpy as np +import cv2 +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +from ..image_preprocess import ImagePreprocess + +""" +This document contains the functions: +Thresholding, Denoise, Erode, Dilation, normalize +""" + +class MorphologicalPreprocess: + def __init__(self,input,labels = None): + self.suffixes = ('.jpeg', '.jpg', '.png') + self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)) + self.labels = labels + if type(input)==str: + self.path = input + self.image_list = sorted([ file for file in os.listdir(input) if (file.endswith(self.suffixes))]) + self.cv2_image_list = [ self.read_images(os.path.join(self.path,image_name)) for image_name in self.image_list ] + else: + self.path = None + self.image_list = None + self.cv2_image_list = input + def thresholding(self, technique = 'mean', threshold = cv2.THRESH_BINARY, save=True, show=False): + binarized_image_list = [] + image_index = 0 + #study the parameters + for image in self.cv2_image_list: + try: + if technique == 'simple': + res , img = cv2.threshold(image, 120, 255, threshold) + binarized_image_list.append(img) + ImagePreprocess.save_or_show_image(img,image_index,'threshold',save=save,show=show) + image_index += 1 + elif technique == 'mean': + img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, threshold, 199, 5) + binarized_image_list.append(img) + 
ImagePreprocess.save_or_show_image(img,image_index,'threshold',save=save,show=show) + image_index += 1 + else: + img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, threshold, 199, 5) + binarized_image_list.append(img) + ImagePreprocess.save_or_show_image(img,image_index,'threshold',save=save,show=show) + image_index += 1 + except Exception as e: + print('Error during binarization of image ', image_index, e) + self.cv2_image_list = binarized_image_list + + def denoise(self, is_gray = True, save=True, show=False): + denoised_image_list = [] + image_index = 0 + for image in self.cv2_image_list: + try: + if not is_gray: + img = cv2.fastNlMeansDenoisingColored(image,None,10,10,7,21) + else: + img = cv2.fastNlMeansDenoising(image,None,3,7,21) + denoised_image_list.append(img) + ImagePreprocess.save_or_show_image(img,image_index,'denoise',save=save,show=show) + image_index += 1 + except Exception as e: + print('Error during denoising image ', image_index, e) + self.cv2_image_list = denoised_image_list + + def erode(self, dim = None, save=True, show=False): + eroded_image_list = [] + image_index = 0 + if dim == None: + dim = (2,2) + for image in self.cv2_image_list: + try: + kernel = np.ones(dim,np.uint8) + img = cv2.erode(image,kernel,iterations = 1) + ImagePreprocess.save_or_show_image(img,image_index,'erode',save=save,show=show) + image_index += 1 + eroded_image_list.append(img) + except Exception as e: + print('Error during eroding image ', image_index, e) + self.cv2_image_list = eroded_image_list + + def dilation(self, dim = None, save=True, show=False): + dilated_image_list = [] + image_index = 0 + if dim == None: + dim = (2,2) + for image in self.cv2_image_list: + try: + kernel = np.ones(dim,np.uint8) + img = cv2.dilate(image,kernel,iterations = 1) + ImagePreprocess.save_or_show_image(img,image_index,'dilation',save=save,show=show) + image_index += 1 + dilated_image_list.append(img) + except Exception as e: + print('Error while dilating image ', 
image_index, e) + self.cv2_image_list = dilated_image_list + + def normalize(self, dim = None, save=True, show=False): + normalized_image_list = [] + image_index = 0 + if dim == None: + dim = (512,512) + for image in self.cv2_image_list: + try: + kernel = np.zeros(dim) + img = cv2.normalize(image,kernel,0,255,cv2.NORM_MINMAX) + normalized_image_list.append(img) + ImagePreprocess.save_or_show_image(img,image_index,'normalize',save=save,show=show) + image_index += 1 + except Exception as e: + print('Error while normalizing image ', image_index, e) \ No newline at end of file diff --git a/klar_eda/preprocessing.py b/klar_eda/preprocessing.py index de7e648..be75202 100644 --- a/klar_eda/preprocessing.py +++ b/klar_eda/preprocessing.py @@ -1,5 +1,6 @@ from .preprocess.csv_preprocess import CSVPreProcess from .preprocess.image_preprocess import ImagePreprocess +from .preprocess.image_preprocess.morphological import MorphologicalPreprocess def preprocess_csv(csv, target_column=None, index_column=None): """Preprocesses the csv file OR the dataframe, @@ -35,26 +36,28 @@ def preprocess_images(data_path, dataset_type='other',save=True,show=False): :type show: bool, optional """ preprocessor = ImagePreprocess(data_path) + morphPreprocessor = MorphologicalPreprocess(data_path) + preprocessor.resize_images(height = 512, width = 512) if dataset_type == 'ocr': - preprocessor.denoise(save=save,show=show) + morphPreprocessor.denoise(save=save,show=show) preprocessor.colorize(text = True,save=save,show=show) - preprocessor.thresholding(technique = 'gaussian' ,threshold = cv2.THRESH_BINARY,save=save,show=show) + morphPreprocessor.thresholding(technique = 'gaussian' ,threshold = cv2.THRESH_BINARY,save=save,show=show) elif dataset_type == 'face': preprocessor.detect_face_and_crop(crop=True,save=save,show=show) preprocessor.colorize(text = False,save=save,show=show) preprocessor.adaptive_histogram_equalization(save=save,show=show) - preprocessor.denoise(is_gray=True,save=save,show=show) 
- preprocessor.normalize(save=save,show=show) - preprocessor.erode(save=save,show=show) - preprocessor.dilation(save=save,show=show) + morphPreprocessor.denoise(is_gray=True,save=save,show=show) + morphPreprocessor.normalize(save=save,show=show) + morphPreprocessor.erode(save=save,show=show) + morphPreprocessor.dilation(save=save,show=show) preprocessor.contrast_control(save=save,show=show) else: preprocessor.colorize(text = False,save=save,show=show) preprocessor.adaptive_histogram_equalization(save=save,show=show) - preprocessor.normalize(save=save,show=show) - preprocessor.denoise(is_gray=True,save=save,show=show) - preprocessor.erode(save=save,show=show) - preprocessor.dilation(save=save,show=show) + morphPreprocessor.normalize(save=save,show=show) + morphPreprocessor.denoise(is_gray=True,save=save,show=show) + morphPreprocessor.erode(save=save,show=show) + morphPreprocessor.dilation(save=save,show=show) preprocessor.contrast_control(save=save,show=show) print('Image Preprocessing completed successfully!') \ No newline at end of file From c1c689585078528301a13586f09687d0b25bf9ef Mon Sep 17 00:00:00 2001 From: Shankhanil Ghosh Date: Sat, 20 Mar 2021 00:59:53 +0530 Subject: [PATCH 02/17] restructured codebase: intelligent image preprocessing --- klar_eda/preprocess/__init__.py | 2 + klar_eda/preprocess/image_preprocess.py | 63 ++++++++++--------- .../preprocess/image_preprocess/__init__.py | 1 + .../image_preprocess/intelligent.py | 56 +++++++++++++++++ .../image_preprocess/morphological.py | 13 ++-- klar_eda/preprocessing.py | 6 +- 6 files changed, 104 insertions(+), 37 deletions(-) create mode 100644 klar_eda/preprocess/image_preprocess/intelligent.py diff --git a/klar_eda/preprocess/__init__.py b/klar_eda/preprocess/__init__.py index 46af576..9de78be 100644 --- a/klar_eda/preprocess/__init__.py +++ b/klar_eda/preprocess/__init__.py @@ -2,5 +2,7 @@ from . import csv_preprocess from . import image_preprocess from . 
import preprocess +# To import morphological preprocessor +from .image_preprocess import morphological import pkg_resources pkg_resources.declare_namespace(__name__) diff --git a/klar_eda/preprocess/image_preprocess.py b/klar_eda/preprocess/image_preprocess.py index 50afaa9..8749ab2 100644 --- a/klar_eda/preprocess/image_preprocess.py +++ b/klar_eda/preprocess/image_preprocess.py @@ -96,6 +96,7 @@ def contrast_control(self, alpha = 1.25, beta = 0, save=True, show=False): print('Error while changing contast for image ',image_index, e) self.cv2_image_list = contrast_image_list + # ***************************CODE SEGMENT MOVED TO ./morphological.py*************************** # def thresholding(self, technique = 'mean', threshold = cv2.THRESH_BINARY, save=True, show=False): # binarized_image_list = [] # image_index = 0 @@ -183,7 +184,7 @@ def contrast_control(self, alpha = 1.25, beta = 0, save=True, show=False): # image_index += 1 # except Exception as e: # print('Error while normalizing image ', image_index, e) - + # ****************************************************************************************** def print_variables(self): for img in self.cv2_image_list: cv2.imshow('img',img) @@ -193,35 +194,37 @@ def get_cascade(self, cascade_type='face'): #if cascade_type == 'face': return cv2.CascadeClassifier('haarcascade_frontalface_default.xml') - def detect_face_and_crop(self, crop = False, save=True, show=False): - face_image_list = [] - image_index = -1 - face_cascade = self.get_cascade('face') - for image in self.cv2_image_list: - try: - image_index += 1 - img = image.copy() - faces = face_cascade.detectMultiScale(img, 1.3, 5) - if faces is None: - print('Unable to find face ') - continue - for (x,y,w,h) in faces: - padding = 10 - ih, iw = img.shape[:2] - lx = max( 0, x - padding ) - ly = max( 0, x - padding ) - ux = min( iw, x + w + padding ) - uy = min( ih, y + h + padding ) - img = cv2.rectangle(img,(lx,ly),(ux,uy),(255,0,0),2) - roi_color = img[y:y+h, x:x+w] - 
if crop == True: - self.save_or_show_image(roi_color, image_index, 'haarcascade_faces',save=save,show=show) - self.save_or_show_image(img, image_index, 'haarcascade',save=save,show=show) - face_image_list.append(img) - except Exception as e: - print('Error while detecing') - self.cv2_image_list = face_image_list - + # ***************************CODE SEGMENT MOVED TO ./intelligent.py*************************** + # def detect_face_and_crop(self, crop = False, save=True, show=False): + # face_image_list = [] + # image_index = -1 + # face_cascade = self.get_cascade('face') + # for image in self.cv2_image_list: + # try: + # image_index += 1 + # img = image.copy() + # faces = face_cascade.detectMultiScale(img, 1.3, 5) + # if faces is None: + # print('Unable to find face ') + # continue + # for (x,y,w,h) in faces: + # padding = 10 + # ih, iw = img.shape[:2] + # lx = max( 0, x - padding ) + # ly = max( 0, x - padding ) + # ux = min( iw, x + w + padding ) + # uy = min( ih, y + h + padding ) + # img = cv2.rectangle(img,(lx,ly),(ux,uy),(255,0,0),2) + # roi_color = img[y:y+h, x:x+w] + # if crop == True: + # self.save_or_show_image(roi_color, image_index, 'haarcascade_faces',save=save,show=show) + # self.save_or_show_image(img, image_index, 'haarcascade',save=save,show=show) + # face_image_list.append(img) + # except Exception as e: + # print('Error while detecing') + # self.cv2_image_list = face_image_list + # ****************************************************************************************** + def adaptive_histogram_equalization(self, save=True, show=False): refined_image_list = [] image_index = 0 diff --git a/klar_eda/preprocess/image_preprocess/__init__.py b/klar_eda/preprocess/image_preprocess/__init__.py index f51efc3..50fe2a7 100644 --- a/klar_eda/preprocess/image_preprocess/__init__.py +++ b/klar_eda/preprocess/image_preprocess/__init__.py @@ -1,3 +1,4 @@ from . import morphological +from . 
import intelligent import pkg_resources pkg_resources.declare_namespace(__name__) \ No newline at end of file diff --git a/klar_eda/preprocess/image_preprocess/intelligent.py b/klar_eda/preprocess/image_preprocess/intelligent.py new file mode 100644 index 0000000..77e46df --- /dev/null +++ b/klar_eda/preprocess/image_preprocess/intelligent.py @@ -0,0 +1,56 @@ +import os +from os import makedirs +from os.path import join, exists +import numpy as np +import cv2 +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +from ..image_preprocess import ImagePreprocess + +class IntelligentImagePreprocess: + """ + This class contains the functions: + + """ + def __init__(self,input,labels = None): + self.suffixes = ('.jpeg', '.jpg', '.png') + # self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)) + self.labels = labels + if type(input)==str: + self.path = input + self.image_list = sorted([ file for file in os.listdir(input) if (file.endswith(self.suffixes))]) + self.cv2_image_list = [ self.read_images(os.path.join(self.path,image_name)) for image_name in self.image_list ] + else: + self.path = None + self.image_list = None + self.cv2_image_list = input + + # the functions + def detect_face_and_crop(self, crop = False, save=True, show=False): + face_image_list = [] + image_index = -1 + face_cascade = self.get_cascade('face') + for image in self.cv2_image_list: + try: + image_index += 1 + img = image.copy() + faces = face_cascade.detectMultiScale(img, 1.3, 5) + if faces is None: + print('Unable to find face ') + continue + for (x,y,w,h) in faces: + padding = 10 + ih, iw = img.shape[:2] + lx = max( 0, x - padding ) + ly = max( 0, x - padding ) + ux = min( iw, x + w + padding ) + uy = min( ih, y + h + padding ) + img = cv2.rectangle(img,(lx,ly),(ux,uy),(255,0,0),2) + roi_color = img[y:y+h, x:x+w] + if crop == True: + self.save_or_show_image(roi_color, image_index, 'haarcascade_faces',save=save,show=show) + self.save_or_show_image(img, image_index, 
'haarcascade',save=save,show=show) + face_image_list.append(img) + except Exception as e: + print('Error while detecing') + self.cv2_image_list = face_image_list \ No newline at end of file diff --git a/klar_eda/preprocess/image_preprocess/morphological.py b/klar_eda/preprocess/image_preprocess/morphological.py index 569986e..676d8a6 100644 --- a/klar_eda/preprocess/image_preprocess/morphological.py +++ b/klar_eda/preprocess/image_preprocess/morphological.py @@ -7,15 +7,14 @@ import matplotlib.image as mpimg from ..image_preprocess import ImagePreprocess -""" -This document contains the functions: -Thresholding, Denoise, Erode, Dilation, normalize -""" - class MorphologicalPreprocess: + """ + This class contains the functions: + Thresholding, Denoise, Erode, Dilation, normalize + """ def __init__(self,input,labels = None): self.suffixes = ('.jpeg', '.jpg', '.png') - self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)) + # self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)) self.labels = labels if type(input)==str: self.path = input @@ -25,6 +24,8 @@ def __init__(self,input,labels = None): self.path = None self.image_list = None self.cv2_image_list = input + + # the functions def thresholding(self, technique = 'mean', threshold = cv2.THRESH_BINARY, save=True, show=False): binarized_image_list = [] image_index = 0 diff --git a/klar_eda/preprocessing.py b/klar_eda/preprocessing.py index be75202..bdb6af2 100644 --- a/klar_eda/preprocessing.py +++ b/klar_eda/preprocessing.py @@ -1,6 +1,8 @@ from .preprocess.csv_preprocess import CSVPreProcess from .preprocess.image_preprocess import ImagePreprocess +# importing the morph-preprocess and intelligent preprocess class from .preprocess.image_preprocess.morphological import MorphologicalPreprocess +from .preprocess.image_preprocess.intelligent import IntelligentImagePreprocess def preprocess_csv(csv, target_column=None, index_column=None): """Preprocesses the csv file OR the dataframe, @@ -36,7 +38,9 @@ 
def preprocess_images(data_path, dataset_type='other',save=True,show=False): :type show: bool, optional """ preprocessor = ImagePreprocess(data_path) + # creating a morphological preprocessor object morphPreprocessor = MorphologicalPreprocess(data_path) + inteligentPreprocessor = IntelligentImagePreprocess(data_path) preprocessor.resize_images(height = 512, width = 512) if dataset_type == 'ocr': @@ -44,7 +48,7 @@ def preprocess_images(data_path, dataset_type='other',save=True,show=False): preprocessor.colorize(text = True,save=save,show=show) morphPreprocessor.thresholding(technique = 'gaussian' ,threshold = cv2.THRESH_BINARY,save=save,show=show) elif dataset_type == 'face': - preprocessor.detect_face_and_crop(crop=True,save=save,show=show) + inteligentPreprocessor.detect_face_and_crop(crop=True,save=save,show=show) preprocessor.colorize(text = False,save=save,show=show) preprocessor.adaptive_histogram_equalization(save=save,show=show) morphPreprocessor.denoise(is_gray=True,save=save,show=show) From bfadcd6b594389451215414bdef633ebfa2315de Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Thu, 18 Mar 2021 23:41:18 +0530 Subject: [PATCH 03/17] Update image_visualize.py --- klar_eda/visualize/image_visualize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index d526739..70e29ba 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -24,7 +24,7 @@ ############################################################################ class ImageDataVisualize: - +# the first function is used intialize/get data in the form of images and labels.It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly.If it is not found an error is shown. 
def __init__(self, data, labels, boxes=None): self.images = data self.labels = labels From a8d2bf8645c3d0003a69af525941386584a59d8b Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Sat, 20 Mar 2021 14:09:12 +0530 Subject: [PATCH 04/17] Update image_visualize.py --- klar_eda/visualize/image_visualize.py | 35 +++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index 70e29ba..3528432 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -24,7 +24,12 @@ ############################################################################ class ImageDataVisualize: -# the first function is used intialize/get data in the form of images and labels.It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly.If it is not found an error is shown. +# the first function is used intialize/get data in the form of images and labels. +#It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly. +#If it is not found an error is shown.this is used for validating the images. +#The dataset is then prepared from the dataframe by giving all the appropriate tags to the respective dataframes.Image,height,width labels. +#Thea area is then calculated from the height and width and the number of imagesare then printed out. 
+ def __init__(self, data, labels, boxes=None): self.images = data self.labels = labels @@ -48,7 +53,13 @@ def __init__(self, data, labels, boxes=None): }) self.dataset['area'] = self.dataset['Height'] * self.dataset['Width'] print('Number of images after validation and filtering:', self.num_images) - +#This function is written for saving the file(of the program) for better access and smooth running. +#it looks for the directory where the file can be saved by applying the join function if actually it is saved. +# It looks for the directory of the file where it can be saved (by using the join function) and if such directory doen't exist a new directory has to be made by using the makedirs function. +# the x_label and y_label are also been given their respective titles. +# the title of the plot is being labelled by combining (formatting) the plot_type and file_name. +# the path is saved by the join fucntion which combines the directory and the file name (save_dir,file_name). +# the title is then displayed. def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False): if save: save_dir = join(VIZ_ROOT, plot_type) @@ -65,6 +76,10 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa plt.title("{}: {}".format(plot_type, file_name)) plt.show() plt.clf() +# this function is used to check about the data we obtained from the images that whether or not its in the desired format or not. +# if the image type isn't in an n dimensional array format then it is discarded else it is saved. +#if the image type is in ndimensional array format then its accepted or it is discarded. +#also if the dimensions of the image <=2,then again the images are skipped/discarded. 
def validate_images(self): for image, label in zip(self.images, self.labels): @@ -78,17 +93,24 @@ def validate_images(self): self.images.remove(image) self.labels.remove(label) continue - +# this function is used to define the aspect_ratio of the histogram plotted. +# the aspect ratrio of the histogram plotted is the ratio of its idth to the height(ratio=width/height) +# It is commonly used to describe the proportions of a rectangular screen. def aspect_ratio_histogram(self, save=True, show=False): aspect_ratios = self.dataset['Width'] / self.dataset['Height'] plot = sns.histplot(aspect_ratios, bins='auto') self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show) +# In this function we segregate the areas by their categories(labels) and then take the mean per category. +#then we display those figures. def area_vs_category(self, save=True, show=False): mean_areas = self.dataset.groupby('Label')['area'].mean() plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist()) self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show) - +#In this function we take the mean of the images segregated in groups as per their labels. +#in these we then choose a matrixof datset and name them images. +# we take the mean and by rows(or columns) and standardize it. +# now we display it. def mean_images(self, save=True, show=False): groups = self.dataset.groupby('Label') for group in groups: @@ -96,7 +118,10 @@ def mean_images(self, save=True, show=False): mean_image = np.array(list(images)).mean(axis=0) plot = plt.imshow(mean_image/255) self.save_or_show(plot.figure, 'mean_images', str(group[0]), save=save, show=show) - +# in this function we find the eigen values and the eigen vectors of the system through principal component analysis. +#find the mean of the eigenvectors. +#we change the dimension of the mean matrix. 
+#finding the eigen images by rounding off the eigen vectors and then displaying it. def eigen_images(self, save=True, show=False): groups = self.dataset.groupby('Label') for group in groups: From 4dd0dacf6faaf6a29b800f4ae0ba9a6129d248e6 Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Sat, 20 Mar 2021 14:19:09 +0530 Subject: [PATCH 05/17] Update image_visualize.py --- klar_eda/visualize/image_visualize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index 3528432..f78491a 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -136,7 +136,8 @@ def eigen_images(self, save=True, show=False): img = np.round((eigenVectors[i] + 1)/2) plot = plt.imshow(img) self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show) - +#in this function we try to figure out the number of images in each category(labels) by aranging them in descending order(frequency wise/no of images wise). +#we then represent it visually by plotting barplots to show them. def num_images_by_category(self, save=True, show=False): counts = self.dataset['Label'].value_counts() plot = sns.barplot(x=counts.index, y=counts.tolist()) From 457fd7c19bf68c7e1d559a66b30eac2646242e71 Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Sat, 20 Mar 2021 20:06:27 +0530 Subject: [PATCH 06/17] Update image_visualize.py here I have documented the methods in image_visualize.py asper what I could understand. Please go through it .I tried to create a function for implementing all these methods but I don't know which dataset to implement it on. A little more clarification from your end will be appreciated. 
Thank you --- klar_eda/visualize/image_visualize.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index f78491a..6c003e6 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -144,7 +144,9 @@ def num_images_by_category(self, save=True, show=False): self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show) plot = plt.pie(counts.tolist(), labels=counts.index) self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show) - +#here we plot a graph showing the dependence of standard deviation on the mean of the data.here standard deviation is represented as a function of mean and then plotted. +# first the images are segregated by their labels and then their std and mean are taken into variables images and mean. +#they are then apepnede or put into the y and x cordintae of the graph and then plotted. def std_vs_mean(self, save=True, show=False): groups = self.dataset.groupby('Label') y = [] @@ -166,6 +168,8 @@ def std_vs_mean(self, save=True, show=False): labels = self.dataset['Label'].to_list() plot = sns.scatterplot(x=means, y=stds, hue=labels, palette='viridis', legend='full') self.save_or_show(plot.figure, 'std_vs_mean', 'std_vs_mean_all',x_label='mean', y_label='Std Deviation', save=save, show=show) +#t-Distributed Stochastic Neighbor Embedding (t-SNE) is an unsupervised, non-linear technique primarily used for data exploration and visualizing high-dimensional data. +# In simpler terms, t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. 
def t_sne(self, batch_size=32, save=True, show=False): model = ResNet50(weights='imagenet', pooling=max, include_top = False) From ca556a3d30159e824ddf4e96a0795e01f5eef2cb Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Tue, 23 Mar 2021 15:14:57 +0530 Subject: [PATCH 07/17] Update image_visualize.py --- klar_eda/visualize/image_visualize.py | 72 +++++++++++++-------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index 6c003e6..23002a1 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -24,11 +24,11 @@ ############################################################################ class ImageDataVisualize: -# the first function is used intialize/get data in the form of images and labels. -#It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly. -#If it is not found an error is shown.this is used for validating the images. -#The dataset is then prepared from the dataframe by giving all the appropriate tags to the respective dataframes.Image,height,width labels. -#Thea area is then calculated from the height and width and the number of imagesare then printed out. +""" init: the first function is used intialize/get data in the form of images and labels. + :It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly. + :If it is not found an error is shown.this is used for validating the images. + :The dataset is then prepared from the dataframe by giving all the appropriate tags to the respective dataframes.Image,height,width labels. 
+ :Thea area is then calculated from the height and width and the number of imagesare then printed out.""" def __init__(self, data, labels, boxes=None): self.images = data @@ -53,13 +53,13 @@ def __init__(self, data, labels, boxes=None): }) self.dataset['area'] = self.dataset['Height'] * self.dataset['Width'] print('Number of images after validation and filtering:', self.num_images) -#This function is written for saving the file(of the program) for better access and smooth running. -#it looks for the directory where the file can be saved by applying the join function if actually it is saved. -# It looks for the directory of the file where it can be saved (by using the join function) and if such directory doen't exist a new directory has to be made by using the makedirs function. -# the x_label and y_label are also been given their respective titles. -# the title of the plot is being labelled by combining (formatting) the plot_type and file_name. -# the path is saved by the join fucntion which combines the directory and the file name (save_dir,file_name). -# the title is then displayed. +"""This function is written for saving the file(of the program) for better access and smooth running. + :it looks for the directory where the file can be saved by applying the join function if actually it is saved. + : It looks for the directory of the file where it can be saved . + : the x_label and y_label are also been given their respective titles. 
+ :the title of the plot is being labelled by combining (formatting) the plot_type and file_name.""" + + def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False): if save: save_dir = join(VIZ_ROOT, plot_type) @@ -76,10 +76,10 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa plt.title("{}: {}".format(plot_type, file_name)) plt.show() plt.clf() -# this function is used to check about the data we obtained from the images that whether or not its in the desired format or not. -# if the image type isn't in an n dimensional array format then it is discarded else it is saved. -#if the image type is in ndimensional array format then its accepted or it is discarded. -#also if the dimensions of the image <=2,then again the images are skipped/discarded. +""" validate_images: this function is used to check about the data we obtained from the images that whether or not its in the desired format or not. + if the image type isn't in an n dimensional array format then it is discarded else it is saved. +if the image type is in ndimensional array format then its accepted or it is discarded. +also if the dimensions of the image <=2,then again the images are skipped/discarded.""" def validate_images(self): for image, label in zip(self.images, self.labels): @@ -93,24 +93,24 @@ def validate_images(self): self.images.remove(image) self.labels.remove(label) continue -# this function is used to define the aspect_ratio of the histogram plotted. -# the aspect ratrio of the histogram plotted is the ratio of its idth to the height(ratio=width/height) -# It is commonly used to describe the proportions of a rectangular screen. +""" this function is used to define the aspect_ratio of the histogram plotted. 
+the aspect ratrio of the histogram plotted is the ratio of its idth to the height(ratio=width/height) +It is commonly used to describe the proportions of a rectangular screen.""" def aspect_ratio_histogram(self, save=True, show=False): aspect_ratios = self.dataset['Width'] / self.dataset['Height'] plot = sns.histplot(aspect_ratios, bins='auto') self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show) -# In this function we segregate the areas by their categories(labels) and then take the mean per category. -#then we display those figures. +""" area vs category: In this function we segregate the areas by their categories(labels) and then take the mean per category. +then we display those figures.""" def area_vs_category(self, save=True, show=False): mean_areas = self.dataset.groupby('Label')['area'].mean() plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist()) self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show) -#In this function we take the mean of the images segregated in groups as per their labels. -#in these we then choose a matrixof datset and name them images. -# we take the mean and by rows(or columns) and standardize it. -# now we display it. +""" mean_images:In this function we take the mean of the images segregated in groups as per their labels. +in these we then choose a matrixof datset and name them images. + we take the mean and by rows(or columns) and standardize it. 
+ now we display it.""" def mean_images(self, save=True, show=False): groups = self.dataset.groupby('Label') for group in groups: @@ -118,10 +118,10 @@ def mean_images(self, save=True, show=False): mean_image = np.array(list(images)).mean(axis=0) plot = plt.imshow(mean_image/255) self.save_or_show(plot.figure, 'mean_images', str(group[0]), save=save, show=show) -# in this function we find the eigen values and the eigen vectors of the system through principal component analysis. -#find the mean of the eigenvectors. -#we change the dimension of the mean matrix. -#finding the eigen images by rounding off the eigen vectors and then displaying it. +""" in this function we find the eigen values and the eigen vectors of the system through principal component analysis. +find the mean of the eigenvectors. +we change the dimension of the mean matrix. +finding the eigen images by rounding off the eigen vectors and then displaying it.""" def eigen_images(self, save=True, show=False): groups = self.dataset.groupby('Label') for group in groups: @@ -136,17 +136,17 @@ def eigen_images(self, save=True, show=False): img = np.round((eigenVectors[i] + 1)/2) plot = plt.imshow(img) self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show) -#in this function we try to figure out the number of images in each category(labels) by aranging them in descending order(frequency wise/no of images wise). -#we then represent it visually by plotting barplots to show them. +"""in this function we try to figure out the number of images in each category(labels) by aranging them in descending order(frequency wise/no of images wise). +we then represent it visually by plotting barplots to show them.""" def num_images_by_category(self, save=True, show=False): counts = self.dataset['Label'].value_counts() plot = sns.barplot(x=counts.index, y=counts.tolist()) self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. 
of images', save=save, show=show) plot = plt.pie(counts.tolist(), labels=counts.index) self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show) -#here we plot a graph showing the dependence of standard deviation on the mean of the data.here standard deviation is represented as a function of mean and then plotted. -# first the images are segregated by their labels and then their std and mean are taken into variables images and mean. -#they are then apepnede or put into the y and x cordintae of the graph and then plotted. +"""here we plot a graph showing the dependence of standard deviation on the mean of the data.here standard deviation is represented as a function of mean and then plotted. + first the images are segregated by their labels and then their std and mean are taken into variables images and mean. +they are then apepnede or put into the y and x cordintae of the graph and then plotted.""" def std_vs_mean(self, save=True, show=False): groups = self.dataset.groupby('Label') y = [] @@ -168,8 +168,8 @@ def std_vs_mean(self, save=True, show=False): labels = self.dataset['Label'].to_list() plot = sns.scatterplot(x=means, y=stds, hue=labels, palette='viridis', legend='full') self.save_or_show(plot.figure, 'std_vs_mean', 'std_vs_mean_all',x_label='mean', y_label='Std Deviation', save=save, show=show) -#t-Distributed Stochastic Neighbor Embedding (t-SNE) is an unsupervised, non-linear technique primarily used for data exploration and visualizing high-dimensional data. -# In simpler terms, t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. +"""t-Distributed Stochastic Neighbor Embedding (t-SNE) is an unsupervised, non-linear technique primarily used for data exploration and visualizing high-dimensional data. +: In simpler terms, t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. 
""" def t_sne(self, batch_size=32, save=True, show=False): model = ResNet50(weights='imagenet', pooling=max, include_top = False) From 983d617666daa0be757e483809a35228081fe825 Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Wed, 24 Mar 2021 19:28:58 +0530 Subject: [PATCH 08/17] Update image_visualize.py --- klar_eda/visualize/image_visualize.py | 53 +++++++++------------------ 1 file changed, 18 insertions(+), 35 deletions(-) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index 23002a1..450ac03 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -24,13 +24,16 @@ ############################################################################ class ImageDataVisualize: -""" init: the first function is used intialize/get data in the form of images and labels. - :It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly. - :If it is not found an error is shown.this is used for validating the images. - :The dataset is then prepared from the dataframe by giving all the appropriate tags to the respective dataframes.Image,height,width labels. - :Thea area is then calculated from the height and width and the number of imagesare then printed out.""" + def __init__(self, data, labels, boxes=None): + """init:this function is for initializing the parameters to work on + :self_param:the file from which we have to takee the data to work on + :self_type:csv file + :data_param:the images form our dataset + :labels_param:to categorize the images. 
+ :boxes_param:a null parameter used for storing the dimensions of the images.""" + self.images = data self.labels = labels self.grey_present = False @@ -53,14 +56,11 @@ def __init__(self, data, labels, boxes=None): }) self.dataset['area'] = self.dataset['Height'] * self.dataset['Width'] print('Number of images after validation and filtering:', self.num_images) -"""This function is written for saving the file(of the program) for better access and smooth running. - :it looks for the directory where the file can be saved by applying the join function if actually it is saved. - : It looks for the directory of the file where it can be saved . - : the x_label and y_label are also been given their respective titles. - :the title of the plot is being labelled by combining (formatting) the plot_type and file_name.""" + def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False): + if save: save_dir = join(VIZ_ROOT, plot_type) if not exists(save_dir): @@ -76,10 +76,7 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa plt.title("{}: {}".format(plot_type, file_name)) plt.show() plt.clf() -""" validate_images: this function is used to check about the data we obtained from the images that whether or not its in the desired format or not. - if the image type isn't in an n dimensional array format then it is discarded else it is saved. -if the image type is in ndimensional array format then its accepted or it is discarded. -also if the dimensions of the image <=2,then again the images are skipped/discarded.""" + def validate_images(self): for image, label in zip(self.images, self.labels): @@ -93,24 +90,18 @@ def validate_images(self): self.images.remove(image) self.labels.remove(label) continue -""" this function is used to define the aspect_ratio of the histogram plotted. 
-the aspect ratrio of the histogram plotted is the ratio of its idth to the height(ratio=width/height) -It is commonly used to describe the proportions of a rectangular screen.""" + def aspect_ratio_histogram(self, save=True, show=False): aspect_ratios = self.dataset['Width'] / self.dataset['Height'] plot = sns.histplot(aspect_ratios, bins='auto') self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show) -""" area vs category: In this function we segregate the areas by their categories(labels) and then take the mean per category. -then we display those figures.""" + def area_vs_category(self, save=True, show=False): mean_areas = self.dataset.groupby('Label')['area'].mean() plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist()) self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show) -""" mean_images:In this function we take the mean of the images segregated in groups as per their labels. -in these we then choose a matrixof datset and name them images. - we take the mean and by rows(or columns) and standardize it. - now we display it.""" + def mean_images(self, save=True, show=False): groups = self.dataset.groupby('Label') for group in groups: @@ -118,11 +109,7 @@ def mean_images(self, save=True, show=False): mean_image = np.array(list(images)).mean(axis=0) plot = plt.imshow(mean_image/255) self.save_or_show(plot.figure, 'mean_images', str(group[0]), save=save, show=show) -""" in this function we find the eigen values and the eigen vectors of the system through principal component analysis. -find the mean of the eigenvectors. -we change the dimension of the mean matrix. 
-finding the eigen images by rounding off the eigen vectors and then displaying it.""" - def eigen_images(self, save=True, show=False): + groups = self.dataset.groupby('Label') for group in groups: images = group[1]['Image'] @@ -136,17 +123,14 @@ def eigen_images(self, save=True, show=False): img = np.round((eigenVectors[i] + 1)/2) plot = plt.imshow(img) self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show) -"""in this function we try to figure out the number of images in each category(labels) by aranging them in descending order(frequency wise/no of images wise). -we then represent it visually by plotting barplots to show them.""" + def num_images_by_category(self, save=True, show=False): counts = self.dataset['Label'].value_counts() plot = sns.barplot(x=counts.index, y=counts.tolist()) self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show) plot = plt.pie(counts.tolist(), labels=counts.index) self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show) -"""here we plot a graph showing the dependence of standard deviation on the mean of the data.here standard deviation is represented as a function of mean and then plotted. - first the images are segregated by their labels and then their std and mean are taken into variables images and mean. 
-they are then apepnede or put into the y and x cordintae of the graph and then plotted.""" + def std_vs_mean(self, save=True, show=False): groups = self.dataset.groupby('Label') y = [] @@ -168,8 +152,7 @@ def std_vs_mean(self, save=True, show=False): labels = self.dataset['Label'].to_list() plot = sns.scatterplot(x=means, y=stds, hue=labels, palette='viridis', legend='full') self.save_or_show(plot.figure, 'std_vs_mean', 'std_vs_mean_all',x_label='mean', y_label='Std Deviation', save=save, show=show) -"""t-Distributed Stochastic Neighbor Embedding (t-SNE) is an unsupervised, non-linear technique primarily used for data exploration and visualizing high-dimensional data. -: In simpler terms, t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. """ + def t_sne(self, batch_size=32, save=True, show=False): model = ResNet50(weights='imagenet', pooling=max, include_top = False) From bd33030932cc61f398375f4d53fe906fa2924850 Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Thu, 25 Mar 2021 12:57:26 +0530 Subject: [PATCH 09/17] Update image_visualize.py --- klar_eda/visualize/image_visualize.py | 34 +++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index 450ac03..186053c 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -60,6 +60,15 @@ def __init__(self, data, labels, boxes=None): def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False): + """function save_or_show: to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type. + :plot_param-the figure to be plotted for graphical visualization. + :plot_type- the file in which all the visualizations are stored. 
+ :file_name- the name of the file to be stored. + :x-label - the label to be put on the x-axis of the graph + :y-label- the label to be put on the y-axis of the graph + :save-parameter- the boolean parameter passed for saving the file. + :show-parameter- display the fiel along with its title and also displayin gthe plot.""" + if save: save_dir = join(VIZ_ROOT, plot_type) @@ -79,6 +88,9 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa def validate_images(self): + """validate_images:the function used to validate images,whether or not it has the required no of dimensions and whether it's a numpy array or not. + :self-the dataset on which the visualization and the analysis has to be performed.""" + for image, label in zip(self.images, self.labels): if type(image) != np.ndarray: print('Image not a numpy array, skipping...') @@ -92,17 +104,26 @@ def validate_images(self): continue def aspect_ratio_histogram(self, save=True, show=False): + """ aspect_ratio_histogram:the function used to define the aspect ratio of the histogram.aspect_ratio=Width/Height. + :save-param:the boolean for instructing to save the file. + :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" aspect_ratios = self.dataset['Width'] / self.dataset['Height'] plot = sns.histplot(aspect_ratios, bins='auto') self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show) def area_vs_category(self, save=True, show=False): + """area_vs_category:the plot to show the areas percategory(label). + :save-param:the boolean for instructing to save the file. 
+ :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" mean_areas = self.dataset.groupby('Label')['area'].mean() plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist()) self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show) def mean_images(self, save=True, show=False): + """mean_images:The function for evaluating the mean of the areas per category. + :save-param:the boolean for instructing to save the file. + :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" groups = self.dataset.groupby('Label') for group in groups: images = group[1]['Image'] @@ -125,6 +146,9 @@ def mean_images(self, save=True, show=False): self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show) def num_images_by_category(self, save=True, show=False): + """ the function to display the no of images per category. + :save-param:the boolean for instructing to save the file. + :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" counts = self.dataset['Label'].value_counts() plot = sns.barplot(x=counts.index, y=counts.tolist()) self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show) @@ -132,6 +156,11 @@ def num_images_by_category(self, save=True, show=False): self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show) def std_vs_mean(self, save=True, show=False): + """std_vs_mean:the function used to plot the graph of the standard deviation versus the mean. + : self_param:The dataset on which the analysis is used. + :save-param:the boolean for instructing to save the file. 
+ :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" + groups = self.dataset.groupby('Label') y = [] x = [] @@ -155,6 +184,11 @@ def std_vs_mean(self, save=True, show=False): def t_sne(self, batch_size=32, save=True, show=False): + """t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. + :batch_size:the dataset is dividied into batches for smooth functioning of the model on the dataset. + :save-param:the boolean for instructing to save the file. + ::show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" + model = ResNet50(weights='imagenet', pooling=max, include_top = False) features_list = [] print('Extracting features ...') From 4776bb5d122854ce96c8888c3e3a143809a12e1e Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Thu, 25 Mar 2021 18:13:30 +0530 Subject: [PATCH 10/17] Update image_visualize.py --- klar_eda/visualize/image_visualize.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index 186053c..aead948 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -27,13 +27,13 @@ class ImageDataVisualize: def __init__(self, data, labels, boxes=None): - """init:this function is for initializing the parameters to work on + """this function is for initializing the parameters to work on :self_param:the file from which we have to takee the data to work on :self_type:csv file :data_param:the images form our dataset :labels_param:to categorize the images. - :boxes_param:a null parameter used for storing the dimensions of the images.""" - + :boxes_param:a null parameter used for storing the dimensions of the images. 
+ """ self.images = data self.labels = labels self.grey_present = False @@ -60,15 +60,15 @@ def __init__(self, data, labels, boxes=None): def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False): - """function save_or_show: to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type. + """ to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type. :plot_param-the figure to be plotted for graphical visualization. :plot_type- the file in which all the visualizations are stored. :file_name- the name of the file to be stored. :x-label - the label to be put on the x-axis of the graph :y-label- the label to be put on the y-axis of the graph :save-parameter- the boolean parameter passed for saving the file. - :show-parameter- display the fiel along with its title and also displayin gthe plot.""" - + :show-parameter- display the fiel along with its title and also displayin gthe plot. + """ if save: save_dir = join(VIZ_ROOT, plot_type) @@ -121,7 +121,7 @@ def area_vs_category(self, save=True, show=False): self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show) def mean_images(self, save=True, show=False): - """mean_images:The function for evaluating the mean of the areas per category. + """The function for evaluating the mean of the areas per category. :save-param:the boolean for instructing to save the file. 
:show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" groups = self.dataset.groupby('Label') @@ -156,7 +156,7 @@ def num_images_by_category(self, save=True, show=False): self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show) def std_vs_mean(self, save=True, show=False): - """std_vs_mean:the function used to plot the graph of the standard deviation versus the mean. + """the function used to plot the graph of the standard deviation versus the mean. : self_param:The dataset on which the analysis is used. :save-param:the boolean for instructing to save the file. :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" From 2e024fc3580ed26c97d3cf8631263b90c359725e Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Tue, 30 Mar 2021 16:19:06 +0530 Subject: [PATCH 11/17] Update image_visualize.py --- klar_eda/visualize/image_visualize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index aead948..1f9ae53 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -28,8 +28,8 @@ class ImageDataVisualize: def __init__(self, data, labels, boxes=None): """this function is for initializing the parameters to work on - :self_param:the file from which we have to takee the data to work on - :self_type:csv file + :self_param:the file from which we have to take the data to work on + :type:csv file :data_param:the images form our dataset :labels_param:to categorize the images. :boxes_param:a null parameter used for storing the dimensions of the images. 
From bcc56f44602fc730a60135790df6b557b38e3635 Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Tue, 30 Mar 2021 19:02:50 +0530 Subject: [PATCH 12/17] Update image_visualize.py Have updated the changes after running them on my local machine. Please have a look. Also trying to change the .rst file you suggested. --- klar_eda/visualize/image_visualize.py | 83 ++++++++++++++++----------- 1 file changed, 50 insertions(+), 33 deletions(-) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index 1f9ae53..8f4a1c5 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -27,13 +27,13 @@ class ImageDataVisualize: def __init__(self, data, labels, boxes=None): - """this function is for initializing the parameters to work on - :self_param:the file from which we have to take the data to work on - :type:csv file - :data_param:the images form our dataset - :labels_param:to categorize the images. - :boxes_param:a null parameter used for storing the dimensions of the images. - """ + """ this function is for initializing the parameters to work on + :param self:the file from which we have to take the data to work on + :type self:csv file + :param data:the images form our dataset + :param labels:to categorize the images. + :param boxes:a null parameter used for storing the dimensions of the images. + """ self.images = data self.labels = labels self.grey_present = False @@ -60,15 +60,19 @@ def __init__(self, data, labels, boxes=None): def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False): - """ to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type. - :plot_param-the figure to be plotted for graphical visualization. - :plot_type- the file in which all the visualizations are stored.
- :file_name- the name of the file to be stored. - :x-label - the label to be put on the x-axis of the graph - :y-label- the label to be put on the y-axis of the graph - :save-parameter- the boolean parameter passed for saving the file. - :show-parameter- display the fiel along with its title and also displayin gthe plot. - """ + """ to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type. + :param plot-the figure to be plotted for graphical visualization. + :type plot:png file. + :param file_name: the name of the file to be stored. + :type file_name:csv file + :param x-label : the label to be put on the x-axis of the graph + :type x-label: string + :param y-label: the label to be put on the y-axis of the graph + :type y-label: string + : param save: the boolean parameter passed for saving the file. + :type save:boolean + :param show: display the fiel along with its title and also displayin gthe plot. + :type show:boolean """ if save: save_dir = join(VIZ_ROOT, plot_type) @@ -89,8 +93,7 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa def validate_images(self): """validate_images:the function used to validate images,whether or not it has the required no of dimensions and whether it's a numpy array or not. - :self-the dataset on which the visualization and the analysis has to be performed.""" - + : param self-the dataset on which the visualization and the analysis has to be performed.""" for image, label in zip(self.images, self.labels): if type(image) != np.ndarray: print('Image not a numpy array, skipping...') @@ -105,8 +108,11 @@ def validate_images(self): def aspect_ratio_histogram(self, save=True, show=False): """ aspect_ratio_histogram:the function used to define the aspect ratio of the histogram.aspect_ratio=Width/Height. - :save-param:the boolean for instructing to save the file. 
- :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" + :param save:the boolean for instructing to save the file. + :type save:boolean + :param show:to display the ratios,the plot ,the labels and everything related to visualisation. + :type show:boolean + """ aspect_ratios = self.dataset['Width'] / self.dataset['Height'] plot = sns.histplot(aspect_ratios, bins='auto') self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show) @@ -114,16 +120,20 @@ def aspect_ratio_histogram(self, save=True, show=False): def area_vs_category(self, save=True, show=False): """area_vs_category:the plot to show the areas percategory(label). - :save-param:the boolean for instructing to save the file. - :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" + :param save:the boolean for instructing to save the file. + :type save:boolean + :param show:to display the ratios,the plot ,the labels and everything related to visualisation. + :type show:boolean""" mean_areas = self.dataset.groupby('Label')['area'].mean() plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist()) self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show) def mean_images(self, save=True, show=False): """The function for evaluating the mean of the areas per category. - :save-param:the boolean for instructing to save the file. - :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" + :param save:the boolean for instructing to save the file. + :type save:boolean + :param show:to display the ratios,the plot ,the labels and everything related to visualisation. 
+ :type show:boolean""" groups = self.dataset.groupby('Label') for group in groups: images = group[1]['Image'] @@ -147,8 +157,10 @@ def mean_images(self, save=True, show=False): def num_images_by_category(self, save=True, show=False): """ the function to display the no of images per category. - :save-param:the boolean for instructing to save the file. - :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" + :param save:the boolean for instructing to save the file. + :type save:boolean + :param show:to display the ratios,the plot ,the labels and everything related to visualisation. + :type show:boolean""" counts = self.dataset['Label'].value_counts() plot = sns.barplot(x=counts.index, y=counts.tolist()) self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show) @@ -156,10 +168,11 @@ def num_images_by_category(self, save=True, show=False): self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show) def std_vs_mean(self, save=True, show=False): - """the function used to plot the graph of the standard deviation versus the mean. - : self_param:The dataset on which the analysis is used. - :save-param:the boolean for instructing to save the file. - :show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" + """the function used to plot the graph of the standard deviation versus the mean. + :param save:the boolean for instructing to save the file. + :type save:boolean + :param show:to display the ratios,the plot ,the labels and everything related to visualisation. + :type show:boolean""" groups = self.dataset.groupby('Label') y = [] @@ -185,9 +198,13 @@ def std_vs_mean(self, save=True, show=False): def t_sne(self, batch_size=32, save=True, show=False): """t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. 
- :batch_size:the dataset is dividied into batches for smooth functioning of the model on the dataset. - :save-param:the boolean for instructing to save the file. - ::show-param:to display the ratios,the plot ,the labels and everything related to visualisation.""" + :param batch_size:the dataset is dividied into batches for smooth functioning of the model on the dataset. + :type batch_size:integer + :param save:the boolean for instructing to save the file. + :type save:boolean + :param show:to display the ratios,the plot ,the labels and everything related to visualisation. + :type show:boolean + """ model = ResNet50(weights='imagenet', pooling=max, include_top = False) features_list = [] From 6bbe76fa9b78581d3b42803f6478804eb81504a3 Mon Sep 17 00:00:00 2001 From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com> Date: Tue, 30 Mar 2021 23:33:25 +0530 Subject: [PATCH 13/17] Update image_visualize.py --- klar_eda/visualize/image_visualize.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index 8f4a1c5..edc186a 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -28,9 +28,8 @@ class ImageDataVisualize: def __init__(self, data, labels, boxes=None): """ this function is for initializing the parameters to work on - :param self:the file from which we have to take the data to work on - :type self:csv file :param data:the images form our dataset + :type data:csv file. :param labels:to categorize the images. :param boxes:a null parameter used for storing the dimensions of the images. 
""" @@ -61,7 +60,7 @@ def __init__(self, data, labels, boxes=None): def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False): """ to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type. - :param plot-the figure to be plotted for graphical visualization. + :param plot:the figure to be plotted for graphical visualization. :type plot:png file. :param file_name: the name of the file to be stored. :type file_name:csv file @@ -71,7 +70,7 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa :type y-label: string : param save: the boolean parameter passed for saving the file. :type save:boolean - :param show: display the fiel along with its title and also displayin gthe plot. + :param show: display the fiel along with its title and also displaying the plot. :type show:boolean """ if save: @@ -93,7 +92,7 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa def validate_images(self): """validate_images:the function used to validate images,whether or not it has the required no of dimensions and whether it's a numpy array or not. 
- : param self-the dataset on which the visualization and the analysis has to be performed.""" + : param self-the dataset on which the visualization and the analysis has to be performed.""" for image, label in zip(self.images, self.labels): if type(image) != np.ndarray: print('Image not a numpy array, skipping...') From 51b920024493a79fdfa093281aac0fa39f2bcf21 Mon Sep 17 00:00:00 2001 From: ask149 Date: Wed, 31 Mar 2021 01:04:15 +0530 Subject: [PATCH 14/17] Refactor the code to the standards --- docsource/image_visualize.rst | 6 + docsource/index.rst | 1 + klar_eda/visualize/image_visualize.py | 159 ++++++++++++++------------ 3 files changed, 94 insertions(+), 72 deletions(-) create mode 100644 docsource/image_visualize.rst diff --git a/docsource/image_visualize.rst b/docsource/image_visualize.rst new file mode 100644 index 0000000..46dbb00 --- /dev/null +++ b/docsource/image_visualize.rst @@ -0,0 +1,6 @@ +Image Visualize +========================= + +.. automodule:: klar_eda.visualize.image_visualize + :members: + :undoc-members: diff --git a/docsource/index.rst b/docsource/index.rst index efdb37d..1ce5013 100644 --- a/docsource/index.rst +++ b/docsource/index.rst @@ -12,6 +12,7 @@ klar-eda's documentation! 
preprocess visualize + image_visualize Indices and tables diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py index edc186a..5c6d0a6 100644 --- a/klar_eda/visualize/image_visualize.py +++ b/klar_eda/visualize/image_visualize.py @@ -14,8 +14,8 @@ ############################################################################ # To do: 1) resizing for the funnctions that require uniform size -# 2) handle rgb/gray images -# 3) axis labels, plot title +# 2) handle rgb/gray images +# 3) axis labels, plot title # 4) num components in eigen images # 5) optimize mean/eigen computation # 6) optimize std vs mean, different types of plots @@ -27,11 +27,12 @@ class ImageDataVisualize: def __init__(self, data, labels, boxes=None): - """ this function is for initializing the parameters to work on - :param data:the images form our dataset - :type data:csv file. - :param labels:to categorize the images. - :param boxes:a null parameter used for storing the dimensions of the images. + """Constructor for Image Data Visualization. + + :param data: images + :type data: list of numpy image arrays + :param labels: labels corresponding to each image + :param boxes: list containing shape of each image """ self.images = data self.labels = labels @@ -56,23 +57,27 @@ def __init__(self, data, labels, boxes=None): self.dataset['area'] = self.dataset['Height'] * self.dataset['Width'] print('Number of images after validation and filtering:', self.num_images) - + def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False): - """ to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type. - :param plot:the figure to be plotted for graphical visualization. - :type plot:png file. - :param file_name: the name of the file to be stored. 
- :type file_name:csv file - :param x-label : the label to be put on the x-axis of the graph - :type x-label: string - :param y-label: the label to be put on the y-axis of the graph - :type y-label: string - : param save: the boolean parameter passed for saving the file. - :type save:boolean - :param show: display the fiel along with its title and also displaying the plot. - :type show:boolean """ - + """To save the file(plot_type) in its designated directory or to make + the path for the directory if such directory doesn't exist and then + displaying the file type. + + :param plot: The figure to be plotted for graphical visualization. + :type plot: png file. + :param file_name: The filename to be stored. + :type file_name: csv file + :param x-label : The label to be put on the x-axis of the graph + :type x-label: string + :param y-label: The label to be put on the y-axis of the graph + :type y-label: string + :param save: To save the results in the background + :type save: boolean + :param show: To display the images in the foreground + :type show: boolean + """ + if save: save_dir = join(VIZ_ROOT, plot_type) if not exists(save_dir): @@ -90,14 +95,14 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa plt.clf() + + def validate_images(self): - """validate_images:the function used to validate images,whether or not it has the required no of dimensions and whether it's a numpy array or not. 
- : param self-the dataset on which the visualization and the analysis has to be performed.""" + """Function used to validate images, whether or not it has the required + no of dimensions and whether it's a numpy array or not.""" for image, label in zip(self.images, self.labels): if type(image) != np.ndarray: print('Image not a numpy array, skipping...') - self.images.remove(image) - self.labels.remove(label) continue elif image.ndim < 2: print('Image has less than 2 dimensions, skipping...') @@ -105,38 +110,43 @@ def validate_images(self): self.labels.remove(label) continue + continue + def aspect_ratio_histogram(self, save=True, show=False): - """ aspect_ratio_histogram:the function used to define the aspect ratio of the histogram.aspect_ratio=Width/Height. - :param save:the boolean for instructing to save the file. - :type save:boolean - :param show:to display the ratios,the plot ,the labels and everything related to visualisation. - :type show:boolean + """Function used to plot the aspect ratio histogram for the dataset. + + :param save: To save the results in the background + :type save: boolean + :param show: To display the images in the foreground + :type show: boolean """ aspect_ratios = self.dataset['Width'] / self.dataset['Height'] plot = sns.histplot(aspect_ratios, bins='auto') - self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show) + def area_vs_category(self, save=True, show=False): - """area_vs_category:the plot to show the areas percategory(label). - :param save:the boolean for instructing to save the file. - :type save:boolean - :param show:to display the ratios,the plot ,the labels and everything related to visualisation. - :type show:boolean""" + """Function used to plot area per category of the images. 
+ + :param save: To save the results in the background + :type save: boolean + :param show: To display the images in the foreground + :type show: boolean + """ mean_areas = self.dataset.groupby('Label')['area'].mean() plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist()) self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show) def mean_images(self, save=True, show=False): - """The function for evaluating the mean of the areas per category. - :param save:the boolean for instructing to save the file. - :type save:boolean - :param show:to display the ratios,the plot ,the labels and everything related to visualisation. - :type show:boolean""" + """Function used for evaluating the mean of the areas per category. + + :param save: To save the results in the background + :type save: boolean + :param show: To display the images in the foreground + :type show: boolean + """ groups = self.dataset.groupby('Label') - for group in groups: - images = group[1]['Image'] - mean_image = np.array(list(images)).mean(axis=0) + for group in groups:array(list(images)).mean(axis=0) plot = plt.imshow(mean_image/255) self.save_or_show(plot.figure, 'mean_images', str(group[0]), save=save, show=show) @@ -154,29 +164,33 @@ def mean_images(self, save=True, show=False): plot = plt.imshow(img) self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show) + self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show) + def num_images_by_category(self, save=True, show=False): - """ the function to display the no of images per category. - :param save:the boolean for instructing to save the file. - :type save:boolean - :param show:to display the ratios,the plot ,the labels and everything related to visualisation. - :type show:boolean""" + """Function used to display the no of images per category. 
+ + :param save: To save the results in the background + :type save: boolean + :param show: To display the images in the foreground + :type show: boolean + """ counts = self.dataset['Label'].value_counts() plot = sns.barplot(x=counts.index, y=counts.tolist()) - self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show) - plot = plt.pie(counts.tolist(), labels=counts.index) + self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show) + self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show) def std_vs_mean(self, save=True, show=False): - """the function used to plot the graph of the standard deviation versus the mean. - :param save:the boolean for instructing to save the file. - :type save:boolean - :param show:to display the ratios,the plot ,the labels and everything related to visualisation. - :type show:boolean""" - + """Function used to plot the graph of the standard deviation versus the + mean plot. + + :param save: To save the results in the background + :type save: boolean + :param show: To display the images in the foreground + :type show: boolean + """ groups = self.dataset.groupby('Label') - y = [] - x = [] - hue = [] + y = [][] for group in groups: images = group[1]['Image'] images = np.array(list(images)) @@ -196,24 +210,25 @@ def std_vs_mean(self, save=True, show=False): def t_sne(self, batch_size=32, save=True, show=False): - """t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. - :param batch_size:the dataset is dividied into batches for smooth functioning of the model on the dataset. - :type batch_size:integer - :param save:the boolean for instructing to save the file. - :type save:boolean - :param show:to display the ratios,the plot ,the labels and everything related to visualisation. 
- :type show:boolean - """ + """ t-distributed Stochastic Neighbor Embedding - used to visualize high dimensional data + :param batch_size: The size of the batch + :type batch_size: integer + :param save: To save the results in the background + :type save: boolean + :param show: To display the images in the foreground + :type show: boolean + """ + model = ResNet50(weights='imagenet', pooling=max, include_top = False) features_list = [] print('Extracting features ...') for image in tqdm(self.images): if self.grey_present and (image.ndim < 3 or image.shape[-1] == 1): image = np.stack((image.squeeze(),)*3, axis=-1) - image = np.expand_dims(image, axis=0) - image = preprocess_input(image) - features = model.predict(image) + image = np.expand_dims(image, axis=0) + image = preprocess_input(image) + features = model.predict(image) features_reduce = features.squeeze() features_list.append(features_reduce) From c2a82cccb8b39f97d48e39e9d83fa72017ed0ac1 Mon Sep 17 00:00:00 2001 From: Ashish <54330052+ashish-hacker@users.noreply.github.com> Date: Fri, 2 Apr 2021 01:56:57 +0530 Subject: [PATCH 15/17] Added the standardisation function (#29) * Added the standardisation function * Added Mean Normalization & changed name of the function standardize to z_score_normalization * Added the standardisation function * Added Mean Normalization & changed name of the function standardize to z_score_normalization Co-authored-by: ask149 --- klar_eda/preprocess/csv_preprocess.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/klar_eda/preprocess/csv_preprocess.py b/klar_eda/preprocess/csv_preprocess.py index 862191c..1c157d4 100644 --- a/klar_eda/preprocess/csv_preprocess.py +++ b/klar_eda/preprocess/csv_preprocess.py @@ -84,6 +84,25 @@ def normalize_numerical(self): for col in self.numerical_column_list: if col != self.target_column: self.df[col]=(self.df[col]-self.df[col].min())/(self.df[col].max()-self.df[col].min()) + def standardize(self): + + ### 
Data use cases for Standardization: ### + + # It makes the data with unit variance and zero mean. + # This will be used when the features have different scales , for example if there are two features salary and age , Obviously age will be from 1-100 and salary can be substantially higher than age values. So if we fit the model directly the salary feature will have a larger impact on predicting the target variable. But it may not be the case. + # So It's necessary to standardise the data. + # We should do standardization in case of algorithms where Gradient descent is used for optimizations, for achieving the minima faster. + # Standardisation is also called z-score normalisation. + + for i in df.columns: + self.df[i] = (self.df[i] - self.df[i].mean())/self.df[i].std() # Standardise the data z = (x - mean)/ (standard deviation) + + def mean_normalization(self): + """ converts x to x' where, + x' = (x - mean(x))/(max(x) - min(x)) + """ + for col in df.columns: + self.df[i] = (self.df[i] - self.df[i].mean())/(self.df[i].max() - self.df[i].min()) def encode_categorical(self): enc = OneHotEncoder(handle_unknown='ignore') @@ -160,4 +179,4 @@ def convert_date_format(self, input_date, output_date_format = 'DD/MM/YYYY'): parsed_date = dateutil.parser.parse(input_date, dayfirst=True) self.converted_date = parsed_date.strftime(output_date_formats[output_date_format]) - return self.converted_date \ No newline at end of file + return self.converted_date From 9903dab6df8db33c1856ed2f957aea808a749f8b Mon Sep 17 00:00:00 2001 From: Shankhanil Ghosh Date: Mon, 5 Apr 2021 09:31:50 +0530 Subject: [PATCH 16/17] resolved conflict in csv_preprocess --- klar_eda/preprocess/csv_preprocess.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/klar_eda/preprocess/csv_preprocess.py b/klar_eda/preprocess/csv_preprocess.py index 9a2bcd9..1c157d4 100644 --- a/klar_eda/preprocess/csv_preprocess.py +++ b/klar_eda/preprocess/csv_preprocess.py @@ -72,13 +72,8 @@ def fill_numerical_na(self, ret = 
False): self.df[col] = y except Exception as e: pass -<<<<<<< HEAD if ret == True: return self.df -======= - if ret == True: - return self.df ->>>>>>> issue22 def fill_categorical_na(self, ret = False): self.df = self.df.fillna("Unknown") From b80fd1d9bfdc7e05827fe0bb9fedb08d4cc18990 Mon Sep 17 00:00:00 2001 From: Shankhanil Ghosh Date: Fri, 16 Apr 2021 12:13:40 +0530 Subject: [PATCH 17/17] removed file --- klar_eda/preprocess/csv_preprocess.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/klar_eda/preprocess/csv_preprocess.py b/klar_eda/preprocess/csv_preprocess.py index 1c157d4..9a2bcd9 100644 --- a/klar_eda/preprocess/csv_preprocess.py +++ b/klar_eda/preprocess/csv_preprocess.py @@ -72,8 +72,13 @@ def fill_numerical_na(self, ret = False): self.df[col] = y except Exception as e: pass +<<<<<<< HEAD if ret == True: return self.df +======= + if ret == True: + return self.df +>>>>>>> issue22 def fill_categorical_na(self, ret = False): self.df = self.df.fillna("Unknown")