From 2633c60d757688ab21f856f6e43ad38a46cb4daf Mon Sep 17 00:00:00 2001
From: Shankhanil Ghosh <shankha.rik@gmail.com>
Date: Fri, 19 Mar 2021 22:08:19 +0530
Subject: [PATCH 01/17] created morphological preprocessing file

---
 klar_eda/preprocess/csv_preprocess.py         |   2 +-
 klar_eda/preprocess/image_preprocess.py       | 166 +++++++++---------
 .../preprocess/image_preprocess/__init__.py   |   3 +
 .../image_preprocess/morphological.py         | 114 ++++++++++++
 klar_eda/preprocessing.py                     |  23 +--
 5 files changed, 214 insertions(+), 94 deletions(-)
 create mode 100644 klar_eda/preprocess/image_preprocess/__init__.py
 create mode 100644 klar_eda/preprocess/image_preprocess/morphological.py

diff --git a/klar_eda/preprocess/csv_preprocess.py b/klar_eda/preprocess/csv_preprocess.py
index 43e9264..bbcd0f7 100644
--- a/klar_eda/preprocess/csv_preprocess.py
+++ b/klar_eda/preprocess/csv_preprocess.py
@@ -68,7 +68,7 @@ def fill_numerical_na(self, ret = False):
                         self.df[col] = y
             except Exception as e:
                 pass
-           if ret == True:
+        if ret == True:
             return self.df
 
     def fill_categorical_na(self, ret = False):
diff --git a/klar_eda/preprocess/image_preprocess.py b/klar_eda/preprocess/image_preprocess.py
index 8c4eeeb..50afaa9 100644
--- a/klar_eda/preprocess/image_preprocess.py
+++ b/klar_eda/preprocess/image_preprocess.py
@@ -96,93 +96,93 @@ def contrast_control(self, alpha = 1.25, beta = 0, save=True, show=False):
                 print('Error while changing contast for image ',image_index, e)
         self.cv2_image_list = contrast_image_list
 
-    def thresholding(self, technique = 'mean', threshold = cv2.THRESH_BINARY, save=True, show=False):
-        binarized_image_list = []
-        image_index = 0
-        #study the parameters
-        for image in self.cv2_image_list:
-            try:
-                if technique == 'simple':
-                    res , img = cv2.threshold(image, 120, 255, threshold)
-                    binarized_image_list.append(img)
-                    self.save_or_show_image(img,image_index,'threshold',save=save,show=show)
-                    image_index += 1
-                elif technique == 'mean':
-                    img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, threshold, 199, 5)
-                    binarized_image_list.append(img)
-                    self.save_or_show_image(img,image_index,'threshold',save=save,show=show)
-                    image_index += 1
-                else:
-                    img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, threshold, 199, 5)
-                    binarized_image_list.append(img)
-                    self.save_or_show_image(img,image_index,'threshold',save=save,show=show)
-                    image_index += 1
-            except Exception as e:
-                print('Error during binarization of image ', image_index, e)
-        self.cv2_image_list = binarized_image_list
+    # def thresholding(self, technique = 'mean', threshold = cv2.THRESH_BINARY, save=True, show=False):
+    #     binarized_image_list = []
+    #     image_index = 0
+    #     #study the parameters
+    #     for image in self.cv2_image_list:
+    #         try:
+    #             if technique == 'simple':
+    #                 res , img = cv2.threshold(image, 120, 255, threshold)
+    #                 binarized_image_list.append(img)
+    #                 self.save_or_show_image(img,image_index,'threshold',save=save,show=show)
+    #                 image_index += 1
+    #             elif technique == 'mean':
+    #                 img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, threshold, 199, 5)
+    #                 binarized_image_list.append(img)
+    #                 self.save_or_show_image(img,image_index,'threshold',save=save,show=show)
+    #                 image_index += 1
+    #             else:
+    #                 img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, threshold, 199, 5)
+    #                 binarized_image_list.append(img)
+    #                 self.save_or_show_image(img,image_index,'threshold',save=save,show=show)
+    #                 image_index += 1
+    #         except Exception as e:
+    #             print('Error during binarization of image ', image_index, e)
+    #     self.cv2_image_list = binarized_image_list
     
-    def denoise(self, is_gray = True, save=True, show=False):
-        denoised_image_list = []
-        image_index = 0
-        for image in self.cv2_image_list:
-            try:
-                if not is_gray:
-                    img = cv2.fastNlMeansDenoisingColored(image,None,10,10,7,21)
-                else:
-                    img = cv2.fastNlMeansDenoising(image,None,3,7,21)
-                denoised_image_list.append(img)
-                self.save_or_show_image(img,image_index,'denoise',save=save,show=show)
-                image_index += 1
-            except Exception as e:
-                print('Error during denoising image ', image_index, e)
-        self.cv2_image_list = denoised_image_list
+    # def denoise(self, is_gray = True, save=True, show=False):
+    #     denoised_image_list = []
+    #     image_index = 0
+    #     for image in self.cv2_image_list:
+    #         try:
+    #             if not is_gray:
+    #                 img = cv2.fastNlMeansDenoisingColored(image,None,10,10,7,21)
+    #             else:
+    #                 img = cv2.fastNlMeansDenoising(image,None,3,7,21)
+    #             denoised_image_list.append(img)
+    #             self.save_or_show_image(img,image_index,'denoise',save=save,show=show)
+    #             image_index += 1
+    #         except Exception as e:
+    #             print('Error during denoising image ', image_index, e)
+    #     self.cv2_image_list = denoised_image_list
         
-    def erode(self, dim = None, save=True, show=False):
-        eroded_image_list = []
-        image_index = 0
-        if dim == None:
-            dim = (2,2)
-        for image in self.cv2_image_list:
-            try:
-                kernel = np.ones(dim,np.uint8)
-                img = cv2.erode(image,kernel,iterations = 1)
-                self.save_or_show_image(img,image_index,'erode',save=save,show=show)
-                image_index += 1
-                eroded_image_list.append(img)
-            except Exception as e:
-                print('Error during eroding image ', image_index, e)
-        self.cv2_image_list = eroded_image_list
+    # def erode(self, dim = None, save=True, show=False):
+    #     eroded_image_list = []
+    #     image_index = 0
+    #     if dim == None:
+    #         dim = (2,2)
+    #     for image in self.cv2_image_list:
+    #         try:
+    #             kernel = np.ones(dim,np.uint8)
+    #             img = cv2.erode(image,kernel,iterations = 1)
+    #             self.save_or_show_image(img,image_index,'erode',save=save,show=show)
+    #             image_index += 1
+    #             eroded_image_list.append(img)
+    #         except Exception as e:
+    #             print('Error during eroding image ', image_index, e)
+    #     self.cv2_image_list = eroded_image_list
 
-    def dilation(self, dim = None, save=True, show=False):
-        dilated_image_list = []
-        image_index = 0
-        if dim == None:
-            dim = (2,2)
-        for image in self.cv2_image_list:
-            try:
-                kernel = np.ones(dim,np.uint8)
-                img = cv2.dilate(image,kernel,iterations = 1)
-                self.save_or_show_image(img,image_index,'dilation',save=save,show=show)
-                image_index += 1
-                dilated_image_list.append(img)
-            except Exception as e:
-                print('Error while dilating image ', image_index, e)
-        self.cv2_image_list = dilated_image_list
+    # def dilation(self, dim = None, save=True, show=False):
+    #     dilated_image_list = []
+    #     image_index = 0
+    #     if dim == None:
+    #         dim = (2,2)
+    #     for image in self.cv2_image_list:
+    #         try:
+    #             kernel = np.ones(dim,np.uint8)
+    #             img = cv2.dilate(image,kernel,iterations = 1)
+    #             self.save_or_show_image(img,image_index,'dilation',save=save,show=show)
+    #             image_index += 1
+    #             dilated_image_list.append(img)
+    #         except Exception as e:
+    #             print('Error while dilating image ', image_index, e)
+    #     self.cv2_image_list = dilated_image_list
         
-    def normalize(self, dim = None, save=True, show=False):
-        normalized_image_list = []
-        image_index = 0
-        if dim == None:
-            dim = (512,512)
-        for image in self.cv2_image_list:
-            try:
-                kernel = np.zeros(dim)
-                img = cv2.normalize(image,kernel,0,255,cv2.NORM_MINMAX)
-                normalized_image_list.append(img)
-                self.save_or_show_image(img,image_index,'normalize',save=save,show=show)
-                image_index += 1
-            except Exception as e:
-                print('Error while normalizing image ', image_index, e)
+    # def normalize(self, dim = None, save=True, show=False):
+    #     normalized_image_list = []
+    #     image_index = 0
+    #     if dim == None:
+    #         dim = (512,512)
+    #     for image in self.cv2_image_list:
+    #         try:
+    #             kernel = np.zeros(dim)
+    #             img = cv2.normalize(image,kernel,0,255,cv2.NORM_MINMAX)
+    #             normalized_image_list.append(img)
+    #             self.save_or_show_image(img,image_index,'normalize',save=save,show=show)
+    #             image_index += 1
+    #         except Exception as e:
+    #             print('Error while normalizing image ', image_index, e)
 
     def print_variables(self):
         for img in self.cv2_image_list:
diff --git a/klar_eda/preprocess/image_preprocess/__init__.py b/klar_eda/preprocess/image_preprocess/__init__.py
new file mode 100644
index 0000000..f51efc3
--- /dev/null
+++ b/klar_eda/preprocess/image_preprocess/__init__.py
@@ -0,0 +1,3 @@
+from . import morphological
+import pkg_resources
+pkg_resources.declare_namespace(__name__)
\ No newline at end of file
diff --git a/klar_eda/preprocess/image_preprocess/morphological.py b/klar_eda/preprocess/image_preprocess/morphological.py
new file mode 100644
index 0000000..569986e
--- /dev/null
+++ b/klar_eda/preprocess/image_preprocess/morphological.py
@@ -0,0 +1,114 @@
+import os
+from os import makedirs
+from os.path import join, exists
+import numpy as np
+import cv2
+import matplotlib.pyplot as plt
+import matplotlib.image as mpimg
+from ..image_preprocess import ImagePreprocess
+
+"""
+This document contains the functions:
+Thresholding, Denoise, Erode, Dilation, normalize
+"""
+
+class MorphologicalPreprocess:
+    def __init__(self,input,labels = None):
+        self.suffixes = ('.jpeg', '.jpg', '.png')
+        self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
+        self.labels = labels
+        if type(input)==str:
+            self.path = input
+            self.image_list = sorted([ file for file in os.listdir(input) if (file.endswith(self.suffixes))])
+            self.cv2_image_list = [ self.read_images(os.path.join(self.path,image_name)) for image_name in  self.image_list ]
+        else:
+            self.path = None
+            self.image_list = None
+            self.cv2_image_list = input
+    def thresholding(self, technique = 'mean', threshold = cv2.THRESH_BINARY, save=True, show=False):
+        binarized_image_list = []
+        image_index = 0
+        #study the parameters
+        for image in self.cv2_image_list:
+            try:
+                if technique == 'simple':
+                    res , img = cv2.threshold(image, 120, 255, threshold)
+                    binarized_image_list.append(img)
+                    ImagePreprocess.save_or_show_image(img,image_index,'threshold',save=save,show=show)
+                    image_index += 1
+                elif technique == 'mean':
+                    img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, threshold, 199, 5)
+                    binarized_image_list.append(img)
+                    ImagePreprocess.save_or_show_image(img,image_index,'threshold',save=save,show=show)
+                    image_index += 1
+                else:
+                    img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, threshold, 199, 5)
+                    binarized_image_list.append(img)
+                    ImagePreprocess.save_or_show_image(img,image_index,'threshold',save=save,show=show)
+                    image_index += 1
+            except Exception as e:
+                print('Error during binarization of image ', image_index, e)
+        self.cv2_image_list = binarized_image_list
+    
+    def denoise(self, is_gray = True, save=True, show=False):
+        denoised_image_list = []
+        image_index = 0
+        for image in self.cv2_image_list:
+            try:
+                if not is_gray:
+                    img = cv2.fastNlMeansDenoisingColored(image,None,10,10,7,21)
+                else:
+                    img = cv2.fastNlMeansDenoising(image,None,3,7,21)
+                denoised_image_list.append(img)
+                ImagePreprocess.save_or_show_image(img,image_index,'denoise',save=save,show=show)
+                image_index += 1
+            except Exception as e:
+                print('Error during denoising image ', image_index, e)
+        self.cv2_image_list = denoised_image_list
+        
+    def erode(self, dim = None, save=True, show=False):
+        eroded_image_list = []
+        image_index = 0
+        if dim == None:
+            dim = (2,2)
+        for image in self.cv2_image_list:
+            try:
+                kernel = np.ones(dim,np.uint8)
+                img = cv2.erode(image,kernel,iterations = 1)
+                ImagePreprocess.save_or_show_image(img,image_index,'erode',save=save,show=show)
+                image_index += 1
+                eroded_image_list.append(img)
+            except Exception as e:
+                print('Error during eroding image ', image_index, e)
+        self.cv2_image_list = eroded_image_list
+
+    def dilation(self, dim = None, save=True, show=False):
+        dilated_image_list = []
+        image_index = 0
+        if dim == None:
+            dim = (2,2)
+        for image in self.cv2_image_list:
+            try:
+                kernel = np.ones(dim,np.uint8)
+                img = cv2.dilate(image,kernel,iterations = 1)
+                ImagePreprocess.save_or_show_image(img,image_index,'dilation',save=save,show=show)
+                image_index += 1
+                dilated_image_list.append(img)
+            except Exception as e:
+                print('Error while dilating image ', image_index, e)
+        self.cv2_image_list = dilated_image_list
+        
+    def normalize(self, dim = None, save=True, show=False):
+        normalized_image_list = []
+        image_index = 0
+        if dim == None:
+            dim = (512,512)
+        for image in self.cv2_image_list:
+            try:
+                kernel = np.zeros(dim)
+                img = cv2.normalize(image,kernel,0,255,cv2.NORM_MINMAX)
+                normalized_image_list.append(img)
+                ImagePreprocess.save_or_show_image(img,image_index,'normalize',save=save,show=show)
+                image_index += 1
+            except Exception as e:
+                print('Error while normalizing image ', image_index, e)
\ No newline at end of file
diff --git a/klar_eda/preprocessing.py b/klar_eda/preprocessing.py
index de7e648..be75202 100644
--- a/klar_eda/preprocessing.py
+++ b/klar_eda/preprocessing.py
@@ -1,5 +1,6 @@
 from .preprocess.csv_preprocess import CSVPreProcess
 from .preprocess.image_preprocess import ImagePreprocess
+from .preprocess.image_preprocess.morphological import MorphologicalPreprocess
 
 def preprocess_csv(csv, target_column=None, index_column=None):
     """Preprocesses the csv file OR the dataframe, 
@@ -35,26 +36,28 @@ def preprocess_images(data_path, dataset_type='other',save=True,show=False):
     :type show: bool, optional
     """
     preprocessor = ImagePreprocess(data_path)
+    morphPreprocessor = MorphologicalPreprocess(data_path)
+    
     preprocessor.resize_images(height = 512, width = 512)
     if dataset_type == 'ocr':
-        preprocessor.denoise(save=save,show=show)
+        morphPreprocessor.denoise(save=save,show=show)
         preprocessor.colorize(text = True,save=save,show=show)
-        preprocessor.thresholding(technique = 'gaussian' ,threshold = cv2.THRESH_BINARY,save=save,show=show)
+        morphPreprocessor.thresholding(technique = 'gaussian' ,threshold = cv2.THRESH_BINARY,save=save,show=show)
     elif dataset_type == 'face':
         preprocessor.detect_face_and_crop(crop=True,save=save,show=show)
         preprocessor.colorize(text = False,save=save,show=show)
         preprocessor.adaptive_histogram_equalization(save=save,show=show)
-        preprocessor.denoise(is_gray=True,save=save,show=show)
-        preprocessor.normalize(save=save,show=show)
-        preprocessor.erode(save=save,show=show)
-        preprocessor.dilation(save=save,show=show)
+        morphPreprocessor.denoise(is_gray=True,save=save,show=show)
+        morphPreprocessor.normalize(save=save,show=show)
+        morphPreprocessor.erode(save=save,show=show)
+        morphPreprocessor.dilation(save=save,show=show)
         preprocessor.contrast_control(save=save,show=show)
     else:
         preprocessor.colorize(text = False,save=save,show=show)
         preprocessor.adaptive_histogram_equalization(save=save,show=show)
-        preprocessor.normalize(save=save,show=show)
-        preprocessor.denoise(is_gray=True,save=save,show=show)
-        preprocessor.erode(save=save,show=show)
-        preprocessor.dilation(save=save,show=show)
+        morphPreprocessor.normalize(save=save,show=show)
+        morphPreprocessor.denoise(is_gray=True,save=save,show=show)
+        morphPreprocessor.erode(save=save,show=show)
+        morphPreprocessor.dilation(save=save,show=show)
         preprocessor.contrast_control(save=save,show=show)
     print('Image Preprocessing completed successfully!')
\ No newline at end of file

From c1c689585078528301a13586f09687d0b25bf9ef Mon Sep 17 00:00:00 2001
From: Shankhanil Ghosh <shankha.rik@gmail.com>
Date: Sat, 20 Mar 2021 00:59:53 +0530
Subject: [PATCH 02/17] restructured codebase: intelligent image preprocessing

---
 klar_eda/preprocess/__init__.py               |  2 +
 klar_eda/preprocess/image_preprocess.py       | 63 ++++++++++---------
 .../preprocess/image_preprocess/__init__.py   |  1 +
 .../image_preprocess/intelligent.py           | 56 +++++++++++++++++
 .../image_preprocess/morphological.py         | 13 ++--
 klar_eda/preprocessing.py                     |  6 +-
 6 files changed, 104 insertions(+), 37 deletions(-)
 create mode 100644 klar_eda/preprocess/image_preprocess/intelligent.py

diff --git a/klar_eda/preprocess/__init__.py b/klar_eda/preprocess/__init__.py
index 46af576..9de78be 100644
--- a/klar_eda/preprocess/__init__.py
+++ b/klar_eda/preprocess/__init__.py
@@ -2,5 +2,7 @@
 from . import csv_preprocess
 from . import image_preprocess
 from . import preprocess
+# To import morphological preprocessor
+from .image_preprocess import morphological  
 import pkg_resources
 pkg_resources.declare_namespace(__name__)
diff --git a/klar_eda/preprocess/image_preprocess.py b/klar_eda/preprocess/image_preprocess.py
index 50afaa9..8749ab2 100644
--- a/klar_eda/preprocess/image_preprocess.py
+++ b/klar_eda/preprocess/image_preprocess.py
@@ -96,6 +96,7 @@ def contrast_control(self, alpha = 1.25, beta = 0, save=True, show=False):
                 print('Error while changing contast for image ',image_index, e)
         self.cv2_image_list = contrast_image_list
 
+    # ***************************CODE SEGMENT MOVED TO ./morphological.py***************************
     # def thresholding(self, technique = 'mean', threshold = cv2.THRESH_BINARY, save=True, show=False):
     #     binarized_image_list = []
     #     image_index = 0
@@ -183,7 +184,7 @@ def contrast_control(self, alpha = 1.25, beta = 0, save=True, show=False):
     #             image_index += 1
     #         except Exception as e:
     #             print('Error while normalizing image ', image_index, e)
-
+    # ******************************************************************************************
     def print_variables(self):
         for img in self.cv2_image_list:
             cv2.imshow('img',img)
@@ -193,35 +194,37 @@ def get_cascade(self, cascade_type='face'):
         #if cascade_type == 'face':
         return cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
 
-    def detect_face_and_crop(self, crop = False, save=True, show=False):
-        face_image_list = []
-        image_index = -1
-        face_cascade = self.get_cascade('face')
-        for image in self.cv2_image_list:
-            try:
-                image_index += 1
-                img = image.copy()
-                faces = face_cascade.detectMultiScale(img, 1.3, 5)
-                if faces is None:
-                    print('Unable to find face ')
-                    continue
-                for (x,y,w,h) in faces:
-                    padding = 10
-                    ih, iw = img.shape[:2]
-                    lx = max( 0, x - padding )
-                    ly = max( 0, x - padding )
-                    ux = min( iw, x + w + padding )
-                    uy = min( ih, y + h + padding )
-                    img = cv2.rectangle(img,(lx,ly),(ux,uy),(255,0,0),2)
-                    roi_color = img[y:y+h, x:x+w]
-                    if crop == True:
-                        self.save_or_show_image(roi_color, image_index, 'haarcascade_faces',save=save,show=show)
-                self.save_or_show_image(img, image_index, 'haarcascade',save=save,show=show)
-                face_image_list.append(img)
-            except Exception as e:
-                print('Error while detecing')
-        self.cv2_image_list = face_image_list
-
+    # ***************************CODE SEGMENT MOVED TO ./intelligent.py***************************
+    # def detect_face_and_crop(self, crop = False, save=True, show=False):
+    #     face_image_list = []
+    #     image_index = -1
+    #     face_cascade = self.get_cascade('face')
+    #     for image in self.cv2_image_list:
+    #         try:
+    #             image_index += 1
+    #             img = image.copy()
+    #             faces = face_cascade.detectMultiScale(img, 1.3, 5)
+    #             if faces is None:
+    #                 print('Unable to find face ')
+    #                 continue
+    #             for (x,y,w,h) in faces:
+    #                 padding = 10
+    #                 ih, iw = img.shape[:2]
+    #                 lx = max( 0, x - padding )
+    #                 ly = max( 0, x - padding )
+    #                 ux = min( iw, x + w + padding )
+    #                 uy = min( ih, y + h + padding )
+    #                 img = cv2.rectangle(img,(lx,ly),(ux,uy),(255,0,0),2)
+    #                 roi_color = img[y:y+h, x:x+w]
+    #                 if crop == True:
+    #                     self.save_or_show_image(roi_color, image_index, 'haarcascade_faces',save=save,show=show)
+    #             self.save_or_show_image(img, image_index, 'haarcascade',save=save,show=show)
+    #             face_image_list.append(img)
+    #         except Exception as e:
+    #             print('Error while detecing')
+    #     self.cv2_image_list = face_image_list
+    # ******************************************************************************************
+    
     def adaptive_histogram_equalization(self, save=True, show=False):
         refined_image_list = []
         image_index = 0
diff --git a/klar_eda/preprocess/image_preprocess/__init__.py b/klar_eda/preprocess/image_preprocess/__init__.py
index f51efc3..50fe2a7 100644
--- a/klar_eda/preprocess/image_preprocess/__init__.py
+++ b/klar_eda/preprocess/image_preprocess/__init__.py
@@ -1,3 +1,4 @@
 from . import morphological
+from . import intelligent
 import pkg_resources
 pkg_resources.declare_namespace(__name__)
\ No newline at end of file
diff --git a/klar_eda/preprocess/image_preprocess/intelligent.py b/klar_eda/preprocess/image_preprocess/intelligent.py
new file mode 100644
index 0000000..77e46df
--- /dev/null
+++ b/klar_eda/preprocess/image_preprocess/intelligent.py
@@ -0,0 +1,56 @@
+import os
+from os import makedirs
+from os.path import join, exists
+import numpy as np
+import cv2
+import matplotlib.pyplot as plt
+import matplotlib.image as mpimg
+from ..image_preprocess import ImagePreprocess
+
+class IntelligentImagePreprocess:
+    """
+    This class contains the functions:
+    
+    """    
+    def __init__(self,input,labels = None):
+        self.suffixes = ('.jpeg', '.jpg', '.png')
+        # self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
+        self.labels = labels
+        if type(input)==str:
+            self.path = input
+            self.image_list = sorted([ file for file in os.listdir(input) if (file.endswith(self.suffixes))])
+            self.cv2_image_list = [ self.read_images(os.path.join(self.path,image_name)) for image_name in  self.image_list ]
+        else:
+            self.path = None
+            self.image_list = None
+            self.cv2_image_list = input
+    
+    #  the functions
+    def detect_face_and_crop(self, crop = False, save=True, show=False):
+        face_image_list = []
+        image_index = -1
+        face_cascade = self.get_cascade('face')
+        for image in self.cv2_image_list:
+            try:
+                image_index += 1
+                img = image.copy()
+                faces = face_cascade.detectMultiScale(img, 1.3, 5)
+                if faces is None:
+                    print('Unable to find face ')
+                    continue
+                for (x,y,w,h) in faces:
+                    padding = 10
+                    ih, iw = img.shape[:2]
+                    lx = max( 0, x - padding )
+                    ly = max( 0, x - padding )
+                    ux = min( iw, x + w + padding )
+                    uy = min( ih, y + h + padding )
+                    img = cv2.rectangle(img,(lx,ly),(ux,uy),(255,0,0),2)
+                    roi_color = img[y:y+h, x:x+w]
+                    if crop == True:
+                        self.save_or_show_image(roi_color, image_index, 'haarcascade_faces',save=save,show=show)
+                self.save_or_show_image(img, image_index, 'haarcascade',save=save,show=show)
+                face_image_list.append(img)
+            except Exception as e:
+                print('Error while detecing')
+        self.cv2_image_list = face_image_list
\ No newline at end of file
diff --git a/klar_eda/preprocess/image_preprocess/morphological.py b/klar_eda/preprocess/image_preprocess/morphological.py
index 569986e..676d8a6 100644
--- a/klar_eda/preprocess/image_preprocess/morphological.py
+++ b/klar_eda/preprocess/image_preprocess/morphological.py
@@ -7,15 +7,14 @@
 import matplotlib.image as mpimg
 from ..image_preprocess import ImagePreprocess
 
-"""
-This document contains the functions:
-Thresholding, Denoise, Erode, Dilation, normalize
-"""
-
 class MorphologicalPreprocess:
+    """
+    This class contains the functions:
+    Thresholding, Denoise, Erode, Dilation, normalize
+    """    
     def __init__(self,input,labels = None):
         self.suffixes = ('.jpeg', '.jpg', '.png')
-        self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
+        # self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
         self.labels = labels
         if type(input)==str:
             self.path = input
@@ -25,6 +24,8 @@ def __init__(self,input,labels = None):
             self.path = None
             self.image_list = None
             self.cv2_image_list = input
+    
+    #  the functions
     def thresholding(self, technique = 'mean', threshold = cv2.THRESH_BINARY, save=True, show=False):
         binarized_image_list = []
         image_index = 0
diff --git a/klar_eda/preprocessing.py b/klar_eda/preprocessing.py
index be75202..bdb6af2 100644
--- a/klar_eda/preprocessing.py
+++ b/klar_eda/preprocessing.py
@@ -1,6 +1,8 @@
 from .preprocess.csv_preprocess import CSVPreProcess
 from .preprocess.image_preprocess import ImagePreprocess
+# importing the morph-preprocess and intelligent preprocess class
 from .preprocess.image_preprocess.morphological import MorphologicalPreprocess
+from .preprocess.image_preprocess.intelligent import IntelligentImagePreprocess
 
 def preprocess_csv(csv, target_column=None, index_column=None):
     """Preprocesses the csv file OR the dataframe, 
@@ -36,7 +38,9 @@ def preprocess_images(data_path, dataset_type='other',save=True,show=False):
     :type show: bool, optional
     """
     preprocessor = ImagePreprocess(data_path)
+    # creating a morphological preprocessor object
     morphPreprocessor = MorphologicalPreprocess(data_path)
+    inteligentPreprocessor = IntelligentImagePreprocess(data_path)
     
     preprocessor.resize_images(height = 512, width = 512)
     if dataset_type == 'ocr':
@@ -44,7 +48,7 @@ def preprocess_images(data_path, dataset_type='other',save=True,show=False):
         preprocessor.colorize(text = True,save=save,show=show)
         morphPreprocessor.thresholding(technique = 'gaussian' ,threshold = cv2.THRESH_BINARY,save=save,show=show)
     elif dataset_type == 'face':
-        preprocessor.detect_face_and_crop(crop=True,save=save,show=show)
+        inteligentPreprocessor.detect_face_and_crop(crop=True,save=save,show=show)
         preprocessor.colorize(text = False,save=save,show=show)
         preprocessor.adaptive_histogram_equalization(save=save,show=show)
         morphPreprocessor.denoise(is_gray=True,save=save,show=show)

From bfadcd6b594389451215414bdef633ebfa2315de Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Thu, 18 Mar 2021 23:41:18 +0530
Subject: [PATCH 03/17] Update image_visualize.py

---
 klar_eda/visualize/image_visualize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index d526739..70e29ba 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -24,7 +24,7 @@
 ############################################################################
 
 class ImageDataVisualize:
-
+# the first function is used intialize/get data in the form of images and labels.It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly.If it is not found an error is shown.
     def __init__(self, data, labels, boxes=None):
         self.images = data
         self.labels = labels

From a8d2bf8645c3d0003a69af525941386584a59d8b Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Sat, 20 Mar 2021 14:09:12 +0530
Subject: [PATCH 04/17] Update image_visualize.py

---
 klar_eda/visualize/image_visualize.py | 35 +++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index 70e29ba..3528432 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -24,7 +24,12 @@
 ############################################################################
 
 class ImageDataVisualize:
-# the first function is used intialize/get data in the form of images and labels.It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly.If it is not found an error is shown.
+# the first function is used intialize/get data in the form of images and labels.
+#It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly.
+#If it is not found an error is shown.this is used for validating the images.
+#The dataset is then prepared from the dataframe by giving all the appropriate tags to the respective dataframes.Image,height,width labels.
+#Thea area is then calculated from the height and width and the number of imagesare then printed out.
+
     def __init__(self, data, labels, boxes=None):
         self.images = data
         self.labels = labels
@@ -48,7 +53,13 @@ def __init__(self, data, labels, boxes=None):
         })
         self.dataset['area'] = self.dataset['Height'] * self.dataset['Width']
         print('Number of images after validation and filtering:', self.num_images)
-
+#This function is written for saving the file(of the program) for better access and smooth running.
+#it looks for the  directory where the file can be saved by applying the join function if actually it is saved.
+# It looks for the directory of the file where it  can be saved (by using the join function) and if such directory doen't exist a new directory has to be made by using the makedirs function.
+# the x_label and y_label are also been given their respective titles.
+# the title of the plot is being labelled  by combining (formatting) the plot_type and file_name.
+# the path is saved by the join fucntion which combines the directory and the file name (save_dir,file_name).
+#  the title is then displayed.
     def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False):
         if save:
             save_dir = join(VIZ_ROOT, plot_type)
@@ -65,6 +76,10 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa
             plt.title("{}: {}".format(plot_type, file_name))
             plt.show()
         plt.clf()
+# this function is used to check about the data we obtained from the images that whether or not its in the desired format or not.
+# if the image type isn't in an n dimensional array format then it is discarded else it is saved.
+#if the image type is in ndimensional array format then its accepted or it is discarded.
+#also if the dimensions of the image <=2,then again the images are skipped/discarded.
 
     def validate_images(self):
         for image, label in zip(self.images, self.labels):
@@ -78,17 +93,24 @@ def validate_images(self):
                 self.images.remove(image)
                 self.labels.remove(label)
                 continue
-
+# this function is used to define the aspect_ratio of the histogram plotted.
+# the aspect ratrio of the histogram plotted is the ratio of its idth to the height(ratio=width/height)
+# It is commonly used  to describe the proportions of a rectangular screen.
     def aspect_ratio_histogram(self, save=True, show=False):
         aspect_ratios = self.dataset['Width'] / self.dataset['Height']
         plot = sns.histplot(aspect_ratios, bins='auto')
         self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show)
+# In this function we segregate the areas by their categories(labels) and then take the mean per category.
+#then we display those figures.
 
     def area_vs_category(self, save=True, show=False):
         mean_areas = self.dataset.groupby('Label')['area'].mean()
         plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist())
         self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show)
-
+#In this function we take the mean of the images segregated in groups as per their labels.
+#in these we then choose a matrixof datset and name them images.
+# we take the mean and by rows(or columns) and standardize it.
+# now we display it.
     def mean_images(self, save=True, show=False):
         groups = self.dataset.groupby('Label')
         for group in groups:
@@ -96,7 +118,10 @@ def mean_images(self, save=True, show=False):
             mean_image = np.array(list(images)).mean(axis=0)
             plot = plt.imshow(mean_image/255)
             self.save_or_show(plot.figure, 'mean_images', str(group[0]), save=save, show=show)
-
+# in this function we find the eigen values and the eigen vectors of the system through principal component analysis.
+#find the mean of the eigenvectors.
+#we change the dimension of the mean matrix.
+#finding the eigen images by rounding off the eigen vectors and then displaying it.
     def eigen_images(self, save=True, show=False):
         groups = self.dataset.groupby('Label')
         for group in groups:

From 4dd0dacf6faaf6a29b800f4ae0ba9a6129d248e6 Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Sat, 20 Mar 2021 14:19:09 +0530
Subject: [PATCH 05/17] Update image_visualize.py

---
 klar_eda/visualize/image_visualize.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index 3528432..f78491a 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -136,7 +136,8 @@ def eigen_images(self, save=True, show=False):
                 img = np.round((eigenVectors[i] + 1)/2)
                 plot = plt.imshow(img)
                 self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show)
-
+#in this function we try to figure out the number of images in each category(labels) by aranging them in  descending order(frequency wise/no of images wise).
+#we then represent it visually  by plotting barplots to show them.
     def num_images_by_category(self, save=True, show=False):
         counts = self.dataset['Label'].value_counts()
         plot = sns.barplot(x=counts.index, y=counts.tolist())

From 457fd7c19bf68c7e1d559a66b30eac2646242e71 Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Sat, 20 Mar 2021 20:06:27 +0530
Subject: [PATCH 06/17] Update image_visualize.py

Here I have documented the methods in image_visualize.py as per what I could understand. Please go through it. I tried to create a function for implementing all these methods, but I don't know which dataset to implement it on. A little more clarification from your end would be appreciated. Thank you.
---
 klar_eda/visualize/image_visualize.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index f78491a..6c003e6 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -144,7 +144,9 @@ def num_images_by_category(self, save=True, show=False):
         self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show)
         plot = plt.pie(counts.tolist(), labels=counts.index)
         self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show)
-
+#here we plot a graph showing the dependence of standard deviation on the mean of the data.here standard deviation is represented as a function of mean and then plotted.
+# first the images are segregated by their labels and then their std and mean are taken into variables images and mean.
+#they are then apepnede or put into the y and x cordintae of the graph and then plotted.
     def std_vs_mean(self, save=True, show=False):
         groups = self.dataset.groupby('Label')
         y = []
@@ -166,6 +168,8 @@ def std_vs_mean(self, save=True, show=False):
         labels = self.dataset['Label'].to_list()
         plot = sns.scatterplot(x=means, y=stds, hue=labels, palette='viridis', legend='full')
         self.save_or_show(plot.figure, 'std_vs_mean', 'std_vs_mean_all',x_label='mean', y_label='Std Deviation', save=save, show=show)
+#t-Distributed Stochastic Neighbor Embedding (t-SNE) is an unsupervised, non-linear technique primarily used for data exploration and visualizing high-dimensional data.
+# In simpler terms, t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. 
 
     def t_sne(self, batch_size=32, save=True, show=False):
         model = ResNet50(weights='imagenet', pooling=max, include_top = False)

From ca556a3d30159e824ddf4e96a0795e01f5eef2cb Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Tue, 23 Mar 2021 15:14:57 +0530
Subject: [PATCH 07/17] Update image_visualize.py

---
 klar_eda/visualize/image_visualize.py | 72 +++++++++++++--------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index 6c003e6..23002a1 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -24,11 +24,11 @@
 ############################################################################
 
 class ImageDataVisualize:
-# the first function is used intialize/get data in the form of images and labels.
-#It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly.
-#If it is not found an error is shown.this is used for validating the images.
-#The dataset is then prepared from the dataframe by giving all the appropriate tags to the respective dataframes.Image,height,width labels.
-#Thea area is then calculated from the height and width and the number of imagesare then printed out.
+""" init: the first function is used intialize/get data in the form of images and labels.
+ :It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly.
+ :If it is not found an error is shown.this is used for validating the images.
+ :The dataset is then prepared from the dataframe by giving all the appropriate tags to the respective dataframes.Image,height,width labels.
+ :Thea area is then calculated from the height and width and the number of imagesare then printed out."""
 
     def __init__(self, data, labels, boxes=None):
         self.images = data
@@ -53,13 +53,13 @@ def __init__(self, data, labels, boxes=None):
         })
         self.dataset['area'] = self.dataset['Height'] * self.dataset['Width']
         print('Number of images after validation and filtering:', self.num_images)
-#This function is written for saving the file(of the program) for better access and smooth running.
-#it looks for the  directory where the file can be saved by applying the join function if actually it is saved.
-# It looks for the directory of the file where it  can be saved (by using the join function) and if such directory doen't exist a new directory has to be made by using the makedirs function.
-# the x_label and y_label are also been given their respective titles.
-# the title of the plot is being labelled  by combining (formatting) the plot_type and file_name.
-# the path is saved by the join fucntion which combines the directory and the file name (save_dir,file_name).
-#  the title is then displayed.
+"""This function is written for saving the file(of the program) for better access and smooth running.
+ :it looks for the  directory where the file can be saved by applying the join function if actually it is saved.
+ : It looks for the directory of the file where it  can be saved .
+ : the x_label and y_label are also been given their respective titles.
+ :the title of the plot is being labelled  by combining (formatting) the plot_type and file_name."""
+ 
+
     def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False):
         if save:
             save_dir = join(VIZ_ROOT, plot_type)
@@ -76,10 +76,10 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa
             plt.title("{}: {}".format(plot_type, file_name))
             plt.show()
         plt.clf()
-# this function is used to check about the data we obtained from the images that whether or not its in the desired format or not.
-# if the image type isn't in an n dimensional array format then it is discarded else it is saved.
-#if the image type is in ndimensional array format then its accepted or it is discarded.
-#also if the dimensions of the image <=2,then again the images are skipped/discarded.
+""" validate_images: this function is used to check about the data we obtained from the images that whether or not its in the desired format or not.
+ if the image type isn't in an n dimensional array format then it is discarded else it is saved.
+if the image type is in ndimensional array format then its accepted or it is discarded.
+also if the dimensions of the image <=2,then again the images are skipped/discarded."""
 
     def validate_images(self):
         for image, label in zip(self.images, self.labels):
@@ -93,24 +93,24 @@ def validate_images(self):
                 self.images.remove(image)
                 self.labels.remove(label)
                 continue
-# this function is used to define the aspect_ratio of the histogram plotted.
-# the aspect ratrio of the histogram plotted is the ratio of its idth to the height(ratio=width/height)
-# It is commonly used  to describe the proportions of a rectangular screen.
+""" this function is used to define the aspect_ratio of the histogram plotted.
+the aspect ratrio of the histogram plotted is the ratio of its idth to the height(ratio=width/height)
+It is commonly used  to describe the proportions of a rectangular screen."""
     def aspect_ratio_histogram(self, save=True, show=False):
         aspect_ratios = self.dataset['Width'] / self.dataset['Height']
         plot = sns.histplot(aspect_ratios, bins='auto')
         self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show)
-# In this function we segregate the areas by their categories(labels) and then take the mean per category.
-#then we display those figures.
+""" area vs category: In this function we segregate the areas by their categories(labels) and then take the mean per category.
+then we display those figures."""
 
     def area_vs_category(self, save=True, show=False):
         mean_areas = self.dataset.groupby('Label')['area'].mean()
         plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist())
         self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show)
-#In this function we take the mean of the images segregated in groups as per their labels.
-#in these we then choose a matrixof datset and name them images.
-# we take the mean and by rows(or columns) and standardize it.
-# now we display it.
+""" mean_images:In this function we take the mean of the images segregated in groups as per their labels.
+in these we then choose a matrixof datset and name them images.
+ we take the mean and by rows(or columns) and standardize it.
+ now we display it."""
     def mean_images(self, save=True, show=False):
         groups = self.dataset.groupby('Label')
         for group in groups:
@@ -118,10 +118,10 @@ def mean_images(self, save=True, show=False):
             mean_image = np.array(list(images)).mean(axis=0)
             plot = plt.imshow(mean_image/255)
             self.save_or_show(plot.figure, 'mean_images', str(group[0]), save=save, show=show)
-# in this function we find the eigen values and the eigen vectors of the system through principal component analysis.
-#find the mean of the eigenvectors.
-#we change the dimension of the mean matrix.
-#finding the eigen images by rounding off the eigen vectors and then displaying it.
+""" in this function we find the eigen values and the eigen vectors of the system through principal component analysis.
+find the mean of the eigenvectors.
+we change the dimension of the mean matrix.
+finding the eigen images by rounding off the eigen vectors and then displaying it."""
     def eigen_images(self, save=True, show=False):
         groups = self.dataset.groupby('Label')
         for group in groups:
@@ -136,17 +136,17 @@ def eigen_images(self, save=True, show=False):
                 img = np.round((eigenVectors[i] + 1)/2)
                 plot = plt.imshow(img)
                 self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show)
-#in this function we try to figure out the number of images in each category(labels) by aranging them in  descending order(frequency wise/no of images wise).
-#we then represent it visually  by plotting barplots to show them.
+"""in this function we try to figure out the number of images in each category(labels) by aranging them in  descending order(frequency wise/no of images wise).
+we then represent it visually  by plotting barplots to show them."""
     def num_images_by_category(self, save=True, show=False):
         counts = self.dataset['Label'].value_counts()
         plot = sns.barplot(x=counts.index, y=counts.tolist())
         self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show)
         plot = plt.pie(counts.tolist(), labels=counts.index)
         self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show)
-#here we plot a graph showing the dependence of standard deviation on the mean of the data.here standard deviation is represented as a function of mean and then plotted.
-# first the images are segregated by their labels and then their std and mean are taken into variables images and mean.
-#they are then apepnede or put into the y and x cordintae of the graph and then plotted.
+"""here we plot a graph showing the dependence of standard deviation on the mean of the data.here standard deviation is represented as a function of mean and then plotted.
+ first the images are segregated by their labels and then their std and mean are taken into variables images and mean.
+they are then apepnede or put into the y and x cordintae of the graph and then plotted."""
     def std_vs_mean(self, save=True, show=False):
         groups = self.dataset.groupby('Label')
         y = []
@@ -168,8 +168,8 @@ def std_vs_mean(self, save=True, show=False):
         labels = self.dataset['Label'].to_list()
         plot = sns.scatterplot(x=means, y=stds, hue=labels, palette='viridis', legend='full')
         self.save_or_show(plot.figure, 'std_vs_mean', 'std_vs_mean_all',x_label='mean', y_label='Std Deviation', save=save, show=show)
-#t-Distributed Stochastic Neighbor Embedding (t-SNE) is an unsupervised, non-linear technique primarily used for data exploration and visualizing high-dimensional data.
-# In simpler terms, t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. 
+"""t-Distributed Stochastic Neighbor Embedding (t-SNE) is an unsupervised, non-linear technique primarily used for data exploration and visualizing high-dimensional data.
+: In simpler terms, t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. """
 
     def t_sne(self, batch_size=32, save=True, show=False):
         model = ResNet50(weights='imagenet', pooling=max, include_top = False)

From 983d617666daa0be757e483809a35228081fe825 Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Wed, 24 Mar 2021 19:28:58 +0530
Subject: [PATCH 08/17] Update image_visualize.py

---
 klar_eda/visualize/image_visualize.py | 53 +++++++++------------------
 1 file changed, 18 insertions(+), 35 deletions(-)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index 23002a1..450ac03 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -24,13 +24,16 @@
 ############################################################################
 
 class ImageDataVisualize:
-""" init: the first function is used intialize/get data in the form of images and labels.
- :It also checks/looksout for the greyscale values.Also it checks whether the number of images are equal to the no of labels for naming /labelling properly.
- :If it is not found an error is shown.this is used for validating the images.
- :The dataset is then prepared from the dataframe by giving all the appropriate tags to the respective dataframes.Image,height,width labels.
- :Thea area is then calculated from the height and width and the number of imagesare then printed out."""
+
 
     def __init__(self, data, labels, boxes=None):
+   """init:this function is for initializing the parameters to work on
+      :self_param:the file from which we have to takee the data to work on
+      :self_type:csv file
+      :data_param:the images form our dataset
+      :labels_param:to categorize the images.
+      :boxes_param:a null parameter used for storing the dimensions of the images."""
+  
         self.images = data
         self.labels = labels
         self.grey_present = False
@@ -53,14 +56,11 @@ def __init__(self, data, labels, boxes=None):
         })
         self.dataset['area'] = self.dataset['Height'] * self.dataset['Width']
         print('Number of images after validation and filtering:', self.num_images)
-"""This function is written for saving the file(of the program) for better access and smooth running.
- :it looks for the  directory where the file can be saved by applying the join function if actually it is saved.
- : It looks for the directory of the file where it  can be saved .
- : the x_label and y_label are also been given their respective titles.
- :the title of the plot is being labelled  by combining (formatting) the plot_type and file_name."""
+
  
 
     def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False):
+  
         if save:
             save_dir = join(VIZ_ROOT, plot_type)
             if not exists(save_dir):
@@ -76,10 +76,7 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa
             plt.title("{}: {}".format(plot_type, file_name))
             plt.show()
         plt.clf()
-""" validate_images: this function is used to check about the data we obtained from the images that whether or not its in the desired format or not.
- if the image type isn't in an n dimensional array format then it is discarded else it is saved.
-if the image type is in ndimensional array format then its accepted or it is discarded.
-also if the dimensions of the image <=2,then again the images are skipped/discarded."""
+
 
     def validate_images(self):
         for image, label in zip(self.images, self.labels):
@@ -93,24 +90,18 @@ def validate_images(self):
                 self.images.remove(image)
                 self.labels.remove(label)
                 continue
-""" this function is used to define the aspect_ratio of the histogram plotted.
-the aspect ratrio of the histogram plotted is the ratio of its idth to the height(ratio=width/height)
-It is commonly used  to describe the proportions of a rectangular screen."""
+
     def aspect_ratio_histogram(self, save=True, show=False):
         aspect_ratios = self.dataset['Width'] / self.dataset['Height']
         plot = sns.histplot(aspect_ratios, bins='auto')
         self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show)
-""" area vs category: In this function we segregate the areas by their categories(labels) and then take the mean per category.
-then we display those figures."""
+
 
     def area_vs_category(self, save=True, show=False):
         mean_areas = self.dataset.groupby('Label')['area'].mean()
         plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist())
         self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show)
-""" mean_images:In this function we take the mean of the images segregated in groups as per their labels.
-in these we then choose a matrixof datset and name them images.
- we take the mean and by rows(or columns) and standardize it.
- now we display it."""
+
     def mean_images(self, save=True, show=False):
         groups = self.dataset.groupby('Label')
         for group in groups:
@@ -118,11 +109,7 @@ def mean_images(self, save=True, show=False):
             mean_image = np.array(list(images)).mean(axis=0)
             plot = plt.imshow(mean_image/255)
             self.save_or_show(plot.figure, 'mean_images', str(group[0]), save=save, show=show)
-""" in this function we find the eigen values and the eigen vectors of the system through principal component analysis.
-find the mean of the eigenvectors.
-we change the dimension of the mean matrix.
-finding the eigen images by rounding off the eigen vectors and then displaying it."""
-    def eigen_images(self, save=True, show=False):
+
         groups = self.dataset.groupby('Label')
         for group in groups:
             images = group[1]['Image']
@@ -136,17 +123,14 @@ def eigen_images(self, save=True, show=False):
                 img = np.round((eigenVectors[i] + 1)/2)
                 plot = plt.imshow(img)
                 self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show)
-"""in this function we try to figure out the number of images in each category(labels) by aranging them in  descending order(frequency wise/no of images wise).
-we then represent it visually  by plotting barplots to show them."""
+
     def num_images_by_category(self, save=True, show=False):
         counts = self.dataset['Label'].value_counts()
         plot = sns.barplot(x=counts.index, y=counts.tolist())
         self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show)
         plot = plt.pie(counts.tolist(), labels=counts.index)
         self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show)
-"""here we plot a graph showing the dependence of standard deviation on the mean of the data.here standard deviation is represented as a function of mean and then plotted.
- first the images are segregated by their labels and then their std and mean are taken into variables images and mean.
-they are then apepnede or put into the y and x cordintae of the graph and then plotted."""
+
     def std_vs_mean(self, save=True, show=False):
         groups = self.dataset.groupby('Label')
         y = []
@@ -168,8 +152,7 @@ def std_vs_mean(self, save=True, show=False):
         labels = self.dataset['Label'].to_list()
         plot = sns.scatterplot(x=means, y=stds, hue=labels, palette='viridis', legend='full')
         self.save_or_show(plot.figure, 'std_vs_mean', 'std_vs_mean_all',x_label='mean', y_label='Std Deviation', save=save, show=show)
-"""t-Distributed Stochastic Neighbor Embedding (t-SNE) is an unsupervised, non-linear technique primarily used for data exploration and visualizing high-dimensional data.
-: In simpler terms, t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space. """
+
 
     def t_sne(self, batch_size=32, save=True, show=False):
         model = ResNet50(weights='imagenet', pooling=max, include_top = False)

From bd33030932cc61f398375f4d53fe906fa2924850 Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Thu, 25 Mar 2021 12:57:26 +0530
Subject: [PATCH 09/17] Update image_visualize.py

---
 klar_eda/visualize/image_visualize.py | 34 +++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index 450ac03..186053c 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -60,6 +60,15 @@ def __init__(self, data, labels, boxes=None):
  
 
     def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False):
+  """function save_or_show: to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type.
+     :plot_param-the figure to be plotted for graphical visualization. 
+     :plot_type- the file in which all the visualizations are stored.
+     :file_name- the name of the file to be stored.
+     :x-label - the label to be put on the x-axis of the graph
+     :y-label- the label to be put on the y-axis of the graph
+     :save-parameter- the boolean parameter passed for saving the file.
+     :show-parameter- display the fiel along with its title and also displayin gthe plot."""
+    
   
         if save:
             save_dir = join(VIZ_ROOT, plot_type)
@@ -79,6 +88,9 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa
 
 
     def validate_images(self):
+        """validate_images:the function used to validate images,whether or not  it has the required no of dimensions  and whether  it's a numpy array or not.
+            :self-the dataset on which the visualization and the analysis has to be performed."""
+        
         for image, label in zip(self.images, self.labels):
             if type(image) != np.ndarray:
                 print('Image not a numpy array, skipping...')
@@ -92,17 +104,26 @@ def validate_images(self):
                 continue
 
     def aspect_ratio_histogram(self, save=True, show=False):
+        """ aspect_ratio_histogram:the function used to define the aspect ratio of the histogram.aspect_ratio=Width/Height.
+            :save-param:the boolean  for instructing  to save the file.
+            :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
         aspect_ratios = self.dataset['Width'] / self.dataset['Height']
         plot = sns.histplot(aspect_ratios, bins='auto')
         self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show)
 
 
     def area_vs_category(self, save=True, show=False):
+        """area_vs_category:the plot to show the areas percategory(label).
+            :save-param:the boolean  for instructing  to save the file.
+            :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
         mean_areas = self.dataset.groupby('Label')['area'].mean()
         plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist())
         self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show)
 
     def mean_images(self, save=True, show=False):
+        """mean_images:The function for evaluating the mean of the areas per category.
+            :save-param:the boolean  for instructing  to save the file.
+            :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
         groups = self.dataset.groupby('Label')
         for group in groups:
             images = group[1]['Image']
@@ -125,6 +146,9 @@ def mean_images(self, save=True, show=False):
                 self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show)
 
     def num_images_by_category(self, save=True, show=False):
+        """ the function to display the no of images per category.
+            :save-param:the boolean  for instructing  to save the file.
+            :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
         counts = self.dataset['Label'].value_counts()
         plot = sns.barplot(x=counts.index, y=counts.tolist())
         self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show)
@@ -132,6 +156,11 @@ def num_images_by_category(self, save=True, show=False):
         self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show)
 
     def std_vs_mean(self, save=True, show=False):
+        """std_vs_mean:the function used to plot the graph of the standard deviation versus the mean.
+          : self_param:The dataset on which the analysis is used.
+          :save-param:the boolean  for instructing  to save the file.
+          :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
+        
         groups = self.dataset.groupby('Label')
         y = []
         x = []
@@ -155,6 +184,11 @@ def std_vs_mean(self, save=True, show=False):
 
 
     def t_sne(self, batch_size=32, save=True, show=False):
+        """t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space.
+            :batch_size:the dataset is dividied into batches for smooth functioning of the model on the dataset.
+            :save-param:the boolean  for instructing  to save the file.
+            ::show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
+        
         model = ResNet50(weights='imagenet', pooling=max, include_top = False)
         features_list = []
         print('Extracting features ...')

From 4776bb5d122854ce96c8888c3e3a143809a12e1e Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Thu, 25 Mar 2021 18:13:30 +0530
Subject: [PATCH 10/17] Update image_visualize.py

---
 klar_eda/visualize/image_visualize.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index 186053c..aead948 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -27,13 +27,13 @@ class ImageDataVisualize:
 
 
     def __init__(self, data, labels, boxes=None):
-   """init:this function is for initializing the parameters to work on
+   """this function is for initializing the parameters to work on
       :self_param:the file from which we have to takee the data to work on
       :self_type:csv file
       :data_param:the images form our dataset
       :labels_param:to categorize the images.
-      :boxes_param:a null parameter used for storing the dimensions of the images."""
-  
+      :boxes_param:a null parameter used for storing the dimensions of the images.
+  """
         self.images = data
         self.labels = labels
         self.grey_present = False
@@ -60,15 +60,15 @@ def __init__(self, data, labels, boxes=None):
  
 
     def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False):
-  """function save_or_show: to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type.
+  """ to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type.
      :plot_param-the figure to be plotted for graphical visualization. 
      :plot_type- the file in which all the visualizations are stored.
      :file_name- the name of the file to be stored.
      :x-label - the label to be put on the x-axis of the graph
      :y-label- the label to be put on the y-axis of the graph
      :save-parameter- the boolean parameter passed for saving the file.
-     :show-parameter- display the fiel along with its title and also displayin gthe plot."""
-    
+     :show-parameter- display the fiel along with its title and also displayin gthe plot.
+    """
   
         if save:
             save_dir = join(VIZ_ROOT, plot_type)
@@ -121,7 +121,7 @@ def area_vs_category(self, save=True, show=False):
         self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show)
 
     def mean_images(self, save=True, show=False):
-        """mean_images:The function for evaluating the mean of the areas per category.
+        """The function for evaluating the mean of the areas per category.
             :save-param:the boolean  for instructing  to save the file.
             :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
         groups = self.dataset.groupby('Label')
@@ -156,7 +156,7 @@ def num_images_by_category(self, save=True, show=False):
         self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show)
 
     def std_vs_mean(self, save=True, show=False):
-        """std_vs_mean:the function used to plot the graph of the standard deviation versus the mean.
+        """the function used to plot the graph of the standard deviation versus the mean.
           : self_param:The dataset on which the analysis is used.
           :save-param:the boolean  for instructing  to save the file.
           :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""

From 2e024fc3580ed26c97d3cf8631263b90c359725e Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Tue, 30 Mar 2021 16:19:06 +0530
Subject: [PATCH 11/17] Update image_visualize.py

---
 klar_eda/visualize/image_visualize.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index aead948..1f9ae53 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -28,8 +28,8 @@ class ImageDataVisualize:
 
     def __init__(self, data, labels, boxes=None):
    """this function is for initializing the parameters to work on
-      :self_param:the file from which we have to takee the data to work on
-      :self_type:csv file
+      :self_param:the file from which we have to take the data to work on
+      :type:csv file
       :data_param:the images form our dataset
       :labels_param:to categorize the images.
       :boxes_param:a null parameter used for storing the dimensions of the images.

From bcc56f44602fc730a60135790df6b557b38e3635 Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Tue, 30 Mar 2021 19:02:50 +0530
Subject: [PATCH 12/17] Update image_visualize.py

Updated the changes after running them on my local machine. Please have a look. I am also trying to change the .rst file you suggested.
---
 klar_eda/visualize/image_visualize.py | 83 ++++++++++++++++-----------
 1 file changed, 50 insertions(+), 33 deletions(-)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index 1f9ae53..8f4a1c5 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -27,13 +27,13 @@ class ImageDataVisualize:
 
 
     def __init__(self, data, labels, boxes=None):
-   """this function is for initializing the parameters to work on
-      :self_param:the file from which we have to take the data to work on
-      :type:csv file
-      :data_param:the images form our dataset
-      :labels_param:to categorize the images.
-      :boxes_param:a null parameter used for storing the dimensions of the images.
-  """
+        """ this function is for initializing the parameters to work on
+            :param self:the file from which we have to take the data to work on
+            :type self:csv file
+            :param data:the images form our dataset
+            :param labels:to categorize the images.
+            :param boxes:a null parameter used for storing the dimensions of the images.
+        """
         self.images = data
         self.labels = labels
         self.grey_present = False
@@ -60,15 +60,19 @@ def __init__(self, data, labels, boxes=None):
  
 
     def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False):
-  """ to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type.
-     :plot_param-the figure to be plotted for graphical visualization. 
-     :plot_type- the file in which all the visualizations are stored.
-     :file_name- the name of the file to be stored.
-     :x-label - the label to be put on the x-axis of the graph
-     :y-label- the label to be put on the y-axis of the graph
-     :save-parameter- the boolean parameter passed for saving the file.
-     :show-parameter- display the fiel along with its title and also displayin gthe plot.
-    """
+            """ to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type.
+                 :param plot-the figure to be plotted for graphical visualization.
+                 :type plot:png file.
+                 :param file_name: the name of the file to be stored.
+                 :type file_name:csv file
+                 :param x-label : the label to be put on the x-axis of the graph
+                 :type x-label: string
+                 :param y-label: the label to be put on the y-axis of the graph
+                 :type y-label: string
+                 : param save: the boolean parameter passed for saving the file.
+                 :type save:boolean
+                 :param show: display the fiel along with its title and also displayin gthe plot.
+                 :type show:boolean        """
   
         if save:
             save_dir = join(VIZ_ROOT, plot_type)
@@ -89,8 +93,7 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa
 
     def validate_images(self):
         """validate_images:the function used to validate images,whether or not  it has the required no of dimensions  and whether  it's a numpy array or not.
-            :self-the dataset on which the visualization and the analysis has to be performed."""
-        
+            : param self-the dataset on which the visualization and the analysis has to be performed."""
         for image, label in zip(self.images, self.labels):
             if type(image) != np.ndarray:
                 print('Image not a numpy array, skipping...')
@@ -105,8 +108,11 @@ def validate_images(self):
 
     def aspect_ratio_histogram(self, save=True, show=False):
         """ aspect_ratio_histogram:the function used to define the aspect ratio of the histogram.aspect_ratio=Width/Height.
-            :save-param:the boolean  for instructing  to save the file.
-            :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
+            :param save:the boolean  for instructing  to save the file.
+            :type  save:boolean
+            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
+            :type show:boolean
+        """
         aspect_ratios = self.dataset['Width'] / self.dataset['Height']
         plot = sns.histplot(aspect_ratios, bins='auto')
         self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show)
@@ -114,16 +120,20 @@ def aspect_ratio_histogram(self, save=True, show=False):
 
     def area_vs_category(self, save=True, show=False):
         """area_vs_category:the plot to show the areas percategory(label).
-            :save-param:the boolean  for instructing  to save the file.
-            :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
+            :param save:the boolean  for instructing  to save the file.
+            :type  save:boolean
+            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
+            :type show:boolean"""
         mean_areas = self.dataset.groupby('Label')['area'].mean()
         plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist())
         self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show)
 
     def mean_images(self, save=True, show=False):
         """The function for evaluating the mean of the areas per category.
-            :save-param:the boolean  for instructing  to save the file.
-            :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
+            :param save:the boolean  for instructing  to save the file.
+            :type  save:boolean
+            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
+            :type show:boolean"""
         groups = self.dataset.groupby('Label')
         for group in groups:
             images = group[1]['Image']
@@ -147,8 +157,10 @@ def mean_images(self, save=True, show=False):
 
     def num_images_by_category(self, save=True, show=False):
         """ the function to display the no of images per category.
-            :save-param:the boolean  for instructing  to save the file.
-            :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
+            :param save:the boolean  for instructing  to save the file.
+            :type  save:boolean
+            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
+            :type show:boolean"""
         counts = self.dataset['Label'].value_counts()
         plot = sns.barplot(x=counts.index, y=counts.tolist())
         self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show)
@@ -156,10 +168,11 @@ def num_images_by_category(self, save=True, show=False):
         self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show)
 
     def std_vs_mean(self, save=True, show=False):
-        """the function used to plot the graph of the standard deviation versus the mean.
-          : self_param:The dataset on which the analysis is used.
-          :save-param:the boolean  for instructing  to save the file.
-          :show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
+       """the function used to plot the graph of the standard deviation versus the mean.
+          :param save:the boolean  for instructing  to save the file.
+            :type  save:boolean
+            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
+            :type show:boolean"""
         
         groups = self.dataset.groupby('Label')
         y = []
@@ -185,9 +198,13 @@ def std_vs_mean(self, save=True, show=False):
 
     def t_sne(self, batch_size=32, save=True, show=False):
         """t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space.
-            :batch_size:the dataset is dividied into batches for smooth functioning of the model on the dataset.
-            :save-param:the boolean  for instructing  to save the file.
-            ::show-param:to display the ratios,the plot ,the labels and everything related to visualisation."""
+            :param batch_size:the dataset is dividied into batches for smooth functioning of the model on the dataset.
+            :type batch_size:integer
+            :param save:the boolean  for instructing  to save the file.
+            :type  save:boolean
+            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
+            :type show:boolean
+            """
         
         model = ResNet50(weights='imagenet', pooling=max, include_top = False)
         features_list = []

From 6bbe76fa9b78581d3b42803f6478804eb81504a3 Mon Sep 17 00:00:00 2001
From: Sibasish-Padhy <70088281+Sibasish-Padhy@users.noreply.github.com>
Date: Tue, 30 Mar 2021 23:33:25 +0530
Subject: [PATCH 13/17] Update image_visualize.py

---
 klar_eda/visualize/image_visualize.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index 8f4a1c5..edc186a 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -28,9 +28,8 @@ class ImageDataVisualize:
 
     def __init__(self, data, labels, boxes=None):
         """ this function is for initializing the parameters to work on
-            :param self:the file from which we have to take the data to work on
-            :type self:csv file
             :param data:the images form our dataset
+            :type  data:csv file.
             :param labels:to categorize the images.
             :param boxes:a null parameter used for storing the dimensions of the images.
         """
@@ -61,7 +60,7 @@ def __init__(self, data, labels, boxes=None):
 
     def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False):
             """ to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type.
-                 :param plot-the figure to be plotted for graphical visualization.
+                 :param plot:the figure to be plotted for graphical visualization.
                  :type plot:png file.
                  :param file_name: the name of the file to be stored.
                  :type file_name:csv file
@@ -71,7 +70,7 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa
                  :type y-label: string
                  : param save: the boolean parameter passed for saving the file.
                  :type save:boolean
-                 :param show: display the fiel along with its title and also displayin gthe plot.
+                 :param show: display the fiel along with its title and also displaying the plot.
                  :type show:boolean        """
   
         if save:
@@ -93,7 +92,7 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa
 
     def validate_images(self):
         """validate_images:the function used to validate images,whether or not  it has the required no of dimensions  and whether  it's a numpy array or not.
-            : param self-the dataset on which the visualization and the analysis has to be performed."""
+           : param self-the dataset on which the visualization and the analysis has to be performed."""
         for image, label in zip(self.images, self.labels):
             if type(image) != np.ndarray:
                 print('Image not a numpy array, skipping...')

From 51b920024493a79fdfa093281aac0fa39f2bcf21 Mon Sep 17 00:00:00 2001
From: ask149 <ashishkshirsagar@gmail.com>
Date: Wed, 31 Mar 2021 01:04:15 +0530
Subject: [PATCH 14/17] Refactor the code to the standards

---
 docsource/image_visualize.rst         |   6 +
 docsource/index.rst                   |   1 +
 klar_eda/visualize/image_visualize.py | 159 ++++++++++++++------------
 3 files changed, 94 insertions(+), 72 deletions(-)
 create mode 100644 docsource/image_visualize.rst

diff --git a/docsource/image_visualize.rst b/docsource/image_visualize.rst
new file mode 100644
index 0000000..46dbb00
--- /dev/null
+++ b/docsource/image_visualize.rst
@@ -0,0 +1,6 @@
+Image Visualize
+=========================
+
+.. automodule:: klar_eda.visualize.image_visualize
+   :members:
+   :undoc-members:
diff --git a/docsource/index.rst b/docsource/index.rst
index efdb37d..1ce5013 100644
--- a/docsource/index.rst
+++ b/docsource/index.rst
@@ -12,6 +12,7 @@ klar-eda's documentation!
 
    preprocess
    visualize
+   image_visualize
 
 
 Indices and tables
diff --git a/klar_eda/visualize/image_visualize.py b/klar_eda/visualize/image_visualize.py
index edc186a..5c6d0a6 100644
--- a/klar_eda/visualize/image_visualize.py
+++ b/klar_eda/visualize/image_visualize.py
@@ -14,8 +14,8 @@
 
 ############################################################################
 # To do: 1) resizing for the funnctions that require uniform size
-#        2) handle rgb/gray images 
-#        3) axis labels, plot title  
+#        2) handle rgb/gray images
+#        3) axis labels, plot title
 #        4) num components in eigen images
 #        5) optimize mean/eigen computation
 #        6) optimize std vs mean, different types of plots
@@ -27,11 +27,12 @@ class ImageDataVisualize:
 
 
     def __init__(self, data, labels, boxes=None):
-        """ this function is for initializing the parameters to work on
-            :param data:the images form our dataset
-            :type  data:csv file.
-            :param labels:to categorize the images.
-            :param boxes:a null parameter used for storing the dimensions of the images.
+        """Constructor for Image Data Visualization.
+
+        :param data: images
+        :type data: list of numpy image arrays
+        :param labels: labels corresponding to each image
+        :param boxes: list containing shape of each image
         """
         self.images = data
         self.labels = labels
@@ -56,23 +57,27 @@ def __init__(self, data, labels, boxes=None):
         self.dataset['area'] = self.dataset['Height'] * self.dataset['Width']
         print('Number of images after validation and filtering:', self.num_images)
 
- 
+
 
     def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, save=True, show=False):
-            """ to save the file(plot_type) in its designated directory or to make the path for the directory if such directory doesn't exist and then displaying the file type.
-                 :param plot:the figure to be plotted for graphical visualization.
-                 :type plot:png file.
-                 :param file_name: the name of the file to be stored.
-                 :type file_name:csv file
-                 :param x-label : the label to be put on the x-axis of the graph
-                 :type x-label: string
-                 :param y-label: the label to be put on the y-axis of the graph
-                 :type y-label: string
-                 : param save: the boolean parameter passed for saving the file.
-                 :type save:boolean
-                 :param show: display the fiel along with its title and also displaying the plot.
-                 :type show:boolean        """
-  
+        """To save the file(plot_type) in its designated directory or to make
+        the path for the directory if such directory doesn't exist and then
+        displaying the file type.
+
+        :param plot: The figure to be plotted for graphical visualization.
+        :type plot: png file.
+        :param file_name: The filename to be stored.
+        :type file_name: csv file
+        :param x-label : The label to be put on the x-axis of the graph
+        :type x-label: string
+        :param y-label: The label to be put on the y-axis of the graph
+        :type y-label: string
+        :param save: To save the results in the background
+        :type save: boolean
+        :param show: To display the images in the foreground
+        :type show: boolean
+        """
+
         if save:
             save_dir = join(VIZ_ROOT, plot_type)
             if not exists(save_dir):
@@ -90,14 +95,14 @@ def save_or_show(self, plot, plot_type, file_name,x_label=None, y_label=None, sa
         plt.clf()
 
 
+
+
     def validate_images(self):
-        """validate_images:the function used to validate images,whether or not  it has the required no of dimensions  and whether  it's a numpy array or not.
-           : param self-the dataset on which the visualization and the analysis has to be performed."""
+        """Function used to validate images, whether or not it has the required
+        no of dimensions and whether it's a numpy array or not."""
         for image, label in zip(self.images, self.labels):
             if type(image) != np.ndarray:
                 print('Image not a numpy array, skipping...')
-                self.images.remove(image)
-                self.labels.remove(label)
                 continue
             elif image.ndim < 2:
                 print('Image has less than 2 dimensions, skipping...')
@@ -105,38 +110,43 @@ def validate_images(self):
                 self.labels.remove(label)
                 continue
 
+                continue
+
     def aspect_ratio_histogram(self, save=True, show=False):
-        """ aspect_ratio_histogram:the function used to define the aspect ratio of the histogram.aspect_ratio=Width/Height.
-            :param save:the boolean  for instructing  to save the file.
-            :type  save:boolean
-            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
-            :type show:boolean
+        """Function used to plot the aspect ratio histogram for the dataset.
+
+        :param save: To save the results in the background
+        :type save: boolean
+        :param show: To display the images in the foreground
+        :type show: boolean
         """
         aspect_ratios = self.dataset['Width'] / self.dataset['Height']
         plot = sns.histplot(aspect_ratios, bins='auto')
-        self.save_or_show(plot.figure, 'aspect_ratios', 'aspect_ratios', x_label='aspect_ratios', y_label='frequency', save=save, show=show)
+
 
 
     def area_vs_category(self, save=True, show=False):
-        """area_vs_category:the plot to show the areas percategory(label).
-            :param save:the boolean  for instructing  to save the file.
-            :type  save:boolean
-            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
-            :type show:boolean"""
+        """Function used to plot area per category of the images.
+
+        :param save: To save the results in the background
+        :type save: boolean
+        :param show: To display the images in the foreground
+        :type show: boolean
+        """
         mean_areas = self.dataset.groupby('Label')['area'].mean()
         plot = sns.barplot(x=mean_areas.index, y=mean_areas.tolist())
         self.save_or_show(plot.figure, 'area_vs_category', 'area_vs_category', x_label='category',y_label= 'area', save=save, show=show)
 
     def mean_images(self, save=True, show=False):
-        """The function for evaluating the mean of the areas per category.
-            :param save:the boolean  for instructing  to save the file.
-            :type  save:boolean
-            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
-            :type show:boolean"""
+        """Function used for evaluating the mean of the areas per category.
+
+        :param save: To save the results in the background
+        :type save: boolean
+        :param show: To display the images in the foreground
+        :type show: boolean
+        """
         groups = self.dataset.groupby('Label')
-        for group in groups:
-            images = group[1]['Image']
-            mean_image = np.array(list(images)).mean(axis=0)
+        for group in groups:array(list(images)).mean(axis=0)
             plot = plt.imshow(mean_image/255)
             self.save_or_show(plot.figure, 'mean_images', str(group[0]), save=save, show=show)
 
@@ -154,29 +164,33 @@ def mean_images(self, save=True, show=False):
                 plot = plt.imshow(img)
                 self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show)
 
+                self.save_or_show(plot.figure, 'eigen_images/{}'.format(group[0]), str(i), save=save, show=show)
+
     def num_images_by_category(self, save=True, show=False):
-        """ the function to display the no of images per category.
-            :param save:the boolean  for instructing  to save the file.
-            :type  save:boolean
-            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
-            :type show:boolean"""
+        """Function used to display the no of images per category.
+
+        :param save: To save the results in the background
+        :type save: boolean
+        :param show: To display the images in the foreground
+        :type show: boolean
+        """
         counts = self.dataset['Label'].value_counts()
         plot = sns.barplot(x=counts.index, y=counts.tolist())
-        self.save_or_show(plot.figure, 'num_images_by_category', 'bar_chart',x_label='category', y_label='No. of images', save=save, show=show)
-        plot = plt.pie(counts.tolist(), labels=counts.index)
+        self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show)
+
         self.save_or_show(plt, 'num_images_by_category', 'pie_chart', save=save, show=show)
 
     def std_vs_mean(self, save=True, show=False):
-       """the function used to plot the graph of the standard deviation versus the mean.
-          :param save:the boolean  for instructing  to save the file.
-            :type  save:boolean
-            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
-            :type show:boolean"""
-        
+        """Function used to plot the graph of the standard deviation versus the
+        mean plot.
+
+        :param save: To save the results in the background
+        :type save: boolean
+        :param show: To display the images in the foreground
+        :type show: boolean
+        """
         groups = self.dataset.groupby('Label')
-        y = []
-        x = []
-        hue = []
+        y = [][]
         for group in groups:
             images = group[1]['Image']
             images = np.array(list(images))
@@ -196,24 +210,25 @@ def std_vs_mean(self, save=True, show=False):
 
 
     def t_sne(self, batch_size=32, save=True, show=False):
-        """t-SNE gives you a feel or intuition of how the data is arranged in a high-dimensional space.
-            :param batch_size:the dataset is dividied into batches for smooth functioning of the model on the dataset.
-            :type batch_size:integer
-            :param save:the boolean  for instructing  to save the file.
-            :type  save:boolean
-            :param show:to display the ratios,the plot ,the labels and everything related to visualisation.
-            :type show:boolean
-            """
+        """ t-distributed Stochastic Neighbor Embedding - used to visualize high dimensional data
         
+            :param batch_size: The size of the batch
+            :type batch_size: integer
+            :param save: To save the results in the background
+            :type save: boolean
+            :param show: To display the images in the foreground
+            :type show: boolean
+            """
+
         model = ResNet50(weights='imagenet', pooling=max, include_top = False)
         features_list = []
         print('Extracting features ...')
         for image in tqdm(self.images):
             if self.grey_present and (image.ndim < 3 or image.shape[-1] == 1):
                 image = np.stack((image.squeeze(),)*3, axis=-1)
-            image = np.expand_dims(image, axis=0) 
-            image = preprocess_input(image) 
-            features = model.predict(image) 
+            image = np.expand_dims(image, axis=0)
+            image = preprocess_input(image)
+            features = model.predict(image)
             features_reduce = features.squeeze()
             features_list.append(features_reduce)
 

From c2a82cccb8b39f97d48e39e9d83fa72017ed0ac1 Mon Sep 17 00:00:00 2001
From: Ashish <54330052+ashish-hacker@users.noreply.github.com>
Date: Fri, 2 Apr 2021 01:56:57 +0530
Subject: [PATCH 15/17]  Added the standardisation function (#29)

* Added the standardisation function

* Added Mean Normalization & changed name of the function standardize to z_score_normalization

* Added the standardisation function

* Added Mean Normalization & changed name of the function standardize to z_score_normalization

Co-authored-by: ask149 <ashishkshirsagar@gmail.com>
---
 klar_eda/preprocess/csv_preprocess.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/klar_eda/preprocess/csv_preprocess.py b/klar_eda/preprocess/csv_preprocess.py
index 862191c..1c157d4 100644
--- a/klar_eda/preprocess/csv_preprocess.py
+++ b/klar_eda/preprocess/csv_preprocess.py
@@ -84,6 +84,25 @@ def normalize_numerical(self):
         for col in self.numerical_column_list:
             if col != self.target_column:
                 self.df[col]=(self.df[col]-self.df[col].min())/(self.df[col].max()-self.df[col].min())
+    def standardize(self):
+        """Standardise numerical feature columns in place (z-score normalisation).
+
+        Each column becomes z = (x - mean) / std, giving zero mean and unit
+        variance so features on different scales (e.g. age vs. salary)
+        contribute comparably; this also helps gradient-descent optimisers
+        converge faster. The target column is left unchanged.
+        """
+        for col in self.numerical_column_list:
+            if col != self.target_column:
+                # z = (x - mean) / (standard deviation)
+                self.df[col] = (self.df[col] - self.df[col].mean()) / self.df[col].std()
+
+    def mean_normalization(self):
+        """Mean-normalise numerical columns in place: x' = (x - mean(x)) / (max(x) - min(x)).
+
+        The target column is skipped, mirroring normalize_numerical."""
+        for col in (c for c in self.numerical_column_list if c != self.target_column):
+            self.df[col] = (self.df[col] - self.df[col].mean()) / (self.df[col].max() - self.df[col].min())
 
     def encode_categorical(self):
         enc = OneHotEncoder(handle_unknown='ignore')
@@ -160,4 +179,4 @@ def convert_date_format(self, input_date, output_date_format = 'DD/MM/YYYY'):
         
         parsed_date = dateutil.parser.parse(input_date, dayfirst=True)
         self.converted_date = parsed_date.strftime(output_date_formats[output_date_format])
-        return self.converted_date
\ No newline at end of file
+        return self.converted_date

From 9903dab6df8db33c1856ed2f957aea808a749f8b Mon Sep 17 00:00:00 2001
From: Shankhanil Ghosh <shankha.rik@gmail.com>
Date: Mon, 5 Apr 2021 09:31:50 +0530
Subject: [PATCH 16/17] resolved conflict in csv_preprocess

---
 klar_eda/preprocess/csv_preprocess.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/klar_eda/preprocess/csv_preprocess.py b/klar_eda/preprocess/csv_preprocess.py
index 9a2bcd9..1c157d4 100644
--- a/klar_eda/preprocess/csv_preprocess.py
+++ b/klar_eda/preprocess/csv_preprocess.py
@@ -72,13 +72,8 @@ def fill_numerical_na(self, ret = False):
                         self.df[col] = y
             except Exception as e:
                 pass
-<<<<<<< HEAD
             if ret == True:
                 return self.df
-=======
-        if ret == True:
-            return self.df
->>>>>>> issue22
 
     def fill_categorical_na(self, ret = False):
         self.df = self.df.fillna("Unknown")

From b80fd1d9bfdc7e05827fe0bb9fedb08d4cc18990 Mon Sep 17 00:00:00 2001
From: Shankhanil Ghosh <shankha.rik@gmail.com>
Date: Fri, 16 Apr 2021 12:13:40 +0530
Subject: [PATCH 17/17] removed file

---
 klar_eda/preprocess/csv_preprocess.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/klar_eda/preprocess/csv_preprocess.py b/klar_eda/preprocess/csv_preprocess.py
index 1c157d4..9a2bcd9 100644
--- a/klar_eda/preprocess/csv_preprocess.py
+++ b/klar_eda/preprocess/csv_preprocess.py
@@ -72,8 +72,13 @@ def fill_numerical_na(self, ret = False):
                         self.df[col] = y
             except Exception as e:
                 pass
+<<<<<<< HEAD
             if ret == True:
                 return self.df
+=======
+        if ret == True:
+            return self.df
+>>>>>>> issue22
 
     def fill_categorical_na(self, ret = False):
         self.df = self.df.fillna("Unknown")