feat(model): Add log_reg agent
its-sushant committed Jun 24, 2022
1 parent 5071444 commit 3c5b9d1
Showing 9 changed files with 130 additions and 16 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -38,6 +38,7 @@ https://fossology.github.io/atarashi
1. `data/Ngram_keywords.json`
2. `licenses/<SPDX-version>.csv`
3. `licenses/processedList.csv`
4. `models/logreg`
- These files will be placed in their appropriate locations by the install script.

### Installing just dependencies
@@ -60,6 +61,9 @@ Get the help by running `atarashi -h` or `atarashi --help`
- Running **wordFrequencySimilarity** agent

`atarashi -a wordFrequencySimilarity /path/to/file.c`
- Running **logisticRegression** agent

`atarashi -a logisticRegression /path/to/file.c`
- Running **tfidf** agent
- With **Cosine similarity**

95 changes: 95 additions & 0 deletions atarashi/agents/logisticRegression.py
@@ -0,0 +1,95 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Copyright 2022 Sushant Kumar ([email protected])
SPDX-License-Identifier: GPL-2.0
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""

__author__ = 'Sushant Kumar'
__email__ = '[email protected]'

import pickle
import os
import argparse

from atarashi.agents.atarashiAgent import AtarashiAgent
from atarashi.libs.initialmatch import spdx_identifer


class LogisticRegression(AtarashiAgent):

def __init__(self, licenseList):
super().__init__(licenseList)

def predict_shortname(self, processed_comment):
'''
        :param processed_comment: extracted and preprocessed comment
:return: Returns the predicted license's short name
'''

        current_dir = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(current_dir, '..', 'data', 'models', 'logreg')
        with open(model_path, 'rb') as f:
            classifier = pickle.load(f)

processed_comment = [processed_comment]
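        # predict() expects an iterable of documents and returns an array of
        # predicted license short names; scan() uses the first entry.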
return classifier.predict(processed_comment)

def scan(self, filePath):
'''
        Read the content of filePath, extract the comments and preprocess them.
Find the predicted short name for the preprocessed file.
:param filePath: Path of the file to scan
:return: Returns the license's short name
'''

match = []

with open(filePath) as file:
raw_data = file.read()

spdx_identifers = spdx_identifer(raw_data,
self.licenseList['shortname'])
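        # Prefer an explicit SPDX-License-Identifier tag when one is present;
        # otherwise fall back to the logistic regression prediction.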
if spdx_identifers:
match.extend(spdx_identifers)
else:
processed_comment = super().loadFile(filePath)
license_name = self.predict_shortname(processed_comment)

match.append({
'shortname': str(license_name[0]),
'sim_score': 1.0,
'sim_type': 'logisticRegression',
'description': '',
})
return match


if __name__ == '__main__':

parser = argparse.ArgumentParser()
parser.add_argument('processedLicenseList',
help='Specify the processed license list file')
parser.add_argument('inputFile',
help='Specify the input file which needs to be scanned'
)

args = parser.parse_args()

licenseList = args.processedLicenseList
filename = args.inputFile

scanner = LogisticRegression(licenseList)
scanner.scan(filename)
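For context, `predict_shortname` unpickles `data/models/logreg` and calls `predict()` on a one-element list of preprocessed text, so the stored object must accept raw strings. The model itself is trained in the Minerva-Dataset-Generation repository referenced in `build_deps.py`; the sketch below only illustrates what a compatible artifact could look like, assuming a scikit-learn pipeline (the CSV name and column names are hypothetical):

```python
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Hypothetical training data: one preprocessed license text per row.
data = pd.read_csv('license_comments.csv')  # assumed columns: 'text', 'shortname'

# A pipeline keeps the vectorizer and classifier together, so the agent can
# call predict() directly on raw strings after unpickling.
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
pipeline.fit(data['text'], data['shortname'])

with open('atarashi/data/models/logreg', 'wb') as f:
    pickle.dump(pipeline, f)
```

Note that the agent re-loads the pickle on every call to `predict_shortname`; a long-running deployment might instead cache the loaded classifier on the instance.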
7 changes: 5 additions & 2 deletions atarashi/atarashii.py
@@ -28,6 +28,7 @@
from atarashi.agents.dameruLevenDist import DameruLevenDist
from atarashi.agents.tfidf import TFIDF
from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
from atarashi.agents.logisticRegression import LogisticRegression

__author__ = "Aman Jain"
__email__ = "[email protected]"
@@ -46,7 +47,6 @@ def atarashii_runner(inputFile, processedLicense, agent_name,
:param ngramJsonLoc: Specify N-Gram Json File location
:param verbose: Specify if verbose mode is on or not (Default is Off/ None)
:return: Returns the array of JSON with scan results
+------------+-----------------------------------------------------------+
| shortname | Short name of the license |
+------------+-----------------------------------------------------------+
@@ -78,6 +78,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim",
scanner = WordFrequencySimilarity(processedLicense)
elif agent_name == "DLD":
scanner = DameruLevenDist(processedLicense)
elif agent_name == "logisticRegression":
scanner = LogisticRegression(processedLicense)
elif agent_name == "tfidf":
scanner = TFIDF(processedLicense)
if similarity == "CosineSim":
@@ -128,7 +130,8 @@ def main():
parser.add_argument("-l", "--processedLicenseList", required=False,
help="Specify the location of processed license list file")
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD',
'tfidf', 'Ngram', 'logisticRegression'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
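Beyond the CLI, the new agent can also be exercised programmatically through `build_scanner_obj`. A hypothetical example (the license-list path is a placeholder and the remaining keyword arguments are assumed to keep their defaults):

```python
from atarashi.atarashii import build_scanner_obj

# Build a scanner for the logisticRegression agent and scan one file.
scanner = build_scanner_obj("atarashi/data/licenses/processedLicenses.csv",
                            "logisticRegression")
matches = scanner.scan("/path/to/file.c")
print(matches)
# Expected shape (values illustrative):
# [{'shortname': 'MIT', 'sim_score': 1.0,
#   'sim_type': 'logisticRegression', 'description': ''}]
```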
20 changes: 14 additions & 6 deletions atarashi/build_deps.py
@@ -19,19 +19,19 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""

from atarashi.license.license_merger import license_merger
from atarashi.license.licensePreprocessor import LicensePreprocessor
from atarashi.license.licenseDownloader import LicenseDownloader
from atarashi.libs.ngram import createNgrams
__author__ = "Gaurav Mishra"
__email__ = "[email protected]"

import argparse
import os
import sys
from requests import get
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + '/../')

from atarashi.libs.ngram import createNgrams
from atarashi.license.licenseDownloader import LicenseDownloader
from atarashi.license.licensePreprocessor import LicensePreprocessor
from atarashi.license.license_merger import license_merger


"""
Creates required files for Atarashi.
@@ -40,11 +40,14 @@
The merged CSV is then processed and used to create the Ngrams.
"""

def download_dependencies(threads = os.cpu_count(), verbose = 0):

def download_dependencies(threads=os.cpu_count(), verbose=0):
currentDir = os.path.dirname(os.path.abspath(__file__))
licenseListCsv = currentDir + "/data/licenses/licenseList.csv"
processedLicenseListCsv = currentDir + "/data/licenses/processedLicenses.csv"
ngramJsonLoc = currentDir + "/data/Ngram_keywords.json"
model_logreg = currentDir + "/data/models/logreg"
url = "https://github.com/its-sushant/Minerva-Dataset-Generation/raw/feat/model/Model-train/models/logreg"

print("** Downloading SPDX licenses **")
spdxLicenseList = LicenseDownloader.download_license(threads)
@@ -57,6 +60,11 @@ def download_dependencies(threads = os.cpu_count(), verbose = 0):
verbose = verbose)
print("** Generating Ngrams **")
createNgrams(processedLicenseListCsv, ngramJsonLoc, threads, verbose)
print("** Downloading Models **")
r = get(url)
  with open(model_logreg, 'wb') as f:
    f.write(r.content)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
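The download above writes whatever the server returns straight to disk. A slightly more defensive variant, not part of this commit, might check the HTTP status and confirm the artifact unpickles (the helper name and timeout are illustrative assumptions):

```python
import pickle

from requests import get


def fetch_logreg_model(url, dest_path):
  """Download the pickled logistic regression model and sanity-check it."""
  response = get(url, timeout=60)
  response.raise_for_status()  # fail loudly on 404/500 instead of saving an error page
  with open(dest_path, 'wb') as model_file:
    model_file.write(response.content)
  # Confirm the artifact is a loadable pickle before declaring success.
  with open(dest_path, 'rb') as model_file:
    pickle.load(model_file)
```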
Empty file added atarashi/data/models/.gitkeep
Empty file.
6 changes: 2 additions & 4 deletions atarashi/evaluator/evaluator.py
@@ -56,7 +56,6 @@ def processFile(scan_input):
'''
processFile function runs the agent command on the bash/terminal and gets the
result for the given file
:param filepath: The path of the file to be scanned
:param similarity: Similarity type of the agent
:return: Returns 1 if the result found by agent is correct and otherwise returns false
@@ -89,7 +88,6 @@ def evaluate(scanner):
The Function runs the agent command on the bash/terminal and gets the result.
The license name is then parsed from the result and matched with the actual
name. Successful matched % is then returned as accuracy.
:param scanner: Scanner object prepared to run scans
:return: Time elapsed in the evaluation & the accuracy
:rtype: float, int
@@ -118,7 +116,8 @@
defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD',
'tfidf', 'Ngram', 'logisticRegression'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
@@ -156,4 +155,3 @@ def evaluate(scanner):
print(' ' + '+' * 44)

shutil.rmtree('TestFiles')

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -10,5 +10,6 @@ requires = [
"textdistance>=3.0.3",
"pyxDamerauLevenshtein>=1.5",
"nirjas>=0.0.5",
"urllib3>=1.24.1"
"urllib3>=1.24.1",
"requests>=2.23.0"
]
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,3 +8,4 @@ textdistance>=3.0.3
setuptools>=39.2.0
nirjas>=0.0.5
urllib3>=1.24.1
requests>=2.23.0
10 changes: 7 additions & 3 deletions setup.py
@@ -55,7 +55,8 @@ def read(fname):
'tqdm>=4.42.0',
'pandas>=0.23.1',
'urllib3>=1.24.1',
'nirjas>=0.0.5'
'nirjas>=0.0.5',
'requests>=2.23.0'
]

requirements = [
@@ -68,7 +69,8 @@ def read(fname):
'textdistance>=3.0.3',
'pyxDamerauLevenshtein>=1.5',
'urllib3>=1.24.1',
'nirjas>=0.0.5'
'nirjas>=0.0.5',
'requests>=2.23.0'
]

class BuildAtarashiDependencies(distutils.cmd.Command):
@@ -78,6 +80,7 @@ class BuildAtarashiDependencies(distutils.cmd.Command):
1. data/Ngram_keywords.json
2. data/licenses/<spdx_license>.csv
3. data/licenses/processedLicenses.csv
4. data/models/logreg
"""
description = 'build Atarashi dependency files'
user_options = [
@@ -144,7 +147,8 @@ def run(self):
package_data = {
'atarashi': [
'data/Ngram_keywords.json',
'data/licenses/processedLicenses.csv'
'data/licenses/processedLicenses.csv',
'data/models/logreg'
]
},
cmdclass = {
