feat(model): Add log_reg agent
its-sushant committed Jun 24, 2022
1 parent 5071444 commit 3c5b9d1
Showing 9 changed files with 130 additions and 16 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -38,6 +38,7 @@ https://fossology.github.io/atarashi
1. `data/Ngram_keywords.json`
2. `licenses/<SPDX-version>.csv`
3. `licenses/processedList.csv`
4. `models/logreg`
- These files will be placed in their appropriate locations by the install script.

### Installing just dependencies
@@ -60,6 +61,9 @@ Get the help by running `atarashi -h` or `atarashi --help`
- Running **wordFrequencySimilarity** agent

`atarashi -a wordFrequencySimilarity /path/to/file.c`
- Running **logisticRegression** agent

`atarashi -a logisticRegression /path/to/file.c`
- Running **tfidf** agent
- With **Cosine similarity**

95 changes: 95 additions & 0 deletions atarashi/agents/logisticRegression.py
@@ -0,0 +1,95 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Copyright 2022 Sushant Kumar ([email protected])
SPDX-License-Identifier: GPL-2.0
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""

__author__ = 'Sushant Kumar'
__email__ = '[email protected]'

import pickle
import os
import argparse

from atarashi.agents.atarashiAgent import AtarashiAgent
from atarashi.libs.initialmatch import spdx_identifer


class LogisticRegression(AtarashiAgent):

def __init__(self, licenseList):
super().__init__(licenseList)

def predict_shortname(self, processed_comment):
'''
        :param processed_comment: extracted and preprocessed comment
:return: Returns the predicted license's short name
'''

        current_dir = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(current_dir, '..', 'data', 'models', 'logreg')
        with open(model_path, 'rb') as f:
            classifier = pickle.load(f)

processed_comment = [processed_comment]
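        # predict() expects an iterable of documents and returns an array of
        # predicted license short names; scan() uses the first entry.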
return classifier.predict(processed_comment)

def scan(self, filePath):
'''
        Read the content of filePath, extract the comments and preprocess them.
Find the predicted short name for the preprocessed file.
:param filePath: Path of the file to scan
:return: Returns the license's short name
'''

match = []

with open(filePath) as file:
raw_data = file.read()

spdx_identifers = spdx_identifer(raw_data,
self.licenseList['shortname'])
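        # Prefer an explicit SPDX-License-Identifier tag when one is present;
        # otherwise fall back to the logistic regression prediction.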
if spdx_identifers:
match.extend(spdx_identifers)
else:
processed_comment = super().loadFile(filePath)
license_name = self.predict_shortname(processed_comment)

match.append({
'shortname': str(license_name[0]),
'sim_score': 1.0,
'sim_type': 'logisticRegression',
'description': '',
})
return match


if __name__ == '__main__':

parser = argparse.ArgumentParser()
parser.add_argument('processedLicenseList',
help='Specify the processed license list file')
parser.add_argument('inputFile',
help='Specify the input file which needs to be scanned'
)

args = parser.parse_args()

licenseList = args.processedLicenseList
filename = args.inputFile

scanner = LogisticRegression(licenseList)
scanner.scan(filename)
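For context, `predict_shortname` unpickles `data/models/logreg` and calls `predict()` on a one-element list of preprocessed text, so the stored object must accept raw strings. The model itself is trained in the Minerva-Dataset-Generation repository referenced in `build_deps.py`; the sketch below only illustrates what a compatible artifact could look like, assuming a scikit-learn pipeline (the CSV name and column names are hypothetical):

```python
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Hypothetical training data: one preprocessed license text per row.
data = pd.read_csv('license_comments.csv')  # assumed columns: 'text', 'shortname'

# A pipeline keeps the vectorizer and classifier together, so the agent can
# call predict() directly on raw strings after unpickling.
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
pipeline.fit(data['text'], data['shortname'])

with open('atarashi/data/models/logreg', 'wb') as f:
    pickle.dump(pipeline, f)
```

Note that the agent re-loads the pickle on every call to `predict_shortname`; a long-running deployment might instead cache the loaded classifier on the instance.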
7 changes: 5 additions & 2 deletions atarashi/atarashii.py
@@ -28,6 +28,7 @@
from atarashi.agents.dameruLevenDist import DameruLevenDist
from atarashi.agents.tfidf import TFIDF
from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
from atarashi.agents.logisticRegression import LogisticRegression

__author__ = "Aman Jain"
__email__ = "[email protected]"
@@ -46,7 +47,6 @@ def atarashii_runner(inputFile, processedLicense, agent_name,
:param ngramJsonLoc: Specify N-Gram Json File location
:param verbose: Specify if verbose mode is on or not (Default is Off/ None)
:return: Returns the array of JSON with scan results
+------------+-----------------------------------------------------------+
| shortname | Short name of the license |
+------------+-----------------------------------------------------------+
@@ -78,6 +78,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim",
scanner = WordFrequencySimilarity(processedLicense)
elif agent_name == "DLD":
scanner = DameruLevenDist(processedLicense)
elif agent_name == "logisticRegression":
scanner = LogisticRegression(processedLicense)
elif agent_name == "tfidf":
scanner = TFIDF(processedLicense)
if similarity == "CosineSim":
@@ -128,7 +130,8 @@ def main():
parser.add_argument("-l", "--processedLicenseList", required=False,
help="Specify the location of processed license list file")
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD',
'tfidf', 'Ngram', 'logisticRegression'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
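Beyond the CLI, the new agent can also be exercised programmatically through `build_scanner_obj`. A hypothetical example (the license-list path is a placeholder and the remaining keyword arguments are assumed to keep their defaults):

```python
from atarashi.atarashii import build_scanner_obj

# Build a scanner for the logisticRegression agent and scan one file.
scanner = build_scanner_obj("atarashi/data/licenses/processedLicenses.csv",
                            "logisticRegression")
matches = scanner.scan("/path/to/file.c")
print(matches)
# Expected shape (values illustrative):
# [{'shortname': 'MIT', 'sim_score': 1.0,
#   'sim_type': 'logisticRegression', 'description': ''}]
```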
20 changes: 14 additions & 6 deletions atarashi/build_deps.py
@@ -19,19 +19,19 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""

from atarashi.license.license_merger import license_merger
from atarashi.license.licensePreprocessor import LicensePreprocessor
from atarashi.license.licenseDownloader import LicenseDownloader
from atarashi.libs.ngram import createNgrams
__author__ = "Gaurav Mishra"
__email__ = "[email protected]"

import argparse
import os
import sys
from requests import get
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + '/../')

from atarashi.libs.ngram import createNgrams
from atarashi.license.licenseDownloader import LicenseDownloader
from atarashi.license.licensePreprocessor import LicensePreprocessor
from atarashi.license.license_merger import license_merger


"""
Creates required files for Atarashi.
@@ -40,11 +40,14 @@
The merged CSV is then processed and used to create the Ngrams.
"""

def download_dependencies(threads = os.cpu_count(), verbose = 0):

def download_dependencies(threads=os.cpu_count(), verbose=0):
currentDir = os.path.dirname(os.path.abspath(__file__))
licenseListCsv = currentDir + "/data/licenses/licenseList.csv"
processedLicenseListCsv = currentDir + "/data/licenses/processedLicenses.csv"
ngramJsonLoc = currentDir + "/data/Ngram_keywords.json"
model_logreg = currentDir + "/data/models/logreg"
url = "https://github.com/its-sushant/Minerva-Dataset-Generation/raw/feat/model/Model-train/models/logreg"

print("** Downloading SPDX licenses **")
spdxLicenseList = LicenseDownloader.download_license(threads)
@@ -57,6 +60,11 @@ def download_dependencies(threads = os.cpu_count(), verbose = 0):
verbose = verbose)
print("** Generating Ngrams **")
createNgrams(processedLicenseListCsv, ngramJsonLoc, threads, verbose)
print("** Downloading Models **")
r = get(url)
  with open(model_logreg, 'wb') as f:
    f.write(r.content)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
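The download above writes whatever the server returns straight to disk. A slightly more defensive variant, not part of this commit, might check the HTTP status and confirm the artifact unpickles (the helper name and timeout are illustrative assumptions):

```python
import pickle

from requests import get


def fetch_logreg_model(url, dest_path):
  """Download the pickled logistic regression model and sanity-check it."""
  response = get(url, timeout=60)
  response.raise_for_status()  # fail loudly on 404/500 instead of saving an error page
  with open(dest_path, 'wb') as model_file:
    model_file.write(response.content)
  # Confirm the artifact is a loadable pickle before declaring success.
  with open(dest_path, 'rb') as model_file:
    pickle.load(model_file)
```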
Empty file added atarashi/data/models/.gitkeep
Empty file.
6 changes: 2 additions & 4 deletions atarashi/evaluator/evaluator.py
@@ -56,7 +56,6 @@ def processFile(scan_input):
'''
processFile function runs the agent command on the bash/terminal and gets the
result for the given file
:param filepath: The path of the file to be scanned
:param similarity: Similarity type of the agent
:return: Returns 1 if the result found by agent is correct and otherwise returns false
@@ -89,7 +88,6 @@ def evaluate(scanner):
The Function runs the agent command on the bash/terminal and gets the result.
The license name is then parsed from the result and matched with the actual
name. Successful matched % is then returned as accuracy.
:param scanner: Scanner object prepared to run scans
:return: Time elapsed in the evaluation & the accuracy
:rtype: float, int
@@ -118,7 +116,8 @@
defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD',
'tfidf', 'Ngram', 'logisticRegression'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
@@ -156,4 +155,3 @@ def evaluate(scanner):
print(' ' + '+' * 44)

shutil.rmtree('TestFiles')

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -10,5 +10,6 @@ requires = [
"textdistance>=3.0.3",
"pyxDamerauLevenshtein>=1.5",
"nirjas>=0.0.5",
"urllib3>=1.24.1"
"urllib3>=1.24.1",
"requests>=2.23.0"
]
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,3 +8,4 @@ textdistance>=3.0.3
setuptools>=39.2.0
nirjas>=0.0.5
urllib3>=1.24.1
requests>=2.23.0
10 changes: 7 additions & 3 deletions setup.py
@@ -55,7 +55,8 @@ def read(fname):
'tqdm>=4.42.0',
'pandas>=0.23.1',
'urllib3>=1.24.1',
'nirjas>=0.0.5'
'nirjas>=0.0.5',
'requests>=2.23.0'
]

requirements = [
@@ -68,7 +69,8 @@ def read(fname):
'textdistance>=3.0.3',
'pyxDamerauLevenshtein>=1.5',
'urllib3>=1.24.1',
'nirjas>=0.0.5'
'nirjas>=0.0.5',
'requests>=2.23.0'
]

class BuildAtarashiDependencies(distutils.cmd.Command):
@@ -78,6 +80,7 @@ class BuildAtarashiDependencies(distutils.cmd.Command):
1. data/Ngram_keywords.json
2. data/licenses/<spdx_license>.csv
3. data/licenses/processedLicenses.csv
4. data/models/logreg
"""
description = 'build Atarashi dependency files'
user_options = [
@@ -144,7 +147,8 @@ def run(self):
package_data = {
'atarashi': [
'data/Ngram_keywords.json',
'data/licenses/processedLicenses.csv'
'data/licenses/processedLicenses.csv',
'data/models/logreg'
]
},
cmdclass = {
