Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat(model): Add agent for logistic regression model #100

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ Get the help by running `atarashi -h` or `atarashi --help`
- Running **wordFrequencySimilarity** agent

`atarashi -a wordFrequencySimilarity /path/to/file.c`
- Running **logisticRegression** agent

`atarashi -a logisticRegression /path/to/file.c`
- Running **tfidf** agent
- With **Cosine similarity**

Expand Down
89 changes: 89 additions & 0 deletions atarashi/agents/logisticRegression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Copyright 2022 Sushant Kumar ([email protected])
SPDX-License-Identifier: GPL-2.0
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""

__author__ = 'Sushant Kumar'
__email__ = '[email protected]'

import argparse

from atarashi.agents.atarashiAgent import AtarashiAgent
from atarashi.libs.initialmatch import spdx_identifer
from logreg import logreg


class LogisticRegression(AtarashiAgent):
    '''
    License scanning agent backed by the ``logreg`` logistic regression model.

    A file that carries SPDX identifiers is resolved from those identifiers
    alone; any other file is classified by the model from its preprocessed
    comment text.
    '''

    def __init__(self, licenseList):
        '''
        :param licenseList: License list, forwarded unchanged to AtarashiAgent
        '''
        super().__init__(licenseList)

    def predict_shortname(self, processed_comment):
        '''
        Run the model on a single preprocessed comment.

        :param processed_comment: Extracted and preprocessed comment text
        :return: The value returned by ``logreg`` for a one-element batch;
                 scan() reads the predicted short name from index 0
        '''
        # logreg is called with a list of samples, so wrap the single comment.
        return logreg([processed_comment])

    def scan(self, filePath):
        '''
        Read the content of filePath and look for SPDX identifiers first.
        Only when none are found, extract and preprocess the comments and
        ask the model for the license short name.

        :param filePath: Path of the file to scan
        :return: List of matches. Either the SPDX matches as returned by
                 spdx_identifer, or a single dictionary with the keys
                 shortname, sim_score, sim_type and description
        '''
        # Decode explicitly and tolerate undecodable bytes: scanned files are
        # arbitrary source files, and with the locale default encoding and
        # strict error handling one stray byte would abort the scan before
        # the SPDX check could run.
        with open(filePath, encoding='utf-8', errors='replace') as source_file:
            raw_data = source_file.read()

        spdx_matches = spdx_identifer(raw_data, self.licenseList['shortname'])
        if spdx_matches:
            return list(spdx_matches)

        # loadFile receives the path, not raw_data.
        # NOTE(review): presumably it re-reads the file itself - confirm
        # against AtarashiAgent.loadFile.
        processed_comment = super().loadFile(filePath)
        license_name = self.predict_shortname(processed_comment)

        return [{
            'shortname': str(license_name[0]),
            # Fixed score: this is not a probability reported by the model.
            'sim_score': 1.0,
            'sim_type': 'logisticRegression',
            'description': '',
        }]


if __name__ == '__main__':
    # Stand-alone entry point: scan one file with the logistic regression
    # agent and show the result on stdout.
    parser = argparse.ArgumentParser()
    parser.add_argument('processedLicenseList',
                        help='Specify the processed license list file')
    parser.add_argument('inputFile',
                        help='Specify the input file which needs to be scanned')

    args = parser.parse_args()

    licenseList = args.processedLicenseList
    filename = args.inputFile

    scanner = LogisticRegression(licenseList)
    # scan() only returns its matches; print them, otherwise running this
    # module directly produces no visible output.
    print(scanner.scan(filename))
7 changes: 5 additions & 2 deletions atarashi/atarashii.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from atarashi.agents.dameruLevenDist import DameruLevenDist
from atarashi.agents.tfidf import TFIDF
from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
from atarashi.agents.logisticRegression import LogisticRegression

__author__ = "Aman Jain"
__email__ = "[email protected]"
Expand All @@ -46,7 +47,6 @@ def atarashii_runner(inputFile, processedLicense, agent_name,
:param ngramJsonLoc: Specify N-Gram Json File location
:param verbose: Specify if verbose mode is on or not (Default is Off/ None)
:return: Returns the array of JSON with scan results
+------------+-----------------------------------------------------------+
| shortname | Short name of the license |
+------------+-----------------------------------------------------------+
Expand Down Expand Up @@ -78,6 +78,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim",
scanner = WordFrequencySimilarity(processedLicense)
elif agent_name == "DLD":
scanner = DameruLevenDist(processedLicense)
elif agent_name == "logisticRegression":
scanner = LogisticRegression(processedLicense)
elif agent_name == "tfidf":
scanner = TFIDF(processedLicense)
if similarity == "CosineSim":
Expand Down Expand Up @@ -128,7 +130,8 @@ def main():
parser.add_argument("-l", "--processedLicenseList", required=False,
help="Specify the location of processed license list file")
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD',
'tfidf', 'Ngram', 'logisticRegression'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
Expand Down
13 changes: 7 additions & 6 deletions atarashi/build_deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""

from atarashi.license.license_merger import license_merger
from atarashi.license.licensePreprocessor import LicensePreprocessor
from atarashi.license.licenseDownloader import LicenseDownloader
from atarashi.libs.ngram import createNgrams
__author__ = "Gaurav Mishra"
__email__ = "[email protected]"

Expand All @@ -27,11 +31,6 @@
import sys
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + '/../')

from atarashi.libs.ngram import createNgrams
from atarashi.license.licenseDownloader import LicenseDownloader
from atarashi.license.licensePreprocessor import LicensePreprocessor
from atarashi.license.license_merger import license_merger


"""
Creates required files for Atarashi.
Expand All @@ -40,7 +39,8 @@
The merged CSV is then processed, and the processed list is used to create the Ngrams.
"""

def download_dependencies(threads = os.cpu_count(), verbose = 0):

def download_dependencies(threads=os.cpu_count(), verbose=0):
currentDir = os.path.dirname(os.path.abspath(__file__))
licenseListCsv = currentDir + "/data/licenses/licenseList.csv"
processedLicenseListCsv = currentDir + "/data/licenses/processedLicenses.csv"
Expand All @@ -58,6 +58,7 @@ def download_dependencies(threads = os.cpu_count(), verbose = 0):
print("** Generating Ngrams **")
createNgrams(processedLicenseListCsv, ngramJsonLoc, threads, verbose)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-t", "--threads", required = False, default = os.cpu_count(),
Expand Down
6 changes: 2 additions & 4 deletions atarashi/evaluator/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ def processFile(scan_input):
'''
processFile function runs the agent command on the bash/terminal and gets the
result for the given file
:param filepath: The path of the file to be scanned
:param similarity: Similarity type of the agent
:return: Returns 1 if the result found by agent is correct and otherwise returns false
Expand Down Expand Up @@ -89,7 +88,6 @@ def evaluate(scanner):
The Function runs the agent command on the bash/terminal and gets the result.
The license name is then parsed from the result and matched with the actual
name. Successful matched % is then returned as accuracy.
:param scanner: Scanner object prepared to run scans
:return: Time elapsed in the evaluation & the accuracy
:rtype: float, int
Expand Down Expand Up @@ -118,7 +116,8 @@ def evaluate(scanner):
defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD',
'tfidf', 'Ngram', 'logisticRegression'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
Expand Down Expand Up @@ -156,4 +155,3 @@ def evaluate(scanner):
print(' ' + '+' * 44)

shutil.rmtree('TestFiles')

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ requires = [
"textdistance>=3.0.3",
"pyxDamerauLevenshtein>=1.5",
"nirjas>=0.0.5",
"urllib3>=1.24.1"
"urllib3>=1.24.1",
"logreg>=0.1.0"
]
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ textdistance>=3.0.3
setuptools>=39.2.0
nirjas>=0.0.5
urllib3>=1.24.1
logreg>=0.1.0
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ def read(fname):
'tqdm>=4.42.0',
'pandas>=0.23.1',
'urllib3>=1.24.1',
'nirjas>=0.0.5'
'nirjas>=0.0.5',
'logreg>=0.1.0'
]

requirements = [
Expand All @@ -68,7 +69,8 @@ def read(fname):
'textdistance>=3.0.3',
'pyxDamerauLevenshtein>=1.5',
'urllib3>=1.24.1',
'nirjas>=0.0.5'
'nirjas>=0.0.5',
'logreg>=0.1.0'
]

class BuildAtarashiDependencies(distutils.cmd.Command):
Expand Down