From 5474ca8ace16ba1671a273cc1c8cf7a0d586e3c0 Mon Sep 17 00:00:00 2001 From: Aman Dwivedi Date: Wed, 30 Dec 2020 14:10:12 +0530 Subject: [PATCH] refactor(argparse): Shifted from argparse to plac --- MANIFEST.in | 3 +- README.md | 2 +- atarashi/agents/atarashiAgent.py | 2 +- atarashi/agents/cosineSimNgram.py | 32 +++++----- atarashi/agents/dameruLevenDist.py | 28 ++++----- atarashi/agents/tfidf.py | 30 ++++------ atarashi/agents/wordFrequencySimilarity.py | 27 ++++----- atarashi/atarashii.py | 70 ++++++++++------------ atarashi/build_deps.py | 21 +++---- atarashi/evaluator/evaluator.py | 50 ++++++++-------- atarashi/imtihaan.py | 36 +++++------ atarashi/libs/commentPreprocessor.py | 28 ++++----- atarashi/libs/initialmatch.py | 2 +- atarashi/libs/license_clustering.py | 19 +++--- atarashi/libs/ngram.py | 29 ++++----- atarashi/libs/utils.py | 2 +- atarashi/license/licenseDownloader.py | 21 ++++--- atarashi/license/licenseLoader.py | 2 +- atarashi/license/licensePreprocessor.py | 24 ++++---- atarashi/license/license_merger.py | 22 +++---- pyproject.toml | 3 +- requirements.txt | 3 +- setup.py | 8 ++- 23 files changed, 215 insertions(+), 249 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 0957c605..eca21c5a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -14,5 +14,4 @@ include atarashi/data/Ngram_keywords.json prune .git prune venv -prune test* - +prune test* \ No newline at end of file diff --git a/README.md b/README.md index 8ed5f854..2d527ed6 100644 --- a/README.md +++ b/README.md @@ -195,4 +195,4 @@ This will generate file in `docs/_build/html`. Go to: index.html You can change the theme of the documentation by changing `html_theme` in config.py file in `docs/` folder. You can choose from {'alabaster', 'classic', 'sphinxdoc', 'scrolls', 'agogo', 'traditional', 'nature', 'haiku', 'pyramid', 'bizstyle'} -[Reference](https://www.sphinx-doc.org/en/master/usage/theming.html) +[Reference](https://www.sphinx-doc.org/en/master/usage/theming.html) \ No newline at end of file diff --git a/atarashi/agents/atarashiAgent.py b/atarashi/agents/atarashiAgent.py index 46332e8d..75bc11bd 100644 --- a/atarashi/agents/atarashiAgent.py +++ b/atarashi/agents/atarashiAgent.py @@ -72,4 +72,4 @@ def exactMatcher(licenseText, licenses): output.append(licenses.iloc[idx]['shortname']) if not output: return -1 - return output + return output \ No newline at end of file diff --git a/atarashi/agents/cosineSimNgram.py b/atarashi/agents/cosineSimNgram.py index acc6b391..a3d44d86 100644 --- a/atarashi/agents/cosineSimNgram.py +++ b/atarashi/agents/cosineSimNgram.py @@ -18,7 +18,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ -import argparse +import plac from enum import Enum import itertools import json @@ -184,24 +184,16 @@ def setSimAlgo(self, newAlgo): self.simType = newAlgo -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("processedLicenseList", help="Specify the processed license list file") - parser.add_argument("ngramJson", help="Specify the location of NGRAM JSON") - parser.add_argument("inputFile", help="Specify the input file which needs to be scanned") - parser.add_argument("-s", "--similarity", required=False, default="BigramCosineSim", - choices=["CosineSim", "DiceSim", "BigramCosineSim"], - help="Specify the similarity algorithm that you want") - parser.add_argument("-v", "--verbose", help="increase output verbosity", - action='count', default=0) - args = parser.parse_args() - - licenseList = args.processedLicenseList - ngramJsonLoc = args.ngramJson - inputFile = args.inputFile - simType = args.similarity - verbose = args.verbose +@plac.annotations( + licenseList = plac.Annotation("Specify the processed license list file", "positional", None, str, metavar="processedLicenseList"), + ngramJsonLoc = plac.Annotation("Specify the location of NGRAM JSON", metavar="ngramJson"), + inputFile = plac.Annotation("Specify the input file which needs to be scanned"), + similarity = plac.Annotation("Specify the similarity algorithm that you want", "option", "s", str, ["CosineSim", "DiceSim", "BigramCosineSim"], metavar="{CosineSim,DiceSim,BigramCosineSim}"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) +def main(licenseList, ngramJsonLoc, inputFile, similarity="BigramCosineSim", verbose=False): + simType = similarity scanner = NgramAgent(licenseList, ngramJson=ngramJsonLoc, verbose=verbose) if simType == "CosineSim": scanner.setSimAlgo(NgramAgent.NgramAlgo.cosineSim) @@ -215,3 +207,7 @@ def setSimAlgo(self, newAlgo): print("N-Gram identifier and " + str(simType) + " is " + str(result)) else: print("Result is nothing") + + +if __name__ == "__main__": + plac.call(main) \ No newline at end of file diff --git a/atarashi/agents/dameruLevenDist.py b/atarashi/agents/dameruLevenDist.py index aa6b0351..e1ea27ed 100644 --- a/atarashi/agents/dameruLevenDist.py +++ b/atarashi/agents/dameruLevenDist.py @@ -19,7 +19,7 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ -import argparse +import plac import sys from pyxdameraulevenshtein import damerau_levenshtein_distance @@ -62,18 +62,18 @@ def scan(self, filePath): return temp[0] -if __name__ == "__main__": - print("The file has been run directly") - parser = argparse.ArgumentParser() - parser.add_argument("inputFile", help="Specify the input file which needs to be scanned") - parser.add_argument("processedLicenseList", - help="Specify the processed license list file which contains licenses") - parser.add_argument("-v", "--verbose", help="increase output verbosity", - action="count", default=0) - args = parser.parse_args() - filename = args.inputFile - licenseList = args.processedLicenseList - verbose = args.verbose +@plac.annotations( + filename = plac.Annotation("Specify the input file which needs to be scanned", metavar="inputFile"), + licenseList = plac.Annotation("Specify the processed license list file which contains licenses", "positional", None, str, metavar="processedLicenseList"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) + +def main(filename, licenseList, verbose=False): + print("The file has been run directly") scanner = DameruLevenDist(licenseList, verbose=verbose) - print("License Detected using Dameru Leven Distance: " + str(scanner.scan(filename))) + print("License Detected using Dameru Leven Distance: " + str(scanner.scan(filename))) + + +if __name__ == "__main__": + plac.call(main) \ No newline at end of file diff --git a/atarashi/agents/tfidf.py b/atarashi/agents/tfidf.py index abfa3af1..07e676f3 100644 --- a/atarashi/agents/tfidf.py +++ b/atarashi/agents/tfidf.py @@ -22,7 +22,7 @@ __author__ = "Aman Jain" __email__ = "amanjain5221@gmail.com" -import argparse +import plac from enum import Enum import itertools import time @@ -151,24 +151,15 @@ def setSimAlgo(self, newAlgo): self.algo = newAlgo -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-s", "--tfidf_similarity", required=False, - default="ScoreSim", - choices=["CosineSim", "ScoreSim"], - help="Specify the similarity algorithm that you want") - parser.add_argument("inputFile", help="Specify the input file which needs to be scanned") - parser.add_argument("processedLicenseList", - help="Specify the processed license list file which contains licenses") - parser.add_argument("-v", "--verbose", help="increase output verbosity", - action="count", default=0) - args = parser.parse_args() - - tfidf_similarity = args.tfidf_similarity - filename = args.inputFile - licenseList = args.processedLicenseList - verbose = args.verbose +@plac.annotations( + filename = plac.Annotation("Specify the input file which needs to be scanned", metavar="inputFile"), + licenseList = plac.Annotation("Specify the processed license list file which contains licenses", "positional", None, str, metavar="processedLicenseList"), + tfidf_similarity = plac.Annotation("Specify the similarity algorithm that you want", "option", "s", str, ["CosineSim", "ScoreSim"], metavar="{CosineSim,ScoreSim}"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) + +def main(filename, licenseList, tfidf_similarity="ScoreSim", verbose=False): scanner = TFIDF(licenseList, verbose=verbose) if tfidf_similarity == "CosineSim": scanner.setSimAlgo(TFIDF.TfidfAlgo.cosineSim) @@ -176,3 +167,6 @@ def setSimAlgo(self, newAlgo): else: scanner.setSimAlgo(TFIDF.TfidfAlgo.scoreSim) print("License Detected using TF-IDF algorithm + sum score " + str(scanner.scan(filename))) + +if __name__ == "__main__": + plac.call(main) \ No newline at end of file diff --git a/atarashi/agents/wordFrequencySimilarity.py b/atarashi/agents/wordFrequencySimilarity.py index f365af64..fca62064 100644 --- a/atarashi/agents/wordFrequencySimilarity.py +++ b/atarashi/agents/wordFrequencySimilarity.py @@ -22,7 +22,7 @@ __author__ = "Aman Jain" __email__ = "amanjain5221@gmail.com" -import argparse +import plac import re from atarashi.agents.atarashiAgent import AtarashiAgent, exactMatcher @@ -79,19 +79,18 @@ def scan(self, filePath): return temp -if __name__ == "__main__": - print("The file has been called from main") - parser = argparse.ArgumentParser() - parser.add_argument("inputFile", help = "Specify the input file which needs to be scanned") - parser.add_argument("processedLicenseList", - help = "Specify the processed license list file which contains licenses") - parser.add_argument("-v", "--verbose", help = "increase output verbosity", - action = "count", default = 0) - - args = parser.parse_args() - filename = args.inputFile - licenseList = args.processedLicenseList - verbose = args.verbose +@plac.annotations( + filename = plac.Annotation("Specify the input file which needs to be scanned", metavar="inputFile"), + licenseList = plac.Annotation("Specify the processed license list file which contains licenses", "positional", None, str, metavar="processedLicenseList"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) + +def main(filename, licenseList, verbose=False): + print("The file has been called from main") scanner = WordFrequencySimilarity(licenseList, verbose = verbose) print("The result from Histogram similarity algo is ", scanner.scan(filename)) + + +if __name__ == "__main__": + plac.call(main) \ No newline at end of file diff --git a/atarashi/atarashii.py b/atarashi/atarashii.py index 2540491a..bb588a2a 100644 --- a/atarashi/atarashii.py +++ b/atarashi/atarashii.py @@ -18,7 +18,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ -import argparse +import plac import os import json from pkg_resources import resource_filename @@ -33,7 +33,9 @@ __version__ = "0.0.11" -def atarashii_runner(inputFile, processedLicense, agent_name, similarity="CosineSim", ngramJsonLoc=None, verbose=None): + + +def atarashii_runner(inputFile, agent_name, processedLicense, similarity="CosineSim", ngram_json=None, verbose=None): ''' :param inputFile: Input File for scanning of license :param processedLicense: Processed License List (CSV) path (Default path already provided) @@ -68,7 +70,7 @@ def atarashii_runner(inputFile, processedLicense, agent_name, similarity="Cosine print("Please choose similarity from {CosineSim,ScoreSim}") return -1 elif agent_name == "Ngram": - scanner = NgramAgent(processedLicense, ngramJson=ngramJsonLoc) + scanner = NgramAgent(processedLicense, ngramJson=ngram_json) if similarity == "CosineSim": scanner.setSimAlgo(NgramAgent.NgramAlgo.cosineSim) elif similarity == "DiceSim": @@ -84,43 +86,31 @@ def atarashii_runner(inputFile, processedLicense, agent_name, similarity="Cosine return result -def main(): - ''' - Calls atarashii_runner for each file in the folder/ repository specified by user - Prints the Input file path and the JSON output from atarashii_runner - ''' +@plac.annotations( + agent_name = plac.Annotation("Name of the agent that needs to be run", "option", "a", str, ["wordFrequencySimilarity", "DLD", "tfidf", "Ngram"], metavar="{wordFrequencySimilarity,DLD,tfidf,Ngram}"), + inputFile = plac.Annotation("Specify the input file path to scan", "positional", None, str, metavar="inputFile"), + processedLicense = plac.Annotation("Specify the location of processed license list file", "option", "l", str, metavar="PROCESSEDLICENSELIST"), + ngram_json = plac.Annotation("Specify the location of Ngram JSON (for Ngram agent only)", "option", "j"), + similarity = plac.Annotation("Specify the similarity algorithm that you want. First 2 are for TFIDF and last 3 are for Ngram", "option", "s", str, ["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], metavar="{ScoreSim,CosineSim,DiceSim,BigramCosineSim}"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) + +def evaluate(inputFile, processedLicense, ngram_json, agent_name="wordFrequencySimilarity", similarity="CosineSim", verbose=False): defaultProcessed = resource_filename("atarashi", "data/licenses/processedLicenses.csv") defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json") - parser = argparse.ArgumentParser() - parser.add_argument("inputFile", help="Specify the input file path to scan") - parser.add_argument("-l", "--processedLicenseList", required=False, - help="Specify the location of processed license list file") - parser.add_argument("-a", "--agent_name", required=True, - choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'], - help="Name of the agent that needs to be run") - parser.add_argument("-s", "--similarity", required=False, default="CosineSim", - choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], - help="Specify the similarity algorithm that you want." - " First 2 are for TFIDF and last 3 are for Ngram") - parser.add_argument("-j", "--ngram_json", required=False, - help="Specify the location of Ngram JSON (for Ngram agent only)") - parser.add_argument("-v", "--verbose", help="increase output verbosity", - action="count", default=0) - parser.add_argument('-V', '--version', action='version', version='%(prog)s ' + __version__) - args = parser.parse_args() - inputFile = args.inputFile - agent_name = args.agent_name - similarity = args.similarity - verbose = args.verbose - processedLicense = args.processedLicenseList - ngram_json = args.ngram_json if processedLicense is None: processedLicense = defaultProcessed if ngram_json is None: ngram_json = defaultJSON + if similarity is None: + similarity = "CosineSim" - result = atarashii_runner(inputFile, processedLicense, agent_name, similarity, ngram_json, verbose) + ''' + Calls atarashii_runner for each file in the folder/ repository specified by user + Prints the Input file path and the JSON output from atarashii_runner + ''' + result = atarashii_runner(inputFile, agent_name, processedLicense, similarity, ngram_json, verbose) if agent_name == "wordFrequencySimilarity": result = [{ "shortname": str(result), @@ -135,11 +125,17 @@ def main(): "sim_type": "dld", "description": "" }] - result = list(result) - result = {"file": os.path.abspath(inputFile), "results": result} - result = json.dumps(result, sort_keys=True, ensure_ascii=False, indent=4) - print(result + "\n") + + if result != -1: + result = list(result) + result = {"file": os.path.abspath(inputFile), "results": result} + result = json.dumps(result, sort_keys=True, ensure_ascii=False, indent=4) + print(result + "\n") + + +def main(): + plac.call(evaluate) if __name__ == '__main__': - main() + plac.call(evaluate) \ No newline at end of file diff --git a/atarashi/build_deps.py b/atarashi/build_deps.py index 170c165c..848903ac 100755 --- a/atarashi/build_deps.py +++ b/atarashi/build_deps.py @@ -22,7 +22,7 @@ __author__ = "Gaurav Mishra" __email__ = "gmishx@gmail.com" -import argparse +import plac import os import sys sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + '/../') @@ -40,7 +40,12 @@ The merged CSV is then processesed which is then used to create the Ngrams. """ -def download_dependencies(threads = os.cpu_count(), verbose = 0): +@plac.annotations( + threads = plac.Annotation("No of threads to use for download. Default: CPU count", "option", "t", int, metavar="THREADS"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) + +def download_dependencies(threads = os.cpu_count(), verbose = False): currentDir = os.path.dirname(os.path.abspath(__file__)) licenseListCsv = currentDir + "/data/licenses/licenseList.csv" processedLicenseListCsv = currentDir + "/data/licenses/processedLicenses.csv" @@ -59,14 +64,4 @@ def download_dependencies(threads = os.cpu_count(), verbose = 0): createNgrams(processedLicenseListCsv, ngramJsonLoc, threads, verbose) if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-t", "--threads", required = False, default = os.cpu_count(), - type = int, - help = "No of threads to use for download. Default: CPU count") - parser.add_argument("-v", "--verbose", help = "increase output verbosity", - action = "count", default = 0) - args = parser.parse_args() - threads = args.threads - verbose = args.verbose - - download_dependencies(threads, verbose) + plac.call(download_dependencies) \ No newline at end of file diff --git a/atarashi/evaluator/evaluator.py b/atarashi/evaluator/evaluator.py index 637fe3ad..d5bd97dd 100644 --- a/atarashi/evaluator/evaluator.py +++ b/atarashi/evaluator/evaluator.py @@ -23,8 +23,9 @@ from tqdm import tqdm import shutil import sys -import argparse -from multiprocessing import Pool +import plac +from multiprocessing import Pool, freeze_support +from functools import partial __author__ = "Ayush Bhardwaj" __email__ = "classicayush@gmail.com" @@ -78,10 +79,8 @@ def getCommand(agent_name, similarity): return -1 return command -filesScanned = 0 -match = 0 -def processFile(filepath): +def processFile(filepath, command): ''' processFile function runs the agent command on the bash/terminal and gets the result for the given file @@ -115,6 +114,8 @@ def processFile(filepath): return 0 except Exception: return 0 + else: + return 0 def evaluate(command): ''' @@ -133,7 +134,7 @@ def evaluate(command): fileList.append(filepath) with Pool(os.cpu_count()) as p: - result = list(tqdm(p.imap_unordered(processFile, fileList), total=len(fileList), unit="files")) + result = list(tqdm(p.imap_unordered(partial(processFile, command=command), fileList), total=len(fileList), unit="files")) # success_count is the count of successfully matched files success_count = sum(result) @@ -144,27 +145,21 @@ def evaluate(command): return (timeElapsed, accuracy) -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-a", "--agent_name", required=True, - choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'], help="Name of the agent that you want to evaluate") - parser.add_argument("-s", "--similarity", required=False, - default=" ", choices=["ScoreSim", "CosineSim", "DiceSim", " ", "BigramCosineSim"], help="Specify the similarity algorithm that you want to evaluate" - " First 2 are for TFIDF and last 3 are for Ngram") - args = parser.parse_args() - agent_name = args.agent_name - similarity = args.similarity - +@plac.annotations( + similarity = plac.Annotation("Specify the similarity algorithm that you want to evaluate. First 2 are for TFIDF and last 3 are for Ngram", "option", "s", str, ["ScoreSim", "CosineSim", "DiceSim", " ", "BigramCosineSim"], metavar="{ScoreSim,CosineSim,DiceSim, ,BigramCosineSim}"), + agent_name = plac.Annotation("Name of the agent that you want to evaluate", "option", "a", str, ["wordFrequencySimilarity", "DLD", "tfidf", "Ngram"], metavar="{wordFrequencySimilarity,DLD,tfidf,Ngram}") +) +def main(similarity, agent_name="wordFrequencySimilarity"): command = getCommand(agent_name, similarity) - timeElapsed, accuracy = evaluate(command) - print('\n' + ' ++++++++++++++++++ Result ++++++++++++++++++') - print(' ++++++++++++++++++++++++++++++++++++++++++++') - prGreen(" ---> Total time elapsed: " + str(round(timeElapsed, 2)) + " Seconds <---") - prGreen(" ---> Accuracy: " + str(round(accuracy, 2)) + "% <---") - print(' ++++++++++++++++++++++++++++++++++++++++++++') - print(' ++++++++++++++++++++++++++++++++++++++++++++') - + if command != -1: + timeElapsed, accuracy = evaluate(command) + print('\n' + ' ++++++++++++++++++ Result ++++++++++++++++++') + print(' ++++++++++++++++++++++++++++++++++++++++++++') + prGreen(" ---> Total time elapsed: " + str(round(timeElapsed, 2)) + " Seconds <---") + prGreen(" ---> Accuracy: " + str(round(accuracy, 2)) + "% <---") + print(' ++++++++++++++++++++++++++++++++++++++++++++') + print(' ++++++++++++++++++++++++++++++++++++++++++++') zf = zipfile.ZipFile("TestFiles.zip", "w") for dirname, subdirs, files in os.walk("TestFiles"): @@ -173,5 +168,8 @@ def evaluate(command): zf.write(os.path.join(dirname, filename)) zf.close() - shutil.rmtree('TestFiles') + shutil.rmtree('TestFiles') +if __name__ == "__main__": + freeze_support() + plac.call(main) \ No newline at end of file diff --git a/atarashi/imtihaan.py b/atarashi/imtihaan.py index 82992a21..3832fdbd 100644 --- a/atarashi/imtihaan.py +++ b/atarashi/imtihaan.py @@ -18,7 +18,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ -import argparse +import plac import os import sys from sys import exit @@ -30,28 +30,17 @@ __author__ = "Aman Jain" __email__ = "amanjain5221@gmail.com" -args = None -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("processedLicenseList", help="Specify the processed license list file which contains licenses") - parser.add_argument("AgentName", choices=['DLD', 'tfidf', 'Ngram'], - help="Name of the agent that needs to be run") - parser.add_argument("TestFiles", help="Specify the folder path that needs to be tested") - parser.add_argument("-s", "--similarity", required=False, default="CosineSim", - choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], - help="Specify the similarity algorithm that you want." - " First 2 are for TFIDF and last 3 are for Ngram") - parser.add_argument("-j", "--ngram_json", required=False, - help="Specify the location of Ngram JSON (for Ngram agent only)") - parser.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true") - args = parser.parse_args() - agent_name = args.AgentName - processedLicense = args.processedLicenseList - testFilePath = args.TestFiles - similarity = args.similarity - ngram_json = args.ngram_json +@plac.annotations( + ngram_json = plac.Annotation("Specify the location of Ngram JSON (for Ngram agent only)", "option", "j"), + processedLicense = plac.Annotation("Specify the processed license list file which contains licenses", metavar="processedLicenseList"), + agent_name = plac.Annotation("Name of the agent that needs to be run", "positional", None, str, ["DLD", "tfidf", "Ngram"], metavar="{DLD,tfidf,Ngram}"), + testFilePath = plac.Annotation("Specify the folder path that needs to be tested", metavar="TestFiles"), + similarity = plac.Annotation("Specify the similarity algorithm that you want. First 2 are for TFIDF and last 3 are for Ngram", "option", "s", str, ["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], metavar="{ScoreSim,CosineSim,DiceSim,BigramCosineSim}"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) +def main(ngram_json, processedLicense, agent_name, testFilePath, similarity="CosineSim", verbose=False): pathname = os.path.dirname(sys.argv[0]) testFilePath = os.path.abspath(testFilePath) @@ -85,4 +74,7 @@ print(filepath.split('tests/')[1]) actual_license = filepath.split('/')[-1].split('.c')[0] result = scanner.scan(filepath) - print("Actual License: " + actual_license + "\nResult: " + str(result) + "\n") + print("Actual License: " + actual_license + "\nResult: " + str(result) + "\n") + +if __name__ == "__main__": + plac.call(main) \ No newline at end of file diff --git a/atarashi/libs/commentPreprocessor.py b/atarashi/libs/commentPreprocessor.py index a208ba69..91365db0 100644 --- a/atarashi/libs/commentPreprocessor.py +++ b/atarashi/libs/commentPreprocessor.py @@ -19,7 +19,7 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ -import argparse +import plac from nirjas import extract import json import os @@ -31,7 +31,6 @@ __author__ = "Aman Jain" __email__ = "amanjain5221@gmail.com" -args = None def licenseComment(data): list = ['source', 'free', 'under','use', 'copyright', 'grant', 'software', 'license','licence', 'agreement', 'distribute', 'redistribution', 'liability', 'rights', 'reserved', 'general', 'public', 'modify', 'modified', 'modification', 'permission','permitted' 'granted', 'distributed', 'notice', 'distribution', 'terms', 'freely', 'licensed', 'merchantibility','redistributed', 'see', 'read', '(c)', 'copying', 'legal', 'licensing', 'spdx'] @@ -139,20 +138,15 @@ def extract(inputFile): return outputFile -if __name__ == "__main__": - print("The file has been run directly") - parser = argparse.ArgumentParser() - parser.add_argument("-p", "--process", required=True, - choices=['preprocess', 'extract'], - help="Which process you want to run") - parser.add_argument("inputFile", help="Specify the input file which needs to be processed") - parser.add_argument("-v", "--verbose", help="increase output verbosity", - action="count", default=0) - args = parser.parse_args() - process = args.process - inputFile = args.inputFile - verbose = args.verbose +@plac.annotations( + process = plac.Annotation("Which process you want to run", "option", "p", str, ["preprocess", "extract"], metavar="{preprocess,extract}"), + inputFile = plac.Annotation("Specify the input file which needs to be processed"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) + +def main(process, inputFile, verbose=False): + print("The file has been run directly") if process == "extract": tempLoc = str(CommentPreprocessor.extract(inputFile)) print("Temporary output file path: ", tempLoc) @@ -162,3 +156,7 @@ def extract(inputFile): with open(inputFile) as file: data = file.read().replace('\n', ' ') print("Preprocessed data is: ", str(CommentPreprocessor.preprocess(data))) + + +if __name__ == "__main__": + plac.call(main) \ No newline at end of file diff --git a/atarashi/libs/initialmatch.py b/atarashi/libs/initialmatch.py index 1c47bb55..a2c0024a 100644 --- a/atarashi/libs/initialmatch.py +++ b/atarashi/libs/initialmatch.py @@ -130,4 +130,4 @@ def initial_match(filePath, processedData, licenses): }) matches = list(itertools.chain(spdx_identifiers, exact_match_header, exact_match_fulltext, header_sim_match[:5])) - return matches + return matches \ No newline at end of file diff --git a/atarashi/libs/license_clustering.py b/atarashi/libs/license_clustering.py index f3f603ef..0e41477d 100644 --- a/atarashi/libs/license_clustering.py +++ b/atarashi/libs/license_clustering.py @@ -18,7 +18,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ -import argparse +import plac import time from atarashi.libs.utils import cosine_similarity @@ -120,17 +120,16 @@ def cluster_licenses(licenseList, verbose=0): return result -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("processedLicenseList", help="Specify the processed license list file") - parser.add_argument("-v", "--verbose", help="increase output verbosity", - action="count", default=0) - args = parser.parse_args() - - licenseList = args.processedLicenseList - verbose = args.verbose +@plac.annotations( + licenseList = plac.Annotation("Specify the processed license list file", "positional", None, str, metavar="processedLicenseList"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) +def main(licenseList, verbose=False): start = time.time() cluster = cluster_licenses(licenseList, verbose) print("Time taken is ", str(time.time() - start)) print(cluster) + +if __name__ == "__main__": + plac.call(main) \ No newline at end of file diff --git a/atarashi/libs/ngram.py b/atarashi/libs/ngram.py index cf112211..597dec8b 100644 --- a/atarashi/libs/ngram.py +++ b/atarashi/libs/ngram.py @@ -18,7 +18,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ -import argparse +import plac import json from multiprocessing import Pool as ThreadPool import os @@ -137,23 +137,15 @@ def createNgrams(licenseList, ngramJsonLoc, threads=os.cpu_count(), verbose=0): return ngramJsonLoc, matched_output, no_keyword_matched -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("processedLicenseList", help="Specify the processed license list file") - parser.add_argument("ngramJson", help="Specify the location to store " - "NGRAM JSON") - parser.add_argument("-t", "--threads", required=False, default=os.cpu_count(), - type=int, - help="No of threads to use for download. Default: CPU count") - parser.add_argument("-v", "--verbose", help="increase output verbosity", - action="count", default=0) - args = parser.parse_args() - - licenseList = args.processedLicenseList - threads = args.threads - ngramJsonLoc = args.ngramJson - verbose = args.verbose +@plac.annotations( + licenseList = plac.Annotation("Specify the processed license list file", "positional", None, str, metavar="processedLicenseList"), + ngramJsonLoc = plac.Annotation("Specify the location to store NGRAM JSON", metavar="ngramJson"), + threads = plac.Annotation("No of threads to use for download. Default: CPU count", "option", "t", int), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) + +def main(licenseList, ngramJsonLoc, threads=os.cpu_count(), verbose=False): createNgrams(licenseList, ngramJsonLoc, threads, verbose=verbose) if verbose > 0: print(matched_output) @@ -168,3 +160,6 @@ def createNgrams(licenseList, ngramJsonLoc, threads=os.cpu_count(), verbose=0): 4. store the unique ngrams in a file (maybe csv or any file) ''' + +if __name__ == '__main__': + plac.call(main) \ No newline at end of file diff --git a/atarashi/libs/utils.py b/atarashi/libs/utils.py index b3b5d1a1..dedb9177 100644 --- a/atarashi/libs/utils.py +++ b/atarashi/libs/utils.py @@ -66,4 +66,4 @@ def cosine_similarity(a, b): if temp == 0: return 0 else: - return dot_product / temp + return dot_product / temp \ No newline at end of file diff --git a/atarashi/license/licenseDownloader.py b/atarashi/license/licenseDownloader.py index ad3f0d5a..0d17af9d 100644 --- a/atarashi/license/licenseDownloader.py +++ b/atarashi/license/licenseDownloader.py @@ -18,7 +18,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ -import argparse +import plac from builtins import staticmethod import json from multiprocessing import Pool as ThreadPool @@ -168,14 +168,13 @@ def fetch_exceptional_license(license): return pd.DataFrame(licenseDict, columns=csvColumns) -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-t", "--threads", required=False, default=os.cpu_count(), - type=int, - help="No of threads to use for download. Default: CPU count") - parser.add_argument("-f", "--force", action="store_true", - help="Force download regardless of existing list") - args = parser.parse_args() - threads = args.threads - force = args.force +@plac.annotations( + force = plac.Annotation("Force download regardless of existing list", "flag", "f"), + threads = plac.Annotation("No of threads to use for download. Default: CPU count", "option", "t", int) +) + +def main(force, threads): print(LicenseDownloader.download_license(threads, force)) + +if __name__ == "__main__": + plac.call(main) \ No newline at end of file diff --git a/atarashi/license/licenseLoader.py b/atarashi/license/licenseLoader.py index 81fb4825..82424910 100644 --- a/atarashi/license/licenseLoader.py +++ b/atarashi/license/licenseLoader.py @@ -36,4 +36,4 @@ def fetch_licenses(licenseList): # common ''' licenseDataFrame = pd.read_csv(licenseList) licenseDataFrame = licenseDataFrame.replace(np.nan, '', regex = True) - return licenseDataFrame + return licenseDataFrame \ No newline at end of file diff --git a/atarashi/license/licensePreprocessor.py b/atarashi/license/licensePreprocessor.py index c2efe7c9..722d24d6 100644 --- a/atarashi/license/licensePreprocessor.py +++ b/atarashi/license/licensePreprocessor.py @@ -22,7 +22,7 @@ __author__ = "Gaurav Mishra" __email__ = "gmishx@gmail.com" -import argparse +import plac import os from pathlib import Path @@ -31,8 +31,6 @@ from atarashi.libs.commentPreprocessor import CommentPreprocessor from atarashi.license.licenseLoader import LicenseLoader -args = None - class LicensePreprocessor(object): @@ -103,15 +101,19 @@ def create_processed_file(licenseList, processedFile, verbose=0): return processedFile -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("licenseList", help="Specify the license list file which contains licenses") - parser.add_argument("processedFile", help="Specify the destination to store processed list") - parser.add_argument("-v", "--verbose", help="increase output verbosity", - action="count", default=0) - args = parser.parse_args() +@plac.annotations( + licenseList = plac.Annotation("Specify the license list file which contains licenses", "positional"), + processedFile = plac.Annotation("Specify the destination to store processed list", "positional"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) + + +def main(licenseList, processedFile, verbose=False): licenseList = os.path.abspath(args.licenseList) processedFile = os.path.abspath(args.processedFile) - verbose = args.verbose print("Use: " + LicensePreprocessor.create_processed_file(licenseList, processedFile, verbose)) + + +if __name__ == "__main__": + plac.call(main) \ No newline at end of file diff --git a/atarashi/license/license_merger.py b/atarashi/license/license_merger.py index 182b2c66..8720f8e3 100644 --- a/atarashi/license/license_merger.py +++ b/atarashi/license/license_merger.py @@ -22,7 +22,7 @@ __author__ = "Aman Jain" __email__ = "amanjain5221@gmail.com" -import argparse +import plac import os from pathlib import Path @@ -95,18 +95,18 @@ def license_merger(licenseList, requiredlicenseList, verbose=0): return str(Path(os.path.abspath(requiredlicenseList))) -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("licenseList", help="Specify the license list file of fossology which contains licenses") - parser.add_argument("requiredlicenseList", help="Specify the license list file in which you want to merge licenses") - parser.add_argument("-v", "--verbose", help="increase output verbosity", - action="count", default=0) - args = parser.parse_args() +@plac.annotations( + licenseList = plac.Annotation("Specify the license list file of fossology which contains licenses", "positional"), + requiredlicenseList = plac.Annotation("Specify the license list file in which you want to merge licenses", "positional"), + verbose = plac.Annotation("increase output verbosity", "flag", "v") +) - licenseList = args.licenseList - requiredlicenseList = args.requiredlicenseList - verbose = args.verbose +def main(licenseList, requiredlicenseList, verbose=False): filePath = license_merger(licenseList, requiredlicenseList, verbose) if filePath: print("Updated", filePath) + + +if __name__ == "__main__": + plac.call(main) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 716e93dc..cc4323cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,5 +10,6 @@ requires = [ "textdistance>=3.0.3", "pyxDamerauLevenshtein>=1.5", "nirjas>=0.0.3", - "urllib3>=1.24.1" + "urllib3>=1.24.1", + "plac>=1.2.0" ] diff --git a/requirements.txt b/requirements.txt index b90ce4a7..b1cea506 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ spacy>=2.0.11 textdistance>=3.0.3 setuptools>=39.2.0 nirjas>=0.0.3 -urllib3>=1.24.1 \ No newline at end of file +urllib3>=1.24.1 +plac>=1.2.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 5177656b..3f657b87 100755 --- a/setup.py +++ b/setup.py @@ -55,7 +55,8 @@ def read(fname): 'tqdm>=4.23.4', 'pandas>=0.23.1', 'urllib3>=1.24.1', - 'nirjas>=0.0.3' + 'nirjas>=0.0.3', + 'plac>=1.2.0' ] requirements = [ @@ -68,7 +69,8 @@ def read(fname): 'textdistance>=3.0.3', 'pyxDamerauLevenshtein>=1.5', 'urllib3>=1.24.1', - 'nirjas>=0.0.3' + 'nirjas>=0.0.3', + 'plac>=1.2.0' ] class BuildAtarashiDependencies(distutils.cmd.Command): @@ -153,4 +155,4 @@ def run(self): }, ) -setup(**metadata) +setup(**metadata) \ No newline at end of file