diff --git a/README.md b/README.md index 8b093e0..17fc925 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,37 @@ pip install scoss You can use SCOSS as a Command Line Interface, or a library in your project, or web-app interface ### Command Line Interface (CLI) +See the documentation by passing the ```--help``` argument. +``` +scoss --help +Usage: scoss [OPTIONS] + +Options: + -i, --input-dir TEXT Input directory. [required] + -o, --output-dir TEXT Output directory. + -tc, --threshold-combination [AND|OR] + AND: All metrics are greater than threshold. + OR: At least 1 metric is greater than + threshold. + + -mo, --moss FLOAT RANGE Use moss metric and set up moss threshold. + -co, --count-operator FLOAT RANGE + Use count operator metric and set up count + operator threshold. + + -so, --set-operator FLOAT RANGE + Use set operator metric and set up set + operator threshold. + + -ho, --hash-operator FLOAT RANGE + Use hash operator metric and set up hash + operator threshold. + + --help Show this message and exit. +``` +To get a plagiarism report for a directory containing source code files, add the ```-i/--input-dir``` option. Add at least 1 similarity metric from [```-mo/--moss```, ```-co/--count-operator```, ```-so/--set-operator```, ```-ho/--hash-operator```] and its threshold (in range [0,1]). If using 2 or more metrics, you need to define how they should be combined using ```-tc/--threshold-combination``` (```AND``` will be used by default). -Comming soon... - +Basic command: ```scoss -i path/to/source_code_dir/ -tc OR -co 0.1 -ho 0.1 -mo 0.1 -o another_path/to/plagiarism_report/``` ### Using as a library 1. Define a `Scoss` object and register some metrics: @@ -63,4 +91,4 @@ This project is in development, if you find any issues, please create an issue [ ## Acknowledgements This project is sponsored and led by Prof. Do Phan Thuan, Hanoi University of Science and Technology. -A part of this code adapts this source code https://github.com/soachishti/moss.py as baseline for `SMoss`. 
\ No newline at end of file +A part of this code adapts this source code https://github.com/soachishti/moss.py as baseline for `SMoss`. diff --git a/scoss/assets/comparison.html b/scoss/assets/comparison.html deleted file mode 100644 index 4ac1d00..0000000 --- a/scoss/assets/comparison.html +++ /dev/null @@ -1,52 +0,0 @@ - - - - - - Result - - - -

File Result

- - - - - - - - - - - - -
source1source2{{metric}}
{{file1}}{{file2}}{{score}}
- - - - - - - - - - -
{{file1}}{{file2}}
{{data1}}{{data2}}
- - diff --git a/scoss/assets/smoss_comparison.html b/scoss/assets/smoss_comparison.html deleted file mode 100644 index 82264d6..0000000 --- a/scoss/assets/smoss_comparison.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - Matches for submission/a01-sample.py and submission/a01-sample.py - - - - - - - diff --git a/scoss/assets/summary.html b/scoss/assets/summary.html deleted file mode 100644 index 613c352..0000000 --- a/scoss/assets/summary.html +++ /dev/null @@ -1,72 +0,0 @@ - - - - - - Result - - - - - - - -

Result

- - - - {%for head in heads%} - - {%endfor%} - - {%for link in links%} - - - - {%for metric in link['scores'] %} - {% if 'alignments' in link %} - - {% else %} - - {% endif %} - {%endfor%} - - {%endfor%} -
{{head}}
{{link['source1']}}{{link['source2']}}{{link['scores'][metric]}}{{link['scores'][metric]}}
- - - diff --git a/scoss/cli.py b/scoss/cli.py new file mode 100644 index 0000000..cfcd37c --- /dev/null +++ b/scoss/cli.py @@ -0,0 +1,44 @@ +import scoss + +import click + +@click.command() +@click.option( + '--input-dir', '-i', required=True, + help='Input directory.' +) +@click.option( + '--output-dir', '-o', + help='Output directory.' +) +@click.option( + '--threshold-combination', '-tc', + type=click.Choice(['AND','OR'], case_sensitive=False), + help='AND: All metrics are greater than threshold.\nOR: At least 1 metric is greater than threshold.' +) +@click.option( + '--moss', '-mo', type=click.FloatRange(0,1), + help='Use moss metric and set up moss threshold.' +) +@click.option( + '--count-operator', '-co', type=click.FloatRange(0,1), + help='Use count operator metric and set up count operator threshold.' +) +@click.option( + '--set-operator', '-so', type=click.FloatRange(0,1), + help='Use set operator metric and set up set operator threshold.' +) +@click.option( + '--hash-operator', '-ho', type=click.FloatRange(0,1), + help='Use hash operator metric and set up hash operator threshold.' +) +def scoss_command(input_dir, output_dir, threshold_combination,\ + moss, count_operator, set_operator, hash_operator): + if not output_dir: + output_dir = './' + + scoss.get_all_plagiarism(input_dir, output_dir, threshold_combination, + moss, count_operator, set_operator, hash_operator) + +if __name__ == '__main__': + scoss_command() \ No newline at end of file diff --git a/scoss/html_template.py b/scoss/html_template.py new file mode 100644 index 0000000..1c4f84a --- /dev/null +++ b/scoss/html_template.py @@ -0,0 +1,176 @@ +COMPARISON_HTML=r''' + + + + + Result + + + +

File Result

+ + + + + + + + + + + + +
source1source2{{metric}}
{{file1}}{{file2}}{{score}}
+ + + + + + + + + + +
{{file1}}{{file2}}
{{data1}}{{data2}}
+ +''' + +SMOSS_COMPARISON_HTML=r''' + + + Matches for submission/a01-sample.py and submission/a01-sample.py + + + + + + +''' + +SUMMARY_HTML=r''' + + + + + Result + + + + + + + +

Result

+ + + + {%for head in heads%} + + {%endfor%} + + {%for link in links%} + + + + {%for metric in link['scores'] %} + {% if metric in link['alignments'] %} + + {% else %} + + {% endif %} + {%endfor%} + + {%endfor%} +
{{head}}
{{link['source1']}}{{link['source2']}}{{link['scores'][metric]}}{{link['scores'][metric]}}
+ + +''' \ No newline at end of file diff --git a/scoss/main.py b/scoss/main.py index 1e2d511..d8bf57b 100644 --- a/scoss/main.py +++ b/scoss/main.py @@ -2,8 +2,14 @@ from scoss.metrics.metric_list import MetricList from scoss.my_source import MySource - +from scoss.scoss import Scoss +from scoss.html_template import * +from scoss import smoss +from jinja2 import Environment import os +import sys +import csv +from tqdm import tqdm def check_similarity(metric_name, src_str_1, src_str_2, lang): metric = MetricList([metric_name]) @@ -21,4 +27,242 @@ def align_source(metric_name, src_str_1, src_str_2, lang): alignments = metric.align_source(src1, src2) return alignments[metric_name] +# def get_all_files(contest_path): +# subdirs = [x[0] for x in os.walk(contest_path)] +# all_files = {} +# for i in range(1, len(subdirs)): +# listOfFiles = [] +# subdir = subdirs[i] +# subdir_name = os.path.basename(os.path.normpath(subdir)) +# for (dirpath, _, filenames) in os.walk(subdir): +# listOfFiles += [os.path.join(dirpath, file) for file in filenames] +# for f in listOfFiles: +# if os.stat(f).st_size == 0: +# continue +# ext = f.split('.')[-1] +# if (subdir_name, ext) in all_files: +# all_files[subdir_name, ext].append(f) +# else: +# all_files[subdir_name, ext] = [f] +# return all_files + +def create_dir(filepath): + wdir = filepath + if os.path.isfile(os.path.abspath(filepath)): + wdir = os.path.dirname(filepath) + if not os.path.exists(wdir): + try: + os.makedirs(wdir) + except OSError: # Guard against race condition + pass + +def get_all_files(dir_path): + all_files = {} + for f in os.listdir(dir_path): + f = os.path.join(dir_path, f) + if os.stat(f).st_size == 0: + continue + ext = f.split('.')[-1] + if ext in all_files: + all_files[ext].append(f) + else: + all_files[ext] = [f] + return all_files + +def get_all_plagiarism(input_dir, output_dir, threshold_combination_type='AND', moss_threshold=None, + count_operator_threshold=None, set_operator_threshold=None, 
hash_operator_threshold=None): + if moss_threshold == None and count_operator_threshold == None and \ + set_operator_threshold == None and hash_operator_threshold == None: + print('Please choose at least 1 metric from [moss, count_operator, set_operator, hash_operator]', file=sys.stderr) + sys.exit(-1) + if not threshold_combination_type: + threshold_combination_type = 'AND' + all_files = get_all_files(input_dir) + input_dir_name = os.path.basename(os.path.normpath(input_dir)) + # output_dir = os.path.join(output_dir, '{}_plagiarism_report/'.format(input_dir_name)) + result_dir = os.path.join(output_dir, '{}_source_comparisons/'.format(input_dir_name)) + create_dir(result_dir) + + heads = None + all_links = [] + for ext, file_list in all_files.items(): + cur_dir_path = os.path.join(result_dir, ext) + create_dir(cur_dir_path) + scoss_matches_dict = {} + if count_operator_threshold != None or \ + set_operator_threshold != None or \ + hash_operator_threshold != None: + print('Getting scoss plagiarism for {} language in {}...'.format(ext, input_dir_name)) + try: + sc = Scoss(lang=ext) + except ValueError as er: + print(er) + continue + if count_operator_threshold != None: + sc.add_metric('count_operator', threshold=count_operator_threshold) + if set_operator_threshold != None: + sc.add_metric('set_operator', threshold=set_operator_threshold) + if hash_operator_threshold != None: + sc.add_metric('hash_operator', threshold=hash_operator_threshold) + for f in file_list: + user_filename = os.path.basename(f) + # problem_dir = os.path.basename(os.path.dirname(f)) + sc.add_file(f, user_filename) + sc.run() + if threshold_combination_type.upper() == 'AND': + scoss_matches = sc.get_matches(or_thresholds=False, and_thresholds=True) + else: # Be careful + scoss_matches = sc.get_matches(or_thresholds=True, and_thresholds=False) + for match in scoss_matches: + if match['source1'] < match['source2']: + scoss_matches_dict[match['source1'], match['source2']] = match['scores'] + else: 
+ scoss_matches_dict[match['source2'], match['source1']] = match['scores'] + print('Successfully getting scoss plagiarism for {} language in {}!'.format(ext, input_dir_name)) + + smoss_matches_dict = {} + if moss_threshold != None: + print('Getting smoss plagiarism for {} language in {}...'.format(ext, input_dir_name)) + try: + sm = smoss.SMoss(lang=ext) + except: + print('Unsupported languge: ', ext) + continue + sm.set_threshold(moss_threshold) + for f in file_list: + user_filename = os.path.basename(f) + # problem_dir = os.path.basename(os.path.dirname(f)) + sm.add_file(f, user_filename) + sm.run() + smoss_matches = sm.get_matches() + for match in smoss_matches: + if match['source1'] < match['source2']: + smoss_matches_dict[match['source1'], match['source2']] = match['scores'] + else: + smoss_matches_dict[match['source2'], match['source1']] = match['scores'] + print('Successfully getting smoss plagiarism for {} language in {}!'.format(ext, input_dir_name)) + ################################################################################### + all_matches_dict = {} + # print('scoss_matches_dict = ', scoss_matches_dict) + # print('smoss_matches_dict = ', smoss_matches_dict) + if not scoss_matches_dict and not smoss_matches_dict: + continue + elif not scoss_matches_dict or not smoss_matches_dict: + scoss_matches_dict.update(smoss_matches_dict) + all_matches_dict = scoss_matches_dict + else: + for k, v in scoss_matches_dict.items(): + if k in smoss_matches_dict: + all_matches_dict[k] = v + all_matches_dict[k].update(smoss_matches_dict[k]) + elif threshold_combination_type == 'AND': + continue + else: # OR_threshold + all_matches_dict[k] = v + all_matches_dict[k].update({'moss_score':0}) + for k, v in all_matches_dict.items(): + scores = list(all_matches_dict[k].values()) + all_matches_dict[k]['average_score'] = sum(scores) / len(scores) + + # Sort all_matches_dict by average_score + all_matches_dict = {k: v for k, v in sorted(all_matches_dict.items(), key=lambda 
item: -item[1]['average_score'])} + # all_matches_dict = sorted(all_matches_dict, key = lambda i: float(i['scores']['average_score']), reverse=True) + values_view = all_matches_dict.values() + value_iterator = iter(values_view) + first_score = next(value_iterator) + heads = ['source1', 'source2'] + list(first_score.keys()) + links = [] + for (src1, src2), scores in tqdm(all_matches_dict.items(), desc='Creating comparison reports', unit=' comparisons'): + dic = {} + dic['source1'] = src1 + dic['source2'] = src2 + dic['scores'] = {} + dic['alignments'] = {} + # print('scores =', scores) + for metric in scores.keys(): + C = int(scores[metric]*255) + R = C + G = 0 + B = 0 + span = ''.format(R,G,B) + str(format(scores[metric]*100, '.2f')) +'%' + if metric == 'average_score': + dic['scores'][metric] = span + links.append(dic) + continue + elif metric == 'moss_score': + try: + compe = sm.get_matches_file()[src1][src2] + except KeyError: + compe = '' + else: + source_str1 = sc.get_sources()[src1].source_str + source_str2 = sc.get_sources()[src2].source_str + data1 = [i.replace('<', '<').replace('>', '>') for i in source_str1.split('\n')] + data2 = [i.replace('<', '<').replace('>', '>') for i in source_str2.split('\n')] + html1 = '' + html2 = '' + alignment = align_source(metric, source_str1, source_str2, ext) + for line in alignment: + if line[0] == -1 : + html1 += '
  
' + temp2 = '
'+  str(line[1])+ '	'+  data2[line[1]-1] + '
' + html2 += temp2 + elif line[1] == -1 : + html2 += '
  
' + temp1 = '
'+  str(line[0])+ '	'+  data1[line[0]-1] + '
' + html1 += temp1 + elif line[0] != -1 and line[0] != -1: + if line[2] >=0.25 and line[2] <0.75: + temp1 = '
'+  str(line[0])+ '	'+  data1[line[0]-1] + '
' + html1 += temp1 + temp2 = '
'+  str(line[1])+ '	'+  data2[line[1]-1] + '
' + html2 += temp2 + elif line[2] >= 0.75: + temp1 = '
'+  str(line[0])+ '	'+  data1[line[0]-1] + '
' + html1 += temp1 + temp2 = '
'+  str(line[1])+ '	'+  data2[line[1]-1] + '
' + html2 += temp2 + else: + temp1 = '
'+  str(line[0])+ '	'+  data1[line[0]-1] + '
' + html1 += temp1 + temp2 = '
'+  str(line[1])+ '	'+  data2[line[1]-1] + '
' + html2 += temp2 + compe = Environment().from_string(COMPARISON_HTML).render(file1=match['source1'], file2=match['source2'], \ + metric=metric, score=span, \ + data1=html1, data2=html2) + name_file = '{}_{}_{}.html'.format(src1, src2, metric) + # create_dir(name_file) + with open(os.path.join(cur_dir_path, name_file), 'w', encoding='utf-8') as file: + file.write(compe) + dic['scores'][metric] = span + dic['alignments'][metric] = '{}_source_comparisons/{}/{}'.format(input_dir_name, ext, name_file) + links.append(dic) + all_links += links + + # page = Environment().from_string(SUMMARY_HTML).render(heads=heads, links=links) + # with open(os.path.join(output_dir, 'summary_{}.html'.format(ext)), 'w') as file: + # file.write(page) + + if not heads: + print("There is no plagiarism activities!") + sys.exit(0) + all_links = sorted(all_links, key = lambda i: float(i['scores']['average_score'].split('">')[-1].split('%')[0]), reverse=True) + page = Environment().from_string(SUMMARY_HTML).render(heads=heads, links=all_links) + with open(os.path.join(output_dir, '{}_summary.html'.format(input_dir_name)), 'w') as file: + file.write(page) + + with open(os.path.join(output_dir,'{}_summary.csv'.format(input_dir_name)), mode='w', newline='') as f: + writer = csv.writer(f) + writer.writerow(heads) + for link in all_links: + row = [link['source1'], link['source2']] + for k, v in link['scores'].items(): + row.append(v.split('">')[-1].split('%')[0]+'%') + writer.writerow(row) + + + + + + diff --git a/scoss/my_source.py b/scoss/my_source.py index f43a471..bc11936 100644 --- a/scoss/my_source.py +++ b/scoss/my_source.py @@ -25,7 +25,7 @@ def from_file(cls, filepath, lang=None, name=None): return the Source object :rtype: Source """ - with open(filepath) as f: + with open(filepath, encoding='utf-8') as f: source_str = f.read() if lang is None: ext = os.path.splitext(filepath)[1][1:] diff --git a/scoss/scoss.py b/scoss/scoss.py index a689375..4bade93 100644 --- a/scoss/scoss.py +++ 
b/scoss/scoss.py @@ -4,6 +4,7 @@ from scoss.metrics.token_based_metric import * from scoss.utils import check_language from scoss.my_source import MySource +from scoss.html_template import * from jinja2 import Environment from collections import OrderedDict, defaultdict @@ -246,7 +247,7 @@ def save_as_html(self, output_dir='./', or_thresholds=False, and_thresholds=True trimmed: output_dir: save all html files in output_dir, if output_dir=None -> donot save Return: - ret: A dictionary of html files. example: {'summary.html': HTML1, 'match1.html': HTML2, ....} + ret: A dictionary of html files. example: {'summary.html': SUMMARY_HTML, 'match1.html': COMPARISON_HTML, ....} """ def score_color(score): @@ -258,13 +259,6 @@ def score_color(score): R, G, B) + str(format(score*100, '.2f')) + '%' return span - HTML1 = "" - HTML2 = "" - with open('./scoss/assets/summary.html', mode='r') as f: - HTML1 = f.read() - with open('./scoss/assets/comparison.html', mode='r') as f: - HTML2 = f.read() - print("Running...") matches = self.get_matches(or_thresholds, and_thresholds) @@ -283,7 +277,7 @@ def score_color(score): links = matches print("Saving summary...") - page = Environment().from_string(HTML1).render(heads=heads, links=links) + page = Environment().from_string(COMPARISON_HTML).render(heads=heads, links=links) with open(os.path.join(output_dir, 'summary.html'), 'w') as file: file.write(page) @@ -369,14 +363,17 @@ def score_color(score): span = score_color(match['scores'][metric]) dic['scores'][metric] = span dic['alignments'][metric] = name_file - compe = Environment().from_string(HTML2).render(file1=match['source1'], file2=match['source2'], + compe = Environment().from_string(COMPARISON_HTML).render(file1=match['source1'], file2=match['source2'], metric=metric, score=span, data1=html1, data2=html2) with open(os.path.join(output_dir, name_file), 'w') as file: file.write(compe) links.append(dic) - page = Environment().from_string(HTML1).render(heads=heads, links=links) + page 
= Environment().from_string(SUMMARY_HTML).render(heads=heads, links=links) with open(os.path.join(output_dir, 'summary.html'), 'w') as file: file.write(page) print("Done!") + + def get_sources(self): + return self.__sources \ No newline at end of file diff --git a/scoss/smoss.py b/scoss/smoss.py index 64a9d7b..c396e8e 100644 --- a/scoss/smoss.py +++ b/scoss/smoss.py @@ -19,6 +19,7 @@ import requests from scoss.utils import check_language +from scoss.html_template import * try: from urllib.request import urlopen @@ -218,9 +219,8 @@ def parse_html_table(self, url): tds = soup.find_all('td') i = 0 self.__matches = [] - with open('./scoss/assets/smoss_comparison.html', mode='r') as f: - big_html_string = f.read() - bases = big_html_string.split('<<<>>>') + + bases = SMOSS_COMPARISON_HTML.split('<<<>>>') while i < len(tds): score_str = tds[i].contents[0].contents[0][-4:-2] score_str = ''.join(c for c in score_str if c.isdigit()) @@ -271,7 +271,7 @@ def parse_html_table(self, url): else: self.__matches_file[src2] = {src1:match_comparison} # with open(os.path.join('./tests/smoss_result/', 'big_all_html.html'), 'w') as file: - # file.write(big_html_string) + # file.write(SMOSS_COMPARISON_HTML) i += 3 def upload_file(self, s, src, mask, file_id, on_send): @@ -355,9 +355,6 @@ def save_html(url, file_name): def save_as_html(self, output_dir=None): if self.__state == SMossState.INIT: self.run() - HTML1 = "" - with open('./scoss/assets/summary.html', mode='r') as f: - HTML1 = f.read() if len(self.__matches) != 0: heads = [x for x in self.__matches[0].keys() if x != 'link'] @@ -379,7 +376,7 @@ def save_as_html(self, output_dir=None): dic['scores'][metric] = [name_file, span] links.append(dic) self.process_url(match['link'], name_file, output_dir) - page = Environment().from_string(HTML1).render(heads=heads, links=links) + page = Environment().from_string(SUMMARY_HTML).render(heads=heads, links=links) with open(os.path.join(output_dir, 'summary.html'), 'w') as file: 
file.write(page) diff --git a/scoss/utils/utils.py b/scoss/utils/utils.py index 96308f4..50c181e 100644 --- a/scoss/utils/utils.py +++ b/scoss/utils/utils.py @@ -15,5 +15,5 @@ def check_language(lang): return lang if lang in LANG_MAP: return LANG_MAP[lang] - raise ValueError("Unsupported languge") + raise ValueError("Unsupported languge: {}".format(lang)) diff --git a/setup.py b/setup.py index c297cd6..7f72488 100644 --- a/setup.py +++ b/setup.py @@ -4,9 +4,9 @@ long_description = fh.read() PROJECT_URLS = { - 'Bug Tracker': 'https://github.com/ngocjr7/geneticpython/issues', - 'Documentation': 'https://github.com/ngocjr7/geneticpython/blob/master/README.md', - 'Source Code': 'https://github.com/ngocjr7/geneticpython' + 'Bug Tracker': 'https://github.com/ngocjr7/scoss/issues', + 'Documentation': 'https://github.com/ngocjr7/scoss/blob/master/README.md', + 'Source Code': 'https://github.com/ngocjr7/scoss' } with open('requirements.txt') as f: @@ -21,6 +21,10 @@ author_email='ngocjr7@gmail.com', project_urls=PROJECT_URLS, version='0.0.2', + entry_points=''' + [console_scripts] + scoss=scoss.cli:scoss_command + ''', packages=find_packages(), install_requires=install_requires, python_requires='>=3.6')