diff --git a/README.md b/README.md
index 8b093e0..17fc925 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,37 @@ pip install scoss
You can use SCOSS as a Command Line Interface, or a library in your project, or web-app interface
### Command Line Interface (CLI)
+See the documentation by passing the ```--help``` argument.
+```
+scoss --help
+Usage: scoss [OPTIONS]
+
+Options:
+ -i, --input-dir TEXT Input directory. [required]
+ -o, --output-dir TEXT Output directory.
+ -tc, --threshold-combination [AND|OR]
+ AND: All metrics are greater than threshold.
+ OR: At least 1 metric is greater than
+ threshold.
+
+ -mo, --moss FLOAT RANGE Use moss metric and set up moss threshold.
+ -co, --count-operator FLOAT RANGE
+ Use count operator metric and set up count
+ operator threshold.
+
+ -so, --set-operator FLOAT RANGE
+ Use set operator metric and set up set
+ operator threshold.
+
+ -ho, --hash-operator FLOAT RANGE
+ Use hash operator metric and set up hash
+ operator threshold.
+
+ --help Show this message and exit.
+```
+To get a plagiarism report for a directory containing source code files, add the ```-i/--input-dir``` option. Add at least one similarity metric from [```-mo/--moss```, ```-co/--count-operator```, ```-so/--set-operator```, ```-ho/--hash-operator```] with its threshold (in the range [0,1]). If using two or more metrics, you need to define how they should be combined using ```-tc/--threshold-combination``` (```AND``` is used by default).
-Comming soon...
-
+Basic command: ```scoss -i path/to/source_code_dir/ -tc OR -co 0.1 -ho 0.1 -mo 0.1 -o another_path/to/plagiarism_report/```
### Using as a library
1. Define a `Scoss` object and register some metrics:
@@ -63,4 +91,4 @@ This project is in development, if you find any issues, please create an issue [
## Acknowledgements
This project is sponsored and led by Prof. Do Phan Thuan, Hanoi University of Science and Technology.
-A part of this code adapts this source code https://github.com/soachishti/moss.py as baseline for `SMoss`.
\ No newline at end of file
+A part of this code adapts this source code https://github.com/soachishti/moss.py as baseline for `SMoss`.
diff --git a/scoss/assets/comparison.html b/scoss/assets/comparison.html
deleted file mode 100644
index 4ac1d00..0000000
--- a/scoss/assets/comparison.html
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-
-
-
- Result
-
-
-
-File Result
-
-
- source1
- source2
- {{metric}}
-
-
- {{file1}}
- {{file2}}
- {{score}}
-
-
-
-
-
-
- {{file1}}
- {{file2}}
-
-
- {{data1}}
- {{data2}}
-
-
-
-
diff --git a/scoss/assets/smoss_comparison.html b/scoss/assets/smoss_comparison.html
deleted file mode 100644
index 82264d6..0000000
--- a/scoss/assets/smoss_comparison.html
+++ /dev/null
@@ -1,50 +0,0 @@
-
-
-
- Matches for submission/a01-sample.py and submission/a01-sample.py
-
-
-
-
-
-
-
diff --git a/scoss/assets/summary.html b/scoss/assets/summary.html
deleted file mode 100644
index 613c352..0000000
--- a/scoss/assets/summary.html
+++ /dev/null
@@ -1,72 +0,0 @@
-
-
-
-
-
- Result
-
-
-
-
-
-
-
-Result
-
-
-
- {%for head in heads%}
- {{head}}
- {%endfor%}
-
- {%for link in links%}
-
- {{link['source1']}}
- {{link['source2']}}
- {%for metric in link['scores'] %}
- {% if 'alignments' in link %}
- {{link['scores'][metric]}}
- {% else %}
- {{link['scores'][metric]}}
- {% endif %}
- {%endfor%}
-
- {%endfor%}
-
-
-
-
diff --git a/scoss/cli.py b/scoss/cli.py
new file mode 100644
index 0000000..cfcd37c
--- /dev/null
+++ b/scoss/cli.py
@@ -0,0 +1,44 @@
+import scoss
+
+import click
+
+@click.command()
+@click.option(
+ '--input-dir', '-i', required=True,
+ help='Input directory.'
+)
+@click.option(
+ '--output-dir', '-o',
+ help='Output directory.'
+)
+@click.option(
+ '--threshold-combination', '-tc',
+ type=click.Choice(['AND','OR'], case_sensitive=False),
+ help='AND: All metrics are greater than threshold.\nOR: At least 1 metric is greater than threshold.'
+)
+@click.option(
+ '--moss', '-mo', type=click.FloatRange(0,1),
+ help='Use moss metric and set up moss threshold.'
+)
+@click.option(
+ '--count-operator', '-co', type=click.FloatRange(0,1),
+ help='Use count operator metric and set up count operator threshold.'
+)
+@click.option(
+ '--set-operator', '-so', type=click.FloatRange(0,1),
+ help='Use set operator metric and set up set operator threshold.'
+)
+@click.option(
+ '--hash-operator', '-ho', type=click.FloatRange(0,1),
+ help='Use hash operator metric and set up hash operator threshold.'
+)
+def scoss_command(input_dir, output_dir, threshold_combination,\
+ moss, count_operator, set_operator, hash_operator):
+ if not output_dir:
+ output_dir = './'
+
+ scoss.get_all_plagiarism(input_dir, output_dir, threshold_combination,
+ moss, count_operator, set_operator, hash_operator)
+
+if __name__ == '__main__':
+ scoss_command()
\ No newline at end of file
diff --git a/scoss/html_template.py b/scoss/html_template.py
new file mode 100644
index 0000000..1c4f84a
--- /dev/null
+++ b/scoss/html_template.py
@@ -0,0 +1,176 @@
+COMPARISON_HTML=r'''
+
+
+
+
+ Result
+
+
+
+File Result
+
+
+ source1
+ source2
+ {{metric}}
+
+
+ {{file1}}
+ {{file2}}
+ {{score}}
+
+
+
+
+
+
+ {{file1}}
+ {{file2}}
+
+
+ {{data1}}
+ {{data2}}
+
+
+
+'''
+
+SMOSS_COMPARISON_HTML=r'''
+
+
+ Matches for submission/a01-sample.py and submission/a01-sample.py
+
+
+
+
+
+
+'''
+
+SUMMARY_HTML=r'''
+
+
+
+
+ Result
+
+
+
+
+
+
+
+Result
+
+
+
+ {%for head in heads%}
+ {{head}}
+ {%endfor%}
+
+ {%for link in links%}
+
+ {{link['source1']}}
+ {{link['source2']}}
+ {%for metric in link['scores'] %}
+ {% if metric in link['alignments'] %}
+ {{link['scores'][metric]}}
+ {% else %}
+ {{link['scores'][metric]}}
+ {% endif %}
+ {%endfor%}
+
+ {%endfor%}
+
+
+
+'''
\ No newline at end of file
diff --git a/scoss/main.py b/scoss/main.py
index 1e2d511..d8bf57b 100644
--- a/scoss/main.py
+++ b/scoss/main.py
@@ -2,8 +2,14 @@
from scoss.metrics.metric_list import MetricList
from scoss.my_source import MySource
-
+from scoss.scoss import Scoss
+from scoss.html_template import *
+from scoss import smoss
+from jinja2 import Environment
import os
+import sys
+import csv
+from tqdm import tqdm
def check_similarity(metric_name, src_str_1, src_str_2, lang):
metric = MetricList([metric_name])
@@ -21,4 +27,242 @@ def align_source(metric_name, src_str_1, src_str_2, lang):
alignments = metric.align_source(src1, src2)
return alignments[metric_name]
+# def get_all_files(contest_path):
+# subdirs = [x[0] for x in os.walk(contest_path)]
+# all_files = {}
+# for i in range(1, len(subdirs)):
+# listOfFiles = []
+# subdir = subdirs[i]
+# subdir_name = os.path.basename(os.path.normpath(subdir))
+# for (dirpath, _, filenames) in os.walk(subdir):
+# listOfFiles += [os.path.join(dirpath, file) for file in filenames]
+# for f in listOfFiles:
+# if os.stat(f).st_size == 0:
+# continue
+# ext = f.split('.')[-1]
+# if (subdir_name, ext) in all_files:
+# all_files[subdir_name, ext].append(f)
+# else:
+# all_files[subdir_name, ext] = [f]
+# return all_files
+
+def create_dir(filepath):
+ wdir = filepath
+ if os.path.isfile(os.path.abspath(filepath)):
+ wdir = os.path.dirname(filepath)
+ if not os.path.exists(wdir):
+ try:
+ os.makedirs(wdir)
+ except OSError: # Guard against race condition
+ pass
+
+def get_all_files(dir_path):
+ all_files = {}
+ for f in os.listdir(dir_path):
+ f = os.path.join(dir_path, f)
+ if os.stat(f).st_size == 0:
+ continue
+ ext = f.split('.')[-1]
+ if ext in all_files:
+ all_files[ext].append(f)
+ else:
+ all_files[ext] = [f]
+ return all_files
+
+def get_all_plagiarism(input_dir, output_dir, threshold_combination_type='AND', moss_threshold=None,
+ count_operator_threshold=None, set_operator_threshold=None, hash_operator_threshold=None):
+ if moss_threshold == None and count_operator_threshold == None and \
+ set_operator_threshold == None and hash_operator_threshold == None:
+ print('Please choose at least 1 metric from [moss, count_operator, set_operator, hash_operator]', file=sys.stderr)
+ sys.exit(-1)
+ if not threshold_combination_type:
+ threshold_combination_type = 'AND'
+ all_files = get_all_files(input_dir)
+ input_dir_name = os.path.basename(os.path.normpath(input_dir))
+ # output_dir = os.path.join(output_dir, '{}_plagiarism_report/'.format(input_dir_name))
+ result_dir = os.path.join(output_dir, '{}_source_comparisons/'.format(input_dir_name))
+ create_dir(result_dir)
+
+ heads = None
+ all_links = []
+ for ext, file_list in all_files.items():
+ cur_dir_path = os.path.join(result_dir, ext)
+ create_dir(cur_dir_path)
+ scoss_matches_dict = {}
+ if count_operator_threshold != None or \
+ set_operator_threshold != None or \
+ hash_operator_threshold != None:
+ print('Getting scoss plagiarism for {} language in {}...'.format(ext, input_dir_name))
+ try:
+ sc = Scoss(lang=ext)
+ except ValueError as er:
+ print(er)
+ continue
+ if count_operator_threshold != None:
+ sc.add_metric('count_operator', threshold=count_operator_threshold)
+ if set_operator_threshold != None:
+ sc.add_metric('set_operator', threshold=set_operator_threshold)
+ if hash_operator_threshold != None:
+ sc.add_metric('hash_operator', threshold=hash_operator_threshold)
+ for f in file_list:
+ user_filename = os.path.basename(f)
+ # problem_dir = os.path.basename(os.path.dirname(f))
+ sc.add_file(f, user_filename)
+ sc.run()
+ if threshold_combination_type.upper() == 'AND':
+ scoss_matches = sc.get_matches(or_thresholds=False, and_thresholds=True)
+ else: # Be careful
+ scoss_matches = sc.get_matches(or_thresholds=True, and_thresholds=False)
+ for match in scoss_matches:
+ if match['source1'] < match['source2']:
+ scoss_matches_dict[match['source1'], match['source2']] = match['scores']
+ else:
+ scoss_matches_dict[match['source2'], match['source1']] = match['scores']
+ print('Successfully getting scoss plagiarism for {} language in {}!'.format(ext, input_dir_name))
+
+ smoss_matches_dict = {}
+ if moss_threshold != None:
+ print('Getting smoss plagiarism for {} language in {}...'.format(ext, input_dir_name))
+ try:
+ sm = smoss.SMoss(lang=ext)
+ except:
+ print('Unsupported languge: ', ext)
+ continue
+ sm.set_threshold(moss_threshold)
+ for f in file_list:
+ user_filename = os.path.basename(f)
+ # problem_dir = os.path.basename(os.path.dirname(f))
+ sm.add_file(f, user_filename)
+ sm.run()
+ smoss_matches = sm.get_matches()
+ for match in smoss_matches:
+ if match['source1'] < match['source2']:
+ smoss_matches_dict[match['source1'], match['source2']] = match['scores']
+ else:
+ smoss_matches_dict[match['source2'], match['source1']] = match['scores']
+ print('Successfully getting smoss plagiarism for {} language in {}!'.format(ext, input_dir_name))
+ ###################################################################################
+ all_matches_dict = {}
+ # print('scoss_matches_dict = ', scoss_matches_dict)
+ # print('smoss_matches_dict = ', smoss_matches_dict)
+ if not scoss_matches_dict and not smoss_matches_dict:
+ continue
+ elif not scoss_matches_dict or not smoss_matches_dict:
+ scoss_matches_dict.update(smoss_matches_dict)
+ all_matches_dict = scoss_matches_dict
+ else:
+ for k, v in scoss_matches_dict.items():
+ if k in smoss_matches_dict:
+ all_matches_dict[k] = v
+ all_matches_dict[k].update(smoss_matches_dict[k])
+ elif threshold_combination_type == 'AND':
+ continue
+ else: # OR_threshold
+ all_matches_dict[k] = v
+ all_matches_dict[k].update({'moss_score':0})
+ for k, v in all_matches_dict.items():
+ scores = list(all_matches_dict[k].values())
+ all_matches_dict[k]['average_score'] = sum(scores) / len(scores)
+
+ # Sort all_matches_dict by average_score
+ all_matches_dict = {k: v for k, v in sorted(all_matches_dict.items(), key=lambda item: -item[1]['average_score'])}
+ # all_matches_dict = sorted(all_matches_dict, key = lambda i: float(i['scores']['average_score']), reverse=True)
+ values_view = all_matches_dict.values()
+ value_iterator = iter(values_view)
+ first_score = next(value_iterator)
+ heads = ['source1', 'source2'] + list(first_score.keys())
+ links = []
+ for (src1, src2), scores in tqdm(all_matches_dict.items(), desc='Creating comparison reports', unit=' comparisons'):
+ dic = {}
+ dic['source1'] = src1
+ dic['source2'] = src2
+ dic['scores'] = {}
+ dic['alignments'] = {}
+ # print('scores =', scores)
+ for metric in scores.keys():
+ C = int(scores[metric]*255)
+ R = C
+ G = 0
+ B = 0
+ span = ''.format(R,G,B) + str(format(scores[metric]*100, '.2f')) +'% '
+ if metric == 'average_score':
+ dic['scores'][metric] = span
+ links.append(dic)
+ continue
+ elif metric == 'moss_score':
+ try:
+ compe = sm.get_matches_file()[src1][src2]
+ except KeyError:
+ compe = ''
+ else:
+ source_str1 = sc.get_sources()[src1].source_str
+ source_str2 = sc.get_sources()[src2].source_str
+ data1 = [i.replace('<', '<').replace('>', '>') for i in source_str1.split('\n')]
+ data2 = [i.replace('<', '<').replace('>', '>') for i in source_str2.split('\n')]
+ html1 = ''
+ html2 = ''
+ alignment = align_source(metric, source_str1, source_str2, ext)
+ for line in alignment:
+ if line[0] == -1 :
+ html1 += ' '
+ temp2 = ''+ str(line[1])+ ' '+ data2[line[1]-1] + ' '
+ html2 += temp2
+ elif line[1] == -1 :
+ html2 += ' '
+ temp1 = ''+ str(line[0])+ ' '+ data1[line[0]-1] + ' '
+ html1 += temp1
+ elif line[0] != -1 and line[0] != -1:
+ if line[2] >=0.25 and line[2] <0.75:
+ temp1 = ''+ str(line[0])+ ' '+ data1[line[0]-1] + ' '
+ html1 += temp1
+ temp2 = ''+ str(line[1])+ ' '+ data2[line[1]-1] + ' '
+ html2 += temp2
+ elif line[2] >= 0.75:
+ temp1 = ''+ str(line[0])+ ' '+ data1[line[0]-1] + ' '
+ html1 += temp1
+ temp2 = ''+ str(line[1])+ ' '+ data2[line[1]-1] + ' '
+ html2 += temp2
+ else:
+ temp1 = ''+ str(line[0])+ ' '+ data1[line[0]-1] + ' '
+ html1 += temp1
+ temp2 = ''+ str(line[1])+ ' '+ data2[line[1]-1] + ' '
+ html2 += temp2
+ compe = Environment().from_string(COMPARISON_HTML).render(file1=match['source1'], file2=match['source2'], \
+ metric=metric, score=span, \
+ data1=html1, data2=html2)
+ name_file = '{}_{}_{}.html'.format(src1, src2, metric)
+ # create_dir(name_file)
+ with open(os.path.join(cur_dir_path, name_file), 'w', encoding='utf-8') as file:
+ file.write(compe)
+ dic['scores'][metric] = span
+ dic['alignments'][metric] = '{}_source_comparisons/{}/{}'.format(input_dir_name, ext, name_file)
+ links.append(dic)
+ all_links += links
+
+ # page = Environment().from_string(SUMMARY_HTML).render(heads=heads, links=links)
+ # with open(os.path.join(output_dir, 'summary_{}.html'.format(ext)), 'w') as file:
+ # file.write(page)
+
+ if not heads:
+ print("There is no plagiarism activities!")
+ sys.exit(0)
+ all_links = sorted(all_links, key = lambda i: float(i['scores']['average_score'].split('">')[-1].split('%')[0]), reverse=True)
+ page = Environment().from_string(SUMMARY_HTML).render(heads=heads, links=all_links)
+ with open(os.path.join(output_dir, '{}_summary.html'.format(input_dir_name)), 'w') as file:
+ file.write(page)
+
+ with open(os.path.join(output_dir,'{}_summary.csv'.format(input_dir_name)), mode='w', newline='') as f:
+ writer = csv.writer(f)
+ writer.writerow(heads)
+ for link in all_links:
+ row = [link['source1'], link['source2']]
+ for k, v in link['scores'].items():
+ row.append(v.split('">')[-1].split('%')[0]+'%')
+ writer.writerow(row)
+
+
+
+
+
+
diff --git a/scoss/my_source.py b/scoss/my_source.py
index f43a471..bc11936 100644
--- a/scoss/my_source.py
+++ b/scoss/my_source.py
@@ -25,7 +25,7 @@ def from_file(cls, filepath, lang=None, name=None):
return the Source object
:rtype: Source
"""
- with open(filepath) as f:
+ with open(filepath, encoding='utf-8') as f:
source_str = f.read()
if lang is None:
ext = os.path.splitext(filepath)[1][1:]
diff --git a/scoss/scoss.py b/scoss/scoss.py
index a689375..4bade93 100644
--- a/scoss/scoss.py
+++ b/scoss/scoss.py
@@ -4,6 +4,7 @@
from scoss.metrics.token_based_metric import *
from scoss.utils import check_language
from scoss.my_source import MySource
+from scoss.html_template import *
from jinja2 import Environment
from collections import OrderedDict, defaultdict
@@ -246,7 +247,7 @@ def save_as_html(self, output_dir='./', or_thresholds=False, and_thresholds=True
trimmed:
output_dir: save all html files in output_dir, if output_dir=None -> donot save
Return:
- ret: A dictionary of html files. example: {'summary.html': HTML1, 'match1.html': HTML2, ....}
+ ret: A dictionary of html files. example: {'summary.html': SUMMARY_HTML, 'match1.html': COMPARISON_HTML, ....}
"""
def score_color(score):
@@ -258,13 +259,6 @@ def score_color(score):
R, G, B) + str(format(score*100, '.2f')) + '%'
return span
- HTML1 = ""
- HTML2 = ""
- with open('./scoss/assets/summary.html', mode='r') as f:
- HTML1 = f.read()
- with open('./scoss/assets/comparison.html', mode='r') as f:
- HTML2 = f.read()
-
print("Running...")
matches = self.get_matches(or_thresholds, and_thresholds)
@@ -283,7 +277,7 @@ def score_color(score):
links = matches
print("Saving summary...")
- page = Environment().from_string(HTML1).render(heads=heads, links=links)
+ page = Environment().from_string(COMPARISON_HTML).render(heads=heads, links=links)
with open(os.path.join(output_dir, 'summary.html'), 'w') as file:
file.write(page)
@@ -369,14 +363,17 @@ def score_color(score):
span = score_color(match['scores'][metric])
dic['scores'][metric] = span
dic['alignments'][metric] = name_file
- compe = Environment().from_string(HTML2).render(file1=match['source1'], file2=match['source2'],
+ compe = Environment().from_string(COMPARISON_HTML).render(file1=match['source1'], file2=match['source2'],
metric=metric, score=span,
data1=html1, data2=html2)
with open(os.path.join(output_dir, name_file), 'w') as file:
file.write(compe)
links.append(dic)
- page = Environment().from_string(HTML1).render(heads=heads, links=links)
+ page = Environment().from_string(SUMMARY_HTML).render(heads=heads, links=links)
with open(os.path.join(output_dir, 'summary.html'), 'w') as file:
file.write(page)
print("Done!")
+
+ def get_sources(self):
+ return self.__sources
\ No newline at end of file
diff --git a/scoss/smoss.py b/scoss/smoss.py
index 64a9d7b..c396e8e 100644
--- a/scoss/smoss.py
+++ b/scoss/smoss.py
@@ -19,6 +19,7 @@
import requests
from scoss.utils import check_language
+from scoss.html_template import *
try:
from urllib.request import urlopen
@@ -218,9 +219,8 @@ def parse_html_table(self, url):
tds = soup.find_all('td')
i = 0
self.__matches = []
- with open('./scoss/assets/smoss_comparison.html', mode='r') as f:
- big_html_string = f.read()
- bases = big_html_string.split('<<<>>>')
+
+ bases = SMOSS_COMPARISON_HTML.split('<<<>>>')
while i < len(tds):
score_str = tds[i].contents[0].contents[0][-4:-2]
score_str = ''.join(c for c in score_str if c.isdigit())
@@ -271,7 +271,7 @@ def parse_html_table(self, url):
else:
self.__matches_file[src2] = {src1:match_comparison}
# with open(os.path.join('./tests/smoss_result/', 'big_all_html.html'), 'w') as file:
- # file.write(big_html_string)
+ # file.write(SMOSS_COMPARISON_HTML)
i += 3
def upload_file(self, s, src, mask, file_id, on_send):
@@ -355,9 +355,6 @@ def save_html(url, file_name):
def save_as_html(self, output_dir=None):
if self.__state == SMossState.INIT:
self.run()
- HTML1 = ""
- with open('./scoss/assets/summary.html', mode='r') as f:
- HTML1 = f.read()
if len(self.__matches) != 0:
heads = [x for x in self.__matches[0].keys() if x != 'link']
@@ -379,7 +376,7 @@ def save_as_html(self, output_dir=None):
dic['scores'][metric] = [name_file, span]
links.append(dic)
self.process_url(match['link'], name_file, output_dir)
- page = Environment().from_string(HTML1).render(heads=heads, links=links)
+ page = Environment().from_string(SUMMARY_HTML).render(heads=heads, links=links)
with open(os.path.join(output_dir, 'summary.html'), 'w') as file:
file.write(page)
diff --git a/scoss/utils/utils.py b/scoss/utils/utils.py
index 96308f4..50c181e 100644
--- a/scoss/utils/utils.py
+++ b/scoss/utils/utils.py
@@ -15,5 +15,5 @@ def check_language(lang):
return lang
if lang in LANG_MAP:
return LANG_MAP[lang]
- raise ValueError("Unsupported languge")
+ raise ValueError("Unsupported languge: {}".format(lang))
diff --git a/setup.py b/setup.py
index c297cd6..7f72488 100644
--- a/setup.py
+++ b/setup.py
@@ -4,9 +4,9 @@
long_description = fh.read()
PROJECT_URLS = {
- 'Bug Tracker': 'https://github.com/ngocjr7/geneticpython/issues',
- 'Documentation': 'https://github.com/ngocjr7/geneticpython/blob/master/README.md',
- 'Source Code': 'https://github.com/ngocjr7/geneticpython'
+ 'Bug Tracker': 'https://github.com/ngocjr7/scoss/issues',
+ 'Documentation': 'https://github.com/ngocjr7/scoss/blob/master/README.md',
+ 'Source Code': 'https://github.com/ngocjr7/scoss'
}
with open('requirements.txt') as f:
@@ -21,6 +21,10 @@
author_email='ngocjr7@gmail.com',
project_urls=PROJECT_URLS,
version='0.0.2',
+ entry_points='''
+ [console_scripts]
+ scoss=scoss.cli:scoss_command
+ ''',
packages=find_packages(),
install_requires=install_requires,
python_requires='>=3.6')