Merge pull request #1 from ngocjr7/thaidd

Thaidd
BK-SCOSS · Apr 5, 2021 · dbce045 · dbce045
2 parents a3719b3 + 13d3b1f
commit dbce045
Show file tree

Hide file tree

Showing 11 changed files with 348 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -18,9 +18,37 @@ pip install scoss
 You can use SCOSS as a Command Line Interface, or a library in your project, or web-app interface
 
 ### Command Line Interface (CLI)
+See the built-in documentation by passing the ```--help``` argument.
+```
+scoss --help
+Usage: scoss [OPTIONS]
+
+Options:
+  -sd, --submission-dir TEXT      Submission directory.  [required]
+  -o, --output-dir TEXT           Output directory.
+  -tc, --threshold-combination [AND|OR]
+                                  AND: All metrics are greater than threshold.
+                                  OR: At least 1 metric is greater than
+                                  threshold.
+
+  -mo, --moss FLOAT RANGE         Use moss metric and set up moss threshold.
+  -co, --count-operator FLOAT RANGE
+                                  Use count operator metric and set up count
+                                  operator threshold.
+
+  -so, --set-operator FLOAT RANGE
+                                  Use set operator metric and set up set
+                                  operator threshold.
+
+  -ho, --hash-operator FLOAT RANGE
+                                  Use hash operator metric and set up hash
+                                  operator threshold.
+
+  --help                          Show this message and exit.
+```
+To get a plagiarism report for a submission directory, pass the ```-sd/--submission-dir``` option. Add at least one similarity metric from [```-mo/--moss```, ```-co/--count-operator```, ```-so/--set-operator```, ```-ho/--hash-operator```] together with its threshold (in the range [0,1]). When using two or more metrics, define how they are combined with ```-tc/--threshold-combination``` (```AND``` is used by default).
 
-Comming soon...
-
+Basic command: ```scoss -sd tests/data/299721 -tc OR -co 0.1 -ho 0.1 -mo 0.1 -o tests/data```
 ### Using as a library
 
 1. Define a `Scoss` object and register some metrics:
@@ -63,4 +91,4 @@ This project is in development, if you find any issues, please create an issue [
 ## Acknowledgements
 This project is sponsored and led by Prof. Do Phan Thuan, Hanoi University of Science and Technology.
 
-A part of this code adapts this source code https://github.com/soachishti/moss.py as baseline for `SMoss`.
+A part of this code adapts this source code https://github.com/soachishti/moss.py as baseline for `SMoss`.
diff --git a/scoss/assets/summary.html b/scoss/assets/summary.html
@@ -48,7 +48,7 @@ <h2>Result</h2>
     <td>{{link['source1']}}</a></td>
     <td>{{link['source2']}}</a></td>
     {%for metric in link['scores'] %}
-        {% if 'alignments' in link %}
+        {% if metric in link['alignments'] %}
             <td><a href="{{link['alignments'][metric]}}"  target="_blank" >{{link['scores'][metric]}}</a></td>
         {% else %}
             <td><span>{{link['scores'][metric]}}</span></td>
@@ -69,4 +69,4 @@ <h2>Result</h2>
     });
 </script>
 </body>
-</html>
+</html>
diff --git a/scoss/cli.py b/scoss/cli.py
@@ -0,0 +1,46 @@
+import scoss
+
+import click
+
+@click.command()
+@click.option(
+    '--submission-dir', '-sd', required=True,
+    help='Submission directory.'
+)
+@click.option(
+    '--output-dir', '-o', 
+    help='Output directory.'
+)
+@click.option(
+    '--threshold-combination', '-tc', 
+    type=click.Choice(['AND','OR'], case_sensitive=False),
+    help='AND: All metrics are greater than threshold.\nOR: At least 1 metric is greater than threshold.'
+)
+@click.option(
+    '--moss', '-mo', type=click.FloatRange(0,1),
+    help='Use moss metric and set up moss threshold.'
+)
+@click.option(
+    '--count-operator', '-co', type=click.FloatRange(0,1),
+    help='Use count operator metric and set up count operator threshold.'
+)
+@click.option(
+    '--set-operator', '-so', type=click.FloatRange(0,1),
+    help='Use set operator metric and set up set operator threshold.'
+)
+@click.option(
+    '--hash-operator', '-ho', type=click.FloatRange(0,1),
+    help='Use hash operator metric and set up hash operator threshold.'
+)
+def scoss_command(submission_dir, output_dir, threshold_combination,\
+    moss, count_operator, set_operator, hash_operator):
+    if not output_dir:
+        output_dir = './'
+
+    scoss.get_all_plagiarism(submission_dir, output_dir, threshold_combination, 
+        moss, count_operator, set_operator, hash_operator)
+
+if __name__ == '__main__':
+    # Example: python -m scoss.cli -sd tests/data/299721 -tc OR -co 0.1 -ho 0.1 -mo 0.1 -o tests/data
+    # Example: python -m scoss.cli -sd tests/data/299721 -tc OR -co 0.9 -ho 0.7 -o tests/data
+    scoss_command()
diff --git a/scoss/main.py b/scoss/main.py
@@ -2,8 +2,13 @@
 
 from scoss.metrics.metric_list import MetricList
 from scoss.my_source import MySource
-
+from scoss.scoss import Scoss
+from scoss import smoss
+from jinja2 import Environment
 import os
+import sys
+import csv
+from tqdm import tqdm
 
 def check_similarity(metric_name, src_str_1, src_str_2, lang):
     metric = MetricList([metric_name])
@@ -21,4 +26,235 @@ def align_source(metric_name, src_str_1, src_str_2, lang):
     alignments = metric.align_source(src1, src2)
     return alignments[metric_name]
 
+def get_all_files(contest_path):
+    subdirs = [x[0] for x in os.walk(contest_path)]
+    all_files = {}
+    for i in range(1, len(subdirs)):
+        listOfFiles = []
+        subdir = subdirs[i]
+        subdir_name = os.path.basename(os.path.normpath(subdir))
+        for (dirpath, _, filenames) in os.walk(subdir):
+            listOfFiles += [os.path.join(dirpath, file) for file in filenames]
+        for f in listOfFiles:
+            if os.stat(f).st_size == 0:
+                continue
+            ext = f.split('.')[-1]
+            if (subdir_name, ext) in all_files:
+                all_files[subdir_name, ext].append(f)
+            else:
+                all_files[subdir_name, ext] = [f]
+    return all_files
+
+def create_dir(filepath):
+    wdir = filepath
+    if os.path.isfile(os.path.abspath(filepath)):
+        wdir = os.path.dirname(filepath)
+    if not os.path.exists(wdir):
+        try:
+            os.makedirs(wdir)
+        except OSError: # Guard against race condition
+            pass
+
+def get_all_plagiarism(input_dir, output_dir, threshold_combination_type='AND', moss_threshold=None,
+    count_operator_threshold=None, set_operator_threshold=None, hash_operator_threshold=None):
+    if moss_threshold == None and count_operator_threshold == None and \
+        set_operator_threshold == None and hash_operator_threshold == None:
+        print('Please choose at least 1 metric from [moss, count_operator, set_operator, hash_operator]', file=sys.stderr)
+        sys.exit(-1)
+    if not threshold_combination_type:
+        threshold_combination_type = 'AND'
+    all_files = get_all_files(input_dir)
+    output_dir = os.path.join(output_dir, 'plagiarism_report_{}/'.format(os.path.basename(os.path.normpath(input_dir))))
+    result_dir = os.path.join(output_dir, 'source_code_comparisons/')
+    create_dir(result_dir)
+
+    with open('./scoss/assets/summary.html', mode='r') as f:
+        HTML1 = f.read()
+    with open('./scoss/assets/comparison.html', mode='r') as f:
+        HTML2 = f.read()
+    heads = None
+    all_links = []
+    for (dir_name, ext), file_list in all_files.items(): 
+        cur_dir_name = '{}_{}'.format(dir_name, ext)
+        cur_dir_path = os.path.join(result_dir, cur_dir_name)
+        # print(cur_dir_path)
+        create_dir(cur_dir_path)
+        scoss_matches_dict = {}
+        if count_operator_threshold != None or \
+            set_operator_threshold != None or \
+            hash_operator_threshold != None:
+            print('Getting scoss plagiarism for {} language in problem {}...'.format(ext, dir_name))
+            try:
+                sc = Scoss(lang=ext)
+            except ValueError as er:
+                print(er)
+                continue
+            if count_operator_threshold != None:
+                sc.add_metric('count_operator', threshold=count_operator_threshold)
+            if set_operator_threshold != None:
+                sc.add_metric('set_operator', threshold=set_operator_threshold)
+            if hash_operator_threshold != None:
+                sc.add_metric('hash_operator', threshold=hash_operator_threshold)
+            for f in file_list:
+                user_filename = os.path.basename(f)
+                problem_dir = os.path.basename(os.path.dirname(f))
+                sc.add_file(f, '{}_{}'.format(problem_dir, user_filename))
+            sc.run()
+            if threshold_combination_type.upper() == 'AND':
+                scoss_matches = sc.get_matches(or_thresholds=False, and_thresholds=True)
+            else: # Be careful
+                scoss_matches = sc.get_matches(or_thresholds=True, and_thresholds=False)
+            for match in scoss_matches:
+                if match['source1'] < match['source2']:
+                    scoss_matches_dict[match['source1'], match['source2']] = match['scores']
+                else:
+                    scoss_matches_dict[match['source2'], match['source1']] = match['scores']
+            print('Successfully getting scoss plagiarism for {} language in problem {}!'.format(ext, dir_name))
+
+        smoss_matches_dict = {}
+        if moss_threshold != None:
+            print('Getting scoss plagiarism for {} language in problem {}...'.format(ext, dir_name))
+            try:
+                sm = smoss.SMoss(lang=ext)
+            except:
+                print('Unsupported languge: ', ext)
+                continue
+            sm.set_threshold(moss_threshold)
+            for f in file_list:
+                user_filename = os.path.basename(f)
+                problem_dir = os.path.basename(os.path.dirname(f))
+                sm.add_file(f, '{}_{}'.format(problem_dir, user_filename))
+            sm.run()
+            smoss_matches = sm.get_matches()
+            for match in smoss_matches:
+                if match['source1'] < match['source2']:
+                    smoss_matches_dict[match['source1'], match['source2']] = match['scores']
+                else:
+                    smoss_matches_dict[match['source2'], match['source1']] = match['scores']
+            print('Successfully getting smoss plagiarism for {} language in problem {}!'.format(ext, dir_name))
+        ###################################################################################
+        all_matches_dict = {}    
+        # print('scoss_matches_dict = ', scoss_matches_dict)   
+        # print('smoss_matches_dict = ', smoss_matches_dict)   
+        if not scoss_matches_dict and not smoss_matches_dict:
+            continue
+        elif not scoss_matches_dict or not smoss_matches_dict:
+            scoss_matches_dict.update(smoss_matches_dict)
+            all_matches_dict = scoss_matches_dict
+        else:
+            for k, v in scoss_matches_dict.items():
+                if k in smoss_matches_dict:
+                    all_matches_dict[k] = v
+                    all_matches_dict[k].update(smoss_matches_dict[k])
+                elif threshold_combination_type == 'AND':
+                    continue
+                else: # OR_threshold
+                    all_matches_dict[k] = v
+                    all_matches_dict[k].update({'moss_score':0})
+        for k, v in all_matches_dict.items():
+            scores = list(all_matches_dict[k].values())
+            all_matches_dict[k]['average_score'] = sum(scores) / len(scores)
+
+        # Sort all_matches_dict by average_score
+        all_matches_dict = {k: v for k, v in sorted(all_matches_dict.items(), key=lambda item: -item[1]['average_score'])}
+        # all_matches_dict = sorted(all_matches_dict, key = lambda i: float(i['scores']['average_score']), reverse=True)
+        values_view = all_matches_dict.values()
+        value_iterator = iter(values_view)
+        first_score = next(value_iterator)
+        heads = ['source1', 'source2'] + list(first_score.keys())
+        links = []
+        print('Creating results')
+        for (src1, src2), scores in tqdm(all_matches_dict.items()):
+            dic = {}
+            dic['source1'] = src1
+            dic['source2'] = src2
+            dic['scores'] = {}
+            dic['alignments'] = {}
+            # print('scores =', scores)
+            for metric in scores.keys():
+                C = int(scores[metric]*255)
+                R = C
+                G = 0
+                B = 0
+                span = '<span style="color: rgb({}, {}, {})">'.format(R,G,B) + str(format(scores[metric]*100, '.2f')) +'%</span>'
+                if metric == 'average_score':
+                    dic['scores'][metric] = span
+                    links.append(dic)
+                    continue
+                elif metric == 'moss_score':
+                    try:
+                        compe = sm.get_matches_file()[src1][src2]
+                    except KeyError:
+                        compe = ''
+                else:
+                    source_str1 = sc.get_sources()[src1].source_str
+                    source_str2 = sc.get_sources()[src2].source_str
+                    data1 = [i.replace('<', '&lt').replace('>', '&gt') for i in source_str1.split('\n')]
+                    data2 = [i.replace('<', '&lt').replace('>', '&gt') for i in source_str2.split('\n')]
+                    html1 = ''
+                    html2 = ''
+                    alignment = align_source(metric, source_str1, source_str2, ext)
+                    for line in alignment:
+                        if line[0] == -1 :
+                            html1 += '<pre >  </pre>'
+                            temp2 = '<pre >'+  str(line[1])+ '	'+  data2[line[1]-1] + '</pre>'
+                            html2 += temp2
+                        elif line[1] == -1 :
+                            html2 += '<pre >  </pre>'
+                            temp1 = '<pre >'+  str(line[0])+ '	'+  data1[line[0]-1] + '</pre>'
+                            html1 += temp1
+                        elif line[0] != -1 and line[0] != -1:
+                            if line[2] >=0.25 and line[2] <0.75:
+                                temp1 = '<pre style="color: #ffb600">'+  str(line[0])+ '	'+  data1[line[0]-1] + '</pre>'
+                                html1 += temp1
+                                temp2 = '<pre style="color: #ffb600">'+  str(line[1])+ '	'+  data2[line[1]-1] + '</pre>'
+                                html2 += temp2
+                            elif line[2] >= 0.75:
+                                temp1 = '<pre style="color: red">'+  str(line[0])+ '	'+  data1[line[0]-1] + '</pre>'
+                                html1 += temp1
+                                temp2 = '<pre style="color: red">'+  str(line[1])+ '	'+  data2[line[1]-1] + '</pre>'
+                                html2 += temp2
+                            else:
+                                temp1 = '<pre style="color: black">'+  str(line[0])+ '	'+  data1[line[0]-1] + '</pre>'
+                                html1 += temp1
+                                temp2 = '<pre style="color: black">'+  str(line[1])+ '	'+  data2[line[1]-1] + '</pre>'
+                                html2 += temp2
+                    compe = Environment().from_string(HTML2).render(file1=match['source1'], file2=match['source2'], \
+                                    metric=metric, score=span, \
+                                    data1=html1, data2=html2)
+                name_file = '{}_{}_{}.html'.format(src1, src2, metric)
+                # create_dir(name_file)
+                with open(os.path.join(cur_dir_path, name_file), 'w', encoding='utf-8') as file:
+                    file.write(compe)
+                dic['scores'][metric] = span
+                dic['alignments'][metric] = 'source_code_comparisons/{}/{}'.format(cur_dir_name, name_file)
+                links.append(dic)
+        all_links += links
+
+        page = Environment().from_string(HTML1).render(heads=heads, links=links)
+        with open(os.path.join(output_dir, 'summary_{}.html'.format(cur_dir_name)), 'w') as file:
+            file.write(page)
+
+    if not heads:
+        print("There is no plagiarism activities!")
+        sys.exit(0)
+    all_links = sorted(all_links, key = lambda i: i['scores']['average_score'].split('">')[-1].split('%')[0], reverse=True)
+    page = Environment().from_string(HTML1).render(heads=heads, links=all_links)
+    with open(os.path.join(output_dir, 'all_summary.html'), 'w') as file:
+        file.write(page)
+
+    with open(os.path.join(output_dir,'all_summary.csv'), mode='w', newline='') as f:
+        writer = csv.writer(f) 
+        writer.writerow(heads)
+        for link in all_links:
+            row = [link['source1'], link['source2']]
+            for k, v in link['scores'].items():
+                row.append(v.split('">')[-1].split('%')[0]+'%')
+            writer.writerow(row)
+
+
+
+
+
+
 
diff --git a/scoss/my_source.py b/scoss/my_source.py
@@ -25,7 +25,7 @@ def from_file(cls, filepath, lang=None, name=None):
             return the Source object
             :rtype: Source
         """
-        with open(filepath) as f:
+        with open(filepath, encoding='utf-8') as f:
             source_str = f.read()
         if lang is None:
             ext = os.path.splitext(filepath)[1][1:]

diff --git a/scoss/scoss.py b/scoss/scoss.py
@@ -380,3 +380,6 @@ def score_color(score):
             with open(os.path.join(output_dir, 'summary.html'), 'w') as file:
                 file.write(page)
         print("Done!")
+
+    def get_sources(self):
+        return self.__sources
diff --git a/setup.py b/setup.py
@@ -21,6 +21,10 @@
       author_email='[email protected]',
       project_urls=PROJECT_URLS,
       version='0.0.2', 
+      entry_points='''
+        [console_scripts]
+        scoss=scoss.cli:scoss_command
+      ''',
       packages=find_packages(),
       install_requires=install_requires,
       python_requires='>=3.6')