Skip to content

Commit

Permalink
Merge pull request #16 from IACR/clean_abstract
Browse files Browse the repository at this point in the history
Clean comments, todos, footnotes from abstracts
  • Loading branch information
jwbos authored Feb 26, 2023
2 parents 72e4c5e + 93c19da commit 7180b35
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 2 deletions.
11 changes: 11 additions & 0 deletions webapp/metadata/meta_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from nameparser import HumanName
from pylatexenc.latex2text import LatexNodes2Text
from arxiv_latex_cleaner import arxiv_latex_cleaner

def get_key_val(line):
"""If line has form key: value, then return key, value."""
Expand Down Expand Up @@ -120,3 +121,13 @@ def read_meta(metafile):
raise Exception('unexpected line {}'.format(line))
return data

def clean_abstract(text):
    r"""Strip LaTeX comments and selected commands from an abstract.

    The text is split into lines (line endings kept) and passed to
    arxiv_latex_cleaner together with a parameter dict that lists
    ``todo`` and ``footnote`` as commands to delete. The cleaner's
    result is joined back into a single string and returned.

    Args:
        text: the raw abstract as a string.

    Returns:
        The cleaned abstract as a string.
    """
    # There is some doubt about whether to include things like \textrm
    # in the commands_only_to_delete. It depends on how mathjax or
    # katex is configured.
    cleaner_params = {
        'commands_only_to_delete': [],
        'commands_to_delete': ['todo', 'footnote'],
    }
    # NOTE(review): this is a private helper of arxiv_latex_cleaner, so its
    # signature may change between releases -- confirm when upgrading.
    cleaned = arxiv_latex_cleaner._remove_comments_and_commands_to_delete(
        text.splitlines(keepends=True), cleaner_params)
    return ''.join(cleaned)
32 changes: 31 additions & 1 deletion webapp/metadata/tests/metadata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,40 @@
import sys
sys.path.insert(0, '../')
from compilation import Compilation, FileTree
from meta_parse import clean_abstract

def test_filetree():
    """Check that a FileTree built from the parent directory looks sane.

    Loads a Compilation from the JSON fixture, replaces its output tree
    with one built from '../', and checks the root name and the number
    of direct children.
    """
    raw_json = Path('testdata/compilation.json').read_text(encoding='UTF-8')
    compilation = Compilation.parse_raw(raw_json)
    compilation.output_tree = FileTree.from_path(Path('../'))
    tree = compilation.output_tree
    print(tree)
    assert tree.name == '..'
    # The expected count is tied to the current contents of the parent dir.
    assert len(tree.children) == 7

def test_clean_abstract():
    """Check that clean_abstract strips comments, footnotes and todos.

    The sample abstract exercises inline and full-line % comments, a
    comment environment, an escaped percent sign, a footnote, and two
    forms of todo (plain, and with an optional argument).
    """
    # Named raw_abstract rather than `input` to avoid shadowing the builtin.
    # NOTE(review): the '\n\n' assertion below expects the literal to contain
    # an empty line -- confirm it was not lost from this copy of the test.
    raw_abstract = """This is an abstract
that has comments. %% this should be removed
% comments may start a line
% blank line above should survive.
%% multi-line commments are removed
%% along with spaces lines, which are just glue.
This is the second paragraph. \\begin{comment}
within comment environment.
\\end{comment}
You may still~\\footnote{Go bye bye} use percentages like 10\\% of the content.
We remove \\todo{this is a removable todo} and \\todo[inline]{so is this} but
the last one is not removed because arxiv_latex_cleaner does not recognize it.
"""
    output = clean_abstract(raw_abstract)
    print(output)
    assert 'should be removed' not in output
    assert '\n\n' in output
    assert '\n \n' not in output
    assert '\\begin{comment}' not in output
    assert 'comment environment' not in output
    # \footnote is removed. The previous form asserted a non-empty string
    # literal, which is always true and checked nothing.
    assert 'bye' not in output
    # Escaped percent signs must survive comment stripping.
    assert '%' in output
    # \todo is removed.
    assert 'removable' not in output
    # We might wish to catch this in the future.
    assert 'so is this' in output
1 change: 1 addition & 0 deletions webapp/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ python-dotenv
flask-wtf
flask-login
flask-sqlalchemy
arxiv-latex-cleaner
3 changes: 2 additions & 1 deletion webapp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from . import db, task_queue
#from .metadata import meta_parse
from .metadata.latex.iacrcc.parser import meta_parse
from .metadata.meta_parse import clean_abstract
from .metadata.compilation import Compilation, Meta, CompileStatus, VersionEnum, FileTree
from .db_models import CompileRecord, TaskStatus

Expand Down Expand Up @@ -101,7 +102,7 @@ def run_latex_task(paper_path, paperid, version, task_key):
compilation.status = CompileStatus.MISSING_ABSTRACT
compilation.error_log.append('An abstract is required.')
else:
data['abstract'] = abstract_file.read_text(encoding='UTF-8')
data['abstract'] = clean_abstract(abstract_file.read_text(encoding='UTF-8'))
compilation.meta = Meta(**data)
if compilation.meta.version != VersionEnum.FINAL:
compilation.status = CompileStatus.WRONG_VERSION
Expand Down

0 comments on commit 7180b35

Please sign in to comment.