Skip to content

Commit

Permalink
Handle standalone Caret annotations as suggested insertions
Browse files: browse the repository at this point in the history
Fixes issue #61
  • Loading branch information
0xabu committed Dec 30, 2024
1 parent 8ed540c commit b23213f
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 9 deletions.
11 changes: 8 additions & 3 deletions pdfannots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,9 +290,14 @@ def capture_char(self, text: str) -> None:
assert last_charseq != 0
i = bisect.bisect_left(self.context_subscribers, (last_charseq,))
assert 0 <= i < len(self.context_subscribers)
(found_charseq, found_annot) = self.context_subscribers.pop(i)
assert found_charseq == last_charseq
assert found_annot is a
while True:
(found_charseq, found_annot) = self.context_subscribers[i]
assert found_charseq == last_charseq
if found_annot is a:
self.context_subscribers.pop(i)
break
i += 1
assert i < len(self.context_subscribers)

else:
# This is the first hit for the annotation, so set the pre-context.
Expand Down
14 changes: 9 additions & 5 deletions pdfannots/printer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ def format_bullet(

return ret

def merge_strikeout_context(self, annot: Annotation, text: str) -> str:
"""Merge the context for a strikeout annotation into the text."""
def merge_context(self, annot: Annotation, text: str) -> str:
"""Merge the context for a strikeout or caret annotation into the text."""
(pre, post) = annot.get_context(self.remove_hyphens)

if pre:
Expand All @@ -208,7 +208,12 @@ def merge_strikeout_context(self, annot: Annotation, text: str) -> str:
if post:
post = trim_context(post, keep_right=False)

return pre + '~~' + text + '~~' + post
if annot.subtype == AnnotationType.StrikeOut:
return pre + '~~' + text + '~~' + post
else:
assert annot.subtype == AnnotationType.Caret
assert text.isspace()
return pre.rstrip(' ') + ' ^ ' + post.lstrip(' ')

def format_annot(
self,
Expand All @@ -229,8 +234,7 @@ def format_annot(
comment = [l for l in contents.splitlines() if l] if contents else []

if annot.has_context():
assert annot.subtype == AnnotationType.StrikeOut
text = self.merge_strikeout_context(annot, text)
text = self.merge_context(annot, text)

# we are either printing: item text and item contents, or one of the two
# if we see an annotation with neither, something has gone wrong
Expand Down
7 changes: 6 additions & 1 deletion pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,11 @@ def __init__(
box = Box(min(xvals), min(yvals), max(xvals), max(yvals))
boxes.append(box)

# Kludge: Caret annotations may lack quadpoints but still need to capture context, so fall back to the annotation's rect as their box.
if quadpoints is None and subtype == AnnotationType.Caret:
assert rect is not None
boxes.append(Box.from_coords(rect))

# Compute a meaningful position of this annotation on the page
assert rect or boxes
(x0, y0, x1, y1) = rect if rect else boxes[0].get_coords()
Expand Down Expand Up @@ -399,7 +404,7 @@ def gettext(self, remove_hyphens: bool = False) -> typ.Optional[str]:

def wants_context(self) -> bool:
"""Returns true if this annotation type should include context."""
return self.subtype == AnnotationType.StrikeOut
return self.subtype in {AnnotationType.Caret, AnnotationType.StrikeOut}

def set_pre_context(self, pre_context: str) -> None:
assert self.pre_context is None
Expand Down
11 changes: 11 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,17 @@ def test(self) -> None:
self.assertEqual(self.annots[2].gettext(), 'This was a novel idea at the time')


class Issue61(ExtractionTestBase):
    """Regression test for issue #61 (standalone Caret annotation).

    The fixture PDF is expected to produce exactly one annotation: a Caret
    whose contents hold the inserted text and which has context attached.
    """

    filename = 'issue61.pdf'

    def test(self) -> None:
        # Exactly one annotation should be extracted from the fixture.
        self.assertEqual(len(self.annots), 1)
        caret = self.annots[0]

        # It must be a Caret carrying the reviewer's inserted text ...
        self.assertEqual(caret.subtype, AnnotationType.Caret)
        self.assertEqual(caret.contents, 'and machine learning')

        # ... and it must report that it has context.
        self.assertTrue(caret.has_context())


class Pr24(ExtractionTestBase):
filename = 'pr24.pdf'

Expand Down
Binary file added tests/issue61.pdf
Binary file not shown.

0 comments on commit b23213f

Please sign in to comment.