Skip to content

Commit

Permalink
Now discovers formatting for intraline page numbers.
Browse files Browse the repository at this point in the history
  • Loading branch information
Douglas H. King committed Jul 17, 2019
1 parent 2318fa5 commit 7bc55a3
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 20 deletions.
23 changes: 14 additions & 9 deletions discoverpagination/helpers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from math import floor
from typing import List, Dict, Tuple
from math import ceil
from typing import List, Dict, Tuple, TypeVar
from difflib import SequenceMatcher
from random import shuffle

PAGE_NUMBER_TEMPLATE_STR = r'${page_number}'
KT = TypeVar('KT')
VT = TypeVar('VT')

PAGE_NUMBER_NAME = r'page_number_of_a_really_long_length'
PAGE_NUMBER_TEMPLATE_STR = r'${' + PAGE_NUMBER_NAME + r'}'


def section_continuous_numbers(arr: List[int]) -> List[List[int]]:
Expand All @@ -28,9 +32,9 @@ def section_continuous_numbers(arr: List[int]) -> List[List[int]]:
return number_lists


def split_dictionary_in_half(d: Dict):
def shuffle_split_dictionary_in_half(d: Dict[KT, VT]) -> Tuple[Dict[KT, VT], Dict[KT, VT]]:
shuffled_keys = list(d.keys())
half_length = floor(len(shuffled_keys) / 2)
half_length = ceil(len(shuffled_keys) / 2)
shuffle(shuffled_keys)

first_half = {k: v for k, v in d.items() if shuffled_keys.index(k) <= half_length - 1}
Expand All @@ -39,7 +43,7 @@ def split_dictionary_in_half(d: Dict):
return first_half, second_half


def normalize_page_numbers_to_template(page_number_dict: Dict[int, Tuple[int, str]]) -> Dict[int, Tuple[int, str]]:
    """Replace each entry's own page number with the template placeholder.

    For every ``page_number -> (first, text)`` entry, a new entry is built
    under the same key. The first tuple element is passed through untouched
    and ``text`` has the decimal form of its key replaced by
    ``PAGE_NUMBER_TEMPLATE_STR``. The input mapping is not modified.

    :param page_number_dict: mapping of page number to a two-element tuple
        whose second element is the text containing that page number.
        (presumably the first element is a line number - confirm with callers)
    :return: a new dict with the same keys and templated text.
    """
    # str.replace substitutes *every* occurrence of the digits, including
    # ones embedded in a longer number on the same line (page 1 in
    # "1 of 10" touches both numbers).
    return {
        page: (marker[0], marker[1].replace(str(page), PAGE_NUMBER_TEMPLATE_STR))
        for page, marker in page_number_dict.items()
    }


Expand Down Expand Up @@ -70,7 +74,7 @@ def get_window_from_found_pages(section_pages: List[int], found_pages: Dict[int,
return start_line, end_line


def longest_match_in_page_marker_pair(page_markers: Dict[int, Tuple[int, str]]):
def longest_match_in_page_marker_pair(page_markers: Dict[int, Tuple[int, str]]) -> str:
if len(page_markers) != 2:
raise ValueError("'page_markers' must be a Dict of len 2.")
normalized_page_numbers = normalize_page_numbers_to_template(page_markers)
Expand All @@ -80,7 +84,7 @@ def longest_match_in_page_marker_pair(page_markers: Dict[int, Tuple[int, str]]):
return longest_match_from_list(normalized_lines)


def longest_match_from_list(normalized_lines: List[str]):
def longest_match_from_list(normalized_lines: List[str]) -> str:
if len(normalized_lines) != 2:
raise ValueError("'normalized_lines' must be a list of len 2.")

Expand Down Expand Up @@ -117,4 +121,5 @@ def get_lines_range_from_page_range(start_page: int, end_page: int, found_pages:
return start_line, end_line


__all__ = ['section_continuous_numbers', 'get_window_from_found_pages', 'sort_unique', 'get_lines_range_from_page_range', 'PAGE_NUMBER_TEMPLATE_STR', 'normalize_page_numbers_to_template', 'longest_match_in_page_marker_pair', 'longest_match_from_list', 'split_dictionary_in_half']
__all__ = ['section_continuous_numbers', 'get_window_from_found_pages', 'sort_unique', 'get_lines_range_from_page_range', 'PAGE_NUMBER_TEMPLATE_STR', 'normalize_page_numbers_to_template', 'longest_match_in_page_marker_pair', 'longest_match_from_list',
'shuffle_split_dictionary_in_half', 'PAGE_NUMBER_NAME']
21 changes: 11 additions & 10 deletions discoverpagination/paginateddocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from math import ceil

page_number_template = Template(
r"(?<!(rule|form|year|ears|tion) )(?<![-/,.$$:;])(?<!&#)" + PAGE_NUMBER_TEMPLATE_STR + r"(?![\-:,%/\d])(?!\W? (day|of|west|east|south|years?))(?!\.(\d|\S))")
r"(?<!(rule|form|year|ears|tion) )(?<!\d.)(?<![-/,$$:;])(?<!&#)(?<=\D)" + PAGE_NUMBER_TEMPLATE_STR + r"(?=\D)(?![\-:,%/\d])(?!\.(\d|\S))")
tag_attribute_PATTERN = r'(v?align|src|alt|colspan|rowspan|style|cellpadding|id|width|height|(bg)?color|cellspacing|border|face|name|size)=(\'|").*?(\3)'
tag_attribute_RE = re.compile(tag_attribute_PATTERN, flags=re.DOTALL | re.IGNORECASE)

Expand Down Expand Up @@ -59,18 +59,19 @@ def gather_intraline_combined_page_markers(page_markers_forward, page_markers_re
normalized_reverse = normalize_page_numbers_to_template(page_markers_reverse)

worsted_forward = {k: v for k, v in normalized_forward.items() if k >= forward_min + fifteen_percent_forward}
worsted_reverse = {k: v for k, v in page_markers_reverse.keys() if k <= reverse_max - fifteen_percent_reverse}
worsted_reverse = {k: v for k, v in normalized_reverse.items() if k <= reverse_max - fifteen_percent_reverse}

best_match_forward = recurse_through_page_markers(worsted_forward)
best_match_reverse = recurse_through_page_markers(worsted_reverse)
pass


def recurse_through_page_markers(page_markers: Dict[int, Tuple[int, str]]) -> str:
    """Reduce a mapping of page markers to one string by pairwise matching.

    A single-entry mapping yields the string stored at index 1 of its only
    value. A larger mapping is divided with
    ``shuffle_split_dictionary_in_half``; each half is reduced recursively
    and the two resulting strings are combined with
    ``longest_match_from_list``.

    :param page_markers: mapping of page number to a two-element tuple whose
        second element is the marker text.
    :return: the reduced marker string.

    NOTE(review): an empty mapping matches neither branch and the function
    returns ``None`` despite the ``-> str`` annotation - confirm callers
    never pass an empty dict, or decide whether this should raise.
    """
    if len(page_markers) == 1:
        # Base case: hand back the text of the only marker.
        return next(iter(page_markers.values()))[1]
    if len(page_markers) >= 2:
        first_dict, second_dict = shuffle_split_dictionary_in_half(page_markers)
        return longest_match_from_list([
            recurse_through_page_markers(first_dict),
            recurse_through_page_markers(second_dict),
        ])
    # Empty input: kept as the original fall-through result.
    return None


def iterative_page_number_search(template, document_slice, known_pages, start_page):
Expand All @@ -79,7 +80,7 @@ def iterative_page_number_search(template, document_slice, known_pages, start_pa
found_page = True
while found_page:
page_number = last_page + 1
page_find_regex = template.substitute(page_number=page_number)
page_find_regex = template.substitute({f"{PAGE_NUMBER_NAME}": page_number})

for line_number, line in enumerate(document_slice[last_index:], last_index):
found_page = re.search(page_find_regex, line)
Expand Down Expand Up @@ -108,7 +109,7 @@ def reversed_sliced_page_number_search(template, forward_document_slice, known_p
while found_page and start_page <= last_page:

page_number = last_page
page_find_regex = template.substitute(page_number=page_number)
page_find_regex = template.substitute({f"{PAGE_NUMBER_NAME}": page_number})
for line_number, line in reversed_document_slice:
found_page = re.search(page_find_regex, line, re.IGNORECASE)
if found_page:
Expand Down Expand Up @@ -234,7 +235,7 @@ def __init__(self, document: TextIO, start_page: int = 1, clean_xml=False, write
def get_page_template(self):
    """Return the page-marker template that occurs most often.

    Each entry of ``self.page_endings`` is turned into a template by
    replacing the entry's key (as a decimal string) inside the text at
    index 1 of its value with ``PAGE_NUMBER_TEMPLATE_STR``. The templates
    are tallied and the one with the highest count is returned; on a tie
    the template that was seen first wins, because ``max`` keeps the first
    maximal item.

    :raises ValueError: from ``max`` when ``self.page_endings`` is empty.
    """
    likely_page_markers = Counter()
    for page_number, ending in self.page_endings.items():
        discovered_template = ending[1].replace(str(page_number), PAGE_NUMBER_TEMPLATE_STR)
        likely_page_markers[discovered_template] += 1
    return max(likely_page_markers.items(), key=operator.itemgetter(1))[0]

Expand Down
70 changes: 69 additions & 1 deletion test/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,14 @@ def test_split_dictionary_in_half_len_2(self):
subject = {1: "one", 2: "two"}

# action
first, second = split_dictionary_in_half(subject)
first, second = shuffle_split_dictionary_in_half(subject)

# assert

self.assertEqual(2, len(subject))
self.assertEqual(1, len(first))
self.assertEqual(1, len(second))

if 1 not in first:
self.assertTrue(1 in second)

Expand All @@ -30,6 +35,69 @@ def test_split_dictionary_in_half_len_2(self):
if 2 in first:
self.assertTrue(2 not in second)

# No overlapping keys
self.assertFalse([k for k in first.keys() if k in second.keys()])

def test_split_dictionary_in_half_len_5(self):
    """An odd-sized dict splits 3/2, with the extra entry in the first half."""
    # arrange
    subject = {1: "one", 2: "two", 3: "three", 4: "four", 5: "five"}

    # action
    first, second = shuffle_split_dictionary_in_half(subject)

    # assert
    self.assertEqual(5, len(subject))
    self.assertEqual(3, len(first))
    self.assertEqual(2, len(second))

    # No overlapping keys (the list is shown in the failure message)
    self.assertFalse([k for k in first if k in second])

    # Nothing lost or altered: the halves together rebuild the input
    self.assertEqual(subject, {**first, **second})

def test_split_dictionary_in_half_len_6(self):
    """An even-sized dict splits into two halves of equal length."""
    # arrange
    subject = {1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six"}

    # action
    first, second = shuffle_split_dictionary_in_half(subject)

    # assert
    self.assertEqual(6, len(subject))
    self.assertEqual(3, len(first))
    self.assertEqual(3, len(second))

    # No overlapping keys (the list is shown in the failure message)
    self.assertFalse([k for k in first if k in second])

    # Nothing lost or altered: the halves together rebuild the input
    self.assertEqual(subject, {**first, **second})

def test_split_dictionary_in_half_len_1(self):
    """A single-entry dict goes entirely to the first half."""
    # arrange
    subject = {1: "one"}

    # action
    first, second = shuffle_split_dictionary_in_half(subject)

    # assert
    self.assertEqual(1, len(subject))
    self.assertEqual(1, len(first))
    self.assertEqual(0, len(second))

    # No overlapping keys (the list is shown in the failure message)
    self.assertFalse([k for k in first if k in second])

    # Nothing lost or altered: the halves together rebuild the input
    self.assertEqual(subject, {**first, **second})

def test_split_dictionary_in_half_len_0(self):
    """An empty dict splits into two empty halves."""
    # arrange
    subject = {}

    # action
    first, second = shuffle_split_dictionary_in_half(subject)

    # assert
    self.assertEqual(0, len(subject))
    self.assertEqual(0, len(first))
    self.assertEqual(0, len(second))

    # No overlapping keys (the list is shown in the failure message)
    self.assertFalse([k for k in first if k in second])

    # Nothing lost or altered: the halves together rebuild the input
    self.assertEqual(subject, {**first, **second})


class TestSectionContinuousNumbers(TestCase):

Expand Down

0 comments on commit 7bc55a3

Please sign in to comment.