Skip to content

Commit

Permalink
Use scroll for retrieving scans
Browse files Browse the repository at this point in the history
  • Loading branch information
marijnkoolen committed Dec 16, 2021
1 parent aa6410f commit 9301acd
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 3 deletions.
6 changes: 4 additions & 2 deletions republic/elastic/republic_retrieving.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,10 @@ def retrieve_scan_by_id(self, scan_id: str) -> Union[pdm.PageXMLScan, None]:
return pagexml.json_to_pagexml_scan(response['_source'])

def retrieve_scans_by_query(self, query: dict) -> List[pdm.PageXMLScan]:
- response = self.es_anno.search(index=self.config['scan_index'], body=query)
- return parse_hits_as_scans(response)
+ for hit in self.scroll_hits(self.es_anno, query, self.config['scan_index'], size=2, scroll='5m'):
+ yield pagexml.json_to_pagexml_scan(hit['_source'])
+ # response = self.es_anno.search(index=self.config['scan_index'], body=query)
+ # return parse_hits_as_scans(response)

def retrieve_text_repo_scans_by_inventory(self,
inventory_num: int) -> Generator[pdm.PageXMLScan, None, None]:
Expand Down
1 change: 0 additions & 1 deletion republic/parser/pagexml/republic_pagexml_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,6 @@ def split_merged_regions(text_regions: List[pdm.PageXMLTextRegion]) -> List[pdm.
# print('column separator box:', column_separator.coords.box)
# print('line box:', line.coords.box)
# raise ValueError('cannot sort line to left or right of separator')
- print('left:', len(left_lines), '\tright:', len(right_lines))
if len(left_lines) > 0:
left_coords = pdm.parse_derived_coords(left_lines)
left_tr = pdm.PageXMLTextRegion(lines=left_lines, coords=left_coords, metadata=tr.metadata)
Expand Down

0 comments on commit 9301acd

Please sign in to comment.