Skip to content

Commit

Permalink
TLDR-783 Add notebooks page to docs (#491)
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget authored Sep 3, 2024
1 parent cbaa665 commit 47486dd
Show file tree
Hide file tree
Showing 33 changed files with 363 additions and 141 deletions.
10 changes: 10 additions & 0 deletions dedoc/api/schema/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ class Annotation(BaseModel):
"""
The piece of information about the text line: its appearance or links to another document object.
For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic.
:ivar start: start of the annotated text
:ivar end: end of the annotated text (end isn't included)
:ivar name: annotation's name, specific for each type of annotation
:ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc.
:vartype start: int
:vartype end: int
:vartype name: str
:vartype value: str
"""
start: int = Field(description="Start of the annotated text", example=0)
end: int = Field(description="End of the annotated text (end isn't included)", example=5)
Expand Down
10 changes: 10 additions & 0 deletions dedoc/api/schema/cell_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@
class CellWithMeta(BaseModel):
"""
Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).
:ivar lines: list of textual lines of the cell
:ivar colspan: number of columns to span (for cells merged horizontally)
:ivar rowspan: number of rows to span (for cells merged vertically)
:ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display)
:vartype lines: List[LineWithMeta]
:vartype colspan: int
:vartype rowspan: int
:vartype invisible: bool
"""
lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations")
rowspan: int = Field(description="Number of rows to span like in HTML format", example=1)
Expand Down
6 changes: 6 additions & 0 deletions dedoc/api/schema/document_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
class DocumentContent(BaseModel):
"""
Content of the document - structured text and tables.
:ivar tables: list of document tables
:ivar structure: tree structure of the document nodes with text and additional metadata
:vartype tables: List[Table]
:vartype structure: TreeNode
"""
structure: TreeNode = Field(description="Tree structure where content of the document is organized")
tables: List[Table] = Field(description="List of document tables")
20 changes: 20 additions & 0 deletions dedoc/api/schema/document_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,26 @@
class DocumentMetadata(BaseModel):
"""
Document metadata like its name, size, author, etc.
:ivar file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on)
:ivar temporary_file_name: file name during parsing (unique name after rename and conversion)
:ivar size: size of the original file in bytes
:ivar modified_time: time of the last modification in unix time format (seconds since the epoch)
:ivar created_time: time of the creation in unix time format (seconds since the epoch)
:ivar access_time: time of the last access to the file in unix time format (seconds since the epoch)
:ivar file_type: mime type of the file
:ivar uid: document unique identifier (useful for attached files)
:vartype file_name: str
:vartype temporary_file_name: str
:vartype size: int
:vartype modified_time: int
:vartype created_time: int
:vartype access_time: int
:vartype file_type: str
:vartype uid: str
Additional variables may be added with other file metadata.
"""
class Config:
extra = Extra.allow
Expand Down
12 changes: 11 additions & 1 deletion dedoc/api/schema/line_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,20 @@
class LineMetadata(BaseModel):
"""
Holds information about document node/line metadata, such as page number or line type.
:ivar paragraph_type: type of the document line/paragraph (header, list_item, list, etc.)
:ivar page_id: page number where paragraph starts, the numeration starts from page 0
:ivar line_id: line number inside the entire document, the numeration starts from line 0
:vartype paragraph_type: str
:vartype page_id: int
:vartype line_id: Optional[int]
Additional variables may be added with other line metadata.
"""
class Config:
extra = Extra.allow

paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text")
paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list, etc.)", example="raw_text")
page_id: int = Field(description="Page number of the line/paragraph beginning", example=0)
line_id: Optional[int] = Field(description="Line number", example=1)
8 changes: 7 additions & 1 deletion dedoc/api/schema/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
class LineWithMeta(BaseModel):
"""
Textual line with text annotations.
:ivar text: text of the line
:ivar annotations: text annotations (font, size, bold, italic, etc.)
:vartype text: str
:vartype annotations: List[Annotation]
"""
text: str = Field(description="Text of the line", example="Some text")
annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic and etc)")
annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic, etc.)")
12 changes: 12 additions & 0 deletions dedoc/api/schema/parsed_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,18 @@
class ParsedDocument(BaseModel):
"""
Holds information about the document content, metadata and attachments.
:ivar content: document text (hierarchy of nodes) and tables
:ivar attachments: result of analysis of attached files (empty if with_attachments=False)
:ivar metadata: document metadata such as size, creation date and so on.
:ivar warnings: list of warnings and possible errors, arising in the process of document parsing
:ivar version: version of the program that parsed this document
:vartype content: DocumentContent
:vartype attachments: List[ParsedDocument]
:vartype metadata: DocumentMetadata
:vartype warnings: List[str]
:vartype version: str
"""
content: DocumentContent = Field(description="Document text and tables")
metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on")
Expand Down
6 changes: 6 additions & 0 deletions dedoc/api/schema/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ class Table(BaseModel):
Holds information about tables in the document.
We assume that a table has rectangle form (has the same number of columns in each row).
Table representation is row-based i.e. external list contains list of rows.
:ivar metadata: table metadata such as location, title and so on
:ivar cells: a list of lists of table cells (cell has text lines, colspan and rowspan attributes)
:vartype metadata: TableMetadata
:vartype cells: List[List[CellWithMeta]]
"""
cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)")
metadata: TableMetadata = Field(description="Table meta information")
10 changes: 10 additions & 0 deletions dedoc/api/schema/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,16 @@
class TableMetadata(BaseModel):
"""
Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
:ivar page_id: number of the page where table starts
:ivar uid: unique identifier of the table (used for linking table to text)
:ivar rotated_angle: value of the rotation angle by which the table was rotated during recognition
:ivar title: table's title
:vartype page_id: Optional[int]
:vartype uid: str
:vartype rotated_angle: float
:vartype title: str
"""
page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0)
uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f")
Expand Down
12 changes: 12 additions & 0 deletions dedoc/api/schema/tree_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@ class TreeNode(BaseModel):
"""
Helps to represent document as recursive tree structure.
It has list of children `TreeNode` nodes (empty list for a leaf node).
:ivar node_id: unique node identifier
:ivar text: text of the node (may contain several lines)
:ivar annotations: metadata related to parts of the text (such as font size)
:ivar metadata: metadata that refers to the entire node (such as node type)
:ivar subparagraphs: list of children of this node
:vartype node_id: str
:vartype text: str
:vartype annotations: List[Annotation]
:vartype metadata: LineMetadata
:vartype subparagraphs: List[TreeNode]
"""
node_id: str = Field(description="Document element identifier. It is unique within a document content tree. "
"The identifier consists of numbers separated by dots where each number "
Expand Down
24 changes: 18 additions & 6 deletions dedoc/data_structures/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@ class Annotation(Serializable):
Base class for text annotations of all kinds.
Annotation is the piece of information about the text line: its appearance or links to another document object.
Look at the concrete kinds of annotations to get more examples.
:ivar start: start of the annotated text
:ivar end: end of the annotated text (end isn't included)
:ivar name: annotation's name, specific for each type of annotation
:ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc.
:ivar is_mergeable: is it possible to merge annotations with the same value
:vartype start: int
:vartype end: int
:vartype name: str
:vartype value: str
:vartype is_mergeable: bool
"""

def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bool = True) -> None:
Expand All @@ -20,11 +32,11 @@ def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bo
:param value: information about annotated text
:param is_mergeable: is it possible to merge annotations with the same value
"""
self.start = start
self.end = end
self.name = name
self.value = value
self.is_mergeable = is_mergeable
self.start: int = start
self.end: int = end
self.name: str = name
self.value: str = value
self.is_mergeable: bool = is_mergeable

def __eq__(self, o: object) -> bool:
if not isinstance(o, Annotation):
Expand All @@ -35,7 +47,7 @@ def __str__(self) -> str:
return f"{self.name.capitalize()}({self.start}:{self.end}, {self.value})"

def __repr__(self) -> str:
return f"{self.name.capitalize()}(...)"
return self.__str__()

def to_api_schema(self) -> ApiAnnotation:
return ApiAnnotation(start=self.start, end=self.end, name=self.name, value=self.value)
22 changes: 16 additions & 6 deletions dedoc/data_structures/attached_file.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
class AttachedFile:
"""
Holds information about files, attached to the parsed document.
:ivar original_name: original name of the attached file if it was possible to extract it
:ivar tmp_file_path: path to the attached file on disk - its name is different from original_name
:ivar need_content_analysis: does the attached file need parsing (enable recursive parsing in :class:`~dedoc.DedocManager`)
:ivar uid: unique identifier of the attached file
:vartype original_name: str
:vartype tmp_file_path: str
:vartype need_content_analysis: bool
:vartype uid: str
"""
def __init__(self, original_name: str, tmp_file_path: str, need_content_analysis: bool, uid: str) -> None:
"""
:param original_name: Name of the file from which the attachments are extracted
:param tmp_file_path: path to the attachment file.
:param original_name: original name of the attached file
:param tmp_file_path: path to the attachment file
:param need_content_analysis: indicates whether the attachment's content should be parsed or simply saved without parsing
:param uid: unique identifier of the attachment
"""
self.original_name = original_name
self.tmp_file_path = tmp_file_path
self.need_content_analysis = need_content_analysis
self.uid = uid
self.original_name: str = original_name
self.tmp_file_path: str = tmp_file_path
self.need_content_analysis: bool = need_content_analysis
self.uid: str = uid

def get_filename_in_path(self) -> str:
return self.tmp_file_path
Expand Down
18 changes: 14 additions & 4 deletions dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@
class CellWithMeta(Serializable):
"""
This class holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).
:ivar lines: list of textual lines of the cell
:ivar colspan: number of columns to span (for cells merged horizontally)
:ivar rowspan: number of rows to span (for cells merged vertically)
:ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display)
:vartype lines: List[LineWithMeta]
:vartype colspan: int
:vartype rowspan: int
:vartype invisible: bool
"""
def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
"""
Expand All @@ -17,10 +27,10 @@ def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1
:param rowspan: number of rows to span like in HTML format
:param invisible: indicator for displaying or hiding cell text
"""
self.lines = lines
self.colspan = colspan
self.rowspan = rowspan
self.invisible = invisible
self.lines: List[LineWithMeta] = lines
self.colspan: int = colspan
self.rowspan: int = rowspan
self.invisible: bool = invisible

def __repr__(self) -> str:
return f"CellWithMeta({self.get_text()[:65]})"
Expand Down
20 changes: 14 additions & 6 deletions dedoc/data_structures/document_content.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Optional

from dedoc.api.schema.document_content import DocumentContent as ApiDocumentContent
from dedoc.data_structures.serializable import Serializable
Expand All @@ -9,16 +9,24 @@
class DocumentContent(Serializable):
"""
This class holds the document content - structured text and tables.
:ivar tables: list of document tables
:ivar structure: tree structure of the document nodes with text and additional metadata
:ivar warnings: list of warnings, obtained in the process of the document parsing
:vartype tables: List[Table]
:vartype structure: TreeNode
:vartype warnings: List[str]
"""
def __init__(self, tables: List[Table], structure: TreeNode, warnings: List[str] = None) -> None:
def __init__(self, tables: List[Table], structure: TreeNode, warnings: Optional[List[str]] = None) -> None:
"""
:param tables: list of document tables
:param structure: tree structure in which content of the document is organized
:param warnings: list of warnings, obtained in the process of the document structure constructing
:param warnings: list of warnings
"""
self.tables = tables
self.structure = structure
self.warnings = warnings if warnings is not None else []
self.tables: List[Table] = tables
self.structure: TreeNode = structure
self.warnings: List[str] = warnings if warnings is not None else []

def to_api_schema(self) -> ApiDocumentContent:
structure = self.structure.to_api_schema()
Expand Down
Loading

0 comments on commit 47486dd

Please sign in to comment.