Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-783 Add notebooks page to docs #491

Merged
merged 2 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions dedoc/api/schema/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ class Annotation(BaseModel):
"""
The piece of information about the text line: its appearance or links to another document object.
For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic.

:ivar start: start of the annotated text
:ivar end: end of the annotated text (end isn't included)
:ivar name: annotation's name, specific for each type of annotation
:ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc.

:vartype start: int
:vartype end: int
:vartype name: str
:vartype value: str
"""
start: int = Field(description="Start of the annotated text", example=0)
end: int = Field(description="End of the annotated text (end isn't included)", example=5)
Expand Down
10 changes: 10 additions & 0 deletions dedoc/api/schema/cell_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@
class CellWithMeta(BaseModel):
"""
Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).

:ivar lines: list of textual lines of the cell
:ivar colspan: number of columns to span (for cells merged horizontally)
:ivar rowspan: number of rows to span (for cells merged vertically)
:ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display)

:vartype lines: List[LineWithMeta]
:vartype colspan: int
:vartype rowspan: int
:vartype invisible: bool
"""
lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations")
rowspan: int = Field(description="Number of rows to span like in HTML format", example=1)
Expand Down
6 changes: 6 additions & 0 deletions dedoc/api/schema/document_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
class DocumentContent(BaseModel):
"""
Content of the document - structured text and tables.
:ivar tables: list of document tables
:ivar structure: tree structure of the document nodes with text and additional metadata
:vartype tables: List[Table]
:vartype structure: TreeNode
"""
structure: TreeNode = Field(description="Tree structure where content of the document is organized")
tables: List[Table] = Field(description="List of document tables")
20 changes: 20 additions & 0 deletions dedoc/api/schema/document_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,26 @@
class DocumentMetadata(BaseModel):
"""
Document metadata like its name, size, author, etc.

:ivar file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on)
:ivar temporary_file_name: file name during parsing (unique name after rename and conversion)
:ivar size: size of the original file in bytes
:ivar modified_time: time of the last modification in unix time format (seconds since the epoch)
:ivar created_time: time of the creation in unix time format (seconds since the epoch)
:ivar access_time: time of the last access to the file in unix time format (seconds since the epoch)
:ivar file_type: mime type of the file
:ivar uid: document unique identifier (useful for attached files)

:vartype file_name: str
:vartype temporary_file_name: str
:vartype size: int
:vartype modified_time: int
:vartype created_time: int
:vartype access_time: int
:vartype file_type: str
:vartype uid: str

Additional variables may be added with other file metadata.
"""
class Config:
extra = Extra.allow
Expand Down
12 changes: 11 additions & 1 deletion dedoc/api/schema/line_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,20 @@
class LineMetadata(BaseModel):
"""
Holds information about document node/line metadata, such as page number or line type.

:ivar paragraph_type: type of the document line/paragraph (header, list_item, list, etc.)
:ivar page_id: page number where paragraph starts, the numeration starts from page 0
:ivar line_id: line number inside the entire document, the numeration starts from line 0

:vartype paragraph_type: str
:vartype page_id: int
:vartype line_id: Optional[int]

Additional variables may be added with other line metadata.
"""
class Config:
extra = Extra.allow

paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text")
paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list, etc.)", example="raw_text")
page_id: int = Field(description="Page number of the line/paragraph beginning", example=0)
line_id: Optional[int] = Field(description="Line number", example=1)
8 changes: 7 additions & 1 deletion dedoc/api/schema/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
class LineWithMeta(BaseModel):
"""
Textual line with text annotations.

:ivar text: text of the line
:ivar annotations: text annotations (font, size, bold, italic, etc.)

:vartype text: str
:vartype annotations: List[Annotation]
"""
text: str = Field(description="Text of the line", example="Some text")
annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic and etc)")
annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic, etc.)")
12 changes: 12 additions & 0 deletions dedoc/api/schema/parsed_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,18 @@
class ParsedDocument(BaseModel):
"""
Holds information about the document content, metadata and attachments.

:ivar content: document text (hierarchy of nodes) and tables
:ivar attachments: result of analysis of attached files (empty if with_attachments=False)
:ivar metadata: document metadata such as size, creation date and so on.
:ivar warnings: list of warnings and possible errors, arising in the process of document parsing
:ivar version: version of the program that parsed this document

:vartype content: DocumentContent
:vartype attachments: List[ParsedDocument]
:vartype metadata: DocumentMetadata
:vartype warnings: List[str]
:vartype version: str
"""
content: DocumentContent = Field(description="Document text and tables")
metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on")
Expand Down
6 changes: 6 additions & 0 deletions dedoc/api/schema/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ class Table(BaseModel):
Holds information about tables in the document.
We assume that a table has rectangle form (has the same number of columns in each row).
Table representation is row-based i.e. external list contains list of rows.

:ivar metadata: table metadata such as location, title and so on
:ivar cells: a list of lists of table cells (cell has text lines, colspan and rowspan attributes)

:vartype metadata: TableMetadata
:vartype cells: List[List[CellWithMeta]]
"""
cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)")
metadata: TableMetadata = Field(description="Table meta information")
10 changes: 10 additions & 0 deletions dedoc/api/schema/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,16 @@
class TableMetadata(BaseModel):
"""
Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.

:ivar page_id: number of the page where table starts
:ivar uid: unique identifier of the table (used for linking table to text)
:ivar rotated_angle: value of the rotation angle by which the table was rotated during recognition
:ivar title: table's title

:vartype page_id: Optional[int]
:vartype uid: str
:vartype rotated_angle: float
:vartype title: str
"""
page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0)
uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f")
Expand Down
12 changes: 12 additions & 0 deletions dedoc/api/schema/tree_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@ class TreeNode(BaseModel):
"""
Helps to represent document as recursive tree structure.
It has list of children `TreeNode` nodes (empty list for a leaf node).
:ivar node_id: unique node identifier
:ivar text: text of the node (may contain several lines)
:ivar annotations: some metadata related to the part of the text (such as font size)
:ivar metadata: metadata that refers to the entire node (such as node type)
:ivar subparagraphs: list of children of this node
:vartype node_id: str
:vartype text: str
:vartype annotations: List[Annotation]
:vartype metadata: LineMetadata
:vartype subparagraphs: List[TreeNode]
"""
node_id: str = Field(description="Document element identifier. It is unique within a document content tree. "
"The identifier consists of numbers separated by dots where each number "
Expand Down
24 changes: 18 additions & 6 deletions dedoc/data_structures/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@ class Annotation(Serializable):
Base class for text annotations of all kinds.
Annotation is the piece of information about the text line: its appearance or links to another document object.
Look to the concrete kind of annotations to get more examples.

:ivar start: start of the annotated text
:ivar end: end of the annotated text (end isn't included)
:ivar name: annotation's name, specific for each type of annotation
:ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc.
:ivar is_mergeable: is it possible to merge annotations with the same value

:vartype start: int
:vartype end: int
:vartype name: str
:vartype value: str
:vartype is_mergeable: bool
"""

def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bool = True) -> None:
Expand All @@ -20,11 +32,11 @@ def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bo
:param value: information about annotated text
:param is_mergeable: is it possible to merge annotations with the same value
"""
self.start = start
self.end = end
self.name = name
self.value = value
self.is_mergeable = is_mergeable
self.start: int = start
self.end: int = end
self.name: str = name
self.value: str = value
self.is_mergeable: bool = is_mergeable

def __eq__(self, o: object) -> bool:
if not isinstance(o, Annotation):
Expand All @@ -35,7 +47,7 @@ def __str__(self) -> str:
return f"{self.name.capitalize()}({self.start}:{self.end}, {self.value})"

def __repr__(self) -> str:
return f"{self.name.capitalize()}(...)"
return self.__str__()

def to_api_schema(self) -> ApiAnnotation:
return ApiAnnotation(start=self.start, end=self.end, name=self.name, value=self.value)
22 changes: 16 additions & 6 deletions dedoc/data_structures/attached_file.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
class AttachedFile:
"""
Holds information about files, attached to the parsed document.

:ivar original_name: original name of the attached file if it was possible to extract it
:ivar tmp_file_path: path to the attached file on disk - its name is different from original_name
:ivar need_content_analysis: does the attached file need parsing (enable recursive parsing in :class:`~dedoc.DedocManager`)
:ivar uid: unique identifier of the attached file

:vartype original_name: str
:vartype tmp_file_path: str
:vartype need_content_analysis: bool
:vartype uid: str
"""
def __init__(self, original_name: str, tmp_file_path: str, need_content_analysis: bool, uid: str) -> None:
"""
:param original_name: Name of the file from which the attachments are extracted
:param tmp_file_path: path to the attachment file.
:param original_name: original name of the attached file
:param tmp_file_path: path to the attachment file
:param need_content_analysis: indicator of whether the attachment's content should be parsed or simply saved without parsing
:param uid: unique identifier of the attachment
"""
self.original_name = original_name
self.tmp_file_path = tmp_file_path
self.need_content_analysis = need_content_analysis
self.uid = uid
self.original_name: str = original_name
self.tmp_file_path: str = tmp_file_path
self.need_content_analysis: bool = need_content_analysis
self.uid: str = uid

def get_filename_in_path(self) -> str:
return self.tmp_file_path
Expand Down
18 changes: 14 additions & 4 deletions dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@
class CellWithMeta(Serializable):
"""
This class holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).

:ivar lines: list of textual lines of the cell
:ivar colspan: number of columns to span (for cells merged horizontally)
:ivar rowspan: number of rows to span (for cells merged vertically)
:ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display)

:vartype lines: List[LineWithMeta]
:vartype colspan: int
:vartype rowspan: int
:vartype invisible: bool
"""
def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
"""
Expand All @@ -17,10 +27,10 @@ def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1
:param rowspan: number of rows to span like in HTML format
:param invisible: indicator for displaying or hiding cell text
"""
self.lines = lines
self.colspan = colspan
self.rowspan = rowspan
self.invisible = invisible
self.lines: List[LineWithMeta] = lines
self.colspan: int = colspan
self.rowspan: int = rowspan
self.invisible: bool = invisible

def __repr__(self) -> str:
return f"CellWithMeta({self.get_text()[:65]})"
Expand Down
20 changes: 14 additions & 6 deletions dedoc/data_structures/document_content.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Optional

from dedoc.api.schema.document_content import DocumentContent as ApiDocumentContent
from dedoc.data_structures.serializable import Serializable
Expand All @@ -9,16 +9,24 @@
class DocumentContent(Serializable):
"""
This class holds the document content - structured text and tables.

:ivar tables: list of document tables
:ivar structure: tree structure of the document nodes with text and additional metadata
:ivar warnings: list of warnings, obtained in the process of the document parsing

:vartype tables: List[Table]
:vartype structure: TreeNode
:vartype warnings: List[str]
"""
def __init__(self, tables: List[Table], structure: TreeNode, warnings: List[str] = None) -> None:
def __init__(self, tables: List[Table], structure: TreeNode, warnings: Optional[List[str]] = None) -> None:
"""
:param tables: list of document tables
:param structure: tree structure in which content of the document is organized
:param warnings: list of warnings, obtained in the process of the document structure constructing
:param warnings: list of warnings
"""
self.tables = tables
self.structure = structure
self.warnings = warnings if warnings is not None else []
self.tables: List[Table] = tables
self.structure: TreeNode = structure
self.warnings: List[str] = warnings if warnings is not None else []

def to_api_schema(self) -> ApiDocumentContent:
structure = self.structure.to_api_schema()
Expand Down
Loading
Loading