diff --git a/dedoc/api/schema/annotation.py b/dedoc/api/schema/annotation.py index 9add75dd..225de396 100644 --- a/dedoc/api/schema/annotation.py +++ b/dedoc/api/schema/annotation.py @@ -5,6 +5,16 @@ class Annotation(BaseModel): """ The piece of information about the text line: it's appearance or links to another document object. For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic. + + :ivar start: start of the annotated text + :ivar end: end of the annotated text (end isn't included) + :ivar name: annotation's name, specific for each type of annotation + :ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc. + + :vartype start: int + :vartype end: int + :vartype name: str + :vartype value: str """ start: int = Field(description="Start of the annotated text", example=0) end: int = Field(description="End of the annotated text (end isn't included)", example=5) diff --git a/dedoc/api/schema/cell_with_meta.py b/dedoc/api/schema/cell_with_meta.py index efeb0fdf..05cb6f66 100644 --- a/dedoc/api/schema/cell_with_meta.py +++ b/dedoc/api/schema/cell_with_meta.py @@ -8,6 +8,16 @@ class CellWithMeta(BaseModel): """ Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible). 
+ + :ivar lines: list of textual lines of the cell + :ivar colspan: number of columns to span (for cells merged horizontally) + :ivar rowspan: number of rows to span (for cells merged vertically) + :ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display) + + :vartype lines: List[LineWithMeta] + :vartype colspan: int + :vartype rowspan: int + :vartype invisible: bool """ lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations") rowspan: int = Field(description="Number of rows to span like in HTML format", example=1) diff --git a/dedoc/api/schema/document_content.py b/dedoc/api/schema/document_content.py index 5127650e..e9d8a47c 100644 --- a/dedoc/api/schema/document_content.py +++ b/dedoc/api/schema/document_content.py @@ -9,6 +9,12 @@ class DocumentContent(BaseModel): """ Content of the document - structured text and tables. + + :ivar tables: list of document tables + :ivar structure: tree structure of the document nodes with text and additional metadata + + :vartype tables: List[Table] + :vartype structure: TreeNode """ structure: TreeNode = Field(description="Tree structure where content of the document is organized") tables: List[Table] = Field(description="List of document tables") diff --git a/dedoc/api/schema/document_metadata.py b/dedoc/api/schema/document_metadata.py index 4d814fc3..fb45c075 100644 --- a/dedoc/api/schema/document_metadata.py +++ b/dedoc/api/schema/document_metadata.py @@ -4,6 +4,26 @@ class DocumentMetadata(BaseModel): """ Document metadata like its name, size, author, etc. 
+ + :ivar file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on) + :ivar temporary_file_name: file name during parsing (unique name after rename and conversion) + :ivar size: size of the original file in bytes + :ivar modified_time: time of the last modification in unix time format (seconds since the epoch) + :ivar created_time: time of the creation in unixtime + :ivar access_time: time of the last access to the file in unixtime + :ivar file_type: mime type of the file + :ivar uid: document unique identifier (useful for attached files) + + :vartype file_name: str + :vartype temporary_file_name: str + :vartype size: int + :vartype modified_time: int + :vartype created_time: int + :vartype access_time: int + :vartype file_type: str + :vartype uid: str + + Additional variables may be added with other file metadata. """ class Config: extra = Extra.allow diff --git a/dedoc/api/schema/line_metadata.py b/dedoc/api/schema/line_metadata.py index 37e893d8..e123f28d 100644 --- a/dedoc/api/schema/line_metadata.py +++ b/dedoc/api/schema/line_metadata.py @@ -6,10 +6,20 @@ class LineMetadata(BaseModel): """ Holds information about document node/line metadata, such as page number or line type. + + :ivar paragraph_type: type of the document line/paragraph (header, list_item, list, etc.) + :ivar page_id: page number where paragraph starts, the numeration starts from page 0 + :ivar line_id: line number inside the entire document, the numeration starts from line 0 + + :vartype paragraph_type: str + :vartype page_id: int + :vartype line_id: Optional[int] + + Additional variables may be added with other line metadata. 
""" class Config: extra = Extra.allow - paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text") + paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list, etc.)", example="raw_text") page_id: int = Field(description="Page number of the line/paragraph beginning", example=0) line_id: Optional[int] = Field(description="Line number", example=1) diff --git a/dedoc/api/schema/line_with_meta.py b/dedoc/api/schema/line_with_meta.py index 1c155ab5..a8f61b1d 100644 --- a/dedoc/api/schema/line_with_meta.py +++ b/dedoc/api/schema/line_with_meta.py @@ -8,6 +8,12 @@ class LineWithMeta(BaseModel): """ Textual line with text annotations. + + :ivar text: text of the line + :ivar annotations: text annotations (font, size, bold, italic, etc.) + + :vartype text: str + :vartype annotations: List[Annotation] """ text: str = Field(description="Text of the line", example="Some text") - annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic and etc)") + annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic, etc.)") diff --git a/dedoc/api/schema/parsed_document.py b/dedoc/api/schema/parsed_document.py index 076540a4..d4a7d846 100644 --- a/dedoc/api/schema/parsed_document.py +++ b/dedoc/api/schema/parsed_document.py @@ -9,6 +9,18 @@ class ParsedDocument(BaseModel): """ Holds information about the document content, metadata and attachments. + + :ivar content: document text (hierarchy of nodes) and tables + :ivar attachments: result of analysis of attached files (empty if with_attachments=False) + :ivar metadata: document metadata such as size, creation date and so on. 
+ :ivar warnings: list of warnings and possible errors, arising in the process of document parsing + :ivar version: version of the program that parsed this document + + :vartype content: DocumentContent + :vartype attachments: List[ParsedDocument] + :vartype metadata: DocumentMetadata + :vartype warnings: List[str] + :vartype version: str """ content: DocumentContent = Field(description="Document text and tables") metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on") diff --git a/dedoc/api/schema/table.py b/dedoc/api/schema/table.py index 52b2b59c..e834f1bf 100644 --- a/dedoc/api/schema/table.py +++ b/dedoc/api/schema/table.py @@ -11,6 +11,12 @@ class Table(BaseModel): Holds information about tables in the document. We assume that a table has rectangle form (has the same number of columns in each row). Table representation is row-based i.e. external list contains list of rows. + + :ivar metadata: table metadata as location, title and so on + :ivar cells: a list of lists of table cells (cell has text lines, colspan and rowspan attributes) + + :vartype metadata: TableMetadata + :vartype cells: List[List[CellWithMeta]] """ cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)") metadata: TableMetadata = Field(description="Table meta information") diff --git a/dedoc/api/schema/table_metadata.py b/dedoc/api/schema/table_metadata.py index 53299a16..b75dbc21 100644 --- a/dedoc/api/schema/table_metadata.py +++ b/dedoc/api/schema/table_metadata.py @@ -6,6 +6,16 @@ class TableMetadata(BaseModel): """ Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on. 
+ + :ivar page_id: number of the page where table starts + :ivar uid: unique identifier of the table (used for linking table to text) + :ivar rotated_angle: value of the rotation angle by which the table was rotated during recognition + :ivar title: table's title + + :vartype page_id: Optional[int] + :vartype uid: str + :vartype rotated_angle: float + :vartype title: str """ page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0) uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f") diff --git a/dedoc/api/schema/tree_node.py b/dedoc/api/schema/tree_node.py index 5eeedd42..2aeeccae 100644 --- a/dedoc/api/schema/tree_node.py +++ b/dedoc/api/schema/tree_node.py @@ -10,6 +10,18 @@ class TreeNode(BaseModel): """ Helps to represent document as recursive tree structure. It has list of children `TreeNode` nodes (empty list for a leaf node). + + :ivar node_id: unique node identifier + :ivar text: text of the node (may contain several lines) + :ivar annotations: some metadata related to the part of the text (as font size) + :ivar metadata: metadata refers to entire node (as node type) + :ivar subparagraphs: list of child of this node + + :vartype node_id: str + :vartype text: str + :vartype annotations: List[Annotation] + :vartype metadata: LineMetadata + :vartype subparagraphs: List[TreeNode] """ node_id: str = Field(description="Document element identifier. It is unique within a document content tree. " "The identifier consists of numbers separated by dots where each number " diff --git a/dedoc/data_structures/annotation.py b/dedoc/data_structures/annotation.py index 2820f5a4..3cede658 100644 --- a/dedoc/data_structures/annotation.py +++ b/dedoc/data_structures/annotation.py @@ -7,6 +7,18 @@ class Annotation(Serializable): Base class for text annotations of all kinds. 
Annotation is the piece of information about the text line: it's appearance or links to another document object. Look to the concrete kind of annotations to get mode examples. + + :ivar start: start of the annotated text + :ivar end: end of the annotated text (end isn't included) + :ivar name: annotation's name, specific for each type of annotation + :ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc. + :ivar is_mergeable: is it possible to merge annotations with the same value + + :vartype start: int + :vartype end: int + :vartype name: str + :vartype value: str + :vartype is_mergeable: bool """ def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bool = True) -> None: @@ -20,11 +32,11 @@ def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bo :param value: information about annotated text :param is_mergeable: is it possible to merge annotations with the same value """ - self.start = start - self.end = end - self.name = name - self.value = value - self.is_mergeable = is_mergeable + self.start: int = start + self.end: int = end + self.name: str = name + self.value: str = value + self.is_mergeable: bool = is_mergeable def __eq__(self, o: object) -> bool: if not isinstance(o, Annotation): @@ -35,7 +47,7 @@ def __str__(self) -> str: return f"{self.name.capitalize()}({self.start}:{self.end}, {self.value})" def __repr__(self) -> str: - return f"{self.name.capitalize()}(...)" + return self.__str__() def to_api_schema(self) -> ApiAnnotation: return ApiAnnotation(start=self.start, end=self.end, name=self.name, value=self.value) diff --git a/dedoc/data_structures/attached_file.py b/dedoc/data_structures/attached_file.py index c838acd6..9aac4d09 100644 --- a/dedoc/data_structures/attached_file.py +++ b/dedoc/data_structures/attached_file.py @@ -1,18 +1,28 @@ class AttachedFile: """ Holds information about files, attached to the parsed document. 
+ + :ivar original_name: original name of the attached file if it was possible to extract it + :ivar tmp_file_path: path to the attached file on disk - its name is different from original_name + :ivar need_content_analysis: does the attached file need parsing (enable recursive parsing in :class:`~dedoc.DedocManager`) + :ivar uid: unique identifier of the attached file + + :vartype original_name: str + :vartype tmp_file_path: str + :vartype need_content_analysis: bool + :vartype uid: str """ def __init__(self, original_name: str, tmp_file_path: str, need_content_analysis: bool, uid: str) -> None: """ - :param original_name: Name of the file from which the attachments are extracted - :param tmp_file_path: path to the attachment file. + :param original_name: original name of the attached file + :param tmp_file_path: path to the attachment file :param need_content_analysis: indicator should we parse the attachment's content or simply save it without parsing :param uid: unique identifier of the attachment """ - self.original_name = original_name - self.tmp_file_path = tmp_file_path - self.need_content_analysis = need_content_analysis - self.uid = uid + self.original_name: str = original_name + self.tmp_file_path: str = tmp_file_path + self.need_content_analysis: bool = need_content_analysis + self.uid: str = uid def get_filename_in_path(self) -> str: return self.tmp_file_path diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py index c2ff7cac..133d69bf 100644 --- a/dedoc/data_structures/cell_with_meta.py +++ b/dedoc/data_structures/cell_with_meta.py @@ -9,6 +9,16 @@ class CellWithMeta(Serializable): """ This class holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible). 
+ + :ivar lines: list of textual lines of the cell + :ivar colspan: number of columns to span (for cells merged horizontally) + :ivar rowspan: number of rows to span (for cells merged vertically) + :ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display) + + :vartype lines: List[LineWithMeta] + :vartype colspan: int + :vartype rowspan: int + :vartype invisible: bool """ def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: """ @@ -17,10 +27,10 @@ def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1 :param rowspan: number of rows to span like in HTML format :param invisible: indicator for displaying or hiding cell text """ - self.lines = lines - self.colspan = colspan - self.rowspan = rowspan - self.invisible = invisible + self.lines: List[LineWithMeta] = lines + self.colspan: int = colspan + self.rowspan: int = rowspan + self.invisible: bool = invisible def __repr__(self) -> str: return f"CellWithMeta({self.get_text()[:65]})" diff --git a/dedoc/data_structures/document_content.py b/dedoc/data_structures/document_content.py index ad4fa81e..4b420249 100644 --- a/dedoc/data_structures/document_content.py +++ b/dedoc/data_structures/document_content.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from dedoc.api.schema.document_content import DocumentContent as ApiDocumentContent from dedoc.data_structures.serializable import Serializable @@ -9,16 +9,24 @@ class DocumentContent(Serializable): """ This class holds the document content - structured text and tables. 
+ + :ivar tables: list of document tables + :ivar structure: tree structure of the document nodes with text and additional metadata + :ivar warnings: list of warnings, obtained in the process of the document parsing + + :vartype tables: List[Table] + :vartype structure: TreeNode + :vartype warnings: List[str] """ - def __init__(self, tables: List[Table], structure: TreeNode, warnings: List[str] = None) -> None: + def __init__(self, tables: List[Table], structure: TreeNode, warnings: Optional[List[str]] = None) -> None: """ :param tables: list of document tables :param structure: tree structure in which content of the document is organized - :param warnings: list of warnings, obtained in the process of the document structure constructing + :param warnings: list of warnings """ - self.tables = tables - self.structure = structure - self.warnings = warnings if warnings is not None else [] + self.tables: List[Table] = tables + self.structure: TreeNode = structure + self.warnings: List[str] = warnings if warnings is not None else [] def to_api_schema(self) -> ApiDocumentContent: structure = self.structure.to_api_schema() diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py index ec51d143..23f8a11f 100644 --- a/dedoc/data_structures/document_metadata.py +++ b/dedoc/data_structures/document_metadata.py @@ -1,4 +1,4 @@ -from typing import Dict, Union +from typing import Dict, Optional, Union from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata from dedoc.data_structures.serializable import Serializable @@ -7,6 +7,26 @@ class DocumentMetadata(Serializable): """ This class holds information about document metadata. 
+ + :ivar file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on) + :ivar temporary_file_name: file name during parsing (unique name after rename and conversion) + :ivar size: size of the original file in bytes + :ivar modified_time: time of the last modification in unix time format (seconds since the epoch) + :ivar created_time: time of the creation in unixtime + :ivar access_time: time of the last access to the file in unixtime + :ivar file_type: mime type of the file + :ivar uid: document unique identifier (useful for attached files) + + :vartype file_name: str + :vartype temporary_file_name: str + :vartype size: int + :vartype modified_time: int + :vartype created_time: int + :vartype access_time: int + :vartype file_type: str + :vartype uid: str + + Additional variables may be added with other file metadata. """ def __init__(self, @@ -17,30 +37,30 @@ def __init__(self, created_time: int, access_time: int, file_type: str, - uid: str = None, + uid: Optional[str] = None, **kwargs: Dict[str, Union[str, int, float]]) -> None: """ - :param uid: document unique identifier (useful for attached files) - :param file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on) - :param temporary_file_name: file name during parsing (unique name after rename and conversion); + :param uid: document unique identifier + :param file_name: original document name + :param temporary_file_name: file name during parsing :param size: size of the original file in bytes - :param modified_time: time of the last modification in unix time format (seconds since the epoch) + :param modified_time: time of the last modification in unix time format :param created_time: time of the creation in unixtime :param access_time: time of the last access to the file in unixtime :param file_type: mime type of the file """ import uuid - self.file_name = file_name - 
self.temporary_file_name = temporary_file_name - self.size = size - self.modified_time = modified_time - self.created_time = created_time - self.access_time = access_time - self.file_type = file_type + self.file_name: str = file_name + self.temporary_file_name: str = temporary_file_name + self.size: int = size + self.modified_time: int = modified_time + self.created_time: int = created_time + self.access_time: int = access_time + self.file_type: str = file_type for key, value in kwargs.items(): self.add_attribute(key, value) - self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid + self.uid: str = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid def add_attribute(self, key: str, value: Union[str, int, float]) -> None: setattr(self, key, value) diff --git a/dedoc/data_structures/hierarchy_level.py b/dedoc/data_structures/hierarchy_level.py index ab2ea053..f7964bc9 100644 --- a/dedoc/data_structures/hierarchy_level.py +++ b/dedoc/data_structures/hierarchy_level.py @@ -15,6 +15,16 @@ class HierarchyLevel: For the least important lines (line_type=raw_text) both levels are None. Look to the :ref:`hierarchy level description ` to get more details. + + :ivar level_1: value of a line's primary importance + :ivar level_2: level of the line inside specific class + :ivar can_be_multiline: is used to unify lines inside tree node, if line can be multiline, it can be joined with another line + :ivar line_type: type of the line, e.g. raw text, list item, header, etc. 
+ + :vartype level_1: Optional[int] + :vartype level_2: Optional[int] + :vartype can_be_multiline: bool + :vartype line_type: str """ root = "root" toc = "toc" @@ -33,14 +43,14 @@ def __init__(self, level_1: Optional[int], level_2: Optional[int], can_be_multil :param level_1: value of a line's primary importance :param level_2: level of the line inside specific class :param can_be_multiline: is used to unify lines inside tree node, if line can be multiline, it can be joined with another line - :param line_type: type of the line, e.g. raw text, list item, header, etc. + :param line_type: type of the line """ assert level_1 is None or level_1 >= 0 assert level_2 is None or level_2 >= 0 - self.level_1 = level_1 - self.level_2 = level_2 - self.can_be_multiline = can_be_multiline - self.line_type = line_type + self.level_1: Optional[int] = level_1 + self.level_2: Optional[int] = level_2 + self.can_be_multiline: bool = can_be_multiline + self.line_type: str = line_type def __is_defined(self, other: "HierarchyLevel") -> bool: return self.level_1 is not None and self.level_2 is not None and other.level_1 is not None and other.level_2 is not None diff --git a/dedoc/data_structures/line_metadata.py b/dedoc/data_structures/line_metadata.py index 20a0dce8..9b255d5d 100644 --- a/dedoc/data_structures/line_metadata.py +++ b/dedoc/data_structures/line_metadata.py @@ -8,6 +8,20 @@ class LineMetadata(Serializable): """ This class holds information about document node (and document line) metadata, such as page number or line level in a document hierarchy. + + :ivar tag_hierarchy_level: the hierarchy level of the line with its type directly extracted by some of the readers + (usually information got from tags e.g. in docx or html readers) + :ivar hierarchy_level: the hierarchy level of the line extracted by some of the structure extractors - the result type and level of the line. 
+ The lower the level of the hierarchy, the closer it is to the root, it's used to construct document tree. + :ivar page_id: page number where paragraph starts, the numeration starts from page 0 + :ivar line_id: line number inside the entire document, the numeration starts from line 0 + + :vartype tag_hierarchy_level: HierarchyLevel + :vartype hierarchy_level: Optional[HierarchyLevel] + :vartype page_id: int + :vartype line_id: Optional[int] + + Additional variables may be added with other line metadata. """ def __init__(self, @@ -20,14 +34,12 @@ def __init__(self, :param page_id: page number where paragraph starts, the numeration starts from page 0 :param line_id: line number inside the entire document, the numeration starts from line 0 :param tag_hierarchy_level: the hierarchy level of the line with its type directly extracted by some of the readers - (usually information got from tags e.g. in docx or html readers) :param hierarchy_level: the hierarchy level of the line extracted by some of the structure extractors - the result type and level of the line. - The lower the level of the hierarchy, the closer it is to the root, it's used to construct document tree. 
""" - self.tag_hierarchy_level = HierarchyLevel.create_unknown() if tag_hierarchy_level is None else tag_hierarchy_level - self.hierarchy_level = hierarchy_level - self.page_id = page_id - self.line_id = line_id + self.tag_hierarchy_level: HierarchyLevel = HierarchyLevel.create_unknown() if tag_hierarchy_level is None else tag_hierarchy_level + self.hierarchy_level: Optional[HierarchyLevel] = hierarchy_level + self.page_id: int = page_id + self.line_id: Optional[int] = line_id for key, value in kwargs.items(): setattr(self, key, value) diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py index 16d6256c..41e9f934 100644 --- a/dedoc/data_structures/line_with_meta.py +++ b/dedoc/data_structures/line_with_meta.py @@ -119,18 +119,30 @@ def __extract_annotations_by_slice(self, start: int, stop: int) -> List[Annotati @property def line(self) -> str: + """ + Raw text of the document line + """ return self._line @property def metadata(self) -> LineMetadata: + """ + Line metadata related to the entire line, as line or page number, hierarchy level + """ return self._metadata @property def annotations(self) -> List[Annotation]: + """ + Metadata that refers to some part of the text, for example, font size, font type, etc. + """ return self._annotations @property def uid(self) -> str: + """ + Unique identifier of the line + """ return self._uid def set_line(self, line: str) -> None: diff --git a/dedoc/data_structures/parsed_document.py b/dedoc/data_structures/parsed_document.py index 862b87d0..67a700cb 100644 --- a/dedoc/data_structures/parsed_document.py +++ b/dedoc/data_structures/parsed_document.py @@ -9,11 +9,21 @@ class ParsedDocument(Serializable): """ This class holds information about the document content, metadata and attachments. 
+ + :ivar content: document text (hierarchy of nodes) and tables + :ivar attachments: result of analysis of attached files (empty if with_attachments=False) + :ivar metadata: document metadata such as size, creation date and so on. + :ivar warnings: list of warnings and possible errors, arising in the process of document parsing + + :vartype content: DocumentContent + :vartype attachments: List[ParsedDocument] + :vartype metadata: DocumentMetadata + :vartype warnings: List[str] """ def __init__(self, metadata: DocumentMetadata, - content: Optional[DocumentContent], - warnings: List[str] = None, + content: DocumentContent, + warnings: Optional[List[str]] = None, attachments: Optional[List["ParsedDocument"]] = None) -> None: """ :param metadata: document metadata such as size, creation date and so on. @@ -21,10 +31,10 @@ def __init__(self, :param attachments: result of analysis of attached files :param warnings: list of warnings and possible errors, arising in the process of document parsing """ - self.metadata = metadata - self.content = content - self.attachments = [] if attachments is None else attachments - self.warnings = warnings if warnings is not None else [] + self.metadata: DocumentMetadata = metadata + self.content: DocumentContent = content + self.attachments: List["ParsedDocument"] = [] if attachments is None else attachments + self.warnings: List[str] = warnings if warnings is not None else [] def add_attachments(self, new_attachment: List["ParsedDocument"]) -> None: if self.attachments is None: diff --git a/dedoc/data_structures/table.py b/dedoc/data_structures/table.py index 65ac6d49..1f53bf55 100644 --- a/dedoc/data_structures/table.py +++ b/dedoc/data_structures/table.py @@ -10,15 +10,22 @@ class Table(Serializable): """ This class holds information about tables in the document. We assume that a table has rectangle form (has the same number of columns in each row). 
+ If some cells are merged, they are duplicated and information about merge is stored in rowspan and colspan. Table representation is row-based i.e. external list contains list of rows. + + :ivar metadata: table metadata as location, title and so on + :ivar cells: a list of lists of table cells (cell has text lines, colspan and rowspan attributes) + + :vartype metadata: TableMetadata + :vartype cells: List[List[CellWithMeta]] """ def __init__(self, cells: List[List[CellWithMeta]], metadata: TableMetadata) -> None: """ - :param cells: a list of lists of cells (cell has text, colspan and rowspan attributes) - :param metadata: some table metadata as location, size and so on + :param cells: a list of lists of cells + :param metadata: table metadata """ - self.metadata = metadata - self.cells = cells + self.metadata: TableMetadata = metadata + self.cells: List[List[CellWithMeta]] = cells def to_api_schema(self) -> ApiTable: cells = [[cell.to_api_schema() for cell in row] for row in self.cells] diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py index e85c747e..cdcab2fb 100644 --- a/dedoc/data_structures/table_metadata.py +++ b/dedoc/data_structures/table_metadata.py @@ -7,20 +7,30 @@ class TableMetadata(Serializable): """ This class holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on. 
+ + :ivar page_id: number of the page where table starts + :ivar uid: unique identifier of the table (used for linking table to text) + :ivar rotated_angle: value of the rotation angle by which the table was rotated during recognition + :ivar title: table's title + + :vartype page_id: Optional[int] + :vartype uid: str + :vartype rotated_angle: float + :vartype title: str """ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0, title: str = "") -> None: """ :param page_id: number of the page where table starts :param uid: unique identifier of the table - :param rotated_angle: value of the rotation angle by which the table was rotated during recognition + :param rotated_angle: rotation angle by which the table was rotated during recognition :param title: table's title """ import uuid - self.page_id = page_id - self.uid = str(uuid.uuid4()) if not uid else uid - self.rotated_angle = rotated_angle - self.title = title + self.page_id: Optional[int] = page_id + self.uid: str = str(uuid.uuid4()) if not uid else uid + self.rotated_angle: float = rotated_angle + self.title: str = title def to_api_schema(self) -> ApiTableMetadata: return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle, title=self.title) diff --git a/dedoc/data_structures/tree_node.py b/dedoc/data_structures/tree_node.py index 6cde3554..de380584 100644 --- a/dedoc/data_structures/tree_node.py +++ b/dedoc/data_structures/tree_node.py @@ -12,6 +12,20 @@ class TreeNode(Serializable): """ TreeNode helps to represent document as recursive tree structure. It has parent node (None for root ot the tree) and list of children nodes (empty list for list node). 
+ + :ivar node_id: unique node identifier + :ivar text: text of the node (may contain several lines) + :ivar annotations: some metadata related to the part of the text (as font size) + :ivar metadata: metadata refers to entire node (as node type) + :ivar subparagraphs: list of child of this node + :ivar parent: parent node (None for root, not none for other nodes) + + :vartype node_id: str + :vartype text: str + :vartype annotations: List[Annotation] + :vartype metadata: LineMetadata + :vartype subparagraphs: List[TreeNode] + :vartype parent: Optional[TreeNode] """ def __init__(self, node_id: str, @@ -23,17 +37,17 @@ def __init__(self, """ :param node_id: node id is unique in one document :param text: text of the node - :param annotations: some metadata related to the part of the text (as font size) - :param metadata: metadata refers to entire node (as node type) + :param annotations: metadata related to the part of the text + :param metadata: metadata refers to entire node :param subparagraphs: list of child of this node - :param parent: parent node (None for root, not none for other nodes) + :param parent: parent node """ - self.node_id = node_id - self.text = text - self.annotations = annotations - self.metadata = metadata - self.subparagraphs = subparagraphs - self.parent = parent + self.node_id: str = node_id + self.text: str = text + self.annotations: List[Annotation] = annotations + self.metadata: LineMetadata = metadata + self.subparagraphs: List["TreeNode"] = subparagraphs + self.parent: "TreeNode" = parent def to_api_schema(self) -> ApiTreeNode: annotations = [annotation.to_api_schema() for annotation in self.annotations] diff --git a/dedoc/data_structures/unstructured_document.py b/dedoc/data_structures/unstructured_document.py index 94197e2e..256ef8db 100644 --- a/dedoc/data_structures/unstructured_document.py +++ b/dedoc/data_structures/unstructured_document.py @@ -9,12 +9,24 @@ class UnstructuredDocument: """ This class holds information about raw document 
content: its text, tables and attachments, that have been procured using one of the readers. Text is represented as a flat list of lines, hierarchy level of each line isn't defined (only tag hierarchy level may exist). + + :ivar lines: list of textual lines with metadata returned by a reader + :ivar tables: list of document tables returned by a reader + :ivar attachments: list of document attached files + :ivar metadata: information about the document (like in :class:`~dedoc.data_structures.DocumentMetadata`) + :ivar warnings: list of warnings, obtained in the process of the document parsing + + :vartype lines: List[LineWithMeta] + :vartype tables: List[Table] + :vartype attachments: List[AttachedFile] + :vartype metadata: dict + :vartype warnings: List[str] """ def __init__(self, tables: List[Table], lines: List[LineWithMeta], attachments: List[AttachedFile], - warnings: List[str] = None, + warnings: Optional[List[str]] = None, metadata: Optional[dict] = None) -> None: """ :param tables: list of document tables @@ -23,11 +35,11 @@ def __init__(self, :param warnings: list of warnings, obtained in the process of the document parsing :param metadata: additional data """ - self.tables = tables - self.lines = lines - self.attachments = attachments - self.warnings = warnings if warnings else [] - self.metadata = metadata if metadata is not None else {} + self.tables: List[Table] = tables + self.lines: List[LineWithMeta] = lines + self.attachments: List[AttachedFile] = attachments + self.warnings: List[str] = warnings if warnings else [] + self.metadata: dict = metadata if metadata is not None else {} def get_text(self) -> str: return LineWithMeta.join(self.lines).line diff --git a/docs/source/_static/notebooks_data/doc_example.jpeg b/docs/source/_static/notebooks_data/doc_example.jpeg new file mode 100644 index 00000000..25915a03 Binary files /dev/null and b/docs/source/_static/notebooks_data/doc_example.jpeg differ diff --git 
a/docs/source/_static/notebooks_data/doc_tables.pdf b/docs/source/_static/notebooks_data/doc_tables.pdf new file mode 100644 index 00000000..db96312e Binary files /dev/null and b/docs/source/_static/notebooks_data/doc_tables.pdf differ diff --git a/docs/source/_static/notebooks_data/doc_tables_1.jpeg b/docs/source/_static/notebooks_data/doc_tables_1.jpeg new file mode 100644 index 00000000..ad211018 Binary files /dev/null and b/docs/source/_static/notebooks_data/doc_tables_1.jpeg differ diff --git a/docs/source/_static/notebooks_data/doc_tables_2.jpeg b/docs/source/_static/notebooks_data/doc_tables_2.jpeg new file mode 100644 index 00000000..1d299c04 Binary files /dev/null and b/docs/source/_static/notebooks_data/doc_tables_2.jpeg differ diff --git a/docs/source/_static/notebooks_data/table_1.png b/docs/source/_static/notebooks_data/table_1.png new file mode 100644 index 00000000..c585e440 Binary files /dev/null and b/docs/source/_static/notebooks_data/table_1.png differ diff --git a/docs/source/_static/notebooks_data/table_2.png b/docs/source/_static/notebooks_data/table_2.png new file mode 100644 index 00000000..5728f069 Binary files /dev/null and b/docs/source/_static/notebooks_data/table_2.png differ diff --git a/docs/source/dedoc_api_usage/api_schema.rst b/docs/source/dedoc_api_usage/api_schema.rst index ea8d5b8a..adb3d48a 100644 --- a/docs/source/dedoc_api_usage/api_schema.rst +++ b/docs/source/dedoc_api_usage/api_schema.rst @@ -8,69 +8,20 @@ Json schema of the output is also available during dedoc application running on .. autoclass:: dedoc.api.schema.ParsedDocument - .. autoattribute:: content - .. autoattribute:: metadata - .. autoattribute:: version - .. autoattribute:: warnings - .. autoattribute:: attachments - .. autoclass:: dedoc.api.schema.DocumentContent - .. autoattribute:: structure - .. autoattribute:: tables - .. autoclass:: dedoc.api.schema.DocumentMetadata - .. autoattribute:: uid - .. autoattribute:: file_name - .. 
autoattribute:: temporary_file_name - .. autoattribute:: size - .. autoattribute:: modified_time - .. autoattribute:: created_time - .. autoattribute:: access_time - .. autoattribute:: file_type - .. autoclass:: dedoc.api.schema.TreeNode - .. autoattribute:: node_id - .. autoattribute:: text - .. autoattribute:: annotations - .. autoattribute:: metadata - .. autoattribute:: subparagraphs - .. autoclass:: dedoc.api.schema.LineWithMeta - .. autoattribute:: text - .. autoattribute:: annotations - .. autoclass:: dedoc.api.schema.LineMetadata - .. autoattribute:: paragraph_type - .. autoattribute:: page_id - .. autoattribute:: line_id - .. autoclass:: dedoc.api.schema.Table - .. autoattribute:: cells - .. autoattribute:: metadata - .. autoclass:: dedoc.api.schema.TableMetadata - .. autoattribute:: page_id - .. autoattribute:: uid - .. autoattribute:: rotated_angle - .. autoattribute:: title - .. autoclass:: dedoc.api.schema.CellWithMeta - .. autoattribute:: lines - .. autoattribute:: rowspan - .. autoattribute:: colspan - .. autoattribute:: invisible - .. autoclass:: dedoc.api.schema.Annotation - - .. autoattribute:: start - .. autoattribute:: end - .. autoattribute:: name - .. autoattribute:: value diff --git a/docs/source/index.rst b/docs/source/index.rst index b9dc7e2e..fade9141 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -229,6 +229,7 @@ This type of structure is configurable (see :ref:`using_patterns`). :maxdepth: 1 :caption: Tutorials + tutorials/notebooks tutorials/add_new_doc_format tutorials/add_new_structure_type tutorials/add_new_language diff --git a/docs/source/modules/data_structures.rst b/docs/source/modules/data_structures.rst index 35efcbc8..d6785ac7 100644 --- a/docs/source/modules/data_structures.rst +++ b/docs/source/modules/data_structures.rst @@ -7,33 +7,28 @@ Main classes defining a document -------------------------------- .. autoclass:: dedoc.data_structures.UnstructuredDocument - :special-members: __init__ .. 
autoclass:: dedoc.data_structures.ParsedDocument :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.DocumentContent :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.DocumentMetadata :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.TreeNode :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.LineWithMeta :show-inheritance: :special-members: __init__, __lt__ :members: - :undoc-members: line, uid, metadata, annotations + :undoc-members: set_line, set_metadata .. automethod:: __len__ .. automethod:: __getitem__ @@ -41,26 +36,22 @@ Main classes defining a document .. autoclass:: dedoc.data_structures.LineMetadata :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.HierarchyLevel - :special-members: __init__, __eq__, __lt__ + :special-members: __eq__, __lt__ :members: .. autoclass:: dedoc.data_structures.Table :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.TableMetadata :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.CellWithMeta :show-inheritance: - :special-members: __init__ :members: @@ -83,7 +74,6 @@ Helper classes .. autoattribute:: height .. autoclass:: dedoc.data_structures.AttachedFile - :special-members: __init__ :members: .. _annotations: @@ -93,7 +83,6 @@ Annotations of the text lines .. autoclass:: dedoc.data_structures.Annotation :show-inheritance: - :special-members: __init__ Concrete annotations ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/tutorials/notebooks.rst b/docs/source/tutorials/notebooks.rst new file mode 100644 index 00000000..9abea57f --- /dev/null +++ b/docs/source/tutorials/notebooks.rst @@ -0,0 +1,32 @@ +Notebooks with examples of Dedoc usage +====================================== + +.. _table_notebooks: + +.. 
flat-table:: Notebooks with Dedoc usage examples + :widths: 70 30 + :header-rows: 1 + :class: tight-table + + * - Task description + - Link to the notebook + + * - Document text preprocessing for subsequent document classification: + * automatic detection of the document format: DOC, DOCX, PDF or any image format; + * text extraction and its structuring; + * saving the result to a JSON file. + - `Notebook 1 `_ + + * - Table text and structure extraction from images of scanned documents: + * automatic detection of the document format: PDF or any image format; + * table extraction, including multi-page tables; + * grouping tables by the document page on which they are located; + * saving each page to a CSV file. + - `Notebook 2 `_ + + * - ADVANCED: Extract text from scanned documents and get its location on the document image: + * automatic detection of the image format; + * text extraction from the image; + * text location visualization; + * text recognition confidence visualization. + - `Notebook 3 `_