feat: Add comprehensive XML support with structured Markdown conversion

microsoft · Jan 31, 2025 · b89f51a · b89f51a
1 parent bfde857
commit b89f51a
Show file tree

Hide file tree

Showing 2 changed files with 114 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -15,6 +15,8 @@ It supports:
 - Audio (EXIF metadata and speech transcription)
 - HTML
 - Text-based formats (CSV, JSON, XML)
+  - XML support includes general XML files, RSS feeds, and Atom feeds
+  - Preserves XML structure and attributes in Markdown format
 - ZIP files (iterates over contents)
 
 To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
@@ -51,6 +53,31 @@ result = md.convert("test.xlsx")
 print(result.text_content)
 ```
 
+#### XML Support
+
+The library provides comprehensive XML support:
+
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown()
+
+# General XML files
+result = md.convert("data.xml")
+
+# RSS feeds
+result = md.convert("feed.rss")
+
+# Atom feeds
+result = md.convert("feed.atom")
+```
+
+XML files are converted to a structured Markdown format:
+- The XML element hierarchy is rendered as Markdown headers
+- Element attributes are rendered as lists
+- Text content is preserved
+- RSS and Atom feeds are detected and given feed-specific formatting
+
 To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
 
 ```python

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
@@ -1399,6 +1399,91 @@ def convert(
         )
 
 
+class XMLConverter(DocumentConverter):
+    """Convert general XML files to structured Markdown.
+
+    The root element becomes a level-1 header and each nested element a header
+    one level deeper (capped at level 6, the deepest Markdown header).
+    Attributes are rendered as bullet lists; text and CDATA content is kept.
+    RSS and Atom feeds are declined (``None`` is returned) so that a
+    feed-specific converter can handle them.
+
+    Supported file extensions:
+    - .xml: General XML files
+    - .docbook: DocBook XML files
+    - .qtl: QTL files
+    - .rng: RELAX NG files
+    """
+
+    def convert(self, local_path: str, **kwargs) -> Union[None, DocumentConverterResult]:
+        """Convert the XML file at ``local_path`` to Markdown.
+
+        Returns ``None`` when the ``file_extension`` keyword argument is not a
+        supported XML extension, or when the document looks like an RSS/Atom
+        feed. Any error raised while parsing or converting is reported as the
+        text content of the returned result rather than propagated.
+        """
+        # Bail if not XML type. ``or ""`` also covers an explicit None value.
+        extension = kwargs.get("file_extension") or ""
+        if extension.lower() not in [".xml", ".docbook", ".qtl", ".rng"]:
+            return None
+
+        try:
+            doc = minidom.parse(local_path)
+
+            # Check if it's an RSS or Atom feed - if so, let RSSConverter handle it
+            if doc.getElementsByTagName("rss") or (
+                doc.getElementsByTagName("feed") and doc.getElementsByTagName("entry")
+            ):
+                return None
+
+            md_content = self._convert_xml_to_markdown(doc)
+            return DocumentConverterResult(
+                title=None,
+                text_content=md_content
+            )
+        except Exception as e:
+            # NOTE(review): the failure is returned as document text, so callers
+            # cannot tell it apart from a successful conversion - confirm this
+            # best-effort behavior is intended.
+            error_msg = f"XML dönüştürme hatası: {str(e)}\n"
+            error_msg += "Lütfen dosyanın geçerli bir XML dosyası olduğunu kontrol edin."
+            return DocumentConverterResult(
+                title=None,
+                text_content=error_msg
+            )
+
+    def _convert_xml_to_markdown(self, doc: minidom.Document) -> str:
+        """Render ``doc`` as Markdown: a level-1 header for the root, then its content."""
+        root = doc.documentElement
+        # The root is header level 1; _process_node emits its children one level deeper.
+        body = self._process_node(root, level=1)
+        return f"# {root.tagName}\n\n{body}".strip()
+
+    def _process_node(self, node: minidom.Element, level: int = 0) -> str:
+        """Return Markdown for the attributes and children of ``node``.
+
+        ``level`` is the header level of ``node`` itself; child elements get
+        headers at ``level + 1``, capped at 6.
+        """
+        parts = []
+
+        # Process attributes
+        if node.attributes and node.attributes.length > 0:
+            parts.append("**Attributes:**\n\n")
+            for name, value in node.attributes.items():
+                parts.append(f"- {name}: {value}\n")
+            parts.append("\n")
+
+        # Child headers sit one level below this node, so no header level is
+        # skipped. Markdown has no header deeper than 6, hence the cap.
+        child_header = "#" * min(level + 1, 6)
+
+        # Process child nodes
+        for child in node.childNodes:
+            # CDATA sections carry character data too, so treat them like text nodes.
+            if child.nodeType in (minidom.Node.TEXT_NODE, minidom.Node.CDATA_SECTION_NODE):
+                text = child.data.strip()
+                if text:
+                    parts.append(f"{text}\n")
+            elif child.nodeType == minidom.Node.ELEMENT_NODE:
+                parts.append(f"{child_header} {child.tagName}\n\n")
+                parts.append(self._process_node(child, level + 1))
+
+        return "".join(parts)
+
+
 class FileConversionException(BaseException):
     pass
 
@@ -1472,7 +1557,8 @@ def __init__(
         # To this end, the most specific converters should appear below the most generic converters
         self.register_page_converter(PlainTextConverter())
         self.register_page_converter(HtmlConverter())
-        self.register_page_converter(RSSConverter())
+        self.register_page_converter(XMLConverter())  # Generic XML converter
+        self.register_page_converter(RSSConverter())  # Specific XML type
         self.register_page_converter(WikipediaConverter())
         self.register_page_converter(YouTubeConverter())
         self.register_page_converter(BingSerpConverter())