Merge pull request #25 from cneud/black

format code with black
cneud · Oct 12, 2023 · 4c75025 · 4c75025
2 parents bf06691 + dbf7657
commit 4c75025
Show file tree

Hide file tree

Showing 2 changed files with 170 additions and 123 deletions.
diff --git a/src/alto_tools/alto_tools.py b/src/alto_tools/alto_tools.py
@@ -10,11 +10,11 @@
 import sys
 import xml.etree.ElementTree as ET
 
-__version__ = '0.1.0'
+__version__ = "0.1.0"
 
 
 def alto_parse(alto, **kargs):
-    """ Convert ALTO xml file to element tree """
+    """Convert ALTO xml file to element tree"""
     try:
         xml = ET.parse(alto, **kargs)
     except ET.ParseError as e:
@@ -23,99 +23,114 @@ def alto_parse(alto, **kargs):
     # https://www.loc.gov/standards/alto/ | https://github.com/altoxml
     # alto-bnf (unofficial) BnF ALTO dialect - for further info see
     # http://bibnum.bnf.fr/alto_prod/documentation/alto_prod.html
-    namespace = {'alto-1': 'http://schema.ccs-gmbh.com/ALTO',
-                 'alto-1-xsd': 'http://schema.ccs-gmbh.com/ALTO/alto-1-4.xsd',
-                 'alto-2': 'http://www.loc.gov/standards/alto/ns-v2#',
-                 'alto-2-xsd': 'https://www.loc.gov/standards/alto/alto.xsd',
-                 'alto-3': 'http://www.loc.gov/standards/alto/ns-v3#',
-                 'alto-3-xsd': 'http://www.loc.gov/standards/alto/v3/alto.xsd',
-                 'alto-4': 'http://www.loc.gov/standards/alto/ns-v4#',
-                 'alto-4-xsd': 'http://www.loc.gov/standards/alto/v4/alto.xsd',
-                 'alto-bnf': 'http://bibnum.bnf.fr/ns/alto_prod'}
+    namespace = {
+        "alto-1": "http://schema.ccs-gmbh.com/ALTO",
+        "alto-1-xsd": "http://schema.ccs-gmbh.com/ALTO/alto-1-4.xsd",
+        "alto-2": "http://www.loc.gov/standards/alto/ns-v2#",
+        "alto-2-xsd": "https://www.loc.gov/standards/alto/alto.xsd",
+        "alto-3": "http://www.loc.gov/standards/alto/ns-v3#",
+        "alto-3-xsd": "http://www.loc.gov/standards/alto/v3/alto.xsd",
+        "alto-4": "http://www.loc.gov/standards/alto/ns-v4#",
+        "alto-4-xsd": "http://www.loc.gov/standards/alto/v4/alto.xsd",
+        "alto-bnf": "http://bibnum.bnf.fr/ns/alto_prod",
+    }
     # Extract namespace from document root
-    if 'http://' in str(xml.getroot().tag.split('}')[0].strip('{')):
-        xmlns = xml.getroot().tag.split('}')[0].strip('{')
+    if "http://" in str(xml.getroot().tag.split("}")[0].strip("{")):
+        xmlns = xml.getroot().tag.split("}")[0].strip("{")
     else:
         try:
             ns = xml.getroot().attrib
-            xmlns = str(ns).split(' ')[1].strip('}').strip("'")
+            xmlns = str(ns).split(" ")[1].strip("}").strip("'")
         except IndexError:
             sys.stderr.write(
-                f'\nERROR: File "{alto.name}": no namespace declaration found.')
-            xmlns = 'no_namespace_found'
+                f'\nERROR: File "{alto.name}": no namespace declaration found.'
+            )
+            xmlns = "no_namespace_found"
     if xmlns in namespace.values():
         return alto, xml, xmlns
     else:
-        sys.stdout.write(f'\nERROR: File "{alto.name}": namespace {xmlns} is not registered.\n')
+        sys.stdout.write(
+            f'\nERROR: File "{alto.name}": namespace {xmlns} is not registered.\n'
+        )
 
 
 def alto_text(xml, xmlns):
-    """ Extract text content from ALTO xml file """
+    """Extract text content from ALTO xml file"""
     # Ensure use of UTF-8
-    if isinstance(sys.stdout, io.TextIOWrapper) and sys.stdout.encoding != 'UTF-8':
-        sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
+    if isinstance(sys.stdout, io.TextIOWrapper) and sys.stdout.encoding != "UTF-8":
+        sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer, "strict")
     # Find all <TextLine> elements
-    for lines in xml.iterfind('.//{%s}TextLine' % xmlns):
+    for lines in xml.iterfind(".//{%s}TextLine" % xmlns):
         # New line after every <TextLine> element
-        sys.stdout.write('\n')
+        sys.stdout.write("\n")
         # Find all <String> elements
-        for line in lines.findall('{%s}String' % xmlns):
+        for line in lines.findall("{%s}String" % xmlns):
             # Check if there are no hyphenated words
-            if ('SUBS_CONTENT' not in line.attrib and 'SUBS_TYPE' not in line.attrib):
-            # Get value of attribute @CONTENT from all <String> elements
-                text = line.attrib.get('CONTENT') + ' '
+            if "SUBS_CONTENT" not in line.attrib and "SUBS_TYPE" not in line.attrib:
+                # Get value of attribute @CONTENT from all <String> elements
+                text = line.attrib.get("CONTENT") + " "
             else:
                 #  Handling of hyphenation to avoid duplicates, see
                 #  https://github.com/cneud/alto-tools/issues/16
-                if ('HypPart1' in line.attrib.get('SUBS_TYPE')):
-                    # Get the first part of the hyphenated word from @CONTENT 
+                if "HypPart1" in line.attrib.get("SUBS_TYPE"):
+                    # Get the first part of the hyphenated word from @CONTENT
                     # (instead of using @SUBS_CONTENT)
-                    if ('HypPart1' in line.attrib.get('SUBS_TYPE')):
-                        text = line.attrib.get('CONTENT')
+                    if "HypPart1" in line.attrib.get("SUBS_TYPE"):
+                        text = line.attrib.get("CONTENT")
                     # Concatenate second part of the hyphenated word from @CONTENT
-                    if ('HypPart2' in line.attrib.get('SUBS_TYPE')):
-                        text = line.attrib.get('CONTENT') + ' '
+                    if "HypPart2" in line.attrib.get("SUBS_TYPE"):
+                        text = line.attrib.get("CONTENT") + " "
             sys.stdout.write(text)
 
 
 def alto_illustrations(alto, xml, xmlns):
-    """ Extract bounding box coordinates of illustration regions from ALTO xml file """
+    """Extract bounding box coordinates of illustration regions from ALTO xml file"""
     # Find all <Illustration> elements
-    for illustration in xml.iterfind('.//{%s}Illustration' % xmlns):
+    for illustration in xml.iterfind(".//{%s}Illustration" % xmlns):
         # Get @ID of <Illustration> element
-        illustration_id = illustration.attrib.get('ID')
+        illustration_id = illustration.attrib.get("ID")
         # Get coordinates of <Illustration> element
-        illustration_coords = (illustration.attrib.get('HEIGHT') + ','
-                            + illustration.attrib.get('WIDTH') + ','
-                            + illustration.attrib.get('VPOS') + ','
-                            + illustration.attrib.get('HPOS'))
-        illustrations = illustration_id + '=' + illustration_coords
-        sys.stdout.write(f'\nFile: {alto.name}, Illustration: {illustrations}')
+        illustration_coords = (
+            illustration.attrib.get("HEIGHT")
+            + ","
+            + illustration.attrib.get("WIDTH")
+            + ","
+            + illustration.attrib.get("VPOS")
+            + ","
+            + illustration.attrib.get("HPOS")
+        )
+        illustrations = illustration_id + "=" + illustration_coords
+        sys.stdout.write(f"\nFile: {alto.name}, Illustration: {illustrations}")
 
 
 def alto_graphics(alto, xml, xmlns):
-    """ Extract bounding box coordinates of graphical elements from ALTO xml file """
+    """Extract bounding box coordinates of graphical elements from ALTO xml file"""
     # Find all <GraphicalElement> elements
-    for graphic in xml.iterfind('.//{%s}GraphicalElement' % xmlns):
+    for graphic in xml.iterfind(".//{%s}GraphicalElement" % xmlns):
         # Get @ID of <GraphicalElement> element
-        graphic_id = graphic.attrib.get('ID')
+        graphic_id = graphic.attrib.get("ID")
         # Get coordinates of <GraphicalElement> element
-        graphic_coords = (graphic.attrib.get('HEIGHT') + ','
-                       + graphic.attrib.get('WIDTH') + ','
-                       + graphic.attrib.get('VPOS') + ','
-                       + graphic.attrib.get('HPOS'))
-        graphics = graphic_id + '=' + graphic_coords
-        sys.stdout.write(f'\nFile: {alto.name}, GraphicalElement: {graphics}')
+        graphic_coords = (
+            graphic.attrib.get("HEIGHT")
+            + ","
+            + graphic.attrib.get("WIDTH")
+            + ","
+            + graphic.attrib.get("VPOS")
+            + ","
+            + graphic.attrib.get("HPOS")
+        )
+        graphics = graphic_id + "=" + graphic_coords
+        sys.stdout.write(f"\nFile: {alto.name}, GraphicalElement: {graphics}")
 
 
 def alto_confidence(alto, xml, xmlns):
-    """ Calculate mean word confidence score for ALTO xml file """
+    """Calculate mean word confidence score for ALTO xml file"""
     score = 0
     count = 0
     # Find all <String> elements
-    for conf in xml.iterfind('.//{%s}String' % xmlns):
+    for conf in xml.iterfind(".//{%s}String" % xmlns):
         # Get value of attribute @WC (Word Confidence) of all <String> elements
-        wc = conf.attrib.get('WC')
+        wc = conf.attrib.get("WC")
         # Calculate sum of all @WC values as float
         if wc is not None:
             score += float(wc)
@@ -125,57 +140,79 @@ def alto_confidence(alto, xml, xmlns):
     if count > 0:
         confidence = score / count
         result = round(100 * confidence, 2)
-        sys.stdout.write(f'\nFile: {alto.name}, Confidence: {result}')
+        sys.stdout.write(f"\nFile: {alto.name}, Confidence: {result}")
         return result
     else:
-        sys.stdout.write(f'\nFile: {alto.name}, Confidence: 00.00')
+        sys.stdout.write(f"\nFile: {alto.name}, Confidence: 00.00")
         return 0
 
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
         description="ALTO Tools: simple tools for performing various operations on ALTO xml files",
         add_help=True,
-        prog='alto_tools.py',
-        usage='python %(prog)s INPUT [option]')
-    parser.add_argument('INPUT',
-                        nargs='+',
-                        help='path to ALTO file or directory containing ALTO files')
+        prog="alto_tools.py",
+        usage="python %(prog)s INPUT [option]",
+    )
+    parser.add_argument(
+        "INPUT", nargs="+", help="path to ALTO file or directory containing ALTO files"
+    )
     g = parser.add_mutually_exclusive_group()
-    g.add_argument('-v', '--version',
-                        action='version',
-                        version=__version__,
-                        help='show version number and exit')
-    g.add_argument('-c', '--confidence',
-                        action="store_true",
-                        default=False,
-                        dest='confidence',
-                        help='extract mean OCR word confidence score')
-    g.add_argument('-t', '--text',
-                        action="store_true",
-                        default=False,
-                        dest='text',
-                        help='extract UTF8-encoded text content')
-    g.add_argument('-i', '--illustrations',
-                        action="store_true",
-                        default=False,
-                        dest='illustrations',
-                        help='extract bounding boxes of illustrations')
-    g.add_argument('-g', '--graphics',
-                        action="store_true",
-                        default=False,
-                        dest='graphics',
-                        help='extract bounding boxes of graphical elements')
-    parser.add_argument('-x', '--xml-encoding',
-                        action="store_true",
-                        default=None,
-                        dest='xml_encoding',
-                        help='XML encoding')
-    parser.add_argument('-e', '--file-encoding',
-                        action="store_true",
-                        default='UTF-8',
-                        dest='file_encoding',
-                        help='file encoding')
+    g.add_argument(
+        "-v",
+        "--version",
+        action="version",
+        version=__version__,
+        help="show version number and exit",
+    )
+    g.add_argument(
+        "-c",
+        "--confidence",
+        action="store_true",
+        default=False,
+        dest="confidence",
+        help="extract mean OCR word confidence score",
+    )
+    g.add_argument(
+        "-t",
+        "--text",
+        action="store_true",
+        default=False,
+        dest="text",
+        help="extract UTF8-encoded text content",
+    )
+    g.add_argument(
+        "-i",
+        "--illustrations",
+        action="store_true",
+        default=False,
+        dest="illustrations",
+        help="extract bounding boxes of illustrations",
+    )
+    g.add_argument(
+        "-g",
+        "--graphics",
+        action="store_true",
+        default=False,
+        dest="graphics",
+        help="extract bounding boxes of graphical elements",
+    )
+    parser.add_argument(
+        "-x",
+        "--xml-encoding",
+        action="store_true",
+        default=None,
+        dest="xml_encoding",
+        help="XML encoding",
+    )
+    parser.add_argument(
+        "-e",
+        "--file-encoding",
+        action="store_true",
+        default="UTF-8",
+        dest="file_encoding",
+        help="file encoding",
+    )
     args = parser.parse_args()
     return args
 
@@ -200,35 +237,37 @@ def walker(inputs, fnfilter=lambda fn: True):
 
 def main():
     if sys.version_info < (3, 0):
-        sys.stdout.write('Python 3 is required.\n')
+        sys.stdout.write("Python 3 is required.\n")
         sys.exit(-1)
 
     args = parse_arguments()
     if not len(sys.argv) > 2:
-        sys.stdout.write('\nNo operation specified, ')
-        os.system('python alto_tools.py -h')
+        sys.stdout.write("\nNo operation specified, ")
+        os.system("python alto_tools.py -h")
         sys.exit(-1)
     else:
-        fnfilter = lambda fn: fn.endswith('.xml') or fn.endswith('.alto')
+        fnfilter = lambda fn: fn.endswith(".xml") or fn.endswith(".alto")
         confidence_sum = 0
         for filename in walker(args.INPUT, fnfilter):
             try:
                 if args.xml_encoding:
                     xml_encoding = args.xml_encoding
-                    if xml_encoding == 'auto':
-                        with open(filename, 'rb') as f:
-                            m = re.search('encoding="(.*?)"', f.read(45).decode('utf-8'))
+                    if xml_encoding == "auto":
+                        with open(filename, "rb") as f:
+                            m = re.search(
+                                'encoding="(.*?)"', f.read(45).decode("utf-8")
+                            )
                             xml_encoding = m.group(1)
                     xmlp = ET.XMLParser(encoding=xml_encoding)
-                    alto, xml, xmlns = alto_parse(filename, parser = xmlp)
+                    alto, xml, xmlns = alto_parse(filename, parser=xmlp)
                 else:
-                    with open(filename, 'r',  encoding=args.file_encoding) as alto:
+                    with open(filename, "r", encoding=args.file_encoding) as alto:
                         alto, xml, xmlns = alto_parse(alto)
             except IndexError:
                 continue
             except ET.ParseError as e:
                 print("Error parsing %s" % filename, file=sys.stderr)
-                raise(e)
+                raise (e)
             if args.confidence:
                 confidence_sum += alto_confidence(alto, xml, xmlns)
             if args.text:
@@ -241,7 +280,9 @@ def main():
         if number_of_files >= 2:
             if args.confidence:
                 print(
-                    f"\n\nConfidence of folder: {round(confidence_sum/number_of_files, 2)}")
+                    f"\n\nConfidence of folder: {round(confidence_sum/number_of_files, 2)}"
+                )
+
 
 if __name__ == "__main__":
     main()