Slew of changes to resolve merge conflicts in browse develop branch

arXiv · Mar 13, 2024 · 912578f
1 parent 0bc396c
commit 912578f
Show file tree

Hide file tree

Showing 10 changed files with 194 additions and 72 deletions.
diff --git a/arxiv/__init__.py b/arxiv/__init__.py
diff --git a/arxiv/base/tests/test_alerts.py b/arxiv/base/tests/test_alerts.py
@@ -1,7 +1,7 @@
 """Tests for :mod:`arxiv.base.alerts`."""
 
 from unittest import TestCase, mock
-from flask import Markup
+from markupsafe import Markup
 
 from arxiv.base import alerts
 

diff --git a/arxiv/base/tests/test_logging.py b/arxiv/base/tests/test_logging.py
@@ -12,7 +12,15 @@ def test_get_logger_no_app_nor_request(self):
         """There is no application nor request context."""
         stream = StringIO()
 
-        logger = logging.getLogger('foologger', stream)
+        logger = logging.getLogger('foologger')
+        handler = logging.StreamHandler(stream)
+        handler.setFormatter(
+            logging.Formatter(
+                '%(levelname)s: "%(message)s"'
+            )
+        )
+        handler.terminator = ''
+        logger.addHandler(handler)
         self.assertIsInstance(logger, pyLogging.Logger,
                               "Should return a logging.Logger instance")
 

diff --git a/arxiv/document/metadata.py b/arxiv/document/metadata.py
@@ -2,7 +2,7 @@
 from collections import abc
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Iterator, List, Optional, Set, Literal
+from typing import Iterator, List, Optional, Set, Literal, Sequence
 
 from ..taxonomy import definitions
 from ..taxonomy.category import Category, Group, Archive
@@ -113,7 +113,7 @@ class DocMetadata:
     license: License = field(default_factory=License)
     """License associated with the article."""
 
-    version_history: List[VersionEntry] = field(default_factory=list)
+    version_history: Sequence[VersionEntry] = field(default_factory=list)
     """Version history, consisting of at least one version history entry."""
 
     is_definitive: bool = field(default=False)

diff --git a/arxiv/document/parse_abs.py b/arxiv/document/parse_abs.py
@@ -2,7 +2,7 @@
 
 import os
 import re
-from typing import Any, Dict, List, Tuple, Optional
+from typing import Any, Dict, List, Tuple, Optional, Sequence
 from datetime import datetime
 
 from zoneinfo import ZoneInfo
@@ -59,18 +59,18 @@
 The latest versions of these papers should always have the "Categories:" line.
 """
 
+
 _fs_tz: Optional[ZoneInfo] = None
 """FS timezone if in a flask app."""
 
+
 def parse_abs_file(filename: str) -> DocMetadata:
-    """Parse an arXiv .abs file in the file system.
+    """Parse an arXiv .abs file.
 
     The modified time on the abs file will be used as the modified time for the
     abstract. It will be pulled from `flask.config` if in a app_context. It
     can be specified with tz arg.
-    
     """
-
     absfile = to_anypath(filename)
     try:
         with absfile.open(mode='r', encoding='latin-1') as absf:
@@ -85,11 +85,11 @@ def parse_abs_file(filename: str) -> DocMetadata:
     except FileNotFoundError:
         raise AbsNotFoundException
     except UnicodeDecodeError as e:
-        raise AbsParsingException(f'Failed to decode .abs file "{filename.canonical_name}": {e}')
-
+        raise AbsParsingException(f'Failed to decode .abs file "{filename}": {e}')
 
 
-def parse_abs(raw: str, modified: datetime) -> DocMetadata:
+
+def parse_abs(raw: str, modified:datetime) -> DocMetadata:
     """Parse an abs with fields and an abstract."""
 
     # There are two main components to an .abs file that contain data,
@@ -215,8 +215,9 @@ def parse_abs_top(raw: str, modified:datetime, abstract:str) -> DocMetadata:
         # private=private  # TODO, not implemented
     )
 
+
 def _parse_version_entries(arxiv_id: str, version_entry_list: List) \
-        -> Tuple[int, List[VersionEntry], str]:
+        -> Tuple[int, Sequence[VersionEntry], str]:
     """Parse the version entries from the arXiv .abs file."""
     version_count = 0
     version_entries = list()
@@ -236,12 +237,14 @@ def _parse_version_entries(arxiv_id: str, version_entry_list: List) \
         source_type = SourceFlag(code=date_match.group('source_type'))
         kb = int(date_match.group('size_kilobytes'))
         ve = VersionEntry(
+            #id = Identifier(f"{arxiv_id}v{version_count}"),
             raw=date_match.group(0),
             source_flag=source_type,
             size_kilobytes=kb,
             submitted_date=submitted_date,
             version=version_count,
-            is_withdrawn=kb == 0 or source_type.ignore
+            is_withdrawn=kb == 0 or source_type.ignore,
+            is_current = version_count == len(version_entry_list)
         )
         version_entries.append(ve)
 
@@ -296,10 +299,11 @@ def alt_component_split(components: List[str]) -> List[str]:
     alt_comp.append('')
     return alt_comp
 
+
 def _get_tz() -> ZoneInfo:
     """Gets the timezone from the flask current_app."""
     global _fs_tz
     if _fs_tz is None:
         _fs_tz = ZoneInfo(current_app.config["FS_TZ"])
 
-    return _fs_tz
+    return _fs_tz
diff --git a/arxiv/document/version.py b/arxiv/document/version.py
@@ -1,5 +1,5 @@
 """Representations of a version of a document."""
-from typing import Literal, Optional
+from typing import Literal, Optional, List
 from dataclasses import dataclass, field
 from datetime import datetime
 
@@ -9,7 +9,7 @@
 Excluding NULL."""
 
 
-@dataclass(frozen=True)
+@dataclass
 class SourceFlag:
     """Represents arXiv article source file type."""
 
@@ -103,7 +103,7 @@ def is_single_file(self) -> bool:
         return self.code is not None and '1' in self.code
 
 
-@dataclass(frozen=True)
+@dataclass
 class VersionEntry:
     """Represents a single arXiv article version history entry."""
 
@@ -127,6 +127,37 @@ class VersionEntry:
     source_format: Optional[SOURCE_FORMAT] = None
     """Source format."""
 
+    is_current: bool = False
+    """Is the version the highest existing version?"""
+
     @property
     def withdrawn_or_ignore(self) -> bool:
-        return self.source_flag.ignore or self.is_withdrawn
+        return self.source_flag.ignore or self.is_withdrawn
+
+    def formats(self) -> List[str]:
+        if self.is_withdrawn or self.size_kilobytes == 0:
+            return []
+
+        if self.source_flag.ignore:
+            if not self.source_flag.source_encrypted:
+                return ['src']
+            else:
+                return []
+
+        formats = []
+        if self.source_flag.ps_only or self.source_format == "ps":
+            formats.extend(['pdf', 'ps'])
+        elif self.source_flag.pdflatex or self.source_format == "pdflatex":
+            formats.extend(['pdf', 'src'])
+        elif self.source_flag.pdf_only or self.source_format == "pdfonly":
+            formats.extend(['pdf'])
+        elif self.source_flag.html or self.source_format == "html":
+            formats.extend(['html'])
+        elif self.source_flag.docx or self.source_format == "docx":
+            formats.extend(['pdf'])
+        else:
+            formats.extend(['pdf', 'ps', 'src'])
+
+        # other is added for display purposes maybe move to controller or template?
+        formats.extend(['other'])
+        return formats
diff --git a/arxiv/formats/__init__.py b/arxiv/formats/__init__.py
@@ -1,14 +1,16 @@
 """Shared functions that support determination of dissemination formats."""
 import re
-from typing import List, Optional
+from typing import List, Optional, Union
 
 import logging
 import tarfile
 from operator import itemgetter
 from tarfile import CompressionError, ReadError
 from typing import Dict
 
+from ..document.version import SourceFlag
 from ..files import FileObj
+from ..files.anypath import APath
 
 logger = logging.getLogger(__name__)
 
@@ -41,9 +43,7 @@ def formats_from_source_file_name(source_file_path: str) -> List[str]:
     return []
 
 
-def formats_from_source_flag(source_flag: str,
-                             format_pref: Optional[str] = None,
-                             cache_flag: bool = False) -> List[str]:
+def formats_from_source_flag(source_flag: Union[str, SourceFlag]) -> List[str]:
     """Get the dissemination formats based on source type and preference.
 
     Source file types are represented by single-character codes:
@@ -69,66 +69,93 @@ def formats_from_source_flag(source_flag: str,
     F - PDF only
         PDF-only submission with .tar.gz package (likely because of anc files)
     """
-    formats = []
-    if not source_flag:
-        source_flag = ''
-    if not format_pref:
-        format_pref = ''
+    if isinstance(source_flag, SourceFlag):
+        source_flag = source_flag.code
+
+    source_flag = source_flag if source_flag else ''
     has_encrypted_source = re.search('S', source_flag, re.IGNORECASE)
     has_ignore = re.search('I', source_flag, re.IGNORECASE)
+    if has_ignore:
+        if not has_encrypted_source:
+            return ['src']
+        else:
+            return []
+
     has_ps_only = re.search('P', source_flag, re.IGNORECASE)
     has_pdflatex = re.search('D', source_flag, re.IGNORECASE)
     has_pdf_only = re.search('F', source_flag, re.IGNORECASE)
     has_html = re.search('H', source_flag, re.IGNORECASE)
     has_docx_or_odf = re.search(r'[XO]', source_flag, re.IGNORECASE)
-    has_src_pref = format_pref and re.search('src', format_pref)
-    append_other = False
 
-    if has_ignore and not has_encrypted_source:
-        formats.append('src')
-    elif has_ps_only:
-        formats.extend(['pdf', 'ps', 'other'])
+    formats: list[str] = []
+    if has_ps_only:
+        formats.extend(['pdf', 'ps'])
     elif has_pdflatex:
-        formats.extend(['pdf', 'other'])
-        # PDFtex has source so honor src preference
-        if has_src_pref and not has_encrypted_source:
-            formats.insert(1, 'src')
+        formats.extend(['pdf', 'src'])
     elif has_pdf_only:
-        formats.extend(['pdf', 'other'])
+        formats.extend(['pdf'])
     elif has_html:
-        formats.extend(['html', 'other'])
+        formats.extend(['html'])
     elif has_docx_or_odf:
-        formats.extend(['pdf', 'other'])
-    elif cache_flag:
-        # this is the case where the source is not newer than the cache file
-        # and the cache file is empty
-        formats.extend(['nops', 'other'])
+        formats.extend(['pdf'])
     else:
-        if re.search('pdf', format_pref):
+        formats.extend(['pdf', 'ps', 'src'])
+
+    formats.extend(['other'])
+    return formats
+
+def get_all_formats(src_fmt: str) -> List[str]:
+        """Returns the list of all formats that the given src can
+        be disseminated in. Takes sources format and knows what
+        transformations can be applied.
+
+        Does not include sub-formats (like types of ps).
+        """
+        formats: List[str] = []
+        if src_fmt == 'ps':
+            formats.extend([src_fmt, 'pdf'])
+        elif src_fmt == 'pdf' or src_fmt == 'html':
+            formats.append(src_fmt)
+        elif src_fmt == 'dvi':
+            formats.extend([src_fmt, 'ps', 'pdf'])
+        elif src_fmt == 'tex':
+            formats.extend(['dvi', 'ps', 'pdf'])
+        elif src_fmt == 'pdftex':
             formats.append('pdf')
-        elif re.search('400', format_pref):
-            formats.append('ps(400)')
-        elif re.search('600', format_pref):
-            formats.append('ps(600)')
-        elif re.search('fname=cm', format_pref):
-            formats.append('ps(cm)')
-        elif re.search('fname=CM', format_pref):
-            formats.append('ps(CM)')
-        elif re.search('dvi', format_pref):
-            formats.append('dvi')
-        elif has_src_pref:
-            formats.extend(['pdf', 'ps'])
-            if not has_encrypted_source:
-                formats.append('src')
-        else:
-            formats.extend(['pdf', 'ps'])
+        elif src_fmt == 'docx' or src_fmt == 'odf':
+            formats.extend(['pdf', src_fmt])
 
-        append_other = True
+        return formats
 
-    if append_other:
-        formats.append('other')
+def has_ancillary_files(source_flag: str) -> bool:
+    """Check source type for indication of ancillary files."""
+    if not source_flag:
+        return False
+    return re.search('A', source_flag, re.IGNORECASE) is not None
+
+
+def list_ancillary_files(tarball_path: APath) -> List[Dict]:
+    """Return a list of ancillary files in a tarball (.tar.gz file)."""
+    if not tarball_path or not tarball_path.suffixes == ['.tar', '.gz'] \
+       or not tarball_path.is_file():
+        return []
+
+    anc_files = []
+    try:
+        with tarball_path.open( mode='rb') as fh:
+            with tarfile.open(fileobj=fh, mode='r') as tf:
+                for member in \
+                        (m for m in tf if re.search(r'^anc\/', m.name) and m.isfile()):
+                    name = re.sub(r'^anc\/', '', member.name)
+                    size_bytes = member.size
+                    anc_files.append({'name': name, 'size_bytes': size_bytes})
+    except (ReadError, CompressionError) as ex:
+        logger.error("Error while trying to read anc files from %s: %s", tarball_path, ex)
+        return []
+    if len(anc_files) > 1:
+        anc_files = sorted(anc_files, key=itemgetter('name'))
+    return anc_files
 
-    return formats
 
 
 def has_ancillary_files(source_flag: str) -> bool:

diff --git a/arxiv/util/tests/test_authors.py b/arxiv/util/tests/test_authors.py
@@ -1,7 +1,7 @@
 """Tests for author and affiliation parsing."""
 from unittest import TestCase
 
-from ..authors import parse_author_affil, split_authors
+from arxiv.authors import parse_author_affil, split_authors
 
 
 class TestAuthorAffiliationParsing(TestCase):