Skip to content

Commit

Permalink
Slew of changes to satisfy merge conflicts in browse develop branch
Browse files Browse the repository at this point in the history
  • Loading branch information
mnazzaro committed Mar 13, 2024
1 parent 0bc396c commit 912578f
Show file tree
Hide file tree
Showing 10 changed files with 194 additions and 72 deletions.
Empty file added arxiv/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion arxiv/base/tests/test_alerts.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Tests for :mod:`arxiv.base.alerts`."""

from unittest import TestCase, mock
from flask import Markup
from markupsafe import Markup

from arxiv.base import alerts

Expand Down
10 changes: 9 additions & 1 deletion arxiv/base/tests/test_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,15 @@ def test_get_logger_no_app_nor_request(self):
"""There is no application nor request context."""
stream = StringIO()

logger = logging.getLogger('foologger', stream)
logger = logging.getLogger('foologger')
handler = logging.StreamHandler(stream)
handler.setFormatter(
logging.Formatter(
'%(levelname)s: "%(message)s"'
)
)
handler.terminator = ''
logger.addHandler(handler)
self.assertIsInstance(logger, pyLogging.Logger,
"Should return a logging.Logger instance")

Expand Down
4 changes: 2 additions & 2 deletions arxiv/document/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from collections import abc
from dataclasses import dataclass, field
from datetime import datetime
from typing import Iterator, List, Optional, Set, Literal
from typing import Iterator, List, Optional, Set, Literal, Sequence

from ..taxonomy import definitions
from ..taxonomy.category import Category, Group, Archive
Expand Down Expand Up @@ -113,7 +113,7 @@ class DocMetadata:
license: License = field(default_factory=License)
"""License associated with the article."""

version_history: List[VersionEntry] = field(default_factory=list)
version_history: Sequence[VersionEntry] = field(default_factory=list)
"""Version history, consisting of at least one version history entry."""

is_definitive: bool = field(default=False)
Expand Down
24 changes: 14 additions & 10 deletions arxiv/document/parse_abs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
import re
from typing import Any, Dict, List, Tuple, Optional
from typing import Any, Dict, List, Tuple, Optional, Sequence
from datetime import datetime

from zoneinfo import ZoneInfo
Expand Down Expand Up @@ -59,18 +59,18 @@
The latest versions of these papers should always have the "Categories:" line.
"""


_fs_tz: Optional[ZoneInfo] = None
"""FS timezone if in a flask app."""


def parse_abs_file(filename: str) -> DocMetadata:
"""Parse an arXiv .abs file in the file system.
"""Parse an arXiv .abs file.
The modified time on the abs file will be used as the modified time for the
abstract. It will be pulled from `flask.config` if in a app_context. It
can be specified with tz arg.
"""

absfile = to_anypath(filename)
try:
with absfile.open(mode='r', encoding='latin-1') as absf:
Expand All @@ -85,11 +85,11 @@ def parse_abs_file(filename: str) -> DocMetadata:
except FileNotFoundError:
raise AbsNotFoundException
except UnicodeDecodeError as e:
raise AbsParsingException(f'Failed to decode .abs file "{filename.canonical_name}": {e}')

raise AbsParsingException(f'Failed to decode .abs file "{filename}": {e}')


def parse_abs(raw: str, modified: datetime) -> DocMetadata:

def parse_abs(raw: str, modified:datetime) -> DocMetadata:
"""Parse an abs with fields and an abstract."""

# There are two main components to an .abs file that contain data,
Expand Down Expand Up @@ -215,8 +215,9 @@ def parse_abs_top(raw: str, modified:datetime, abstract:str) -> DocMetadata:
# private=private # TODO, not implemented
)


def _parse_version_entries(arxiv_id: str, version_entry_list: List) \
-> Tuple[int, List[VersionEntry], str]:
-> Tuple[int, Sequence[VersionEntry], str]:
"""Parse the version entries from the arXiv .abs file."""
version_count = 0
version_entries = list()
Expand All @@ -236,12 +237,14 @@ def _parse_version_entries(arxiv_id: str, version_entry_list: List) \
source_type = SourceFlag(code=date_match.group('source_type'))
kb = int(date_match.group('size_kilobytes'))
ve = VersionEntry(
#id = Identifier(f"{arxiv_id}v{version_count}"),
raw=date_match.group(0),
source_flag=source_type,
size_kilobytes=kb,
submitted_date=submitted_date,
version=version_count,
is_withdrawn=kb == 0 or source_type.ignore
is_withdrawn=kb == 0 or source_type.ignore,
is_current = version_count == len(version_entry_list)
)
version_entries.append(ve)

Expand Down Expand Up @@ -296,10 +299,11 @@ def alt_component_split(components: List[str]) -> List[str]:
alt_comp.append('')
return alt_comp


def _get_tz() -> ZoneInfo:
    """Return the file-system timezone taken from the flask ``current_app``.

    The zone is built from ``current_app.config["FS_TZ"]`` the first time it
    is needed and then kept in the module-level ``_fs_tz``, so later calls
    return the cached object without reading the app config again.
    """
    global _fs_tz
    if _fs_tz is None:
        # First use: resolve the configured zone name once and cache it.
        _fs_tz = ZoneInfo(current_app.config["FS_TZ"])

    return _fs_tz
39 changes: 35 additions & 4 deletions arxiv/document/version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Representations of a version of a document."""
from typing import Literal, Optional
from typing import Literal, Optional, List
from dataclasses import dataclass, field
from datetime import datetime

Expand All @@ -9,7 +9,7 @@
Excluding NULL."""


@dataclass(frozen=True)
@dataclass
class SourceFlag:
"""Represents arXiv article source file type."""

Expand Down Expand Up @@ -103,7 +103,7 @@ def is_single_file(self) -> bool:
return self.code is not None and '1' in self.code


@dataclass(frozen=True)
@dataclass
class VersionEntry:
"""Represents a single arXiv article version history entry."""

Expand All @@ -127,6 +127,37 @@ class VersionEntry:
source_format: Optional[SOURCE_FORMAT] = None
"""Source format."""

is_current: bool = False
"""Is the version the highest existing version?"""

@property
def withdrawn_or_ignore(self) -> bool:
    """Whether the source flag is marked ignore or the version is withdrawn."""
    return self.source_flag.ignore or self.is_withdrawn

def formats(self) -> List[str]:
    """Return the formats this version can be offered in.

    A withdrawn or zero-size version has no formats. A version whose source
    flag is marked ignore offers only ``'src'``, and nothing at all when the
    source is encrypted. Otherwise the list is chosen from the source flag or
    the ``source_format`` field, and ``'other'`` is always appended last.
    """
    if self.is_withdrawn or self.size_kilobytes == 0:
        return []

    flag = self.source_flag
    if flag.ignore:
        return [] if flag.source_encrypted else ['src']

    if flag.ps_only or self.source_format == "ps":
        available = ['pdf', 'ps']
    elif flag.pdflatex or self.source_format == "pdflatex":
        available = ['pdf', 'src']
    elif flag.pdf_only or self.source_format == "pdfonly":
        available = ['pdf']
    elif flag.html or self.source_format == "html":
        available = ['html']
    elif flag.docx or self.source_format == "docx":
        available = ['pdf']
    else:
        available = ['pdf', 'ps', 'src']

    # 'other' is added for display purposes; it may belong in the controller
    # or template instead.
    available.append('other')
    return available
121 changes: 74 additions & 47 deletions arxiv/formats/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
"""Shared functions that support determination of dissemination formats."""
import re
from typing import List, Optional
from typing import List, Optional, Union

import logging
import tarfile
from operator import itemgetter
from tarfile import CompressionError, ReadError
from typing import Dict

from ..document.version import SourceFlag
from ..files import FileObj
from ..files.anypath import APath

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -41,9 +43,7 @@ def formats_from_source_file_name(source_file_path: str) -> List[str]:
return []


def formats_from_source_flag(source_flag: str,
format_pref: Optional[str] = None,
cache_flag: bool = False) -> List[str]:
def formats_from_source_flag(source_flag: Union[str, SourceFlag]) -> List[str]:
"""Get the dissemination formats based on source type and preference.
Source file types are represented by single-character codes:
Expand All @@ -69,66 +69,93 @@ def formats_from_source_flag(source_flag: str,
F - PDF only
PDF-only submission with .tar.gz package (likely because of anc files)
"""
formats = []
if not source_flag:
source_flag = ''
if not format_pref:
format_pref = ''
if isinstance(source_flag, SourceFlag):
source_flag = source_flag.code

source_flag = source_flag if source_flag else ''
has_encrypted_source = re.search('S', source_flag, re.IGNORECASE)
has_ignore = re.search('I', source_flag, re.IGNORECASE)
if has_ignore:
if not has_encrypted_source:
return ['src']
else:
return []

has_ps_only = re.search('P', source_flag, re.IGNORECASE)
has_pdflatex = re.search('D', source_flag, re.IGNORECASE)
has_pdf_only = re.search('F', source_flag, re.IGNORECASE)
has_html = re.search('H', source_flag, re.IGNORECASE)
has_docx_or_odf = re.search(r'[XO]', source_flag, re.IGNORECASE)
has_src_pref = format_pref and re.search('src', format_pref)
append_other = False

if has_ignore and not has_encrypted_source:
formats.append('src')
elif has_ps_only:
formats.extend(['pdf', 'ps', 'other'])
formats: list[str] = []
if has_ps_only:
formats.extend(['pdf', 'ps'])
elif has_pdflatex:
formats.extend(['pdf', 'other'])
# PDFtex has source so honor src preference
if has_src_pref and not has_encrypted_source:
formats.insert(1, 'src')
formats.extend(['pdf', 'src'])
elif has_pdf_only:
formats.extend(['pdf', 'other'])
formats.extend(['pdf'])
elif has_html:
formats.extend(['html', 'other'])
formats.extend(['html'])
elif has_docx_or_odf:
formats.extend(['pdf', 'other'])
elif cache_flag:
# this is the case where the source is not newer than the cache file
# and the cache file is empty
formats.extend(['nops', 'other'])
formats.extend(['pdf'])
else:
if re.search('pdf', format_pref):
formats.extend(['pdf', 'ps', 'src'])

formats.extend(['other'])
return formats

def get_all_formats(src_fmt: str) -> List[str]:
    """Return every format the given source format can be disseminated in.

    The result depends only on ``src_fmt`` and reflects which transformations
    can be applied to that kind of source. Sub-formats (such as the different
    types of ps) are not included. An unrecognised ``src_fmt`` yields an
    empty list.
    """
    if src_fmt == 'ps':
        return [src_fmt, 'pdf']
    if src_fmt in ('pdf', 'html'):
        # These are served as-is; no further transformation is listed.
        return [src_fmt]
    if src_fmt in ('dvi', 'tex'):
        return ['dvi', 'ps', 'pdf']
    if src_fmt == 'pdftex':
        return ['pdf']
    if src_fmt in ('docx', 'odf'):
        return ['pdf', src_fmt]
    return []

if append_other:
formats.append('other')
def has_ancillary_files(source_flag: str) -> bool:
    """Tell whether a source flag string marks ancillary files.

    Returns True when ``source_flag`` contains the letter ``A`` in either
    case; an empty or missing flag gives False.
    """
    return bool(source_flag) and bool(
        re.search('A', source_flag, re.IGNORECASE))


def list_ancillary_files(tarball_path: APath) -> List[Dict]:
    """Return a list of ancillary files in a tarball (.tar.gz file).

    Ancillary files are the regular-file members stored under ``anc/`` in the
    tarball. Each one is reported as ``{'name': ..., 'size_bytes': ...}``,
    with the ``anc/`` prefix removed from the name, and the list is sorted by
    name.

    An empty list is returned when ``tarball_path`` is falsy, does not end in
    ``.tar.gz``, is not an existing file, or cannot be read as a tar archive
    (the read failure is logged).
    """
    # Only the last two suffixes matter: a name such as "2101.00001v1.tar.gz"
    # has extra leading suffixes because of the dots in the identifier.
    if not tarball_path \
            or tarball_path.suffixes[-2:] != ['.tar', '.gz'] \
            or not tarball_path.is_file():
        return []

    prefix = 'anc/'
    anc_files = []
    try:
        with tarball_path.open(mode='rb') as fh:
            with tarfile.open(fileobj=fh, mode='r') as tf:
                for member in tf:
                    if member.name.startswith(prefix) and member.isfile():
                        anc_files.append({
                            'name': member.name[len(prefix):],
                            'size_bytes': member.size,
                        })
    except (ReadError, CompressionError) as ex:
        logger.error("Error while trying to read anc files from %s: %s", tarball_path, ex)
        return []

    anc_files.sort(key=itemgetter('name'))
    return anc_files

return formats


def has_ancillary_files(source_flag: str) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion arxiv/util/tests/test_authors.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Tests for author and affiliation parsing."""
from unittest import TestCase

from ..authors import parse_author_affil, split_authors
from arxiv.authors import parse_author_affil, split_authors


class TestAuthorAffiliationParsing(TestCase):
Expand Down
Loading

0 comments on commit 912578f

Please sign in to comment.