Skip to content

Commit

Permalink
added describe methods
Browse files Browse the repository at this point in the history
  • Loading branch information
xrotwang committed Jan 21, 2025
1 parent 928f027 commit 4d8c1bc
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 56 deletions.
204 changes: 154 additions & 50 deletions src/pyigt/graid.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,53 +14,35 @@
SymbolDict = typing.Dict[typing.Union[typing.Tuple[str, str], str], str]


class Gloss(typing.Protocol):
class Gloss(typing.Protocol): # pragma: no cover
"""
Classes passed to GRAID as `other_glosses` must implement this protocol. I.e. implement a
classmethod `from_annotation`, which returns an instance of the class if the annotation matches
the pattern or `None` otherwise.
"""
@classmethod
def from_annotation(cls, annotation: str, parser=None) -> typing.Optional["Gloss"]:
def from_annotation(cls, annotation: str, parser: "GRAID" = None) -> typing.Optional["Gloss"]:
"""
:return: `None` to signal that the annotation was not parsed, `Gloss` instance otherwise.
"""
... # pragma: no cover

def __str__(self) -> str:
"""
The full gloss, re-assembled (and possibly normalized) or as passed to `from_annotation`.
"""
...

@dataclasses.dataclass
class CrossIndex:
referent_property: str
function: str
subconstituent_marker: str = None
morpheme_separator: str = None

def __str__(self):
return '{}{}pro_{}_{}'.format(
self.morpheme_separator or '',
self.subconstituent_marker + '_' if self.subconstituent_marker else '',
self.referent_property,
self.function,
)

@classmethod
def from_annotation(cls, ann, parser) -> typing.Optional["CrossIndex"]:
kw = {}
if any(ann.startswith(sep) for sep in parser.morpheme_separators):
kw['morpheme_separator'], ann = ann[:1], ann[1:]
for scm in parser.subconstituent_markers:
if ann.startswith(scm + '_'):
kw['subconstituent_marker'], ann = scm, ann[len(scm) + 1:]
m = re.fullmatch(
r'pro_(?P<rp>{})_(?P<f>{})'.format(
re_or(parser.referent_properties), re_or(parser.syntactic_functions)),
ann)
if m:
kw['referent_property'], kw['function'] = m.group('rp'), m.group('f')
return cls(**kw)
def describe(self, parser: "GRAID" = None) -> typing.Dict[str, str]:
...


def update_symbols(symbols: SymbolDict,
d: SymbolDict,
attaches: typing.Union[typing.Literal['left'], typing.Literal['right']] = None):
"""
Utility function to update GRAID symbol `dict`s.
"""
if d:
assert all(isinstance(g, str) and g.count('_') < 2 if attaches else 1 for g in d)
if attaches:
Expand All @@ -74,6 +56,9 @@ def update_symbols(symbols: SymbolDict,


def re_or(items: typing.Iterable[str]) -> str:
"""
Concatenate strings in as regular expression pattern matching any of them.
"""
return r'|'.join(re.escape(item) for item in items if isinstance(item, str))


Expand All @@ -94,7 +79,7 @@ def __init__(self,
other_glosses: typing.Optional[typing.List[Gloss]] = None,
with_cross_index=False):
"""
Basically all lists of symbols specified by the GRAID standard may be extended with new,
Almost all lists of symbols specified by the GRAID standard may be extended with new,
corpus-specific symbols. Such custom symbols can be supplied as values for the arguments
of this method as follows: For all but `subconstituent_symbols` the symbols should be
formatted as `dict` mapping each symbol to a short description. Symbols should generally
Expand Down Expand Up @@ -222,7 +207,7 @@ def __init__(self,
if with_cross_index:
self.other_glosses.append(CrossIndex)

def iter_expressions(self, s):
def iter_expressions(self, s) -> typing.Generator[str, None, None]:
sep = None
for item in itertools.dropwhile(
lambda ss: not ss, re.split(r'({})'.format(re_or(self.morpheme_separators)), s)):
Expand All @@ -234,7 +219,11 @@ def iter_expressions(self, s):
sep = None
assert not sep, 'Trailing morpheme separator in gloss: {}'.format(s)

def __call__(self, gloss):
def __call__(self, gloss: str) \
-> typing.List[typing.Union[Gloss, "Boundary", "Symbol", "Predicate", "Referent"]]:
"""
Call a GRAID object to parse a full-word GRAID annotation.
"""
return [self.parse_expression(exp) for exp in self.iter_expressions(gloss.strip())]

def parse_expression(self, expression):
Expand Down Expand Up @@ -282,8 +271,17 @@ class Symbol:
def __str__(self):
return '{}{}'.format(self.morpheme_separator or '', self.symbol)

def describe(self, parser: GRAID = None):
parser = parser or GRAID()
res = collections.OrderedDict()
if self.morpheme_separator:
res[self.morpheme_separator] = parser.morpheme_separators[self.morpheme_separator]
res[self.symbol] = parser.other_symbols[self.symbol]
return res

@classmethod
def from_annotation(cls, ann, parser) -> typing.Optional["Symbol"]:
parser = parser or GRAID()
kw = {}
if any(ann.startswith(sep) for sep in parser.morpheme_separators):
kw['morpheme_separator'], ann = ann[:1], ann[1:]
Expand All @@ -302,8 +300,30 @@ class Boundary:
function_qualifiers: typing.List[str] = dataclasses.field(default_factory=list)
qualifiers: typing.List[str] = dataclasses.field(default_factory=list)

def describe(self, parser: GRAID = None):
parser = parser or GRAID()
res = collections.OrderedDict()
res[self.boundary_type] = parser.boundary_markers[self.boundary_type]
if self.ds:
res['ds'] = 'direct speech'
if self.clause_type:
res[self.clause_type] = parser.clause_types[self.clause_type]
if self.neg:
res['neg'] = 'negative polarity'
if self.property:
res[self.property] = parser.referent_properties[self.property]
for q in self.qualifiers:
res[q] = parser.clause_boundary_symbols[q]
if self.function:
res[self.function] = parser.predicative_functions.get(
self.function, parser.syntactic_functions.get(self.function))
for q in self.function_qualifiers:
res[q] = parser.syntactic_function_specifiers[q]
return res

@classmethod
def from_annotation(cls, annotation: str, parser=None) -> typing.Optional["Boundary"]:
parser = parser or GRAID()
for marker in parser.boundary_markers:
if annotation.startswith(marker):
break
Expand Down Expand Up @@ -354,11 +374,6 @@ def __str__(self):
''.join('_' + fq for fq in self.function_qualifiers),
)

#
# Referents:
# list of -/= separated annotations.
#


@dataclasses.dataclass
class Expression:
Expand All @@ -378,19 +393,25 @@ def __str__(self):
res += ':{}'.format('_'.join([self.function] + self.function_qualifiers))
return res

def describe(self, parser): # pragma: no cover
res = 'form: '
def describe(self, parser: GRAID = None):
parser = parser or GRAID()
res = collections.OrderedDict()
if (self.morpheme_separator, self.form_gloss) in parser.predicate_glosses:
res += parser.predicate_glosses[(self.morpheme_separator, self.form_gloss)]
res[self.morpheme_separator + self.form_gloss] = parser.predicate_glosses[
(self.morpheme_separator, self.form_gloss)]
else:
res += parser.predicate_glosses[self.form_gloss]
if self.morpheme_separator:
res[self.morpheme_separator] = parser.morpheme_separators[self.morpheme_separator]
res[self.form_gloss] = parser.predicate_glosses[self.form_gloss]

if self.form_qualifiers:
res += ' ({})'.format(
'; '.join(parser.form_gloss_specifiers[q] for q in self.form_qualifiers))
res['{}_{}'.format(self.form_qualifiers[0], self.form_gloss)] = (
parser.predicate_glosses[(self.form_qualifiers[0], self.form_gloss)])

if self.function:
res += '. function: {}'.format(parser.predicative_functions[self.function])
if self.function_qualifiers:
res += ' ({})'.format('; '.join(self.function_qualifiers))
res[self.function] = parser.predicative_functions.get(
self.function, parser.syntactic_functions.get(self.function))
assert not self.function_qualifiers
return res

@classmethod
Expand Down Expand Up @@ -449,6 +470,48 @@ def __str__(self):
res += ':{}'.format('_'.join([self.function] + self.function_qualifiers))
return res

def describe(self, parser: GRAID = None):
parser = parser or GRAID()
res = collections.OrderedDict()
if (self.morpheme_separator, self.form_gloss) in parser.form_glosses:
res[self.morpheme_separator + self.form_gloss] = parser.form_glosses[
(self.morpheme_separator, self.form_gloss)]
else:
if self.morpheme_separator:
res[self.morpheme_separator] = parser.morpheme_separators[self.morpheme_separator]

if self.subconstituent:
res[self.subconstituent] = parser.subconstituent_markers[self.subconstituent]
for q in self.subconstituent_qualifiers:
res[q] = parser.subconstituent_symbols[self.subconstituent][q]

if self.form_gloss:
res[self.form_gloss] = parser.form_glosses[self.form_gloss]

for i, q in enumerate(reversed(self.form_qualifiers)):
if i == 0:
if (q, self.form_gloss) in parser.form_glosses:
res['{}_{}'.format(q, self.form_gloss)] = (
parser.form_glosses[(q, self.form_gloss)])
else:
res[q] = parser.form_glosses.get(q, parser.form_gloss_specifiers.get(q))
else:
res[q] = parser.form_glosses.get(q, parser.form_gloss_specifiers.get(q))

start = 0
if self.function:
if (self.function_qualifiers and # noqa: W504
(self.function, self.function_qualifiers[0]) in parser.syntactic_functions):
res['{}_{}'.format(self.function, self.function_qualifiers[0])] = (
parser.syntactic_functions)[(self.function, self.function_qualifiers[0])]
start = 1
else:
res[self.function] = parser.predicative_functions.get(
self.function, parser.syntactic_functions.get(self.function))
for q in self.function_qualifiers[start:]:
res[q] = parser.syntactic_function_specifiers[q]
return res

@classmethod
def from_annotation(cls, annotation: str, parser=None) -> "Referent":
"""
Expand Down Expand Up @@ -504,3 +567,44 @@ def from_annotation(cls, annotation: str, parser=None) -> "Referent":
else:
raise ValueError(annotation)
return cls(**kw)


@dataclasses.dataclass
class CrossIndex:
"""
Several Multi-CAST corpora include annotations of "cross-indeces". The GRAID parser can be
conditioned to recognize such indeces by passing `with_cross_index=True` on instantiation.
"""
referent_property: str
function: str
subconstituent_marker: str = None
morpheme_separator: str = None

def __str__(self):
return '{}{}pro_{}_{}'.format(
self.morpheme_separator or '',
self.subconstituent_marker + '_' if self.subconstituent_marker else '',
self.referent_property,
self.function,
)

def describe(self, parser: GRAID = None) -> typing.Dict[str, str]:
parser = parser or GRAID()
return {'symbol': str(self)}

@classmethod
def from_annotation(cls, ann, parser: GRAID = None) -> typing.Optional["CrossIndex"]:
parser = parser or GRAID()
kw = {}
if any(ann.startswith(sep) for sep in parser.morpheme_separators):
kw['morpheme_separator'], ann = ann[:1], ann[1:]
for scm in parser.subconstituent_markers:
if ann.startswith(scm + '_'):
kw['subconstituent_marker'], ann = scm, ann[len(scm) + 1:]
m = re.fullmatch(
r'pro_(?P<rp>{})_(?P<f>{})'.format(
re_or(parser.referent_properties), re_or(parser.syntactic_functions)),
ann)
if m:
kw['referent_property'], kw['function'] = m.group('rp'), m.group('f')
return cls(**kw)
14 changes: 8 additions & 6 deletions tests/test_graid.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,15 @@ def graid():
('rn_refl_pro.h:poss', Referent, None),
('predex', Referent, lambda r: r.function == 'predex'),
('adp', Referent, None),
('=adp', Referent, None),
('voc', Referent, lambda r: r.form_gloss == None),
('-pro', Referent, lambda r: r.form_gloss == 'pro'),
('-v', Predicate, None),
]
)
def test_GRAID(graid, expr, type_, res):
obj = graid.parse_expression(expr)
if isinstance(obj, Predicate):
assert obj.describe(graid)
assert obj.describe(graid)
assert isinstance(obj, type_)
if not res:
assert str(obj) == expr
Expand All @@ -62,8 +63,8 @@ def test_GRAID(graid, expr, type_, res):
({}, 'v:pred_dem', None, ValueError),
({}, 'v:prex', None, ValueError),
( # Custom specified form gloss:
dict(form_glosses={'rex_f0': 'x', 'f0': 'y'}),
'rex_f0:s',
dict(form_glosses={'rex_f0': 'x', 'f0': 'y'}, form_gloss_specifiers={'abc': ''}),
'abc_rex_f0:s',
lambda r: r.form_gloss == 'f0',
None),
( # Custom specified form gloss does not introduce a general specifier:
Expand All @@ -89,8 +90,8 @@ def test_GRAID(graid, expr, type_, res):
lambda r: r.subconstituent == 'lv' and r.subconstituent_qualifiers == ['aux'],
None),
(
dict(clause_boundary_symbols={'dem': 'x'}),
'#cc_dem',
dict(clause_boundary_symbols={'dem': 'x'}, syntactic_function_specifiers={'dem': 'x'}),
'#cc_dem:a_dem',
lambda r: r.qualifiers == ['dem'],
None),
(
Expand Down Expand Up @@ -118,6 +119,7 @@ def test_custom_GRAID(kw, expr, res, exp):
graid.parse_expression(expr)
else:
obj = graid.parse_expression(expr)
assert obj.describe(graid)
assert str(obj) == expr
if res:
assert res(obj)
Expand Down

0 comments on commit 4d8c1bc

Please sign in to comment.