Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vary canonical form behavior by language #136

Merged
merged 3 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions 3-tidy-up.js
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ function handleLine(line) {
const parsedLine = JSON.parse(line);
const { pos, sounds, forms } = parsedLine;
if(!pos) return;
const word = getCanonicalForm(parsedLine);
const word = getCanonicalWordForm(parsedLine);
if (!word) return;
const readings = getReadings(word, parsedLine);

Expand Down Expand Up @@ -371,11 +371,26 @@ function processEnglishInflectionGlosses(glosses, word, pos) {
}
}

function getCanonicalForm({word, forms}) {
function getCanonicalWordForm({word, forms}) {
if(!forms) return word;

const canonicalForm = forms.find(form =>
form.tags &&
switch(sourceIso) {
case 'ar':
case 'fa':
case 'la':
case 'ru':
return getCanonicalForm(word, forms); // canonical form is known to contain accent marks and such
case 'de':
// case 'fr': // canonical form sometimes just prepends the definite article, but many differ from the word in apostrophe variant. I don't know which is used in practice so leaving it until there's a yomitan preprocessor for french apostrophe usage.
case 'en':
return word; // canonical form is redundant, e.g. just prepends the definite article
default:
return getCanonicalForm(word, forms); // default could go either way. keeping existing behavior for now
}
}

function getCanonicalForm(word, forms) {
const canonicalForm = forms.find(form => form.tags &&
form.tags.includes('canonical')
);
if (canonicalForm && canonicalForm.form) {
Expand Down
21 changes: 21 additions & 0 deletions data/test/dict/en/en/tag_bank_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,26 @@
0,
"figuratively",
0
],
[
"n",
"partOfSpeech",
-1,
"noun",
1
],
[
"arch",
"archaism",
4,
"archaic",
-4
],
[
"ltrry",
"",
0,
"literary",
0
]
]
12 changes: 12 additions & 0 deletions data/test/dict/en/en/term_bank_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,17 @@
],
0,
""
],
[
"wain",
"",
"n arch ltrry",
"n",
0,
[
"A wagon; a four-wheeled cart for hauling loads, usually pulled by horses or oxen."
],
0,
""
]
]
17 changes: 17 additions & 0 deletions data/test/dict/en/en/term_bank_2.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,22 @@
],
0,
""
],
[
"wains",
"",
"non-lemma",
"",
0,
[
[
"wain",
[
"plural"
]
]
],
0,
""
]
]
13 changes: 13 additions & 0 deletions data/test/ipa/en/en/term_meta_bank_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,18 @@
}
]
}
],
[
"wain",
"ipa",
{
"reading": "wain",
"transcriptions": [
{
"ipa": "/weɪn/",
"tags": []
}
]
}
]
]
1 change: 1 addition & 0 deletions data/test/kaikki/en-en.json

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions data/test/tidy/en-en-forms-0.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,28 @@
]
]
}
],
[
"wain",
{
"_type": "map",
"map": [
[
"wains",
{
"_type": "map",
"map": [
[
"noun",
[
"plural"
]
]
]
}
]
]
}
]
]
}
23 changes: 23 additions & 0 deletions data/test/tidy/en-en-lemmas.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,28 @@
]
}
}
},
"wain": {
"wain": {
"noun": {
"ipa": [
{
"ipa": "/weɪn/",
"tags": []
}
],
"senses": [
{
"glosses": [
"(archaic or literary) A wagon; a four-wheeled cart for hauling loads, usually pulled by horses or oxen."
],
"tags": [
"archaic",
"literary"
]
}
]
}
}
}
}
Loading