added pos tools

clarin-eric · Nov 13, 2024 · c3fe5e3 · c3fe5e3
1 parent 468bbad
commit c3fe5e3
Show file tree

Hide file tree

Showing 68 changed files with 1,232 additions and 0 deletions.
diff --git a/tools/pos-and-lemmatisation/abltagger-lem.json b/tools/pos-and-lemmatisation/abltagger-lem.json
@@ -0,0 +1,18 @@
+{
+      "Name": "ABLTagger (Lemmatizer)",
+      "URL": "http://hdl.handle.net/20.500.12537/134",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "The lemmatiser achieves an accuracy of 98.3% on MIM-Gold (21.05, cross-validation).",
+      "Functionality": ["lemma"],
+      "Language": ["isl"],
+      "Licence": "The MIT License",
+      "Platform": [],
+      "Infrastructure": "CLARIN-IS",
+      "Group": "For a single language",
+      "Input format": ["tokenised plain text"],
+      "Output format": [],
+      "Access": {
+	"Download": "http://hdl.handle.net/20.500.12537/134"
+	},
+      "Publication": "Steingrímsson et al. (2019)"
+}
diff --git a/tools/pos-and-lemmatisation/abltagger-pos.json b/tools/pos-and-lemmatisation/abltagger-pos.json
@@ -0,0 +1,18 @@
+{
+      "Name": "ABLTagger (PoS)",
+      "URL": "http://hdl.handle.net/20.500.12537/115",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is a part of speech tagger for Icelandic. This entry contains pretrained models for ABLTagger v3.0.0. There are two versions of the PoS tagger, small and large, which work with the revised tagset and achieve an accuracy of ~96.7% and ~97.8% on MIM-Gold (cross-validation, excluding \"x\" and \"e\" tags), respectively.",
+      "Functionality": ["PoS"],
+      "Language": ["isl"],
+      "Licence": "Apache License 2.0",
+      "Platform": [],
+      "Infrastructure": "CLARIN-IS",
+      "Group": "For a single language",
+      "Input format": ["tokenised plain or pre-tagged text"],
+      "Output format": [],
+      "Access": {
+	"Download": "http://hdl.handle.net/20.500.12537/115"
+	},
+      "Publication": "Steingrímsson et al. (2019)"
+}
diff --git a/tools/pos-and-lemmatisation/afrikaans-tnt.json b/tools/pos-and-lemmatisation/afrikaans-tnt.json
@@ -0,0 +1,17 @@
+{
+      "Name": "Afrikaans TnT-Tagger",
+      "URL": "https://hdl.handle.net/20.500.12185/143",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is based on the <a href=\"http://www.coli.uni-saarland.de/~thorsten/tnt/\">TnT tagger</a> (Brants <a href=\"https://www.clarin.eu/resource-families/tools-part-speech-tagging-and-lemmatization#Brants%202000\">2000</a>). The tagset used by the tool was especially designed for Afrikaans and consists of 139 PoS-tags.",
+      "Functionality": ["PoS"],
+      "Language": ["afr"],
+      "Licence": "research only",
+      "Platform": [],
+      "Infrastructure": "SADiLaR",
+      "Group": "For a single language",
+      "Input format": ["plain text"],
+      "Output format": ["plain text"],
+      "Access": {
+	},
+      "Publication": ""
+}
diff --git a/tools/pos-and-lemmatisation/assamese-pos.json b/tools/pos-and-lemmatisation/assamese-pos.json
@@ -0,0 +1,17 @@
+{
+      "Name": "Assamese POS Tagger",
+      "URL": "http://hdl.handle.net/11321/620",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is a <a href=\"https://taku910.github.io/crfpp/\">CRF++</a> based PoS-tagger.",
+      "Functionality": ["PoS"],
+      "Language": ["asm"],
+      "Licence": "",
+      "Platform": [],
+      "Infrastructure": "CLARIN-PL",
+      "Group": "For a single language",
+      "Input format": [],
+      "Output format": [],
+      "Access": {
+	},
+      "Publication": ""
+}
diff --git a/tools/pos-and-lemmatisation/char-level-pos-slv.json b/tools/pos-and-lemmatisation/char-level-pos-slv.json
@@ -0,0 +1,18 @@
+{
+      "Name": "Character-level part-of-speech tagger of Slovene language",
+      "URL": "http://hdl.handle.net/11356/1211",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool uses convolutional and LSTM neural networks. The tool has been trained on the <a href=\"http://hdl.handle.net/11356/1181\">ssj500k 2.1 corpus</a>.",
+      "Functionality": ["PoS"],
+      "Language": ["slv"],
+      "Licence": "GNU General Public Licence, version 3",
+      "Platform": [],
+      "Infrastructure": "CLARIN.SI",
+      "Group": "For a single language",
+      "Input format": ["XML", "TEI", "plain text"],
+      "Output format": [],
+      "Access": {
+	"Download": "http://hdl.handle.net/11356/1211"
+	},
+      "Publication": "Belej (2018)"
+}
diff --git a/tools/pos-and-lemmatisation/clarin-dk-nlp-toolbox.json b/tools/pos-and-lemmatisation/clarin-dk-nlp-toolbox.json
@@ -0,0 +1,18 @@
+{
+      "Name": "CLARIN DK NLP Toolbox",
+      "URL": "https://clarin.dk/clarindk/toolchains-wizard.jsp",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is an NLP toolchain that is part of the core CLARIN-DK structure.",
+      "Functionality": ["PoS", "lemma", "frequency lists"],
+      "Language": ["dan", "eng"],
+      "Licence": "",
+      "Platform": [],
+      "Infrastructure": "CLARIN-DK",
+      "Group": "For multiple languages",
+      "Input format": ["plain text", "rtf", "pdf"],
+      "Output format": ["plain text", "rtf"],
+      "Access": {
+	"Web application": "https://clarin.dk/clarindk/toolchains-wizard.jsp"
+	},
+      "Publication": ""
+}
diff --git a/tools/pos-and-lemmatisation/clark.json b/tools/pos-and-lemmatisation/clark.json
@@ -0,0 +1,18 @@
+{
+      "Name": "CLaRK",
+      "URL": "http://bultreebank.org/en/clark/",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is an XML-based software system for corpora development implemented in Java. The main aim behind the design of the system is the minimization of human intervention during the creation of language resources. CLaRK includes <a href=\"http://bultreebank.org/en/clark/bulgarian-nlp-pipeline-in-clark-system/\">BTB-Pipe</a>, which is a language pipeline for Bulgarian that comprises the following modules: sentence splitting, MSD-tagging, lemmatization, and dependency parsing.",
+      "Functionality": ["sentence splitting", "PoS", "lemma", "syntactic parsing"],
+      "Language": ["bul"],
+      "Licence": "",
+      "Platform": [],
+      "Infrastructure": "ClaDA-BG",
+      "Group": "For a single language",
+      "Input format": ["XML"],
+      "Output format": ["XML"],
+      "Access": {
+	"Download": "http://bultreebank.org/en/clark/bulgarian-nlp-pipeline-in-clark-system/"
+	},
+      "Publication": "Simov et al. (2001)"
+}
diff --git a/tools/pos-and-lemmatisation/claws.json b/tools/pos-and-lemmatisation/claws.json
@@ -0,0 +1,18 @@
+{
+      "Name": "CLAWS",
+      "URL": "https://www.clarin.ac.uk/claws",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "CLAWS (the Constituent Likelihood Automatic Word-tagging System) has been continuously developed since the early 1980s. The latest version of the tagger, CLAWS4, was used to PoS tag approx. 100 million words of the <a href=\"http://www.natcorp.ox.ac.uk/\">British National Corpus (BNC)</a>, and all the English corpora in Mark Davies' <a href=\"https://corpus.byu.edu/\">BYU corpus server</a>. Users can choose to have output in either the smaller <a href=\"http://ucrel.lancs.ac.uk/claws5tags.html\">C5 tagset</a> or the larger <a href=\"http://ucrel.lancs.ac.uk/claws7tags.html\">C7 tagset</a>.",
+      "Functionality": ["PoS/MSD"],
+      "Language": ["eng"],
+      "Licence": "<a href=\"http://ucrel.lancs.ac.uk/claws/purchase.html\">Terms of Service</a>",
+      "Platform": [],
+      "Infrastructure": "CLARIN UK",
+      "Group": "For a single language",
+      "Input format": ["plain text"],
+      "Output format": ["horizontal", "vertical", "pseudo-XML"],
+      "Access": {
+	"Web application": "http://ucrel-api.lancaster.ac.uk/claws/free.html"
+	},
+      "Publication": "Garside and Smith (1997)"
+}
diff --git a/tools/pos-and-lemmatisation/corpus-by.json b/tools/pos-and-lemmatisation/corpus-by.json
@@ -0,0 +1,18 @@
+{
+      "Name": "Corpus.by Lemmatizer",
+      "URL": "https://www.corpus.by/Lemmatizer/?lang=en",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is part of the <a href=\"https://www.corpus.by/\">corpus.by</a> platform.",
+      "Functionality": ["lemma"],
+      "Language": ["bel"],
+      "Licence": "",
+      "Platform": [],
+      "Infrastructure": "CLARIN Knowledge Centre for Belarusian text and speech processing",
+      "Group": "For a single language",
+      "Input format": ["plain text"],
+      "Output format": ["plain text"],
+      "Access": {
+	"Web service": "https://www.corpus.by/Lemmatizer/?lang=en"
+	},
+      "Publication": ""
+}
diff --git a/tools/pos-and-lemmatisation/cst-lemmatizer.json b/tools/pos-and-lemmatisation/cst-lemmatizer.json
@@ -0,0 +1,18 @@
+{
+      "Name": "CST’s lemmatizer",
+      "URL": "http://hdl.handle.net/11372/LRT-1249",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool uses affix rules (affix: prefix, infix, suffix, circumfix).",
+      "Functionality": ["lemma"],
+      "Language": ["bul", "ces", "dan", "nld", "eng", "est", "fas", "fra", "deu", "ell", "hun", "isl", "ita", "lat", "mkd", "pol", "por", "ron", "rus", "srp", "slk", "slv", "spa", "ukr"],
+      "Licence": "",
+      "Platform": [],
+      "Infrastructure": "LINDAT/CLARIN-DK",
+      "Group": "For multiple languages",
+      "Input format": [],
+      "Output format": [],
+      "Access": {
+	"Download": "http://cst.dk/download/uk/"
+	},
+      "Publication": "Jongejan and Dalianis (2009)"
+}
diff --git a/tools/pos-and-lemmatisation/estnltk.json b/tools/pos-and-lemmatisation/estnltk.json
@@ -0,0 +1,18 @@
+{
+      "Name": "EstNLTK",
+      "URL": "https://estnltk.github.io/estnltk/1.4.1/",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool provides common natural language processing functionality such as morphological analysis and named entity recognition for the Estonian language.\nWeb API documentation is available <a href=\"https://estnltk.github.io/estnltk/1.4.1/api/index.html\">here</a>.",
+      "Functionality": ["MSD", "NER"],
+      "Language": ["est"],
+      "Licence": "Available - Unrestricted Use",
+      "Platform": [],
+      "Infrastructure": "CELR",
+      "Group": "For a single language",
+      "Input format": ["plain text"],
+      "Output format": ["plain text"],
+      "Access": {
+	"Download": "https://estnltk.github.io/estnltk/1.4.1/"
+	},
+      "Publication": "Orasmaa et al. (2016)"
+}
diff --git a/tools/pos-and-lemmatisation/fintag.json b/tools/pos-and-lemmatisation/fintag.json
@@ -0,0 +1,19 @@
+{
+      "Name": "FinTag",
+      "URL": "http://urn.fi/urn:nbn:fi:lb-201908161",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This toolchain provides finnish-postag, a part-of-speech and morphology tagger for Finnish, and finnish-nertag, a named entity recogniser for Finnish. Both tools take running text from standard input and produce tabular output (one token per line) to standard output.",
+      "Functionality": ["PoS", "lemma", "NER"],
+      "Language": ["fin"],
+      "Licence": "GPL",
+      "Platform": [],
+      "Infrastructure": "FIN-CLARIN",
+      "Group": "For a single language",
+      "Input format": ["plain text", "pdf", "doc", "csv", "epub", "html", "odt", "xls"],
+      "Output format": ["TSV"],
+      "Access": {
+	"Download": "http://urn.fi/urn:nbn:fi:lb-201908162",
+	"Web application": "https://www.kielipankki.fi/tools/demo/cgi-bin/fintag.py"
+	},
+      "Publication": ""
+}
diff --git a/tools/pos-and-lemmatisation/freeling.json b/tools/pos-and-lemmatisation/freeling.json
@@ -0,0 +1,18 @@
+{
+      "Name": "Freeling",
+      "URL": "http://hdl.handle.net/20.500.11752/ILC-72",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This toolchain was developed in the <a href=\"http://www.panacea-lr.eu/\">PANACEA</a> project and implements <a href=\"http://devel.cpl.upc.edu/freeling/downloads?order=time&desc=1\">Freeling 2.1 libraries</a>.",
+      "Functionality": ["PoS", "lemma"],
+      "Language": ["ita"],
+      "Licence": "",
+      "Platform": [],
+      "Infrastructure": "CLARIN-IT",
+      "Group": "For a single language",
+      "Input format": [],
+      "Output format": [],
+      "Access": {
+	"Web application": "https://ilc4clarin.ilc.cnr.it/en/service/freeling-it"
+	},
+      "Publication": "Padró et al. (2010)"
+}
diff --git a/tools/pos-and-lemmatisation/frog.json b/tools/pos-and-lemmatisation/frog.json
@@ -0,0 +1,18 @@
+{
+      "Name": "Frog",
+      "URL": "http://hdl.handle.net/10032/198143d2010e74ae17d4223dfc00e2a8",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is an integration of memory-based NLP modules developed for Dutch. All NLP modules are based on <a href=\"https://languagemachines.github.io/timbl/\">TiMBL</a>, the Tilburg memory-based learning software package. Where possible, Frog makes use of multi-processor support to run subtasks in parallel.",
+      "Functionality": ["PoS", "MSD", "lemma", "NE", "phrase chunks", "dependency relations with head words"],
+      "Language": ["nld"],
+      "Licence": "GNU General Public Licence",
+      "Platform": [],
+      "Infrastructure": "CLARIAH-NL",
+      "Group": "For a single language",
+      "Input format": [],
+      "Output format": ["FoLiA XML"],
+      "Access": {
+	"Download": "https://github.com/LanguageMachines/frog/releases/"
+	},
+      "Publication": "van den Bosch et al. (2007)"
+}
diff --git a/tools/pos-and-lemmatisation/genia-tagger.json b/tools/pos-and-lemmatisation/genia-tagger.json
@@ -0,0 +1,18 @@
+{
+      "Name": "GENIA Tagger",
+      "URL": "https://hdl.handle.net/21.11115/0000-000B-D330-0",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is used for annotating biomedical texts such as <a href=\"https://www.nlm.nih.gov/bsd/medline.html\">MEDLINE</a> abstracts.",
+      "Functionality": ["PoS", "lemma", "chunks", "named entities"],
+      "Language": ["eng", "ces", "slk"],
+      "Licence": "proprietary - commercial",
+      "Platform": [],
+      "Infrastructure": "PORTULAN",
+      "Group": "For multiple languages",
+      "Input format": [],
+      "Output format": [],
+      "Access": {
+	"Download": "http://www.nactem.ac.uk/GENIA/tagger/"
+	},
+      "Publication": "Tsuruoka et al. (2015)"
+}
diff --git a/tools/pos-and-lemmatisation/hmm-tagger.json b/tools/pos-and-lemmatisation/hmm-tagger.json
@@ -0,0 +1,18 @@
+{
+      "Name": "HMM tagger",
+      "URL": "http://hdl.handle.net/11858/00-097C-0000-0001-48F9-4",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool uses Hidden Markov Models and is an implementation of the UFAL tagger.",
+      "Functionality": ["MSD"],
+      "Language": ["ces"],
+      "Licence": "GNU General Public Licence, version 2",
+      "Platform": [],
+      "Infrastructure": "LINDAT",
+      "Group": "For a single language",
+      "Input format": [],
+      "Output format": [],
+      "Access": {
+	"Download": "http://hdl.handle.net/11858/00-097C-0000-0001-48F9-4"
+	},
+      "Publication": ""
+}
diff --git a/tools/pos-and-lemmatisation/hunpos.json b/tools/pos-and-lemmatisation/hunpos.json
@@ -0,0 +1,18 @@
+{
+      "Name": "hunpos",
+      "URL": "https://hdl.handle.net/11372/LRT-1205",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is an open source reimplementation of the <a href=\"http://www.coli.uni-saarland.de/~thorsten/tnt/\">TnT tagger</a> (Brants <a href=\"http://www.coli.uni-saarland.de/~thorsten/publications/Brants-TR-TnT.pdf\">2000</a>).",
+      "Functionality": ["PoS"],
+      "Language": ["hun"],
+      "Licence": "New BSD License",
+      "Platform": [],
+      "Infrastructure": "LINDAT",
+      "Group": "For a single language",
+      "Input format": [],
+      "Output format": [],
+      "Access": {
+	"Download": "https://code.google.com/archive/p/hunpos/downloads"
+	},
+      "Publication": "Halácsy et al. (2007)"
+}
diff --git a/tools/pos-and-lemmatisation/icenlp.json b/tools/pos-and-lemmatisation/icenlp.json
@@ -0,0 +1,19 @@
+{
+      "Name": "IceNLP Natural Language Processing toolkit",
+      "URL": "http://hdl.handle.net/20.500.12537/8",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is an open source NLP toolkit for analyzing and processing Icelandic text. The toolkit is implemented in Java.",
+      "Functionality": ["PoS", "lemma", "shallow syntactic parsing"],
+      "Language": ["isl"],
+      "Licence": "GNU General Public Licence, version 2",
+      "Platform": [],
+      "Infrastructure": "CLARIN-IS",
+      "Group": "For a single language",
+      "Input format": ["plain text"],
+      "Output format": ["plain text"],
+      "Access": {
+	"Download": "https://github.com/hrafnl/icenlp",
+	"Web application": "http://nlp.cs.ru.is:8080/IceNLPWeb/icenlp.html"
+	},
+      "Publication": "Loftsson and Rögnvaldsson (2007)"
+}
diff --git a/tools/pos-and-lemmatisation/ilsp-feature.json b/tools/pos-and-lemmatisation/ilsp-feature.json
@@ -0,0 +1,18 @@
+{
+      "Name": "ILSP Feature-based multi-tiered POS Tagger",
+      "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23E8-3",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool is an FBT-based multi-tiered tagger. FBT is a variant of the well-known transformation-based learning paradigm aiming at improving the quality of tagging highly inflective languages such as Greek.",
+      "Functionality": ["PoS"],
+      "Language": ["ell"],
+      "Licence": "terms of service (Restrictions: Academic - Non Commercial Use)",
+      "Platform": [],
+      "Infrastructure": "CLARIN:EL",
+      "Group": "For a single language",
+      "Input format": ["Application/vnd.xmi+xml"],
+      "Output format": ["Application/vnd.xmi+xml"],
+      "Access": {
+	"Web application": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23E8-3"
+	},
+      "Publication": "Papageorgiou et al. (2000)"
+}
diff --git a/tools/pos-and-lemmatisation/inl-labs.json b/tools/pos-and-lemmatisation/inl-labs.json
@@ -0,0 +1,18 @@
+{
+      "Name": "INL Labs tagger/lemmatizer tools",
+      "URL": "http://hdl.handle.net/10032/79a7f85fc70d1cf276c4c6a0a56dd176",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool employs a PoS tagger that is trained on the \"<a href=\"http://hdl.handle.net/10032/ad544a1cfddde8aeb8fca5b02188e581\">Letters as loot</a>\" historical corpus and a lemmatizer that is trained on the <a href=\"https://ivdnt.org/taalmaterialen/2126-tstc-int-historische-woordenlijst-j\">INL historical lexicon</a>.",
+      "Functionality": ["PoS", "lemma"],
+      "Language": ["nld"],
+      "Licence": "CLARIN PUB",
+      "Platform": [],
+      "Infrastructure": "CLARIAH-NL",
+      "Group": "For a single language",
+      "Input format": ["plain text", "TEI", "epub", "html", "docx", "alto"],
+      "Output format": ["styled", "XML"],
+      "Access": {
+	"Web application": "http://inl-labs.taalbanknederlands.inl.nl/succeed/tagger/ui"
+	},
+      "Publication": ""
+}
diff --git a/tools/pos-and-lemmatisation/janes-tagger.json b/tools/pos-and-lemmatisation/janes-tagger.json
@@ -0,0 +1,18 @@
+{
+      "Name": "janes-tagger",
+      "URL": "https://github.com/clarinsi/janes-tagger",
+	"Family": "Part-of-Speech Tagging and Lemmatisation",
+      "Description": "This tool, which was developed in the context of the <a href=\"http://nl.ijs.si/janes/english/\">JANES</a> project, tags non-standard Slovenian, with Croatian and Serbian to follow.",
+      "Functionality": ["PoS", "lemma"],
+      "Language": ["slv"],
+      "Licence": "",
+      "Platform": [],
+      "Infrastructure": "CLARIN.SI",
+      "Group": "For a single language",
+      "Input format": ["plain text"],
+      "Output format": [],
+      "Access": {
+	"Download": "https://github.com/clarinsi/janes-tagger"
+	},
+      "Publication": ""
+}