Merge pull request #1151 from kermitt2/flavor

Managing model specializations/variants (flavors)
kermitt2 · Jan 6, 2025 · 7c0bccf · 7c0bccf
2 parents 5b0dd4f + 5b05460
commit 7c0bccf
Show file tree

Hide file tree

Showing 103 changed files with 331,717 additions and 104 deletions.
diff --git a/build.gradle b/build.gradle
@@ -503,20 +503,27 @@ project(":grobid-trainer") {
         "train_name_header"           : "org.grobid.trainer.NameHeaderTrainer",
         "train_name_citation"         : "org.grobid.trainer.NameCitationTrainer",
         "train_affiliation_address"   : "org.grobid.trainer.AffiliationAddressTrainer",
-        "train_header"                : "org.grobid.trainer.HeaderTrainer",
+//        "train_header"                : "org.grobid.trainer.HeaderTrainer",
         "train_fulltext"              : "org.grobid.trainer.FulltextTrainer",
         "train_shorttext"             : "org.grobid.trainer.ShorttextTrainer",
         "train_figure"                : "org.grobid.trainer.FigureTrainer",
         "train_table"                 : "org.grobid.trainer.TableTrainer",
         "train_citation"              : "org.grobid.trainer.CitationTrainer",
         "train_date"                  : "org.grobid.trainer.DateTrainer",
-        "train_segmentation"          : "org.grobid.trainer.SegmentationTrainer",
+//        "train_segmentation"          : "org.grobid.trainer.SegmentationTrainer",
         "train_reference_segmentation": "org.grobid.trainer.ReferenceSegmenterTrainer",
         "train_ebook_model"           : "org.grobid.trainer.EbookTrainer",
         "train_patent_citation"       : "org.grobid.trainer.PatentParserTrainer",
         "train_funding_acknowledgement" : "org.grobid.trainer.FundingAcknowledgementTrainer"
     ]
 
+    // Trainer tasks that take an argument: task name -> [trainer main class, argument].
+    // The first element is used as the main class and the second is passed as the
+    // program argument when the tasks are created below. The non-empty arguments are
+    // flavor labels; "" is used for the tasks without a flavor (presumably selecting
+    // the default model - confirm against the trainer's main method).
+    def complexTrainerTasks = [
+        "train_header"                : ["org.grobid.trainer.HeaderTrainer", ""],
+        "train_header_ietf"           : ["org.grobid.trainer.HeaderTrainer", "sdo/ietf"],
+        "train_segmentation"          : ["org.grobid.trainer.SegmentationTrainer", ""],
+        "train_segmentation_ietf"     : ["org.grobid.trainer.SegmentationTrainer", "sdo/ietf"]
+    ]
+
     def libraries = ""
     if (Os.isFamily(Os.FAMILY_MAC)) {
         if (Os.OS_ARCH.equals("aarch64")) {
@@ -541,6 +548,18 @@ project(":grobid-trainer") {
         }
     }
 
+    // Registers one JavaExec task per entry of complexTrainerTasks, in the 'modeltraining' group.
+    complexTrainerTasks.each { taskName, mainClassNameAndArgs ->
+        tasks.create(name: taskName, type: JavaExec, group: 'modeltraining') {
+            // first element: trainer main class; second element: argument handed to it
+            main = mainClassNameAndArgs[0]
+            classpath = sourceSets.main.runtimeClasspath
+            // JVM options are only set when running on a Java version newer than 1.8;
+            // they are declared once so that they are not appended twice to the command line
+            if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) {
+                jvmArgs '-Xmx3072m', "--add-opens", "java.base/java.lang=ALL-UNNAMED"
+            }
+            args mainClassNameAndArgs[1]
+        }
+    }
+
     // evaluation tasks
     ext.getArg = { propName, defaultVal ->
         return project.hasProperty(propName) ? project.getProperty(propName) : defaultVal;

diff --git a/doc/Grobid-service.md b/doc/Grobid-service.md
@@ -192,6 +192,7 @@ Convert the complete input document into TEI XML format (header, body and biblio
 |           |                       |                      | `generateIDs`            | optional        | if supplied as a string equal to `1`, it generates uniqe identifiers for each text component                                                                                                                                                                       |
 |           |                       |                      | `start`                  | optional        | Start page number of the PDF to be considered, previous pages will be skipped/ignored, integer with first page starting at `1`, (default `-1`, start from the first page of the PDF)                                                                               |
 |           |                       |                      | `end`                    | optional        | End page number of the PDF to be considered, next pages will be skipped/ignored, integer with first page starting at `1` (default `-1`, end with the last page of the PDF)                                                                                         |
+|           |                       |                      | `flavor`                 | optional        | Indicate which flavor to apply for structuring the document. Useful when the default structuring cannot be applied to a specific document (e.g. the body is empty). More technical details and the available flavor names are given in the [dedicated page](Grobid-specialized-processes.md). |
 
 Response status codes:
 

diff --git a/doc/Grobid-specialized-processes.md b/doc/Grobid-specialized-processes.md
@@ -0,0 +1,51 @@
+# GROBID specialised processing (aka flavors)
+
+## Introduction
+
+This is a simple management of alternative models to use when processing a document. 
+A model variant (or flavor) is for example an alternative header model trained with its own training data and labels (to cover documents with specific header section different from scholar articles), or an alternative segmentation model for segmenting something else than scholar papers.
+
+To process a document with alternative model(s), we use a string called "flavor" to identify it. 
+If the flavor is indicated, the selected model will use the "flavor" model if it exists, and the normal model if the flavor does not exist for this model (thus falling back to the standard model).
+
+Flavor model training data are always located in subdirectories of the standard training data path, e.g. for the flavor "sdo/ietf", the training data of the header model for this flavor will be under `grobid-trainer/resources/dataset/header/sdo/ietf`.
+The training data of the segmentation model for this flavor will be under `grobid-trainer/resources/dataset/segmentation/sdo/ietf`, and so on.
+
+For running grobid following a particular flavor, we add the flavor name as additional parameter of the service:
+
+```shell
+curl -v --form input=@./XP123456.pdf --form "flavor=sdo/ietf" localhost:8070/api/processFulltextDocument
+```
+
+Below is an updated view of the cascade architecture:
+
+![cascade-with-flavors.png](img/cascade-with-flavors.png)
+
+## Flavors
+
+At the moment, the flavored processes are available as follows:
+
+| Name                                                      | Identifier | Flavored models          | Description                                   | Advantages                                                   | Limitations |
+|-----------------------------------------------------------|------------|--------------------------|-----------------------------------------------|--------------------------------------------------------------|-------------|
+| Internet Engineering Task Force (IETF) Standard Documents | `sdo/ietf` | `segmentation`, `header` | Processing of the IETF Standard documentation | Supports the processing of a different flavor of documents   |             |
+| 3GPP Working Procedures Standard Documents                | `sdo/3gpp` | N/A                      |                                               |                                                              |             |
+
+
+## Training the specialised flavor models  
+
+The training data for the flavors are following the same structure as the standard models. 
+In other words, the annotated training data for, e.g., the lightweight segmentation model with references, for articles, follow the same guidelines as the standard grobid segmentation model.
+The Grobid parser automatically selects the right subset of labels to include.
+However, this can be implemented at discretion of the user, so for example a flavor `sdo/ietf` for parsing standards documents for IETF, can be following their specific guidelines.
+
+For training the specialised models, the same procedure as for the standard models is used, but the flavor is indicated in the training command, e.g. to train the segmentation model for the flavor `sdo/ietf`:
+
+```shell
+./gradlew train_segmentation_ietf
+```
+
+or the header model for the flavor `sdo/ietf`: 
+
+```shell
+./gradlew train_header_ietf
+```
diff --git a/doc/img/cascade-with-flavors.png b/doc/img/cascade-with-flavors.png
diff --git a/doc/index.md b/doc/index.md
@@ -25,6 +25,8 @@
 
 * [GROBID configuration](Configuration.md)
 
+* [GROBID specialized processes](Grobid-specialized-processes.md)
+
 * [Troubleshooting and known issues](Troubleshooting.md)
 
 * [Use Grobid library in third party Java applications](Grobid-java-library.md)

diff --git a/grobid-core/src/main/java/org/grobid/core/GrobidModels.java b/grobid-core/src/main/java/org/grobid/core/GrobidModels.java
@@ -4,8 +4,11 @@
 import org.grobid.core.utilities.GrobidProperties;
 
 import java.io.File;
+import java.util.Arrays;
+import java.util.List;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
+import java.util.stream.Collectors;
 
 import static org.grobid.core.engines.EngineParsers.LOGGER;
 
@@ -20,6 +23,8 @@ public enum GrobidModels implements GrobidModel {
 
     AFFILIATION_ADDRESS("affiliation-address"),
     SEGMENTATION("segmentation"),
+    SEGMENTATION_SDO_IETF("segmentation/sdo/ietf"),
+    SEGMENTATION_SDO_3GPP("segmentation/sdo/3gpp"),
     CITATION("citation"),
     REFERENCE_SEGMENTER("reference-segmenter"),
     DATE("date"),
@@ -33,6 +38,8 @@ public enum GrobidModels implements GrobidModel {
     FIGURE("figure"),
     TABLE("table"),
     HEADER("header"),
+    HEADER_SDO_3GPP("header/sdo/3gpp"),
+    HEADER_SDO_IETF("header/sdo/ietf"),
     NAMES_CITATION("name/citation"),
     NAMES_HEADER("name/header"),
     PATENT_PATENT("patent/patent"),
@@ -62,19 +69,43 @@ public enum GrobidModels implements GrobidModel {
     // This is used in particular for scientific or technical documents like standards (SDO) 
     // which have a particular overall zoning and/or header, while the rest of the content 
     // is similar to other general technical and scientific document
-    public enum Collection {
+    public enum Flavor {
+        _3GPP("sdo/3gpp"),
         IETF("sdo/ietf");
 
         public final String label;
 
-        private Collection(String label) {
+        private Flavor(String label) {
             this.label = label;
         }
 
         public String getLabel() {
             return label;
         }
-    };
+
+        /**
+         * Returns the label with every "/" replaced by "_"
+         * (for instance "sdo/ietf" becomes "sdo_ietf").
+         */
+        public String getPlainLabel() {
+            final String flattened = label.replace("/", "_");
+            return flattened;
+        }
+
+        /**
+         * Looks up a flavor by its label, ignoring case.
+         *
+         * @param text the label to look for, e.g. "sdo/ietf"
+         * @return the first flavor whose label matches, or null when none does
+         */
+        public static Flavor fromLabel(String text) {
+            return Arrays.stream(Flavor.values())
+                .filter(candidate -> candidate.label.equalsIgnoreCase(text))
+                .findFirst()
+                .orElse(null);
+        }
+
+        /**
+         * Lists the labels of all flavors, in the order the constants are declared.
+         *
+         * @return a new list with one label per flavor
+         */
+        public static List<String> getLabels() {
+            final Flavor[] flavors = Flavor.values();
+            return Arrays.stream(flavors)
+                .map(flavor -> flavor.getLabel())
+                .collect(Collectors.toList());
+        }
+
+        /**
+         * Returns the flavor label (same value as {@link #getLabel()}) instead of the
+         * enum constant name.
+         */
+        @Override
+        public String toString() {
+            return getLabel();
+        }
+    }
 
     /**
      * Absolute path to the model.
@@ -124,6 +155,13 @@ public String toString() {
         return folderName;
     }
 
+    public static GrobidModel getModelFlavor(GrobidModel model, Flavor flavor) {
+        if (flavor == null) {
+            return model;
+        } else 
+            return modelFor(model.toString() + "/" + flavor.getLabel().toLowerCase());
+    }
+
     public static GrobidModel modelFor(final String name) {
         if (models.isEmpty()) {
             for (GrobidModel model : values())

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java
@@ -4,6 +4,7 @@
 import org.apache.commons.lang3.tuple.MutablePair;
 import org.apache.commons.lang3.tuple.MutableTriple;
 import org.apache.commons.lang3.tuple.Pair;
+import org.grobid.core.GrobidModels;
 import org.grobid.core.data.*;
 import org.grobid.core.document.Document;
 import org.grobid.core.document.DocumentSource;
@@ -364,10 +365,11 @@ public String processHeader(
      * @param consolidateHeader the consolidation option allows GROBID to exploit Crossref web services for improving header
      *                    information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
      *                    metadata) or 2 (consolidate the citation and inject DOI only)
-     * @param consolidateFunder the consolidation option allows GROBID to exploit Crossref Funder Registry web services for improving header
+     * @param consolidateFunders the consolidation option allows GROBID to exploit Crossref Funder Registry web services for improving header
      *                    information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
      *                    metadata) or 2 (consolidate the citation and inject DOI only)
-     * @param result      bib result
+     * @param includeRawAffiliations includes the raw affiliation in the output
+     * @param includeRawCopyrights includes the raw copyright information in the output
      * @return the TEI representation of the extracted bibliographical
      *         information
      */
@@ -427,10 +429,11 @@ public String processHeader(
      * @param consolidateHeader the consolidation option allows GROBID to exploit Crossref web services for improving header
      *                    information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
      *                    metadata) or 2 (consolidate the citation and inject DOI only)
-     * @param consolidateFunder the consolidation option allows GROBID to exploit Crossref Funder Registry web services for improving header
+     * @param consolidateFunders the consolidation option allows GROBID to exploit Crossref Funder Registry web services for improving header
      *                    information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
      *                    metadata) or 2 (consolidate the citation and inject DOI only)
-     * @param result      bib result
+     * @param includeRawAffiliations includes the raw affiliation in the output
+     * @param includeRawCopyrights includes the raw copyright information in the output
      * @return the TEI representation of the extracted bibliographical
      *         information
      */
@@ -557,7 +560,13 @@ public void createTraining(File inputFile, String pathRaw, String pathTEI, int i
      */
     public String fullTextToTEI(File inputFile,
                                 GrobidAnalysisConfig config) throws Exception {
-        return fullTextToTEIDoc(inputFile, null, config).getTei();
+        return fullTextToTEIDoc(inputFile, null,null, config).getTei();
+    }
+
+    /**
+     * Processes the input file with the given flavor and returns the TEI of the
+     * resulting document. No MD5 string is supplied: null is passed in its place
+     * to fullTextToTEIDoc.
+     *
+     * @param inputFile the file to process
+     * @param flavor    the flavor forwarded to the full text processing
+     * @param config    the analysis configuration forwarded to the full text processing
+     * @return the resulting structured document as a TEI string
+     */
+    public String fullTextToTEI(File inputFile,
+                                GrobidModels.Flavor flavor,
+                                GrobidAnalysisConfig config) throws Exception {
+        final Document processed = fullTextToTEIDoc(inputFile, flavor, null, config);
+        return processed.getTei();
+    }
 
     /**
@@ -573,36 +582,39 @@ public String fullTextToTEI(File inputFile,
      * @return the resulting structured document as a TEI string.
      */
     public String fullTextToTEI(File inputFile,
+                                GrobidModels.Flavor flavor,
                                 String md5Str,
                                 GrobidAnalysisConfig config) throws Exception {
-        return fullTextToTEIDoc(inputFile, md5Str, config).getTei();
+        return fullTextToTEIDoc(inputFile, flavor, md5Str, config).getTei();
     }
 
     public Document fullTextToTEIDoc(File inputFile,
-                                     String md5Str,
-                                     GrobidAnalysisConfig config) throws Exception {
+                                    GrobidModels.Flavor flavor,
+                                    String md5Str,
+                                    GrobidAnalysisConfig config) throws Exception {
         FullTextParser fullTextParser = parsers.getFullTextParser();
         Document resultDoc;
         LOGGER.debug("Starting processing fullTextToTEI on " + inputFile);
         long time = System.currentTimeMillis();
-        resultDoc = fullTextParser.processing(inputFile, md5Str, config);
+        resultDoc = fullTextParser.processing(inputFile, flavor, md5Str, config);
         LOGGER.debug("Ending processing fullTextToTEI on " + inputFile + ". Time to process: "
 			+ (System.currentTimeMillis() - time) + "ms");
         return resultDoc;
     }
 
     public Document fullTextToTEIDoc(File inputFile,
                                      GrobidAnalysisConfig config) throws Exception {
-        return fullTextToTEIDoc(inputFile, null, config);
+        return fullTextToTEIDoc(inputFile, null, null, config);
     }
 
     public Document fullTextToTEIDoc(DocumentSource documentSource,
-                                     GrobidAnalysisConfig config) throws Exception {
+                                    GrobidModels.Flavor flavor,
+                                    GrobidAnalysisConfig config) throws Exception {
         FullTextParser fullTextParser = parsers.getFullTextParser();
         Document resultDoc;
         LOGGER.debug("Starting processing fullTextToTEI on " + documentSource);
         long time = System.currentTimeMillis();
-        resultDoc = fullTextParser.processing(documentSource, config);
+        resultDoc = fullTextParser.processing(documentSource, flavor, config);
         LOGGER.debug("Ending processing fullTextToTEI on " + documentSource + ". Time to process: "
                 + (System.currentTimeMillis() - time) + "ms");
         return resultDoc;