Skip to content

Commit

Permalink
rename methods for better clarity, move utility methods in Kotlin
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Dec 18, 2024
1 parent 3778a6e commit 21f85c9
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 89 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ else if (config.getConsolidateCitations() == 2)

resultBody = label(bodytext);
//Correct subsequent I-<figure> or I-<table>
resultBody = LabelUtils.adjustInvalidSequenceOfStartLabels(resultBody);
resultBody = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(resultBody);

// we apply now the figure and table models based on the fulltext labeled output
figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.grobid.core.engines;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
Expand Down Expand Up @@ -261,7 +262,7 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
//resHeader.setKeyword(keywords.replace("\n", " ").replace(" ", " "));
resHeader.setKeyword(keywords);
List<Keyword> keywordsSegmented = BiblioItem.segmentKeywords(keywords);
if ((keywordsSegmented != null) && (keywordsSegmented.size() > 0))
if (CollectionUtils.isNotEmpty(keywordsSegmented))
resHeader.setKeywords(keywordsSegmented);
}

Expand Down Expand Up @@ -311,7 +312,7 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
}

// copyrights/license identification
if (resHeader.getCopyright() != null && resHeader.getCopyright().length()>0) {
if (StringUtils.isNotBlank(resHeader.getCopyright())) {
if (GrobidProperties.getGrobidEngineName("copyright").equals("delft")) {
CopyrightsLicense copyrightsLicense = LicenseClassifier.getInstance().classify(resHeader.getCopyright());
if (copyrightsLicense != null)
Expand Down Expand Up @@ -928,6 +929,7 @@ else if (biblio.getPublicationDate() == null)
// this will need to be reviewed with more training data, for the moment
// avoid concatenation for abstracts as it brings more noise than correct pieces
//biblio.setAbstract(biblio.getAbstract() + " " + clusterContent);
//TODO: avoid dumping text on the floor
} else {
biblio.setAbstract(clusterContent);
List<LayoutToken> tokens = cluster.concatTokens();
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package org.grobid.core.utilities

import org.apache.commons.lang3.StringUtils
import org.grobid.core.engines.label.TaggingLabels

/**
 * Helpers that rewrite the label column of sequence-labeling output.
 *
 * The input is expected to hold one token per line, with tab-separated columns and the
 * predicted label in the last non-empty column. Blank lines are dropped from the output,
 * and every emitted line is terminated by a newline character.
 */
object LabelUtils {
    /**
     * Post-processes text labeled by the fulltext model on chunks that are known to be plain
     * text (no table or figure), converting table and figure labels to paragraph labels.
     *
     * A starting figure/table label (`I-` prefix) becomes a starting paragraph label, unless the
     * original label of the previous line already ends with the paragraph label, in which case the
     * token continues that paragraph. Non-starting figure/table labels become plain paragraph labels.
     * All other labels are left untouched.
     *
     * @param fulltextLabeledText the labeled output, one tab-separated token per line
     * @return the relabeled text; blank lines and trailing empty columns are not reproduced
     */
    @JvmStatic
    fun postProcessFullTextLabeledText(fulltextLabeledText: String): String {
        val figure: String = TaggingLabels.FIGURE.label
        val table: String = TaggingLabels.TABLE.label
        val paragraph: String = TaggingLabels.PARAGRAPH.label

        return rewriteLabels(fulltextLabeledText) { label, previousLabel ->
            when (label) {
                "I-$figure", "I-$table" ->
                    if (previousLabel == null || !previousLabel.endsWith(paragraph)) "I-$paragraph" else paragraph

                figure, table -> paragraph
                else -> label
            }
        }
    }

    /**
     * Corrects the fulltext sequence when the model has predicted several unlikely consecutive
     * start labels for the same table or figure.
     *
     * For example, an `I-<figure>` line directly following another `I-<figure>` line is rewritten
     * as a continuation (`<figure>`); the same applies to tables. The comparison uses the original
     * label of the previous line, so a run of repeated start labels collapses into a single start.
     *
     * @param fulltextLabeledText the labeled output, one tab-separated token per line
     * @return the corrected text; blank lines and trailing empty columns are not reproduced
     */
    @JvmStatic
    fun postProcessFulltextFixInvalidTableOrFigure(fulltextLabeledText: String): String {
        val figure: String = TaggingLabels.FIGURE.label
        val table: String = TaggingLabels.TABLE.label

        return rewriteLabels(fulltextLabeledText) { label, previousLabel ->
            when {
                label == "I-$figure" && previousLabel == label -> figure
                label == "I-$table" && previousLabel == label -> table
                else -> label
            }
        }
    }

    /**
     * Walks [labeledText] line by line and replaces the label (last non-empty column) of each
     * non-blank line with the value returned by [relabel].
     *
     * [relabel] receives the current label and the original (pre-rewrite) label of the previous
     * non-blank line, or `null` for the first one.
     */
    private fun rewriteLabels(
        labeledText: String,
        relabel: (label: String, previousLabel: String?) -> String
    ): String {
        val output = StringBuilder()
        var previousLabel: String? = null

        // Blank lines (including the empty trailing chunk produced by a final "\n") are skipped,
        // so there is no need to trim trailing empty lines beforehand.
        for (line in labeledText.split("\n")) {
            if (StringUtils.isBlank(line)) continue

            // Trailing empty columns are discarded so that the last element is the label;
            // a non-blank line always leaves at least one non-empty column.
            val columns = line.split("\t").dropLastWhile { it.isEmpty() }.toMutableList()
            val label = columns.last()
            columns[columns.lastIndex] = relabel(label, previousLabel)

            columns.joinTo(output, separator = "\t")
            output.append("\n")

            // Track the label as predicted by the model, not the rewritten one.
            previousLabel = label
        }

        return output.toString()
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ class LabelUtilsTest {
// }

@Test
fun testAdjustInvalidSequenceOfStartLabels_noChangeNeeded_shouldReturnSameSequence() {
fun testPostProcessFulltextFixInvalidTableOrFigure_noChangeNeeded_shouldReturnSameTableOrFigureSequence() {
val bodyResult =
"B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t11\t0\tNUMBER\t0\t0\t<paragraph>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKEND\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t11\t0\tNUMBER\t0\t0\t<paragraph>\n" +
Expand Down Expand Up @@ -163,13 +163,13 @@ class LabelUtilsTest {
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t3\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t<paragraph>\n"

val postProcessed = LabelUtils.adjustInvalidSequenceOfStartLabels(bodyResult)
val postProcessed = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResult)

assertThat(postProcessed, `is`(bodyResult))
}

@Test
fun testAdjustInvalidSequenceOfStartLabels_singleChangeNeeded_shouldCorrectTheSequence() {
fun testPostProcessFulltextFixInvalidTableOrFigure_singleChangeNeeded_shouldCorrectTheTableOrFigureSequence() {
val bodyResult =
"of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\tI-<citation_marker>\n" +
Expand All @@ -194,7 +194,7 @@ class LabelUtilsTest {
"calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t<figure>\n"


val postProcessed = LabelUtils.adjustInvalidSequenceOfStartLabels(bodyResult)
val postProcessed = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResult)

assertThat(postProcessed, not(bodyResult))

Expand All @@ -217,7 +217,7 @@ class LabelUtilsTest {
}

@Test
fun testAdjustInvalidSequenceOfStartLabels_MultipleChangeNeeded_shouldCorrectTheSequence() {
fun testPostProcessFulltextFixInvalidTableOrFigure_MultipleChangeNeeded_shouldCorrectTheTableOrFigureSequence() {
val bodyResult =
"of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\tI-<citation_marker>\n" +
Expand Down Expand Up @@ -253,7 +253,7 @@ class LabelUtilsTest {
"calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t<table>\n"


val postProcessed = LabelUtils.adjustInvalidSequenceOfStartLabels(bodyResult)
val postProcessed = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResult)

assertThat(postProcessed, not(bodyResult))

Expand Down

0 comments on commit 21f85c9

Please sign in to comment.