Skip to content

Commit

Permalink
Keep the non-classified text in the same table/figure
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jan 4, 2025
1 parent f6642f2 commit 9adc814
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 16 deletions.
14 changes: 12 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (config.isGenerateTeiCoordinates("figure")) {
List<BoundingBox> theBoxes = null;
// non graphic elements
if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
}

Expand All @@ -352,7 +352,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
// here we bound all figure graphics in one single box (given that we can have hundreds of graphics
// in a single figure)
BoundingBox theGraphicsBox = null;
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
if (theGraphicsBox == null) {
theGraphicsBox = graphicObject.getBoundingBox();
Expand Down Expand Up @@ -455,6 +455,16 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

figureElement.appendChild(desc);
}

if (CollectionUtils.isNotEmpty(discardedPiecesTokens)) {
for (List<LayoutToken> discardedPieceTokens : discardedPiecesTokens) {
Element note = XmlBuilderUtils.teiElement("note");
note.addAttribute(new Attribute("type", "other"));
note.appendChild(LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(discardedPieceTokens)).trim());
figureElement.appendChild(note);
}
}

if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
for (GraphicObject graphicObject : graphicObjects) {
Element go = XmlBuilderUtils.teiElement("graphic");
Expand Down
13 changes: 12 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.grobid.core.data;

import org.apache.commons.collections4.CollectionUtils;
import org.grobid.core.GrobidModels;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.table.Cell;
Expand Down Expand Up @@ -254,8 +255,18 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
tableElement.appendChild(desc);
tableElement.appendChild(contentEl);

if (noteNode != null)
if (noteNode != null) {
tableElement.appendChild(noteNode);
}

if (CollectionUtils.isNotEmpty(discardedPiecesTokens)) {
for (List<LayoutToken> discardedPieceTokens : discardedPiecesTokens) {
Element note = XmlBuilderUtils.teiElement("note");
note.addAttribute(new Attribute("type", "other"));
note.appendChild(LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(discardedPieceTokens)).trim());
tableElement.appendChild(note);
}
}

return tableElement.toXML();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,7 @@ class FigureParser extends AbstractParser {
public Figure processing(List<LayoutToken> tokenizationFigure, String featureVector) {
String res;
try {
//System.out.println("---------------------featureVector-----------------------");
//System.out.println(featureVector);
res = label(featureVector);;
//System.out.println("---------------------res-----------------------");
//System.out.println(res);
} catch (Exception e) {
throw new GrobidException("Sequence labeling with figure model fails.", e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,6 @@ else if (config.getConsolidateCitations() == 2)
figure.setLabeledCaption(captionProcess.getLeft());
figure.setCaptionLayoutTokens(captionProcess.getRight());
}
if (CollectionUtils.isNotEmpty(figure.getDiscardedPiecesTokens())) {
resHeader.getDiscardedPiecesTokens().addAll(figure.getDiscardedPiecesTokens());
}
}

long numberFiguresFulltextModel = Arrays.stream(bodyResults.split("\n"))
Expand Down Expand Up @@ -324,9 +321,6 @@ else if (config.getConsolidateCitations() == 2)
table.setLabeledNote(noteProcess.getLeft());
table.setNoteLayoutTokens(noteProcess.getRight());
}
if (CollectionUtils.isNotEmpty(table.getDiscardedPiecesTokens())) {
resHeader.getDiscardedPiecesTokens().addAll(table.getDiscardedPiecesTokens());
}
}

equations = processEquations(bodyResults, bodyLayoutTokens.getTokenization(), doc);
Expand All @@ -352,12 +346,13 @@ else if (config.getConsolidateCitations() == 2)
// callout in superscript is by error labeled as a numerical reference callout)
List<MarkerType> markerTypes = null;

if (bodyResults != null){
if (bodyResults != null) {
markerTypes = postProcessCallout(bodyResults, bodyLayoutTokens);
}

// final combination
toTEI(doc, // document
toTEI(
doc, // document
bodyResults,
annexResults, // labeled data for body and annex
bodyLayoutTokens,
Expand All @@ -367,7 +362,8 @@ else if (config.getConsolidateCitations() == 2)
tables,
equations,
markerTypes,
config);
config
);
return doc;
} catch (GrobidException e) {
throw e;
Expand Down

0 comments on commit 9adc814

Please sign in to comment.