Skip to content

Commit

Permalink
make the start/end page for header processing customizable #282
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jan 9, 2025
1 parent 7c0bccf commit 8b0f16d
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 81 deletions.
15 changes: 9 additions & 6 deletions doc/Grobid-service.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,15 @@ Extract the header of the input PDF document, normalize it and convert it into a

`consolidateHeader` is a string of value `0` (no consolidation), `1` (consolidate and inject all extra metadata, default value), `2` (consolidate the header metadata and inject DOI only) or `3` (consolidate using only extracted DOI, if extracted, and do not try to consolidate using any other metadata).

| method | request type | response type | parameters | requirement | description |
|------------|-----------------------|---------------------|--------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| POST, PUT | `multipart/form-data` | `application/xml` | `input` | required | PDF file to be processed |
| | | | `consolidateHeader` | optional | consolidateHeader is a string of value `0` (no consolidation), `1` (consolidate and inject all extra metadata, default value), `2` (consolidate the header and inject DOI only), or `3` (consolidate using only extracted DOI - if extracted) . |
| | | | `includeRawAffiliations` | optional | `includeRawAffiliations` is a boolean value, `0` (default, do not include raw affiliation string in the result) or `1` (include raw affiliation string in the result). |
| | | | `includeRawCopyrights` | optional | `includeRawCopyrights` is a boolean value, `0` (default, do not include raw copyrights/license string in the result) or `1` (include raw copyrights/license string in the result). |
| method | request type | response type | parameters | requirement | description |
|-----------|-----------------------|-------------------|--------------------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| POST, PUT | `multipart/form-data` | `application/xml` | `input` | required | PDF file to be processed |
| | | | `consolidateHeader` | optional | `consolidateHeader` is a string of value `0` (no consolidation), `1` (consolidate and inject all extra metadata, default value), `2` (consolidate the header and inject DOI only), or `3` (consolidate using only the extracted DOI, if extracted). |
| | | | `includeRawAffiliations` | optional | `includeRawAffiliations` is a boolean value, `0` (default, do not include raw affiliation string in the result) or `1` (include raw affiliation string in the result). |
| | | | `includeRawCopyrights` | optional | `includeRawCopyrights` is a boolean value, `0` (default, do not include raw copyrights/license string in the result) or `1` (include raw copyrights/license string in the result). |
| | | | `start` | optional | Start page number of the PDF to be considered; previous pages will be skipped/ignored. Integer, with the first page numbered `1` (default `0`, start from the beginning of the PDF). |
| | | | `end` | optional | End page number of the PDF to be considered; following pages will be skipped/ignored. Integer, with the first page numbered `1` (default `2`, i.e. only the first pages of the PDF are processed for header extraction, not the whole document). |


Use `Accept: application/x-bibtex` to retrieve BibTeX format instead of XML TEI.

Expand Down
6 changes: 4 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/engines/Engine.java
Original file line number Diff line number Diff line change
Expand Up @@ -408,11 +408,13 @@ public String processHeader(
int consolidate,
boolean includeRawAffiliations,
boolean includeRawCopyrights,
int startPage,
int endPage,
BiblioItem result
) {
GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder()
.startPage(0)
.endPage(2)
.startPage(startPage)
.endPage(endPage)
.consolidateHeader(consolidate)
.includeRawAffiliations(includeRawAffiliations)
.includeRawCopyrights(includeRawCopyrights)
Expand Down
106 changes: 33 additions & 73 deletions grobid-service/src/main/java/org/grobid/service/GrobidRestService.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,16 @@ public Response processHeaderDocumentReturnXml_post(
@FormDataParam(INPUT) InputStream inputStream,
@DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidate,
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations,
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) {
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights,
@DefaultValue("0") @FormDataParam("start") int startPage,
@DefaultValue("2") @FormDataParam("end") int endPage) {
int consol = validateConsolidationParam(consolidate);
return restProcessFiles.processStatelessHeaderDocument(
inputStream, consol,
validateIncludeRawParam(includeRawAffiliations),
validateIncludeRawParam(includeRawCopyrights),
startPage,
endPage,
ExpectedResponseType.XML
);
}
Expand Down Expand Up @@ -175,8 +179,17 @@ public Response processStatelessHeaderDocumentReturnXml(
@FormDataParam(INPUT) InputStream inputStream,
@DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidate,
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations,
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) {
return processHeaderDocumentReturnXml_post(inputStream, consolidate, includeRawAffiliations, includeRawCopyrights);
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights,
@DefaultValue("0") @FormDataParam("start") int startPage,
@DefaultValue("2") @FormDataParam("end") int endPage) {
return processHeaderDocumentReturnXml_post(
inputStream,
consolidate,
includeRawAffiliations,
includeRawCopyrights,
startPage,
endPage
);

Check warning

Code scanning / CodeQL

Information exposure through an error message Medium

Error information
can be exposed to an external user.
Error information
can be exposed to an external user.
Error information
can be exposed to an external user.
}

@Path(PATH_HEADER)
Expand All @@ -187,12 +200,16 @@ public Response processHeaderDocumentReturnBibTeX_post(
@FormDataParam(INPUT) InputStream inputStream,
@DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidate,
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations,
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) {
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights,
@DefaultValue("0") @FormDataParam("start") int startPage,
@DefaultValue("2") @FormDataParam("end") int endPage) {
int consol = validateConsolidationParam(consolidate);
return restProcessFiles.processStatelessHeaderDocument(
inputStream, consol,
validateIncludeRawParam(includeRawAffiliations),
validateIncludeRawParam(includeRawCopyrights),
startPage,
endPage,
ExpectedResponseType.BIBTEX
);
}
Expand All @@ -205,8 +222,17 @@ public Response processStatelessHeaderDocumentReturnBibTeX(
@FormDataParam(INPUT) InputStream inputStream,
@DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidate,
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations,
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) {
return processHeaderDocumentReturnBibTeX_post(inputStream, consolidate, includeRawAffiliations, includeRawCopyrights);
@DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights,
@DefaultValue("0") @FormDataParam("start") int startPage,
@DefaultValue("2") @FormDataParam("end") int endPage) {
return processHeaderDocumentReturnBibTeX_post(
inputStream,
consolidate,
includeRawAffiliations,
includeRawCopyrights,
startPage,
endPage
);
}

@Path(PATH_FULL_TEXT)
Expand Down Expand Up @@ -630,73 +656,7 @@ public Response processCitationListReturnBibTeX_post(
.includeRawCitations(validateIncludeRawParam(includeRawCitations))
.build();
return restProcessString.processCitationList(citations, config, ExpectedResponseType.BIBTEX);
}

/**
* @see org.grobid.service.process.GrobidRestProcessAdmin#processSHA1(String)
*/
/*@Path(PATH_SHA1)
@Consumes(MediaType.APPLICATION_FORM_URLENCODED)
@Produces(MediaType.TEXT_PLAIN)
@POST
public Response processSHA1Post(@FormParam(SHA1) String sha1) {
return restProcessAdmin.processSHA1(sha1);
}*/

/**
* @see org.grobid.service.process.GrobidRestProcessAdmin#processSHA1(String)
*/
/*@Path(PATH_SHA1)
@Consumes(MediaType.TEXT_PLAIN)
@Produces(MediaType.TEXT_PLAIN)
@GET
public Response processSHA1Get(@QueryParam(SHA1) String sha1) {
return restProcessAdmin.processSHA1(sha1);
}*/

/**
* @see org.grobid.service.process.GrobidRestProcessAdmin#getAllPropertiesValues(String)
*/
/*@Path(PATH_ALL_PROPS)
@Consumes(MediaType.APPLICATION_FORM_URLENCODED)
@Produces(MediaType.TEXT_PLAIN)
@POST
public Response getAllPropertiesValuesPost(@FormParam(SHA1) String sha1) {
return restProcessAdmin.getAllPropertiesValues(sha1);
}*/

/**
* @see org.grobid.service.process.GrobidRestProcessAdmin#getAllPropertiesValues(String)
*/
/*@Path(PATH_ALL_PROPS)
@Consumes(MediaType.TEXT_PLAIN)
@Produces(MediaType.TEXT_PLAIN)
@GET
public Response getAllPropertiesValuesGet(@QueryParam(SHA1) String sha1) {
return restProcessAdmin.getAllPropertiesValues(sha1);
}*/

/**
* @see org.grobid.service.process.GrobidRestProcessAdmin#changePropertyValue(String)
*/
/*@Path(PATH_CHANGE_PROPERTY_VALUE)
@Consumes(MediaType.APPLICATION_FORM_URLENCODED)
@Produces(MediaType.TEXT_PLAIN)
@POST
public Response changePropertyValuePost(@FormParam(XML) String xml) {
return restProcessAdmin.changePropertyValue(xml);
}*/

/**
* @see org.grobid.service.process.GrobidRestProcessAdmin#changePropertyValue(String)
*/
/*@Path(PATH_CHANGE_PROPERTY_VALUE)
@Consumes(MediaType.TEXT_PLAIN)
@Produces(MediaType.TEXT_PLAIN)
@GET
public Response changePropertyValueGet(@QueryParam(XML) String xml) {
return restProcessAdmin.changePropertyValue(xml);
}*/
}

@Path(PATH_REFERENCES)
@Consumes(MediaType.MULTIPART_FORM_DATA)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ public Response processStatelessHeaderDocument(
final int consolidate,
final boolean includeRawAffiliations,
final boolean includeRawCopyrights,
int startPage,
int endPage,
ExpectedResponseType expectedResponseType
) {
LOGGER.debug(methodLogIn());
Expand Down Expand Up @@ -104,6 +106,8 @@ public Response processStatelessHeaderDocument(
consolidate,
includeRawAffiliations,
includeRawCopyrights,
startPage,
endPage,
result
);

Expand Down

0 comments on commit 8b0f16d

Please sign in to comment.