Skip to content

Commit

Permalink
Merge pull request #897 from Islandora/hocr
Browse files Browse the repository at this point in the history
Add hOCR option to Text Extraction Media Attachment action and IIIF Manifest
  • Loading branch information
wgilling authored Oct 21, 2022
2 parents 0948436 + 2e47801 commit bdbef45
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 7 deletions.
34 changes: 30 additions & 4 deletions modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php
Original file line number Diff line number Diff line change
Expand Up @@ -189,21 +189,27 @@ public function render() {
*/
protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_base_id) {
$canvases = [];
foreach ($this->options['iiif_tile_field'] as $iiif_tile_field) {
foreach (array_filter(array_values($this->options['iiif_tile_field'])) as $iiif_tile_field) {
$viewsField = $this->view->field[$iiif_tile_field];
$iiif_ocr_file_field = !empty($this->options['iiif_ocr_file_field']) ? array_filter(array_values($this->options['iiif_ocr_file_field'])) : [];
$ocrField = count($iiif_ocr_file_field) > 0 ? $this->view->field[$iiif_ocr_file_field[0]] : NULL;
$entity = $viewsField->getEntity($row);

if (isset($entity->{$viewsField->definition['field_name']})) {

/** @var \Drupal\Core\Field\FieldItemListInterface $images */
$images = $entity->{$viewsField->definition['field_name']};
foreach ($images as $image) {
foreach ($images as $i => $image) {
if (!$image->entity->access('view')) {
// If the user does not have permission to view the file, skip it.
continue;
}

$ocrs = $entity->{$ocrField->definition['field_name']};

// Create the IIIF URL for this file
// Visiting $iiif_url will resolve to the info.json for the image.
$ocr = isset($ocrs[$i]) ? $ocrs[$i] : FALSE;
$file_url = $image->entity->createFileUrl(FALSE);
$mime_type = $image->entity->getMimeType();
$iiif_url = rtrim($iiif_address, '/') . '/' . urlencode($file_url);
Expand Down Expand Up @@ -241,8 +247,7 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas
}
}
}

$canvases[] = [
$tmp_canvas = [
// @see https://iiif.io/api/presentation/2.1/#canvas
'@id' => $canvas_id,
'@type' => 'sc:Canvas',
Expand Down Expand Up @@ -271,6 +276,17 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas
],
],
];

if (isset($ocr) && $ocr != FALSE) {
$tmp_canvas['seeAlso'] = [
'@id' => $ocr->entity->createFileUrl(FALSE),
'format' => 'text/vnd.hocr+html',
'profile' => 'http://kba.cloud/hocr-spec',
'label' => 'hOCR embedded text',
];
}

$canvases[] = $tmp_canvas;
}
}
}
Expand Down Expand Up @@ -313,6 +329,7 @@ protected function defineOptions() {
$options = parent::defineOptions();

$options['iiif_tile_field'] = ['default' => ''];
$options['iiif_ocr_file_field'] = ['default' => ''];

return $options;
}
Expand Down Expand Up @@ -368,6 +385,15 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) {
// otherwise could lock up the form when setting up a View.
'#required' => count($field_options) > 0,
];

$form['iiif_ocr_file_field'] = [
'#title' => $this->t('Structured OCR data file field'),
'#type' => 'checkboxes',
'#default_value' => $this->options['iiif_ocr_file_field'],
'#description' => $this->t('The source of structured OCR text for each entity.'),
'#options' => $field_options,
'#required' => FALSE,
];
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
use Drupal\islandora\Plugin\Action\AbstractGenerateDerivativeMediaFile;

/**
* Emits a Node for generating fits derivatives event.
* Generates OCR derivatives event.
*
* @Action(
* id = "generate_extracted_text_file",
Expand All @@ -29,6 +29,7 @@ public function defaultConfiguration() {
$config['destination_media_type'] = 'file';
$config['scheme'] = $this->config->get('default_scheme');
$config['destination_text_field_name'] = '';
$config['text_format'] = 'plain_text';
return $config;
}

Expand All @@ -38,7 +39,7 @@ public function defaultConfiguration() {
public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
$map = $this->entityFieldManager->getFieldMapByFieldType('text_long');
$file_fields = $map['media'];
$field_options = array_combine(array_keys($file_fields), array_keys($file_fields));
$field_options = ['none' => $this->t('None')] + array_combine(array_keys($file_fields), array_keys($file_fields));
$form = parent::buildConfigurationForm($form, $form_state);
$form['mimetype']['#description'] = $this->t('Mimetype to convert to (e.g. application/xml, etc...)');
$form['mimetype']['#value'] = 'text/plain';
Expand All @@ -48,13 +49,23 @@ public function buildConfigurationForm(array $form, FormStateInterface $form_sta
$last = array_slice($form, count($form) - $position + 1);

$middle['destination_text_field_name'] = [
'#required' => TRUE,
'#required' => FALSE,
'#type' => 'select',
'#options' => $field_options,
'#title' => $this->t('Destination Text field Name'),
'#default_value' => $this->configuration['destination_text_field_name'],
'#description' => $this->t('Text field on Media Type to hold extracted text.'),
];
$middle['text_format'] = [
'#type' => 'select',
'#title' => $this->t('Format'),
'#options' => [
'plain_text' => $this->t('Plain text'),
'hocr' => $this->t('hOCR text with positional data'),
],
'#default_value' => $this->configuration['text_format'],
'#description' => $this->t("The type of text to be returned."),
];
$form = array_merge($first, $middle, $last);

unset($form['args']);
Expand All @@ -81,17 +92,29 @@ public function validateConfigurationForm(array &$form, FormStateInterface $form
public function submitConfigurationForm(array &$form, FormStateInterface $form_state) {
  // Let the parent store its own submitted values first.
  parent::submitConfigurationForm($form, $form_state);

  // Copy the two settings owned by this action from the submitted form.
  $this->configuration['destination_text_field_name'] = $form_state->getValue('destination_text_field_name');
  $text_format = $form_state->getValue('text_format');
  $this->configuration['text_format'] = $text_format;

  // Keep the stored 'args' string in step with the selected text format.
  // Any value other than the two handled below leaves 'args' untouched.
  switch ($text_format) {
    case 'hocr':
      $this->configuration['args'] = '-c tessedit_create_hocr=1 -c hocr_font_info=0';
      break;

    case 'plain_text':
      // This must target $this: writing to an undefined variable would leave
      // previously saved hOCR arguments in place after switching back to
      // plain text (and raises an Error on PHP 8).
      $this->configuration['args'] = '';
      break;
  }
}

/**
* Override this to return arbitrary data as an array to be json encoded.
*/
protected function generateData(EntityInterface $entity) {

$data = parent::generateData($entity);
$route_params = [
'media' => $entity->id(),
'destination_field' => $this->configuration['destination_field_name'],
'destination_text_field' => $this->configuration['destination_text_field_name'],
'text_format' => $this->configuration['text_format'],
];
$data['destination_uri'] = Url::fromRoute('islandora_text_extraction.attach_file_to_media', $route_params)
->setAbsolute()
Expand Down

0 comments on commit bdbef45

Please sign in to comment.