From 0bea8da572110f8910bd1e14542edf7bfcccf018 Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Tue, 12 Jul 2022 16:23:33 -0300 Subject: [PATCH 01/11] WIP Modify GenerateOCRDerivativeFile to support hOCR --- .../Action/GenerateOCRDerivativeFile.php | 28 +++++++++++++++++-- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php b/modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php index f6b8034a9..73318e35d 100644 --- a/modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php +++ b/modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php @@ -8,7 +8,7 @@ use Drupal\islandora\Plugin\Action\AbstractGenerateDerivativeMediaFile; /** - * Emits a Node for generating fits derivatives event. + * Generates OCR derivatives event. * * @Action( * id = "generate_extracted_text_file", @@ -29,6 +29,7 @@ public function defaultConfiguration() { $config['destination_media_type'] = 'file'; $config['scheme'] = $this->config->get('default_scheme'); $config['destination_text_field_name'] = ''; + $config['text_format'] = 'plain_text'; return $config; } @@ -38,7 +39,7 @@ public function defaultConfiguration() { public function buildConfigurationForm(array $form, FormStateInterface $form_state) { $map = $this->entityFieldManager->getFieldMapByFieldType('text_long'); $file_fields = $map['media']; - $field_options = array_combine(array_keys($file_fields), array_keys($file_fields)); + $field_options = ['none' => $this->t('None')] + array_combine(array_keys($file_fields), array_keys($file_fields)); $form = parent::buildConfigurationForm($form, $form_state); $form['mimetype']['#description'] = $this->t('Mimetype to convert to (e.g. application/xml, etc...)'); $form['mimetype']['#value'] = 'text/plain'; @@ -48,13 +49,23 @@ public function buildConfigurationForm(array $form, FormStateInterface $form_sta $last = array_slice($form, count($form) - $position + 1); $middle['destination_text_field_name'] = [ - '#required' => TRUE, + '#required' => FALSE, '#type' => 'select', '#options' => $field_options, '#title' => $this->t('Destination Text field Name'), '#default_value' => $this->configuration['destination_text_field_name'], '#description' => $this->t('Text field on Media Type to hold extracted text.'), ]; + $middle['text_format'] = [ + '#type' => 'select', + '#title' => $this->t('Format'), + '#options' => [ + 'plain_text' => $this->t('Plain text'), + 'hocr' => $this->t('hOCR text with positional data'), + ], + '#default_value' => $this->configuration['text_format'], + '#description' => $this->t("The type of text to be returned."), + ]; $form = array_merge($first, $middle, $last); unset($form['args']); @@ -81,17 +92,28 @@ public function validateConfigurationForm(array &$form, FormStateInterface $form public function submitConfigurationForm(array &$form, FormStateInterface $form_state) { parent::submitConfigurationForm($form, $form_state); $this->configuration['destination_text_field_name'] = $form_state->getValue('destination_text_field_name'); + $this->configuration['text_format'] = $form_state->getValue('text_format'); + switch ($form_state->getValue('text_format')) { + case 'hocr': + $this->configuration['args'] = '-c tessedit_create_hocr=1 -c hocr_font_info=0'; + break; + case 'plain_text': + $his->configuration['args'] = ''; + break; + } } /** * Override this to return arbitrary data as an array to be json encoded. */ protected function generateData(EntityInterface $entity) { + $data = parent::generateData($entity); $route_params = [ 'media' => $entity->id(), 'destination_field' => $this->configuration['destination_field_name'], 'destination_text_field' => $this->configuration['destination_text_field_name'], + 'text_format' => $this->configuration['text_format'], ]; $data['destination_uri'] = Url::fromRoute('islandora_text_extraction.attach_file_to_media', $route_params) ->setAbsolute() From bd17a381ead0ee6eef852511e8da589c2aa9b8ae Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Wed, 27 Jul 2022 19:24:32 +0000 Subject: [PATCH 02/11] Add Structured OCR field to IIIF Manifest view. --- .../src/Plugin/views/style/IIIFManifest.php | 10 ++++++++++ .../Action/AbstractGenerateDerivativeMediaFile.php | 5 ++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php index 194670016..446358d05 100644 --- a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php +++ b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php @@ -313,6 +313,7 @@ protected function defineOptions() { $options = parent::defineOptions(); $options['iiif_tile_field'] = ['default' => '']; + $options['iiif_ocr_file_field'] = ['default' => '']; return $options; } @@ -368,6 +369,15 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { // otherwise could lock up the form when setting up a View. '#required' => count($field_options) > 0, ]; + + $form['iiif_ocr_file_field'] = [ + '#title' => $this->t('Structured OCR data file field'), + '#type' => 'checkboxes', + '#default_value' => $this->options['iiif_ocr_file_field'], + '#description' => $this->t('The source of structured OCR text for each entity.'), + '#options' => $field_options, + '#required' => FALSE, + ]; } /** diff --git a/src/Plugin/Action/AbstractGenerateDerivativeMediaFile.php b/src/Plugin/Action/AbstractGenerateDerivativeMediaFile.php index be9eca80b..2c4e92240 100644 --- a/src/Plugin/Action/AbstractGenerateDerivativeMediaFile.php +++ b/src/Plugin/Action/AbstractGenerateDerivativeMediaFile.php @@ -5,7 +5,6 @@ use Drupal\Core\Entity\EntityInterface; use Drupal\Core\Form\FormStateInterface; use Drupal\Core\Url; - /** * Emits a Node for generating derivatives event. * @@ -39,8 +38,8 @@ public function defaultConfiguration() { */ protected function generateData(EntityInterface $entity) { $data = parent::generateData($entity); - if (get_class($entity) != 'Drupal\media\Entity\Media') { - throw new \RuntimeException("Entity {$entity->getEntityTypeId()} {$entity->id()} is not a media", 500); + if (get_class($entity) != 'Drupal\media\Entity\Media') { + throw new \RuntimeException("Entity {$entity->getEntityTypeId()} {$entity->id()} is not a media", 500); } $source_file = $this->mediaSource->getSourceFile($entity); if (!$source_file) { From 0644795c54b7bbfe94f4163aaf53e90408feca69 Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Wed, 24 Aug 2022 19:13:29 +0000 Subject: [PATCH 03/11] Skip empty image fields when constructing IIIF manifest. --- modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php index 446358d05..ba971b474 100644 --- a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php +++ b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php @@ -189,7 +189,7 @@ public function render() { */ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_base_id) { $canvases = []; - foreach ($this->options['iiif_tile_field'] as $iiif_tile_field) { + foreach (array_filter(array_values($this->options['iiif_tile_field'])) as $iiif_tile_field) { $viewsField = $this->view->field[$iiif_tile_field]; $entity = $viewsField->getEntity($row); From 4179f5cee7e33f8e9eb8e3e6778272143ddd6bf8 Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Thu, 25 Aug 2022 19:45:23 +0000 Subject: [PATCH 04/11] WIP get hocr field in iiif view. --- modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php index ba971b474..b06b0f5d9 100644 --- a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php +++ b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php @@ -191,6 +191,7 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas $canvases = []; foreach (array_filter(array_values($this->options['iiif_tile_field'])) as $iiif_tile_field) { $viewsField = $this->view->field[$iiif_tile_field]; + $ocrField = array_pop(array_filter(array_values($this->options['iiif_ocr_file_field']))); $entity = $viewsField->getEntity($row); if (isset($entity->{$viewsField->definition['field_name']})) { From 49c48a1493e1df1a55d36ed5d54db35d7331f24f Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Fri, 26 Aug 2022 17:52:20 +0000 Subject: [PATCH 05/11] WIP: Add hOCR file stream to IIIF Manifest. --- .../src/Plugin/views/style/IIIFManifest.php | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php index b06b0f5d9..ad5ea178c 100644 --- a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php +++ b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php @@ -191,20 +191,25 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas $canvases = []; foreach (array_filter(array_values($this->options['iiif_tile_field'])) as $iiif_tile_field) { $viewsField = $this->view->field[$iiif_tile_field]; - $ocrField = array_pop(array_filter(array_values($this->options['iiif_ocr_file_field']))); + $iiif_ocr_file_field = array_filter(array_values($this->options['iiif_ocr_file_field'])); + $ocrField = count($iiif_ocr_file_field) > 0 ? $this->view->field[$iiif_ocr_file_field[0]] : NULL; $entity = $viewsField->getEntity($row); if (isset($entity->{$viewsField->definition['field_name']})) { /** @var \Drupal\Core\Field\FieldItemListInterface $images */ - $images = $entity->{$viewsField->definition['field_name']}; + $images = $entity->{$viewsField->definition['field_name']}; foreach ($images as $image) { if (!$image->entity->access('view')) { // If the user does not have permission to view the file, skip it. continue; } + + $ocrs = $entity->{$ocrField->definition['field_name']}; + // Create the IIIF URL for this file // Visiting $iiif_url will resolve to the info.json for the image. + $ocr = isset($ocrs[$i]) ? $ocrs[$i] : NULL; $file_url = $image->entity->createFileUrl(FALSE); $mime_type = $image->entity->getMimeType(); $iiif_url = rtrim($iiif_address, '/') . '/' . urlencode($file_url); @@ -242,8 +247,7 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas } } } - - $canvases[] = [ + $tmp_canvas = [ // @see https://iiif.io/api/presentation/2.1/#canvas '@id' => $canvas_id, '@type' => 'sc:Canvas', @@ -272,6 +276,17 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas ], ], ]; + + if (isset($ocr)) { + $tmp_canvas['seeAlso'] = [ + '@id' => $ocr->entity->createFileUrl(FALSE), + 'format' => 'text/vnd.hocr+html', + 'profile' => 'http://kba.cloud/hocr-spec', + 'label' => 'hOCR embedded text', + ]; + } + + $canvases[] = $tmp_canvas; } } } From 5e1d53d377ae35bb19a8d2415773a9715e4c0cb6 Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Wed, 7 Sep 2022 16:16:31 +0000 Subject: [PATCH 06/11] Add empty check when adding hOCR to IIIF manifest. --- .../islandora_iiif/src/Plugin/views/style/IIIFManifest.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php index ad5ea178c..561b216dd 100644 --- a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php +++ b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php @@ -191,13 +191,13 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas $canvases = []; foreach (array_filter(array_values($this->options['iiif_tile_field'])) as $iiif_tile_field) { $viewsField = $this->view->field[$iiif_tile_field]; - $iiif_ocr_file_field = array_filter(array_values($this->options['iiif_ocr_file_field'])); + $iiif_ocr_file_field = !empty($this->options['iiif_ocr_file_field']) ? array_filter(array_values($this->options['iiif_ocr_file_field'])): array(); $ocrField = count($iiif_ocr_file_field) > 0 ? $this->view->field[$iiif_ocr_file_field[0]] : NULL; $entity = $viewsField->getEntity($row); if (isset($entity->{$viewsField->definition['field_name']})) { - /** @var \Drupal\Core\Field\FieldItemListInterface $images */ + /** @var \Drupal\Core\Field\FieldItemListInterface $images */ $images = $entity->{$viewsField->definition['field_name']}; foreach ($images as $image) { if (!$image->entity->access('view')) { @@ -209,7 +209,7 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas // Create the IIIF URL for this file // Visiting $iiif_url will resolve to the info.json for the image. - $ocr = isset($ocrs[$i]) ? $ocrs[$i] : NULL; + $ocr = isset($ocrs[$i]) ? $ocrs[$i] : FALSE; $file_url = $image->entity->createFileUrl(FALSE); $mime_type = $image->entity->getMimeType(); $iiif_url = rtrim($iiif_address, '/') . '/' . urlencode($file_url); From bf25e2447ac50b75411c10f7f0f014bf4edf0d8c Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Wed, 7 Sep 2022 18:43:03 +0000 Subject: [PATCH 07/11] Fix error caused by rebase. --- .../src/Plugin/views/style/IIIFManifest.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php index 561b216dd..594e75d8f 100644 --- a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php +++ b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php @@ -197,15 +197,15 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas if (isset($entity->{$viewsField->definition['field_name']})) { - /** @var \Drupal\Core\Field\FieldItemListInterface $images */ - $images = $entity->{$viewsField->definition['field_name']}; - foreach ($images as $image) { + /** @var \Drupal\Core\Field\FieldItemListInterface $images */ + $images = $entity->{$viewsField->definition['field_name']}; + foreach ($images as $i => $image) { if (!$image->entity->access('view')) { // If the user does not have permission to view the file, skip it. continue; } - $ocrs = $entity->{$ocrField->definition['field_name']}; + $ocrs = $entity->{$ocrField->definition['field_name']}; // Create the IIIF URL for this file // Visiting $iiif_url will resolve to the info.json for the image. From 78cee0a35a2310273e125f948fc76009870d0e3c Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Thu, 8 Sep 2022 12:46:23 +0000 Subject: [PATCH 08/11] Fix PHPCS errors. --- .../islandora_iiif/src/Plugin/views/style/IIIFManifest.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php index 594e75d8f..a09152423 100644 --- a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php +++ b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php @@ -191,7 +191,7 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas $canvases = []; foreach (array_filter(array_values($this->options['iiif_tile_field'])) as $iiif_tile_field) { $viewsField = $this->view->field[$iiif_tile_field]; - $iiif_ocr_file_field = !empty($this->options['iiif_ocr_file_field']) ? array_filter(array_values($this->options['iiif_ocr_file_field'])): array(); + $iiif_ocr_file_field = !empty($this->options['iiif_ocr_file_field']) ? array_filter(array_values($this->options['iiif_ocr_file_field'])) : []; $ocrField = count($iiif_ocr_file_field) > 0 ? $this->view->field[$iiif_ocr_file_field[0]] : NULL; $entity = $viewsField->getEntity($row); @@ -205,7 +205,7 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas continue; } - $ocrs = $entity->{$ocrField->definition['field_name']}; + $ocrs = $entity->{$ocrField->definition['field_name']}; // Create the IIIF URL for this file // Visiting $iiif_url will resolve to the info.json for the image. From a41ecaa754a071904bb4a397996a2f0a127d30ad Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Thu, 8 Sep 2022 12:55:15 +0000 Subject: [PATCH 09/11] Fix PHPCS errors. --- src/Plugin/Action/AbstractGenerateDerivativeMediaFile.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Plugin/Action/AbstractGenerateDerivativeMediaFile.php b/src/Plugin/Action/AbstractGenerateDerivativeMediaFile.php index 2c4e92240..be9eca80b 100644 --- a/src/Plugin/Action/AbstractGenerateDerivativeMediaFile.php +++ b/src/Plugin/Action/AbstractGenerateDerivativeMediaFile.php @@ -5,6 +5,7 @@ use Drupal\Core\Entity\EntityInterface; use Drupal\Core\Form\FormStateInterface; use Drupal\Core\Url; + /** * Emits a Node for generating derivatives event. * @@ -38,8 +39,8 @@ public function defaultConfiguration() { */ protected function generateData(EntityInterface $entity) { $data = parent::generateData($entity); - if (get_class($entity) != 'Drupal\media\Entity\Media') { - throw new \RuntimeException("Entity {$entity->getEntityTypeId()} {$entity->id()} is not a media", 500); + if (get_class($entity) != 'Drupal\media\Entity\Media') { + throw new \RuntimeException("Entity {$entity->getEntityTypeId()} {$entity->id()} is not a media", 500); } $source_file = $this->mediaSource->getSourceFile($entity); if (!$source_file) { From c07d1f65401a0e96fae994953945be79b83779ba Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Thu, 8 Sep 2022 13:03:45 +0000 Subject: [PATCH 10/11] Fix PHPCS Errors. --- .../src/Plugin/Action/GenerateOCRDerivativeFile.php | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php b/modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php index 73318e35d..4ff0d93fc 100644 --- a/modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php +++ b/modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php @@ -97,6 +97,7 @@ public function submitConfigurationForm(array &$form, FormStateInterface $form_s case 'hocr': $this->configuration['args'] = '-c tessedit_create_hocr=1 -c hocr_font_info=0'; break; + case 'plain_text': $his->configuration['args'] = ''; break; From 2e4780163e50fb2312e1463dd41832c50f21c24a Mon Sep 17 00:00:00 2001 From: Alexander O'Neill Date: Thu, 20 Oct 2022 02:39:22 +0000 Subject: [PATCH 11/11] Add check for falsity in IIIF Manifest along with 'isset()' --- modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php index a09152423..cc4b5e94e 100644 --- a/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php +++ b/modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php @@ -277,7 +277,7 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas ], ]; - if (isset($ocr)) { + if (isset($ocr) && $ocr != FALSE) { $tmp_canvas['seeAlso'] = [ '@id' => $ocr->entity->createFileUrl(FALSE), 'format' => 'text/vnd.hocr+html',