From a1bdb8049ddebc03baf4436e9304f987868bd9ba Mon Sep 17 00:00:00 2001 From: Sandro Mani Date: Thu, 13 Oct 2016 22:46:06 +0200 Subject: [PATCH] Purge graphic items which overlap with text blocks or are too small from hOCR result --- gtk/src/Geometry.hh | 2 +- gtk/src/OutputEditorHOCR.cc | 42 +++++++++++++++++++++++++++------- gtk/src/OutputEditorHOCR.hh | 2 +- qt/src/OutputEditorHOCR.cc | 45 +++++++++++++++++++++++++++++-------- qt/src/OutputEditorHOCR.hh | 2 +- 5 files changed, 73 insertions(+), 20 deletions(-) diff --git a/gtk/src/Geometry.hh b/gtk/src/Geometry.hh index f78e64b3..2a9b5066 100644 --- a/gtk/src/Geometry.hh +++ b/gtk/src/Geometry.hh @@ -63,7 +63,7 @@ public: bool contains(const Point& p) const{ return p.x >= x && p.x <= x + width && p.y >= y && p.y <= y + height; } - bool overlaps(const Rectangle& r) { + bool overlaps(const Rectangle& r) const { return x < r.x + r.width && x + width > r.x && y < r.y + r.height && y + height > r.y; } Rectangle unite(const Rectangle& r) const{ diff --git a/gtk/src/OutputEditorHOCR.cc b/gtk/src/OutputEditorHOCR.cc index cd4cb16d..70154bf0 100644 --- a/gtk/src/OutputEditorHOCR.cc +++ b/gtk/src/OutputEditorHOCR.cc @@ -493,10 +493,10 @@ void OutputEditorHOCR::addPage(const Glib::ustring& hocrText, ReadSessionData da Glib::ustring pageTitle = Glib::ustring::compose("image '%1'; bbox %2 %3 %4 %5; pageno %6; rot %7; res %8", data.file, x1, y1, x2, y2, data.page, data.angle, data.resolution); pageDiv->set_attribute("title", pageTitle); - addPage(pageDiv, Gio::File::create_for_path(data.file)->get_basename(), data.page); + addPage(pageDiv, Gio::File::create_for_path(data.file)->get_basename(), data.page, true); } -void OutputEditorHOCR::addPage(xmlpp::Element* pageDiv, const Glib::ustring& filename, int page) +void OutputEditorHOCR::addPage(xmlpp::Element* pageDiv, const Glib::ustring& filename, int page, bool cleanGraphics) { m_connectionItemViewRowEdited.block(true); pageDiv->set_attribute("id", Glib::ustring::compose("page_%1", ++m_idCounter)); @@ -525,11 +525,40 @@ void OutputEditorHOCR::addPage(xmlpp::Element* pageDiv, const Glib::ustring& fil std::map langCache; + std::vector> graphicElements; xmlpp::Element* element = getFirstChildElement(pageDiv, "div"); while(element) { // Boxes without text are images titleAttr = getAttribute(element, "title"); if(!addChildItems(getFirstChildElement(element), pageItem, langCache) && s_bboxRx->match(titleAttr, matchInfo)) { + x1 = std::atoi(matchInfo.fetch(1).c_str()); + y1 = std::atoi(matchInfo.fetch(2).c_str()); + x2 = std::atoi(matchInfo.fetch(3).c_str()); + y2 = std::atoi(matchInfo.fetch(4).c_str()); + graphicElements.push_back(std::make_pair(element, Geometry::Rectangle(x1, y1, x2-x1, y2-y1))); + } + element = getNextSiblingElement(element); + } + + // Discard graphic elements which intersect with text block or which are too small + int numTextBlocks = pageItem->children().size(); + for(const std::pair& pair : graphicElements) { + xmlpp::Element* element = pair.first; + const Geometry::Rectangle& bbox = pair.second; + bool deleteGraphic = false; + if(cleanGraphics) { + if(bbox.width < 10 || bbox.height < 10) { + deleteGraphic = true; + } else { + for(int i = 0; i < numTextBlocks; ++i) { + if(bbox.overlaps((*pageItem->children()[i])[m_itemStoreCols.bbox])){ + deleteGraphic = true; + break; + } + } + } + } + if(!deleteGraphic) { Gtk::TreeIter item = m_itemStore->append(pageItem->children()); item->set_value(m_itemStoreCols.text, Glib::ustring(_("Graphic"))); item->set_value(m_itemStoreCols.selected, true); @@ -542,13 +571,10 @@ void OutputEditorHOCR::addPage(xmlpp::Element* pageDiv, const Glib::ustring& fil item->set_value(m_itemStoreCols.id, getAttribute(element, "id")); item->set_value(m_itemStoreCols.itemClass, Glib::ustring("ocr_graphic")); item->set_value(m_itemStoreCols.textColor, Glib::ustring("#000")); - x1 = std::atoi(matchInfo.fetch(1).c_str()); - y1 = std::atoi(matchInfo.fetch(2).c_str()); - x2 = std::atoi(matchInfo.fetch(3).c_str()); - y2 = std::atoi(matchInfo.fetch(4).c_str()); item->set_value(m_itemStoreCols.bbox, Geometry::Rectangle(x1, y1, x2-x1, y2-y1)); + } else { + element->get_parent()->remove_child(element); } - element = getNextSiblingElement(element); } pageItem->set_value(m_itemStoreCols.source, getElementXML(pageDiv)); m_itemView->expand_row(Gtk::TreePath(pageItem), true); @@ -1155,7 +1181,7 @@ void OutputEditorHOCR::open() int page = 0; while(div) { ++page; - addPage(div, files.front()->get_basename(), page); + addPage(div, files.front()->get_basename(), page, false); div = getNextSiblingElement(div, "div"); } } diff --git a/gtk/src/OutputEditorHOCR.hh b/gtk/src/OutputEditorHOCR.hh index cdbc09ac..da34cbc7 100644 --- a/gtk/src/OutputEditorHOCR.hh +++ b/gtk/src/OutputEditorHOCR.hh @@ -145,7 +145,7 @@ private: sigc::connection m_connectionPropViewRowEdited; Gtk::TreeIter currentItem(); - void addPage(xmlpp::Element* pageDiv, const Glib::ustring& filename, int page); + void addPage(xmlpp::Element* pageDiv, const Glib::ustring& filename, int page, bool cleanGraphics); bool addChildItems(xmlpp::Element* element, Gtk::TreeIter parentItem, std::map& langCache); void printChildren(PDFPainter& painter, Gtk::TreeIter item, const PDFSettings& pdfSettings) const; bool setCurrentSource(xmlpp::Element* pageElement, int* pageDpi = 0) const; diff --git a/qt/src/OutputEditorHOCR.cc b/qt/src/OutputEditorHOCR.cc index 23270a84..31e8d82a 100644 --- a/qt/src/OutputEditorHOCR.cc +++ b/qt/src/OutputEditorHOCR.cc @@ -394,10 +394,10 @@ void OutputEditorHOCR::addPage(const QString& hocrText, ReadSessionData data) .arg(data.angle) .arg(data.resolution); pageDiv.setAttribute("title", pageTitle); - addPage(pageDiv, QFileInfo(data.file).fileName(), data.page); + addPage(pageDiv, QFileInfo(data.file).fileName(), data.page, true); } -void OutputEditorHOCR::addPage(QDomElement pageDiv, const QString& filename, int page) +void OutputEditorHOCR::addPage(QDomElement pageDiv, const QString& filename, int page, bool cleanGraphics) { pageDiv.setAttribute("id", QString("page_%1").arg(++m_idCounter)); s_bboxRx.indexIn(pageDiv.attribute("title")); @@ -415,24 +415,51 @@ void OutputEditorHOCR::addPage(QDomElement pageDiv, const QString& filename, int ui.treeWidgetItems->addTopLevelItem(pageItem); QMap langCache; + QList> graphicElements; QDomElement element = pageDiv.firstChildElement("div"); while(!element.isNull()) { // Boxes without text are images if(!addChildItems(element.firstChildElement(), pageItem, langCache) && s_bboxRx.indexIn(element.attribute("title")) != -1) { + x1 = s_bboxRx.cap(1).toInt(); + y1 = s_bboxRx.cap(2).toInt(); + x2 = s_bboxRx.cap(3).toInt(); + y2 = s_bboxRx.cap(4).toInt(); + graphicElements.append(qMakePair(element, QRect(x1, y1, x2 - x1, y2 - y1))); + } + element = element.nextSiblingElement(); + } + + // Discard graphic elements which intersect with text block or which are too small + int numTextBlocks = pageItem->childCount(); + for(const QPair& pair: graphicElements) { + const QDomElement& element = pair.first; + const QRect& bbox = pair.second; + bool deleteGraphic = false; + if(cleanGraphics) { + if(bbox.width() < 10 || bbox.height() < 10) { + deleteGraphic = true; + } else { + for(int i = 0; i < numTextBlocks; ++i) { + if(bbox.intersects(pageItem->child(i)->data(0, BBoxRole).toRect())) { + deleteGraphic = true; + break; + } + } + } + } + if(!deleteGraphic) { QTreeWidgetItem* item = new QTreeWidgetItem(QStringList() << _("Graphic")); item->setCheckState(0, Qt::Checked); item->setIcon(0, QIcon(":/icons/item_halftone")); item->setData(0, IdRole, element.attribute("id")); item->setData(0, ClassRole, "ocr_graphic"); - x1 = s_bboxRx.cap(1).toInt(); - y1 = s_bboxRx.cap(2).toInt(); - x2 = s_bboxRx.cap(3).toInt(); - y2 = s_bboxRx.cap(4).toInt(); - item->setData(0, BBoxRole, QRect(x1, y1, x2 - x1, y2 - y1)); + item->setData(0, BBoxRole, bbox); pageItem->addChild(item); + } else { + element.parentNode().removeChild(element); } - element = element.nextSiblingElement(); } + QString str; QTextStream ss(&str); pageDiv.save(ss, 1); @@ -968,7 +995,7 @@ void OutputEditorHOCR::open() int page = 0; while(!div.isNull()) { ++page; - addPage(div, QFileInfo(filename).fileName(), page); + addPage(div, QFileInfo(filename).fileName(), page, false); div = div.nextSiblingElement("div"); } } diff --git a/qt/src/OutputEditorHOCR.hh b/qt/src/OutputEditorHOCR.hh index f4b3ef65..6795e897 100644 --- a/qt/src/OutputEditorHOCR.hh +++ b/qt/src/OutputEditorHOCR.hh @@ -116,7 +116,7 @@ private: QGraphicsPixmapItem* m_preview = nullptr; void findReplace(bool backwards, bool replace); - void addPage(QDomElement pageDiv, const QString& filename, int page); + void addPage(QDomElement pageDiv, const QString& filename, int page, bool cleanGraphics); bool addChildItems(QDomElement element, QTreeWidgetItem* parentItem, QMap& langCache); QDomElement elementById(QDomElement element, const QString& id) const; void expandChildren(QTreeWidgetItem* item) const;