Skip to content

Commit

Permalink
Purge graphic items which overlap with text blocks or are too small from hOCR result
Browse files Browse the repository at this point in the history
  • Loading branch information
manisandro committed Oct 13, 2016
1 parent 325a3ff commit a1bdb80
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 20 deletions.
2 changes: 1 addition & 1 deletion gtk/src/Geometry.hh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ public:
/// Tests whether the point @p p lies inside this rectangle.
/// All four edges are inclusive: a point exactly on the border counts as contained.
bool contains(const Point& p) const{
    // Reject early if the point is outside the horizontal extent.
    const bool withinColumns = p.x >= x && p.x <= x + width;
    if(!withinColumns) {
        return false;
    }
    // Horizontal extent matches, so the result depends on the vertical extent only.
    return p.y >= y && p.y <= y + height;
}
bool overlaps(const Rectangle& r) {
/// Tests whether this rectangle and @p r share a common area.
/// The comparisons are strict, so rectangles which merely touch along an
/// edge or at a corner are not reported as overlapping.
bool overlaps(const Rectangle& r) const {
    // The horizontal extents must intersect...
    if(!(x < r.x + r.width && x + width > r.x)) {
        return false;
    }
    // ...and so must the vertical extents.
    return y < r.y + r.height && y + height > r.y;
}
Rectangle unite(const Rectangle& r) const{
Expand Down
42 changes: 34 additions & 8 deletions gtk/src/OutputEditorHOCR.cc
Original file line number Diff line number Diff line change
Expand Up @@ -493,10 +493,10 @@ void OutputEditorHOCR::addPage(const Glib::ustring& hocrText, ReadSessionData da
Glib::ustring pageTitle = Glib::ustring::compose("image '%1'; bbox %2 %3 %4 %5; pageno %6; rot %7; res %8",
data.file, x1, y1, x2, y2, data.page, data.angle, data.resolution);
pageDiv->set_attribute("title", pageTitle);
addPage(pageDiv, Gio::File::create_for_path(data.file)->get_basename(), data.page);
addPage(pageDiv, Gio::File::create_for_path(data.file)->get_basename(), data.page, true);
}

void OutputEditorHOCR::addPage(xmlpp::Element* pageDiv, const Glib::ustring& filename, int page)
void OutputEditorHOCR::addPage(xmlpp::Element* pageDiv, const Glib::ustring& filename, int page, bool cleanGraphics)
{
m_connectionItemViewRowEdited.block(true);
pageDiv->set_attribute("id", Glib::ustring::compose("page_%1", ++m_idCounter));
Expand Down Expand Up @@ -525,11 +525,40 @@ void OutputEditorHOCR::addPage(xmlpp::Element* pageDiv, const Glib::ustring& fil

std::map<Glib::ustring,Glib::ustring> langCache;

std::vector<std::pair<xmlpp::Element*,Geometry::Rectangle>> graphicElements;
xmlpp::Element* element = getFirstChildElement(pageDiv, "div");
while(element) {
// Boxes without text are images
titleAttr = getAttribute(element, "title");
if(!addChildItems(getFirstChildElement(element), pageItem, langCache) && s_bboxRx->match(titleAttr, matchInfo)) {
x1 = std::atoi(matchInfo.fetch(1).c_str());
y1 = std::atoi(matchInfo.fetch(2).c_str());
x2 = std::atoi(matchInfo.fetch(3).c_str());
y2 = std::atoi(matchInfo.fetch(4).c_str());
graphicElements.push_back(std::make_pair(element, Geometry::Rectangle(x1, y1, x2-x1, y2-y1)));
}
element = getNextSiblingElement(element);
}

// Discard graphic elements which intersect a text block or which are too small (width or height below 10)
int numTextBlocks = pageItem->children().size();
for(const std::pair<xmlpp::Element*,Geometry::Rectangle>& pair : graphicElements) {
xmlpp::Element* element = pair.first;
const Geometry::Rectangle& bbox = pair.second;
bool deleteGraphic = false;
if(cleanGraphics) {
if(bbox.width < 10 || bbox.height < 10) {
deleteGraphic = true;
} else {
for(int i = 0; i < numTextBlocks; ++i) {
if(bbox.overlaps((*pageItem->children()[i])[m_itemStoreCols.bbox])){
deleteGraphic = true;
break;
}
}
}
}
if(!deleteGraphic) {
Gtk::TreeIter item = m_itemStore->append(pageItem->children());
item->set_value(m_itemStoreCols.text, Glib::ustring(_("Graphic")));
item->set_value(m_itemStoreCols.selected, true);
Expand All @@ -542,13 +571,10 @@ void OutputEditorHOCR::addPage(xmlpp::Element* pageDiv, const Glib::ustring& fil
item->set_value(m_itemStoreCols.id, getAttribute(element, "id"));
item->set_value(m_itemStoreCols.itemClass, Glib::ustring("ocr_graphic"));
item->set_value(m_itemStoreCols.textColor, Glib::ustring("#000"));
x1 = std::atoi(matchInfo.fetch(1).c_str());
y1 = std::atoi(matchInfo.fetch(2).c_str());
x2 = std::atoi(matchInfo.fetch(3).c_str());
y2 = std::atoi(matchInfo.fetch(4).c_str());
item->set_value(m_itemStoreCols.bbox, Geometry::Rectangle(x1, y1, x2-x1, y2-y1));
} else {
element->get_parent()->remove_child(element);
}
element = getNextSiblingElement(element);
}
pageItem->set_value(m_itemStoreCols.source, getElementXML(pageDiv));
m_itemView->expand_row(Gtk::TreePath(pageItem), true);
Expand Down Expand Up @@ -1155,7 +1181,7 @@ void OutputEditorHOCR::open()
int page = 0;
while(div) {
++page;
addPage(div, files.front()->get_basename(), page);
addPage(div, files.front()->get_basename(), page, false);
div = getNextSiblingElement(div, "div");
}
}
Expand Down
2 changes: 1 addition & 1 deletion gtk/src/OutputEditorHOCR.hh
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ private:
sigc::connection m_connectionPropViewRowEdited;

Gtk::TreeIter currentItem();
void addPage(xmlpp::Element* pageDiv, const Glib::ustring& filename, int page);
void addPage(xmlpp::Element* pageDiv, const Glib::ustring& filename, int page, bool cleanGraphics);
bool addChildItems(xmlpp::Element* element, Gtk::TreeIter parentItem, std::map<Glib::ustring, Glib::ustring>& langCache);
void printChildren(PDFPainter& painter, Gtk::TreeIter item, const PDFSettings& pdfSettings) const;
bool setCurrentSource(xmlpp::Element* pageElement, int* pageDpi = 0) const;
Expand Down
45 changes: 36 additions & 9 deletions qt/src/OutputEditorHOCR.cc
Original file line number Diff line number Diff line change
Expand Up @@ -394,10 +394,10 @@ void OutputEditorHOCR::addPage(const QString& hocrText, ReadSessionData data)
.arg(data.angle)
.arg(data.resolution);
pageDiv.setAttribute("title", pageTitle);
addPage(pageDiv, QFileInfo(data.file).fileName(), data.page);
addPage(pageDiv, QFileInfo(data.file).fileName(), data.page, true);
}

void OutputEditorHOCR::addPage(QDomElement pageDiv, const QString& filename, int page)
void OutputEditorHOCR::addPage(QDomElement pageDiv, const QString& filename, int page, bool cleanGraphics)
{
pageDiv.setAttribute("id", QString("page_%1").arg(++m_idCounter));
s_bboxRx.indexIn(pageDiv.attribute("title"));
Expand All @@ -415,24 +415,51 @@ void OutputEditorHOCR::addPage(QDomElement pageDiv, const QString& filename, int
ui.treeWidgetItems->addTopLevelItem(pageItem);
QMap<QString,QString> langCache;

QList<QPair<QDomElement,QRect>> graphicElements;
QDomElement element = pageDiv.firstChildElement("div");
while(!element.isNull()) {
// Boxes without text are images
if(!addChildItems(element.firstChildElement(), pageItem, langCache) && s_bboxRx.indexIn(element.attribute("title")) != -1) {
x1 = s_bboxRx.cap(1).toInt();
y1 = s_bboxRx.cap(2).toInt();
x2 = s_bboxRx.cap(3).toInt();
y2 = s_bboxRx.cap(4).toInt();
graphicElements.append(qMakePair(element, QRect(x1, y1, x2 - x1, y2 - y1)));
}
element = element.nextSiblingElement();
}

// Discard graphic elements which intersect a text block or which are too small (width or height below 10)
int numTextBlocks = pageItem->childCount();
for(const QPair<QDomElement,QRect>& pair: graphicElements) {
const QDomElement& element = pair.first;
const QRect& bbox = pair.second;
bool deleteGraphic = false;
if(cleanGraphics) {
if(bbox.width() < 10 || bbox.height() < 10) {
deleteGraphic = true;
} else {
for(int i = 0; i < numTextBlocks; ++i) {
if(bbox.intersects(pageItem->child(i)->data(0, BBoxRole).toRect())) {
deleteGraphic = true;
break;
}
}
}
}
if(!deleteGraphic) {
QTreeWidgetItem* item = new QTreeWidgetItem(QStringList() << _("Graphic"));
item->setCheckState(0, Qt::Checked);
item->setIcon(0, QIcon(":/icons/item_halftone"));
item->setData(0, IdRole, element.attribute("id"));
item->setData(0, ClassRole, "ocr_graphic");
x1 = s_bboxRx.cap(1).toInt();
y1 = s_bboxRx.cap(2).toInt();
x2 = s_bboxRx.cap(3).toInt();
y2 = s_bboxRx.cap(4).toInt();
item->setData(0, BBoxRole, QRect(x1, y1, x2 - x1, y2 - y1));
item->setData(0, BBoxRole, bbox);
pageItem->addChild(item);
} else {
element.parentNode().removeChild(element);
}
element = element.nextSiblingElement();
}

QString str;
QTextStream ss(&str);
pageDiv.save(ss, 1);
Expand Down Expand Up @@ -968,7 +995,7 @@ void OutputEditorHOCR::open()
int page = 0;
while(!div.isNull()) {
++page;
addPage(div, QFileInfo(filename).fileName(), page);
addPage(div, QFileInfo(filename).fileName(), page, false);
div = div.nextSiblingElement("div");
}
}
Expand Down
2 changes: 1 addition & 1 deletion qt/src/OutputEditorHOCR.hh
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ private:
QGraphicsPixmapItem* m_preview = nullptr;

void findReplace(bool backwards, bool replace);
void addPage(QDomElement pageDiv, const QString& filename, int page);
void addPage(QDomElement pageDiv, const QString& filename, int page, bool cleanGraphics);
bool addChildItems(QDomElement element, QTreeWidgetItem* parentItem, QMap<QString, QString>& langCache);
QDomElement elementById(QDomElement element, const QString& id) const;
void expandChildren(QTreeWidgetItem* item) const;
Expand Down

0 comments on commit a1bdb80

Please sign in to comment.