Commit b295e38
fine-tuned the import process of jsonl files, which had been missing
the fields needed to actually make searches and browse the index with
the host browser
Orbiter committed May 10, 2024
1 parent de941c6 commit b295e38
Showing 2 changed files with 36 additions and 13 deletions.
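
Note on the input format: these importers consume line-delimited JSON ("jsonl", also known in YaCy as "jsonlist" or "flatjson"): one JSON object per line. The skip condition in the first hunk below suggests the files may also carry Elasticsearch-bulk-style envelope lines of the form {"index": {...}}, which hold no document fields and are passed over. A minimal reading sketch, assuming an org.json-style JSONObject matching the json.opt()/json.keySet() calls in the diff (the file name is a made-up example; the real import path is JsonListImporter):

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.json.JSONObject;

public class JsonlReadSketch {
    public static void main(final String[] args) throws IOException {
        // file name is a hypothetical example; YaCy feeds these through JsonListImporter
        try (BufferedReader reader = Files.newBufferedReader(Path.of("dump.jsonl"), StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) continue;
                final JSONObject json = new JSONObject(line);
                // skip empty objects and bulk envelope lines like {"index":{"_id":"..."}}
                if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue;
                for (final String key : json.keySet()) {
                    System.out.println(key + " = " + json.opt(key));
                }
            }
        }
    }
}
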
20 changes: 20 additions & 0 deletions source/net/yacy/document/importer/JsonListImporter.java
@@ -148,6 +148,11 @@ public void run() {
 }
 if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue;
 final SolrInputDocument surrogate = new SolrInputDocument();
+
+// set default values which act as constraints for a proper search
+CollectionSchema.httpstatus_i.add(surrogate, 200);
+
+// get fields for json object
 jsonreader: for (final String key: json.keySet()) {
     final Object o = json.opt(key);
     if (o == null) continue;
@@ -212,10 +217,19 @@ public void run() {
     final String id = ASCII.String(durl.hash());
     surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true));
     surrogate.setField(CollectionSchema.id.getSolrFieldName(), id);
+    surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
+    surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6));
     continue jsonreader;
 }
+if (key.equals("description")) {
+    // in YaCy descriptions are full-text indexed and also multi-value fields
+    final List<Object> descriptions = new ArrayList<>();
+    descriptions.add(o.toString());
+    CollectionSchema.description_txt.add(surrogate, descriptions);
+    continue jsonreader;
+}
 if (key.equals("referrer_url_s")) {
     // same patch as for urls which require re-calculation of id's; in this case we store the id only!
     final DigestURL durl = new DigestURL(o.toString());
     final String id = ASCII.String(durl.hash());
     surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id);
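
Two details in this hunk deserve a note. First, host_id_s is derived as id.substring(6); this matches YaCy's 12-character url hash layout, in which the trailing 6 characters encode the host hash (an inference from the code, not something the diff states). Second, the description value is wrapped in a List because description_txt is a multi-valued Solr field. A sketch of the multi-value handling with plain SolrJ, field names as in the diff, values invented:

import java.util.ArrayList;
import java.util.List;
import org.apache.solr.common.SolrInputDocument;

public class MultiValueSketch {
    public static void main(final String[] args) {
        final SolrInputDocument surrogate = new SolrInputDocument();
        // single-valued field: setField replaces any previous value
        surrogate.setField("sku", "https://example.com/page");
        // multi-valued field: pass a List so all values are kept
        final List<Object> descriptions = new ArrayList<>();
        descriptions.add("a first description");
        surrogate.setField("description_txt", descriptions);
        System.out.println(surrogate);
    }
}
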
@@ -236,6 +250,12 @@ public void run() {
     continue jsonreader;
 }
 
+// check if required fields are still missing and compute them
+if (!surrogate.containsKey(CollectionSchema.host_s.getSolrFieldName())) {
+    final DigestURL durl = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+    surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
+}
+
 // regular situation, just read content of field
 surrogate.setField(key, o.toString());
 }
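
The last hunk backfills host_s from the already-stored sku URL when the input record did not provide a host field, which is what makes host-based browsing work for imported documents. DigestURL is YaCy-internal; as a stand-in under that assumption, java.net.URI shows the same derivation (URL invented):

import java.net.URI;

public class HostBackfillSketch {
    public static void main(final String[] args) {
        final String sku = "https://example.com/some/path"; // value previously stored in the sku field
        final String host = URI.create(sku).getHost();      // "example.com"
        System.out.println(host);
    }
}
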
29 changes: 16 additions & 13 deletions source/net/yacy/search/Switchboard.java
@@ -2325,7 +2325,10 @@ public int surrogateQueueSize() {
         || s.endsWith(".xml.zip")
         || s.endsWith(".warc")
         || s.endsWith(".warc.gz")
+        || s.endsWith(".jsonl")
+        || s.endsWith(".jsonl.gz")
         || s.endsWith(".jsonlist")
+        || s.endsWith(".jsonlist.gz")
         || s.endsWith(".flatjson") ) {
     count++;
 }
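
This hunk simply teaches the surrogate queue scanner the new jsonl suffixes. A hypothetical stand-alone predicate with the visible suffix list (the hunk elides the suffixes checked above ".xml.zip", so this list is not complete; the helper name is illustrative):

public class SurrogateNameSketch {
    // mirrors only the suffixes visible in the hunk above
    static boolean looksLikeSurrogate(final String s) {
        return s.endsWith(".xml.zip")
            || s.endsWith(".warc") || s.endsWith(".warc.gz")
            || s.endsWith(".jsonl") || s.endsWith(".jsonl.gz")
            || s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz")
            || s.endsWith(".flatjson");
    }

    public static void main(final String[] args) {
        System.out.println(looksLikeSurrogate("dump.jsonl.gz")); // true
    }
}
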
@@ -3167,19 +3170,19 @@ public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
 }
 
 // check mustmatch pattern
-Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
+final Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
 if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
-    String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
+    final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
     if (this.log.isInfo()) this.log.info(info);
     // create a new errorURL DB entry
     this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
     return new IndexingQueueEntry(in.queueEntry, in.documents, null);
 }
 
 // check mustnotmatch
-Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
+final Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
 if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
-    String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
+    final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
     if (this.log.isInfo()) this.log.info(info);
     // create a new errorURL DB entry
     this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
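
Beyond the final declarations, the guard idiom here is worth noting: each profile pattern is first compared by identity against a shared sentinel (CrawlProfile.MATCH_ALL_PATTERN or MATCH_NEVER_PATTERN), so the common unconstrained case skips regex matching entirely. A minimal sketch of that idiom with local sentinels (names and sentinel regexes are my own; YaCy's live in CrawlProfile):

import java.util.regex.Pattern;

public class PatternGateSketch {
    // shared sentinels: compared by identity, so the default case costs no matching
    static final Pattern MATCH_ALL = Pattern.compile(".*");
    static final Pattern MATCH_NEVER = Pattern.compile("(?!)");

    static boolean allowed(final String url, final Pattern mustMatch, final Pattern mustNotMatch) {
        if (mustMatch != MATCH_ALL && !mustMatch.matcher(url).matches()) return false;
        if (mustNotMatch != MATCH_NEVER && mustNotMatch.matcher(url).matches()) return false;
        return true;
    }

    public static void main(final String[] args) {
        System.out.println(allowed("https://example.com/", MATCH_ALL, MATCH_NEVER)); // true
    }
}
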
@@ -3192,13 +3195,13 @@ public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
 
 // check canonical
 if (profile.noindexWhenCanonicalUnequalURL()) {
-    AnchorURL canonical = document.getCanonical();
-    DigestURL source = document.dc_source();
+    final AnchorURL canonical = document.getCanonical();
+    final DigestURL source = document.dc_source();
     if (canonical != null && source != null) {
-        String canonical_norm = canonical.toNormalform(true);
-        String source_norm = source.toNormalform(true);
+        final String canonical_norm = canonical.toNormalform(true);
+        final String source_norm = source.toNormalform(true);
         if (!canonical_norm.equals(source_norm)) {
-            String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " + canonical_norm + "; source = " + source_norm;
+            final String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " + canonical_norm + "; source = " + source_norm;
             if (this.log.isInfo()) this.log.info(info);
             // create a new errorURL DB entry
             this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@@ -3216,19 +3219,19 @@ public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
 }
 
 // check content pattern must-match
-Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
+final Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
 if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
-    String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern();
+    final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern();
     if (this.log.isInfo()) this.log.info(info);
     // create a new errorURL DB entry
     this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
     continue docloop;
 }
 
 // check content pattern must-not-match
-Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
+final Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
 if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
-    String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
+    final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
     if (this.log.isInfo()) this.log.info(info);
     // create a new errorURL DB entry
     this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
