Commit b295e38
fine-tuned the import process of jsonl files, which had been missing
the fields needed to actually make searches and browse the index with
the host browser
Orbiter committed May 10, 2024
1 parent de941c6 commit b295e38
Showing 2 changed files with 36 additions and 13 deletions.
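
Note on the input format: these importers consume line-delimited JSON ("jsonl", also known in YaCy as "jsonlist" or "flatjson"): one JSON object per line. The skip condition in the first hunk below suggests the files may also carry Elasticsearch-bulk-style envelope lines of the form {"index": {...}}, which hold no document fields and are passed over. A minimal reading sketch, assuming an org.json-style JSONObject matching the json.opt()/json.keySet() calls in the diff (the file name is a made-up example; the real import path is JsonListImporter):

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.json.JSONObject;

public class JsonlReadSketch {
    public static void main(final String[] args) throws IOException {
        // file name is a hypothetical example; YaCy feeds these through JsonListImporter
        try (BufferedReader reader = Files.newBufferedReader(Path.of("dump.jsonl"), StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) continue;
                final JSONObject json = new JSONObject(line);
                // skip empty objects and bulk envelope lines like {"index":{"_id":"..."}}
                if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue;
                for (final String key : json.keySet()) {
                    System.out.println(key + " = " + json.opt(key));
                }
            }
        }
    }
}
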
20 changes: 20 additions & 0 deletions source/net/yacy/document/importer/JsonListImporter.java
@@ -148,6 +148,11 @@ public void run() {
 }
 if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue;
 final SolrInputDocument surrogate = new SolrInputDocument();
+
+// set default values which act as constraints for a proper search
+CollectionSchema.httpstatus_i.add(surrogate, 200);
+
+// get fields for json object
 jsonreader: for (final String key: json.keySet()) {
     final Object o = json.opt(key);
     if (o == null) continue;
@@ -212,10 +217,19 @@ public void run() {
     final String id = ASCII.String(durl.hash());
     surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true));
     surrogate.setField(CollectionSchema.id.getSolrFieldName(), id);
+    surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
+    surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6));
     continue jsonreader;
 }
+if (key.equals("description")) {
+    // in YaCy descriptions are full-text indexed and also multi-value fields
+    final List<Object> descriptions = new ArrayList<>();
+    descriptions.add(o.toString());
+    CollectionSchema.description_txt.add(surrogate, descriptions);
+    continue jsonreader;
+}
 if (key.equals("referrer_url_s")) {
     // same patch as for urls which require re-calculation of id's; in this case we store the id only!
     final DigestURL durl = new DigestURL(o.toString());
     final String id = ASCII.String(durl.hash());
     surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id);
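
Two details in this hunk deserve a note. First, host_id_s is derived as id.substring(6); this matches YaCy's 12-character url hash layout, in which the trailing 6 characters encode the host hash (an inference from the code, not something the diff states). Second, the description value is wrapped in a List because description_txt is a multi-valued Solr field. A sketch of the multi-value handling with plain SolrJ, field names as in the diff, values invented:

import java.util.ArrayList;
import java.util.List;
import org.apache.solr.common.SolrInputDocument;

public class MultiValueSketch {
    public static void main(final String[] args) {
        final SolrInputDocument surrogate = new SolrInputDocument();
        // single-valued field: setField replaces any previous value
        surrogate.setField("sku", "https://example.com/page");
        // multi-valued field: pass a List so all values are kept
        final List<Object> descriptions = new ArrayList<>();
        descriptions.add("a first description");
        surrogate.setField("description_txt", descriptions);
        System.out.println(surrogate);
    }
}
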
@@ -236,6 +250,12 @@ public void run() {
     continue jsonreader;
 }
 
+// check if required fields are still missing and compute them
+if (!surrogate.containsKey(CollectionSchema.host_s.getSolrFieldName())) {
+    final DigestURL durl = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+    surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
+}
+
 // regular situation, just read content of field
 surrogate.setField(key, o.toString());
 }
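
The last hunk backfills host_s from the already-stored sku URL when the input record did not provide a host field, which is what makes host-based browsing work for imported documents. DigestURL is YaCy-internal; as a stand-in under that assumption, java.net.URI shows the same derivation (URL invented):

import java.net.URI;

public class HostBackfillSketch {
    public static void main(final String[] args) {
        final String sku = "https://example.com/some/path"; // value previously stored in the sku field
        final String host = URI.create(sku).getHost();      // "example.com"
        System.out.println(host);
    }
}
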
29 changes: 16 additions & 13 deletions source/net/yacy/search/Switchboard.java
@@ -2325,7 +2325,10 @@ public int surrogateQueueSize() {
         || s.endsWith(".xml.zip")
         || s.endsWith(".warc")
         || s.endsWith(".warc.gz")
+        || s.endsWith(".jsonl")
+        || s.endsWith(".jsonl.gz")
         || s.endsWith(".jsonlist")
+        || s.endsWith(".jsonlist.gz")
         || s.endsWith(".flatjson") ) {
     count++;
 }
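
This hunk simply teaches the surrogate queue scanner the new jsonl suffixes. A hypothetical stand-alone predicate with the visible suffix list (the hunk elides the suffixes checked above ".xml.zip", so this list is not complete; the helper name is illustrative):

public class SurrogateNameSketch {
    // mirrors only the suffixes visible in the hunk above
    static boolean looksLikeSurrogate(final String s) {
        return s.endsWith(".xml.zip")
            || s.endsWith(".warc") || s.endsWith(".warc.gz")
            || s.endsWith(".jsonl") || s.endsWith(".jsonl.gz")
            || s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz")
            || s.endsWith(".flatjson");
    }

    public static void main(final String[] args) {
        System.out.println(looksLikeSurrogate("dump.jsonl.gz")); // true
    }
}
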
@@ -3167,19 +3170,19 @@ public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
 }
 
 // check mustmatch pattern
-Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
+final Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
 if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
-    String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
+    final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
     if (this.log.isInfo()) this.log.info(info);
     // create a new errorURL DB entry
     this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
     return new IndexingQueueEntry(in.queueEntry, in.documents, null);
 }
 
 // check mustnotmatch
-Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
+final Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
 if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
-    String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
+    final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
     if (this.log.isInfo()) this.log.info(info);
     // create a new errorURL DB entry
     this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
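
Beyond the final declarations, the guard idiom here is worth noting: each profile pattern is first compared by identity against a shared sentinel (CrawlProfile.MATCH_ALL_PATTERN or MATCH_NEVER_PATTERN), so the common unconstrained case skips regex matching entirely. A minimal sketch of that idiom with local sentinels (names and sentinel regexes are my own; YaCy's live in CrawlProfile):

import java.util.regex.Pattern;

public class PatternGateSketch {
    // shared sentinels: compared by identity, so the default case costs no matching
    static final Pattern MATCH_ALL = Pattern.compile(".*");
    static final Pattern MATCH_NEVER = Pattern.compile("(?!)");

    static boolean allowed(final String url, final Pattern mustMatch, final Pattern mustNotMatch) {
        if (mustMatch != MATCH_ALL && !mustMatch.matcher(url).matches()) return false;
        if (mustNotMatch != MATCH_NEVER && mustNotMatch.matcher(url).matches()) return false;
        return true;
    }

    public static void main(final String[] args) {
        System.out.println(allowed("https://example.com/", MATCH_ALL, MATCH_NEVER)); // true
    }
}
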
@@ -3192,13 +3195,13 @@ public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
 
 // check canonical
 if (profile.noindexWhenCanonicalUnequalURL()) {
-    AnchorURL canonical = document.getCanonical();
-    DigestURL source = document.dc_source();
+    final AnchorURL canonical = document.getCanonical();
+    final DigestURL source = document.dc_source();
     if (canonical != null && source != null) {
-        String canonical_norm = canonical.toNormalform(true);
-        String source_norm = source.toNormalform(true);
+        final String canonical_norm = canonical.toNormalform(true);
+        final String source_norm = source.toNormalform(true);
         if (!canonical_norm.equals(source_norm)) {
-            String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " + canonical_norm + "; source = " + source_norm;
+            final String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " + canonical_norm + "; source = " + source_norm;
             if (this.log.isInfo()) this.log.info(info);
             // create a new errorURL DB entry
             this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@@ -3216,19 +3219,19 @@ public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
 }
 
 // check content pattern must-match
-Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
+final Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
 if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
-    String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern();
+    final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern();
     if (this.log.isInfo()) this.log.info(info);
     // create a new errorURL DB entry
     this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
     continue docloop;
 }
 
 // check content pattern must-not-match
-Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
+final Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
 if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
-    String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
+    final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
     if (this.log.isInfo()) this.log.info(info);
     // create a new errorURL DB entry
     this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
