diff --git a/solr/packaging/test/test_roundtrip_export_import.bats b/solr/packaging/test/test_roundtrip_export_import.bats new file mode 100644 index 00000000000..aef3e2c680b --- /dev/null +++ b/solr/packaging/test/test_roundtrip_export_import.bats @@ -0,0 +1,103 @@ +#!/usr/bin/env bats + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# These tests demonstrate that you can round trip data out of Solr and back. +# However, you need to be careful about which fields you select: if you include copyFields +# then you run into issues on the import. +# "json" format works with the bin/solr post tool. +# "jsonl" format works with curl and posting directly to /update/json. +# "javabin" format works with curl and posting directly to /update; there is no option for cbor. 
+ +load bats_helper + +setup() { + common_clean_setup +} + +teardown() { + # save a snapshot of SOLR_HOME for failed tests + save_home_on_failure + + solr stop --all >/dev/null 2>&1 +} + +@test "roundtrip export and import using .json and post command" { + run solr start -e techproducts + run solr export --fields id,name,manu,manu_id_s,cat,features,price,popularity,inStock,store,manufacturedate_dt,payloads --solr-url "http://localhost:${SOLR_PORT}" --name techproducts --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/output" + + assert [ -e ${BATS_TEST_TMPDIR}/output.json ] + + run solr create -c COLL_NAME -d sample_techproducts_configs + + run solr post --format --solr-url "http://localhost:${SOLR_PORT}" -c COLL_NAME ${BATS_TEST_TMPDIR}/output.json + + assert_output --partial '1 files indexed.' + refute_output --partial 'ERROR' + run curl "http://localhost:${SOLR_PORT}/solr/COLL_NAME/select?q=*:*&fl=id,name,manu,manu_id_s,cat,features,price,popularity,inStock,store,manufacturedate_dt,payloads" + assert_output --partial '"numFound":32' + + # export once more the imported data, and compare that export to the original export from techproducts + run solr export --fields id,name,manu,manu_id_s,cat,features,price,popularity,inStock,store,manufacturedate_dt,payloads --solr-url http://localhost:${SOLR_PORT} --name COLL_NAME --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/COLL_NAME" + + run diff <(jq -S . ${BATS_TEST_TMPDIR}/output.json) <(jq -S . 
${BATS_TEST_TMPDIR}/COLL_NAME.json) + assert_success + assert_output "" +} + +@test "roundtrip export and import using .jsonl and curl command" { + run solr start -e techproducts + run solr export --format jsonl --fields id,name,manu,manu_id_s,cat,features,price,popularity,inStock,store,manufacturedate_dt,payloads --solr-url http://localhost:${SOLR_PORT} --name techproducts --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/output" + + assert [ -e ${BATS_TEST_TMPDIR}/output.jsonl ] + + run solr create -c COLL_NAME -d sample_techproducts_configs + + run curl "http://localhost:${SOLR_PORT}/api/collections/COLL_NAME/update/json" -H 'Content-type:application/json' -d "@${BATS_TEST_TMPDIR}/output.jsonl" + + assert_output --partial '"status":0' + run curl "http://localhost:${SOLR_PORT}/solr/COLL_NAME/select?q=*:*" + assert_output --partial '"numFound":32' + + run solr export --format jsonl --fields id,name,manu,manu_id_s,cat,features,price,popularity,inStock,store,manufacturedate_dt,payloads --solr-url http://localhost:${SOLR_PORT} --name COLL_NAME --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/COLL_NAME" + + run diff <(jq -c . ${BATS_TEST_TMPDIR}/output.jsonl | sort) <(jq -c . 
${BATS_TEST_TMPDIR}/COLL_NAME.jsonl | sort) + assert_success + assert_output "" +} + +@test "roundtrip export and import using .javabin and curl command" { + run solr start -e techproducts + run solr export --format javabin --fields id,name,manu,manu_id_s,cat,features,price,popularity,inStock,store,manufacturedate_dt,payloads --solr-url http://localhost:${SOLR_PORT} --name techproducts --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/output" + run solr export --fields id,name,manu,manu_id_s,cat,features,price,popularity,inStock,store,manufacturedate_dt,payloads --solr-url "http://localhost:${SOLR_PORT}" --name techproducts --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/output" + + assert [ -e ${BATS_TEST_TMPDIR}/output.javabin ] + + run solr create -c COLL_NAME -d sample_techproducts_configs + + run curl "http://localhost:${SOLR_PORT}/solr/COLL_NAME/update?commit=true" -H 'Content-type:application/javabin' --data-binary "@${BATS_TEST_TMPDIR}/output.javabin" + + assert_output --partial '"status":0' + run curl "http://localhost:${SOLR_PORT}/solr/COLL_NAME/select?q=*:*" + assert_output --partial '"numFound":32' + + # We compare the success of the round trip of javabin formatted data using json formatted exports to leverage `diff` command. + run solr export --fields id,name,manu,manu_id_s,cat,features,price,popularity,inStock,store,manufacturedate_dt,payloads --solr-url http://localhost:${SOLR_PORT} --name COLL_NAME --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/COLL_NAME" + + run diff <(jq -S . ${BATS_TEST_TMPDIR}/output.json) <(jq -S . 
${BATS_TEST_TMPDIR}/COLL_NAME.json) + assert_success + assert_output "" +} diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc index 6cab3f6789a..0e05a415e0e 100644 --- a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc +++ b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc @@ -1703,7 +1703,8 @@ The default is `\*:*` which will export all documents. |=== + Comma separated list of fields to be exported. -By default all fields are fetched. +By default all fields are fetched. +If your schema uses `copyFields` and you reindex the data, then you probably want to specify exactly which fields are being exported to simplify the import process. `--limit `:: + diff --git a/solr/solr-ref-guide/modules/indexing-guide/pages/reindexing.adoc b/solr/solr-ref-guide/modules/indexing-guide/pages/reindexing.adoc index 821c00dafb8..9dcc9ccd29b 100644 --- a/solr/solr-ref-guide/modules/indexing-guide/pages/reindexing.adoc +++ b/solr/solr-ref-guide/modules/indexing-guide/pages/reindexing.adoc @@ -165,6 +165,20 @@ Once the alias is in place and you are satisfied you no longer need the old data One advantage of this option is that you can switch back to the old collection if you discover problems our testing did not uncover. Of course this option can require more resources until the old collection can be deleted. +=== Exporting/Importing Data from Solr + +Sometimes you don't want to run your full ETL pipeline to reindex into another collection, you just want to take the data in your existing collection, export it, and then import it back. + +There are a number of third party tools that do this, see https://solr.cool/ for more information. However, if you want to use what ships with Solr then we have some options: + +1. 
Use `bin/solr export` with the JSON output format (`.json`), and use the `bin/solr post` tool to post that data back. +2. Use `bin/solr export` with the JSON Lines format (`.jsonl`), and use `curl` to post that data back to Solr via the JSON Update Request Handler. +3. Use `bin/solr export` with Solr's Java binary format (`.javabin`), and use `curl` to post that data back to Solr via the default Update Request Handler. + +You can see some examples of this in our automated BATS tests: https://github.com/apache/solr/blob/main/solr/packaging/test/test_roundtrip_export_import.bats + +See the reference guide for xref:deployment-guide:solr-control-script-reference.adoc#exporting-and-importing[Exporting and Importing]. + == Changes that Do Not Require Reindex The types of changes that do not require or strongly indicate reindexing are changes that do not impact the index.