Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

restore: precheck cluster is empty when first time full restore (#45014) #58774

Open
wants to merge 1 commit into
base: release-6.5
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions br/cmd/br/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ func printWorkaroundOnFullRestoreError(command *cobra.Command, err error) {
fmt.Println("#######################################################################")
switch {
case errors.ErrorEqual(err, berrors.ErrRestoreNotFreshCluster):
fmt.Println("# the target cluster is not fresh, br cannot restore system tables.")
fmt.Println("# the target cluster is not fresh, cannot restore.")
fmt.Println("# you can drop existing databases and tables and start restore again")
case errors.ErrorEqual(err, berrors.ErrRestoreIncompatibleSys):
fmt.Println("# the target cluster is not compatible with the backup data,")
fmt.Println("# br cannot restore system tables.")
fmt.Println("# you can remove 'with-sys-table' flag to skip restoring system tables")
}
fmt.Println("# you can remove 'with-sys-table' flag to skip restoring system tables")
fmt.Println("#######################################################################")
}

Expand Down
2 changes: 1 addition & 1 deletion br/pkg/backup/push.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ func (push *pushDown) pushBackup(
}
failpoint.Inject("backup-timeout-error", func(val failpoint.Value) {
msg := val.(string)
logutil.CL(ctx).Debug("failpoint backup-timeout-error injected.", zap.String("msg", msg))
logutil.CL(ctx).Info("failpoint backup-timeout-error injected.", zap.String("msg", msg))
resp.Error = &backuppb.Error{
Msg: msg,
}
Expand Down
65 changes: 65 additions & 0 deletions br/pkg/task/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,7 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf
return errors.Trace(err)
}

<<<<<<< HEAD
// todo: move this check into InitFullClusterRestore, we should move restore config into a separate package
// to avoid import cycle problem which we won't do it in this pr, then refactor this
//
Expand All @@ -624,6 +625,70 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf
if err = client.CheckTargetClusterFresh(ctx); err != nil {
return errors.Trace(err)
}
=======
if client.IsIncremental() {
// don't support checkpoint for the ddl restore
log.Info("the incremental snapshot restore doesn't support checkpoint mode, so unuse checkpoint.")
cfg.UseCheckpoint = false
}

restoreSchedulers, schedulersConfig, err := restorePreWork(ctx, client, mgr, true)
if err != nil {
return errors.Trace(err)
}

schedulersRemovable := false
defer func() {
// don't reset pd scheduler if checkpoint mode is used and restored is not finished
if cfg.UseCheckpoint && !schedulersRemovable {
log.Info("skip removing pd schehduler for next retry")
return
}
log.Info("start to remove the pd scheduler")
// run the post-work to avoid being stuck in the import
// mode or emptied schedulers.
restorePostWork(ctx, client, restoreSchedulers)
log.Info("finish removing pd scheduler")
}()

var checkpointSetWithTableID map[int64]map[string]struct{}
if cfg.UseCheckpoint {
taskName := cfg.generateSnapshotRestoreTaskName(client.GetClusterID(ctx))
sets, restoreSchedulersConfigFromCheckpoint, err := client.InitCheckpoint(ctx, s, taskName, schedulersConfig, cfg.UseCheckpoint)
if err != nil {
return errors.Trace(err)
}
if restoreSchedulersConfigFromCheckpoint != nil {
restoreSchedulers = mgr.MakeUndoFunctionByConfig(*restoreSchedulersConfigFromCheckpoint)
}
checkpointSetWithTableID = sets

defer func() {
// need to flush the whole checkpoint data so that br can quickly jump to
// the log kv restore step when the next retry.
log.Info("wait for flush checkpoint...")
client.WaitForFinishCheckpoint(ctx, len(cfg.FullBackupStorage) > 0 || !schedulersRemovable)
}()
}

if isFullRestore(cmdName) {
// we need check cluster is fresh every time. except restore from a checkpoint.
if client.IsFull() && len(checkpointSetWithTableID) == 0 {
if err = client.CheckTargetClusterFresh(ctx); err != nil {
return errors.Trace(err)
}
}
// todo: move this check into InitFullClusterRestore, we should move restore config into a separate package
// to avoid import cycle problem which we won't do it in this pr, then refactor this
//
// if it's point restore and reached here, then cmdName=FullRestoreCmd and len(cfg.FullBackupStorage) > 0
if cfg.WithSysTable {
client.InitFullClusterRestore(cfg.ExplicitFilter)
}
}

if client.IsFullClusterRestore() && client.HasBackedUpSysDB() {
>>>>>>> 8c5ca7b2008 (restore: precheck cluster is empty when first time full restore (#45014))
if err = client.CheckSysTableCompatibility(mgr.GetDomain(), tables); err != nil {
return errors.Trace(err)
}
Expand Down
5 changes: 5 additions & 0 deletions br/tests/_utils/run_br
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,13 @@

set -eux

<<<<<<< HEAD
bin/br.test -test.coverprofile="$TEST_DIR/cov.$TEST_NAME.$$.out.log" DEVEL "$@" \
-L "debug" \
=======
br.test -test.coverprofile="$TEST_DIR/cov.$TEST_NAME.$$.out.log" DEVEL "$@" \
-L "info" \
>>>>>>> 8c5ca7b2008 (restore: precheck cluster is empty when first time full restore (#45014))
--ca "$TEST_DIR/certs/ca.pem" \
--cert "$TEST_DIR/certs/br.pem" \
--key "$TEST_DIR/certs/br.key"
3 changes: 3 additions & 0 deletions br/tests/br_backup_empty/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ if [ $? -ne 0 ]; then
exit 1
fi

i=1
while [ $i -le $DB_COUNT ]; do
run_sql "DROP DATABASE $DB$i;"
i=$(($i+1))
Expand All @@ -71,6 +72,7 @@ run_sql "CREATE TABLE ${DB}1.usertable1 ( \
echo "backup empty table start..."
run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/empty_table"

i=1
while [ $i -le $DB_COUNT ]; do
run_sql "DROP DATABASE $DB$i;"
i=$(($i+1))
Expand All @@ -83,6 +85,7 @@ run_br --pd $PD_ADDR restore full -s "local://$TEST_DIR/empty_table"
# insert one row to make sure table is restored.
run_sql "INSERT INTO ${DB}1.usertable1 VALUES (\"a\", \"b\");"

i=1
while [ $i -le $DB_COUNT ]; do
run_sql "DROP DATABASE $DB$i;"
i=$(($i+1))
Expand Down
1 change: 1 addition & 0 deletions br/tests/br_full_ddl/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ fi

# clear restore environment
run_sql "DROP DATABASE $DB;"
run_sql "DROP DATABASE __tidb_br_temporary_mysql;"
# restore full
echo "restore start..."
export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/pdutil/PDEnabledPauseConfig=return(true)"
Expand Down
2 changes: 1 addition & 1 deletion br/tests/br_systables/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ add_test_data() {
}

delete_test_data() {
run_sql "DROP TABLE usertest.test;"
run_sql "DROP DATABASE usertest;"
}

rollback_modify() {
Expand Down
Loading