Skip to content

Commit

Permalink
This is an automated cherry-pick of pingcap#45014
Browse files Browse the repository at this point in the history
Signed-off-by: ti-chi-bot <[email protected]>
  • Loading branch information
3pointer authored and ti-chi-bot committed Jan 8, 2025
1 parent 8ce140d commit b074975
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 1 deletion.
21 changes: 21 additions & 0 deletions br/cmd/br/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,27 @@ func runRestoreCommand(command *cobra.Command, cmdName string) error {
return nil
}

<<<<<<< HEAD
=======
// print workaround when we met not fresh or incompatible cluster error on full cluster restore
// printWorkaroundOnFullRestoreError prints a boxed workaround hint on stdout when a
// full cluster restore failed because the target cluster is either not fresh
// (ErrRestoreNotFreshCluster) or has incompatible system tables
// (ErrRestoreIncompatibleSys). Any other error is silently ignored.
func printWorkaroundOnFullRestoreError(command *cobra.Command, err error) {
	notFresh := errors.ErrorEqual(err, berrors.ErrRestoreNotFreshCluster)
	incompatibleSys := errors.ErrorEqual(err, berrors.ErrRestoreIncompatibleSys)
	// Only the two known full-restore precheck failures get a workaround message.
	if !notFresh && !incompatibleSys {
		return
	}
	fmt.Println("#######################################################################")
	if notFresh {
		fmt.Println("# the target cluster is not fresh, cannot restore.")
		fmt.Println("# you can drop existing databases and tables and start restore again")
	} else {
		fmt.Println("# the target cluster is not compatible with the backup data,")
		fmt.Println("# you can remove 'with-sys-table' flag to skip restoring system tables")
	}
	fmt.Println("#######################################################################")
}

>>>>>>> 8c5ca7b2008 (restore: precheck cluster is empty when first time full restore (#45014))
func runRestoreRawCommand(command *cobra.Command, cmdName string) error {
cfg := task.RestoreRawConfig{
RawKvConfig: task.RawKvConfig{Config: task.Config{LogProgress: HasLogFile()}},
Expand Down
10 changes: 10 additions & 0 deletions br/pkg/backup/push.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,16 @@ func (push *pushDown) pushBackup(
// Finished.
return res, nil
}
<<<<<<< HEAD
=======
failpoint.Inject("backup-timeout-error", func(val failpoint.Value) {
msg := val.(string)
logutil.CL(ctx).Info("failpoint backup-timeout-error injected.", zap.String("msg", msg))
resp.Error = &backuppb.Error{
Msg: msg,
}
})
>>>>>>> 8c5ca7b2008 (restore: precheck cluster is empty when first time full restore (#45014))
failpoint.Inject("backup-storage-error", func(val failpoint.Value) {
msg := val.(string)
logutil.CL(ctx).Debug("failpoint backup-storage-error injected.", zap.String("msg", msg))
Expand Down
70 changes: 70 additions & 0 deletions br/pkg/task/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,76 @@ func RunRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf
return errors.Trace(err)
}

<<<<<<< HEAD
=======
if client.IsIncremental() {
// don't support checkpoint for the ddl restore
log.Info("the incremental snapshot restore doesn't support checkpoint mode, so unuse checkpoint.")
cfg.UseCheckpoint = false
}

restoreSchedulers, schedulersConfig, err := restorePreWork(ctx, client, mgr, true)
if err != nil {
return errors.Trace(err)
}

schedulersRemovable := false
defer func() {
// don't reset pd scheduler if checkpoint mode is used and restored is not finished
if cfg.UseCheckpoint && !schedulersRemovable {
log.Info("skip removing pd schehduler for next retry")
return
}
log.Info("start to remove the pd scheduler")
// run the post-work to avoid being stuck in the import
// mode or emptied schedulers.
restorePostWork(ctx, client, restoreSchedulers)
log.Info("finish removing pd scheduler")
}()

var checkpointSetWithTableID map[int64]map[string]struct{}
if cfg.UseCheckpoint {
taskName := cfg.generateSnapshotRestoreTaskName(client.GetClusterID(ctx))
sets, restoreSchedulersConfigFromCheckpoint, err := client.InitCheckpoint(ctx, s, taskName, schedulersConfig, cfg.UseCheckpoint)
if err != nil {
return errors.Trace(err)
}
if restoreSchedulersConfigFromCheckpoint != nil {
restoreSchedulers = mgr.MakeUndoFunctionByConfig(*restoreSchedulersConfigFromCheckpoint)
}
checkpointSetWithTableID = sets

defer func() {
// need to flush the whole checkpoint data so that br can quickly jump to
// the log kv restore step when the next retry.
log.Info("wait for flush checkpoint...")
client.WaitForFinishCheckpoint(ctx, len(cfg.FullBackupStorage) > 0 || !schedulersRemovable)
}()
}

if isFullRestore(cmdName) {
// we need check cluster is fresh every time. except restore from a checkpoint.
if client.IsFull() && len(checkpointSetWithTableID) == 0 {
if err = client.CheckTargetClusterFresh(ctx); err != nil {
return errors.Trace(err)
}
}
// todo: move this check into InitFullClusterRestore, we should move restore config into a separate package
// to avoid import cycle problem which we won't do it in this pr, then refactor this
//
// if it's point restore and reached here, then cmdName=FullRestoreCmd and len(cfg.FullBackupStorage) > 0
if cfg.WithSysTable {
client.InitFullClusterRestore(cfg.ExplicitFilter)
}
}

if client.IsFullClusterRestore() && client.HasBackedUpSysDB() {
if err = client.CheckSysTableCompatibility(mgr.GetDomain(), tables); err != nil {
return errors.Trace(err)
}
}

>>>>>>> 8c5ca7b2008 (restore: precheck cluster is empty when first time full restore (#45014))
sp := utils.BRServiceSafePoint{
BackupTS: restoreTS,
TTL: utils.DefaultBRGCSafePointTTL,
Expand Down
5 changes: 5 additions & 0 deletions br/tests/_utils/run_br
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,13 @@

set -eux

<<<<<<< HEAD
bin/br.test -test.coverprofile="$TEST_DIR/cov.$TEST_NAME.$$.out.log" DEVEL "$@" \
-L "debug" \
=======
br.test -test.coverprofile="$TEST_DIR/cov.$TEST_NAME.$$.out.log" DEVEL "$@" \
-L "info" \
>>>>>>> 8c5ca7b2008 (restore: precheck cluster is empty when first time full restore (#45014))
--ca "$TEST_DIR/certs/ca.pem" \
--cert "$TEST_DIR/certs/br.pem" \
--key "$TEST_DIR/certs/br.key"
3 changes: 3 additions & 0 deletions br/tests/br_backup_empty/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ if [ $? -ne 0 ]; then
exit 1
fi

i=1
while [ $i -le $DB_COUNT ]; do
run_sql "DROP DATABASE $DB$i;"
i=$(($i+1))
Expand All @@ -70,6 +71,7 @@ run_sql "CREATE TABLE ${DB}1.usertable1 ( \
echo "backup empty table start..."
run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/empty_table"

i=1
while [ $i -le $DB_COUNT ]; do
run_sql "DROP DATABASE $DB$i;"
i=$(($i+1))
Expand All @@ -81,6 +83,7 @@ run_br --pd $PD_ADDR restore full -s "local://$TEST_DIR/empty_table"
# insert one row to make sure table is restored.
run_sql "INSERT INTO ${DB}1.usertable1 VALUES (\"a\", \"b\");"

i=1
while [ $i -le $DB_COUNT ]; do
run_sql "DROP DATABASE $DB$i;"
i=$(($i+1))
Expand Down
1 change: 1 addition & 0 deletions br/tests/br_full_ddl/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ fi

# clear restore environment
run_sql "DROP DATABASE $DB;"
run_sql "DROP DATABASE __tidb_br_temporary_mysql;"
# restore full
echo "restore start..."
export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/pdutil/PDEnabledPauseConfig=return(true)"
Expand Down
2 changes: 1 addition & 1 deletion br/tests/br_systables/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ add_test_data() {
}

delete_test_data() {
run_sql "DROP TABLE usertest.test;"
run_sql "DROP DATABASE usertest;"
}

rollback_modify() {
Expand Down

0 comments on commit b074975

Please sign in to comment.