Skip to content

Commit

Permalink
- Auto-Tuning - Introduction of basic stats regarding file counts/siz…
Browse files Browse the repository at this point in the history
…es for large tables. We'll make adjustments to DISTRIBUTE BY and Tez Groupings to provide more efficient/balanced migrations with better/more optimized file sizes after migration for migrations using SQL. #53

- AAdditional table filters (`-tfs|--table-filter-size-limit` and `-tfp|--table-filter-partition-count-limit`) that check a tables data size and partition count limits can also be applied to narrow the range of tables you'll process.  #55
- Add property to tables migrated with "STORAGE_MIGRATION" to identify and filter them out from future runs. #56
- `-cto|--compress-text-output` option and additional session level settings using basic stats.
- HDP3 scenario that doesn't support MANAGEDLOCATION element in database properties. #52

- AVRO Schema Only Fix..  #58
- Cleanup messaging around legacy config settings.
- Fix/Added `dbRegEx` command line parameter: #57

Configuration Breaking Change.  If you see note about `A configuration element is no longer valid, progress.  Please remove the element from the configuration yaml and try again.` with `Caused by: com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException: Unrecognized field "tblRegEx"`, please remove the properties `dbRegEx`, `tblRegEx` and `tblExcludeRegEx` from the config yaml.
  • Loading branch information
dstreev committed Jun 1, 2023
1 parent 0ad4c4f commit d6096c9
Show file tree
Hide file tree
Showing 45 changed files with 1,841 additions and 1,279 deletions.
586 changes: 358 additions & 228 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

<groupId>com.cloudera.utils.hadoop</groupId>
<artifactId>hms-mirror</artifactId>
<version>1.5.5.1-SNAPSHOT</version>
<version>1.5.6.0-SNAPSHOT</version>
<name>hms-mirror</name>

<url>https://github.com/cloudera-labs/hms_mirror</url>
Expand Down
33 changes: 33 additions & 0 deletions src/main/java/com/cloudera/utils/hadoop/hms/Context.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package com.cloudera.utils.hadoop.hms;

import com.cloudera.utils.hadoop.hms.mirror.Config;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class Context {
private static final Context instance = new Context();
private List<String> supportFileSystems = new ArrayList<String>(Arrays.asList(
"hdfs","ofs","s3","s3a","s3n","wasb","adls","gf"
));
private Config config = null;

private Context() {};

public static Context getInstance() {
return instance;
}

public Config getConfig() {
return config;
}

public void setConfig(Config config) {
this.config = config;
}

public List<String> getSupportFileSystems() {
return supportFileSystems;
}
}
105 changes: 79 additions & 26 deletions src/main/java/com/cloudera/utils/hadoop/hms/Mirror.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import com.cloudera.utils.hadoop.hms.util.Protect;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import com.google.common.collect.Sets;
import org.apache.commons.cli.*;
Expand Down Expand Up @@ -224,14 +225,18 @@ public long init(String[] args) {
System.out.println("Using Config: " + configFile);
String yamlCfgFile = FileUtils.readFileToString(cfgFile, StandardCharsets.UTF_8);
config = mapper.readerFor(Config.class).readValue(yamlCfgFile);
Context.getInstance().setConfig(config);
} catch (UnrecognizedPropertyException upe) {
System.out.println("\n>>>>> READ THIS BEFORE CONTINUING. Minor configuration fix REQUIRED. <<<<<");
throw new RuntimeException("\nThere may have been a breaking change in the configuration since the previous " +
"release. Review the note below and remove the 'Unrecognized field' from the configuration and try " +
"again.\n\n", upe);
} catch (Throwable t) {
// Look for yaml update errors.
if (t.toString().contains("MismatchedInputException")) {
throw new RuntimeException("The format of the 'config' yaml file MAY HAVE CHANGED from the last release. Please make a copy and run " +
"'-su|--setup' again to recreate in the new format", t);
} else {
// config = new Config();
// config.getErrors().set(CONFIGURATION_REMOVED_OR_INVALID.getCode(), t.getMessage());
LOG.error(t);
throw new RuntimeException("A configuration element is no longer valid, progress. Please remove the element from the configuration yaml and try again.", t);
}
Expand Down Expand Up @@ -389,9 +394,23 @@ public long init(String[] args) {
}
}

// Skip Optimizations.
if (cmd.hasOption("so")) {
config.getOptimization().setSkip(Boolean.TRUE);
}
// Sort Dynamic Partitions
if (cmd.hasOption("sdpi")) {
config.getOptimization().setSortDynamicPartitionInserts(Boolean.TRUE);
}
// AutoTune.
if (cmd.hasOption("at")) {
config.getOptimization().setAutoTune(Boolean.TRUE);
}

//Compress TEXT Output.
if (cmd.hasOption("cto")) {
config.getOptimization().setCompressTextOutput(Boolean.TRUE);
}

if (cmd.hasOption("po")) {
// property overrides.
Expand All @@ -414,11 +433,6 @@ public long init(String[] args) {
config.getOptimization().getOverrides().setPropertyOverridesStr(overrides, Overrides.Side.RIGHT);
}

// Skip Optimizations.
if (cmd.hasOption("so")) {
config.getOptimization().setSkip(Boolean.TRUE);
}

if (cmd.hasOption("mnn")) {
config.setMigratedNonNative(Boolean.TRUE);
}
Expand Down Expand Up @@ -606,15 +620,23 @@ public long init(String[] args) {
}

if (cmd.hasOption("dbRegEx")) {
config.setDbRegEx(cmd.getOptionValue("dbRegEx"));
config.getFilter().setDbRegEx(cmd.getOptionValue("dbRegEx"));
}

if (cmd.hasOption("tf")) {
config.setTblRegEx(cmd.getOptionValue("tf"));
config.getFilter().setTblRegEx(cmd.getOptionValue("tf"));
}

if (cmd.hasOption("tef")) {
config.setTblExcludeRegEx(cmd.getOptionValue("tef"));
config.getFilter().setTblExcludeRegEx(cmd.getOptionValue("tef"));
}

if (cmd.hasOption("tfs")) {
config.getFilter().setTblSizeLimit(Long.parseLong(cmd.getOptionValue("tfs")));
}

if (cmd.hasOption("tfp")) {
config.getFilter().setTblPartitionLimit(Integer.parseInt(cmd.getOptionValue("tfp")));
}

}
Expand Down Expand Up @@ -665,10 +687,6 @@ public long init(String[] args) {
config.setDatabases(databases);
}

if (config.getDatabases() == null || config.getDatabases().length == 0) {
throw new RuntimeException("No databases specified");
}

if (cmd.hasOption("e") && config.getDataStrategy() != DataStrategy.DUMP) {
if (cmd.hasOption("accept")) {
config.getAcceptance().setSilentOverride(Boolean.TRUE);
Expand Down Expand Up @@ -1102,6 +1120,7 @@ public void doit() {
}

reportFile.write(mdReportStr);
reportFile.flush();
reportFile.close();
// Convert to HTML
List<Extension> extensions = Arrays.asList(TablesExtension.create(), YamlFrontMatterExtension.create());
Expand Down Expand Up @@ -1352,10 +1371,33 @@ private Options getOptions() {
propertyRightOverrides.setArgs(100);
options.addOption(propertyRightOverrides);

OptionGroup optimizationsGroup = new OptionGroup();
optimizationsGroup.setRequired(Boolean.FALSE);

Option skipOptimizationsOption = new Option("so", "skip-optimizations", false,
"Skip any optimizations during data movement, like dynamic sorting or distribute by");
skipOptimizationsOption.setRequired(Boolean.FALSE);
options.addOption(skipOptimizationsOption);
optimizationsGroup.addOption(skipOptimizationsOption);

Option sdpiOption = new Option("sdpi", "sort-dynamic-partition-inserts", false,
"Used to set `hive.optimize.sort.dynamic.partition` in TEZ for optimal partition inserts. " +
"When not specified, will use prescriptive sorting by adding 'DISTRIBUTE BY' to transfer SQL. " +
"default: false");
sdpiOption.setRequired(Boolean.FALSE);
optimizationsGroup.addOption(sdpiOption);

Option autoTuneOption = new Option("at", "auto-tune", false,
"Auto-tune Session Settings for SELECT's and DISTRIBUTION for Partition INSERT's.");
autoTuneOption.setRequired(Boolean.FALSE);
optimizationsGroup.addOption(autoTuneOption);

options.addOptionGroup(optimizationsGroup);

Option compressTextOutputOption = new Option("cto", "compress-test-output", false,
"Data movement (SQL/STORAGE_MIGRATION) of TEXT based file formats will be compressed in the new " +
"table.");
compressTextOutputOption.setRequired(Boolean.FALSE);
options.addOption(compressTextOutputOption);

Option forceExternalLocationOption = new Option("fel", "force-external-location", false,
"Under some conditions, the LOCATION element for EXTERNAL tables is removed (ie: -rdl). " +
Expand Down Expand Up @@ -1452,13 +1494,6 @@ private Options getOptions() {
mnnoOption.setRequired(Boolean.FALSE);
migrationOptionsGroup.addOption(mnnoOption);

Option sdpiOption = new Option("sdpi", "sort-dynamic-partition-inserts", false,
"Used to set `hive.optimize.sort.dynamic.partition` in TEZ for optimal partition inserts. " +
"When not specified, will use prescriptive sorting by adding 'DISTRIBUTE BY' to transfer SQL. " +
"default: false");
sdpiOption.setRequired(Boolean.FALSE);
options.addOption(sdpiOption);

Option viewOption = new Option("v", "views-only", false,
"Process VIEWs ONLY");
viewOption.setRequired(false);
Expand Down Expand Up @@ -1617,6 +1652,11 @@ private Options getOptions() {
dbOption.setArgName("databases");
dbOption.setArgs(100);

Option dbRegExOption = new Option("dbRegEx", "database-regex", true,
"RegEx of Database to include in process.");
dbRegExOption.setRequired(Boolean.FALSE);
dbRegExOption.setArgName("regex");

Option helpOption = new Option("h", "help", false,
"Help");
helpOption.setRequired(Boolean.FALSE);
Expand All @@ -1643,6 +1683,7 @@ private Options getOptions() {

OptionGroup dbGroup = new OptionGroup();
dbGroup.addOption(dbOption);
dbGroup.addOption(dbRegExOption);
dbGroup.addOption(helpOption);
dbGroup.addOption(setupOption);
dbGroup.addOption(pwOption);
Expand Down Expand Up @@ -1694,6 +1735,19 @@ private Options getOptions() {

options.addOptionGroup(filterGroup);

Option tableSizeFilterOption = new Option("tfs", "table-filter-size-limit", true,
"Filter tables OUT that are above the indicated size. Expressed in MB");
tableSizeFilterOption.setRequired(Boolean.FALSE);
tableSizeFilterOption.setArgName("size MB");
options.addOption(tableSizeFilterOption);

Option tablePartitionCountFilterOption = new Option("tfp", "table-filter-partition-count-limit", true,
"Filter partition tables OUT that are have more than specified here. Non Partitioned table aren't " +
"filtered.");
tablePartitionCountFilterOption.setRequired(Boolean.FALSE);
tablePartitionCountFilterOption.setArgName("partition-count");
options.addOption(tablePartitionCountFilterOption);

Option cfgOption = new Option("cfg", "config", true,
"Config with details for the HMS-Mirror. Default: $HOME/.hms-mirror/cfg/default.yaml");
cfgOption.setRequired(false);
Expand All @@ -1703,19 +1757,19 @@ private Options getOptions() {
return options;
}

protected Boolean setupSql(Environment environment, List<Pair> sqlPairList) {
public Boolean setupSql(Environment environment, List<Pair> sqlPairList) {
Boolean rtn = Boolean.TRUE;
rtn = config.getCluster(environment).runClusterSql(sqlPairList);
return rtn;
}

protected long setupSqlLeft(String[] args, List<Pair> sqlPairList) {
public long setupSqlLeft(String[] args, List<Pair> sqlPairList) {
long rtn = 0l;
rtn = setupSql(args, sqlPairList, null);
return rtn;
}

protected long setupSqlRight(String[] args, List<Pair> sqlPairList) {
public long setupSqlRight(String[] args, List<Pair> sqlPairList) {
long rtn = 0l;
rtn = setupSql(args, null, sqlPairList);
return rtn;
Expand Down Expand Up @@ -1806,7 +1860,6 @@ public long go(String[] args) {
} else {
returnCode = -1;
}
System.err.println(e.getMessage());
e.printStackTrace();
System.err.println("\nSee log for stack trace ($HOME/.hms-mirror/logs)");
} finally {
Expand Down
Loading

0 comments on commit d6096c9

Please sign in to comment.