From 0916c21196cbfcaa6f52f399884fdce5c1a5c119 Mon Sep 17 00:00:00 2001 From: Brendan Hayward Date: Thu, 29 Oct 2020 19:53:12 +1100 Subject: [PATCH 1/3] create separate train RF command in CLI --- .../csiro/variantspark/cli/TrainRFCmd.scala | 107 ++++++++++++++++++ .../variantspark/cli/VariantSparkApp.scala | 1 + 2 files changed, 108 insertions(+) create mode 100644 src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala diff --git a/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala b/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala new file mode 100644 index 00000000..9b2cd8f5 --- /dev/null +++ b/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala @@ -0,0 +1,107 @@ +package au.csiro.variantspark.cli + +import au.csiro.pbdava.ssparkle.common.arg4j.{AppRunner, TestArgs} +import au.csiro.pbdava.ssparkle.common.utils.Logging +import au.csiro.sparkle.common.args4j.ArgsApp +import au.csiro.variantspark.utils.defRng +import au.csiro.variantspark.cli.args.{ + FeatureSourceArgs, + LabelSourceArgs, + ModelOutputArgs, + RandomForestArgs +} +import au.csiro.variantspark.cmd.EchoUtils._ +import au.csiro.variantspark.cmd.Echoable +import org.apache.commons.lang3.builder.ToStringBuilder +import org.apache.hadoop.conf.Configuration +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.tree.{RandomForest => SparkForest} +import org.apache.spark.mllib.tree.model.{RandomForestModel => SparkForestModel} +import org.apache.spark.rdd.RDD +import org.apache.spark.serializer.{JavaSerializer, SerializerInstance} +import org.kohsuke.args4j.Option + +import scala.collection._ +import scala.util.Random + +class TrainRFCmd + extends ArgsApp with LabelSourceArgs with RandomForestArgs with FeatureSourceArgs + with ModelOutputArgs with Echoable with Logging with TestArgs { + + @Option(name = "-lf", required = false, usage = "Path to label file", + aliases = Array("--label-file")) + val labelFile: String = null + + @Option(name = "-lc", required = false, usage = "Label file column name", + aliases = Array("--label-column")) + val labelColumn: String = null + + val javaSerializer = new JavaSerializer(conf) + val si: SerializerInstance = javaSerializer.newInstance() + + override def testArgs: Array[String] = + Array("-im", "file.model", "-if", "file.data", "-of", "outputpredictions.file") + + override def run(): Unit = { + implicit val hadoopConf: Configuration = sc.hadoopConfiguration + + // echo(s"Loading labels from: ${featuresFile}, column: ${featureColumn}") + val labels: List[String] = + if (featuresFile != null) { + val labFile = + spark.read.format("csv").option("header", "true").load(featuresFile) + val labCol = labFile.select(featureColumn).rdd.map(_(0)).collect.toList + labCol.map(_.toString) + } else { + // val labelSource = new CsvLabelSource(featuresFile, featureColumn) + // val labels = labelSource.getLabels(featureSource.sampleNames) + val dummyLabels = + List("blue", "brown", "black", "green", "yellow", "grey") + // val featureCount = 2000 + val featureCount = featureSource.features.count.toInt + val phenoLabelIndex = Range(0, featureCount).toList + .map(_ => dummyLabels(Random.nextInt.abs % dummyLabels.length)) + phenoLabelIndex + } + /* write map of labels to file for lookup after prediction + allows human readable labels in results + */ + val label2pt = labels.toSet.zipWithIndex.toMap + val pt2label = label2pt.map(l => l.swap) + sc.parallelize(pt2label.toSeq).saveAsTextFile(modelFile + ".labelMap") + echo(s"Loaded labels from file: 
${labels.toSet}") + echo(s"Loaded labels: ${dumpList(labels)}") + + val labPts = labels zip featureSource.features.collect map { + case (label, feat) => + LabeledPoint(label2pt(label).toDouble, feat.valueAsVector) + } + val labPtsRDD = sc.parallelize(labPts) + val catInfo = immutable.Map[Int, Int]() + val numClasses = labels.toSet.size + val numTrees = scala.Option(nTrees).getOrElse(5) + val subsetStrat = "auto" + val impurity = "gini" + val maxDepth: Int = scala.Option(rfMaxDepth).getOrElse(30) + val maxBins = 32 + val intSeed = defRng.nextLong.toInt + val sparkRFModel: SparkForestModel = SparkForest.trainClassifier(labPtsRDD, numClasses, + catInfo, numTrees, subsetStrat, impurity, scala.math.min(maxDepth, 30), maxBins, intSeed) + println("running train cmd") + logInfo("Running with params: " + ToStringBuilder.reflectionToString(this)) + echo(s"Analyzing random forest model") + echo(s"Using spark RF Model: ${sparkRFModel.toString}") + echo(s"Using labels: ${labels}") + echo(s"Loaded rows: ${dumpList(featureSource.sampleNames)}") + + if (modelFile != null) { + sparkRFModel.save(sc, modelFile) + } + } +} + +object TrainRFCmd { + def main(args: Array[String]) { + AppRunner.mains[TrainRFCmd](args) + } +} diff --git a/src/main/scala/au/csiro/variantspark/cli/VariantSparkApp.scala b/src/main/scala/au/csiro/variantspark/cli/VariantSparkApp.scala index efa957ad..d56d08ef 100644 --- a/src/main/scala/au/csiro/variantspark/cli/VariantSparkApp.scala +++ b/src/main/scala/au/csiro/variantspark/cli/VariantSparkApp.scala @@ -16,6 +16,7 @@ class VariantSparkApp extends MultiCmdApp { registerClass("analyze-rf", classOf[AnalyzeRFCmd]) registerClass("build-index", classOf[BuildVarIndexCmd]) registerClass("pdist", classOf[PairWiseDistanceCmd]) + registerClass("trainrf", classOf[TrainRFCmd]) } object VariantSparkApp { From b53491658c843e191830caa9fce6e38436af25f4 Mon Sep 17 00:00:00 2001 From: Brendan Hayward Date: Fri, 30 Oct 2020 18:40:21 +1100 Subject: [PATCH 2/3] train RF model, format cmd files --- .../csiro/variantspark/cli/AnalyzeRFCmd.scala | 2 +- .../variantspark/cli/ImportanceCmd.scala | 59 ++---- .../csiro/variantspark/cli/TrainRFCmd.scala | 171 +++++++++++------- .../variantspark/cli/VariantSparkApp.scala | 14 +- 4 files changed, 133 insertions(+), 113 deletions(-) diff --git a/src/main/scala/au/csiro/variantspark/cli/AnalyzeRFCmd.scala b/src/main/scala/au/csiro/variantspark/cli/AnalyzeRFCmd.scala index 9834f8a2..f0fe80dd 100644 --- a/src/main/scala/au/csiro/variantspark/cli/AnalyzeRFCmd.scala +++ b/src/main/scala/au/csiro/variantspark/cli/AnalyzeRFCmd.scala @@ -35,7 +35,7 @@ class AnalyzeRFCmd val inputModel: String = null @Option(name = "-ii", required = false, usage = "Path to input variable index file", - aliases = Array("--input-idnex")) + aliases = Array("--input-index")) val inputIndex: String = null @Option(name = "-ob", required = false, usage = "Path to output importance", diff --git a/src/main/scala/au/csiro/variantspark/cli/ImportanceCmd.scala b/src/main/scala/au/csiro/variantspark/cli/ImportanceCmd.scala index 2f858457..6e5d0a9a 100644 --- a/src/main/scala/au/csiro/variantspark/cli/ImportanceCmd.scala +++ b/src/main/scala/au/csiro/variantspark/cli/ImportanceCmd.scala @@ -1,53 +1,24 @@ package au.csiro.variantspark.cli +import au.csiro.pbdava.ssparkle.common.arg4j.{AppRunner, TestArgs} +import au.csiro.pbdava.ssparkle.common.utils.{CSVUtils, Logging, ReusablePrintStream, Timer} +import au.csiro.pbdava.ssparkle.spark.{SparkApp, SparkUtils} import au.csiro.sparkle.common.args4j.ArgsApp 
-import au.csiro.sparkle.cmd.CmdApp -import org.kohsuke.args4j.Option -import au.csiro.pbdava.ssparkle.common.arg4j.AppRunner -import au.csiro.pbdava.ssparkle.spark.SparkApp -import collection.JavaConverters._ -import au.csiro.variantspark.input.VCFSource -import au.csiro.variantspark.input.VCFFeatureSource -import au.csiro.variantspark.input.HashingLabelSource -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import au.csiro.variantspark.input.CsvLabelSource +import au.csiro.variantspark.algo.{RandomForest, RandomForestCallback, RandomForestParams, _} +import au.csiro.variantspark.cli.args.{ + FeatureSourceArgs, + ImportanceArgs, + ModelOutputArgs, + RandomForestArgs +} +import au.csiro.variantspark.cmd.EchoUtils._ import au.csiro.variantspark.cmd.Echoable -import au.csiro.pbdava.ssparkle.common.utils.Logging +import au.csiro.variantspark.input.CsvLabelSource +import au.csiro.variantspark.utils.{HdfsPath, defRng} import org.apache.commons.lang3.builder.ToStringBuilder -import au.csiro.variantspark.cmd.EchoUtils._ -import au.csiro.pbdava.ssparkle.common.utils.LoanUtils -import com.github.tototoshi.csv.CSVWriter -import au.csiro.pbdava.ssparkle.common.arg4j.TestArgs -import org.apache.hadoop.fs.FileSystem -import org.apache.commons.math3.stat.correlation.PearsonsCorrelation -import au.csiro.pbdava.ssparkle.spark.SparkUtils -import au.csiro.pbdava.ssparkle.common.utils.ReusablePrintStream -import au.csiro.variantspark.algo.RandomForestCallback -import au.csiro.variantspark.utils.VectorRDDFunction._ -import au.csiro.variantspark.input.CsvFeatureSource -import au.csiro.variantspark.algo.RandomForestParams -import au.csiro.variantspark.data.BoundedOrdinalVariable -import au.csiro.pbdava.ssparkle.common.utils.Timer -import au.csiro.variantspark.utils.defRng -import au.csiro.variantspark.input.ParquetFeatureSource -import au.csiro.variantspark.utils.IndexedRDDFunction._ -import java.io.ObjectOutputStream -import java.io.FileOutputStream import org.apache.hadoop.conf.Configuration -import au.csiro.variantspark.utils.HdfsPath -import au.csiro.pbdava.ssparkle.common.utils.CSVUtils -import au.csiro.variantspark.cli.args.ImportanceArgs -import au.csiro.variantspark.cli.args.RandomForestArgs -import au.csiro.variantspark.cli.args.FeatureSourceArgs -import au.csiro.variantspark.data.ContinuousVariable -import au.csiro.variantspark.algo.RandomForest -import au.csiro.variantspark.data.FeatureBuilder -import scala.reflect.ClassTag -import au.csiro.variantspark.input._ -import au.csiro.variantspark.algo._ -import au.csiro.variantspark.data.VariableType -import org.apache.spark.rdd.RDD -import au.csiro.variantspark.cli.args.ModelOutputArgs +import org.apache.hadoop.fs.FileSystem +import org.kohsuke.args4j.Option class ImportanceCmd extends ArgsApp with SparkApp with FeatureSourceArgs with ImportanceArgs with RandomForestArgs diff --git a/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala b/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala index 9b2cd8f5..9b89bddf 100644 --- a/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala +++ b/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala @@ -1,32 +1,35 @@ package au.csiro.variantspark.cli - import au.csiro.pbdava.ssparkle.common.arg4j.{AppRunner, TestArgs} -import au.csiro.pbdava.ssparkle.common.utils.Logging +import au.csiro.pbdava.ssparkle.common.utils.{Logging, Timer} +import au.csiro.pbdava.ssparkle.spark.SparkUtils import au.csiro.sparkle.common.args4j.ArgsApp -import au.csiro.variantspark.utils.defRng -import 
au.csiro.variantspark.cli.args.{
-  FeatureSourceArgs,
-  LabelSourceArgs,
-  ModelOutputArgs,
-  RandomForestArgs
+import au.csiro.variantspark.algo.{
+  DefTreeRepresentationFactory,
+  RandomForest,
+  RandomForestCallback,
+  RandomForestParams
 }
+import au.csiro.variantspark.cli.args._
 import au.csiro.variantspark.cmd.EchoUtils._
 import au.csiro.variantspark.cmd.Echoable
+import au.csiro.variantspark.input.CsvLabelSource
+import au.csiro.variantspark.utils.defRng
 import org.apache.commons.lang3.builder.ToStringBuilder
 import org.apache.hadoop.conf.Configuration
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.tree.{RandomForest => SparkForest}
-import org.apache.spark.mllib.tree.model.{RandomForestModel => SparkForestModel}
-import org.apache.spark.rdd.RDD
 import org.apache.spark.serializer.{JavaSerializer, SerializerInstance}
 import org.kohsuke.args4j.Option
 
-import scala.collection._
-import scala.util.Random
-
 class TrainRFCmd
-  extends ArgsApp with LabelSourceArgs with RandomForestArgs with FeatureSourceArgs
-    with ModelOutputArgs with Echoable with Logging with TestArgs {
+  extends ArgsApp with LabelSourceArgs with RandomForestArgs with ImportanceArgs
+    with FeatureSourceArgs with ModelOutputArgs with Echoable with Logging with TestArgs {
+
+  @Option(name = "-im", required = false, usage = "Path to input model",
+    aliases = Array("--input-model"))
+  val inputModel: String = null
+
+  @Option(name = "-ii", required = false, usage = "Path to input variable index file",
+    aliases = Array("--input-index"))
+  val inputIndex: String = null
 
   @Option(name = "-lf", required = false, usage = "Path to label file",
     aliases = Array("--label-file"))
@@ -36,6 +39,21 @@ class TrainRFCmd
     aliases = Array("--label-column"))
   val labelColumn: String = null
 
+  @Option(name = "-sr", required = false, usage = "Random seed to use (def=<random>)",
+    aliases = Array("--seed"))
+  val randomSeed: Long = defRng.nextLong
+
+  @Option(name = "-on", required = false,
+    usage = "The number of top important variables to include in output." +
+      " Use `0` for all variables. 
(def=20)", + aliases = Array("--output-n-variables")) + val nVariables: Int = 20 + + @Option(name = "-od", required = false, + usage = "Include important variables data in output file (def=no)", + aliases = Array("--output-include-data")) + val includeData: Boolean = false + val javaSerializer = new JavaSerializer(conf) val si: SerializerInstance = javaSerializer.newInstance() @@ -45,58 +63,89 @@ class TrainRFCmd override def run(): Unit = { implicit val hadoopConf: Configuration = sc.hadoopConfiguration - // echo(s"Loading labels from: ${featuresFile}, column: ${featureColumn}") - val labels: List[String] = - if (featuresFile != null) { - val labFile = - spark.read.format("csv").option("header", "true").load(featuresFile) - val labCol = labFile.select(featureColumn).rdd.map(_(0)).collect.toList - labCol.map(_.toString) - } else { - // val labelSource = new CsvLabelSource(featuresFile, featureColumn) - // val labels = labelSource.getLabels(featureSource.sampleNames) - val dummyLabels = - List("blue", "brown", "black", "green", "yellow", "grey") - // val featureCount = 2000 - val featureCount = featureSource.features.count.toInt - val phenoLabelIndex = Range(0, featureCount).toList - .map(_ => dummyLabels(Random.nextInt.abs % dummyLabels.length)) - phenoLabelIndex + val dataLoadingTimer = Timer() + echo(s"Loaded rows: ${dumpList(featureSource.sampleNames)}") + + val inputData = DefTreeRepresentationFactory + .createRepresentation(featureSource.features.zipWithIndex()) + .cache() + val totalVariables = inputData.count() + + val variablePreview = inputData.map(_.label).take(defaultPreviewSize).toList + echo(s"Loaded variables: ${dumpListHead(variablePreview, totalVariables)}," + + s" took: ${dataLoadingTimer.durationInSec}") + echoDataPreview() + + echo(s"Loading labels from: ${featuresFile}, column: ${featureColumn}") + val labelSource = new CsvLabelSource(featuresFile, featureColumn) + val labels = labelSource.getLabels(featureSource.sampleNames) + echo(s"Loaded labels: ${dumpList(labels.toList)}") + echo(s"Training random forest with trees: ${nTrees} (batch size: ${rfBatchSize})") + echo(s"Random seed is: ${randomSeed}") + val treeBuildingTimer = Timer() + val rf: RandomForest = new RandomForest(RandomForestParams(oob = rfEstimateOob, + seed = randomSeed, maxDepth = rfMaxDepth, minNodeSize = rfMinNodeSize, + bootstrap = !rfSampleNoReplacement, subsample = rfSubsampleFraction, + nTryFraction = if (rfMTry > 0) rfMTry.toDouble / totalVariables else rfMTryFraction, + correctImpurity = correctImportance, airRandomSeed = airRandomSeed)) + val trainingData = inputData + + implicit val rfCallback: RandomForestCallback = new RandomForestCallback() { + var totalTime: Long = 0L + var totalTrees: Int = 0 + override def onParamsResolved(actualParams: RandomForestParams) { + echo(s"RF Params: ${actualParams}") + echo(s"RF Params mTry: ${(actualParams.nTryFraction * totalVariables).toLong}") + } + override def onTreeComplete(nTrees: Int, oobError: Double, elapsedTimeMs: Long) { + totalTime += elapsedTimeMs + totalTrees += nTrees + echo( + s"Finished trees: ${totalTrees}, current oobError: ${oobError}," + + s" totalTime: ${totalTime / 1000.0} s, " + + s" avg timePerTree: ${totalTime / (1000.0 * totalTrees)} s") + echo( + s"Last build trees: ${nTrees}, time: ${elapsedTimeMs} ms," + + s" timePerTree: ${elapsedTimeMs / nTrees} ms") } - /* write map of labels to file for lookup after prediction - allows human readable labels in results - */ - val label2pt = labels.toSet.zipWithIndex.toMap - val pt2label = 
label2pt.map(l => l.swap) - sc.parallelize(pt2label.toSeq).saveAsTextFile(modelFile + ".labelMap") - echo(s"Loaded labels from file: ${labels.toSet}") - echo(s"Loaded labels: ${dumpList(labels)}") - - val labPts = labels zip featureSource.features.collect map { - case (label, feat) => - LabeledPoint(label2pt(label).toDouble, feat.valueAsVector) } - val labPtsRDD = sc.parallelize(labPts) - val catInfo = immutable.Map[Int, Int]() - val numClasses = labels.toSet.size - val numTrees = scala.Option(nTrees).getOrElse(5) - val subsetStrat = "auto" - val impurity = "gini" - val maxDepth: Int = scala.Option(rfMaxDepth).getOrElse(30) - val maxBins = 32 - val intSeed = defRng.nextLong.toInt - val sparkRFModel: SparkForestModel = SparkForest.trainClassifier(labPtsRDD, numClasses, - catInfo, numTrees, subsetStrat, impurity, scala.math.min(maxDepth, 30), maxBins, intSeed) + + val result = rf.batchTrainTyped(trainingData, labels, nTrees, rfBatchSize) + + echo( + s"Random forest oob accuracy: ${result.oobError}," + + s" took: ${treeBuildingTimer.durationInSec} s") + + // build index for names + val allImportantVariables = result.normalizedVariableImportance(importanceNormalizer).toSeq + val topImportantVariables = limitVariables(allImportantVariables, nVariables) + val topImportantVariableIndexes = topImportantVariables.map(_._1).toSet + + val variablesToIndex = if (requiresFullIndex) { + allImportantVariables.map(_._1).toSet + } else { + topImportantVariableIndexes + } + + val index = SparkUtils.withBroadcast(sc)(variablesToIndex) { br_indexes => + inputData + .filter(t => br_indexes.value.contains(t.index)) + .map(f => (f.index, f.label)) + .collectAsMap() + } + + val varImportance = topImportantVariables.map({ + case (i, importance) => (index(i), importance) + }) + println("running train cmd") logInfo("Running with params: " + ToStringBuilder.reflectionToString(this)) echo(s"Analyzing random forest model") - echo(s"Using spark RF Model: ${sparkRFModel.toString}") + echo(s"Using spark RF Model: ${result.toString}") echo(s"Using labels: ${labels}") echo(s"Loaded rows: ${dumpList(featureSource.sampleNames)}") - if (modelFile != null) { - sparkRFModel.save(sc, modelFile) - } + saveModel(result, index.toMap) } } diff --git a/src/main/scala/au/csiro/variantspark/cli/VariantSparkApp.scala b/src/main/scala/au/csiro/variantspark/cli/VariantSparkApp.scala index d56d08ef..8e7c10a1 100644 --- a/src/main/scala/au/csiro/variantspark/cli/VariantSparkApp.scala +++ b/src/main/scala/au/csiro/variantspark/cli/VariantSparkApp.scala @@ -4,18 +4,18 @@ import au.csiro.sparkle.cmd.MultiCmdApp import au.csiro.pbdava.ssparkle.common.arg4j.AppRunner class VariantSparkApp extends MultiCmdApp { - registerClass("test", classOf[TestCmd]) + registerClass("analyze-rf", classOf[AnalyzeRFCmd]) + registerClass("build-index", classOf[BuildVarIndexCmd]) + registerClass("convert", classOf[ConvertCmd]) registerClass("filter", classOf[FilterCmd]) - registerClass("importance", classOf[ImportanceCmd]) - registerClass("null-importance", classOf[NullImportanceCmd]) - registerClass("importance-ca", classOf[CochranArmanCmd]) registerClass("gen-features", classOf[GenerateFeaturesCmd]) registerClass("gen-labels", classOf[GenerateLabelsCmd]) registerClass("gen-labels-with-noise", classOf[GenerateLabelsNoiseCmd]) - registerClass("convert", classOf[ConvertCmd]) - registerClass("analyze-rf", classOf[AnalyzeRFCmd]) - registerClass("build-index", classOf[BuildVarIndexCmd]) + registerClass("importance", classOf[ImportanceCmd]) + 
registerClass("importance-ca", classOf[CochranArmanCmd])
+  registerClass("null-importance", classOf[NullImportanceCmd])
   registerClass("pdist", classOf[PairWiseDistanceCmd])
+  registerClass("test", classOf[TestCmd])
   registerClass("trainrf", classOf[TrainRFCmd])
 }

From 2b1f69d26509394eb666007e639dce5dbd7885b4 Mon Sep 17 00:00:00 2001
From: Brendan Hayward
Date: Thu, 5 Nov 2020 18:31:32 +1100
Subject: [PATCH 3/3] include correct args, imports in trainRF cmd

---
 .../scala/au/csiro/variantspark/cli/TrainRFCmd.scala | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala b/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala
index 9b89bddf..d3483828 100644
--- a/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala
+++ b/src/main/scala/au/csiro/variantspark/cli/TrainRFCmd.scala
@@ -1,7 +1,7 @@
 package au.csiro.variantspark.cli
 import au.csiro.pbdava.ssparkle.common.arg4j.{AppRunner, TestArgs}
 import au.csiro.pbdava.ssparkle.common.utils.{Logging, Timer}
-import au.csiro.pbdava.ssparkle.spark.SparkUtils
+import au.csiro.pbdava.ssparkle.spark.{SparkApp, SparkUtils}
 import au.csiro.sparkle.common.args4j.ArgsApp
 import au.csiro.variantspark.algo.{
   DefTreeRepresentationFactory,
   RandomForest,
   RandomForestCallback,
   RandomForestParams
@@ -20,9 +20,13 @@ import org.apache.spark.serializer.{JavaSerializer, SerializerInstance}
 import org.kohsuke.args4j.Option
 
 class TrainRFCmd
-  extends ArgsApp with LabelSourceArgs with RandomForestArgs with ImportanceArgs
+  extends ArgsApp with SparkApp with LabelSourceArgs with RandomForestArgs with ImportanceArgs
   with FeatureSourceArgs with ModelOutputArgs with Echoable with Logging with TestArgs {
 
+  @Option(name = "-of", required = false, usage = "Path to output file (def = stdout)",
+    aliases = Array("--output-file"))
+  val outputFile: String = null
+
   @Option(name = "-im", required = false, usage = "Path to input model",
     aliases = Array("--input-model"))
   val inputModel: String = null
@@ -146,6 +150,7 @@ class TrainRFCmd
     echo(s"Loaded rows: ${dumpList(featureSource.sampleNames)}")
 
     saveModel(result, index.toMap)
+    echo(s"inputFile: ${inputFile}")
   }
 }
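
Reviewer afterword, not part of the patches above: once patch 3/3 is applied, the
new subcommand is wired into the MultiCmdApp dispatcher, so it should be reachable
by name. Below is a minimal smoke-test sketch, under two assumptions: that
VariantSparkApp.main dispatches on its first argument (as the
registerClass("trainrf", ...) call suggests), and that the file paths, which simply
mirror TrainRFCmd.testArgs, point at real inputs; they are illustrative
placeholders, not project fixtures.

  import au.csiro.variantspark.cli.VariantSparkApp

  // Hypothetical driver object (not part of the patch series).
  // "trainrf" selects TrainRFCmd via the MultiCmdApp registration;
  // the remaining flags mirror TrainRFCmd.testArgs.
  object TrainRFSmokeTest {
    def main(args: Array[String]): Unit = {
      VariantSparkApp.main(
        Array("trainrf", "-im", "file.model", "-if", "file.data",
          "-of", "outputpredictions.file"))
    }
  }

The same invocation should also work through the project's shell launcher
(e.g. `variant-spark trainrf ...`), since MultiCmdApp keys the first token
against the names registered above.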