From 2ced8c1d1f19a1d98274b5386f19b10e5a4dbe5b Mon Sep 17 00:00:00 2001 From: Varant Zanoyan Date: Tue, 5 Dec 2023 14:12:08 -0800 Subject: [PATCH 1/7] WIP --- .../aggregator/base/TimedAggregators.scala | 9 +- .../aggregator/windowing/HopsAggregator.scala | 6 +- .../windowing/SawtoothOnlineAggregator.scala | 8 +- .../aggregator/test/ApproxDistinctTest.scala | 6 +- .../test/ApproxPercentilesTest.scala | 6 +- .../aggregator/test/RowAggregatorTest.scala | 6 +- .../test/SawtoothAggregatorTest.scala | 6 +- .../aggregator/test/VarianceTest.scala | 6 +- .../scala/ai/chronon/api/Extensions.scala | 33 +++- .../ai/chronon/api/ParametricMacro.scala | 7 +- .../ai/chronon/api/ThriftJsonCodec.scala | 5 +- .../api/test/DataTypeConversionTest.scala | 4 +- .../ai/chronon/flink/AsyncKVStoreWriter.scala | 11 +- .../scala/ai/chronon/flink/AvroCodecFn.scala | 4 +- .../chronon/flink/SparkExpressionEvalFn.scala | 5 +- log.py | 53 ++++++ .../main/scala/ai/chronon/online/Api.scala | 18 +- .../ai/chronon/online/DataStreamBuilder.scala | 13 +- .../scala/ai/chronon/online/Fetcher.scala | 26 ++- .../scala/ai/chronon/online/FetcherBase.scala | 17 +- .../ai/chronon/online/MetadataStore.scala | 27 +-- .../ai/chronon/online/TileCodecTest.scala | 6 +- .../online/test/DataStreamBuilderTest.scala | 4 +- project/FolderCleaner.scala | 4 +- project/ThriftGen.scala | 8 +- .../scala/ai/chronon/spark/Analyzer.scala | 49 +++--- .../ai/chronon/spark/BootstrapInfo.scala | 29 +-- .../scala/ai/chronon/spark/Comparison.scala | 8 +- .../main/scala/ai/chronon/spark/Driver.scala | 86 +++++++-- .../scala/ai/chronon/spark/Extensions.scala | 24 ++- .../scala/ai/chronon/spark/FastHashing.scala | 5 +- .../main/scala/ai/chronon/spark/GroupBy.scala | 39 +++-- .../ai/chronon/spark/GroupByUpload.scala | 13 +- .../main/scala/ai/chronon/spark/Join.scala | 22 ++- .../scala/ai/chronon/spark/JoinBase.scala | 40 +++-- .../scala/ai/chronon/spark/JoinUtils.scala | 16 +- .../main/scala/ai/chronon/spark/KvRdd.scala | 8 +- .../scala/ai/chronon/spark/LabelJoin.scala | 30 ++-- .../ai/chronon/spark/LocalDataLoader.scala | 12 +- .../ai/chronon/spark/LogFlattenerJob.scala | 15 +- .../ai/chronon/spark/MetadataExporter.scala | 8 +- .../chronon/spark/SparkSessionBuilder.scala | 4 +- .../scala/ai/chronon/spark/StagingQuery.scala | 17 +- .../scala/ai/chronon/spark/TableUtils.scala | 49 +++--- .../chronon/spark/stats/CompareBaseJob.scala | 6 +- .../ai/chronon/spark/stats/CompareJob.scala | 21 ++- .../chronon/spark/stats/ConsistencyJob.scala | 24 +-- .../ai/chronon/spark/stats/SummaryJob.scala | 14 +- .../ai/chronon/spark/streaming/GroupBy.scala | 8 +- .../spark/streaming/JoinSourceRunner.scala | 31 ++-- .../spark/streaming/KafkaStreamBuilder.scala | 8 +- .../spark/streaming/StreamingStats.scala | 6 +- .../spark/streaming/TopicChecker.scala | 7 +- .../ai/chronon/spark/test/AnalyzerTest.scala | 6 +- .../spark/test/ChainingFetcherTest.scala | 24 +-- .../ai/chronon/spark/test/CompareTest.scala | 6 +- .../spark/test/FeatureWithLabelJoinTest.scala | 28 +-- .../chronon/spark/test/FetchStatsTest.scala | 10 +- .../ai/chronon/spark/test/FetcherTest.scala | 35 ++-- .../ai/chronon/spark/test/GroupByTest.scala | 28 +-- .../spark/test/GroupByUploadTest.scala | 6 +- .../chronon/spark/test/InMemoryKvStore.scala | 7 +- .../chronon/spark/test/InMemoryStream.scala | 6 +- .../ai/chronon/spark/test/JoinTest.scala | 94 +++++----- .../ai/chronon/spark/test/LabelJoinTest.scala | 52 +++--- .../spark/test/MetadataExporterTest.scala | 14 +- .../scala/ai/chronon/spark/test/MockApi.scala | 11 +- 
.../ai/chronon/spark/test/MutationsTest.scala | 22 +-- .../chronon/spark/test/StagingQueryTest.scala | 54 +++--- .../chronon/spark/test/StatsComputeTest.scala | 20 ++- .../spark/test/bootstrap/DerivationTest.scala | 34 ++-- .../test/bootstrap/LogBootstrapTest.scala | 12 +- .../test/bootstrap/TableBootstrapTest.scala | 12 +- test/HopsAggregator.scala | 165 ++++++++++++++++++ test/log.py | 68 ++++++++ 75 files changed, 1100 insertions(+), 511 deletions(-) create mode 100644 log.py create mode 100644 test/HopsAggregator.scala create mode 100644 test/log.py diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala b/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala index e4ef7dda4..1bf360b44 100644 --- a/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala +++ b/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala @@ -16,12 +16,14 @@ package ai.chronon.aggregator.base +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.TimeTuple.typ import ai.chronon.api._ import java.util object TimeTuple extends Ordering[util.ArrayList[Any]] { + private val logger = LoggerFactory.getLogger(getClass) type typ = util.ArrayList[Any] def `type`(inputType: DataType): DataType = @@ -53,6 +55,7 @@ object TimeTuple extends Ordering[util.ArrayList[Any]] { } abstract class TimeOrdered(inputType: DataType) extends TimedAggregator[Any, TimeTuple.typ, Any] { + private val logger = LoggerFactory.getLogger(getClass) override def outputType: DataType = inputType override def irType: DataType = TimeTuple.`type`(inputType) @@ -72,6 +75,7 @@ abstract class TimeOrdered(inputType: DataType) extends TimedAggregator[Any, Tim } class First(inputType: DataType) extends TimeOrdered(inputType) { + private val logger = LoggerFactory.getLogger(getClass) //mutating override def update( ir: util.ArrayList[Any], @@ -92,6 +96,7 @@ class First(inputType: DataType) extends TimeOrdered(inputType) { } class Last(inputType: DataType) extends TimeOrdered(inputType) { + private val logger = LoggerFactory.getLogger(getClass) //mutating override def update( ir: util.ArrayList[Any], @@ -119,6 +124,7 @@ class OrderByLimitTimed( limit: Int, ordering: Ordering[TimeTuple.typ] ) extends TimedAggregator[Any, util.ArrayList[TimeTuple.typ], util.ArrayList[Any]] { + private val logger = LoggerFactory.getLogger(getClass) type Container = util.ArrayList[TimeTuple.typ] private val minHeap = new MinHeap[TimeTuple.typ](limit, ordering) @@ -129,7 +135,7 @@ class OrderByLimitTimed( override final def prepare(input: Any, ts: Long): Container = { // val gson = new Gson() val tuple = TimeTuple.make(ts, input) -// println(s"init: ${gson.toJson(tuple)}") +// logger.info(s"init: ${gson.toJson(tuple)}") val arr = new Container() arr.add(tuple) arr @@ -145,6 +151,7 @@ class OrderByLimitTimed( minHeap.merge(state1, state2) override def finalize(state: Container): util.ArrayList[Any] = { + private val logger = LoggerFactory.getLogger(getClass) val sorted = minHeap.sort(state) val result = new util.ArrayList[Any](state.size()) val it = sorted.iterator diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala index dea5faa20..bc66cff36 100644 --- a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala +++ b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala @@ -16,6 +16,7 @@ package 
ai.chronon.aggregator.windowing +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.RowAggregator import ai.chronon.aggregator.windowing.HopsAggregator._ import ai.chronon.api.Extensions.{AggregationOps, AggregationsOps, WindowOps, WindowUtils} @@ -31,6 +32,7 @@ import java.util // t class HopsAggregatorBase(aggregations: Seq[Aggregation], inputSchema: Seq[(String, DataType)], resolution: Resolution) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) @transient lazy val rowAggregator = new RowAggregator(inputSchema, aggregations.flatMap(_.unWindowed)) @@ -93,6 +95,7 @@ class HopsAggregator(minQueryTs: Long, inputSchema: Seq[(String, DataType)], resolution: Resolution) extends HopsAggregatorBase(aggregations, inputSchema, resolution) { + private val logger = LoggerFactory.getLogger(getClass) val leftBoundaries: Array[Option[Long]] = { // Nikhil is pretty confident we won't call this when aggregations is empty @@ -135,7 +138,7 @@ class HopsAggregator(minQueryTs: Long, .zip(readableLeftBounds) .map { case (hop, left) => s"$hop->$left" } .mkString(", ") - println(s"""Left bounds: $readableHopsToBoundsMap + logger.info(s"""Left bounds: $readableHopsToBoundsMap |minQueryTs = ${TsUtils.toStr(minQueryTs)}""".stripMargin) result } @@ -154,6 +157,7 @@ class HopsAggregator(minQueryTs: Long, } object HopsAggregator { + private val logger = LoggerFactory.getLogger(getClass) // [IR1, IR2, IR3,.... IRN, ts_millis_long] // hops have timestamps attached to the end type HopIr = Array[Any] diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala index b36951281..6af83a824 100644 --- a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala +++ b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala @@ -16,6 +16,7 @@ package ai.chronon.aggregator.windowing +import org.slf4j.LoggerFactory import scala.collection.Seq import ai.chronon.api.Extensions.{AggregationPartOps, WindowOps} import ai.chronon.api._ @@ -31,6 +32,7 @@ class SawtoothOnlineAggregator(val batchEndTs: Long, inputSchema: Seq[(String, DataType)], resolution: Resolution, tailBufferMillis: Long) { + private val logger = LoggerFactory.getLogger(getClass) // logically, batch response is arranged like so // sum-90d => sum_ir_88d, [(sum_ir_1d, ts)] -> 1d is the hopSize for 90d @@ -46,10 +48,10 @@ class SawtoothOnlineAggregator(val batchEndTs: Long, val batchTailTs: Array[Option[Long]] = tailTs(batchEndTs) - println(s"Batch End: ${TsUtils.toStr(batchEndTs)}") - println("Window Tails: ") + logger.info(s"Batch End: ${TsUtils.toStr(batchEndTs)}") + logger.info("Window Tails: ") for (i <- windowMappings.indices) { - println(s" ${windowMappings(i).aggregationPart.outputColumnName} -> ${batchTailTs(i).map(TsUtils.toStr)}") + logger.info(s" ${windowMappings(i).aggregationPart.outputColumnName} -> ${batchTailTs(i).map(TsUtils.toStr)}") } def update(batchIr: BatchIr, row: Row): BatchIr = update(batchEndTs, batchIr, row, batchTailTs) diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala index 2416a894f..4b82e5eb2 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala @@ -16,11 +16,13 @@ 
package ai.chronon.aggregator.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.ApproxDistinctCount import junit.framework.TestCase import org.junit.Assert._ class ApproxDistinctTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) def testErrorBound(uniques: Int, errorBound: Int, lgK: Int): Unit = { val uniqueElems = 1 to uniques val duplicates = uniqueElems ++ uniqueElems ++ uniqueElems @@ -28,7 +30,7 @@ class ApproxDistinctTest extends TestCase { val ir = counter.prepare(duplicates.head) duplicates.tail.foreach { elem => counter.update(ir, elem) } val estimated = counter.finalize(ir) - // println(s"estimated - $estimated, actual - $uniques, bound - $errorBound") + // logger.info(s"estimated - $estimated, actual - $uniques, bound - $errorBound") assertTrue(Math.abs(estimated - uniques) < errorBound) } @@ -46,7 +48,7 @@ class ApproxDistinctTest extends TestCase { } val ir = irList.reduceLeft(counter.merge) val estimated = counter.finalize(ir) - // println(s"estimated - $estimated, actual - $uniques, bound - $errorBound") + // logger.info(s"estimated - $estimated, actual - $uniques, bound - $errorBound") assertTrue(Math.abs(estimated - uniques) < errorBound) } diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala index 9799991e0..a1ed5672c 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala @@ -16,6 +16,7 @@ package ai.chronon.aggregator.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.ApproxPercentiles import ai.chronon.aggregator.row.StatsGenerator import com.yahoo.sketches.kll.KllFloatsSketch @@ -25,6 +26,7 @@ import org.junit.Assert._ import scala.util.Random class ApproxPercentilesTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) def testBasicImpl(nums: Int, slide: Int, k: Int, percentiles: Array[Double], errorPercent: Float): Unit = { val sorted = (0 to nums).map(_.toFloat) val elems = Random.shuffle(sorted.toList).toArray @@ -42,7 +44,7 @@ class ApproxPercentilesTest extends TestCase { val expected = result.indices.map(_ * step).map(_.toFloat).toArray val diffs = result.indices.map(i => Math.abs(result(i) - expected(i))) val errorMargin = (nums.toFloat * errorPercent) / 100.0 - println(s""" + logger.info(s""" |sketch size: ${merged.getSerializedSizeBytes} |result: ${result.toVector} |result size: ${result.size} @@ -66,7 +68,7 @@ class ApproxPercentilesTest extends TestCase { sample1.map(sketch1.update) sample2.map(sketch2.update) val drift = StatsGenerator.PSIKllSketch(sketch1.toByteArray, sketch2.toByteArray).asInstanceOf[Double] - println(s"PSI drift: $drift") + logger.info(s"PSI drift: $drift") drift } diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala index 58c25ce6a..853f85074 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala @@ -16,6 +16,7 @@ package ai.chronon.aggregator.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.RowAggregator import ai.chronon.api._ import junit.framework.TestCase @@ -25,6 +26,7 @@ import java.util import scala.collection.JavaConverters._ class 
TestRow(val fieldsSeq: Any*)(tsIndex: Int = 0) extends Row { + private val logger = LoggerFactory.getLogger(getClass) val fields: util.List[Any] = new java.util.ArrayList[Any](fieldsSeq.asJava) override val length: Int = fields.size() @@ -39,16 +41,18 @@ class TestRow(val fieldsSeq: Any*)(tsIndex: Int = 0) extends Row { override def mutationTs: Long = timeStamp - def print(): Unit = println(fieldsSeq) + def print(): Unit = logger.info(fieldsSeq) def set(index: Int, any: Any): Unit = fields.set(index, any) } object TestRow { + private val logger = LoggerFactory.getLogger(getClass) def apply(inputsArray: Any*): TestRow = new TestRow(inputsArray: _*)() } class RowAggregatorTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) def testUpdate(): Unit = { val rows = List( TestRow(1L, 4, 5.0f, "A", Seq(5, 3, 4), Seq("D", "A", "B", "A"), Map("A" -> 1, "B" -> 2)), diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala index 73dfec18e..47e83c7be 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala @@ -16,6 +16,7 @@ package ai.chronon.aggregator.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.RowAggregator import ai.chronon.aggregator.test.SawtoothAggregatorTest.sawtoothAggregate import ai.chronon.aggregator.windowing._ @@ -30,6 +31,7 @@ import scala.collection.mutable import scala.collection.Seq class Timer { + private val logger = LoggerFactory.getLogger(getClass) var ts: Long = System.currentTimeMillis() @@ -38,12 +40,13 @@ class Timer { // TODO: Write this out into a file checked into git // or incorporate proper benchmarks def publish(name: String, reset: Boolean = true): Unit = { - println(s"${name.padTo(25, ' ')} ${System.currentTimeMillis() - ts} ms") + logger.info(s"${name.padTo(25, ' ')} ${System.currentTimeMillis() - ts} ms") if (reset) ts = System.currentTimeMillis() } } class SawtoothAggregatorTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) def testTailAccuracy(): Unit = { val timer = new Timer @@ -175,6 +178,7 @@ class SawtoothAggregatorTest extends TestCase { } object SawtoothAggregatorTest { + private val logger = LoggerFactory.getLogger(getClass) // the result is irs in sorted order of queries // with head real-time accuracy and tail hop accuracy // NOTE: This provides a sketch for a distributed topology diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala index 71cdedf6b..d1a831250 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala @@ -16,11 +16,13 @@ package ai.chronon.aggregator.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.Variance import junit.framework.TestCase import org.junit.Assert._ class VarianceTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) def mean(elems: Seq[Double]): Double = elems.sum / elems.length def naive(elems: Seq[Double]): Double = { @@ -52,8 +54,8 @@ class VarianceTest extends TestCase { val nums = (0 until cardinality).map { _ => min + math.random * (max - min) } val naiveResult = naive(nums) val welfordResult = welford(nums) - println(s"naive $naiveResult - 
welford $welfordResult - sum of squares ${sumOfSquares(nums)}") - println((naiveResult - welfordResult) / naiveResult) + logger.info(s"naive $naiveResult - welford $welfordResult - sum of squares ${sumOfSquares(nums)}") + logger.info((naiveResult - welfordResult) / naiveResult) assertTrue((naiveResult - welfordResult) / naiveResult < 0.0000001) } diff --git a/api/src/main/scala/ai/chronon/api/Extensions.scala b/api/src/main/scala/ai/chronon/api/Extensions.scala index 76c781d5a..9884475d1 100644 --- a/api/src/main/scala/ai/chronon/api/Extensions.scala +++ b/api/src/main/scala/ai/chronon/api/Extensions.scala @@ -16,6 +16,7 @@ package ai.chronon.api +import org.slf4j.LoggerFactory import ai.chronon.api.DataModel._ import ai.chronon.api.Operation._ import com.fasterxml.jackson.core.`type`.TypeReference @@ -31,8 +32,10 @@ import scala.util.ScalaJavaConversions.{IteratorOps, ListOps, MapOps} import scala.util.{Failure, Success, Try} object Extensions { + private val logger = LoggerFactory.getLogger(getClass) implicit class TimeUnitOps(timeUnit: TimeUnit) { + private val logger = LoggerFactory.getLogger(getClass) def str: String = timeUnit match { case TimeUnit.HOURS => "h" @@ -47,6 +50,7 @@ object Extensions { } implicit class OperationOps(operation: Operation) { + private val logger = LoggerFactory.getLogger(getClass) def isSimple: Boolean = operation match { case Operation.FIRST | Operation.LAST | Operation.LAST_K | Operation.FIRST_K => false @@ -58,6 +62,7 @@ object Extensions { } implicit class WindowOps(window: Window) { + private val logger = LoggerFactory.getLogger(getClass) private def unbounded: Boolean = window.length == Int.MaxValue || window.length <= 0 def str: String = @@ -70,6 +75,7 @@ object Extensions { } object WindowUtils { + private val logger = LoggerFactory.getLogger(getClass) val Unbounded: Window = new Window(Int.MaxValue, TimeUnit.DAYS) val Hour: Window = new Window(1, TimeUnit.HOURS) val Day: Window = new Window(1, TimeUnit.DAYS) @@ -93,6 +99,7 @@ object Extensions { } implicit class MetadataOps(metaData: MetaData) { + private val logger = LoggerFactory.getLogger(getClass) def cleanName: String = metaData.name.sanitize def outputTable = s"${metaData.outputNamespace}.${metaData.cleanName}" @@ -150,6 +157,7 @@ object Extensions { // one per output column - so single window // not exposed to users implicit class AggregationPartOps(aggregationPart: AggregationPart) { + private val logger = LoggerFactory.getLogger(getClass) def getInt(arg: String, default: Option[Int] = None): Int = { val argOpt = Option(aggregationPart.argMap) @@ -177,6 +185,7 @@ object Extensions { } implicit class AggregationOps(aggregation: Aggregation) { + private val logger = LoggerFactory.getLogger(getClass) // one agg part per bucket per window // unspecified windows are translated to one unbounded window @@ -232,6 +241,9 @@ object Extensions { case class UnpackedAggregations(perBucket: Array[AggregationPart], perWindow: Array[WindowMapping]) object UnpackedAggregations { + private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) def from(aggregations: Seq[Aggregation]): UnpackedAggregations = { var counter = 0 val perBucket = new mutable.ArrayBuffer[AggregationPart] @@ -277,6 +289,7 @@ object Extensions { } implicit class AggregationsOps(aggregations: Seq[Aggregation]) { + private val logger = LoggerFactory.getLogger(getClass) def hasTimedAggregations: Boolean = 
aggregations.exists(_.operation match { case LAST_K | FIRST_K | LAST | FIRST => true @@ -300,6 +313,7 @@ object Extensions { } implicit class SourceOps(source: Source) { + private val logger = LoggerFactory.getLogger(getClass) def dataModel: DataModel = { assert(source.isSetEntities || source.isSetEvents || source.isSetJoinSource, "Source type is not specified") if (source.isSetEntities) Entities @@ -412,6 +426,7 @@ object Extensions { } implicit class GroupByOps(groupBy: GroupBy) extends GroupBy(groupBy) { + private val logger = LoggerFactory.getLogger(getClass) def maxWindow: Option[Window] = { val allWindowsOpt = Option(groupBy.aggregations) .flatMap(_.toScala.toSeq.allWindowsOpt) @@ -532,6 +547,7 @@ object Extensions { case class QueryParts(selects: Option[Seq[String]], wheres: Seq[String]) def buildQueryParts(query: Query): QueryParts = { + private val logger = LoggerFactory.getLogger(getClass) val selects = query.getQuerySelects val timeColumn = Option(query.timeColumn).getOrElse(Constants.TimeColumn) @@ -612,12 +628,14 @@ object Extensions { } implicit class StringOps(string: String) { + private val logger = LoggerFactory.getLogger(getClass) def sanitize: String = Option(string).map(_.replaceAll("[^a-zA-Z0-9_]", "_")).orNull def cleanSpec: String = string.split("/").head } implicit class ExternalSourceOps(externalSource: ExternalSource) extends ExternalSource(externalSource) { + private val logger = LoggerFactory.getLogger(getClass) private def schemaNames(schema: TDataType): Array[String] = schemaFields(schema).map(_.name) private def schemaFields(schema: TDataType): Array[StructField] = @@ -634,6 +652,7 @@ object Extensions { } object KeyMappingHelper { + private val logger = LoggerFactory.getLogger(getClass) // key mapping is defined as {left_col1: right_col1}, on the right there can be two keys [right_col1, right_col2] // Left is implicitly assumed to have right_col2 // We need to convert a map {left_col1: a, right_col2: b, irrelevant_col: c} into {right_col1: a, right_col2: b} @@ -649,6 +668,7 @@ object Extensions { } implicit class ExternalPartOps(externalPart: ExternalPart) extends ExternalPart(externalPart) { + private val logger = LoggerFactory.getLogger(getClass) lazy val fullName: String = Constants.ExternalPrefix + "_" + Option(externalPart.prefix).map(_ + "_").getOrElse("") + @@ -690,6 +710,7 @@ object Extensions { } implicit class JoinPartOps(joinPart: JoinPart) extends JoinPart(joinPart) { + private val logger = LoggerFactory.getLogger(getClass) lazy val fullPrefix = (Option(prefix) ++ Some(groupBy.getMetaData.cleanName)).mkString("_") lazy val leftToRight: Map[String, String] = rightToLeft.map { case (key, value) => value -> key } @@ -719,6 +740,7 @@ object Extensions { } implicit class LabelPartOps(val labelPart: LabelPart) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) def leftKeyCols: Array[String] = { labelPart.labels.toScala .flatMap { @@ -747,6 +769,7 @@ object Extensions { } implicit class BootstrapPartOps(val bootstrapPart: BootstrapPart) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) /** * Compress the info such that the hash can be stored at record and @@ -778,11 +801,13 @@ object Extensions { } object JoinOps { + private val logger = LoggerFactory.getLogger(getClass) private val identifierRegex: Pattern = Pattern.compile("[a-zA-Z_][a-zA-Z0-9_]*") def isIdentifier(s: String): Boolean = identifierRegex.matcher(s).matches() } implicit class JoinOps(val join: Join) extends Serializable { + 
private val logger = LoggerFactory.getLogger(getClass) // all keys as they should appear in left that are being used on right def leftKeyCols: Array[String] = { join.joinParts.toScala @@ -923,7 +948,7 @@ object Extensions { } .filter(_.nonEmpty) .mkString(joiner) - println(s"Generated join left side skew filter:\n $result") + logger.info(s"Generated join left side skew filter:\n $result") result } } @@ -943,7 +968,7 @@ object Extensions { .mkString(joiner) if (result.nonEmpty) { - println(s"Generated join part skew filter for ${joinPart.groupBy.metaData.name}:\n $result") + logger.info(s"Generated join part skew filter for ${joinPart.groupBy.metaData.name}:\n $result") Some(result) } else None } @@ -985,6 +1010,7 @@ object Extensions { } implicit class StringsOps(strs: Iterable[String]) { + private val logger = LoggerFactory.getLogger(getClass) def pretty: String = { if (strs.nonEmpty) "\n " + strs.mkString(",\n ") + "\n" @@ -996,6 +1022,7 @@ object Extensions { } implicit class QueryOps(query: Query) { + private val logger = LoggerFactory.getLogger(getClass) def setupsSeq: Seq[String] = { Option(query.setups) .map( @@ -1008,6 +1035,7 @@ object Extensions { } implicit class ThrowableOps(throwable: Throwable) { + private val logger = LoggerFactory.getLogger(getClass) def traceString: String = { val sw = new StringWriter() val pw = new PrintWriter(sw) @@ -1017,6 +1045,7 @@ object Extensions { } implicit class DerivationOps(derivations: List[Derivation]) { + private val logger = LoggerFactory.getLogger(getClass) lazy val derivationsContainStar: Boolean = derivations.iterator.exists(_.name == "*") lazy val derivationsWithoutStar: List[Derivation] = derivations.filterNot(_.name == "*") lazy val areDerivationsRenameOnly: Boolean = derivationsWithoutStar.forall(d => JoinOps.isIdentifier(d.expression)) diff --git a/api/src/main/scala/ai/chronon/api/ParametricMacro.scala b/api/src/main/scala/ai/chronon/api/ParametricMacro.scala index 637e948ff..46750785b 100644 --- a/api/src/main/scala/ai/chronon/api/ParametricMacro.scala +++ b/api/src/main/scala/ai/chronon/api/ParametricMacro.scala @@ -16,10 +16,12 @@ package ai.chronon.api +import org.slf4j.LoggerFactory import scala.collection.mutable // takes a map of macro names and functions and applies the functions on macro arguments case class ParametricMacro(value: String, func: Map[String, String] => String) { + private val logger = LoggerFactory.getLogger(getClass) private val pattern = s"""\\{\\{\\s*$value(\\([\\s0-9A-Za-z_.,=]*\\))*\\s*}}""".r def replace(str: String): String = { @@ -38,7 +40,7 @@ case class ParametricMacro(value: String, func: Map[String, String] => String) { argSeq.tail :+ (argSeq.head + "," + token) } } - println(parsed) + logger.info(parsed) parsed.map(_.split("=").map(_.trim)).map(x => x(0) -> x(1)).toMap } val result = func(argMap.getOrElse(Map.empty[String, String])) @@ -51,10 +53,11 @@ case class ParametricMacro(value: String, func: Map[String, String] => String) { } object ParametricMacro { + private val logger = LoggerFactory.getLogger(getClass) def main(args: Array[String]): Unit = { val mc = ParametricMacro("something", { x => "st:" + x.keys.mkString("/") + "|" + x.values.mkString("/") }) val str = "something nothing-{{ something( a_1=b,, 3.1, c=d) }}-something after-{{ thing:a1=b1 }}{{ something }}" val replaced = mc.replace(str) - println(replaced) + logger.info(replaced) } } diff --git a/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala b/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala index 
79c8e8f24..0942d1422 100644 --- a/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala +++ b/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala @@ -16,6 +16,7 @@ package ai.chronon.api +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.StringsOps import com.fasterxml.jackson.databind.{DeserializationFeature, JsonNode, ObjectMapper} import org.apache.thrift.protocol.{TCompactProtocol, TSimpleJSONProtocol} @@ -28,6 +29,7 @@ import scala.reflect.ClassTag import scala.util.ScalaJavaConversions.ListOps object ThriftJsonCodec { + private val logger = LoggerFactory.getLogger(getClass) def serializer = new TSerializer(new TSimpleJSONProtocol.Factory()) @@ -63,7 +65,7 @@ object ThriftJsonCodec { base } catch { case e: Exception => { - println("Failed to deserialize using compact protocol, trying Json.") + logger.info("Failed to deserialize using compact protocol, trying Json.") fromJsonStr(new String(bytes), check = false, base.getClass) } } @@ -88,6 +90,7 @@ object ThriftJsonCodec { } def fromJsonFile[T <: TBase[_, _]: Manifest: ClassTag](fileName: String, check: Boolean): T = { val src = fromFile(fileName) val jsonStr = try src.mkString diff --git a/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala b/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala index 09cfe648c..1640d8f98 100644 --- a/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala +++ b/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala @@ -16,6 +16,7 @@ package ai.chronon.api.test +import org.slf4j.LoggerFactory import ai.chronon.api._ import org.apache.thrift.TSerializer import org.apache.thrift.protocol.TSimpleJSONProtocol @@ -23,6 +24,7 @@ import org.junit.Assert._ import org.junit.Test class DataTypeConversionTest { + private val logger = LoggerFactory.getLogger(getClass) @Test def testDataTypeToThriftAndBack(): Unit = { // build some complex type @@ -45,7 +47,7 @@ class DataTypeConversionTest { // serialize with TSimpleJson - this is what python code will do val jsonSerializer = new TSerializer(new TSimpleJSONProtocol.Factory()) val json = new String(jsonSerializer.serialize(thriftType)) - println(json) + logger.info(json) val reversedTType = ThriftJsonCodec.fromJsonStr[TDataType](json, check = true, classOf[TDataType]) val reversed = DataType.fromTDataType(reversedTType) diff --git a/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala b/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala index dafb7664e..81e82142b 100644 --- a/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala +++ b/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala @@ -1,5 +1,6 @@ package ai.chronon.flink +import org.slf4j.LoggerFactory import ai.chronon.online.{Api, KVStore} import ai.chronon.online.KVStore.PutRequest import org.apache.flink.configuration.Configuration @@ -16,6 +17,8 @@ import scala.util.{Failure, Success} case class WriteResponse(putRequest: PutRequest, status: Boolean) object AsyncKVStoreWriter { + private val logger = LoggerFactory.getLogger(getClass) private val kvStoreConcurrency = 10 private val defaultTimeoutMillis = 1000L @@ -45,6 +48,7 @@ object AsyncKVStoreWriter { * This was moved to flink-rpc-akka in Flink 1.16 and made private, so we reproduce the direct execution context here */ private class DirectExecutionContext extends ExecutionContext { + private val logger =
LoggerFactory.getLogger(getClass) override def execute(runnable: Runnable): Unit = runnable.run() @@ -64,6 +68,7 @@ object AsyncKVStoreWriter { */ class AsyncKVStoreWriter(onlineImpl: Api, featureGroupName: String) extends RichAsyncFunction[PutRequest, WriteResponse] { + private val logger = LoggerFactory.getLogger(getClass) @transient private var kvStore: KVStore = _ @@ -88,7 +93,7 @@ class AsyncKVStoreWriter(onlineImpl: Api, featureGroupName: String) } override def timeout(input: PutRequest, resultFuture: ResultFuture[WriteResponse]): Unit = { - println(s"Timed out writing to KV Store for object: $input") + logger.info(s"Timed out writing to KV Store for object: $input") errorCounter.inc() resultFuture.complete(util.Arrays.asList[WriteResponse](WriteResponse(input, status = false))) } @@ -102,7 +107,7 @@ class AsyncKVStoreWriter(onlineImpl: Api, featureGroupName: String) successCounter.inc() } else { errorCounter.inc() - println(s"Failed to write to KVStore for object: $input") + logger.info(s"Failed to write to KVStore for object: $input") } resultFuture.complete(util.Arrays.asList[WriteResponse](WriteResponse(input, status = succeeded))) case Failure(exception) => @@ -110,7 +115,7 @@ class AsyncKVStoreWriter(onlineImpl: Api, featureGroupName: String) // in the KVStore - we log the exception and skip the object to // not fail the app errorCounter.inc() - println(s"Caught exception writing to KVStore for object: $input - $exception") + logger.info(s"Caught exception writing to KVStore for object: $input - $exception") resultFuture.complete(util.Arrays.asList[WriteResponse](WriteResponse(input, status = false))) } } diff --git a/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala b/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala index d10bb26cc..db626ed8d 100644 --- a/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala +++ b/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala @@ -1,5 +1,6 @@ package ai.chronon.flink +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.GroupByOps import ai.chronon.api.{Constants, DataModel, Query, StructType => ChrononStructType} import ai.chronon.online.{AvroConversions, GroupByServingInfoParsed} @@ -19,6 +20,7 @@ import scala.jdk.CollectionConverters._ */ case class AvroCodecFn[T](groupByServingInfoParsed: GroupByServingInfoParsed) extends RichFlatMapFunction[Map[String, Any], PutRequest] { + private val logger = LoggerFactory.getLogger(getClass) @transient protected var avroConversionErrorCounter: Counter = _ @@ -86,7 +88,7 @@ case class AvroCodecFn[T](groupByServingInfoParsed: GroupByServingInfoParsed) case e: Exception => // To improve availability, we don't rethrow the exception. We just drop the event // and track the errors in a metric. If there are too many errors we'll get alerted/paged. 
- println(s"Error converting to Avro bytes - $e") + logger.info(s"Error converting to Avro bytes - $e") avroConversionErrorCounter.inc() } diff --git a/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala b/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala index 1bcec1f6a..439eb91af 100644 --- a/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala +++ b/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala @@ -1,5 +1,6 @@ package ai.chronon.flink +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.{GroupByOps, MetadataOps} import ai.chronon.api.{Constants, GroupBy, Query, StructType => ChrononStructType} import ai.chronon.online.{CatalystUtil, SparkConversions} @@ -27,6 +28,8 @@ import scala.jdk.CollectionConverters.{asScalaBufferConverter, mapAsScalaMapConv * @tparam T The type of the input data. */ class SparkExpressionEvalFn[T](encoder: Encoder[T], groupBy: GroupBy) extends RichFlatMapFunction[T, Map[String, Any]] { + private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) private val query: Query = groupBy.streamingSource.get.getEvents.query @@ -100,7 +103,7 @@ class SparkExpressionEvalFn[T](encoder: Encoder[T], groupBy: GroupBy) extends Ri case e: Exception => // To improve availability, we don't rethrow the exception. We just drop the event // and track the errors in a metric. If there are too many errors we'll get alerted/paged. - println(s"Error evaluating Spark expression - $e") + logger.info(s"Error evaluating Spark expression - $e") exprEvalErrorCounter.inc() } } diff --git a/log.py b/log.py new file mode 100644 index 000000000..37226b62c --- /dev/null +++ b/log.py @@ -0,0 +1,53 @@ +import os +import re + +def process_file(file_path): + with open(file_path, 'r') as file: + lines = file.readlines() + + # Check if the file contains println + contains_println = any('println' in line for line in lines) + if not contains_println: + return # No need to modify the file + + # Prepare the import statement and logger instance + import_statement = 'import org.slf4j.LoggerFactory\n' + logger_instance = ' private val logger = LoggerFactory.getLogger(getClass)\n' + + # Determine where to insert the import + import_index = next((i for i, line in enumerate(lines) if line.startswith('import')), 0) + if import_index != 0 or not any(line.startswith('import org.slf4j.LoggerFactory') for line in lines): + lines.insert(import_index, import_statement) + import_index += 1 + + # Regex to match class or object definitions + object_or_class_regex = re.compile(r'\b(object|class)\s+\w+') + + # Insert logger instance after the opening brace of each class or object + for i in range(len(lines)): + if object_or_class_regex.search(lines[i]): + # Find the opening brace + brace_index = i + while brace_index < len(lines) and '{' not in lines[brace_index]: + brace_index += 1 + if brace_index < len(lines): + lines.insert(brace_index + 1, logger_instance) + + # Replace println with logger.info + updated_content = ''.join(lines).replace('println', 'logger.info') + + # Write the updated content + with open(file_path, 'w') as file: + file.write(updated_content) + +def search_and_replace(directory): + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.scala'): + process_file(os.path.join(root, file)) + +# Run from the current directory +current_directory = os.getcwd() +search_and_replace(current_directory) + +print("Replacement complete.") diff --git 
a/online/src/main/scala/ai/chronon/online/Api.scala b/online/src/main/scala/ai/chronon/online/Api.scala index 78342a659..2258832e0 100644 --- a/online/src/main/scala/ai/chronon/online/Api.scala +++ b/online/src/main/scala/ai/chronon/online/Api.scala @@ -16,6 +16,7 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.api.{Constants, StructType} import ai.chronon.online.KVStore.{GetRequest, GetResponse, PutRequest} import org.apache.spark.sql.SparkSession @@ -27,11 +28,15 @@ import scala.concurrent.{Await, ExecutionContext, Future} import scala.util.{Failure, Success, Try} object KVStore { + private val logger = LoggerFactory.getLogger(getClass) // a scan request essentially for the keyBytes // afterTsMillis - is used to limit the scan to more recent data case class GetRequest(keyBytes: Array[Byte], dataset: String, afterTsMillis: Option[Long] = None) case class TimedValue(bytes: Array[Byte], millis: Long) case class GetResponse(request: GetRequest, values: Try[Seq[TimedValue]]) { + private val logger = LoggerFactory.getLogger(getClass) def latest: Try[TimedValue] = values.map(_.maxBy(_.millis)) } case class PutRequest(keyBytes: Array[Byte], valueBytes: Array[Byte], dataset: String, tsMillis: Option[Long] = None) @@ -40,6 +45,7 @@ object KVStore { // the main system level api for key value storage // used for streaming writes, batch bulk uploads & fetching trait KVStore { + private val logger = LoggerFactory.getLogger(getClass) implicit val executionContext: ExecutionContext = FlexibleExecutionContext.buildExecutionContext def create(dataset: String): Unit @@ -69,7 +75,7 @@ trait KVStore { .map(_.head) .recover { case e: java.util.NoSuchElementException => - println( + logger.info( s"Failed request against ${request.dataset} check the related task to the upload of the dataset (GroupByUpload or MetadataUpload)") throw e } @@ -123,6 +129,10 @@ case class LoggableResponseBase64(keyBase64: String, schemaHash: String) abstract class StreamDecoder extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) def decode(bytes: Array[Byte]): Mutation def schema: StructType } @@ -132,6 +142,7 @@ trait StreamBuilder { } object ExternalSourceHandler { + private val logger = LoggerFactory.getLogger(getClass) private[ExternalSourceHandler] val executor = FlexibleExecutionContext.buildExecutionContext } @@ -140,6 +151,8 @@ object ExternalSourceHandler { // There is a Java Friendly Handler that extends this and handles conversions // see: [[ai.chronon.online.JavaExternalSourceHandler]] abstract class ExternalSourceHandler extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) implicit lazy val executionContext: ExecutionContext = ExternalSourceHandler.executor def fetch(requests: Seq[Fetcher.Request]): Future[Seq[Fetcher.Response]] } @@ -147,6 +160,9 @@ abstract class ExternalSourceHandler extends Serializable { // the implementer of this class should take a single argument, a scala map of string to string // chronon framework will construct this object with user conf supplied via CLI abstract class Api(userConf: Map[String, String]) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) lazy val fetcher: Fetcher = { if (fetcherObj == null) fetcherObj = buildFetcher() diff --git a/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala b/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala index ed819c2ba..cc73317eb 100644 --- a/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala +++ b/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala @@ -16,6 +16,7 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.{Constants, DataModel} import ai.chronon.api.DataModel.DataModel @@ -27,6 +28,8 @@ import scala.util.{Failure, Success, Try} case class TopicInfo(name: String, topicType: String, params: Map[String, String]) object TopicInfo { + private val logger = LoggerFactory.getLogger(getClass) // default topic type is kafka // kafka://topic_name/schema=my_schema/host=X/port=Y should parse into TopicInfo(topic_name, kafka, {schema: my_schema, host: X, port Y}) def parse(topic: String): TopicInfo = { @@ -48,6 +51,7 @@ object TopicInfo { } case class DataStream(df: DataFrame, partitions: Int, topicInfo: TopicInfo) { + private val logger = LoggerFactory.getLogger(getClass) // apply a query to a given data stream def apply(query: api.Query, keys: Seq[String] = null, dataModel: DataModel = DataModel.Events): DataStream = { @@ -55,9 +59,9 @@ case class DataStream(df: DataFrame, partitions: Int, topicInfo: TopicInfo) { Option(query.setups).map(_.toScala.map { setup => Try(df.sparkSession.sql(setup)) match { case Failure(ex) => - println(s"[Failure] Setup command: ($setup) failed with exception: ${ex.toString}") + logger.info(s"[Failure] Setup command: ($setup) failed with exception: ${ex.toString}") ex.printStackTrace(System.out) - case Success(value) => println(s"[SUCCESS] Setup command: $setup") + case Success(value) => logger.info(s"[SUCCESS] Setup command: $setup") } }) @@ -71,12 +75,13 @@ case class DataStream(df: DataFrame, partitions: Int, topicInfo: TopicInfo) { case DataModel.Events => Map.empty }) val selectsOption: Option[Map[String, String]] = for { selectMap <- Option(query.selects).map(_.toScala.toMap) keyMap = Option(keys).map(_.map(k => k -> k).toMap).getOrElse(Map.empty) } yield (keyMap ++ selectMap ++ timeSelects) val selectClauses = selectsOption.map { _.map { case (name, expr) => s"($expr) AS `$name`" }.toSeq } - println(s"Applying select clauses: $selectClauses") + logger.info(s"Applying select clauses: $selectClauses") val selectedDf = selectClauses.map { selects => df.selectExpr(selects: _*) }.getOrElse(df) // enrich where clauses @@ -93,7 +98,7 @@ case class DataStream(df: DataFrame, partitions: Int, topicInfo: TopicInfo) { val baseWheres = Option(query.wheres).map(_.toScala).getOrElse(Seq.empty[String]) val whereClauses = baseWheres ++ atLeastOneKeyIsPresent :+ timeIsPresent - println(s"Applying where clauses: $whereClauses") + logger.info(s"Applying where clauses: $whereClauses") val filteredDf = whereClauses.foldLeft(selectedDf)(_.where(_)) DataStream(filteredDf, partitions, topicInfo) } diff --git a/online/src/main/scala/ai/chronon/online/Fetcher.scala b/online/src/main/scala/ai/chronon/online/Fetcher.scala index 57ea5b0ed..38fa4093b 100644 --- a/online/src/main/scala/ai/chronon/online/Fetcher.scala +++
b/online/src/main/scala/ai/chronon/online/Fetcher.scala @@ -16,6 +16,7 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.{ColumnAggregator, StatsGenerator} import ai.chronon.api import ai.chronon.api.Constants.UTF8 @@ -35,6 +36,7 @@ import scala.concurrent.Future import scala.util.{Failure, Success, Try} object Fetcher { + private val logger = LoggerFactory.getLogger(getClass) case class Request(name: String, keys: Map[String, AnyRef], atMillis: Option[Long] = None, @@ -48,6 +50,13 @@ object Fetcher { case class ResponseWithContext(request: Request, derivedValues: Map[String, AnyRef], baseValues: Map[String, AnyRef]) { + private val logger = LoggerFactory.getLogger(getClass) def combinedValues: Map[String, AnyRef] = baseValues ++ derivedValues } case class ColumnSpec(groupByName: String, columnName: String, prefix: Option[String], keyMapping: Option[Map[String, AnyRef]]) def logResponseStats(response: Response, context: Metrics.Context): Unit = { val responseMap = response.values.get var exceptions = 0 var nulls = 0 @@ -80,6 +90,8 @@ class Fetcher(val kvStore: KVStore, debug: Boolean = false, val externalSourceRegistry: ExternalSourceRegistry = null) extends FetcherBase(kvStore, metaDataSet, timeoutMillis, debug) { + private val logger = LoggerFactory.getLogger(getClass) def buildJoinCodec(joinConf: api.Join): JoinCodec = { val keyFields = new mutable.LinkedHashSet[StructField] @@ -161,8 +173,8 @@ class Fetcher(val kvStore: KVStore, internalResponses.zip(externalResponses).map { case (internalResponse, externalResponse) => if (debug) { - println(internalResponse.values.get.keys.toSeq) - println(externalResponse.values.get.keys.toSeq) + logger.info(internalResponse.values.get.keys.toSeq.toString) + logger.info(externalResponse.values.get.keys.toSeq.toString) } val cleanInternalRequest = internalResponse.request.copy(context = None) assert( @@ -264,10 +276,10 @@ class Fetcher(val kvStore: KVStore, } if (debug) { - println(s"Logging ${resp.request.keys} : ${hash % 100000}: $samplePercent") + logger.info(s"Logging ${resp.request.keys} : ${hash % 100000}: $samplePercent") val gson = new Gson() val valuesFormatted = values.map { case (k, v) => s"$k -> ${gson.toJson(v)}" }.mkString(", ") - println(s"""Sampled join fetch + logger.info(s"""Sampled join fetch |Key Map: ${resp.request.keys} |Value Map: [${valuesFormatted}] |""".stripMargin) @@ -291,7 +303,7 @@ class Fetcher(val kvStore: KVStore, context.distribution("logging_request.overall.latency.millis", System.currentTimeMillis() - ts)) if (debug) { - println(s"Logged data with schema_hash ${codec.loggingSchemaHash}") + logger.info(s"schema data logged successfully with schema_hash ${enc.loggingSchemaHash}") } } } @@ -300,7 +312,7 @@ class Fetcher(val kvStore: KVStore, // to handle GroupByServingInfo staleness that results in encoding failure getJoinCodecs.refresh(resp.request.name) joinContext.foreach(_.incrementException(exception)) - println(s"logging failed due to ${exception.traceString}") + logger.info(s"logging failed due to ${exception.traceString}") } Response(resp.request,
Success(resp.derivedValues)) } @@ -410,7 +422,7 @@ class Fetcher(val kvStore: KVStore, if (logFunc != null) { logFunc.accept(controlEvent) if (debug) { - println(s"schema data logged successfully with schema_hash ${enc.loggingSchemaHash}") + logger.info(s"schema data logged successfully with schema_hash ${enc.loggingSchemaHash}") } } } diff --git a/online/src/main/scala/ai/chronon/online/FetcherBase.scala b/online/src/main/scala/ai/chronon/online/FetcherBase.scala index dd845464e..b79d9de12 100644 --- a/online/src/main/scala/ai/chronon/online/FetcherBase.scala +++ b/online/src/main/scala/ai/chronon/online/FetcherBase.scala @@ -16,6 +16,7 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.ColumnAggregator import ai.chronon.aggregator.windowing import ai.chronon.aggregator.windowing.{FinalBatchIr, SawtoothOnlineAggregator, TsUtils} @@ -42,6 +43,7 @@ class FetcherBase(kvStore: KVStore, timeoutMillis: Long = 10000, debug: Boolean = false) extends MetadataStore(kvStore, metaDataSet, timeoutMillis) { + private val logger = LoggerFactory.getLogger(getClass) private case class GroupByRequestMeta( groupByServingInfoParsed: GroupByServingInfoParsed, @@ -61,6 +63,7 @@ class FetcherBase(kvStore: KVStore, overallLatency: Long, context: Metrics.Context, totalResponseValueBytes: Int): Map[String, AnyRef] = { + private val logger = LoggerFactory.getLogger(getClass) val latestBatchValue = batchResponsesTry.map(_.maxBy(_.millis)) val servingInfo = latestBatchValue.map(timedVal => updateServingInfo(timedVal.millis, oldServingInfo)).getOrElse(oldServingInfo) @@ -87,7 +90,7 @@ class FetcherBase(kvStore: KVStore, case DataModel.Entities => servingInfo.mutationValueAvroCodec } if (batchBytes == null && (streamingResponses == null || streamingResponses.isEmpty)) { - if (debug) println("Both batch and streaming data are null") + if (debug) logger.info("Both batch and streaming data are null") null } else { val streamingRows: Array[Row] = streamingResponses.iterator @@ -103,7 +106,7 @@ class FetcherBase(kvStore: KVStore, val output = aggregator.lambdaAggregateFinalized(batchIr, streamingRows.iterator, queryTimeMs, mutations) if (debug) { val gson = new Gson() - println(s""" + logger.info(s""" |batch ir: ${gson.toJson(batchIr)} |streamingRows: ${gson.toJson(streamingRows)} |batchEnd in millis: ${servingInfo.batchEndTsMillis} @@ -139,14 +142,14 @@ class FetcherBase(kvStore: KVStore, groupByServingInfo: GroupByServingInfoParsed): GroupByServingInfoParsed = { val name = groupByServingInfo.groupBy.metaData.name if (batchEndTs > groupByServingInfo.batchEndTsMillis) { - println(s"""$name's value's batch timestamp of $batchEndTs is + logger.info(s"""$name's value's batch timestamp of $batchEndTs is |ahead of schema timestamp of ${groupByServingInfo.batchEndTsMillis}. |Forcing an update of schema.""".stripMargin) getGroupByServingInfo .force(name) .recover { case ex: Throwable => - println( + logger.info( s"Couldn't update GroupByServingInfo of $name due to ${ex.getMessage}. 
Proceeding with the old one.") ex.printStackTrace() groupByServingInfo @@ -255,7 +258,7 @@ class FetcherBase(kvStore: KVStore, val queryTs = request.atMillis.getOrElse(System.currentTimeMillis()) try { if (debug) - println( + logger.info( s"Constructing response for groupBy: ${groupByServingInfo.groupByOps.metaData.getName} " + s"for keys: ${request.keys}") constructGroupByResponse(batchResponseTryAll, @@ -369,7 +372,7 @@ class FetcherBase(kvStore: KVStore, .recover { // capture exception as a key case ex: Throwable => if (debug || Math.random() < 0.001) { - println(s"Failed to fetch $groupByRequest with \n${ex.traceString}") + logger.info(s"Failed to fetch $groupByRequest with \n${ex.traceString}") } Map(groupByRequest.name + "_exception" -> ex.traceString) } @@ -448,7 +451,7 @@ class FetcherBase(kvStore: KVStore, .recoverWith { // capture exception as a key case ex: Throwable => if (debug || Math.random() < 0.001) { - println(s"Failed to fetch $request with \n${ex.traceString}") + logger.info(s"Failed to fetch $request with \n${ex.traceString}") } Failure(ex) } diff --git a/online/src/main/scala/ai/chronon/online/MetadataStore.scala b/online/src/main/scala/ai/chronon/online/MetadataStore.scala index 685cf76ff..5eb03bc6e 100644 --- a/online/src/main/scala/ai/chronon/online/MetadataStore.scala +++ b/online/src/main/scala/ai/chronon/online/MetadataStore.scala @@ -16,6 +16,7 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.api.Constants.{ChrononMetadataKey, UTF8} import ai.chronon.api.Extensions.{JoinOps, MetadataOps, StringOps, WindowOps, WindowUtils} import ai.chronon.api._ @@ -36,6 +37,8 @@ import scala.util.{Failure, Success, Try} case class DataMetrics(series: Seq[(Long, SortedMap[String, Any])]) class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, timeoutMillis: Long) { + private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) private var partitionSpec = PartitionSpec(format = "yyyy-MM-dd", spanMillis = WindowUtils.Day.millis) private val CONF_BATCH_SIZE = 50 @@ -73,7 +76,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, val result = getConf[Join](s"joins/$name") .recover { case e: java.util.NoSuchElementException => - println( + logger.info( s"Failed to fetch conf for join $name at joins/$name, please check metadata upload to make sure the join metadata for $name has been uploaded") throw e } @@ -92,7 +95,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, { join => Metrics.Context(environment = "join.meta.fetch", join = join) }) def putJoinConf(join: Join): Unit = { - println(s"uploading join conf to dataset: $dataset by key: joins/${join.metaData.nameToFilePath}") + logger.info(s"uploading join conf to dataset: $dataset by key: joins/${join.metaData.nameToFilePath}") kvStore.put( PutRequest(s"joins/${join.metaData.nameToFilePath}".getBytes(Constants.UTF8), ThriftJsonCodec.toJsonStr(join).getBytes(Constants.UTF8), @@ -104,7 +107,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, .getString(key, dataset, timeoutMillis) .recover { case e: java.util.NoSuchElementException => - println(s"Failed to retrieve $key for $dataset. Is it possible that hasn't been uploaded?") + logger.info(s"Failed to retrieve $key for $dataset. 
Is it possible that hasn't been uploaded?") throw e } .map(AvroCodec.of(_)) @@ -125,11 +128,11 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, val metaData = kvStore.getString(Constants.GroupByServingInfoKey, batchDataset, timeoutMillis).recover { case e: java.util.NoSuchElementException => - println( + logger.info( s"Failed to fetch metadata for $batchDataset, is it possible Group By Upload for $name has not succeeded?") throw e } - println(s"Fetched ${Constants.GroupByServingInfoKey} from : $batchDataset") + logger.info(s"Fetched ${Constants.GroupByServingInfoKey} from : $batchDataset") if (metaData.isFailure) { Failure( new RuntimeException(s"Couldn't fetch group by serving info for $batchDataset, " + @@ -158,13 +161,13 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, def putConf(configPath: String): Future[Seq[Boolean]] = { val configFile = new File(configPath) assert(configFile.exists(), s"$configFile does not exist") - println(s"Uploading Chronon configs from $configPath") + logger.info(s"Uploading Chronon configs from $configPath") val fileList = listFiles(configFile) val puts = fileList .filter { file => val name = parseName(file.getPath) - if (name.isEmpty) println(s"Skipping invalid file ${file.getPath}") + if (name.isEmpty) logger.info(s"Skipping invalid file ${file.getPath}") name.isDefined } .flatMap { file => @@ -173,11 +176,11 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, case value if value.contains("staging_queries/") => loadJson[StagingQuery](value) case value if value.contains("joins/") => loadJson[Join](value) case value if value.contains("group_bys/") => loadJson[GroupBy](value) - case _ => println(s"unknown config type in file $path"); None + case _ => logger.info(s"unknown config type in file $path"); None } val key = pathToKey(path) confJsonOpt.map { conf => - println(s"""Putting metadata for + logger.info(s"""Putting metadata for |key: $key |conf: $conf""".stripMargin) PutRequest(keyBytes = key.getBytes(), @@ -187,7 +190,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, } } val putsBatches = puts.grouped(CONF_BATCH_SIZE).toSeq - println(s"Putting ${puts.size} configs to KV Store, dataset=$dataset") + logger.info(s"Putting ${puts.size} configs to KV Store, dataset=$dataset") val futures = putsBatches.map(batch => kvStore.multiPut(batch)) Future.sequence(futures).map(_.flatten) } @@ -215,7 +218,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, Some(ThriftJsonCodec.toJsonStr(configConf)) } catch { case e: Throwable => - println(s"Failed to parse compiled Chronon config file: $file, \nerror=${e.getMessage}") + logger.info(s"Failed to parse compiled Chronon config file: $file, \nerror=${e.getMessage}") None } } @@ -232,7 +235,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, .map(_.asInstanceOf[String]) } catch { case ex: Throwable => - println(s"Failed to parse Chronon config file at $path as JSON with error: ${ex.getMessage}") + logger.info(s"Failed to parse Chronon config file at $path as JSON with error: ${ex.getMessage}") ex.printStackTrace() None } diff --git a/online/src/test/scala/ai/chronon/online/TileCodecTest.scala b/online/src/test/scala/ai/chronon/online/TileCodecTest.scala index 9124e5622..b60e0053c 100644 --- a/online/src/test/scala/ai/chronon/online/TileCodecTest.scala +++ b/online/src/test/scala/ai/chronon/online/TileCodecTest.scala @@ -16,12 
+16,14 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.api.{Aggregation, Builders, FloatType, IntType, ListType, LongType, Operation, Row, StringType, TimeUnit, Window} import org.junit.Assert.assertEquals import org.junit.Test import scala.collection.JavaConverters._ class TileCodecTest { + private val logger = LoggerFactory.getLogger(getClass) private val histogram = Map[String, Int]("A" -> 3, "B" -> 2).asJava private val aggregationsAndExpected: Array[(Aggregation, Seq[Any])] = Array( @@ -105,7 +107,7 @@ class TileCodecTest { val windowedRowAggregator = TileCodec.buildWindowedRowAggregator(groupBy, schema) expectedFlattenedVals.zip(finalResults).zip(windowedRowAggregator.outputSchema.map(_._1)).foreach { case ((expected, actual), name) => - println(s"Checking: $name") + logger.info(s"Checking: $name") assertEquals(expected, actual) } } @@ -138,7 +140,7 @@ class TileCodecTest { val windowedRowAggregator = TileCodec.buildWindowedRowAggregator(groupBy, schema) expectedBucketedResults.zip(finalResults).zip(windowedRowAggregator.outputSchema.map(_._1)).foreach { case ((expected, actual), name) => - println(s"Checking: $name") + logger.info(s"Checking: $name") assertEquals(expected, actual) } } diff --git a/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala b/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala index af1ec3d82..2b6661027 100644 --- a/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala +++ b/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala @@ -16,6 +16,7 @@ package ai.chronon.online.test +import org.slf4j.LoggerFactory import ai.chronon.api.{Builders, DataModel, LongType, StringType, StructField, StructType} import ai.chronon.online.{DataStream, SparkConversions, TopicInfo} import ai.chronon.online.TopicInfo.parse @@ -26,6 +27,7 @@ import org.junit.Test import scala.util.ScalaJavaConversions.JListOps class DataStreamBuilderTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = { System.setSecurityManager(null) val spark = SparkSession @@ -64,7 +66,7 @@ class DataStreamBuilderTest { def checkTopicInfo(actual: TopicInfo, expected: TopicInfo): Unit = { if (actual != expected) { - println(s"Actual topicInfo != expected topicInfo. Actual: $actual, expected: $expected") + logger.info(s"Actual topicInfo != expected topicInfo. 
Actual: $actual, expected: $expected") } assert(actual == expected) } diff --git a/project/FolderCleaner.scala b/project/FolderCleaner.scala index 0d975c2eb..96c2b213b 100644 --- a/project/FolderCleaner.scala +++ b/project/FolderCleaner.scala @@ -1,9 +1,11 @@ +import org.slf4j.LoggerFactory import java.io.File import scala.reflect.io.Directory object Folder { + private val logger = LoggerFactory.getLogger(getClass) def clean(files: File*): Unit = { - println(s"Removing folders ${files.map(_.getAbsolutePath)}") + logger.info(s"Removing folders ${files.map(_.getAbsolutePath)}") files.foreach { file => if (file.exists() && file.isDirectory) { val directory = new Directory(file) diff --git a/project/ThriftGen.scala b/project/ThriftGen.scala index b92f49060..3e9a3f945 100644 --- a/project/ThriftGen.scala +++ b/project/ThriftGen.scala @@ -1,17 +1,19 @@ +import org.slf4j.LoggerFactory import sbt._ import sys.process._ object Thrift { + private val logger = LoggerFactory.getLogger(getClass) def gen(inputPath: String, outputPath: String, language: String, cleanupSuffixPath: String = "", extension: String = null): Seq[File] = { s"""echo "Generating files from thrift file: $outputPath \ninto folder $inputPath" """ !; s"rm -rf $outputPath/$cleanupSuffixPath" !; s"mkdir -p $outputPath" !; s"thrift --gen $language -out $outputPath $inputPath" !; val files = (PathFinder(new File(outputPath)) ** s"*.${Option(extension).getOrElse(language)}").get() - println("Generated files list") - files.map(_.getPath).foreach { path => println(s" $path") } - println("\n") + logger.info("Generated files list") + files.map(_.getPath).foreach { path => logger.info(s" $path") } + logger.info("\n") files } } diff --git a/spark/src/main/scala/ai/chronon/spark/Analyzer.scala b/spark/src/main/scala/ai/chronon/spark/Analyzer.scala index 8e6949844..1cc9c1c42 100644 --- a/spark/src/main/scala/ai/chronon/spark/Analyzer.scala +++ b/spark/src/main/scala/ai/chronon/spark/Analyzer.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.{Accuracy, AggregationPart, Constants, DataType, TimeUnit, Window} import ai.chronon.api.Extensions._ @@ -36,8 +37,10 @@ import scala.util.ScalaJavaConversions.ListOps //@SerialVersionUID(3457890987L) //class ItemSketchSerializable(var mapSize: Int) extends ItemsSketch[String](mapSize) with Serializable {} + private val logger = LoggerFactory.getLogger(getClass) class ItemSketchSerializable extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) var sketch: ItemsSketch[String] = null def init(mapSize: Int): ItemSketchSerializable = { sketch = new ItemsSketch[String](mapSize) @@ -69,6 +72,7 @@ class Analyzer(tableUtils: TableUtils, sample: Double = 0.1, enableHitter: Boolean = false, silenceMode: Boolean = false) { + private val logger = LoggerFactory.getLogger(getClass) // include ts into heavy hitter analysis - useful to surface timestamps that have wrong units // include total approx row count - so it is easy to understand the percentage of skewed data def heavyHittersWithTsAndCount(df: DataFrame, @@ -156,6 +160,7 @@ class Analyzer(tableUtils: TableUtils, window: String = null, inputColumn: String = null, groupByName: String = null) { + private val logger = LoggerFactory.getLogger(getClass) def asMap: Map[String, String] = { Map( @@ -189,7 +194,7 @@ class Analyzer(tableUtils: TableUtils, groupByConf.setups.foreach(tableUtils.sql) val groupBy = GroupBy.from(groupByConf, range, tableUtils, computeDependency = 
enableHitter, finalize = true) val name = "group_by/" + prefix + groupByConf.metaData.name - println(s"""|Running GroupBy analysis for $name ...""".stripMargin) + logger.info(s"""|Running GroupBy analysis for $name ...""".stripMargin) val analysis = if (enableHitter) analyze(groupBy.inputDf, @@ -215,20 +220,20 @@ class Analyzer(tableUtils: TableUtils, groupBy.outputSchema } if (silenceMode) { - println(s"""ANALYSIS completed for group_by/${name}.""".stripMargin) + logger.info(s"""ANALYSIS completed for group_by/${name}.""".stripMargin) } else { - println(s""" + logger.info(s""" |ANALYSIS for $name: |$analysis """.stripMargin) if (includeOutputTableName) - println(s""" + logger.info(s""" |----- OUTPUT TABLE NAME ----- |${groupByConf.metaData.outputTable} """.stripMargin) val keySchema = groupBy.keySchema.fields.map { field => s" ${field.name} => ${field.dataType}" } schema.fields.map { field => s" ${field.name} => ${field.fieldType}" } - println(s""" + logger.info(s""" |----- KEY SCHEMA ----- |${keySchema.mkString("\n")} |----- OUTPUT SCHEMA ----- @@ -251,7 +256,7 @@ class Analyzer(tableUtils: TableUtils, def analyzeJoin(joinConf: api.Join, enableHitter: Boolean = false, validationAssert: Boolean = false) : (Map[String, DataType], ListBuffer[AggregationMetadata], Map[String, DataType]) = { val name = "joins/" + joinConf.metaData.name - println(s"""|Running join analysis for $name ...""".stripMargin) + logger.info(s"""|Running join analysis for $name ...""".stripMargin) // run SQL environment setups such as UDFs and JARs joinConf.setups.foreach(tableUtils.sql) @@ -269,7 +274,7 @@ class Analyzer(tableUtils: TableUtils, val rangeToFill = JoinUtils.getRangesToFill(joinConf.left, tableUtils, endDate, historicalBackfill = joinConf.historicalBackfill) - println(s"[Analyzer] Join range to fill $rangeToFill") + logger.info(s"[Analyzer] Join range to fill $rangeToFill") val unfilledRanges = tableUtils .unfilledRanges(joinConf.metaData.outputTable, rangeToFill, Some(Seq(joinConf.left.table))) .getOrElse(Seq.empty) @@ -302,9 +307,9 @@ class Analyzer(tableUtils: TableUtils, aggregationsMetadata.map(aggregation => (aggregation.name, aggregation.columnType)).toMap val statsSchema = StatsGenerator.statsIrSchema(api.StructType.from("Stats", rightSchema.toArray)) if (silenceMode) { - println(s"""ANALYSIS completed for join/${joinConf.metaData.cleanName}.""".stripMargin) + logger.info(s"""ANALYSIS completed for join/${joinConf.metaData.cleanName}.""".stripMargin) } else { - println(s""" + logger.info(s""" |ANALYSIS for join/${joinConf.metaData.cleanName}: |$analysis |----- OUTPUT TABLE NAME ----- @@ -319,26 +324,26 @@ class Analyzer(tableUtils: TableUtils, |""".stripMargin) } - println(s"----- Validations for join/${joinConf.metaData.cleanName} -----") + logger.info(s"----- Validations for join/${joinConf.metaData.cleanName} -----") if (!gbStartPartitions.isEmpty) { - println( + logger.info( "----- Following Group_Bys contains a startPartition. Please check if any startPartition will conflict with your backfill. -----") gbStartPartitions.foreach { case (gbName, startPartitions) => - println(s"$gbName : ${startPartitions.mkString(",")}") + logger.info(s"$gbName : ${startPartitions.mkString(",")}") } } if (keysWithError.isEmpty && noAccessTables.isEmpty && dataAvailabilityErrors.isEmpty) { - println("----- Backfill validation completed. No errors found. -----") + logger.info("----- Backfill validation completed. No errors found. -----") } else { - println(s"----- Schema validation completed. 
Found ${keysWithError.size} errors") + logger.info(s"----- Schema validation completed. Found ${keysWithError.size} errors") val keyErrorSet: Set[(String, String)] = keysWithError.toSet - println(keyErrorSet.map { case (key, errorMsg) => s"$key => $errorMsg" }.mkString("\n")) - println(s"---- Table permission check completed. Found permission errors in ${noAccessTables.size} tables ----") - println(noAccessTables.mkString("\n")) - println(s"---- Data availability check completed. Found issue in ${dataAvailabilityErrors.size} tables ----") + logger.info(keyErrorSet.map { case (key, errorMsg) => s"$key => $errorMsg" }.mkString("\n")) + logger.info(s"---- Table permission check completed. Found permission errors in ${noAccessTables.size} tables ----") + logger.info(noAccessTables.mkString("\n")) + logger.info(s"---- Data availability check completed. Found issue in ${dataAvailabilityErrors.size} tables ----") dataAvailabilityErrors.foreach(error => - println(s"Table ${error._1} : Group_By ${error._2} : Expected start ${error._3}")) + logger.info(s"Table ${error._1} : Group_By ${error._2} : Expected start ${error._3}")) } if (validationAssert) { @@ -377,7 +382,7 @@ class Analyzer(tableUtils: TableUtils, // validate the table permissions for given list of tables // return a list of tables that the user doesn't have access to def runTablePermissionValidation(sources: Set[String]): Set[String] = { - println(s"Validating ${sources.size} tables permissions ...") + logger.info(s"Validating ${sources.size} tables permissions ...") val today = tableUtils.partitionSpec.at(System.currentTimeMillis()) //todo: handle offset-by-1 depending on temporal vs snapshot accuracy val partitionFilter = tableUtils.partitionSpec.minus(today, new Window(2, TimeUnit.DAYS)) @@ -394,7 +399,7 @@ class Analyzer(tableUtils: TableUtils, groupBy: api.GroupBy, unfilledRanges: Seq[PartitionRange]): List[(String, String, String)] = { if (unfilledRanges.isEmpty) { - println("No unfilled ranges found.") + logger.info("No unfilled ranges found.") List.empty } else { val firstUnfilledPartition = unfilledRanges.min.start @@ -418,7 +423,7 @@ class Analyzer(tableUtils: TableUtils, } groupBy.sources.toScala.flatMap { source => val table = source.table - println(s"Checking table $table for data availability ... Expected start partition: $expectedStart") + logger.info(s"Checking table $table for data availability ... 
Expected start partition: $expectedStart") //check if partition available or table is cumulative if (!tableUtils.ifPartitionExistsInTable(table, expectedStart) && !source.isCumulative) { Some((table, groupBy.getMetaData.getName, expectedStart)) diff --git a/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala b/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala index 7d9877103..f9dcf19d7 100644 --- a/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala +++ b/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Extensions._ import ai.chronon.api.{Constants, ExternalPart, JoinPart, StructField} @@ -52,6 +53,9 @@ case class BootstrapInfo( derivations: Array[StructField], hashToSchema: Map[String, Array[StructField]] ) { + private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) lazy val fieldNames: Set[String] = fields.map(_.name).toSet @@ -68,6 +72,7 @@ case class BootstrapInfo( } object BootstrapInfo { + private val logger = LoggerFactory.getLogger(getClass) // Build metadata for the join that contains schema information for join parts, external parts and bootstrap parts def from(joinConf: api.Join, @@ -77,7 +82,7 @@ object BootstrapInfo { mutationScan: Boolean = true): BootstrapInfo = { // Enrich each join part with the expected output schema - println(s"\nCreating BootstrapInfo for GroupBys for Join ${joinConf.metaData.name}") + logger.info(s"\nCreating BootstrapInfo for GroupBys for Join ${joinConf.metaData.name}") var joinParts: Seq[JoinPartMetadata] = Option(joinConf.joinParts.toScala) .getOrElse(Seq.empty) .map(part => { @@ -110,7 +115,7 @@ object BootstrapInfo { }) // Enrich each external part with the expected output schema - println(s"\nCreating BootstrapInfo for ExternalParts for Join ${joinConf.metaData.name}") + logger.info(s"\nCreating BootstrapInfo for ExternalParts for Join ${joinConf.metaData.name}") val externalParts: Seq[ExternalPartMetadata] = Option(joinConf.onlineExternalParts.toScala) .getOrElse(Seq.empty) .map(part => ExternalPartMetadata(part, part.keySchemaFull, part.valueSchemaFull)) @@ -166,7 +171,7 @@ object BootstrapInfo { val exceptionList = mutable.ListBuffer[Throwable]() def collectException(assertion: => Unit): Unit = Try(assertion).failed.foreach(exceptionList += _) - println(s"\nCreating BootstrapInfo for Log Based Bootstraps for Join ${joinConf.metaData.name}") + logger.info(s"\nCreating BootstrapInfo for Log Based Bootstraps for Join ${joinConf.metaData.name}") // Verify that join keys are valid columns on the log table logBootstrapParts .foreach(part => { @@ -188,7 +193,7 @@ object BootstrapInfo { .toSeq }.toMap - println(s"\nCreating BootstrapInfo for Table Based Bootstraps for Join ${joinConf.metaData.name}") + logger.info(s"\nCreating BootstrapInfo for Table Based Bootstraps for Join ${joinConf.metaData.name}") // Verify that join keys are valid columns on the bootstrap source table val tableHashes = tableBootstrapParts .map(part => { @@ -293,13 +298,13 @@ object BootstrapInfo { } if (exceptionList.nonEmpty) { - exceptionList.foreach(t => println(t.traceString)) + exceptionList.foreach(t => logger.info(t.traceString)) throw new Exception(s"Validation failed for bootstrapInfo construction for join ${joinConf.metaData.name}") } - println(s"\n======= Finalized Bootstrap Info ${joinConf.metaData.name} 
=======\n") + logger.info(s"\n======= Finalized Bootstrap Info ${joinConf.metaData.name} =======\n") joinParts.foreach { metadata => - println(s"""Bootstrap Info for Join Part `${metadata.joinPart.groupBy.metaData.name}` + logger.info(s"""Bootstrap Info for Join Part `${metadata.joinPart.groupBy.metaData.name}` |Key Schema: |${stringify(metadata.keySchema)} |Value Schema: @@ -307,7 +312,7 @@ object BootstrapInfo { |""".stripMargin) } externalParts.foreach { metadata => - println(s"""Bootstrap Info for External Part `${metadata.externalPart.fullName}` + logger.info(s"""Bootstrap Info for External Part `${metadata.externalPart.fullName}` |Key Schema: |${stringify(metadata.keySchema)} |Value Schema: @@ -315,16 +320,16 @@ object BootstrapInfo { |""".stripMargin) } if (derivedSchema.nonEmpty) { - println(s"""Bootstrap Info for Derivations + logger.info(s"""Bootstrap Info for Derivations |${stringify(derivedSchema.map(_._1))} |""".stripMargin) } - println(s"""Bootstrap Info for Log Bootstraps + logger.info(s"""Bootstrap Info for Log Bootstraps |Log Hashes: ${logHashes.keys.prettyInline} |""".stripMargin) tableHashes.foreach { case (hash, (schema, _, query)) => - println(s"""Bootstrap Info for Table Bootstraps + logger.info(s"""Bootstrap Info for Table Bootstraps |Table Hash: ${hash} |Bootstrap Query: |\n${query}\n @@ -333,7 +338,7 @@ object BootstrapInfo { |""".stripMargin) } - println(s"\n======= Finalized Bootstrap Info ${joinConf.metaData.name} END =======\n") + logger.info(s"\n======= Finalized Bootstrap Info ${joinConf.metaData.name} END =======\n") bootstrapInfo } diff --git a/spark/src/main/scala/ai/chronon/spark/Comparison.scala b/spark/src/main/scala/ai/chronon/spark/Comparison.scala index ff445cf0d..bbd67ecf1 100644 --- a/spark/src/main/scala/ai/chronon/spark/Comparison.scala +++ b/spark/src/main/scala/ai/chronon/spark/Comparison.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.online.Extensions.StructTypeOps import com.google.gson.Gson import org.apache.spark.sql.DataFrame @@ -24,6 +25,7 @@ import org.apache.spark.sql.types.{DecimalType, DoubleType, FloatType, MapType} import java.util object Comparison { + private val logger = LoggerFactory.getLogger(getClass) // used for comparison def sortedJson(m: Map[String, Any]): String = { @@ -58,8 +60,8 @@ object Comparison { aName: String = "a", bName: String = "b"): DataFrame = { - println("====== side-by-side comparison ======") - println(s"keys: $keys\na_schema:\n${a.schema.pretty}\nb_schema:\n${b.schema.pretty}") + logger.info("====== side-by-side comparison ======") + logger.info(s"keys: $keys\na_schema:\n${a.schema.pretty}\nb_schema:\n${b.schema.pretty}") val prefixedExpectedDf = prefixColumnName(stringifyMaps(a), s"${aName}_") val prefixedOutputDf = prefixColumnName(stringifyMaps(b), s"${bName}_") @@ -98,7 +100,7 @@ object Comparison { } else { s"($left <> $right)" } Seq(s"(($left IS NULL AND $right IS NOT NULL) OR ($right IS NULL AND $left IS NOT NULL) OR $compareExpression)") } - println(s"Using comparison filter:\n ${comparisonFilters.mkString("\n ")}") + logger.info(s"Using comparison filter:\n ${comparisonFilters.mkString("\n ")}") if (comparisonFilters.nonEmpty) { finalDf.filter(comparisonFilters.mkString(" or ")) } else { diff --git a/spark/src/main/scala/ai/chronon/spark/Driver.scala b/spark/src/main/scala/ai/chronon/spark/Driver.scala index 9f2d431e0..ab9282666 100644 --- a/spark/src/main/scala/ai/chronon/spark/Driver.scala +++ 
b/spark/src/main/scala/ai/chronon/spark/Driver.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Extensions.{GroupByOps, MetadataOps, SourceOps} import ai.chronon.api.ThriftJsonCodec @@ -50,11 +51,13 @@ import scala.util.{Failure, Success, Try} // useful to override spark.sql.extensions args - there is no good way to unset that conf apparently // so we give it dummy extensions class DummyExtensions extends (SparkSessionExtensions => Unit) { + private val logger = LoggerFactory.getLogger(getClass) override def apply(extensions: SparkSessionExtensions): Unit = {} } // The mega chronon cli object Driver { + private val logger = LoggerFactory.getLogger(getClass) def parseConf[T <: TBase[_, _]: Manifest: ClassTag](confPath: String): T = ThriftJsonCodec.fromJsonFile[T](confPath, check = true) @@ -202,21 +205,23 @@ object Driver { val result = CompareJob.getConsolidatedData(metrics, tableUtils.partitionSpec) if (result.nonEmpty) { - println("[Validation] Failed. Please try exporting the result and investigate.") + logger.info("[Validation] Failed. Please try exporting the result and investigate.") false } else { - println("[Validation] Success.") + logger.info("[Validation] Success.") true } } } object JoinBackfill { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("join") with OfflineSubcommand with LocalExportTableAbility with ResultValidationAbility { + private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. Default is 30 days", @@ -253,16 +258,18 @@ object Driver { } df.show(numRows = 3, truncate = 0, vertical = true) - println(s"\nShowing three rows of output above.\nQuery table `${args.joinConf.metaData.outputTable}` for more.\n") + logger.info(s"\nShowing three rows of output above.\nQuery table `${args.joinConf.metaData.outputTable}` for more.\n") } } object GroupByBackfill { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("group-by-backfill") with OfflineSubcommand with LocalExportTableAbility with ResultValidationAbility { + private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. Default is 30 days", @@ -292,7 +299,9 @@ object Driver { } object LabelJoin { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("label-join") with OfflineSubcommand with LocalExportTableAbility { + private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs label join in steps, step-days at a time. 
Default is 30 days", @@ -317,7 +326,9 @@ object Driver { } object Analyzer { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("analyze") with OfflineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) val startDate: ScallopOption[String] = opt[String](required = false, descr = "Finds heavy hitters & time-distributions until a specified start date", @@ -357,7 +368,9 @@ object Driver { } object MetadataExport { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("metadata-export") with OfflineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) val inputRootPath: ScallopOption[String] = opt[String](required = true, descr = "Base path of config repo to export from") val outputRootPath: ScallopOption[String] = @@ -371,7 +384,9 @@ object Driver { } object StagingQueryBackfill { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("staging-query-backfill") with OfflineSubcommand with LocalExportTableAbility { + private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. Default is 30 days", @@ -400,7 +415,9 @@ object Driver { } object DailyStats { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("stats-summary") with OfflineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. Default is 30 days", @@ -420,7 +437,9 @@ object Driver { } object LogStats { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("log-summary") with OfflineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. 
Default is 30 days", @@ -439,7 +458,9 @@ object Driver { } object GroupByUploader { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("group-by-upload") with OfflineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) override def subcommandName() = "group-by-upload" } @@ -449,7 +470,9 @@ object Driver { } object ConsistencyMetricsCompute { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("consistency-metrics-compute") with OfflineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) override def subcommandName() = "consistency-metrics-compute" } @@ -464,7 +487,9 @@ object Driver { } object CompareJoinQuery { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("compare-join-query") with OfflineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) val queryConf: ScallopOption[String] = opt[String](required = true, descr = "Conf to the Staging Query to compare with") val startDate: ScallopOption[String] = @@ -502,6 +527,7 @@ object Driver { // hashmap implements serializable def serializableProps: Map[String, String] = { + private val logger = LoggerFactory.getLogger(getClass) val map = new mutable.HashMap[String, String]() propsInner.foreach { case (key, value) => map.update(key, value) } map.toMap @@ -521,8 +547,10 @@ object Driver { } object FetcherCli { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("fetch") with OnlineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) val keyJson: ScallopOption[String] = opt[String](required = false, descr = "json of the keys to fetch") val name: ScallopOption[String] = opt[String](required = true, descr = "name of the join/group-by to fetch") val `type`: ScallopOption[String] = @@ -560,7 +588,7 @@ object Driver { ) series.get(keyMap("statsKey").asInstanceOf[String]) else series - println(s"--- [FETCHED RESULT] ---\n${objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(toPrint)}") + logger.info(s"--- [FETCHED RESULT] ---\n${objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(toPrint)}") } def run(args: Args): Unit = { @@ -582,19 +610,19 @@ object Driver { if (args.keyJson.isDefined) { Try(readMapList(args.keyJson())).toOption.getOrElse(Seq(readMap(args.keyJson()))) } else { - println(s"Reading requests from ${args.keyJsonFile()}") + logger.info(s"Reading requests from ${args.keyJsonFile()}") val file = Source.fromFile(args.keyJsonFile()) val mapList = file.getLines().map(json => readMap(json)).toList file.close() mapList } if (keyMapList.length > 1) { - println(s"Plan to send ${keyMapList.length} fetches with ${args.interval()} seconds interval") + logger.info(s"Plan to send ${keyMapList.length} fetches with ${args.interval()} seconds interval") } val fetcher = args.impl(args.serializableProps).buildFetcher(true) def iterate(): Unit = { keyMapList.foreach(keyMap => { - println(s"--- [START FETCHING for ${keyMap}] ---") + logger.info(s"--- [START FETCHING for ${keyMap}] ---") if (args.`type`() == "join-stats") { fetchStats(args, objectMapper, keyMap, fetcher) } else { @@ -614,13 +642,13 @@ object Driver { r.values match { case Success(valMap) => { if (valMap == null) { - println("No data present for the provided key.") + logger.info("No data present for the provided key.") } else { valMap.foreach { case (k, v) => tMap.put(k, v) } - println( + logger.info( s"--- [FETCHED RESULT] 
---\n${objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(tMap)}") } - println(s"Fetched in: $awaitTimeMs ms") + logger.info(s"Fetched in: $awaitTimeMs ms") } case Failure(exception) => { exception.printStackTrace() @@ -632,14 +660,16 @@ object Driver { } iterate() while (args.loop()) { - println("loop is set to true, start next iteration. will only exit if manually killed.") + logger.info("loop is set to true, start next iteration. will only exit if manually killed.") iterate() } } } object MetadataUploader { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("metadata-upload") with OnlineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) val confPath: ScallopOption[String] = opt[String](required = true, descr = "Path to the Chronon config file or directory") } @@ -647,13 +677,15 @@ object Driver { def run(args: Args): Unit = { val putRequest = args.metaDataStore.putConf(args.confPath()) val res = Await.result(putRequest, 1.hour) - println( + logger.info( s"Uploaded Chronon Configs to the KV store, success count = ${res.count(v => v)}, failure count = ${res.count(!_)}") } } object LogFlattener { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("log-flattener") with OfflineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) val logTable: ScallopOption[String] = opt[String](required = true, descr = "Hive table with partitioned raw logs") @@ -683,17 +715,18 @@ object Driver { } object GroupByStreaming { + private val logger = LoggerFactory.getLogger(getClass) def dataStream(session: SparkSession, host: String, topic: String): DataFrame = { TopicChecker.topicShouldExist(topic, host) session.streams.addListener(new StreamingQueryListener() { override def onQueryStarted(queryStarted: QueryStartedEvent): Unit = { - println("Query started: " + queryStarted.id) + logger.info("Query started: " + queryStarted.id) } override def onQueryTerminated(queryTerminated: QueryTerminatedEvent): Unit = { - println("Query terminated: " + queryTerminated.id) + logger.info("Query terminated: " + queryTerminated.id) } override def onQueryProgress(queryProgress: QueryProgressEvent): Unit = { - println("Query made progress: " + queryProgress.progress) + logger.info("Query made progress: " + queryProgress.progress) } }) session.readStream @@ -706,6 +739,7 @@ object Driver { } class Args extends Subcommand("group-by-streaming") with OnlineSubcommand { + private val logger = LoggerFactory.getLogger(getClass) val confPath: ScallopOption[String] = opt[String](required = true, descr = "path to groupBy conf") val DEFAULT_LAG_MILLIS = 2000 // 2seconds val kafkaBootstrap: ScallopOption[String] = @@ -742,7 +776,7 @@ object Driver { } s"$file $suffix" } - println(s"File Statuses:\n ${messages.mkString("\n ")}") + logger.info(s"File Statuses:\n ${messages.mkString("\n ")}") statuses.find(_._2 == true).map(_._1) } @@ -783,6 +817,7 @@ object Driver { } class Args(args: Array[String]) extends ScallopConf(args) { + private val logger = LoggerFactory.getLogger(getClass) object JoinBackFillArgs extends JoinBackfill.Args addSubcommand(JoinBackFillArgs) object LogFlattenerArgs extends LogFlattener.Args @@ -818,6 +853,21 @@ object Driver { } def onlineBuilder(userConf: Map[String, String], onlineJar: String, onlineClass: String): Api = { + private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) + 
@@ -850,9 +900,9 @@ case args.LogStatsArgs => LogStats.run(args.LogStatsArgs) case args.MetadataExportArgs => MetadataExport.run(args.MetadataExportArgs) case args.LabelJoinArgs => LabelJoin.run(args.LabelJoinArgs) - case _ => println(s"Unknown subcommand: $x") + case _ => logger.info(s"Unknown subcommand: $x") } - case None => println(s"specify a subcommand please") + case None => logger.info(s"specify a subcommand please") } if (shouldExit) { System.exit(0) diff --git a/spark/src/main/scala/ai/chronon/spark/Extensions.scala b/spark/src/main/scala/ai/chronon/spark/Extensions.scala index 84e0581bc..1e2a02c2a 100644 --- a/spark/src/main/scala/ai/chronon/spark/Extensions.scala +++ b/spark/src/main/scala/ai/chronon/spark/Extensions.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Constants import ai.chronon.online.{AvroCodec, AvroConversions, SparkConversions} @@ -32,8 +33,10 @@ import scala.collection.Seq import scala.reflect.ClassTag object Extensions { + private val logger = LoggerFactory.getLogger(getClass) implicit class StructTypeOps(schema: StructType) { + private val logger = LoggerFactory.getLogger(getClass) def pretty: String = { val schemaTuples = schema.fields.map { field => field.dataType.simpleString -> field.name } @@ -57,6 +60,7 @@ case class DfStats(count: Long, partitionRange: PartitionRange) // helper class to maintain datafram stats that are necessary for downstream operations case class DfWithStats(df: DataFrame, partitionCounts: Map[String, Long])(implicit val tableUtils: TableUtils) { + private val logger = LoggerFactory.getLogger(getClass) private val minPartition: String = partitionCounts.keys.min private val maxPartition: String = partitionCounts.keys.max val partitionRange: PartitionRange = PartitionRange(minPartition, maxPartition) @@ -71,6 +77,7 @@ object Extensions { } object DfWithStats { + private val logger = LoggerFactory.getLogger(getClass) def apply(dataFrame: DataFrame)(implicit tableUtils: TableUtils): DfWithStats = { val partitionCounts = dataFrame .groupBy(col(TableUtils(dataFrame.sparkSession).partitionColumn)) @@ -83,6 +90,7 @@ object Extensions { } implicit class DataframeOps(df: DataFrame) { + private val logger = LoggerFactory.getLogger(getClass) private implicit val tableUtils: TableUtils = TableUtils(df.sparkSession) // This is safe to call on dataframes that are un-shuffled from their disk sources - @@ -98,7 +106,7 @@ object Extensions { def prunePartition(partitionRange: PartitionRange): DataFrame = { 
val pruneFilter = partitionRange.whereClauses().mkString(" AND ") - println(s"Pruning using $pruneFilter") + logger.info(s"Pruning using $pruneFilter") df.filter(pruneFilter) } @@ -121,7 +129,7 @@ object Extensions { val minMaxRow = minMaxRows(0) df.sparkSession.catalog.dropTempView(viewName) val (min, max) = (minMaxRow.getAs[T](0), minMaxRow.getAs[T](1)) - println(s"Computed Range for $columnName - min: $min, max: $max") + logger.info(s"Computed Range for $columnName - min: $min, max: $max") (min, max) } @@ -188,10 +196,10 @@ object Extensions { val approxCount = df.filter(df.col(col).isNotNull).select(approx_count_distinct(col)).collect()(0).getLong(0) if (approxCount == 0) { - println( + logger.info( s"Warning: approxCount for col ${col} from table ${tableName} is 0. Please double check your input data.") } - println(s""" [STARTED] Generating bloom filter on key `$col` for range $partitionRange from $tableName + logger.info(s""" [STARTED] Generating bloom filter on key `$col` for range $partitionRange from $tableName | Approximate distinct count of `$col`: $approxCount | Total count of rows: $totalCount |""".stripMargin) @@ -200,7 +208,7 @@ object Extensions { .stat .bloomFilter(col, approxCount + 1, fpp) // expectedNumItems must be positive - println(s""" + logger.info(s""" | [FINISHED] Generating bloom filter on key `$col` for range $partitionRange from $tableName | Approximate distinct count of `$col`: $approxCount | Total count of rows: $totalCount @@ -210,7 +218,7 @@ object Extensions { } def removeNulls(cols: Seq[String]): DataFrame = { - println(s"filtering nulls from columns: [${cols.mkString(", ")}]") + logger.info(s"filtering nulls from columns: [${cols.mkString(", ")}]") // do not use != or <> operator with null, it doesn't return false ever! 
df.filter(cols.map(_ + " IS NOT NULL").mkString(" AND ")) } @@ -272,12 +280,13 @@ object Extensions { def prettyPrint(timeColumns: Seq[String] = Seq(Constants.TimeColumn, Constants.MutationTimeColumn)): Unit = { val availableColumns = timeColumns.filter(df.schema.names.contains) - println(s"schema: ${df.schema.fieldNames.mkString("Array(", ", ", ")")}") + logger.info(s"schema: ${df.schema.fieldNames.mkString("Array(", ", ", ")")}") df.replaceWithReadableTime(availableColumns, dropOriginal = true).show(truncate = false) } } implicit class ArrayOps[T: ClassTag](arr: Array[T]) { + private val logger = LoggerFactory.getLogger(getClass) def uniqSort(ordering: Ordering[T]): Array[T] = { val tree = new util.TreeSet[T](ordering) for (i <- arr.indices) { @@ -295,6 +304,7 @@ object Extensions { } implicit class InternalRowOps(internalRow: InternalRow) { + private val logger = LoggerFactory.getLogger(getClass) def toRow(schema: StructType): Row = { new Row() { override def length: Int = { diff --git a/spark/src/main/scala/ai/chronon/spark/FastHashing.scala b/spark/src/main/scala/ai/chronon/spark/FastHashing.scala index 1cf405954..1c32b44e2 100644 --- a/spark/src/main/scala/ai/chronon/spark/FastHashing.scala +++ b/spark/src/main/scala/ai/chronon/spark/FastHashing.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.spark.Extensions._ import com.google.common.hash.{Hasher, Hashing} import org.apache.spark.sql.Row @@ -26,6 +27,7 @@ import java.nio.charset.Charset // TODO: drop data and hashInt, iff we see OOMs on executors for small IRs and large keys // That is the only case where key size would be a problem case class KeyWithHash(data: Array[Any], hash: Array[Byte], hashInt: Int) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) // 16-byte hash from murmur_128 // P(one collision) ~ 10^-6 when key count ~ 2.6×10^16 // in-comparison with a 8-byte hash (long) @@ -41,11 +43,12 @@ case class KeyWithHash(data: Array[Any], hash: Array[Byte], hashInt: Int) extend } object FastHashing { + private val logger = LoggerFactory.getLogger(getClass) // function to generate a fast-ish hasher // the approach tries to accumulate several tiny closures to compute the final hash def generateKeyBuilder(keys: Array[String], schema: StructType): Row => KeyWithHash = { val keySchema = StructType(schema.filter { sf => keys.contains(sf.name) }) - println(s"Generating key builder over keys:\n${keySchema.pretty}\n") + logger.info(s"Generating key builder over keys:\n${keySchema.pretty}\n") val keyIndices: Array[Int] = keys.map(schema.fieldIndex) // the hash function generation won't be in the hot path - so its okay to val hashFunctions: Array[(Hasher, Row) => Unit] = keys.zip(keyIndices).map { diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala index 258e9db4c..26e3d39f5 100644 --- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala +++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.TimeTuple import ai.chronon.aggregator.row.RowAggregator import ai.chronon.aggregator.windowing._ @@ -43,6 +44,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation], skewFilter: Option[String] = None, finalize: Boolean = true) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) protected[spark] val tsIndex: Int = 
inputDf.schema.fieldNames.indexOf(Constants.TimeColumn) protected val selectedSchema: Array[(String, api.DataType)] = SparkConversions.toChrononSchema(inputDf.schema) @@ -119,8 +121,8 @@ class GroupBy(val aggregations: Seq[api.Aggregation], inputDf -> updateFunc } - println("prepped input schema") - println(preppedInputDf.schema.pretty) + logger.info("prepped input schema") + logger.info(preppedInputDf.schema.pretty) tableUtils .preAggRepartition(preppedInputDf) @@ -390,6 +392,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation], // TODO: truncate queryRange for caching object GroupBy { + private val logger = LoggerFactory.getLogger(getClass) // Need to use a case class here to allow null matching case class SourceDataProfile(earliestRequired: String, earliestPresent: String, latestAllowed: String) @@ -401,10 +404,10 @@ object GroupBy { tableUtils: TableUtils, computeDependency: Boolean = true, showDf: Boolean = false): api.GroupBy = { val result = groupByConf.deepCopy() val newSources: java.util.List[api.Source] = groupByConf.sources.toScala.map { source => if (source.isSetJoinSource) { - println("Join source detected. Materializing the join.") + logger.info("Join source detected. Materializing the join.") val joinSource = source.getJoinSource val joinConf = joinSource.join // materialize the table with the right end date. QueryRange.end could be shifted for temporal events @@ -417,7 +422,7 @@ object GroupBy { if (computeDependency) { val df = join.computeJoin() if (showDf) { - println( + logger.info( s"printing output data from groupby::join_source: ${groupByConf.metaData.name}::${joinConf.metaData.name}") df.prettyPrint() } @@ -458,7 +463,7 @@ object GroupBy { finalize: Boolean = true, mutationScan: Boolean = true, showDf: Boolean = false): GroupBy = { - println(s"\n----[Processing GroupBy: ${groupByConfOld.metaData.name}]----") + logger.info(s"\n----[Processing GroupBy: ${groupByConfOld.metaData.name}]----") val groupByConf = replaceJoinSource(groupByConfOld, queryRange, tableUtils, computeDependency, showDf) val inputDf = groupByConf.sources.toScala .map { source => @@ -492,7 +497,7 @@ object GroupBy { val keyColumns = groupByConf.getKeyColumns.toScala val skewFilteredDf = skewFilter .map { sf => - println(s"$logPrefix filtering using skew filter:\n $sf") + logger.info(s"$logPrefix filtering using skew filter:\n $sf") val filtered = inputDf.filter(sf) filtered } @@ -504,7 +509,7 @@ object GroupBy { val nullFilterClause = groupByConf.keyColumns.toScala.map(key => s"($key IS NOT NULL)").mkString(" OR ") val nullFiltered = processedInputDf.filter(nullFilterClause) if (showDf) { - println(s"printing input date for groupBy: ${groupByConf.metaData.name}") + logger.info(s"printing input data for groupBy: ${groupByConf.metaData.name}") nullFiltered.prettyPrint() } @@ -536,7 +541,7 @@ object GroupBy { } else null if (showDf && mutationDf != null) { - println(s"printing mutation data for groupBy: ${groupByConf.metaData.name}") + logger.info(s"printing mutation data for groupBy: ${groupByConf.metaData.name}") mutationDf.prettyPrint() } @@ -576,7 +581,7 @@ object GroupBy { val queryableDataRange = PartitionRange(dataProfile.earliestRequired, Seq(queryEnd, dataProfile.latestAllowed).max)(tableUtils) val intersectedRange = sourceRange.intersect(queryableDataRange) - println(s""" + logger.info(s""" |Computing intersected range as: | query range: $queryRange | query window: $window 
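// Illustrative sketch (not part of the patch): the "Computing intersected range" message above logs
// the overlap between the query range and the range the source can actually serve. A simplified,
// hypothetical stand-in for that intersection (Chronon's real PartitionRange also carries a PartitionSpec):
case class SimpleDateRange(start: String, end: String) {
  // "yyyy-MM-dd" partition strings order correctly under plain lexicographic comparison
  def intersect(other: SimpleDateRange): SimpleDateRange =
    SimpleDateRange(Ordering[String].max(start, other.start), Ordering[String].min(end, other.end))
  def valid: Boolean = start <= end
}
// e.g. SimpleDateRange("2023-11-01", "2023-11-30").intersect(SimpleDateRange("2023-11-15", "2023-12-31"))
//      == SimpleDateRange("2023-11-15", "2023-11-30")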
@@ -624,14 +629,14 @@ object GroupBy { Some(Constants.TimeColumn -> Option(source.query.timeColumn).getOrElse(dsBasedTimestamp)) } } - println(s""" + logger.info(s""" |Time Mapping: $timeMapping |""".stripMargin) metaColumns ++= timeMapping val partitionConditions = intersectedRange.map(_.whereClauses()).getOrElse(Seq.empty) - println(s""" + logger.info(s""" |Rendering source query: | intersected/effective scan range: $intersectedRange | partitionConditions: $partitionConditions @@ -684,25 +689,25 @@ object GroupBy { if (isAnySourceCumulative) None else Some(inputTables)) if (groupByUnfilledRangesOpt.isEmpty) { - println(s"""Nothing to backfill for $outputTable - given + logger.info(s"""Nothing to backfill for $outputTable - given |endPartition of $endPartition |backfill start of ${groupByConf.backfillStartDate} |Exiting...""".stripMargin) return } val groupByUnfilledRanges = groupByUnfilledRangesOpt.get - println(s"group by unfilled ranges: $groupByUnfilledRanges") + logger.info(s"group by unfilled ranges: $groupByUnfilledRanges") val exceptions = mutable.Buffer.empty[String] groupByUnfilledRanges.foreach { case groupByUnfilledRange => try { val stepRanges = stepDays.map(groupByUnfilledRange.steps).getOrElse(Seq(groupByUnfilledRange)) - println(s"Group By ranges to compute: ${stepRanges.map { + logger.info(s"Group By ranges to compute: ${stepRanges.map { _.toString }.pretty}") stepRanges.zipWithIndex.foreach { case (range, index) => - println(s"Computing group by for range: $range [${index + 1}/${stepRanges.size}]") + logger.info(s"Computing group by for range: $range [${index + 1}/${stepRanges.size}]") val groupByBackfill = from(groupByConf, range, tableUtils, computeDependency = true) val outputDf = groupByConf.dataModel match { // group by backfills have to be snapshot only @@ -716,9 +721,9 @@ object GroupBy { val result = outputDf.select(finalOutputColumns: _*) result.save(outputTable, tableProps) } - println(s"Wrote to table $outputTable, into partitions: $range") + logger.info(s"Wrote to table $outputTable, into partitions: $range") } - println(s"Wrote to table $outputTable for range: $groupByUnfilledRange") + logger.info(s"Wrote to table $outputTable for range: $groupByUnfilledRange") } catch { case err: Throwable => diff --git a/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala b/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala index 1f2ee5cd9..56ac4fd7a 100644 --- a/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala +++ b/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.aggregator.windowing.{FinalBatchIr, FiveMinuteResolution, Resolution, SawtoothOnlineAggregator} import ai.chronon.api import ai.chronon.api.{Accuracy, Constants, DataModel, GroupByServingInfo, QueryUtils, ThriftJsonCodec} @@ -34,6 +35,7 @@ import scala.util.ScalaJavaConversions.{ListOps, MapOps} import scala.util.Try class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) implicit val sparkSession: SparkSession = groupBy.sparkSession implicit private val tableUtils: TableUtils = TableUtils(sparkSession) private def fromBase(rdd: RDD[(Array[Any], Array[Any])]): KvRdd = { @@ -62,7 +64,7 @@ class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable // Shared between events and mutations (temporal entities). 
def temporalEvents(resolution: Resolution = FiveMinuteResolution): KvRdd = { val endTs = tableUtils.partitionSpec.epochMillis(endPartition) - println(s"TemporalEvents upload end ts: $endTs") + logger.info(s"TemporalEvents upload end ts: $endTs") val sawtoothOnlineAggregator = new SawtoothOnlineAggregator( endTs, groupBy.aggregations, @@ -71,7 +73,7 @@ class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable val irSchema = SparkConversions.fromChrononSchema(sawtoothOnlineAggregator.batchIrSchema) val keyBuilder = FastHashing.generateKeyBuilder(groupBy.keyColumns.toArray, groupBy.inputDf.schema) - println(s""" + logger.info(s""" |BatchIR Element Size: ${SparkEnv.get.serializer .newInstance() .serialize(sawtoothOnlineAggregator.init) @@ -103,6 +105,7 @@ class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable } object GroupByUpload { + private val logger = LoggerFactory.getLogger(getClass) // TODO - remove this if spark streaming can't reach hive tables private def buildServingInfo(groupByConf: api.GroupBy, @@ -151,12 +154,12 @@ object GroupByUpload { } groupByServingInfo.setInputAvroSchema(inputSchema.toAvroSchema(name = "Input").toString(true)) } else { - println("Not setting InputAvroSchema to GroupByServingInfo as there is no streaming source defined.") + logger.info("Not setting InputAvroSchema to GroupByServingInfo as there is no streaming source defined.") } val result = new GroupByServingInfoParsed(groupByServingInfo, tableUtils.partitionSpec) val firstSource = groupByConf.sources.get(0) - println(s""" + logger.info(s""" |Built GroupByServingInfo for ${groupByConf.metaData.name}: |table: ${firstSource.table} / data-model: ${firstSource.dataModel} | keySchema: ${Try(result.keyChrononSchema.catalogString)} @@ -206,7 +209,7 @@ object GroupByUpload { // for mutations I need the snapshot from the previous day, but a batch end date of ds +1 lazy val otherGroupByUpload = new GroupByUpload(batchEndDate, groupBy) - println(s""" + logger.info(s""" |GroupBy upload for: ${groupByConf.metaData.team}.${groupByConf.metaData.name} |Accuracy: ${groupByConf.inferredAccuracy} |Data Model: ${groupByConf.dataModel} diff --git a/spark/src/main/scala/ai/chronon/spark/Join.scala b/spark/src/main/scala/ai/chronon/spark/Join.scala index 0adff47f4..d6bfc3378 100644 --- a/spark/src/main/scala/ai/chronon/spark/Join.scala +++ b/spark/src/main/scala/ai/chronon/spark/Join.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Extensions._ import ai.chronon.api._ @@ -44,6 +45,7 @@ import scala.util.ScalaJavaConversions.{IterableOps, ListOps, MapOps} case class CoveringSet(hashes: Seq[String], rowCount: Long, isCovering: Boolean) object CoveringSet { + private val logger = LoggerFactory.getLogger(getClass) def toFilterExpression(coveringSets: Seq[CoveringSet]): String = { val coveringSetHashExpression = "(" + coveringSets @@ -65,6 +68,7 @@ class Join(joinConf: api.Join, mutationScan: Boolean = true, showDf: Boolean = false) extends JoinBase(joinConf, endPartition, tableUtils, skipFirstHole, mutationScan, showDf) { + private val logger = LoggerFactory.getLogger(getClass) private val bootstrapTable = joinConf.metaData.bootstrapTable
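// Illustrative sketch (not part of the patch): CoveringSet.toFilterExpression, only partially visible
// in the hunk above, renders the covering sets into a SQL predicate that is later applied with
// bootstrapDf.where(...). The helper and the "matched_hashes" column name below are hypothetical
// simplifications, not the exact expression built in Join.scala.
object CoveringSetFilterSketch {
  case class CoveringSetSketch(hashes: Seq[String], rowCount: Long, isCovering: Boolean)

  def toFilterSketch(sets: Seq[CoveringSetSketch], hashColumn: String = "matched_hashes"): String =
    sets
      .filter(_.isCovering)
      .map(set => set.hashes.map(h => s"array_contains($hashColumn, '$h')").mkString("(", " AND ", ")"))
      .mkString(" OR ")
}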
@@ -169,16 +173,16 @@ class Join(joinConf: api.Join, (joinPartMetadata, coveringSets) } - println( + logger.info( s"\n======= CoveringSet for JoinPart ${joinConf.metaData.name} for PartitionRange(${leftRange.start}, ${leftRange.end}) =======\n") coveringSetsPerJoinPart.foreach { case (joinPartMetadata, coveringSets) => - println(s"Bootstrap sets for join part ${joinPartMetadata.joinPart.groupBy.metaData.name}") + logger.info(s"Bootstrap sets for join part ${joinPartMetadata.joinPart.groupBy.metaData.name}") coveringSets.foreach { coveringSet => - println( + logger.info( s"CoveringSet(hash=${coveringSet.hashes.prettyInline}, rowCount=${coveringSet.rowCount}, isCovering=${coveringSet.isCovering})") } - println() + logger.info("") } coveringSetsPerJoinPart @@ -344,7 +348,7 @@ class Join(joinConf: api.Join, val result = baseDf.select(finalOutputColumns: _*) if (showDf) { - println(s"printing results for join: ${joinConf.metaData.name}") + logger.info(s"printing results for join: ${joinConf.metaData.name}") result.prettyPrint() } result @@ -422,7 +426,7 @@ class Join(joinConf: api.Join, val joinedDf = parts.foldLeft(initDf) { case (partialDf, part) => { - println(s"\nProcessing Bootstrap from table ${part.table} for range ${unfilledRange}") + logger.info(s"\nProcessing Bootstrap from table ${part.table} for range ${unfilledRange}") val bootstrapRange = if (part.isSetQuery) { unfilledRange.intersect(PartitionRange(part.startPartition, part.endPartition)(tableUtils)) } else { unfilledRange } if (!bootstrapRange.valid) { - println(s"partition range of bootstrap table ${part.table} is beyond unfilled range") + logger.info(s"partition range of bootstrap table ${part.table} is beyond unfilled range") partialDf } else { var bootstrapDf = tableUtils.sql( @@ -476,7 +480,7 @@ class Join(joinConf: api.Join, }) val elapsedMins = (System.currentTimeMillis() - startMillis) / (60 * 1000) - println(s"Finished computing bootstrap table ${joinConf.metaData.bootstrapTable} in ${elapsedMins} minutes") + logger.info(s"Finished computing bootstrap table ${joinConf.metaData.bootstrapTable} in ${elapsedMins} minutes") tableUtils.sql(range.genScanQuery(query = null, table = bootstrapTable)) } @@ -495,7 +499,7 @@ class Join(joinConf: api.Join, return Some(bootstrapDfWithStats) } val filterExpr = CoveringSet.toFilterExpression(coveringSets) - println(s"Using covering set filter: $filterExpr") + logger.info(s"Using covering set filter: $filterExpr") val filteredDf = bootstrapDf.where(filterExpr) val filteredCount = filteredDf.count() if (bootstrapDfWithStats.count == filteredCount) { // counting is faster than computing stats diff --git a/spark/src/main/scala/ai/chronon/spark/JoinBase.scala b/spark/src/main/scala/ai/chronon/spark/JoinBase.scala index 56c6e1fa0..c1d1c7f07 100644 --- a/spark/src/main/scala/ai/chronon/spark/JoinBase.scala +++ b/spark/src/main/scala/ai/chronon/spark/JoinBase.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.DataModel.{Entities, Events} import ai.chronon.api.Extensions._ @@ -38,6 +39,7 @@ abstract class JoinBase(joinConf: api.Join, skipFirstHole: Boolean, mutationScan: Boolean = true, showDf: Boolean = false) { + private val logger = LoggerFactory.getLogger(getClass) assert(Option(joinConf.metaData.outputNamespace).nonEmpty, s"output namespace could not be empty or null") val metrics: Metrics.Context = Metrics.Context(Metrics.Environment.JoinOffline, joinConf) private val outputTable = joinConf.metaData.outputTable @@ -98,14 +100,14 @@ abstract class JoinBase(joinConf: api.Join, keyRenamedRightDf } - println(s""" + logger.info(s""" |Join keys for ${joinPart.groupBy.metaData.name}: 
${keys.mkString(", ")} |Left Schema: |${leftDf.schema.pretty} |Right Schema: |${joinableRightDf.schema.pretty}""".stripMargin) val joinedDf = coalescedJoin(leftDf, joinableRightDf, keys) - println(s"""Final Schema: + logger.info(s"""Final Schema: |${joinedDf.schema.pretty} |""".stripMargin) @@ -154,18 +156,18 @@ abstract class JoinBase(joinConf: api.Join, computeJoinPart(prunedLeft, joinPart, joinLevelBloomMapOpt) // Cache join part data into intermediate table if (filledDf.isDefined) { - println(s"Writing to join part table: $partTable for partition range $unfilledRange") + logger.info(s"Writing to join part table: $partTable for partition range $unfilledRange") filledDf.get.save(partTable, tableProps, stats = prunedLeft.map(_.stats)) } }) val elapsedMins = (System.currentTimeMillis() - start) / 60000 partMetrics.gauge(Metrics.Name.LatencyMinutes, elapsedMins) partMetrics.gauge(Metrics.Name.PartitionCount, partitionCount) - println(s"Wrote ${partitionCount} partitions to join part table: $partTable in $elapsedMins minutes") + logger.info(s"Wrote ${partitionCount} partitions to join part table: $partTable in $elapsedMins minutes") } } catch { case e: Exception => - println(s"Error while processing groupBy: ${joinConf.metaData.name}/${joinPart.groupBy.getMetaData.getName}") + logger.info(s"Error while processing groupBy: ${joinConf.metaData.name}/${joinPart.groupBy.getMetaData.getName}") throw e } if (tableUtils.tableExists(partTable)) { @@ -183,7 +185,7 @@ abstract class JoinBase(joinConf: api.Join, if (leftDfWithStats.isEmpty) { // happens when all rows are already filled by bootstrap tables - println(s"\nBackfill is NOT required for ${joinPart.groupBy.metaData.name} since all rows are bootstrapped.") + logger.info(s"\nBackfill is NOT required for ${joinPart.groupBy.metaData.name} since all rows are bootstrapped.") return None } @@ -191,7 +193,7 @@ abstract class JoinBase(joinConf: api.Join, val rowCount = leftDfWithStats.get.count val unfilledRange = leftDfWithStats.get.partitionRange - println(s"\nBackfill is required for ${joinPart.groupBy.metaData.name} for $rowCount rows on range $unfilledRange") + logger.info(s"\nBackfill is required for ${joinPart.groupBy.metaData.name} for $rowCount rows on range $unfilledRange") val rightBloomMap = JoinUtils.genBloomFilterIfNeeded(leftDf, joinPart, @@ -216,7 +218,7 @@ abstract class JoinBase(joinConf: api.Join, lazy val unfilledTimeRange = { val timeRange = leftDf.timeRange - println(s"left unfilled time range: $timeRange") + logger.info(s"left unfilled time range: $timeRange") timeRange } @@ -226,7 +228,7 @@ abstract class JoinBase(joinConf: api.Join, lazy val skewFilteredLeft = leftSkewFilter .map { sf => val filtered = leftDf.filter(sf) - println(s"""Skew filtering left-df for + logger.info(s"""Skew filtering left-df for |GroupBy: ${joinPart.groupBy.metaData.name} |filterClause: $sf |""".stripMargin) @@ -276,7 +278,7 @@ abstract class JoinBase(joinConf: api.Join, rightDf } if (showDf) { - println(s"printing results for joinPart: ${joinConf.metaData.name}::${joinPart.groupBy.metaData.name}") + logger.info(s"printing results for joinPart: ${joinConf.metaData.name}::${joinPart.groupBy.metaData.name}") rightDfWithDerivations.prettyPrint() } Some(rightDfWithDerivations) @@ -300,15 +302,15 @@ abstract class JoinBase(joinConf: api.Join, try { analyzer.analyzeJoin(joinConf, validationAssert = true) metrics.gauge(Metrics.Name.validationSuccess, 1) - println("Join conf validation succeeded. 
No error found.") + logger.info("Join conf validation succeeded. No error found.") } catch { case ex: AssertionError => metrics.gauge(Metrics.Name.validationFailure, 1) - println(s"Validation failed. Please check the validation error in log.") + logger.info(s"Validation failed. Please check the validation error in log.") if (tableUtils.backfillValidationEnforced) throw ex case e: Throwable => metrics.gauge(Metrics.Name.validationFailure, 1) - println(s"An unexpected error occurred during validation. ${e.getMessage}") + logger.info(s"An unexpected error occurred during validation. ${e.getMessage}") } // First run command to archive tables that have changed semantically since the last run @@ -325,14 +327,14 @@ abstract class JoinBase(joinConf: api.Join, endPartition, overrideStartPartition, joinConf.historicalBackfill) - println(s"Join range to fill $rangeToFill") + logger.info(s"Join range to fill $rangeToFill") val unfilledRanges = tableUtils .unfilledRanges(outputTable, rangeToFill, Some(Seq(joinConf.left.table)), skipFirstHole = skipFirstHole) .getOrElse(Seq.empty) def finalResult: DataFrame = tableUtils.sql(rangeToFill.genScanQuery(null, outputTable)) if (unfilledRanges.isEmpty) { - println(s"\nThere is no data to compute based on end partition of ${rangeToFill.end}.\n\n Exiting..") + logger.info(s"\nThere is no data to compute based on end partition of ${rangeToFill.end}.\n\n Exiting..") return finalResult } @@ -345,12 +347,12 @@ abstract class JoinBase(joinConf: api.Join, // build bootstrap info once for the entire job val bootstrapInfo = BootstrapInfo.from(joinConf, rangeToFill, tableUtils, leftSchema, mutationScan = mutationScan) - println(s"Join ranges to compute: ${stepRanges.map { _.toString }.pretty}") + logger.info(s"Join ranges to compute: ${stepRanges.map { _.toString }.pretty}") stepRanges.zipWithIndex.foreach { case (range, index) => val startMillis = System.currentTimeMillis() val progress = s"| [${index + 1}/${stepRanges.size}]" - println(s"Computing join for range: $range $progress") + logger.info(s"Computing join for range: $range $progress") leftDf(joinConf, range, tableUtils).map { leftDfInRange => if (showDf) leftDfInRange.prettyPrint() // set autoExpand = true to ensure backward compatibility due to column ordering changes @@ -358,10 +360,10 @@ abstract class JoinBase(joinConf: api.Join, val elapsedMins = (System.currentTimeMillis() - startMillis) / (60 * 1000) metrics.gauge(Metrics.Name.LatencyMinutes, elapsedMins) metrics.gauge(Metrics.Name.PartitionCount, range.partitions.length) - println(s"Wrote to table $outputTable, into partitions: $range $progress in $elapsedMins mins") + logger.info(s"Wrote to table $outputTable, into partitions: $range $progress in $elapsedMins mins") } } - println(s"Wrote to table $outputTable, into partitions: $unfilledRanges") + logger.info(s"Wrote to table $outputTable, into partitions: $unfilledRanges") finalResult } } diff --git a/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala b/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala index f30b4cfc9..017c8d99a 100644 --- a/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala +++ b/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api.Constants import ai.chronon.api.DataModel.Events import ai.chronon.api.Extensions.{JoinOps, _} @@ -30,6 +31,7 @@ import scala.collection.Seq import scala.util.ScalaJavaConversions.MapOps object JoinUtils { + private val logger = 
LoggerFactory.getLogger(getClass) /*** * Util methods for join computation @@ -53,12 +55,12 @@ object JoinUtils { val skewFilter = joinConf.skewFilter() val result = skewFilter .map(sf => { - println(s"left skew filter: $sf") + logger.info(s"left skew filter: $sf") df.filter(sf) }) .getOrElse(df) if (result.isEmpty) { - println(s"Left side query below produced 0 rows in range $range. Query:\n$scanQuery") + logger.info(s"Left side query below produced 0 rows in range $range. Query:\n$scanQuery") if (!allowEmpty) { return None } @@ -103,7 +105,7 @@ object JoinUtils { val overrideStart = if (historicalBackfill) { overrideStartPartition } else { - println(s"Historical backfill is set to false. Backfill latest single partition only: $endPartition") + logger.info(s"Historical backfill is set to false. Backfill latest single partition only: $endPartition") Some(endPartition) } lazy val defaultLeftStart = Option(leftSource.query.startPartition) @@ -294,12 +296,12 @@ object JoinUtils { unfilledRange: PartitionRange, tableUtils: TableUtils, joinLevelBloomMapOpt: Option[Map[String, BloomFilter]]): Option[Map[String, BloomFilter]] = { - println( + logger.info( s"\nRow count to be filled for ${joinPart.groupBy.metaData.name}. BloomFilter Threshold: ${tableUtils.bloomFilterThreshold}") // apply bloom filter when left row count is below threshold if (leftRowCount > tableUtils.bloomFilterThreshold) { - println("Row count is above threshold. Skip gen bloom filter.") + logger.info("Row count is above threshold. Skip gen bloom filter.") Option.empty } else { @@ -312,7 +314,7 @@ object JoinUtils { val rightBloomMap = joinPart.rightToLeft.mapValues(leftBlooms(_)).toMap val bloomSizes = rightBloomMap.map { case (col, bloom) => s"$col -> ${bloom.bitSize()}" }.pretty - println(s""" + logger.info(s""" Generating bloom filter for joinPart: | part name : ${joinPart.groupBy.metaData.name}, | left type : ${joinConf.left.dataModel}, @@ -342,7 +344,7 @@ object JoinUtils { oldSemanticJson <- props.get(Constants.SemanticHashKey); oldSemanticHash = gson.fromJson(oldSemanticJson, classOf[java.util.HashMap[String, String]]).toScala ) yield { - println(s"Comparing Hashes:\nNew: ${joinConf.semanticHash},\nOld: $oldSemanticHash") + logger.info(s"Comparing Hashes:\nNew: ${joinConf.semanticHash},\nOld: $oldSemanticHash") joinConf.tablesToDrop(oldSemanticHash) }).getOrElse(collection.Seq.empty) } diff --git a/spark/src/main/scala/ai/chronon/spark/KvRdd.scala b/spark/src/main/scala/ai/chronon/spark/KvRdd.scala index 669eef312..5c01e73a5 100644 --- a/spark/src/main/scala/ai/chronon/spark/KvRdd.scala +++ b/spark/src/main/scala/ai/chronon/spark/KvRdd.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.online.{AvroCodec, AvroConversions, SparkConversions} import ai.chronon.spark.Extensions._ @@ -26,6 +27,7 @@ import org.apache.spark.sql.types.{BinaryType, LongType, StringType, StructField import org.apache.spark.sql.{DataFrame, Row, SparkSession} object GenericRowHandler { + private val logger = LoggerFactory.getLogger(getClass) val func: Any => Array[Any] = { case x: GenericRowWithSchema => { val result = new Array[Any](x.length) @@ -70,6 +72,7 @@ sealed trait BaseKvRdd { case class KvRdd(data: RDD[(Array[Any], Array[Any])], keySchema: StructType, valueSchema: StructType)(implicit sparkSession: SparkSession) extends BaseKvRdd { + private val logger = LoggerFactory.getLogger(getClass) val withTime = false def toAvroDf(jsonPercent: Int = 1): DataFrame = { @@ -85,7 
+88,7 @@ case class KvRdd(data: RDD[(Array[Any], Array[Any])], keySchema: StructType, val val result: Array[Any] = Array(keyToBytes(keys), valueToBytes(values), keyJson, valueJson) new GenericRow(result) } - println(s""" + logger.info(s""" |key schema: | ${AvroConversions.fromChrononSchema(keyZSchema).toString(true)} |value schema: @@ -111,6 +114,7 @@ case class TimedKvRdd(data: RDD[(Array[Any], Array[Any], Long)], valueSchema: StructType, storeSchemasPrefix: Option[String] = None)(implicit sparkSession: SparkSession) extends BaseKvRdd { + private val logger = LoggerFactory.getLogger(getClass) val withTime = true // TODO make json percent configurable @@ -127,7 +131,7 @@ case class TimedKvRdd(data: RDD[(Array[Any], Array[Any], Long)], } val schemasStr = Seq(keyZSchema, valueZSchema).map(AvroConversions.fromChrononSchema(_).toString(true)) - println(s""" + logger.info(s""" |key schema: | ${schemasStr(0)} |value schema: diff --git a/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala b/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala index 3eef6799f..cc21503a7 100644 --- a/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala +++ b/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.DataModel.{Entities, Events} import ai.chronon.api.Extensions._ @@ -31,6 +32,7 @@ import scala.collection.Seq import scala.util.ScalaJavaConversions.IterableOps class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { + private val logger = LoggerFactory.getLogger(getClass) assert(Option(joinConf.metaData.outputNamespace).nonEmpty, s"output namespace could not be empty or null") assert( @@ -92,7 +94,7 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { labelTable } else { // creating final join view with feature join output table - println(s"Joining label table : ${outputLabelTable} with joined output table : ${joinConf.metaData.outputTable}") + logger.info(s"Joining label table : ${outputLabelTable} with joined output table : ${joinConf.metaData.outputTable}") JoinUtils.createOrReplaceView( joinConf.metaData.outputFinalView, leftTable = joinConf.metaData.outputTable, @@ -102,11 +104,11 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { viewProperties = Map(Constants.LabelViewPropertyKeyLabelTable -> outputLabelTable, Constants.LabelViewPropertyFeatureTable -> joinConf.metaData.outputTable) ) - println(s"Final labeled view created: ${joinConf.metaData.outputFinalView}") + logger.info(s"Final labeled view created: ${joinConf.metaData.outputFinalView}") JoinUtils.createLatestLabelView(joinConf.metaData.outputLatestLabelView, baseView = joinConf.metaData.outputFinalView, tableUtils) - println(s"Final view with latest label created: ${joinConf.metaData.outputLatestLabelView}") + logger.info(s"Final view with latest label created: ${joinConf.metaData.outputLatestLabelView}") labelTable } } @@ -114,18 +116,18 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { def compute(leftRange: PartitionRange, stepDays: Option[Int] = None, labelDS: Option[String] = None): DataFrame = { val today = tableUtils.partitionSpec.at(System.currentTimeMillis()) val sanitizedLabelDs = labelDS.getOrElse(today) - println(s"Label join range to fill $leftRange") + logger.info(s"Label join range to fill $leftRange") def finalResult = tableUtils.sql(leftRange.genScanQuery(null, outputLabelTable)) val 
leftFeatureRange = leftRange stepDays.foreach(metrics.gauge("step_days", _)) val stepRanges = stepDays.map(leftFeatureRange.steps).getOrElse(Seq(leftFeatureRange)) - println(s"Label Join left ranges to compute: ${stepRanges.map { _.toString }.pretty}") + logger.info(s"Label Join left ranges to compute: ${stepRanges.map { _.toString }.pretty}") stepRanges.zipWithIndex.foreach { case (range, index) => val startMillis = System.currentTimeMillis() val progress = s"| [${index + 1}/${stepRanges.size}]" - println(s"Computing label join for range: $range Label DS: ${labelDS.getOrElse(today)} $progress") + logger.info(s"Computing label join for range: $range Label DS: ${labelDS.getOrElse(today)} $progress") JoinUtils.leftDf(joinConf, range, tableUtils).map { leftDfInRange => computeRange(leftDfInRange, range, sanitizedLabelDs) .save(outputLabelTable, @@ -135,10 +137,10 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { val elapsedMins = (System.currentTimeMillis() - startMillis) / (60 * 1000) metrics.gauge(Metrics.Name.LatencyMinutes, elapsedMins) metrics.gauge(Metrics.Name.PartitionCount, range.partitions.length) - println(s"Wrote to table $outputLabelTable, into partitions: $range $progress in $elapsedMins mins") + logger.info(s"Wrote to table $outputLabelTable, into partitions: $range $progress in $elapsedMins mins") } } - println(s"Wrote to table $outputLabelTable, into partitions: $leftFeatureRange") + logger.info(s"Wrote to table $outputLabelTable, into partitions: $leftFeatureRange") finalResult } @@ -168,7 +170,7 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { .foreach(leftRange => { val labeledDf = computeLabelPart(labelJoinPart, leftRange, leftBlooms) // Cache label part data into intermediate table - println(s"Writing to join part table: $partTable for partition range $leftRange") + logger.info(s"Writing to join part table: $partTable for partition range $leftRange") labeledDf.save(tableName = partTable, tableProperties = confTableProps, partitionColumns = Seq(Constants.LabelPartitionColumn)) @@ -176,11 +178,11 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { val elapsedMins = (System.currentTimeMillis() - start) / 60000 labelJoinPartMetrics.gauge(Metrics.Name.LatencyMinutes, elapsedMins) labelJoinPartMetrics.gauge(Metrics.Name.PartitionCount, partitionCount) - println(s"Wrote ${partitionCount} partitions to label part table: $partTable in $elapsedMins minutes") + logger.info(s"Wrote ${partitionCount} partitions to label part table: $partTable in $elapsedMins minutes") } } catch { case e: Exception => - println( + logger.info( s"Error while processing groupBy: " + s"${joinConf.metaData.name}/${labelJoinPart.groupBy.getMetaData.getName}") throw e @@ -191,7 +193,7 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { } val rowIdentifier = labelJoinConf.rowIdentifier(joinConf.rowIds, tableUtils.partitionColumn) - println("Label Join filtering left df with only row identifier:", rowIdentifier.mkString(", ")) + logger.info("Label Join filtering left df with only row identifier:", rowIdentifier.mkString(", ")) val leftFiltered = JoinUtils.filterColumns(leftDf, rowIdentifier) val joined = rightDfs.zip(labelJoinConf.labels.asScala).foldLeft(leftFiltered) { @@ -212,7 +214,7 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { val rightSkewFilter = joinConf.partSkewFilter(joinPart) val rightBloomMap = 
joinPart.rightToLeft.mapValues(leftBlooms(_)).toMap val bloomSizes = rightBloomMap.map { case (col, bloom) => s"$col -> ${bloom.bitSize()}" }.pretty - println(s""" + logger.info(s""" |Label JoinPart Info: | part name : ${joinPart.groupBy.metaData.name}, | left type : ${joinConf.left.dataModel}, @@ -266,7 +268,7 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { val partName = joinPart.groupBy.metaData.name - println(s"""Join keys for $partName: ${partLeftKeys.mkString(", ")} + logger.info(s"""Join keys for $partName: ${partLeftKeys.mkString(", ")} |Left Schema: |${updatedLeftDf.schema.pretty} | diff --git a/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala b/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala index b980bfc09..4d152ddb5 100644 --- a/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala +++ b/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.spark.Extensions.StructTypeOps import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.util.FailFastMode @@ -25,12 +26,13 @@ import org.apache.spark.sql.types.{StringType, TimestampType} import java.io.File object LocalDataLoader { + private val logger = LoggerFactory.getLogger(getClass) def writeTableFromFile(file: File, tableName: String, session: SparkSession): Unit = { - println(s"Checking table: ${tableName}") + logger.info(s"Checking table: ${tableName}") if (session.catalog.tableExists(tableName)) return val extension = file.getName.split("\\.").last if (!Seq("csv", "json", "jsonl").contains(extension)) { - println(s"Unable to load file due to invalid extension from file: ${file.getPath}") + logger.info(s"Unable to load file due to invalid extension from file: ${file.getPath}") return } @@ -53,9 +55,9 @@ object LocalDataLoader { .drop("ts_string") } - println(s"Loading data from ${file.getPath} into $tableName. Sample data and schema shown below") + logger.info(s"Loading data from ${file.getPath} into $tableName. 
Sample data and schema shown below") df.show(100) - println(df.schema.pretty) + logger.info(df.schema.pretty) if (df.schema.map(_.name).contains("ds")) { df.write.partitionBy("ds").saveAsTable(tableName) @@ -100,7 +102,7 @@ object LocalDataLoader { ): Unit = { assert(file.exists(), s"Non existent file: ${file.getPath}") assert(file.isFile, s"Cannot load a directory as a local table: ${file.getPath}") - println(s"Loading file(${file.getPath}) as $namespace.$tableName") + logger.info(s"Loading file(${file.getPath}) as $namespace.$tableName") if (!session.catalog.databaseExists(namespace)) session.sql(s"CREATE DATABASE $namespace") diff --git a/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala b/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala index e6ed12af4..742945162 100644 --- a/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Extensions._ import ai.chronon.api._ @@ -50,6 +51,7 @@ class LogFlattenerJob(session: SparkSession, schemaTable: String, stepDays: Option[Int] = None) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) implicit val tableUtils: TableUtils = TableUtils(session) val joinTblProps: Map[String, String] = Option(joinConf.metaData.tableProperties) .map(_.toScala) @@ -69,13 +71,13 @@ class LogFlattenerJob(session: SparkSession, val ranges = unfilledRangeTry match { case Failure(_: AssertionError) => { - println(s""" + logger.info(s""" |The join name ${joinConf.metaData.nameToFilePath} does not have available logged data yet. |Please double check your logging status""".stripMargin) Seq() } case Success(None) => { - println( + logger.info( s"$outputTable seems to be caught up - to either " + s"$inputTable(latest ${tableUtils.lastAvailablePartition(inputTable)}) or $endDate.") Seq() @@ -196,7 +198,7 @@ class LogFlattenerJob(session: SparkSession, def buildLogTable(): Unit = { if (!joinConf.metaData.isSetSamplePercent) { - println(s"samplePercent is unset for ${joinConf.metaData.name}. Exit.") + logger.info(s"samplePercent is unset for ${joinConf.metaData.name}. 
Exit.") return } val unfilledRanges = getUnfilledRanges(logTable, joinConf.metaData.loggedTable) @@ -216,8 +218,8 @@ class LogFlattenerJob(session: SparkSession, val flattenedDf = flattenKeyValueBytes(rawDf, schemaMap) val schemaTblProps = buildTableProperties(schemaStringsMap) - println("======= Log table schema =======") - println(flattenedDf.schema.pretty) + logger.info("======= Log table schema =======") + logger.info(flattenedDf.schema.pretty) tableUtils.insertPartitions(flattenedDf, joinConf.metaData.loggedTable, tableProperties = @@ -237,7 +239,7 @@ class LogFlattenerJob(session: SparkSession, val failureCount = totalInputRowCount - totalOutputRowCount metrics.gauge(Metrics.Name.RowCount, totalOutputRowCount) metrics.gauge(Metrics.Name.FailureCount, failureCount) - println(s"Processed logs: ${totalOutputRowCount} rows success, ${failureCount} rows failed.") + logger.info(s"Processed logs: ${totalOutputRowCount} rows success, ${failureCount} rows failed.") metrics.gauge(Metrics.Name.ColumnBeforeCount, columnBeforeCount) metrics.gauge(Metrics.Name.ColumnAfterCount, columnAfterCount) val elapsedMins = (System.currentTimeMillis() - start) / 60000 @@ -246,6 +248,7 @@ class LogFlattenerJob(session: SparkSession, } object LogFlattenerJob { + private val logger = LoggerFactory.getLogger(getClass) def readSchemaTableProperties(tableUtils: TableUtils, logTable: String): Map[String, String] = { val curTblProps = tableUtils.getTableProperties(logTable).getOrElse(Map.empty) diff --git a/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala b/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala index cae3b775b..02c00dd4d 100644 --- a/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala +++ b/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import java.io.{BufferedWriter, File, FileWriter} import ai.chronon.api import ai.chronon.api.{DataType, ThriftJsonCodec} @@ -28,6 +29,7 @@ import java.nio.file.Paths import scala.collection.immutable.Map object MetadataExporter { + private val logger = LoggerFactory.getLogger(getClass) val GROUPBY_PATH_SUFFIX = "/group_bys" val JOIN_PATH_SUFFIX = "/joins" @@ -63,7 +65,7 @@ object MetadataExporter { } } catch { case exception: Throwable => - println(s"Exception while processing entity $path: ${ExceptionUtils.getStackTrace(exception)}") + logger.info(s"Exception while processing entity $path: ${ExceptionUtils.getStackTrace(exception)}") configData } mapper.writeValueAsString(enrichedData) @@ -76,7 +78,7 @@ object MetadataExporter { val writer = new BufferedWriter(new FileWriter(file)) writer.write(data) writer.close() - println(s"${path} : Wrote to output directory successfully") + logger.info(s"${path} : Wrote to output directory successfully") } def processEntities(inputPath: String, outputPath: String, suffix: String): Unit = { @@ -90,7 +92,7 @@ object MetadataExporter { } } val failuresAndTraces = processSuccess.filter(!_._2) - println( + logger.info( s"Successfully processed ${processSuccess.filter(_._2).length} from $suffix \n " + s"Failed to process ${failuresAndTraces.length}: \n ${failuresAndTraces.mkString("\n")}") } diff --git a/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala b/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala index 6a3aa3678..64b7c7372 100644 --- a/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala +++ b/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala @@ -16,6 +16,7 
@@ package ai.chronon.spark +import org.slf4j.LoggerFactory import org.apache.spark.sql.SparkSession import org.apache.spark.SPARK_VERSION @@ -25,6 +26,7 @@ import scala.reflect.io.Path import scala.util.Properties object SparkSessionBuilder { + private val logger = LoggerFactory.getLogger(getClass) val DefaultWarehouseDir = new File("/tmp/chronon/spark-warehouse") @@ -67,7 +69,7 @@ object SparkSessionBuilder { } val builder = if (local) { - println(s"Building local spark session with warehouse at $warehouseDir") + logger.info(s"Building local spark session with warehouse at $warehouseDir") val metastoreDb = s"jdbc:derby:;databaseName=$warehouseDir/metastore_db;create=true" baseBuilder // use all threads - or the tests will be slow diff --git a/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala b/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala index a19dfb455..0b201ef64 100644 --- a/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala +++ b/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.ParametricMacro import ai.chronon.api.Extensions._ @@ -25,6 +26,7 @@ import scala.collection.mutable import scala.util.ScalaJavaConversions._ class StagingQuery(stagingQueryConf: api.StagingQuery, endPartition: String, tableUtils: TableUtils) { + private val logger = LoggerFactory.getLogger(getClass) assert(Option(stagingQueryConf.metaData.outputNamespace).nonEmpty, s"output namespace could not be empty or null") private val outputTable = stagingQueryConf.metaData.outputTable private val tableProps = Option(stagingQueryConf.metaData.tableProperties) @@ -48,31 +50,31 @@ class StagingQuery(stagingQueryConf: api.StagingQuery, endPartition: String, tab tableUtils.unfilledRanges(outputTable, PartitionRange(stagingQueryConf.startPartition, endPartition)(tableUtils)) if (unfilledRanges.isEmpty) { - println(s"""No unfilled range for $outputTable given + logger.info(s"""No unfilled range for $outputTable given |start partition of ${stagingQueryConf.startPartition} |end partition of $endPartition |""".stripMargin) return } val stagingQueryUnfilledRanges = unfilledRanges.get - println(s"Staging Query unfilled ranges: $stagingQueryUnfilledRanges") + logger.info(s"Staging Query unfilled ranges: $stagingQueryUnfilledRanges") val exceptions = mutable.Buffer.empty[String] stagingQueryUnfilledRanges.foreach { stagingQueryUnfilledRange => try { val stepRanges = stepDays.map(stagingQueryUnfilledRange.steps).getOrElse(Seq(stagingQueryUnfilledRange)) - println(s"Staging query ranges to compute: ${stepRanges.map { _.toString }.pretty}") + logger.info(s"Staging query ranges to compute: ${stepRanges.map { _.toString }.pretty}") stepRanges.zipWithIndex.foreach { case (range, index) => val progress = s"| [${index + 1}/${stepRanges.size}]" - println(s"Computing staging query for range: $range $progress") + logger.info(s"Computing staging query for range: $range $progress") val renderedQuery = StagingQuery.substitute(tableUtils, stagingQueryConf.query, range.start, range.end, endPartition) - println(s"Rendered Staging Query to run is:\n$renderedQuery") + logger.info(s"Rendered Staging Query to run is:\n$renderedQuery") val df = tableUtils.sql(renderedQuery) tableUtils.insertPartitions(df, outputTable, tableProps, partitionCols, autoExpand = enableAutoExpand.get) - println(s"Wrote to table $outputTable, into partitions: $range $progress") + logger.info(s"Wrote to table $outputTable, into 
partitions: $range $progress") } - println(s"Finished writing Staging Query data to $outputTable") + logger.info(s"Finished writing Staging Query data to $outputTable") } catch { case err: Throwable => exceptions.append(s"Error handling range $stagingQueryUnfilledRange : ${err.getMessage}\n${err.traceString}") @@ -91,6 +93,7 @@ class StagingQuery(stagingQueryConf: api.StagingQuery, endPartition: String, tab } object StagingQuery { + private val logger = LoggerFactory.getLogger(getClass) def substitute(tu: TableUtils, query: String, start: String, end: String, latest: String): String = { val macros: Array[ParametricMacro] = Array( diff --git a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala index 20a9436a3..bdea6e9a8 100644 --- a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala +++ b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.api.{Constants, PartitionSpec} import ai.chronon.api.Extensions._ @@ -36,6 +37,7 @@ import scala.concurrent.{ExecutionContext, ExecutionContextExecutor} import scala.util.{Failure, Success, Try} case class TableUtils(sparkSession: SparkSession) { + private val logger = LoggerFactory.getLogger(getClass) private val ARCHIVE_TIMESTAMP_FORMAT = "yyyyMMddHHmmss" private lazy val archiveTimestampFormatter = DateTimeFormatter @@ -161,10 +163,10 @@ case class TableUtils(sparkSession: SparkSession) { sparkSession.read.format("iceberg").load(tableName) } match { case Success(_) => - println(s"IcebergCheck: Detected iceberg formatted table $tableName.") + logger.info(s"IcebergCheck: Detected iceberg formatted table $tableName.") true case _ => - println(s"IcebergCheck: Checked table $tableName is not iceberg format.") + logger.info(s"IcebergCheck: Checked table $tableName is not iceberg format.") false } @@ -231,7 +233,7 @@ case class TableUtils(sparkSession: SparkSession) { def checkTablePermission(tableName: String, fallbackPartition: String = partitionSpec.before(partitionSpec.at(System.currentTimeMillis()))): Boolean = { - println(s"Checking permission for table $tableName...") + logger.info(s"Checking permission for table $tableName...") try { // retrieve one row from the table val partitionFilter = lastAvailablePartition(tableName).getOrElse(fallbackPartition) @@ -240,14 +242,14 @@ case class TableUtils(sparkSession: SparkSession) { } catch { case e: RuntimeException => if (e.getMessage.contains("ACCESS DENIED")) - println(s"[Error] No access to table: $tableName ") + logger.info(s"[Error] No access to table: $tableName ") else { - println(s"[Error] Encountered exception when reading table: $tableName.") + logger.info(s"[Error] Encountered exception when reading table: $tableName.") e.printStackTrace() } false case ex: Exception => - println(s"[Error] Encountered exception when reading table: $tableName.") + logger.info(s"[Error] Encountered exception when reading table: $tableName.") ex.printStackTrace() true } @@ -284,7 +286,7 @@ case class TableUtils(sparkSession: SparkSession) { sql(creationSql) } catch { case e: Exception => - println(s"Failed to create table $tableName with error: ${e.getMessage}") + logger.info(s"Failed to create table $tableName with error: ${e.getMessage}") throw e } } else { @@ -318,7 +320,7 @@ case class TableUtils(sparkSession: SparkSession) { def sql(query: String): DataFrame = { val partitionCount = 
sparkSession.sparkContext.getConf.getInt("spark.default.parallelism", 1000) - println( + logger.info( s"\n----[Running query coalesced into at most $partitionCount partitions]----\n$query\n----[End of Query]----\n") val df = sparkSession.sql(query).coalesce(partitionCount) df @@ -354,13 +356,13 @@ case class TableUtils(sparkSession: SparkSession) { def wrapWithCache[T](opString: String, dataFrame: DataFrame)(func: => T): Try[T] = { val start = System.currentTimeMillis() cacheLevel.foreach { level => - println(s"Starting to cache dataframe before $opString - start @ ${TsUtils.toStr(start)}") + logger.info(s"Starting to cache dataframe before $opString - start @ ${TsUtils.toStr(start)}") dataFrame.persist(level) } def clear(): Unit = { cacheLevel.foreach(_ => dataFrame.unpersist(blockingCacheEviction)) val end = System.currentTimeMillis() - println( + logger.info( s"Cleared the dataframe cache after $opString - start @ ${TsUtils.toStr(start)} end @ ${TsUtils.toStr(end)}") } Try { @@ -403,7 +405,7 @@ case class TableUtils(sparkSession: SparkSession) { // set to one if tablePartitionCount=0 to avoid division by zero val nonZeroTablePartitionCount = if (tablePartitionCount == 0) 1 else tablePartitionCount - println(s"$rowCount rows requested to be written into table $tableName") + logger.info(s"$rowCount rows requested to be written into table $tableName") if (rowCount > 0) { val columnSizeEstimate = columnSizeEstimator(df.schema) @@ -430,7 +432,7 @@ case class TableUtils(sparkSession: SparkSession) { .flatMap(value => if (value > 0) Some(value) else None) if (outputParallelism.isDefined) { - println(s"Using custom outputParallelism ${outputParallelism.get}") + logger.info(s"Using custom outputParallelism ${outputParallelism.get}") } val dailyFileCount = outputParallelism.getOrElse(dailyFileCountBounded) @@ -439,8 +441,8 @@ case class TableUtils(sparkSession: SparkSession) { val saltCol = "random_partition_salt" val saltedDf = df.withColumn(saltCol, round(rand() * (dailyFileCount + 1))) - println( - s"repartitioning data for table $tableName by $shuffleParallelism spark tasks into $nonZeroTablePartitionCount table partitions and $dailyFileCount files per partition") + logger.info( + s"repartitioning data for table $tableName by $shuffleParallelism spark tasks into $nonZeroTablePartitionCount table partitions and $dailyFileCount files per partition") val repartitionCols = if (df.schema.fieldNames.contains(partitionColumn)) { Seq(partitionColumn, saltCol) @@ -451,7 +453,7 @@ case class TableUtils(sparkSession: SparkSession) { .write .mode(saveMode) .insertInto(tableName) - println(s"Finished writing to $tableName") + logger.info(s"Finished writing to $tableName") } } @@ -560,7 +562,7 @@ case class TableUtils(sparkSession: SparkSession) { val inputMissing = fillablePartitions -- allInputExisting val missingPartitions = outputMissing -- inputMissing val missingChunks = chunk(missingPartitions) - println(s""" + logger.info(s""" |Unfilled range computation: | Output table: $outputTable | Missing output partitions: ${outputMissing.toSeq.sorted.prettyInline} @@ -584,14 +586,14 @@ case class TableUtils(sparkSession: SparkSession) { def dropTableIfExists(tableName: String): Unit = { val command = s"DROP TABLE IF EXISTS $tableName" - println(s"Dropping table with command: $command") + logger.info(s"Dropping table with command: $command") sql(command) } def archiveOrDropTableIfExists(tableName: String, timestamp: Option[Instant]): Unit = { val archiveTry = Try(archiveTableIfExists(tableName, timestamp))
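// Minimal sketch, not part of this patch: TableUtils and several classes touched here
// (LogFlattenerJob, ConsistencyJob, JoinSourceRunner) are Serializable and can be captured
// in Spark closures. A plain `private val logger` field is serialized along with the
// instance and, depending on the SLF4J binding, may throw NotSerializableException.
// A @transient lazy val re-creates the logger after deserialization instead.
// The class name below is hypothetical, for illustration only.
import org.slf4j.{Logger, LoggerFactory}

class SerializableSparkJob extends Serializable {
  // not shipped with the closure; re-initialized lazily on each executor
  @transient private lazy val logger: Logger = LoggerFactory.getLogger(getClass)

  def run(): Unit = logger.info("running on executor thread {}", Thread.currentThread().getName)
}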
archiveTry.failed.foreach { e => - println(s"""Fail to archive table ${tableName} + logger.info(s"""Fail to archive table ${tableName} |${e.getMessage} |Proceed to dropping the table instead. |""".stripMargin) @@ -604,7 +606,7 @@ case class TableUtils(sparkSession: SparkSession) { val humanReadableTimestamp = archiveTimestampFormatter.format(timestamp.getOrElse(Instant.now())) val finalArchiveTableName = s"${tableName}_${humanReadableTimestamp}" val command = s"ALTER TABLE $tableName RENAME TO $finalArchiveTableName" - println(s"Archiving table with command: $command") + logger.info(s"Archiving table with command: $command") sql(command) } } @@ -626,7 +628,7 @@ case class TableUtils(sparkSession: SparkSession) { val earliestHoleOpt = (inputPartitions -- outputPartitions).reduceLeftOption(Ordering[String].min) earliestHoleOpt.foreach { hole => val toDrop = outputPartitions.filter(_ > hole) - println(s""" + logger.info(s""" |Earliest hole at $hole in output table $outputTable, relative to $inputTable |Input Parts : ${inputPartitions.toArray.sorted.mkString("Array(", ", ", ")")} |Output Parts : ${outputPartitions.toArray.sorted.mkString("Array(", ", ", ")")} @@ -655,7 +657,7 @@ case class TableUtils(sparkSession: SparkSession) { val dropSql = s"ALTER TABLE $tableName DROP IF EXISTS $partitionSpecs" sql(dropSql) } else { - println(s"$tableName doesn't exist, please double check before drop partitions") + logger.info(s"$tableName doesn't exist, please double check before drop partitions") } } @@ -667,7 +669,7 @@ case class TableUtils(sparkSession: SparkSession) { val toDrop = Stream.iterate(startDate)(partitionSpec.after).takeWhile(_ <= endDate) dropPartitions(tableName, toDrop, partitionColumn, subPartitionFilters) } else { - println(s"$tableName doesn't exist, please double check before drop partitions") + logger.info(s"$tableName doesn't exist, please double check before drop partitions") } } @@ -730,7 +732,7 @@ case class TableUtils(sparkSession: SparkSession) { if (excludedFields.nonEmpty) { val excludedFieldsStr = excludedFields.map(tuple => s"columnName: ${tuple._1} dataType: ${tuple._2.dataType.catalogString}") - println( + logger.info( s"""Warning. Detected columns that exist in Hive table but not in updated schema. These are ignored in DDL. 
|${excludedFieldsStr.mkString("\n")} |""".stripMargin) @@ -746,6 +748,7 @@ case class TableUtils(sparkSession: SparkSession) { } sealed case class IncompatibleSchemaException(inconsistencies: Seq[(String, DataType, DataType)]) extends Exception { + private val logger = LoggerFactory.getLogger(getClass) override def getMessage: String = { val inconsistenciesStr = inconsistencies.map(tuple => s"columnName: ${tuple._1} existingType: ${tuple._2} newType: ${tuple._3}") diff --git a/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala index 0ae31c525..78ec9aaae 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.stats +import org.slf4j.LoggerFactory import ai.chronon.api._ import ai.chronon.online.{SparkConversions, _} import ai.chronon.spark.Extensions._ @@ -26,6 +27,7 @@ import org.apache.spark.sql.types.DataType import scala.collection.mutable.ListBuffer object CompareBaseJob { + private val logger = LoggerFactory.getLogger(getClass) def checkConsistency( leftFields: Map[String, DataType], @@ -132,10 +134,10 @@ object CompareBaseJob { } else { leftDf } - println(s"Pruning fields from the left source for equivalent comparison - ${prunedColumns.mkString(",")}") + logger.info(s"Pruning fields from the left source for equivalent comparison - ${prunedColumns.mkString(",")}") // 3. Build comparison dataframe - println(s"""Join keys: ${keys.mkString(", ")} + logger.info(s"""Join keys: ${keys.mkString(", ")} |Left Schema: |${prunedLeftDf.schema.pretty} | diff --git a/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala index 87f7ba809..43255e177 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.stats +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.{Constants, PartitionSpec} import ai.chronon.api.DataModel.Events @@ -38,6 +39,7 @@ class CompareJob( startDate: String, endDate: String ) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) val tableProps: Map[String, String] = Option(joinConf.metaData.tableProperties) .map(_.toScala) .orNull @@ -68,21 +70,21 @@ class CompareJob( CompareBaseJob.compare(leftDf, rightDf, getJoinKeys(joinConf, tableUtils), tableUtils, migrationCheck = true) // Save the comparison table - println("Saving comparison output..") - println(s"Comparison schema ${compareDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") + logger.info("Saving comparison output..") + logger.info(s"Comparison schema ${compareDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") tableUtils.insertUnPartitioned(compareDf, comparisonTableName, tableProps, saveMode = SaveMode.Overwrite) // Save the metrics table - println("Saving metrics output..") + logger.info("Saving metrics output..") val metricsDf = metricsTimedKvRdd.toFlatDf - println(s"Metrics schema ${metricsDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") + logger.info(s"Metrics schema ${metricsDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") tableUtils.insertUnPartitioned(metricsDf, metricsTableName, tableProps, saveMode = SaveMode.Overwrite) - println("Printing basic 
comparison results..") - println("(Note: This is just an estimation and not a detailed analysis of results)") + logger.info("Printing basic comparison results..") + logger.info("(Note: This is just an estimation and not a detailed analysis of results)") CompareJob.printAndGetBasicMetrics(metrics, tableUtils.partitionSpec) - println("Finished compare stats.") + logger.info("Finished compare stats.") (compareDf, metricsDf, metrics) } @@ -104,6 +106,7 @@ class CompareJob( } object CompareJob { + private val logger = LoggerFactory.getLogger(getClass) /** * Extract the discrepancy metrics (like missing records, data mismatch) from the hourly compare metrics, consolidate @@ -150,13 +153,13 @@ object CompareJob { val consolidatedData = getConsolidatedData(metrics, partitionSpec) if (consolidatedData.size == 0) { - println( + logger.info( s"No discrepancies found for data mismatches and missing counts. " + s"It is highly recommended to explore the full metrics.") } else { consolidatedData.foreach { case (date, mismatchCount) => - println(s"Found ${mismatchCount} mismatches on date '${date}'") + logger.info(s"Found ${mismatchCount} mismatches on date '${date}'") } } consolidatedData diff --git a/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala index 2b7e81521..85159a53b 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.stats +import org.slf4j.LoggerFactory import ai.chronon import ai.chronon.api.Extensions._ import ai.chronon.api._ @@ -28,6 +29,7 @@ import java.util import scala.util.ScalaJavaConversions.{JListOps, ListOps, MapOps} class ConsistencyJob(session: SparkSession, joinConf: Join, endDate: String) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) val tblProperties: Map[String, String] = Option(joinConf.metaData.tableProperties) .map(_.toScala) @@ -36,7 +38,7 @@ class ConsistencyJob(session: SparkSession, joinConf: Join, endDate: String) ext // Replace join's left side with the logged table events to determine offline values of the aggregations. 
private def buildComparisonJoin(): Join = { - println("Building Join With left as logged") + logger.info("Building Join With left as logged") val copiedJoin = joinConf.deepCopy() val loggedSource: Source = new Source() val loggedEvents: EventSource = new EventSource() @@ -76,26 +78,26 @@ class ConsistencyJob(session: SparkSession, joinConf: Join, endDate: String) ext .getOrElse(Seq.empty) if (unfilledRanges.isEmpty) return val join = new chronon.spark.Join(buildComparisonJoin(), unfilledRanges.last.end, TableUtils(session)) - println("Starting compute Join for comparison table") + logger.info("Starting compute Join for comparison table") val compareDf = join.computeJoin(Some(30)) - println("======= side-by-side comparison schema =======") - println(compareDf.schema.pretty) + logger.info("======= side-by-side comparison schema =======") + logger.info(compareDf.schema.pretty) } def buildConsistencyMetrics(): DataMetrics = { // migrate legacy configs without consistencySamplePercent param if (!joinConf.metaData.isSetConsistencySamplePercent) { - println("consistencySamplePercent is unset and will default to 100") + logger.info("consistencySamplePercent is unset and will default to 100") joinConf.metaData.consistencySamplePercent = 100 } if (joinConf.metaData.consistencySamplePercent == 0) { - println(s"Exit ConsistencyJob because consistencySamplePercent = 0 for join conf ${joinConf.metaData.name}") + logger.info(s"Exit ConsistencyJob because consistencySamplePercent = 0 for join conf ${joinConf.metaData.name}") return DataMetrics(Seq()) } buildComparisonTable() - println("Determining Range between consistency table and comparison table") + logger.info("Determining Range between consistency table and comparison table") val unfilledRanges = tableUtils .unfilledRanges(joinConf.metaData.consistencyTable, PartitionRange(null, endDate), @@ -108,22 +110,22 @@ class ConsistencyJob(session: SparkSession, joinConf: Join, endDate: String) ext tableUtils.sql(unfilled.genScanQuery(null, joinConf.metaData.loggedTable)).drop(Constants.SchemaHash) // there could be external columns that are logged during online env, therefore they could not be used for computing OOC val loggedDfNoExternalCols = loggedDf.select(comparisonDf.columns.map(org.apache.spark.sql.functions.col): _*) - println("Starting compare job for stats") + logger.info("Starting compare job for stats") val joinKeys = if (joinConf.isSetRowIds) { joinConf.rowIds.toScala } else { JoinCodec.timeFields.map(_.name).toList ++ joinConf.leftKeyCols } - println(s"Using ${joinKeys.mkString("[", ",", "]")} as join keys between log and backfill.") + logger.info(s"Using ${joinKeys.mkString("[", ",", "]")} as join keys between log and backfill.") val (compareDf, metricsKvRdd, metrics) = CompareBaseJob.compare(comparisonDf, loggedDfNoExternalCols, keys = joinKeys, tableUtils, name = joinConf.metaData.nameToFilePath) - println("Saving output.") + logger.info("Saving output.") val outputDf = metricsKvRdd.toFlatDf.withTimeBasedColumn("ds") - println(s"output schema ${outputDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") + logger.info(s"output schema ${outputDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") tableUtils.insertPartitions(outputDf, joinConf.metaData.consistencyTable, tableProperties = tblProperties, diff --git a/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala index f6fdca014..175b096fd 100644 --- 
a/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.stats +import org.slf4j.LoggerFactory import ai.chronon.online.SparkConversions import ai.chronon.aggregator.row.StatsGenerator import ai.chronon.api.Extensions._ @@ -31,6 +32,7 @@ import org.apache.spark.sql.SparkSession * Follow pattern of OOC for computing offline and uploading online as well. */ class SummaryJob(session: SparkSession, joinConf: Join, endDate: String) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) val tableUtils: TableUtils = TableUtils(session) private val loggingStatsTable = joinConf.metaData.loggingStatsTable @@ -50,17 +52,17 @@ class SummaryJob(session: SparkSession, joinConf: Join, endDate: String) extends .unfilledRanges(outputTable, PartitionRange(null, endDate)(tableUtils), Some(Seq(inputTable))) .getOrElse(Seq.empty) if (unfilledRanges.isEmpty) { - println(s"No data to compute for $outputTable") + logger.info(s"No data to compute for $outputTable") return } unfilledRanges.foreach { computeRange => - println(s"Daily output statistics table $outputTable unfilled range: $computeRange") + logger.info(s"Daily output statistics table $outputTable unfilled range: $computeRange") val stepRanges = stepDays.map(computeRange.steps).getOrElse(Seq(computeRange)) - println(s"Ranges to compute: ${stepRanges.map(_.toString).pretty}") + logger.info(s"Ranges to compute: ${stepRanges.map(_.toString).pretty}") // We are going to build the aggregator to denormalize sketches for hive. stepRanges.zipWithIndex.foreach { case (range, index) => - println(s"Computing range [${index + 1}/${stepRanges.size}]: $range") + logger.info(s"Computing range [${index + 1}/${stepRanges.size}]: $range") val joinOutputDf = tableUtils.sql(s""" |SELECT * |FROM ${inputTable} @@ -82,10 +84,10 @@ class SummaryJob(session: SparkSession, joinConf: Join, endDate: String) extends stats .addDerivedMetrics(summaryKvRdd.toFlatDf, aggregator) .save(outputTable, tableProps) - println(s"Finished range [${index + 1}/${stepRanges.size}].") + logger.info(s"Finished range [${index + 1}/${stepRanges.size}].") } } - println("Finished writing stats.") + logger.info("Finished writing stats.") } /** diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala index b1f0f911d..f3fc5cf0a 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.streaming +import org.slf4j.LoggerFactory import ai.chronon import ai.chronon.api import ai.chronon.api.{Row => _, _} @@ -40,6 +41,7 @@ class GroupBy(inputStream: DataFrame, onlineImpl: Api, debug: Boolean = false) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) private def buildStreamingQuery(inputTable: String): String = { val streamingSource = groupByConf.streamingSource.get @@ -102,7 +104,7 @@ class GroupBy(inputStream: DataFrame, streamDecoder.decode(arr) } catch { case ex: Throwable => - println( + logger.info( s"Error while decoding streaming events for ${groupByConf.getMetaData.getName} with " + s"schema ${streamDecoder.schema.catalogString}" + s" \n${ex.traceString}") @@ -114,7 +116,7 @@ class GroupBy(inputStream: DataFrame, mutation != null && (!(mutation.before != null && mutation.after != null) || !(mutation.before sameElements 
mutation.after))) val streamSchema = SparkConversions.fromChrononSchema(streamDecoder.schema) - println(s""" + logger.info(s""" | group by serving info: $groupByServingInfo | Streaming source: $streamingSource | streaming Query: $streamingQuery @@ -171,7 +173,7 @@ class GroupBy(inputStream: DataFrame, val gson = new Gson() val formatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.from(ZoneOffset.UTC)) val pstFormatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.of("America/Los_Angeles")) - println(s""" + logger.info(s""" |streaming dataset: $streamingDataset |keys: ${gson.toJson(keys)} |values: ${gson.toJson(values)} diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala b/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala index 57ecef9be..d6d23de47 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.streaming +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Extensions.{GroupByOps, SourceOps} import ai.chronon.api._ @@ -41,6 +42,7 @@ import scala.util.ScalaJavaConversions.{IteratorOps, JIteratorOps, ListOps, MapO // micro batching destroys and re-creates these objects repeatedly through ForeachBatchWriter and MapFunction // this allows for re-use object LocalIOCache { + private val logger = LoggerFactory.getLogger(getClass) private var fetcher: Fetcher = null private var kvStore: KVStore = null def getOrSetFetcher(builderFunc: () => Fetcher): Fetcher = { @@ -63,6 +65,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map session: SparkSession, apiImpl: Api) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) val context: Metrics.Context = Metrics.Context(Metrics.Environment.GroupByStreaming, groupByConf) @@ -73,6 +76,6 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map extends Serializable val valueZSchema: api.StructType = groupByConf.dataModel match { case api.DataModel.Events => servingInfoProxy.valueChrononSchema case api.DataModel.Entities => servingInfoProxy.mutationValueChrononSchema } @@ -105,6 +109,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map private val microBatchIntervalMillis: Int = getProp("batch_interval_millis", "1000").toInt private case class PutRequestHelper(inputSchema: StructType) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) private val keyIndices: Array[Int] = keyColumns.map(inputSchema.fieldIndex) private val valueIndices: Array[Int] = valueColumns.map(inputSchema.fieldIndex) private val tsIndex: Int = inputSchema.fieldIndex(eventTimeColumn) @@ -131,7 +136,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map val gson = new Gson() val formatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.from(ZoneOffset.UTC)) val pstFormatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.of("America/Los_Angeles")) - println(s""" + logger.info(s""" |dataset: $streamingDataset |keys: ${gson.toJson(keys)} |values: ${gson.toJson(values)} @@ -194,7 +199,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map // GroupBy -> JoinSource (Join + outer_query) // Join -> // Join.left -> (left.(table, mutation_stream, etc) + inner_query) -
println(s""" + logger.info(s""" |Schemas across chain of transformations |leftSchema: | ${leftSchema.catalogString} @@ -230,7 +235,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map streamDecoder.decode(arr) } catch { case ex: Throwable => - println(s"Error while decoding streaming events from stream: ${dataStream.topicInfo.name}") + logger.info(s"Error while decoding streaming events from stream: ${dataStream.topicInfo.name}") ex.printStackTrace() ingressContext.incrementException(ex) null @@ -242,7 +247,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map (mutation != null) && (!bothNull || !bothSame) } val streamSchema = SparkConversions.fromChrononSchema(streamDecoder.schema) - println(s""" + logger.info(s""" | streaming source: ${groupByConf.streamingSource.get} | streaming dataset: ${groupByConf.streamingDataset} | stream schema: ${streamSchema.catalogString} @@ -293,7 +298,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map def applyQuery(df: DataFrame, query: api.Query): DataFrame = { val queryParts = groupByConf.buildQueryParts(query) - println(s""" + logger.info(s""" |decoded schema: ${decoded.df.schema.catalogString} |queryParts: $queryParts |df schema: ${df.schema.prettyJson} @@ -307,7 +312,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map val leftSource: Dataset[Row] = applyQuery(decoded.df, left.query) // key format joins//join_name val joinRequestName = joinSource.join.metaData.getName.replaceFirst("\\.", "/") - println(s"Upstream join request name: $joinRequestName") + logger.info(s"Upstream join request name: $joinRequestName") val tableUtils = TableUtils(session) // the decoded schema is in lower case @@ -329,7 +334,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map val joinEncoder: Encoder[Row] = RowEncoder(schemas.joinSchema) val joinFields = schemas.joinSchema.fieldNames val leftColumns = schemas.leftSourceSchema.fieldNames - println(s""" + logger.info(s""" |left columns ${leftColumns.mkString(",")} |reqColumns ${reqColumns.mkString(",")} |Fetching upstream join to enrich the stream... Fetching lag time: $lagMillis @@ -342,7 +347,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map override def call(rows: util.Iterator[Row]): util.Iterator[Row] = { val shouldSample = Math.random() <= 0.1 val fetcher = LocalIOCache.getOrSetFetcher { () => - println(s"Initializing Fetcher. ${System.currentTimeMillis()}") + logger.info(s"Initializing Fetcher. 
${System.currentTimeMillis()}") context.increment("chain.fetcher.init") apiImpl.buildFetcher(debug = debug) } @@ -370,7 +375,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map } if (debug && shouldSample) { - requests.foreach(request => println(s"request: ${request.keys}, ts: ${request.atMillis}")) + requests.foreach(request => logger.info(s"request: ${request.keys}, ts: ${request.atMillis}")) } val responsesFuture = fetcher.fetchJoin(requests = requests.toSeq) @@ -379,9 +384,9 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map val responses = Await.result(responsesFuture, 5.second) if (debug && shouldSample) { - println(s"responses/request size: ${responses.size}/${requests.size}\n responses: ${responses}") + logger.info(s"responses/request size: ${responses.size}/${requests.size}\n responses: ${responses}") responses.foreach(response => - println( + logger.info( s"request: ${response.request.keys}, ts: ${response.request.atMillis}, values: ${response.values}")) } responses.iterator.map { response => @@ -419,8 +424,8 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map val data = df.collect() val putRequests = data.map(putRequestHelper.toPutRequest) if (debug) { - println(s" Final df size to write: ${data.length}") - println(s" Size of putRequests to kv store- ${putRequests.length}") + logger.info(s" Final df size to write: ${data.length}") + logger.info(s" Size of putRequests to kv store- ${putRequests.length}") } else { putRequests.foreach(request => emitRequestMetric(request, context.withSuffix("egress"))) kvStore.multiPut(putRequests) diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala b/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala index 8f205cfcf..35515a9f5 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.streaming +import org.slf4j.LoggerFactory import ai.chronon.online.{DataStream, StreamBuilder, TopicInfo} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.StreamingQueryListener @@ -26,21 +27,22 @@ import org.apache.spark.sql.streaming.StreamingQueryListener.{ } object KafkaStreamBuilder extends StreamBuilder { + private val logger = LoggerFactory.getLogger(getClass) override def from(topicInfo: TopicInfo)(implicit session: SparkSession, conf: Map[String, String]): DataStream = { val conf = topicInfo.params val bootstrap = conf.getOrElse("bootstrap", conf("host") + conf.get("port").map(":" + _).getOrElse("")) TopicChecker.topicShouldExist(topicInfo.name, bootstrap) session.streams.addListener(new StreamingQueryListener() { override def onQueryStarted(queryStarted: QueryStartedEvent): Unit = { - println("Query started: " + queryStarted.id) + logger.info("Query started: " + queryStarted.id) } override def onQueryTerminated(queryTerminated: QueryTerminatedEvent): Unit = { - println("Query terminated: " + queryTerminated.id) + logger.info("Query terminated: " + queryTerminated.id) } override def onQueryProgress(queryProgress: QueryProgressEvent): Unit = { - println("Query made progress: " + queryProgress.progress) + logger.info("Query made progress: " + queryProgress.progress) } }) val df = session.readStream diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala 
b/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala index 23d200446..ea621d89a 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.streaming +import org.slf4j.LoggerFactory import ai.chronon.online.KVStore.PutRequest import com.yahoo.sketches.kll.KllFloatsSketch import org.apache.commons.io.FileUtils @@ -24,6 +25,7 @@ import java.time.format.DateTimeFormatter import java.time.{Instant, ZoneId, ZoneOffset} class StreamingStats(val publishDelaySeconds: Int) { + private val logger = LoggerFactory.getLogger(getClass) private var latencyHistogram: KllFloatsSketch = new KllFloatsSketch() private var latencyMsTotal: Long = 0 private var writesTotal: Long = 0 @@ -43,7 +45,7 @@ class StreamingStats(val publishDelaySeconds: Int) { val medianLatency = latencyHistogram.getQuantile(.5) val p95Latency = latencyHistogram.getQuantile(.95) val p99Latency = latencyHistogram.getQuantile(.99) - println(s""" + logger.info(s""" |[$threadName][${timeString(utc, now)}] Wrote $writesTotal records in last ${now - startMs} ms. | Latency ms: ${latencyMsTotal / writesTotal} (avg) / $medianLatency (median) / $p95Latency (p95) / $p99Latency (p99) | Key Size: ${keyBytesTotal / writesTotal} bytes (avg) / ${readable(keyBytesTotal)} (total) @@ -56,7 +58,7 @@ class StreamingStats(val publishDelaySeconds: Int) { latencyHistogram = new KllFloatsSketch() startMs = now } else { - println("No writes registered") + logger.info("No writes registered") } } diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala b/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala index d25004575..d99ac61e8 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.streaming +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.BottomK import ai.chronon.api import ai.chronon.api.Extensions.{GroupByOps, SourceOps} @@ -34,6 +35,7 @@ import scala.reflect.ClassTag import scala.util.Try object TopicChecker { + private val logger = LoggerFactory.getLogger(getClass) def getPartitions(topic: String, bootstrap: String): Int = { val props = new Properties() @@ -76,7 +78,7 @@ object TopicChecker { | ------ End ------ |""".stripMargin) } else { - println(s"Found topic $topic in bootstrap $bootstrap.") + logger.info(s"Found topic $topic in bootstrap $bootstrap.") } } catch { case ex: Exception => throw new RuntimeException(s"Failed to check for topic ${topic} in ${bootstrap}", ex) @@ -84,6 +86,7 @@ object TopicChecker { } class Args(arguments: Seq[String]) extends ScallopConf(arguments) { + private val logger = LoggerFactory.getLogger(getClass) val conf: ScallopOption[String] = opt[String](descr = "Conf to pull topic and bootstrap server information") val bootstrap: ScallopOption[String] = opt[String](descr = "Kafka bootstrap server in host:port format") val topic: ScallopOption[String] = opt[String](descr = "kafka topic to check metadata for") @@ -106,7 +109,7 @@ object TopicChecker { } else { args.topic() -> args.bootstrap() } - println(getPartitions(topic, bootstrap)) + logger.info(getPartitions(topic, bootstrap).toString) System.exit(0) } } diff --git a/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala b/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala index 8989f6d2b..98d3ac8af 100644 --- 
a/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.api import ai.chronon.api.{Accuracy, Builders, Operation, TimeUnit, Window} @@ -26,6 +27,7 @@ import org.junit.Assert.assertTrue import org.junit.Test class AnalyzerTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("AnalyzerTest", local = true) private val tableUtils = TableUtils(spark) @@ -68,8 +70,8 @@ class AnalyzerTest { val join = new Join(joinConf = joinConf, endPartition = oneMonthAgo, tableUtils) val computed = join.computeJoin() val expectedSchema = computed.schema.fields.map(field => s"${field.name} => ${field.dataType}").sorted - println("=== expected schema =====") - println(expectedSchema.mkString("\n")) + logger.info("=== expected schema =====") + logger.info(expectedSchema.mkString("\n")) assertTrue(expectedSchema sameElements analyzerSchema) } diff --git a/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala b/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala index 5623d7823..99ee21fa1 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.api import ai.chronon.api.Constants.ChrononMetadataKey @@ -35,11 +36,11 @@ import java.lang import java.util.TimeZone import java.util.concurrent.Executors import scala.collection.Seq -import scala.Console.println import scala.concurrent.ExecutionContext import scala.util.ScalaJavaConversions._ class ChainingFetcherTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) val sessionName = "ChainingFetcherTest" val spark: SparkSession = SparkSessionBuilder.build(sessionName, local = true) private val tableUtils = TableUtils(spark) @@ -103,7 +105,7 @@ class ChainingFetcherTest extends TestCase { .save(s"$namespace.${schema.name}") } - println("saved all data hand written for fetcher test") + logger.info("saved all data hand written for fetcher test") val startPartition = "2021-04-13" val endPartition = "2021-04-16" @@ -169,7 +171,7 @@ class ChainingFetcherTest extends TestCase { ).toList TestUtils.makeDf(spark, searchSchema, searchData).save(s"$namespace.${searchSchema.name}") - println("Created user search table.") + logger.info("Created user search table.") // construct chaining join val startPartition = "2021-04-14" @@ -217,7 +219,7 @@ class ChainingFetcherTest extends TestCase { val joinedDf = new ai.chronon.spark.Join(joinConf, endDs, tableUtils).computeJoin() val joinTable = s"$namespace.join_test_expected_${joinConf.metaData.cleanName}" joinedDf.save(joinTable) - println("=== Expected join table computed: === " + joinTable) + logger.info("=== Expected join table computed: === " + joinTable) joinedDf.show() val endDsExpected = tableUtils.sql(s"SELECT * FROM $joinTable WHERE ds='$endDs'") @@ -230,7 +232,7 @@ class ChainingFetcherTest extends TestCase { s"SELECT * FROM $joinTable WHERE ts >= unix_timestamp('$endDs', '${tableUtils.partitionSpec.format}')") } val endDsQueries = endDsEvents.drop(endDsEvents.schema.fieldNames.filter(_.contains("fetcher")): _*) - 
println("Queries:") + logger.info("Queries:") endDsQueries.show() val keys = joinConf.leftKeyCols @@ -268,7 +270,7 @@ class ChainingFetcherTest extends TestCase { .asInstanceOf[GenericRow] } - println(endDsExpected.schema.pretty) + logger.info(endDsExpected.schema.pretty) (endDsExpected, responseRows) } @@ -286,9 +288,9 @@ class ChainingFetcherTest extends TestCase { if (endDs != today) { responseDf = responseDf.drop("ds").withColumn("ds", lit(endDs)) } - println("expected:") + logger.info("expected:") expectedDf.show() - println("response:") + logger.info("response:") responseDf.show() // remove user during comparison since `user` is not the key @@ -299,9 +301,9 @@ class ChainingFetcherTest extends TestCase { bName = "offline") assertEquals(expectedDf.count(), responseDf.count()) if (diff.count() > 0) { - println(s"Total count: ${responseDf.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows:") + logger.info(s"Total count: ${responseDf.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows:") diff .withTimeBasedColumn("ts_string", "ts", "yy-MM-dd HH:mm") .select("ts_string", diff.schema.fieldNames: _*) diff --git a/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala b/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala index 317320f14..ba02f7335 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.online.DataMetrics import ai.chronon.spark.stats.CompareBaseJob @@ -24,6 +25,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} import org.junit.Test class CompareTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("CompareTest", local = true) private val tableUtils = TableUtils(spark) @@ -62,7 +64,7 @@ class CompareTest { CompareBaseJob.compare(leftDf, rightDf, keys, tableUtils) val metricsDf = metricsKvRdd.toFlatDf metricsDf.show() - println(result) + logger.info(result) assert(result.series.length == 4, "Invalid result length") for (rowIndex <- 0 until leftData.length) { for ((colName, index) <- leftColumns.zipWithIndex) { @@ -99,7 +101,7 @@ class CompareTest { ) val metricsDf = metricsKvRdd.toFlatDf metricsDf.show() - println(result) + logger.info(result) assert(result.series.length == 4, "Invalid result length") for (rowIndex <- 0 until leftData.length) { for ((colName, index) <- leftColumns.zipWithIndex) { diff --git a/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala b/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala index 9bad4e3d7..c05aa74b9 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.{LabelPartOps, MetadataOps} import ai.chronon.api.{Builders, LongType, StringType, StructField, StructType} import ai.chronon.spark.{Comparison, LabelJoin, SparkSessionBuilder, TableUtils} @@ -25,6 +26,7 @@ import org.junit.Assert.assertEquals import org.junit.Test class FeatureWithLabelJoinTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("FeatureWithLabelJoinTest", local = 
true) private val namespace = "final_join" @@ -52,11 +54,11 @@ class FeatureWithLabelJoinTest { val runner = new LabelJoin(joinConf, tableUtils, labelDS) val labelDf = runner.computeLabelJoin() - println(" == First Run Label version 2022-10-30 == ") + logger.info(" == First Run Label version 2022-10-30 == ") prefixColumnName(labelDf, exceptions = labelJoinConf.rowIdentifier(null, tableUtils.partitionColumn)) .show() val featureDf = tableUtils.sparkSession.table(joinConf.metaData.outputTable) - println(" == Features == ") + logger.info(" == Features == ") featureDf.show() val computed = tableUtils.sql(s"select * from ${joinConf.metaData.outputFinalView}") val expectedFinal = featureDf.join(prefixColumnName(labelDf, @@ -69,7 +71,7 @@ class FeatureWithLabelJoinTest { // add another label version val secondRun = new LabelJoin(joinConf, tableUtils, "2022-11-11") val secondLabel = secondRun.computeLabelJoin() - println(" == Second Run Label version 2022-11-11 == ") + logger.info(" == Second Run Label version 2022-11-11 == ") secondLabel.show() val view = tableUtils.sql(s"select * from ${joinConf.metaData.outputFinalView} order by label_ds") view.show() @@ -137,11 +139,11 @@ class FeatureWithLabelJoinTest { val runner = new LabelJoin(joinConf, tableUtils, "2022-10-06") val labelDf = runner.computeLabelJoin() - println(" == Label DF == ") + logger.info(" == Label DF == ") prefixColumnName(labelDf, exceptions = labelJoinConf.rowIdentifier(null, tableUtils.partitionColumn)) .show() val featureDf = tableUtils.sparkSession.table(joinConf.metaData.outputTable) - println(" == Features DF == ") + logger.info(" == Features DF == ") featureDf.show() val computed = tableUtils.sql(s"select * from ${joinConf.metaData.outputFinalView}") val expectedFinal = featureDf.join(prefixColumnName(labelDf, @@ -179,16 +181,16 @@ class FeatureWithLabelJoinTest { } private def assertResult(computed: DataFrame, expected: DataFrame): Unit = { - println(" == Computed == ") + logger.info(" == Computed == ") computed.show() - println(" == Expected == ") + logger.info(" == Expected == ") expected.show() val diff = Comparison.sideBySide(computed, expected, List("listing", "ds", "label_ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) @@ -197,8 +199,8 @@ class FeatureWithLabelJoinTest { private def prefixColumnName(df: DataFrame, prefix: String = "label_", exceptions: Array[String] = null): DataFrame = { - println("exceptions") - println(exceptions.mkString(", ")) + logger.info("exceptions") + logger.info(exceptions.mkString(", ")) val renamedColumns = df.columns .map(col => { if (exceptions.contains(col) || col.startsWith(prefix)) { diff --git a/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala b/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala index b4d5648e3..3d4288d99 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.api import ai.chronon.api.{Accuracy, Builders, Operation, TimeUnit, Window} 
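Every conversion in this patch follows the same shape: a class- or object-level SLF4J logger plus logger.info calls that carry one formatted String. Unlike println, org.slf4j.Logger.info does not accept arbitrary values, and SLF4J loggers are not serializable, which matters for the Spark classes touched here. A minimal sketch of that pattern follows; ExampleJob, report, and their parameters are illustrative names only, not code from this change:

    import org.slf4j.{Logger, LoggerFactory}

    // Hypothetical class, shown only to illustrate the logging pattern applied throughout this patch.
    class ExampleJob extends Serializable {
      // @transient + lazy keeps the non-serializable logger out of serialized Spark closures
      // and re-creates it after deserialization on executors.
      @transient private lazy val logger: Logger = LoggerFactory.getLogger(getClass)

      def report(partitions: Seq[String], rowCount: Long): Unit = {
        // println accepted Any; Logger.info takes a String, so render values explicitly.
        logger.info(s"partitions: ${partitions.mkString(", ")}, rows: $rowCount")
      }
    }

The same constraint is why non-String values such as an Int partition count or a Seq of table names are rendered with .toString or .mkString before being passed to logger.info elsewhere in the patch, and why DataFrame.show(), which prints directly to stdout and returns Unit, is invoked on its own rather than wrapped in a logging call.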
@@ -46,6 +47,7 @@ import scala.concurrent.{Await, ExecutionContext} * Fetch stats. */ class FetchStatsTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("FetchStatsTest", local = true) val tableUtils = TableUtils(spark) @@ -138,17 +140,17 @@ class FetchStatsTest extends TestCase { // Stats fetchStatsSeries(request, mockApi, true) val fetchedSeries = fetchStatsSeries(request, mockApi) - println(gson.toJson(fetchedSeries.values.get)) + logger.info(gson.toJson(fetchedSeries.values.get)) // LogStats fetchLogStatsSeries(request, mockApi, true) val fetchedLogSeries = fetchLogStatsSeries(request, mockApi) - println(gson.toJson(fetchedLogSeries.values.get)) + logger.info(gson.toJson(fetchedLogSeries.values.get)) // Online Offline Consistency fetchOOCSeries(request, mockApi, true) val fetchedOOCSeries = fetchOOCSeries(request, mockApi) - println(gson.toJson(fetchedOOCSeries.values.get)) + logger.info(gson.toJson(fetchedOOCSeries.values.get)) // Appendix: Incremental run to check incremental updates for summary job. OnlineUtils.serveStats(tableUtils, inMemoryKvStore, today, joinConf) @@ -160,7 +162,7 @@ class FetchStatsTest extends TestCase { // Request drifts val driftRequest = StatsRequest(joinConf.metaData.nameToFilePath + "/drift", None, None) val fetchedDriftSeries = fetchStatsSeries(driftRequest, mockApi) - println(gson.toJson(fetchedDriftSeries.values.get)) + logger.info(gson.toJson(fetchedDriftSeries.values.get)) } def fetchStatsSeries(request: StatsRequest, diff --git a/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala b/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala index 9248df979..201e6241f 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.api @@ -38,7 +39,6 @@ import org.junit.Assert.{assertEquals, assertFalse, assertTrue} import java.lang import java.util.TimeZone import java.util.concurrent.Executors -import scala.Console.println import scala.collection.Seq import scala.compat.java8.FutureConverters import scala.concurrent.duration.{Duration, SECONDS} @@ -47,6 +48,7 @@ import scala.io.Source import scala.util.ScalaJavaConversions._ class FetcherTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) val sessionName = "FetcherTest" val spark: SparkSession = SparkSessionBuilder.build(sessionName, local = true) private val tableUtils = TableUtils(spark) @@ -183,7 +185,7 @@ class FetcherTest extends TestCase { .save(s"$namespace.${schema.name}") } - println("saved all data hand written for fetcher test") + logger.info("saved all data hand written for fetcher test") val startPartition = "2021-04-08" val endPartition = "2021-04-10" @@ -459,7 +461,7 @@ class FetcherTest extends TestCase { .withColumn("ts_lagged", laggedResponseDf.col("ts_millis") + lagMs) .withColumn("ts_millis", col("ts_lagged")) .drop("ts_lagged") - println("corrected lagged response") + logger.info("corrected lagged response") correctedLaggedResponse.show() correctedLaggedResponse.save(mockApi.logTable, partitionColumns = Seq(tableUtils.partitionColumn, "name")) @@ -471,14 +473,14 @@ class FetcherTest extends TestCase { // build consistency metrics val consistencyJob = new 
ConsistencyJob(spark, joinConf, today) val metrics = consistencyJob.buildConsistencyMetrics() - println(s"ooc metrics: $metrics".stripMargin) + logger.info(s"ooc metrics: $metrics".stripMargin) OnlineUtils.serveConsistency(tableUtils, inMemoryKvStore, today, joinConf) val fetcher = mockApi.buildFetcher() val consistencyFetch = fetcher.fetchConsistencyMetricsTimeseries(StatsRequest(joinConf.metaData.nameToFilePath, None, None)) val response = Await.result(consistencyFetch, Duration.Inf) val gson = new GsonBuilder().setPrettyPrinting().serializeNulls().create() - println(s""" + logger.info(s""" | | Fetched Consistency Metrics | ${gson.toJson(response.values.get)} @@ -503,7 +505,7 @@ class FetcherTest extends TestCase { .asInstanceOf[GenericRow] } - println(endDsExpected.schema.pretty) + logger.info(endDsExpected.schema.pretty) val keyishColumns = keys.toList ++ List(tableUtils.partitionColumn, Constants.TimeColumn) val responseRdd = tableUtils.sparkSession.sparkContext.parallelize(responseRows.toSeq) @@ -511,19 +513,19 @@ class FetcherTest extends TestCase { if (endDs != today) { responseDf = responseDf.drop("ds").withColumn("ds", lit(endDs)) } - println("expected:") + logger.info("expected:") endDsExpected.show() - println("response:") + logger.info("response:") responseDf.show() val diff = Comparison.sideBySide(responseDf, endDsExpected, keyishColumns, aName = "online", bName = "offline") assertEquals(endDsQueries.count(), responseDf.count()) if (diff.count() > 0) { - println("queries:") + logger.info("queries:") endDsQueries.show() - println(s"Total count: ${responseDf.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows:") + logger.info(s"Total count: ${responseDf.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows:") diff .withTimeBasedColumn("ts_string", "ts", "yy-MM-dd HH:mm") .select("ts_string", diff.schema.fieldNames: _*) @@ -565,14 +567,15 @@ class FetcherTest extends TestCase { val (responses, _) = FetcherTestUtil.joinResponses(spark, Array(request), mockApi) val responseMap = responses.head.values.get - println("====== Empty request response map ======") - println(responseMap) + logger.info("====== Empty request response map ======") + logger.info(responseMap) assertEquals(joinConf.joinParts.size() + joinConf.derivations.toScala.derivationsWithoutStar.size, responseMap.size) assertEquals(responseMap.keys.count(_.endsWith("_exception")), joinConf.joinParts.size()) } } object FetcherTestUtil { + private val logger = LoggerFactory.getLogger(getClass) def joinResponses(spark: SparkSession, requests: Array[Request], mockApi: MockApi, @@ -639,7 +642,7 @@ object FetcherTestUtil { } val fetcherNameString = if (useJavaFetcher) "Java" else "Scala" - println(s""" + logger.info(s""" |Averaging fetching stats for $fetcherNameString Fetcher over ${requests.length} requests $runCount times |with batch size: $chunkSize |average qps: ${qpsSum / runCount} @@ -654,7 +657,7 @@ object FetcherTestUtil { ) } if (samplePercent > 0) { - println(s"logged count: ${loggedDf.count()}") + logger.info(s"logged count: ${loggedDf.count()}") loggedDf.show() } result -> loggedDf diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala index f8c9876b2..835dd5486 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import 
org.slf4j.LoggerFactory import ai.chronon.aggregator.test.{CStream, Column, NaiveAggregator} import ai.chronon.aggregator.windowing.FiveMinuteResolution import ai.chronon.api.Extensions._ @@ -45,6 +46,7 @@ import org.junit.Test import scala.collection.mutable class GroupByTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest", local = true) implicit val tableUtils = TableUtils(spark) @@ -78,7 +80,7 @@ class GroupByTest { val diff = Comparison.sideBySide(actualDf, expectedDf, List("user", tableUtils.partitionColumn)) if (diff.count() > 0) { diff.show() - println("diff result rows") + logger.info("diff result rows") } assertEquals(0, diff.count()) } @@ -130,7 +132,7 @@ class GroupByTest { val diff = Comparison.sideBySide(actualDf, expectedDf, List("user", tableUtils.partitionColumn)) if (diff.count() > 0) { diff.show() - println("diff result rows") + logger.info("diff result rows") } assertEquals(0, diff.count()) } @@ -177,10 +179,10 @@ class GroupByTest { val diff = Comparison.sideBySide(computed, expected, List("user", "ts")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows last_k_test") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows last_k_test") diff.show() diff.rdd.foreach { row => val gson = new Gson() @@ -195,7 +197,7 @@ class GroupByTest { val computedStr = gson.toJson(computed) val expectedStr = gson.toJson(expected) if (computedStr != expectedStr) { - println(s""" + logger.info(s""" |computed [$computedCount]: ${gson.toJson(computed)} |expected [$expectedCount]: ${gson.toJson(expected)} |""".stripMargin) @@ -263,7 +265,7 @@ class GroupByTest { val diff = Comparison.sideBySide(naiveDf, resultDf, List("user", Constants.TimeColumn)) if (diff.count() > 0) { diff.show() - println("diff result rows") + logger.info("diff result rows") } assertEquals(0, diff.count()) } @@ -544,16 +546,16 @@ class GroupByTest { | latestB.listing = COALESCE(C.listing, '--null--') AND latestB.ts = C.ts |""".stripMargin val expectedInputDf = spark.sql(expectedSQL) - println("Expected input DF: ") + logger.info("Expected input DF: ") expectedInputDf.show() - println("Computed input DF: ") + logger.info("Computed input DF: ") newGroupBy.inputDf.show() val diff = Comparison.sideBySide(newGroupBy.inputDf, expectedInputDf, List("listing", "user", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${newGroupBy.inputDf.count()}") - println(s"Expected count: ${expectedInputDf.count()}") - println(s"Diff count: ${diff.count()}") + logger.info(s"Actual count: ${newGroupBy.inputDf.count()}") + logger.info(s"Expected count: ${expectedInputDf.count()}") + logger.info(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala index 59ffa6f20..111aafb8a 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.aggregator.windowing.TsUtils import 
ai.chronon.api.Extensions._ @@ -33,6 +34,7 @@ import scala.concurrent.Await import scala.util.ScalaJavaConversions.{JMapOps, ListOps, MapOps} class GroupByUploadTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByUploadTest", local = true) private val namespace = "group_by_upload_test" @@ -333,8 +335,8 @@ class GroupByUploadTest { null, cRating(4.0, 2.0) ) - println(gson.toJson(categoryRatingResults)) - println(gson.toJson(expectedCategoryRatings)) + logger.info(gson.toJson(categoryRatingResults)) + logger.info(gson.toJson(expectedCategoryRatings)) categoryRatingResults.zip(expectedCategoryRatings).foreach { case (actual, expected) => assertEquals(actual, expected) diff --git a/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala b/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala index f50e27256..455c5ab95 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.api.Constants import ai.chronon.online.KVStore import ai.chronon.online.KVStore.{PutRequest, TimedValue} @@ -29,6 +30,7 @@ import scala.concurrent.Future import scala.util.Try class InMemoryKvStore(tableUtils: () => TableUtils) extends KVStore with Serializable { + private val logger = LoggerFactory.getLogger(getClass) //type aliases for readability type Key = String type Data = Array[Byte] @@ -129,7 +131,7 @@ class InMemoryKvStore(tableUtils: () => TableUtils) extends KVStore with Seriali val value = tableEntry.getValue value.foreach { case (version, data) => - println(s"table: $tableName, key: $key, value: $data, version: $version") + logger.info(s"table: $tableName, key: $key, value: $data, version: $version") } } } @@ -137,6 +139,7 @@ class InMemoryKvStore(tableUtils: () => TableUtils) extends KVStore with Seriali } object InMemoryKvStore { + private val logger = LoggerFactory.getLogger(getClass) val stores: ConcurrentHashMap[String, InMemoryKvStore] = new ConcurrentHashMap[String, InMemoryKvStore] // We would like to create one instance of InMemoryKVStore per executors, but share SparkContext @@ -147,7 +150,7 @@ object InMemoryKvStore { testName, new function.Function[String, InMemoryKvStore] { override def apply(name: String): InMemoryKvStore = { - println(s"Missing in-memory store for name: $name. Creating one") + logger.info(s"Missing in-memory store for name: $name. 
Creating one") new InMemoryKvStore(tableUtils) } } diff --git a/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala b/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala index ce3c7c6a1..8c075619c 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.api.{Constants, StructType} import ai.chronon.online.{AvroConversions, Mutation, SparkConversions} import ai.chronon.online.Extensions.StructTypeOps @@ -34,6 +35,7 @@ import org.apache.spark.sql.{DataFrame, Dataset, Encoder, Encoders, Row, SparkSe import java.util.Base64 class InMemoryStream { + private val logger = LoggerFactory.getLogger(getClass) private def encode(schema: org.apache.avro.Schema)(row: Row): Array[Byte] = { val gr: GenericRecord = new GenericData.Record(schema) @@ -61,7 +63,7 @@ class InMemoryStream { // encode input as avro byte array and insert into memory stream. def getInMemoryStreamDF(spark: SparkSession, inputDf: Dataset[Row]): DataFrame = { val schema: StructType = StructType.from("input", SparkConversions.toChrononSchema(inputDf.schema)) - println(s"Creating in-memory stream with schema: ${SparkConversions.fromChrononSchema(schema).catalogString}") + logger.info(s"Creating in-memory stream with schema: ${SparkConversions.fromChrononSchema(schema).catalogString}") val avroSchema = AvroConversions.fromChrononSchema(schema) import spark.implicits._ val input: MemoryStream[Array[Byte]] = @@ -83,7 +85,7 @@ class InMemoryStream { val inputDf = noDs.selectExpr(baseFields ++ mutationFields: _*) // encode and write - println(s"encoding stream with schema: ${inputDf.schema.catalogString}") + logger.info(s"encoding stream with schema: ${inputDf.schema.catalogString}") inputDf.show() val schema: StructType = StructType.from("input", SparkConversions.toChrononSchema(inputDf.schema)) val avroSchema = AvroConversions.fromChrononSchema(schema) diff --git a/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala b/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala index 2f93e4b96..6460d6fff 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.api import ai.chronon.api.{Accuracy, Builders, Constants, LongType, Operation, StringType, TimeUnit, Window} @@ -35,6 +36,7 @@ import scala.collection.JavaConverters._ import scala.util.ScalaJavaConversions.ListOps class JoinTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build( "JoinTest", @@ -83,7 +85,7 @@ class JoinTest { snapshotTable = dollarTable ) - //println("Rupee Source start partition $month") + //logger.info("Rupee Source start partition $month") val rupeeSource = Builders.Source.entities( query = Builders.Query( @@ -147,7 +149,7 @@ class JoinTest { dropStart, dropEnd ) - println(tableUtils.partitions(s"$namespace.test_user_transaction_features")) + logger.info(tableUtils.partitions(s"$namespace.test_user_transaction_features")) joinConf.joinParts.toScala .map(jp => joinConf.partOutputTable(jp)) @@ -161,7 +163,7 @@ class JoinTest { resetUDFs() val runner2 = new Join(joinConf, end, tableUtils) val computed = runner2.computeJoin(Some(3)) - println(s"join start = $start") + 
logger.info(s"join start = $start") val expectedQuery = s""" |WITH @@ -211,11 +213,11 @@ class JoinTest { val diff = Comparison.sideBySide(computed, expected, List("user_name", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"Queries count: ${queries.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"Queries count: ${queries.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) @@ -229,7 +231,7 @@ class JoinTest { val endMinus2 = tableUtils.partitionSpec.minus(end, new Window(2, TimeUnit.DAYS)) tableUtils.dropPartitionRange(s"$namespace.test_user_transaction_features", endMinus1, endMinus1) - println(tableUtils.partitions(s"$namespace.test_user_transaction_features")) + logger.info(tableUtils.partitions(s"$namespace.test_user_transaction_features")) joinConf.joinParts.asScala .map(jp => joinConf.partOutputTable(jp)) @@ -243,11 +245,11 @@ class JoinTest { val diff2 = Comparison.sideBySide(computed2, expected2, List("user_name", "user", "ts", "ds")) if (diff2.count() > 0) { - println(s"Actual count: ${computed2.count()}") - println(s"Expected count: ${expected2.count()}") - println(s"Diff count: ${diff2.count()}") - println(s"Queries count: ${queries.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed2.count()}") + logger.info(s"Expected count: ${expected2.count()}") + logger.info(s"Diff count: ${diff2.count()}") + logger.info(s"Queries count: ${queries.count()}") + logger.info(s"diff result rows") diff2.show() } assertEquals(0, diff2.count()) @@ -342,18 +344,18 @@ class JoinTest { | AND countries.country = grouped_heights.country """.stripMargin) - println("showing join result") + logger.info("showing join result") computed.show() - println("showing query result") + logger.info("showing query result") expected.show() - println( + logger.info( s"Left side count: ${spark.sql(s"SELECT country, ds from $countryTable where ds >= '$start' and ds <= '$end'").count()}") - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") val diff = Comparison.sideBySide(computed, expected, List("country", "ds")) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -362,14 +364,14 @@ class JoinTest { * should not trigger a backfill and exit the program properly */ - // use console to redirect println message to Java IO + // use console to redirect logger.info message to Java IO val stream = new java.io.ByteArrayOutputStream() Console.withOut(stream) { // rerun the same join job runner.computeJoin(Some(7)) } val stdOutMsg = stream.toString() - println(s"std out message =\n $stdOutMsg") + logger.info(s"std out message =\n $stdOutMsg") // make sure that the program exits with target print statements assertTrue(stdOutMsg.contains(s"There is no data to compute based on end partition of $end.")) } @@ -415,12 +417,12 @@ class JoinTest { val runner = new Join(joinConf, end, tableUtils) val computed = runner.computeJoin(Some(7)) - 
println("showing join result") + logger.info("showing join result") computed.show() val leftSideCount = spark.sql(s"SELECT country, ds from $countryTable where ds == '$end'").count() - println(s"Left side expected count: $leftSideCount") - println(s"Actual count: ${computed.count()}") + logger.info(s"Left side expected count: $leftSideCount") + logger.info(s"Actual count: ${computed.count()}") assertEquals(leftSideCount, computed.count()) // There should be only one partition in computed df which equals to end partition val allPartitions = computed.select("ds").rdd.map(row => row(0)).collect().toSet @@ -491,8 +493,8 @@ class JoinTest { val diff = Comparison.sideBySide(computed, expected, List("item", "ts", "ds")) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -566,8 +568,8 @@ class JoinTest { tableUtils.sql(s"SELECT item, ts, ds from $itemQueriesTable where ds >= '$start' and ds <= '$dayAndMonthBefore'") assertEquals(queriesBare.count(), computed.count()) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff .replaceWithReadableTime(Seq("ts", "a_user_unit_test_item_views_ts_max", "b_user_unit_test_item_views_ts_max"), dropOriginal = true) @@ -587,7 +589,7 @@ class JoinTest { // Run job val itemQueriesTable = s"$namespace.item_queries" - println("Item Queries DF: ") + logger.info("Item Queries DF: ") val q = s""" |SELECT @@ -628,8 +630,8 @@ class JoinTest { tableUtils.sql(s"SELECT item, ts, ds from $itemQueriesTable where ds >= '$start' and ds <= '$dayAndMonthBefore'") assertEquals(queriesBare.count(), computed.count()) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -702,7 +704,7 @@ class JoinTest { None, viewsGroupByCumulative.inferredAccuracy ) - println(renderedIncremental) + logger.info(renderedIncremental) assert(renderedIncremental.contains(s"(ds >= '2021-01-01') AND (ds <= '2021-01-01')")) } @@ -752,7 +754,7 @@ class JoinTest { val runner = new Join(joinConf, end, tableUtils) val computed = runner.computeJoin(Some(7)) - println(s"join start = $start") + logger.info(s"join start = $start") val expected = tableUtils.sql(s""" |WITH | users AS (SELECT user, ds from $usersTable where ds >= '$start' and ds <= '$end'), @@ -770,18 +772,18 @@ class JoinTest { | AND users.ds = grouped_names.ds """.stripMargin) - println("showing join result") + logger.info("showing join result") computed.show() - println("showing query result") + logger.info("showing query result") expected.show() - println( + logger.info( s"Left side count: ${spark.sql(s"SELECT user, ds from $namesTable where ds >= '$start' and ds <= '$end'").count()}") - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") val diff = Comparison.sideBySide(computed, expected, List("user", "ds")) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ 
-805,7 +807,7 @@ class JoinTest { val leftChangeJoin = new Join(joinConf = leftChangeJoinConf, endPartition = dayAndMonthBefore, tableUtils) val leftChangeRecompute = JoinUtils.tablesToRecompute(leftChangeJoinConf, leftChangeJoinConf.metaData.outputTable, tableUtils) - println(leftChangeRecompute) + logger.info(leftChangeRecompute.toString) assertEquals(leftChangeRecompute.size, 3) val partTable = s"${leftChangeJoinConf.metaData.outputTable}_user_unit_test_item_views" assertEquals(leftChangeRecompute, @@ -872,8 +874,8 @@ class JoinTest { tableUtils.sql(s"SELECT item, ts, ds from $itemQueriesTable where ds >= '$start' and ds <= '$dayAndMonthBefore'") assertEquals(queriesBare.count(), computed.count()) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff .replaceWithReadableTime( Seq("ts", "a_user_3_unit_test_item_views_ts_max", "b_user_3_unit_test_item_views_ts_max"), @@ -1006,7 +1008,7 @@ class JoinTest { ) val skipBloomComputed = new Join(joinConf, today, testTableUtils).computeJoin() val leftSideCount = testSpark.sql(s"SELECT item, ts, ds from $itemQueriesTable where ds >= '$start'").count() - println("computed count: " + skipBloomComputed.count()) + logger.info("computed count: " + skipBloomComputed.count()) assertEquals(leftSideCount, skipBloomComputed.count()) } diff --git a/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala b/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala index e9a093e8c..70ac0d5f8 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.api.{Accuracy, Builders, Constants, Operation, TimeUnit, Window} import ai.chronon.spark._ import org.apache.spark.sql.{Row, SparkSession} @@ -23,6 +24,7 @@ import org.junit.Assert.assertEquals import org.junit.Test class LabelJoinTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("LabelJoinTest", local = true) @@ -47,7 +49,7 @@ class LabelJoinTest { ) val runner = new LabelJoin(joinConf, tableUtils, labelDS) val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == Computed == ") + logger.info(" == Computed == ") computed.show() val expected = tableUtils.sql(s""" SELECT v.listing_id as listing, @@ -58,7 +60,7 @@ class LabelJoinTest { LEFT OUTER JOIN label_join.listing_attributes as a ON v.listing_id = a.listing_id WHERE a.ds = '2022-10-30'""".stripMargin) - println(" == Expected == ") + logger.info(" == Expected == ") expected.show() assertEquals(computed.count(), expected.count()) assertEquals(computed.select("label_ds").first().get(0), labelDS) @@ -67,9 +69,9 @@ class LabelJoinTest { expected, List("listing", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) @@ -88,7 +90,7 @@ class LabelJoinTest { ) val runner = new LabelJoin(joinConf, tableUtils, labelDS) val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == Computed == ") + logger.info(" == Computed == ") computed.show() val expected = 
tableUtils.sql(s""" |SELECT listing, @@ -113,16 +115,16 @@ class LabelJoinTest { |) b |ON aa.listing = b.listing_id """.stripMargin) - println(" == Expected == ") + logger.info(" == Expected == ") expected.show() assertEquals(computed.count(), expected.count()) assertEquals(computed.select("label_ds").first().get(0), labelDS) val diff = Comparison.sideBySide(computed, expected, List("listing", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) @@ -141,7 +143,7 @@ class LabelJoinTest { // label ds does not exist in label table, labels should be null val runner = new LabelJoin(joinConf, tableUtils, "2022-11-01") val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == Computed == ") + logger.info(" == Computed == ") computed.show() assertEquals(computed.select("label_ds").first().get(0), "2022-11-01") assertEquals(computed @@ -164,7 +166,7 @@ class LabelJoinTest { val runner = new LabelJoin(joinConf, tableUtils, labelDS) val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == Computed == ") + logger.info(" == Computed == ") computed.show() assertEquals(computed.count(), 6) val computedRows = computed.collect() @@ -175,7 +177,7 @@ class LabelJoinTest { val runner2 = new LabelJoin(joinConf, tableUtils, labelDS) val refreshed = runner2.computeLabelJoin(skipFinalJoin = true) - println(" == Refreshed == ") + logger.info(" == Refreshed == ") refreshed.show() assertEquals(refreshed.count(), 6) val refreshedRows = refreshed.collect() @@ -195,7 +197,7 @@ class LabelJoinTest { ) val runner = new LabelJoin(joinConf, tableUtils, labelDS) val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == First Run == ") + logger.info(" == First Run == ") computed.show() assertEquals(computed.count(), 6) @@ -216,7 +218,7 @@ class LabelJoinTest { ) val runner2 = new LabelJoin(updatedJoinConf, tableUtils, "2022-11-01") val updated = runner2.computeLabelJoin(skipFinalJoin = true) - println(" == Updated Run == ") + logger.info(" == Updated Run == ") updated.show() assertEquals(updated.count(), 12) assertEquals(updated.where(updated("label_ds") === "2022-11-01").count(), 6) @@ -337,7 +339,7 @@ class LabelJoinTest { ) val runner = new LabelJoin(joinConf, tableUtils, "2022-10-06") val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == computed == ") + logger.info(" == computed == ") computed.show() val expected = tableUtils.sql( @@ -354,16 +356,16 @@ class LabelJoinTest { | WHERE v.ds == '2022-10-02' | GROUP BY v.listing_id, v.ds) |""".stripMargin) - println(" == Expected == ") + logger.info(" == Expected == ") expected.show() assertEquals(computed.count(), expected.count()) assertEquals(computed.select("label_ds").first().get(0), "2022-10-06") val diff = Comparison.sideBySide(computed, expected, List("listing", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) @@ -394,7 +396,7 @@ class LabelJoinTest { val today = 
tableUtils.partitionSpec.at(now) val runner = new LabelJoin(joinConf, tableUtils, today) val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == computed == ") + logger.info(" == computed == ") computed.show() // For window based label, given specific label_ds and window, only one ds will be updated with label. @@ -414,13 +416,13 @@ class LabelJoinTest { | WHERE v.ds == DATE_SUB(from_unixtime(round($now / 1000), 'yyyy-MM-dd'), 4) | GROUP BY v.listing_id, v.ds) |""".stripMargin) - println(" == Expected == ") + logger.info(" == Expected == ") expected.show() val diff = Comparison.sideBySide(computed, expected, List("listing_id", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) diff --git a/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala b/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala index a9ec34db1..5f5b49cf6 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.api import ai.chronon.spark.Extensions._ @@ -30,6 +31,7 @@ import scala.io.Source import java.io.File class MetadataExporterTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) val sessionName = "MetadataExporter" val spark: SparkSession = SparkSessionBuilder.build(sessionName, local = true) @@ -39,22 +41,22 @@ class MetadataExporterTest extends TestCase { val directory = new File(directoryPath) if (directory.exists && directory.isDirectory) { - println("Valid Directory") + logger.info("Valid Directory") val files = directory.listFiles for (file <- files) { - println(file.getPath) + logger.info(file.getPath) if (file.isFile) { - println(s"File: ${file.getName}") + logger.info(s"File: ${file.getName}") val source = Source.fromFile(file) val fileContents = source.getLines.mkString("\n") source.close() - println(fileContents) - println("----------------------------------------") + logger.info(fileContents) + logger.info("----------------------------------------") } } } else { - println("Invalid directory path!") + logger.info("Invalid directory path!") } } diff --git a/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala b/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala index 5745f35e4..b704ffe32 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.{GroupByOps, SourceOps} import ai.chronon.api.{Constants, StructType} import ai.chronon.online.Fetcher.Response @@ -38,6 +39,7 @@ import scala.util.ScalaJavaConversions.{IteratorOps, JListOps, JMapOps} import scala.util.Success class MockDecoder(inputSchema: StructType) extends StreamDecoder { + private val logger = LoggerFactory.getLogger(getClass) private def byteArrayToAvro(avro: Array[Byte], schema: Schema): GenericRecord = { val reader = new SpecificDatumReader[GenericRecord](schema) @@ -65,9 +67,10 @@ class MockDecoder(inputSchema: 
StructType) extends StreamDecoder { } class MockStreamBuilder extends StreamBuilder { + private val logger = LoggerFactory.getLogger(getClass) override def from(topicInfo: TopicInfo)(implicit session: SparkSession, props: Map[String, String]): DataStream = { val tableUtils = TableUtils(session) - println(s"""building stream from topic: ${topicInfo.name}""") + logger.info(s"""building stream from topic: ${topicInfo.name}""") val ds = topicInfo.params("ds") val df = tableUtils.sql(s"select * from ${topicInfo.name} where ds >= '$ds'") val encodedDf = (new InMemoryStream).getContinuousStreamDF(session, df.drop("ds")) @@ -77,7 +80,9 @@ class MockStreamBuilder extends StreamBuilder { } class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { + private val logger = LoggerFactory.getLogger(getClass) class PlusOneExternalHandler extends ExternalSourceHandler { + private val logger = LoggerFactory.getLogger(getClass) override def fetch(requests: collection.Seq[Fetcher.Request]): Future[collection.Seq[Fetcher.Response]] = { Future( requests.map(req => @@ -87,6 +92,7 @@ class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { } class AlwaysFailsHandler extends JavaExternalSourceHandler { + private val logger = LoggerFactory.getLogger(getClass) override def fetchJava(requests: util.List[JavaRequest]): CompletableFuture[util.List[JavaResponse]] = { CompletableFuture.completedFuture[util.List[JavaResponse]]( requests @@ -106,6 +112,7 @@ class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { } class JavaPlusOneExternalHandler extends JavaExternalSourceHandler { + private val logger = LoggerFactory.getLogger(getClass) override def fetchJava(requests: util.List[JavaRequest]): CompletableFuture[util.List[JavaResponse]] = { CompletableFuture.completedFuture( requests @@ -132,7 +139,7 @@ class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { new ConcurrentLinkedQueue[LoggableResponseBase64] override def streamDecoder(parsedInfo: GroupByServingInfoParsed): StreamDecoder = { - println( + logger.info( s"decoding stream ${parsedInfo.groupBy.streamingSource.get.topic} with " + s"schema: ${SparkConversions.fromChrononSchema(parsedInfo.streamChrononSchema).catalogString}") new MockDecoder(parsedInfo.streamChrononSchema) diff --git a/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala b/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala index e061856bf..5bd56c710 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.api @@ -32,6 +33,7 @@ import org.junit.Test * Join is the events and the entity value at the exact timestamp of the ts. 
*/ class MutationsTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build( "MutationsTest", @@ -124,15 +126,15 @@ class MutationsTest { } val joinRdd = expectedRdd.join(computedRdd) if (totalExpectedRows == joinRdd.count()) return true - println("Failed to assert equality!") - println("== Joined RDD (listing_id, ts, rating_average)") + logger.info("Failed to assert equality!") + logger.info("== Joined RDD (listing_id, ts, rating_average)") val readableRDD = joinRdd.map { case ((id, ts, event, avg, ds), _) => Row(id, ts, event, avg, ds) } spark.createDataFrame(readableRDD, expectedSchema).show() - println("== Expected") + logger.info("== Expected") df.replaceWithReadableTime(Seq("ts"), false).show() - println("== Computed") + logger.info("== Computed") computed.replaceWithReadableTime(Seq("ts"), false).show() false } @@ -990,15 +992,15 @@ class MutationsTest { val expected = computeSimpleAverageThroughSql(testNamespace) val diff = Comparison.sideBySide(result, expected, List("listing_id", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${result.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${result.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() val recomputedResult = computeJoinFromTables(suffix, minDs, maxDs, null, Operation.AVERAGE) val recomputedDiff = Comparison.sideBySide(recomputedResult, expected, List("listing_id", "ts", "ds")) - println("Checking second run of the same data.") - println(s"recomputed diff result rows") + logger.info("Checking second run of the same data.") + logger.info(s"recomputed diff result rows") recomputedDiff.show() assert(recomputedDiff.count() == 0) } diff --git a/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala b/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala index c828cdc17..53a449306 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.api.Extensions._ import ai.chronon.api._ @@ -26,6 +27,7 @@ import org.junit.Assert.assertEquals import org.junit.Test class StagingQueryTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("StagingQueryTest", local = true) implicit private val tableUtils: TableUtils = TableUtils(spark) @@ -44,7 +46,7 @@ class StagingQueryTest { val df = DataFrameGen .events(spark, schema, count = 100000, partitions = 100) .dropDuplicates("ts") // duplicates can create issues in comparisons - println("Generated staging query data:") + logger.info("Generated staging query data:") df.show() val viewName = s"$namespace.test_staging_query_compare" df.save(viewName) @@ -67,12 +69,12 @@ class StagingQueryTest { val computed = tableUtils.sql(s"select * from ${stagingQueryConf.metaData.outputTable} WHERE user IS NOT NULL") val diff = Comparison.sideBySide(expected, computed, List("user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${expected.count()}") - println(expected.show()) - println(s"Computed count: ${computed.count()}") - println(computed.show()) - println(s"Diff count: 
${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${expected.count()}") + logger.info(expected.show()) + logger.info(s"Computed count: ${computed.count()}") + logger.info(computed.show()) + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) @@ -91,7 +93,7 @@ class StagingQueryTest { val df = DataFrameGen .events(spark, schema, count = 30, partitions = 8) .dropDuplicates("ts") // duplicates can create issues in comparisons - println("Generated staging query data:") + logger.info("Generated staging query data:") df.show() val viewName = s"$namespace.test_staging_query_view" df.save(viewName) @@ -141,12 +143,12 @@ class StagingQueryTest { val diffV2 = Comparison.sideBySide(expectedUpdated, computedUpdated, List("user", "ts", "ds")) if (diffV2.count() > 0) { - println(s"Actual count: ${expectedUpdated.count()}") - println(expectedUpdated.show()) - println(s"Computed count: ${computedUpdated.count()}") - println(computedUpdated.show()) - println(s"Diff count: ${diffV2.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${expectedUpdated.count()}") + logger.info(expectedUpdated.show()) + logger.info(s"Computed count: ${computedUpdated.count()}") + logger.info(computedUpdated.show()) + logger.info(s"Diff count: ${diffV2.count()}") + logger.info(s"diff result rows") diffV2.show() } assertEquals(0, diffV2.count()) @@ -197,12 +199,12 @@ class StagingQueryTest { |""".stripMargin) val diff = Comparison.sideBySide(expected, computed, List("user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${expected.count()}") - println(expected.show()) - println(s"Computed count: ${computed.count()}") - println(computed.show()) - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${expected.count()}") + logger.info(expected.show()) + logger.info(s"Computed count: ${computed.count()}") + logger.info(computed.show()) + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) @@ -249,12 +251,12 @@ class StagingQueryTest { |""".stripMargin) val diff = Comparison.sideBySide(expected, computed, List("user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${expected.count()}") - println(expected.show()) - println(s"Computed count: ${computed.count()}") - println(computed.show()) - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${expected.count()}") + logger.info(expected.show()) + logger.info(s"Computed count: ${computed.count()}") + logger.info(computed.show()) + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) diff --git a/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala b/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala index bf724d130..d079260bc 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala @@ -15,6 +15,7 @@ */ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.StatsGenerator import ai.chronon.aggregator.test.Column import ai.chronon.api._ @@ -27,6 +28,7 @@ import ai.chronon.spark.stats.StatsCompute import org.apache.spark.sql.functions.lit class StatsComputeTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession 
= SparkSessionBuilder.build("StatsComputeTest", local = true) implicit val tableUtils = TableUtils(spark) val namespace: String = "stats_compute_test" @@ -85,18 +87,18 @@ class StatsComputeTest { StructType.from("generatedTest", toChrononSchema(stats.selectedDf.schema))) val daily = stats.dailySummary(aggregator, timeBucketMinutes = 0).toFlatDf - println("Daily Stats") + logger.info("Daily Stats") daily.show() val bucketed = stats .dailySummary(aggregator) .toFlatDf .replaceWithReadableTime(Seq(Constants.TimeColumn), false) - println("Bucketed Stats") + logger.info("Bucketed Stats") bucketed.show() val denormalized = stats.addDerivedMetrics(bucketed, aggregator) - println("With Derived Data") + logger.info("With Derived Data") denormalized.show(truncate = false) } @@ -115,15 +117,15 @@ class StatsComputeTest { StructType.from("noTsTest", toChrononSchema(stats.selectedDf.schema))) val daily = stats.dailySummary(aggregator, timeBucketMinutes = 0).toFlatDf - println("Daily Stats") + logger.info("Daily Stats") daily.show() val bucketed = stats.dailySummary(aggregator).toFlatDf - println("Bucketed Stats") + logger.info("Bucketed Stats") bucketed.show() val denormalized = stats.addDerivedMetrics(bucketed, aggregator) - println("With Derived Data") + logger.info("With Derived Data") denormalized.show(truncate = false) } @@ -147,18 +149,18 @@ class StatsComputeTest { StructType.from("byteTest", toChrononSchema(stats.selectedDf.schema))) val daily = stats.dailySummary(aggregator, timeBucketMinutes = 0).toFlatDf - println("Daily Stats") + logger.info("Daily Stats") daily.show() val bucketed = stats .dailySummary(aggregator) .toFlatDf .replaceWithReadableTime(Seq(Constants.TimeColumn), false) - println("Bucketed Stats") + logger.info("Bucketed Stats") bucketed.show() val denormalized = stats.addDerivedMetrics(bucketed, aggregator) - println("With Derived Data") + logger.info("With Derived Data") denormalized.show(truncate = false) } } diff --git a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala index c34c85b41..7e74cf0de 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test.bootstrap +import org.slf4j.LoggerFactory import ai.chronon.api.Builders.Derivation import ai.chronon.api.Extensions._ import ai.chronon.api._ @@ -35,6 +36,7 @@ import scala.concurrent.duration.Duration import scala.util.ScalaJavaConversions.JListOps class DerivationTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("DerivationTest", local = true) private val tableUtils = TableUtils(spark) @@ -277,10 +279,10 @@ class DerivationTest { val diff = Comparison.sideBySide(computed, expected, List("request_id", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } @@ -351,10 +353,10 @@ class DerivationTest { val diff = Comparison.sideBySide(outputDf, bootstrapDf, List("request_id", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: 
${outputDf.count()}") - println(s"Expected count: ${bootstrapDf.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${outputDf.count()}") + logger.info(s"Expected count: ${bootstrapDf.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } @@ -485,10 +487,10 @@ class DerivationTest { val diff = Comparison.sideBySide(computedDf, expectedDf, List("request_id", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computedDf.count()}") - println(s"Expected count: ${expectedDf.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computedDf.count()}") + logger.info(s"Expected count: ${expectedDf.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } @@ -656,10 +658,10 @@ class DerivationTest { val diff = Comparison.sideBySide(actualDf, expectedDf, List("user", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${actualDf.count()}") - println(s"Expected count: ${expectedDf.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${actualDf.count()}") + logger.info(s"Expected count: ${expectedDf.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) diff --git a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala index 099c0d566..71337c0e1 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test.bootstrap +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions._ import ai.chronon.api._ import ai.chronon.online.Fetcher.Request @@ -33,6 +34,7 @@ import scala.concurrent.duration.Duration import scala.util.ScalaJavaConversions._ class LogBootstrapTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("BootstrapTest", local = true) val namespace = "test_log_bootstrap" @@ -170,17 +172,17 @@ class LogBootstrapTest { val computed = joinJob.computeJoin() val overlapCount = baseOutput.join(logDf, Seq("request_id", "ds")).count() - println(s"""Debugging information: + logger.info(s"""Debugging information: |base count: ${baseOutput.count()} |overlap keys between base and log: ${overlapCount} |""".stripMargin) val diff = Comparison.sideBySide(computed, expected, List("request_id", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } diff --git a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala index f693734b3..f30f0b20d 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala @@ -16,6 +16,7 @@ package 
ai.chronon.spark.test.bootstrap +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.JoinOps import ai.chronon.api._ import ai.chronon.spark.Extensions._ @@ -28,6 +29,7 @@ import org.junit.Test import scala.util.ScalaJavaConversions.JListOps class TableBootstrapTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("BootstrapTest", local = true) private val tableUtils = TableUtils(spark) @@ -136,7 +138,7 @@ class TableBootstrapTest { val overlapBaseBootstrap1 = baseOutput.join(bootstrapDf1, Seq("request_id", "ds")).count() val overlapBaseBootstrap2 = baseOutput.join(bootstrapDf2, Seq("request_id", "ds")).count() val overlapBootstrap12 = bootstrapDf1.join(bootstrapDf2, Seq("request_id", "ds")).count() - println(s"""Debug information: + logger.info(s"""Debug information: |base count: ${baseOutput.count()} |overlap keys between base and bootstrap1 count: ${overlapBaseBootstrap1} |overlap keys between base and bootstrap2 count: ${overlapBaseBootstrap2} @@ -145,10 +147,10 @@ class TableBootstrapTest { val diff = Comparison.sideBySide(computed, expected, List("request_id", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } diff --git a/test/HopsAggregator.scala b/test/HopsAggregator.scala new file mode 100644 index 000000000..6cfdfa3ce --- /dev/null +++ b/test/HopsAggregator.scala @@ -0,0 +1,165 @@ +/* + * Copyright (C) 2023 The Chronon Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.chronon.aggregator.windowing + +import org.slf4j.LoggerFactory +import ai.chronon.aggregator.row.RowAggregator +import ai.chronon.aggregator.windowing.HopsAggregator._ +import ai.chronon.api.Extensions.{AggregationOps, AggregationsOps, WindowOps, WindowUtils} +import ai.chronon.api.{Aggregation, DataType, Row} + +import scala.collection.Seq +import java.util + +// generate hops per spec, (NOT per window) for the given hop sizes in the resolution +// we use minQueryTs to construct only the relevant hops for a given hop size. 
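The header comment above, together with the leftBoundaries computation further down in this file, says that hops of a given size are only materialized from round(minQueryTs, hopSize) minus the largest window whose tail uses that hop, and that smaller hops additionally reach back to the rounded boundary of the next larger hop so they can fill in window heads. A minimal, self-contained sketch of that arithmetic follows, with assumed hop sizes (1 day / 1 hour / 5 minutes) and an assumed window-to-tail-hop mapping (180 days -> daily, 5 hours -> hourly) standing in for the real Resolution logic:

// Left-boundary sketch: per hop size, take the earlier of
//   (a) round(minQueryTs, hop) - largest window tailing on this hop, and
//   (b) round(minQueryTs, nextLargerHop), needed to build heads of larger-tailed windows.
object LeftBoundarySketch {
  private val Minute = 60L * 1000
  private val Hour   = 60 * Minute
  private val Day    = 24 * Hour

  private def round(ts: Long, hop: Long): Long = (ts / hop) * hop

  def main(args: Array[String]): Unit = {
    val hopSizes = Array(Day, Hour, 5 * Minute) // largest to smallest
    // assumed window -> tail-hop assignment, not Resolution.calculateTailHop
    val hopToMaxWindow = Map(Day -> 180 * Day, Hour -> 5 * Hour)
    val minQueryTs = System.currentTimeMillis()

    val leftBoundaries: Array[Option[Long]] = hopSizes.zipWithIndex.map {
      case (hop, idx) =>
        val windowBased = hopToMaxWindow.get(hop).map(round(minQueryTs, hop) - _)
        val largerWindowBased = if (idx == 0) None else Some(round(minQueryTs, hopSizes(idx - 1)))
        (windowBased ++ largerWindowBased).reduceOption((a: Long, b: Long) => math.min(a, b))
    }

    hopSizes.zip(leftBoundaries).foreach {
      case (hop, bound) => println(s"hopMillis=$hop leftBoundary=$bound")
    }
  }
}

In the real class, a hop size larger than the largest tail hop resolves to None and update skips it entirely via leftBoundaries(i).exists(row.ts >= _).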
+// 180day window , 5hr window (headStart(minTs) - 5hrs, maxTs) +// daily aggregates (headStart(minTs) - 180days, maxTs), +// t +class HopsAggregatorBase(aggregations: Seq[Aggregation], inputSchema: Seq[(String, DataType)], resolution: Resolution) + extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) + + @transient lazy val rowAggregator = + new RowAggregator(inputSchema, aggregations.flatMap(_.unWindowed)) + val hopSizes: Array[Long] = resolution.hopSizes + + def init(): IrMapType = + Array.fill(hopSizes.length)(new java.util.HashMap[Long, HopIr]) + + @transient + lazy val javaBuildHop: java.util.function.Function[Long, HopIr] = new java.util.function.Function[Long, HopIr] { + override def apply(ts: Long): HopIr = { + val v = new Array[Any](rowAggregator.length + 1) + v.update(rowAggregator.length, ts) + v + } + } + + // Zero-copy merging + // NOTE: inputs will be mutated in the process, use "clone" if you want re-use references + def merge(leftHops: IrMapType, rightHops: IrMapType): IrMapType = { + if (leftHops == null) return rightHops + if (rightHops == null) return leftHops + for (i <- hopSizes.indices) { // left and right will be same size + val leftMap = leftHops(i) + val rightIter = rightHops(i).entrySet().iterator() + while (rightIter.hasNext) { + val entry = rightIter.next() + val hopStart = entry.getKey + val rightIr = entry.getValue + if (rightIr != null) + leftMap.put(hopStart, rowAggregator.merge(leftMap.get(hopStart), rightIr)) + } + } + leftHops + } + + // hops have timestamps attached to the end. + // order by hopStart + @transient lazy val arrayOrdering: Ordering[HopIr] = new Ordering[HopIr] { + override def compare(x: HopIr, y: HopIr): Int = + Ordering[Long] + .compare(x.last.asInstanceOf[Long], y.last.asInstanceOf[Long]) + } + + def toTimeSortedArray(hopMaps: IrMapType): OutputArrayType = + hopMaps.map { m => + val resultIt = m.values.iterator() + val result = new Array[HopIr](m.size()) + for (i <- 0 until m.size()) { + result.update(i, resultIt.next()) + } + util.Arrays.sort(result, arrayOrdering) + result + } +} + +// HopsAggregatorBase + update method +class HopsAggregator(minQueryTs: Long, + aggregations: Seq[Aggregation], + inputSchema: Seq[(String, DataType)], + resolution: Resolution) + extends HopsAggregatorBase(aggregations, inputSchema, resolution) { + + val leftBoundaries: Array[Option[Long]] = { + // Nikhil is pretty confident we won't call this when aggregations is empty + val allWindows = aggregations.allWindowsOpt.get + .map { window => + Option(window).getOrElse(WindowUtils.Unbounded) + } // agg.windows(i) = Null => one of the windows is "unwindowed" + + // Use the max window for a given tail hop to determine + // from where(leftBoundary) a particular hops size is relevant + val hopSizeToMaxWindow = + allWindows + .groupBy(resolution.calculateTailHop) + .mapValues(_.map(_.millis).max) + + val maxHopSize = resolution.calculateTailHop(allWindows.maxBy(_.millis)) + + val result: Array[Option[Long]] = resolution.hopSizes.indices.map { hopIndex => + val hopSize = resolution.hopSizes(hopIndex) + // for windows with this hop as the tail hop size + val windowBasedLeftBoundary = hopSizeToMaxWindow.get(hopSize).map(TsUtils.round(minQueryTs, hopSize) - _) + // for windows larger with tail hop larger than this hop + val largerWindowBasedLeftBoundary = if (hopIndex == 0) { // largest window already + None + } else { // smaller hop is only used to construct windows' head with larger hopsize. 
+ val previousHopSize = resolution.hopSizes(hopIndex - 1) + Some(TsUtils.round(minQueryTs, previousHopSize)) + } + if (hopSize > maxHopSize) { // this hop size is not relevant + None + } else { + (windowBasedLeftBoundary ++ largerWindowBasedLeftBoundary).reduceOption((a: Long, b: Long) => + Ordering[Long].min(a, b)) + } + }.toArray + + val readableHopSizes = resolution.hopSizes.map(WindowUtils.millisToString) + val readableLeftBounds = result.map(_.map(TsUtils.toStr).getOrElse("unused")) + val readableHopsToBoundsMap = readableHopSizes + .zip(readableLeftBounds) + .map { case (hop, left) => s"$hop->$left" } + .mkString(", ") + logger.info(s"""Left bounds: $readableHopsToBoundsMap + |minQueryTs = ${TsUtils.toStr(minQueryTs)}""".stripMargin) + result + } + + // used to collect hops of various sizes in a single pass of input rows + def update(hopMaps: IrMapType, row: Row): IrMapType = { + for (i <- hopSizes.indices) { + if (leftBoundaries(i).exists(row.ts >= _)) { // left inclusive + val hopStart = TsUtils.round(row.ts, hopSizes(i)) + val hopIr = hopMaps(i).computeIfAbsent(hopStart, javaBuildHop) + rowAggregator.update(hopIr, row) + } + } + hopMaps + } +} + +object HopsAggregator { + // [IR1, IR2, IR3,.... IRN, ts_millis_long] + // hops have timestamps attached to the end + type HopIr = Array[Any] + type OutputArrayType = Array[Array[HopIr]] + type IrMapType = Array[java.util.HashMap[Long, HopIr]] + +} diff --git a/test/log.py b/test/log.py new file mode 100644 index 000000000..f2da8101e --- /dev/null +++ b/test/log.py @@ -0,0 +1,68 @@ +import os +import re + +def process_file(file_path): + with open(file_path, 'r') as file: + lines = file.readlines() + + # Regex to match class or object (excluding case classes) definitions + object_or_class_regex = re.compile(r'\b(? 
0 or not lines[i].strip().endswith('}')): + if '{' in lines[i]: + brace_count += lines[i].count('{') + if '}' in lines[i]: + brace_count -= lines[i].count('}') + i += 1 + end_index = i + + # Check if this block contains println + block_lines = lines[start_index:end_index] + if any(println_regex.search(line) for line in block_lines): + need_import = True + # Find the opening brace and insert logger after all header lines + brace_index = next((j for j, line in enumerate(block_lines) if '{' in line), len(block_lines) - 1) + block_lines.insert(brace_index + 1, logger_instance) + # Replace println within this block + block_lines = [line.replace('println', 'logger.info') for line in block_lines] + lines[start_index:end_index] = block_lines + else: + i += 1 + + # Add import statement if needed + if need_import: + import_index = next((i for i, line in enumerate(lines) if line.startswith('import')), 0) + if not any(line.startswith('import org.slf4j.LoggerFactory') for line in lines): + lines.insert(import_index, import_statement) + + # Write the updated content + updated_content = ''.join(lines) + with open(file_path, 'w') as file: + file.write(updated_content) + +def search_and_replace(directory): + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.scala'): + process_file(os.path.join(root, file)) + +# Run from the current directory +current_directory = os.getcwd() +search_and_replace(current_directory) + +print("Replacement complete.") From 12ed49e10b556940e7550a3f43d7d730185f91b7 Mon Sep 17 00:00:00 2001 From: Varant Zanoyan Date: Tue, 5 Dec 2023 14:12:20 -0800 Subject: [PATCH 2/7] WIP --- test/HopsAggregator.scala | 165 -------------------------------------- test/log.py | 68 ---------------- 2 files changed, 233 deletions(-) delete mode 100644 test/HopsAggregator.scala delete mode 100644 test/log.py diff --git a/test/HopsAggregator.scala b/test/HopsAggregator.scala deleted file mode 100644 index 6cfdfa3ce..000000000 --- a/test/HopsAggregator.scala +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (C) 2023 The Chronon Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package ai.chronon.aggregator.windowing - -import org.slf4j.LoggerFactory -import ai.chronon.aggregator.row.RowAggregator -import ai.chronon.aggregator.windowing.HopsAggregator._ -import ai.chronon.api.Extensions.{AggregationOps, AggregationsOps, WindowOps, WindowUtils} -import ai.chronon.api.{Aggregation, DataType, Row} - -import scala.collection.Seq -import java.util - -// generate hops per spec, (NOT per window) for the given hop sizes in the resolution -// we use minQueryTs to construct only the relevant hops for a given hop size. 
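The test/log.py helper added above scans Scala sources for class and object definitions (case classes excluded), inserts an slf4j logger val right after the opening brace of any block that contains println, rewrites those println calls to logger.info, and adds the LoggerFactory import when it is missing. A hedged sketch of the end state it aims for, on a hypothetical class that is not part of this patch:

import org.slf4j.LoggerFactory

// Hypothetical post-rewrite source: slf4j import at the top, a logger val right
// after the class header, println swapped for logger.info.
class ExampleJob {
  private val logger = LoggerFactory.getLogger(getClass)

  def run(rows: Seq[Int]): Unit = {
    // was: println(s"processing ${rows.size} rows")
    logger.info(s"processing ${rows.size} rows")
  }
}

The swap is only safe when the println argument is a String: println accepts Any, but slf4j's info takes a String message, so rewrites like logger.info(expected.show()) in the test hunks above stop compiling (show() returns Unit), which is what the .mkString(",") change in the Fetcher hunk below and the reverts in the later patches address.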
-// 180day window , 5hr window (headStart(minTs) - 5hrs, maxTs) -// daily aggregates (headStart(minTs) - 180days, maxTs), -// t -class HopsAggregatorBase(aggregations: Seq[Aggregation], inputSchema: Seq[(String, DataType)], resolution: Resolution) - extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) - - @transient lazy val rowAggregator = - new RowAggregator(inputSchema, aggregations.flatMap(_.unWindowed)) - val hopSizes: Array[Long] = resolution.hopSizes - - def init(): IrMapType = - Array.fill(hopSizes.length)(new java.util.HashMap[Long, HopIr]) - - @transient - lazy val javaBuildHop: java.util.function.Function[Long, HopIr] = new java.util.function.Function[Long, HopIr] { - override def apply(ts: Long): HopIr = { - val v = new Array[Any](rowAggregator.length + 1) - v.update(rowAggregator.length, ts) - v - } - } - - // Zero-copy merging - // NOTE: inputs will be mutated in the process, use "clone" if you want re-use references - def merge(leftHops: IrMapType, rightHops: IrMapType): IrMapType = { - if (leftHops == null) return rightHops - if (rightHops == null) return leftHops - for (i <- hopSizes.indices) { // left and right will be same size - val leftMap = leftHops(i) - val rightIter = rightHops(i).entrySet().iterator() - while (rightIter.hasNext) { - val entry = rightIter.next() - val hopStart = entry.getKey - val rightIr = entry.getValue - if (rightIr != null) - leftMap.put(hopStart, rowAggregator.merge(leftMap.get(hopStart), rightIr)) - } - } - leftHops - } - - // hops have timestamps attached to the end. - // order by hopStart - @transient lazy val arrayOrdering: Ordering[HopIr] = new Ordering[HopIr] { - override def compare(x: HopIr, y: HopIr): Int = - Ordering[Long] - .compare(x.last.asInstanceOf[Long], y.last.asInstanceOf[Long]) - } - - def toTimeSortedArray(hopMaps: IrMapType): OutputArrayType = - hopMaps.map { m => - val resultIt = m.values.iterator() - val result = new Array[HopIr](m.size()) - for (i <- 0 until m.size()) { - result.update(i, resultIt.next()) - } - util.Arrays.sort(result, arrayOrdering) - result - } -} - -// HopsAggregatorBase + update method -class HopsAggregator(minQueryTs: Long, - aggregations: Seq[Aggregation], - inputSchema: Seq[(String, DataType)], - resolution: Resolution) - extends HopsAggregatorBase(aggregations, inputSchema, resolution) { - - val leftBoundaries: Array[Option[Long]] = { - // Nikhil is pretty confident we won't call this when aggregations is empty - val allWindows = aggregations.allWindowsOpt.get - .map { window => - Option(window).getOrElse(WindowUtils.Unbounded) - } // agg.windows(i) = Null => one of the windows is "unwindowed" - - // Use the max window for a given tail hop to determine - // from where(leftBoundary) a particular hops size is relevant - val hopSizeToMaxWindow = - allWindows - .groupBy(resolution.calculateTailHop) - .mapValues(_.map(_.millis).max) - - val maxHopSize = resolution.calculateTailHop(allWindows.maxBy(_.millis)) - - val result: Array[Option[Long]] = resolution.hopSizes.indices.map { hopIndex => - val hopSize = resolution.hopSizes(hopIndex) - // for windows with this hop as the tail hop size - val windowBasedLeftBoundary = hopSizeToMaxWindow.get(hopSize).map(TsUtils.round(minQueryTs, hopSize) - _) - // for windows larger with tail hop larger than this hop - val largerWindowBasedLeftBoundary = if (hopIndex == 0) { // largest window already - None - } else { // smaller hop is only used to construct windows' head with larger hopsize. 
- val previousHopSize = resolution.hopSizes(hopIndex - 1) - Some(TsUtils.round(minQueryTs, previousHopSize)) - } - if (hopSize > maxHopSize) { // this hop size is not relevant - None - } else { - (windowBasedLeftBoundary ++ largerWindowBasedLeftBoundary).reduceOption((a: Long, b: Long) => - Ordering[Long].min(a, b)) - } - }.toArray - - val readableHopSizes = resolution.hopSizes.map(WindowUtils.millisToString) - val readableLeftBounds = result.map(_.map(TsUtils.toStr).getOrElse("unused")) - val readableHopsToBoundsMap = readableHopSizes - .zip(readableLeftBounds) - .map { case (hop, left) => s"$hop->$left" } - .mkString(", ") - logger.info(s"""Left bounds: $readableHopsToBoundsMap - |minQueryTs = ${TsUtils.toStr(minQueryTs)}""".stripMargin) - result - } - - // used to collect hops of various sizes in a single pass of input rows - def update(hopMaps: IrMapType, row: Row): IrMapType = { - for (i <- hopSizes.indices) { - if (leftBoundaries(i).exists(row.ts >= _)) { // left inclusive - val hopStart = TsUtils.round(row.ts, hopSizes(i)) - val hopIr = hopMaps(i).computeIfAbsent(hopStart, javaBuildHop) - rowAggregator.update(hopIr, row) - } - } - hopMaps - } -} - -object HopsAggregator { - // [IR1, IR2, IR3,.... IRN, ts_millis_long] - // hops have timestamps attached to the end - type HopIr = Array[Any] - type OutputArrayType = Array[Array[HopIr]] - type IrMapType = Array[java.util.HashMap[Long, HopIr]] - -} diff --git a/test/log.py b/test/log.py deleted file mode 100644 index f2da8101e..000000000 --- a/test/log.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -import re - -def process_file(file_path): - with open(file_path, 'r') as file: - lines = file.readlines() - - # Regex to match class or object (excluding case classes) definitions - object_or_class_regex = re.compile(r'\b(? 
0 or not lines[i].strip().endswith('}')): - if '{' in lines[i]: - brace_count += lines[i].count('{') - if '}' in lines[i]: - brace_count -= lines[i].count('}') - i += 1 - end_index = i - - # Check if this block contains println - block_lines = lines[start_index:end_index] - if any(println_regex.search(line) for line in block_lines): - need_import = True - # Find the opening brace and insert logger after all header lines - brace_index = next((j for j, line in enumerate(block_lines) if '{' in line), len(block_lines) - 1) - block_lines.insert(brace_index + 1, logger_instance) - # Replace println within this block - block_lines = [line.replace('println', 'logger.info') for line in block_lines] - lines[start_index:end_index] = block_lines - else: - i += 1 - - # Add import statement if needed - if need_import: - import_index = next((i for i, line in enumerate(lines) if line.startswith('import')), 0) - if not any(line.startswith('import org.slf4j.LoggerFactory') for line in lines): - lines.insert(import_index, import_statement) - - # Write the updated content - updated_content = ''.join(lines) - with open(file_path, 'w') as file: - file.write(updated_content) - -def search_and_replace(directory): - for root, dirs, files in os.walk(directory): - for file in files: - if file.endswith('.scala'): - process_file(os.path.join(root, file)) - -# Run from the current directory -current_directory = os.getcwd() -search_and_replace(current_directory) - -print("Replacement complete.") From b7093e4878502cc8891f51cbed21f1a767804574 Mon Sep 17 00:00:00 2001 From: Varant Zanoyan Date: Tue, 5 Dec 2023 15:29:09 -0800 Subject: [PATCH 3/7] WIP --- .../aggregator/base/TimedAggregators.scala | 9 +--- .../aggregator/windowing/HopsAggregator.scala | 2 - .../aggregator/test/ApproxDistinctTest.scala | 6 +-- .../aggregator/test/RowAggregatorTest.scala | 6 +-- .../test/SawtoothAggregatorTest.scala | 2 - .../scala/ai/chronon/api/Extensions.scala | 27 ---------- .../ai/chronon/api/ThriftJsonCodec.scala | 1 - .../ai/chronon/flink/AsyncKVStoreWriter.scala | 3 -- .../chronon/flink/SparkExpressionEvalFn.scala | 1 - log.py | 53 ------------------- .../main/scala/ai/chronon/online/Api.scala | 14 ----- .../ai/chronon/online/DataStreamBuilder.scala | 3 -- .../scala/ai/chronon/online/Fetcher.scala | 14 +---- .../ai/chronon/online/MetadataStore.scala | 1 - .../scala/ai/chronon/spark/Analyzer.scala | 5 +- .../ai/chronon/spark/BootstrapInfo.scala | 3 -- .../main/scala/ai/chronon/spark/Driver.scala | 42 --------------- .../scala/ai/chronon/spark/Extensions.scala | 8 --- .../scala/ai/chronon/spark/FastHashing.scala | 1 - .../main/scala/ai/chronon/spark/Join.scala | 2 - .../main/scala/ai/chronon/spark/KvRdd.scala | 1 - .../ai/chronon/spark/LogFlattenerJob.scala | 1 - .../scala/ai/chronon/spark/TableUtils.scala | 1 - .../spark/streaming/JoinSourceRunner.scala | 2 - .../scala/ai/chronon/spark/test/MockApi.scala | 11 +--- 25 files changed, 9 insertions(+), 210 deletions(-) delete mode 100644 log.py diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala b/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala index 1bf360b44..e4ef7dda4 100644 --- a/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala +++ b/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala @@ -16,14 +16,12 @@ package ai.chronon.aggregator.base -import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.TimeTuple.typ import ai.chronon.api._ import java.util object 
TimeTuple extends Ordering[util.ArrayList[Any]] { - private val logger = LoggerFactory.getLogger(getClass) type typ = util.ArrayList[Any] def `type`(inputType: DataType): DataType = @@ -55,7 +53,6 @@ object TimeTuple extends Ordering[util.ArrayList[Any]] { } abstract class TimeOrdered(inputType: DataType) extends TimedAggregator[Any, TimeTuple.typ, Any] { - private val logger = LoggerFactory.getLogger(getClass) override def outputType: DataType = inputType override def irType: DataType = TimeTuple.`type`(inputType) @@ -75,7 +72,6 @@ abstract class TimeOrdered(inputType: DataType) extends TimedAggregator[Any, Tim } class First(inputType: DataType) extends TimeOrdered(inputType) { - private val logger = LoggerFactory.getLogger(getClass) //mutating override def update( ir: util.ArrayList[Any], @@ -96,7 +92,6 @@ class First(inputType: DataType) extends TimeOrdered(inputType) { } class Last(inputType: DataType) extends TimeOrdered(inputType) { - private val logger = LoggerFactory.getLogger(getClass) //mutating override def update( ir: util.ArrayList[Any], @@ -124,7 +119,6 @@ class OrderByLimitTimed( limit: Int, ordering: Ordering[TimeTuple.typ] ) extends TimedAggregator[Any, util.ArrayList[TimeTuple.typ], util.ArrayList[Any]] { - private val logger = LoggerFactory.getLogger(getClass) type Container = util.ArrayList[TimeTuple.typ] private val minHeap = new MinHeap[TimeTuple.typ](limit, ordering) @@ -135,7 +129,7 @@ class OrderByLimitTimed( override final def prepare(input: Any, ts: Long): Container = { // val gson = new Gson() val tuple = TimeTuple.make(ts, input) -// logger.info(s"init: ${gson.toJson(tuple)}") +// println(s"init: ${gson.toJson(tuple)}") val arr = new Container() arr.add(tuple) arr @@ -151,7 +145,6 @@ class OrderByLimitTimed( minHeap.merge(state1, state2) override def finalize(state: Container): util.ArrayList[Any] = { - private val logger = LoggerFactory.getLogger(getClass) val sorted = minHeap.sort(state) val result = new util.ArrayList[Any](state.size()) val it = sorted.iterator diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala index bc66cff36..627f5cdaa 100644 --- a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala +++ b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala @@ -32,7 +32,6 @@ import java.util // t class HopsAggregatorBase(aggregations: Seq[Aggregation], inputSchema: Seq[(String, DataType)], resolution: Resolution) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) @transient lazy val rowAggregator = new RowAggregator(inputSchema, aggregations.flatMap(_.unWindowed)) @@ -157,7 +156,6 @@ class HopsAggregator(minQueryTs: Long, } object HopsAggregator { - private val logger = LoggerFactory.getLogger(getClass) // [IR1, IR2, IR3,.... 
IRN, ts_millis_long] // hops have timestamps attached to the end type HopIr = Array[Any] diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala index 4b82e5eb2..2416a894f 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala @@ -16,13 +16,11 @@ package ai.chronon.aggregator.test -import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.ApproxDistinctCount import junit.framework.TestCase import org.junit.Assert._ class ApproxDistinctTest extends TestCase { - private val logger = LoggerFactory.getLogger(getClass) def testErrorBound(uniques: Int, errorBound: Int, lgK: Int): Unit = { val uniqueElems = 1 to uniques val duplicates = uniqueElems ++ uniqueElems ++ uniqueElems @@ -30,7 +28,7 @@ class ApproxDistinctTest extends TestCase { val ir = counter.prepare(duplicates.head) duplicates.tail.foreach { elem => counter.update(ir, elem) } val estimated = counter.finalize(ir) - // logger.info(s"estimated - $estimated, actual - $uniques, bound - $errorBound") + // println(s"estimated - $estimated, actual - $uniques, bound - $errorBound") assertTrue(Math.abs(estimated - uniques) < errorBound) } @@ -48,7 +46,7 @@ class ApproxDistinctTest extends TestCase { } val ir = irList.reduceLeft(counter.merge) val estimated = counter.finalize(ir) - // logger.info(s"estimated - $estimated, actual - $uniques, bound - $errorBound") + // println(s"estimated - $estimated, actual - $uniques, bound - $errorBound") assertTrue(Math.abs(estimated - uniques) < errorBound) } diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala index 853f85074..58c25ce6a 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala @@ -16,7 +16,6 @@ package ai.chronon.aggregator.test -import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.RowAggregator import ai.chronon.api._ import junit.framework.TestCase @@ -26,7 +25,6 @@ import java.util import scala.collection.JavaConverters._ class TestRow(val fieldsSeq: Any*)(tsIndex: Int = 0) extends Row { - private val logger = LoggerFactory.getLogger(getClass) val fields: util.List[Any] = new java.util.ArrayList[Any](fieldsSeq.asJava) override val length: Int = fields.size() @@ -41,18 +39,16 @@ class TestRow(val fieldsSeq: Any*)(tsIndex: Int = 0) extends Row { override def mutationTs: Long = timeStamp - def print(): Unit = logger.info(fieldsSeq) + def print(): Unit = println(fieldsSeq) def set(index: Int, any: Any): Unit = fields.set(index, any) } object TestRow { - private val logger = LoggerFactory.getLogger(getClass) def apply(inputsArray: Any*): TestRow = new TestRow(inputsArray: _*)() } class RowAggregatorTest extends TestCase { - private val logger = LoggerFactory.getLogger(getClass) def testUpdate(): Unit = { val rows = List( TestRow(1L, 4, 5.0f, "A", Seq(5, 3, 4), Seq("D", "A", "B", "A"), Map("A" -> 1, "B" -> 2)), diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala index 47e83c7be..f41fa0a78 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala +++ 
b/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala @@ -46,7 +46,6 @@ class Timer { } class SawtoothAggregatorTest extends TestCase { - private val logger = LoggerFactory.getLogger(getClass) def testTailAccuracy(): Unit = { val timer = new Timer @@ -178,7 +177,6 @@ class SawtoothAggregatorTest extends TestCase { } object SawtoothAggregatorTest { - private val logger = LoggerFactory.getLogger(getClass) // the result is irs in sorted order of queries // with head real-time accuracy and tail hop accuracy // NOTE: This provides a sketch for a distributed topology diff --git a/api/src/main/scala/ai/chronon/api/Extensions.scala b/api/src/main/scala/ai/chronon/api/Extensions.scala index 9884475d1..895d41a44 100644 --- a/api/src/main/scala/ai/chronon/api/Extensions.scala +++ b/api/src/main/scala/ai/chronon/api/Extensions.scala @@ -32,10 +32,8 @@ import scala.util.ScalaJavaConversions.{IteratorOps, ListOps, MapOps} import scala.util.{Failure, Success, Try} object Extensions { - private val logger = LoggerFactory.getLogger(getClass) implicit class TimeUnitOps(timeUnit: TimeUnit) { - private val logger = LoggerFactory.getLogger(getClass) def str: String = timeUnit match { case TimeUnit.HOURS => "h" @@ -50,7 +48,6 @@ object Extensions { } implicit class OperationOps(operation: Operation) { - private val logger = LoggerFactory.getLogger(getClass) def isSimple: Boolean = operation match { case Operation.FIRST | Operation.LAST | Operation.LAST_K | Operation.FIRST_K => false @@ -62,7 +59,6 @@ object Extensions { } implicit class WindowOps(window: Window) { - private val logger = LoggerFactory.getLogger(getClass) private def unbounded: Boolean = window.length == Int.MaxValue || window.length <= 0 def str: String = @@ -75,7 +71,6 @@ object Extensions { } object WindowUtils { - private val logger = LoggerFactory.getLogger(getClass) val Unbounded: Window = new Window(Int.MaxValue, TimeUnit.DAYS) val Hour: Window = new Window(1, TimeUnit.HOURS) val Day: Window = new Window(1, TimeUnit.DAYS) @@ -99,7 +94,6 @@ object Extensions { } implicit class MetadataOps(metaData: MetaData) { - private val logger = LoggerFactory.getLogger(getClass) def cleanName: String = metaData.name.sanitize def outputTable = s"${metaData.outputNamespace}.${metaData.cleanName}" @@ -157,7 +151,6 @@ object Extensions { // one per output column - so single window // not exposed to users implicit class AggregationPartOps(aggregationPart: AggregationPart) { - private val logger = LoggerFactory.getLogger(getClass) def getInt(arg: String, default: Option[Int] = None): Int = { val argOpt = Option(aggregationPart.argMap) @@ -185,7 +178,6 @@ object Extensions { } implicit class AggregationOps(aggregation: Aggregation) { - private val logger = LoggerFactory.getLogger(getClass) // one agg part per bucket per window // unspecified windows are translated to one unbounded window @@ -241,9 +233,6 @@ object Extensions { case class UnpackedAggregations(perBucket: Array[AggregationPart], perWindow: Array[WindowMapping]) object UnpackedAggregations { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) def from(aggregations: Seq[Aggregation]): UnpackedAggregations = { var counter = 0 val perBucket = new mutable.ArrayBuffer[AggregationPart] @@ -289,7 +278,6 @@ object Extensions { } implicit class AggregationsOps(aggregations: Seq[Aggregation]) { - private val logger = LoggerFactory.getLogger(getClass) def 
hasTimedAggregations: Boolean = aggregations.exists(_.operation match { case LAST_K | FIRST_K | LAST | FIRST => true @@ -313,7 +301,6 @@ object Extensions { } implicit class SourceOps(source: Source) { - private val logger = LoggerFactory.getLogger(getClass) def dataModel: DataModel = { assert(source.isSetEntities || source.isSetEvents || source.isSetJoinSource, "Source type is not specified") if (source.isSetEntities) Entities @@ -426,7 +413,6 @@ object Extensions { } implicit class GroupByOps(groupBy: GroupBy) extends GroupBy(groupBy) { - private val logger = LoggerFactory.getLogger(getClass) def maxWindow: Option[Window] = { val allWindowsOpt = Option(groupBy.aggregations) .flatMap(_.toScala.toSeq.allWindowsOpt) @@ -547,7 +533,6 @@ object Extensions { case class QueryParts(selects: Option[Seq[String]], wheres: Seq[String]) def buildQueryParts(query: Query): QueryParts = { - private val logger = LoggerFactory.getLogger(getClass) val selects = query.getQuerySelects val timeColumn = Option(query.timeColumn).getOrElse(Constants.TimeColumn) @@ -628,14 +613,12 @@ object Extensions { } implicit class StringOps(string: String) { - private val logger = LoggerFactory.getLogger(getClass) def sanitize: String = Option(string).map(_.replaceAll("[^a-zA-Z0-9_]", "_")).orNull def cleanSpec: String = string.split("/").head } implicit class ExternalSourceOps(externalSource: ExternalSource) extends ExternalSource(externalSource) { - private val logger = LoggerFactory.getLogger(getClass) private def schemaNames(schema: TDataType): Array[String] = schemaFields(schema).map(_.name) private def schemaFields(schema: TDataType): Array[StructField] = @@ -652,7 +635,6 @@ object Extensions { } object KeyMappingHelper { - private val logger = LoggerFactory.getLogger(getClass) // key mapping is defined as {left_col1: right_col1}, on the right there can be two keys [right_col1, right_col2] // Left is implicitly assumed to have right_col2 // We need to convert a map {left_col1: a, right_col2: b, irrelevant_col: c} into {right_col1: a, right_col2: b} @@ -668,7 +650,6 @@ object Extensions { } implicit class ExternalPartOps(externalPart: ExternalPart) extends ExternalPart(externalPart) { - private val logger = LoggerFactory.getLogger(getClass) lazy val fullName: String = Constants.ExternalPrefix + "_" + Option(externalPart.prefix).map(_ + "_").getOrElse("") + @@ -710,7 +691,6 @@ object Extensions { } implicit class JoinPartOps(joinPart: JoinPart) extends JoinPart(joinPart) { - private val logger = LoggerFactory.getLogger(getClass) lazy val fullPrefix = (Option(prefix) ++ Some(groupBy.getMetaData.cleanName)).mkString("_") lazy val leftToRight: Map[String, String] = rightToLeft.map { case (key, value) => value -> key } @@ -740,7 +720,6 @@ object Extensions { } implicit class LabelPartOps(val labelPart: LabelPart) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) def leftKeyCols: Array[String] = { labelPart.labels.toScala .flatMap { @@ -769,7 +748,6 @@ object Extensions { } implicit class BootstrapPartOps(val bootstrapPart: BootstrapPart) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) /** * Compress the info such that the hash can be stored at record and @@ -801,7 +779,6 @@ object Extensions { } object JoinOps { - private val logger = LoggerFactory.getLogger(getClass) private val identifierRegex: Pattern = Pattern.compile("[a-zA-Z_][a-zA-Z0-9_]*") def isIdentifier(s: String): Boolean = identifierRegex.matcher(s).matches() } @@ -1010,7 +987,6 @@ object Extensions 
{ } implicit class StringsOps(strs: Iterable[String]) { - private val logger = LoggerFactory.getLogger(getClass) def pretty: String = { if (strs.nonEmpty) "\n " + strs.mkString(",\n ") + "\n" @@ -1022,7 +998,6 @@ object Extensions { } implicit class QueryOps(query: Query) { - private val logger = LoggerFactory.getLogger(getClass) def setupsSeq: Seq[String] = { Option(query.setups) .map( @@ -1035,7 +1010,6 @@ object Extensions { } implicit class ThrowableOps(throwable: Throwable) { - private val logger = LoggerFactory.getLogger(getClass) def traceString: String = { val sw = new StringWriter() val pw = new PrintWriter(sw) @@ -1045,7 +1019,6 @@ object Extensions { } implicit class DerivationOps(derivations: List[Derivation]) { - private val logger = LoggerFactory.getLogger(getClass) lazy val derivationsContainStar: Boolean = derivations.iterator.exists(_.name == "*") lazy val derivationsWithoutStar: List[Derivation] = derivations.filterNot(_.name == "*") lazy val areDerivationsRenameOnly: Boolean = derivationsWithoutStar.forall(d => JoinOps.isIdentifier(d.expression)) diff --git a/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala b/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala index 0942d1422..a13f7ab3d 100644 --- a/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala +++ b/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala @@ -90,7 +90,6 @@ object ThriftJsonCodec { } def fromJsonFile[T <: TBase[_, _]: Manifest: ClassTag](fileName: String, check: Boolean): T = { - private val logger = LoggerFactory.getLogger(getClass) val src = fromFile(fileName) val jsonStr = try src.mkString diff --git a/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala b/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala index 81e82142b..936b484b1 100644 --- a/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala +++ b/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala @@ -17,8 +17,6 @@ import scala.util.{Failure, Success} case class WriteResponse(putRequest: PutRequest, status: Boolean) object AsyncKVStoreWriter { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) private val kvStoreConcurrency = 10 private val defaultTimeoutMillis = 1000L @@ -48,7 +46,6 @@ object AsyncKVStoreWriter { * This was moved to flink-rpc-akka in Flink 1.16 and made private, so we reproduce the direct execution context here */ private class DirectExecutionContext extends ExecutionContext { - private val logger = LoggerFactory.getLogger(getClass) override def execute(runnable: Runnable): Unit = runnable.run() diff --git a/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala b/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala index 439eb91af..6c5a32dcf 100644 --- a/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala +++ b/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala @@ -29,7 +29,6 @@ import scala.jdk.CollectionConverters.{asScalaBufferConverter, mapAsScalaMapConv */ class SparkExpressionEvalFn[T](encoder: Encoder[T], groupBy: GroupBy) extends RichFlatMapFunction[T, Map[String, Any]] { private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) private val query: Query = groupBy.streamingSource.get.getEvents.query diff --git a/log.py b/log.py deleted file mode 100644 index 37226b62c..000000000 --- a/log.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -import re - -def process_file(file_path): - with 
open(file_path, 'r') as file: - lines = file.readlines() - - # Check if the file contains println - contains_println = any('println' in line for line in lines) - if not contains_println: - return # No need to modify the file - - # Prepare the import statement and logger instance - import_statement = 'import org.slf4j.LoggerFactory\n' - logger_instance = ' private val logger = LoggerFactory.getLogger(getClass)\n' - - # Determine where to insert the import - import_index = next((i for i, line in enumerate(lines) if line.startswith('import')), 0) - if import_index != 0 or not any(line.startswith('import org.slf4j.LoggerFactory') for line in lines): - lines.insert(import_index, import_statement) - import_index += 1 - - # Regex to match class or object definitions - object_or_class_regex = re.compile(r'\b(object|class)\s+\w+') - - # Insert logger instance after the opening brace of each class or object - for i in range(len(lines)): - if object_or_class_regex.search(lines[i]): - # Find the opening brace - brace_index = i - while brace_index < len(lines) and '{' not in lines[brace_index]: - brace_index += 1 - if brace_index < len(lines): - lines.insert(brace_index + 1, logger_instance) - - # Replace println with logger.info - updated_content = ''.join(lines).replace('println', 'logger.info') - - # Write the updated content - with open(file_path, 'w') as file: - file.write(updated_content) - -def search_and_replace(directory): - for root, dirs, files in os.walk(directory): - for file in files: - if file.endswith('.scala'): - process_file(os.path.join(root, file)) - -# Run from the current directory -current_directory = os.getcwd() -search_and_replace(current_directory) - -print("Replacement complete.") diff --git a/online/src/main/scala/ai/chronon/online/Api.scala b/online/src/main/scala/ai/chronon/online/Api.scala index 2258832e0..af58081d3 100644 --- a/online/src/main/scala/ai/chronon/online/Api.scala +++ b/online/src/main/scala/ai/chronon/online/Api.scala @@ -28,15 +28,11 @@ import scala.concurrent.{Await, ExecutionContext, Future} import scala.util.{Failure, Success, Try} object KVStore { - private val logger = LoggerFactory.getLogger(getClass) // a scan request essentially for the keyBytes // afterTsMillis - is used to limit the scan to more recent data case class GetRequest(keyBytes: Array[Byte], dataset: String, afterTsMillis: Option[Long] = None) case class TimedValue(bytes: Array[Byte], millis: Long) case class GetResponse(request: GetRequest, values: Try[Seq[TimedValue]]) { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) def latest: Try[TimedValue] = values.map(_.maxBy(_.millis)) } case class PutRequest(keyBytes: Array[Byte], valueBytes: Array[Byte], dataset: String, tsMillis: Option[Long] = None) @@ -129,10 +125,6 @@ case class LoggableResponseBase64(keyBase64: String, schemaHash: String) abstract class StreamDecoder extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) def decode(bytes: Array[Byte]): Mutation def schema: StructType } @@ -142,7 +134,6 @@ trait StreamBuilder { } object ExternalSourceHandler { - private val logger = LoggerFactory.getLogger(getClass) private[ExternalSourceHandler] val executor = FlexibleExecutionContext.buildExecutionContext } @@ -151,8 
+142,6 @@ object ExternalSourceHandler { // There is a Java Friendly Handler that extends this and handles conversions // see: [[ai.chronon.online.JavaExternalSourceHandler]] abstract class ExternalSourceHandler extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) implicit lazy val executionContext: ExecutionContext = ExternalSourceHandler.executor def fetch(requests: Seq[Fetcher.Request]): Future[Seq[Fetcher.Response]] } @@ -160,9 +149,6 @@ abstract class ExternalSourceHandler extends Serializable { // the implementer of this class should take a single argument, a scala map of string to string // chronon framework will construct this object with user conf supplied via CLI abstract class Api(userConf: Map[String, String]) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) lazy val fetcher: Fetcher = { if (fetcherObj == null) fetcherObj = buildFetcher() diff --git a/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala b/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala index cc73317eb..597681d06 100644 --- a/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala +++ b/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala @@ -28,8 +28,6 @@ import scala.util.{Failure, Success, Try} case class TopicInfo(name: String, topicType: String, params: Map[String, String]) object TopicInfo { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) // default topic type is kafka // kafka://topic_name/schema=my_schema/host=X/port=Y should parse into TopicInfo(topic_name, kafka, {schema: my_schema, host: X, port Y}) def parse(topic: String): TopicInfo = { @@ -75,7 +73,6 @@ case class DataStream(df: DataFrame, partitions: Int, topicInfo: TopicInfo) { case DataModel.Events => Map.empty }) val selectsOption: Option[Map[String, String]] = for { - private val logger = LoggerFactory.getLogger(getClass) selectMap <- Option(query.selects).map(_.toScala.toMap) keyMap = Option(keys).map(_.map(k => k -> k).toMap).getOrElse(Map.empty) } yield (keyMap ++ selectMap ++ timeSelects) diff --git a/online/src/main/scala/ai/chronon/online/Fetcher.scala b/online/src/main/scala/ai/chronon/online/Fetcher.scala index 38fa4093b..3022761c3 100644 --- a/online/src/main/scala/ai/chronon/online/Fetcher.scala +++ b/online/src/main/scala/ai/chronon/online/Fetcher.scala @@ -36,7 +36,6 @@ import scala.concurrent.Future import scala.util.{Failure, Success, Try} object Fetcher { - private val logger = LoggerFactory.getLogger(getClass) case class Request(name: String, keys: Map[String, AnyRef], atMillis: Option[Long] = None, @@ -50,13 +49,6 @@ object Fetcher { case class ResponseWithContext(request: Request, derivedValues: Map[String, AnyRef], baseValues: Map[String, AnyRef]) { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) def combinedValues: Map[String, AnyRef] = baseValues ++ derivedValues } case class ColumnSpec(groupByName: String, @@ -65,7 +57,6 @@ object Fetcher { 
keyMapping: Option[Map[String, AnyRef]]) def logResponseStats(response: Response, context: Metrics.Context): Unit = { - private val logger = LoggerFactory.getLogger(getClass) val responseMap = response.values.get var exceptions = 0 var nulls = 0 @@ -91,7 +82,6 @@ class Fetcher(val kvStore: KVStore, val externalSourceRegistry: ExternalSourceRegistry = null) extends FetcherBase(kvStore, metaDataSet, timeoutMillis, debug) { private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) def buildJoinCodec(joinConf: api.Join): JoinCodec = { val keyFields = new mutable.LinkedHashSet[StructField] @@ -173,8 +163,8 @@ class Fetcher(val kvStore: KVStore, internalResponses.zip(externalResponses).map { case (internalResponse, externalResponse) => if (debug) { - logger.info(internalResponse.values.get.keys.toSeq) - logger.info(externalResponse.values.get.keys.toSeq) + logger.info(internalResponse.values.get.keys.toSeq.mkString(",")) + logger.info(externalResponse.values.get.keys.toSeq.mkString(",")) } val cleanInternalRequest = internalResponse.request.copy(context = None) assert( diff --git a/online/src/main/scala/ai/chronon/online/MetadataStore.scala b/online/src/main/scala/ai/chronon/online/MetadataStore.scala index 5eb03bc6e..6e016133a 100644 --- a/online/src/main/scala/ai/chronon/online/MetadataStore.scala +++ b/online/src/main/scala/ai/chronon/online/MetadataStore.scala @@ -37,7 +37,6 @@ import scala.util.{Failure, Success, Try} case class DataMetrics(series: Seq[(Long, SortedMap[String, Any])]) class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, timeoutMillis: Long) { - private val logger = LoggerFactory.getLogger(getClass) private val logger = LoggerFactory.getLogger(getClass) private var partitionSpec = PartitionSpec(format = "yyyy-MM-dd", spanMillis = WindowUtils.Day.millis) private val CONF_BATCH_SIZE = 50 diff --git a/spark/src/main/scala/ai/chronon/spark/Analyzer.scala b/spark/src/main/scala/ai/chronon/spark/Analyzer.scala index 1cc9c1c42..42d7e2d00 100644 --- a/spark/src/main/scala/ai/chronon/spark/Analyzer.scala +++ b/spark/src/main/scala/ai/chronon/spark/Analyzer.scala @@ -1,7 +1,7 @@ /* * Copyright (C) 2023 The Chronon Authors. * - * Licensed under the Apache License, Version 2.0 (the "License"); + * `L`icensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * @@ -37,10 +37,8 @@ import scala.util.ScalaJavaConversions.ListOps //@SerialVersionUID(3457890987L) //class ItemSketchSerializable(var mapSize: Int) extends ItemsSketch[String](mapSize) with Serializable {} - private val logger = LoggerFactory.getLogger(getClass) class ItemSketchSerializable extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) var sketch: ItemsSketch[String] = null def init(mapSize: Int): ItemSketchSerializable = { sketch = new ItemsSketch[String](mapSize) @@ -160,7 +158,6 @@ class Analyzer(tableUtils: TableUtils, window: String = null, inputColumn: String = null, groupByName: String = null) { - private val logger = LoggerFactory.getLogger(getClass) def asMap: Map[String, String] = { Map( diff --git a/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala b/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala index f9dcf19d7..4a4301b3f 100644 --- a/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala +++ b/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala @@ -53,9 +53,6 @@ case class BootstrapInfo( derivations: Array[StructField], hashToSchema: Map[String, Array[StructField]] ) { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) lazy val fieldNames: Set[String] = fields.map(_.name).toSet diff --git a/spark/src/main/scala/ai/chronon/spark/Driver.scala b/spark/src/main/scala/ai/chronon/spark/Driver.scala index ab9282666..a403b2b7f 100644 --- a/spark/src/main/scala/ai/chronon/spark/Driver.scala +++ b/spark/src/main/scala/ai/chronon/spark/Driver.scala @@ -51,7 +51,6 @@ import scala.util.{Failure, Success, Try} // useful to override spark.sql.extensions args - there is no good way to unset that conf apparently // so we give it dummy extensions class DummyExtensions extends (SparkSessionExtensions => Unit) { - private val logger = LoggerFactory.getLogger(getClass) override def apply(extensions: SparkSessionExtensions): Unit = {} } @@ -221,7 +220,6 @@ object Driver { with OfflineSubcommand with LocalExportTableAbility with ResultValidationAbility { - private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. Default is 30 days", @@ -269,7 +267,6 @@ object Driver { with OfflineSubcommand with LocalExportTableAbility with ResultValidationAbility { - private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. Default is 30 days", @@ -299,9 +296,7 @@ object Driver { } object LabelJoin { - private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("label-join") with OfflineSubcommand with LocalExportTableAbility { - private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs label join in steps, step-days at a time. 
Default is 30 days", @@ -326,9 +321,7 @@ object Driver { } object Analyzer { - private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("analyze") with OfflineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) val startDate: ScallopOption[String] = opt[String](required = false, descr = "Finds heavy hitters & time-distributions until a specified start date", @@ -368,9 +361,7 @@ object Driver { } object MetadataExport { - private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("metadata-export") with OfflineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) val inputRootPath: ScallopOption[String] = opt[String](required = true, descr = "Base path of config repo to export from") val outputRootPath: ScallopOption[String] = @@ -384,9 +375,7 @@ object Driver { } object StagingQueryBackfill { - private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("staging-query-backfill") with OfflineSubcommand with LocalExportTableAbility { - private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. Default is 30 days", @@ -415,9 +404,7 @@ object Driver { } object DailyStats { - private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("stats-summary") with OfflineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. Default is 30 days", @@ -437,9 +424,7 @@ object Driver { } object LogStats { - private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("log-summary") with OfflineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. 
Default is 30 days", @@ -458,9 +443,7 @@ object Driver { } object GroupByUploader { - private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("group-by-upload") with OfflineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) override def subcommandName() = "group-by-upload" } @@ -470,9 +453,7 @@ object Driver { } object ConsistencyMetricsCompute { - private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("consistency-metrics-compute") with OfflineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) override def subcommandName() = "consistency-metrics-compute" } @@ -487,9 +468,7 @@ object Driver { } object CompareJoinQuery { - private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("compare-join-query") with OfflineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) val queryConf: ScallopOption[String] = opt[String](required = true, descr = "Conf to the Staging Query to compare with") val startDate: ScallopOption[String] = @@ -527,7 +506,6 @@ object Driver { // hashmap implements serializable def serializableProps: Map[String, String] = { - private val logger = LoggerFactory.getLogger(getClass) val map = new mutable.HashMap[String, String]() propsInner.foreach { case (key, value) => map.update(key, value) } map.toMap @@ -550,7 +528,6 @@ object Driver { private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("fetch") with OnlineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) val keyJson: ScallopOption[String] = opt[String](required = false, descr = "json of the keys to fetch") val name: ScallopOption[String] = opt[String](required = true, descr = "name of the join/group-by to fetch") val `type`: ScallopOption[String] = @@ -669,7 +646,6 @@ object Driver { object MetadataUploader { private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("metadata-upload") with OnlineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) val confPath: ScallopOption[String] = opt[String](required = true, descr = "Path to the Chronon config file or directory") } @@ -683,9 +659,7 @@ object Driver { } object LogFlattener { - private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("log-flattener") with OfflineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) val logTable: ScallopOption[String] = opt[String](required = true, descr = "Hive table with partitioned raw logs") @@ -817,7 +791,6 @@ object Driver { } class Args(args: Array[String]) extends ScallopConf(args) { - private val logger = LoggerFactory.getLogger(getClass) object JoinBackFillArgs extends JoinBackfill.Args addSubcommand(JoinBackFillArgs) object LogFlattenerArgs extends LogFlattener.Args @@ -853,21 +826,6 @@ object Driver { } def onlineBuilder(userConf: Map[String, String], onlineJar: String, onlineClass: String): Api = { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = 
LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) val urls = Array(new File(onlineJar).toURI.toURL) val cl = ScalaClassLoader.fromURLs(urls, this.getClass.getClassLoader) val cls = cl.loadClass(onlineClass) diff --git a/spark/src/main/scala/ai/chronon/spark/Extensions.scala b/spark/src/main/scala/ai/chronon/spark/Extensions.scala index 1e2a02c2a..3e3e809df 100644 --- a/spark/src/main/scala/ai/chronon/spark/Extensions.scala +++ b/spark/src/main/scala/ai/chronon/spark/Extensions.scala @@ -33,10 +33,8 @@ import scala.collection.Seq import scala.reflect.ClassTag object Extensions { - private val logger = LoggerFactory.getLogger(getClass) implicit class StructTypeOps(schema: StructType) { - private val logger = LoggerFactory.getLogger(getClass) def pretty: String = { val schemaTuples = schema.fields.map { field => field.dataType.simpleString -> field.name @@ -60,9 +58,6 @@ object Extensions { case class DfStats(count: Long, partitionRange: PartitionRange) // helper class to maintain datafram stats that are necessary for downstream operations case class DfWithStats(df: DataFrame, partitionCounts: Map[String, Long])(implicit val tableUtils: TableUtils) { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) private val minPartition: String = partitionCounts.keys.min private val maxPartition: String = partitionCounts.keys.max val partitionRange: PartitionRange = PartitionRange(minPartition, maxPartition) @@ -77,7 +72,6 @@ object Extensions { } object DfWithStats { - private val logger = LoggerFactory.getLogger(getClass) def apply(dataFrame: DataFrame)(implicit tableUtils: TableUtils): DfWithStats = { val partitionCounts = dataFrame .groupBy(col(TableUtils(dataFrame.sparkSession).partitionColumn)) @@ -286,7 +280,6 @@ object Extensions { } implicit class ArrayOps[T: ClassTag](arr: Array[T]) { - private val logger = LoggerFactory.getLogger(getClass) def uniqSort(ordering: Ordering[T]): Array[T] = { val tree = new util.TreeSet[T](ordering) for (i <- arr.indices) { @@ -304,7 +297,6 @@ object Extensions { } implicit class InternalRowOps(internalRow: InternalRow) { - private val logger = LoggerFactory.getLogger(getClass) def toRow(schema: StructType): Row = { new Row() { override def length: Int = { diff --git a/spark/src/main/scala/ai/chronon/spark/FastHashing.scala b/spark/src/main/scala/ai/chronon/spark/FastHashing.scala index 1c32b44e2..d25a23633 100644 --- a/spark/src/main/scala/ai/chronon/spark/FastHashing.scala +++ b/spark/src/main/scala/ai/chronon/spark/FastHashing.scala @@ -27,7 +27,6 @@ import java.nio.charset.Charset // TODO: drop data and hashInt, iff we see OOMs on executors for small IRs and large keys // That is the only case where key size would be a problem case class KeyWithHash(data: Array[Any], hash: Array[Byte], hashInt: Int) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) // 16-byte hash from murmur_128 // P(one collision) ~ 10^-6 when key count ~ 2.6×10^16 // in-comparison with a 8-byte hash (long) diff --git a/spark/src/main/scala/ai/chronon/spark/Join.scala b/spark/src/main/scala/ai/chronon/spark/Join.scala index d6bfc3378..757b69d82 100644 --- 
a/spark/src/main/scala/ai/chronon/spark/Join.scala +++ b/spark/src/main/scala/ai/chronon/spark/Join.scala @@ -45,8 +45,6 @@ import scala.util.ScalaJavaConversions.{IterableOps, ListOps, MapOps} case class CoveringSet(hashes: Seq[String], rowCount: Long, isCovering: Boolean) object CoveringSet { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) def toFilterExpression(coveringSets: Seq[CoveringSet]): String = { val coveringSetHashExpression = "(" + coveringSets diff --git a/spark/src/main/scala/ai/chronon/spark/KvRdd.scala b/spark/src/main/scala/ai/chronon/spark/KvRdd.scala index 5c01e73a5..bbbe9d553 100644 --- a/spark/src/main/scala/ai/chronon/spark/KvRdd.scala +++ b/spark/src/main/scala/ai/chronon/spark/KvRdd.scala @@ -27,7 +27,6 @@ import org.apache.spark.sql.types.{BinaryType, LongType, StringType, StructField import org.apache.spark.sql.{DataFrame, Row, SparkSession} object GenericRowHandler { - private val logger = LoggerFactory.getLogger(getClass) val func: Any => Array[Any] = { case x: GenericRowWithSchema => { val result = new Array[Any](x.length) diff --git a/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala b/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala index 742945162..39f8bc8f8 100644 --- a/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala @@ -248,7 +248,6 @@ class LogFlattenerJob(session: SparkSession, } object LogFlattenerJob { - private val logger = LoggerFactory.getLogger(getClass) def readSchemaTableProperties(tableUtils: TableUtils, logTable: String): Map[String, String] = { val curTblProps = tableUtils.getTableProperties(logTable).getOrElse(Map.empty) diff --git a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala index bdea6e9a8..2bfe7418d 100644 --- a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala +++ b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala @@ -748,7 +748,6 @@ case class TableUtils(sparkSession: SparkSession) { } sealed case class IncompatibleSchemaException(inconsistencies: Seq[(String, DataType, DataType)]) extends Exception { - private val logger = LoggerFactory.getLogger(getClass) override def getMessage: String = { val inconsistenciesStr = inconsistencies.map(tuple => s"columnName: ${tuple._1} existingType: ${tuple._2} newType: ${tuple._3}") diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala b/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala index d6d23de47..fc058d7b0 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala @@ -42,7 +42,6 @@ import scala.util.ScalaJavaConversions.{IteratorOps, JIteratorOps, ListOps, MapO // micro batching destroys and re-creates these objects repeatedly through ForeachBatchWriter and MapFunction // this allows for re-use object LocalIOCache { - private val logger = LoggerFactory.getLogger(getClass) private var fetcher: Fetcher = null private var kvStore: KVStore = null def getOrSetFetcher(builderFunc: () => Fetcher): Fetcher = { @@ -76,7 +75,6 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map extends Serializable val valueZSchema: api.StructType = groupByConf.dataModel match { - private val logger = LoggerFactory.getLogger(getClass) case api.DataModel.Events => servingInfoProxy.valueChrononSchema 
case api.DataModel.Entities => servingInfoProxy.mutationValueChrononSchema } diff --git a/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala b/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala index b704ffe32..5745f35e4 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala @@ -16,7 +16,6 @@ package ai.chronon.spark.test -import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.{GroupByOps, SourceOps} import ai.chronon.api.{Constants, StructType} import ai.chronon.online.Fetcher.Response @@ -39,7 +38,6 @@ import scala.util.ScalaJavaConversions.{IteratorOps, JListOps, JMapOps} import scala.util.Success class MockDecoder(inputSchema: StructType) extends StreamDecoder { - private val logger = LoggerFactory.getLogger(getClass) private def byteArrayToAvro(avro: Array[Byte], schema: Schema): GenericRecord = { val reader = new SpecificDatumReader[GenericRecord](schema) @@ -67,10 +65,9 @@ class MockDecoder(inputSchema: StructType) extends StreamDecoder { } class MockStreamBuilder extends StreamBuilder { - private val logger = LoggerFactory.getLogger(getClass) override def from(topicInfo: TopicInfo)(implicit session: SparkSession, props: Map[String, String]): DataStream = { val tableUtils = TableUtils(session) - logger.info(s"""building stream from topic: ${topicInfo.name}""") + println(s"""building stream from topic: ${topicInfo.name}""") val ds = topicInfo.params("ds") val df = tableUtils.sql(s"select * from ${topicInfo.name} where ds >= '$ds'") val encodedDf = (new InMemoryStream).getContinuousStreamDF(session, df.drop("ds")) @@ -80,9 +77,7 @@ class MockStreamBuilder extends StreamBuilder { } class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { - private val logger = LoggerFactory.getLogger(getClass) class PlusOneExternalHandler extends ExternalSourceHandler { - private val logger = LoggerFactory.getLogger(getClass) override def fetch(requests: collection.Seq[Fetcher.Request]): Future[collection.Seq[Fetcher.Response]] = { Future( requests.map(req => @@ -92,7 +87,6 @@ class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { } class AlwaysFailsHandler extends JavaExternalSourceHandler { - private val logger = LoggerFactory.getLogger(getClass) override def fetchJava(requests: util.List[JavaRequest]): CompletableFuture[util.List[JavaResponse]] = { CompletableFuture.completedFuture[util.List[JavaResponse]]( requests @@ -112,7 +106,6 @@ class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { } class JavaPlusOneExternalHandler extends JavaExternalSourceHandler { - private val logger = LoggerFactory.getLogger(getClass) override def fetchJava(requests: util.List[JavaRequest]): CompletableFuture[util.List[JavaResponse]] = { CompletableFuture.completedFuture( requests @@ -139,7 +132,7 @@ class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { new ConcurrentLinkedQueue[LoggableResponseBase64] override def streamDecoder(parsedInfo: GroupByServingInfoParsed): StreamDecoder = { - logger.info( + println( s"decoding stream ${parsedInfo.groupBy.streamingSource.get.topic} with " + s"schema: ${SparkConversions.fromChrononSchema(parsedInfo.streamChrononSchema).catalogString}") new MockDecoder(parsedInfo.streamChrononSchema) From 931bfb324896da5302864d629f31a2432894363e Mon Sep 17 00:00:00 2001 From: Varant Zanoyan Date: Tue, 5 Dec 2023 16:14:04 -0800 Subject: [PATCH 4/7] WIP --- 
online/src/main/scala/ai/chronon/online/FetcherBase.scala | 2 +- spark/src/main/scala/ai/chronon/spark/GroupBy.scala | 6 ++---- spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/online/src/main/scala/ai/chronon/online/FetcherBase.scala b/online/src/main/scala/ai/chronon/online/FetcherBase.scala index b79d9de12..478ddb240 100644 --- a/online/src/main/scala/ai/chronon/online/FetcherBase.scala +++ b/online/src/main/scala/ai/chronon/online/FetcherBase.scala @@ -63,7 +63,7 @@ class FetcherBase(kvStore: KVStore, overallLatency: Long, context: Metrics.Context, totalResponseValueBytes: Int): Map[String, AnyRef] = { - private val logger = LoggerFactory.getLogger(getClass) + val logger = LoggerFactory.getLogger(getClass) val latestBatchValue = batchResponsesTry.map(_.maxBy(_.millis)) val servingInfo = latestBatchValue.map(timedVal => updateServingInfo(timedVal.millis, oldServingInfo)).getOrElse(oldServingInfo) diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala index 26e3d39f5..2869558e9 100644 --- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala +++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala @@ -44,7 +44,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation], skewFilter: Option[String] = None, finalize: Boolean = true) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + val logger = LoggerFactory.getLogger(getClass) protected[spark] val tsIndex: Int = inputDf.schema.fieldNames.indexOf(Constants.TimeColumn) protected val selectedSchema: Array[(String, api.DataType)] = SparkConversions.toChrononSchema(inputDf.schema) @@ -392,7 +392,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation], // TODO: truncate queryRange for caching object GroupBy { - private val logger = LoggerFactory.getLogger(getClass) + val logger = LoggerFactory.getLogger(getClass) // Need to use a case class here to allow null matching case class SourceDataProfile(earliestRequired: String, earliestPresent: String, latestAllowed: String) @@ -404,8 +404,6 @@ object GroupBy { tableUtils: TableUtils, computeDependency: Boolean = true, showDf: Boolean = false): api.GroupBy = { - private val logger = LoggerFactory.getLogger(getClass) - private val logger = LoggerFactory.getLogger(getClass) val result = groupByConf.deepCopy() val newSources: java.util.List[api.Source] = groupByConf.sources.toScala.map { source => if (source.isSetJoinSource) { diff --git a/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala b/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala index 56ac4fd7a..32f1ce719 100644 --- a/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala +++ b/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala @@ -35,7 +35,7 @@ import scala.util.ScalaJavaConversions.{ListOps, MapOps} import scala.util.Try class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + val logger = LoggerFactory.getLogger(getClass) implicit val sparkSession: SparkSession = groupBy.sparkSession implicit private val tableUtils: TableUtils = TableUtils(sparkSession) private def fromBase(rdd: RDD[(Array[Any], Array[Any])]): KvRdd = { @@ -105,7 +105,7 @@ class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable } object GroupByUpload { - private val logger = LoggerFactory.getLogger(getClass) + val logger = LoggerFactory.getLogger(getClass) // TODO - 
remove this if spark streaming can't reach hive tables private def buildServingInfo(groupByConf: api.GroupBy, From 8d7fbab5ead869f859976f49747a635c10a89725 Mon Sep 17 00:00:00 2001 From: Varant Zanoyan Date: Tue, 5 Dec 2023 16:15:27 -0800 Subject: [PATCH 5/7] WIP --- .../main/scala/ai/chronon/api/Extensions.scala | 2 +- .../main/scala/ai/chronon/spark/Analyzer.scala | 3 ++- .../main/scala/ai/chronon/spark/Driver.scala | 18 ++++++++++-------- .../scala/ai/chronon/spark/Extensions.scala | 2 +- .../main/scala/ai/chronon/spark/JoinBase.scala | 6 ++++-- .../scala/ai/chronon/spark/LabelJoin.scala | 3 ++- .../ai/chronon/spark/stats/CompareJob.scala | 3 ++- .../spark/streaming/JoinSourceRunner.scala | 2 +- .../chronon/spark/streaming/TopicChecker.scala | 2 +- 9 files changed, 24 insertions(+), 17 deletions(-) diff --git a/api/src/main/scala/ai/chronon/api/Extensions.scala b/api/src/main/scala/ai/chronon/api/Extensions.scala index 895d41a44..5e87af31e 100644 --- a/api/src/main/scala/ai/chronon/api/Extensions.scala +++ b/api/src/main/scala/ai/chronon/api/Extensions.scala @@ -784,7 +784,7 @@ object Extensions { } implicit class JoinOps(val join: Join) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) // all keys as they should appear in left that are being used on right def leftKeyCols: Array[String] = { join.joinParts.toScala diff --git a/spark/src/main/scala/ai/chronon/spark/Analyzer.scala b/spark/src/main/scala/ai/chronon/spark/Analyzer.scala index 42d7e2d00..22d66e353 100644 --- a/spark/src/main/scala/ai/chronon/spark/Analyzer.scala +++ b/spark/src/main/scala/ai/chronon/spark/Analyzer.scala @@ -336,7 +336,8 @@ class Analyzer(tableUtils: TableUtils, logger.info(s"----- Schema validation completed. Found ${keysWithError.size} errors") val keyErrorSet: Set[(String, String)] = keysWithError.toSet logger.info(keyErrorSet.map { case (key, errorMsg) => s"$key => $errorMsg" }.mkString("\n")) - logger.info(s"---- Table permission check completed. Found permission errors in ${noAccessTables.size} tables ----") + logger.info( + s"---- Table permission check completed. Found permission errors in ${noAccessTables.size} tables ----") logger.info(noAccessTables.mkString("\n")) logger.info(s"---- Data availability check completed. 
Found issue in ${dataAvailabilityErrors.size} tables ----") dataAvailabilityErrors.foreach(error => diff --git a/spark/src/main/scala/ai/chronon/spark/Driver.scala b/spark/src/main/scala/ai/chronon/spark/Driver.scala index a403b2b7f..c813ed7ca 100644 --- a/spark/src/main/scala/ai/chronon/spark/Driver.scala +++ b/spark/src/main/scala/ai/chronon/spark/Driver.scala @@ -214,7 +214,7 @@ object Driver { } object JoinBackfill { - private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("join") with OfflineSubcommand @@ -256,12 +256,13 @@ object Driver { } df.show(numRows = 3, truncate = 0, vertical = true) - logger.info(s"\nShowing three rows of output above.\nQuery table `${args.joinConf.metaData.outputTable}` for more.\n") + logger.info( + s"\nShowing three rows of output above.\nQuery table `${args.joinConf.metaData.outputTable}` for more.\n") } } object GroupByBackfill { - private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("group-by-backfill") with OfflineSubcommand @@ -525,7 +526,7 @@ object Driver { } object FetcherCli { - private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("fetch") with OnlineSubcommand { val keyJson: ScallopOption[String] = opt[String](required = false, descr = "json of the keys to fetch") @@ -565,7 +566,8 @@ object Driver { ) series.get(keyMap("statsKey").asInstanceOf[String]) else series - logger.info(s"--- [FETCHED RESULT] ---\n${objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(toPrint)}") + logger.info( + s"--- [FETCHED RESULT] ---\n${objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(toPrint)}") } def run(args: Args): Unit = { @@ -644,7 +646,7 @@ object Driver { } object MetadataUploader { - private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("metadata-upload") with OnlineSubcommand { val confPath: ScallopOption[String] = opt[String](required = true, descr = "Path to the Chronon config file or directory") @@ -689,7 +691,7 @@ object Driver { } object GroupByStreaming { - private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) def dataStream(session: SparkSession, host: String, topic: String): DataFrame = { TopicChecker.topicShouldExist(topic, host) session.streams.addListener(new StreamingQueryListener() { @@ -713,7 +715,7 @@ object Driver { } class Args extends Subcommand("group-by-streaming") with OnlineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) val confPath: ScallopOption[String] = opt[String](required = true, descr = "path to groupBy conf") val DEFAULT_LAG_MILLIS = 2000 // 2seconds val kafkaBootstrap: ScallopOption[String] = diff --git a/spark/src/main/scala/ai/chronon/spark/Extensions.scala b/spark/src/main/scala/ai/chronon/spark/Extensions.scala index 3e3e809df..d237b183c 100644 --- a/spark/src/main/scala/ai/chronon/spark/Extensions.scala +++ b/spark/src/main/scala/ai/chronon/spark/Extensions.scala @@ -84,7 +84,7 @@ object Extensions { } implicit class DataframeOps(df: DataFrame) { - private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) private implicit val tableUtils: TableUtils = 
TableUtils(df.sparkSession) // This is safe to call on dataframes that are un-shuffled from their disk sources - diff --git a/spark/src/main/scala/ai/chronon/spark/JoinBase.scala b/spark/src/main/scala/ai/chronon/spark/JoinBase.scala index c1d1c7f07..8cbb4d4db 100644 --- a/spark/src/main/scala/ai/chronon/spark/JoinBase.scala +++ b/spark/src/main/scala/ai/chronon/spark/JoinBase.scala @@ -167,7 +167,8 @@ abstract class JoinBase(joinConf: api.Join, } } catch { case e: Exception => - logger.info(s"Error while processing groupBy: ${joinConf.metaData.name}/${joinPart.groupBy.getMetaData.getName}") + logger.info( + s"Error while processing groupBy: ${joinConf.metaData.name}/${joinPart.groupBy.getMetaData.getName}") throw e } if (tableUtils.tableExists(partTable)) { @@ -193,7 +194,8 @@ abstract class JoinBase(joinConf: api.Join, val rowCount = leftDfWithStats.get.count val unfilledRange = leftDfWithStats.get.partitionRange - logger.info(s"\nBackfill is required for ${joinPart.groupBy.metaData.name} for $rowCount rows on range $unfilledRange") + logger.info( + s"\nBackfill is required for ${joinPart.groupBy.metaData.name} for $rowCount rows on range $unfilledRange") val rightBloomMap = JoinUtils.genBloomFilterIfNeeded(leftDf, joinPart, diff --git a/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala b/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala index cc21503a7..788ad67be 100644 --- a/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala +++ b/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala @@ -94,7 +94,8 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { labelTable } else { // creating final join view with feature join output table - logger.info(s"Joining label table : ${outputLabelTable} with joined output table : ${joinConf.metaData.outputTable}") + logger.info( + s"Joining label table : ${outputLabelTable} with joined output table : ${joinConf.metaData.outputTable}") JoinUtils.createOrReplaceView( joinConf.metaData.outputFinalView, leftTable = joinConf.metaData.outputTable, diff --git a/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala index 43255e177..7993ff9e4 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala @@ -71,7 +71,8 @@ class CompareJob( // Save the comparison table logger.info("Saving comparison output..") - logger.info(s"Comparison schema ${compareDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") + logger.info( + s"Comparison schema ${compareDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") tableUtils.insertUnPartitioned(compareDf, comparisonTableName, tableProps, saveMode = SaveMode.Overwrite) // Save the metrics table diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala b/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala index fc058d7b0..4fca7e51a 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala @@ -107,7 +107,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map private val microBatchIntervalMillis: Int = getProp("batch_interval_millis", "1000").toInt private case class PutRequestHelper(inputSchema: StructType) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + private val logger = 
LoggerFactory.getLogger(getClass) private val keyIndices: Array[Int] = keyColumns.map(inputSchema.fieldIndex) private val valueIndices: Array[Int] = valueColumns.map(inputSchema.fieldIndex) private val tsIndex: Int = inputSchema.fieldIndex(eventTimeColumn) diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala b/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala index d99ac61e8..a8e9bc0ae 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala @@ -86,7 +86,7 @@ object TopicChecker { } class Args(arguments: Seq[String]) extends ScallopConf(arguments) { - private val logger = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) val conf: ScallopOption[String] = opt[String](descr = "Conf to pull topic and bootstrap server information") val bootstrap: ScallopOption[String] = opt[String](descr = "Kafka bootstrap server in host:port format") val topic: ScallopOption[String] = opt[String](descr = "kafka topic to check metadata for") From 965d8993cdc542f2fd3e4de4b08778ec7af08aa6 Mon Sep 17 00:00:00 2001 From: Varant Zanoyan Date: Tue, 5 Dec 2023 18:06:47 -0800 Subject: [PATCH 6/7] WIP --- api/src/main/scala/ai/chronon/api/ParametricMacro.scala | 2 +- spark/src/main/scala/ai/chronon/spark/Join.scala | 1 - .../main/scala/ai/chronon/spark/streaming/TopicChecker.scala | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/api/src/main/scala/ai/chronon/api/ParametricMacro.scala b/api/src/main/scala/ai/chronon/api/ParametricMacro.scala index 46750785b..6645e2ebb 100644 --- a/api/src/main/scala/ai/chronon/api/ParametricMacro.scala +++ b/api/src/main/scala/ai/chronon/api/ParametricMacro.scala @@ -40,7 +40,7 @@ case class ParametricMacro(value: String, func: Map[String, String] => String) { argSeq.tail :+ (argSeq.head + "," + token) } } - logger.info(parsed) + logger.info(parsed.mkString(",")) parsed.map(_.split("=").map(_.trim)).map(x => x(0) -> x(1)).toMap } val result = func(argMap.getOrElse(Map.empty[String, String])) diff --git a/spark/src/main/scala/ai/chronon/spark/Join.scala b/spark/src/main/scala/ai/chronon/spark/Join.scala index 757b69d82..939d80c67 100644 --- a/spark/src/main/scala/ai/chronon/spark/Join.scala +++ b/spark/src/main/scala/ai/chronon/spark/Join.scala @@ -180,7 +180,6 @@ class Join(joinConf: api.Join, logger.info( s"CoveringSet(hash=${coveringSet.hashes.prettyInline}, rowCount=${coveringSet.rowCount}, isCovering=${coveringSet.isCovering})") } - logger.info() } coveringSetsPerJoinPart diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala b/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala index a8e9bc0ae..0b30293d4 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala @@ -109,7 +109,7 @@ object TopicChecker { } else { args.topic() -> args.bootstrap() } - logger.info(getPartitions(topic, bootstrap)) + logger.info(getPartitions(topic, bootstrap).toString) System.exit(0) } } From f49fa88cd45fe5fb95d6e1420ec9ca7dd0bed180 Mon Sep 17 00:00:00 2001 From: Varant Zanoyan Date: Thu, 7 Dec 2023 09:09:13 -0800 Subject: [PATCH 7/7] WIP --- .../aggregator/windowing/HopsAggregator.scala | 2 +- .../windowing/SawtoothOnlineAggregator.scala | 2 +- .../test/ApproxPercentilesTest.scala | 2 +- .../test/SawtoothAggregatorTest.scala | 2 +- .../aggregator/test/VarianceTest.scala | 4 
+- .../scala/ai/chronon/api/Extensions.scala | 2 +- .../ai/chronon/api/ParametricMacro.scala | 4 +- .../ai/chronon/api/ThriftJsonCodec.scala | 2 +- .../api/test/DataTypeConversionTest.scala | 2 +- .../ai/chronon/flink/AsyncKVStoreWriter.scala | 2 +- .../scala/ai/chronon/flink/AvroCodecFn.scala | 2 +- .../chronon/flink/SparkExpressionEvalFn.scala | 2 +- .../main/scala/ai/chronon/online/Api.scala | 2 +- .../ai/chronon/online/DataStreamBuilder.scala | 2 +- .../scala/ai/chronon/online/Fetcher.scala | 2 +- .../scala/ai/chronon/online/FetcherBase.scala | 4 +- .../ai/chronon/online/MetadataStore.scala | 2 +- .../ai/chronon/online/TileCodecTest.scala | 2 +- .../online/test/DataStreamBuilderTest.scala | 2 +- project/FolderCleaner.scala | 2 +- project/ThriftGen.scala | 2 +- .../scala/ai/chronon/spark/Analyzer.scala | 2 +- .../ai/chronon/spark/BootstrapInfo.scala | 2 +- .../scala/ai/chronon/spark/Comparison.scala | 2 +- .../main/scala/ai/chronon/spark/Driver.scala | 14 +-- .../scala/ai/chronon/spark/Extensions.scala | 2 +- .../scala/ai/chronon/spark/FastHashing.scala | 2 +- .../main/scala/ai/chronon/spark/GroupBy.scala | 4 +- .../ai/chronon/spark/GroupByUpload.scala | 4 +- .../main/scala/ai/chronon/spark/Join.scala | 2 +- .../scala/ai/chronon/spark/JoinBase.scala | 2 +- .../scala/ai/chronon/spark/JoinUtils.scala | 2 +- .../main/scala/ai/chronon/spark/KvRdd.scala | 4 +- .../scala/ai/chronon/spark/LabelJoin.scala | 2 +- .../ai/chronon/spark/LocalDataLoader.scala | 2 +- .../ai/chronon/spark/LogFlattenerJob.scala | 2 +- .../ai/chronon/spark/MetadataExporter.scala | 2 +- .../chronon/spark/SparkSessionBuilder.scala | 2 +- .../scala/ai/chronon/spark/StagingQuery.scala | 4 +- .../scala/ai/chronon/spark/TableUtils.scala | 2 +- .../chronon/spark/stats/CompareBaseJob.scala | 2 +- .../ai/chronon/spark/stats/CompareJob.scala | 4 +- .../chronon/spark/stats/ConsistencyJob.scala | 2 +- .../ai/chronon/spark/stats/SummaryJob.scala | 2 +- .../ai/chronon/spark/streaming/GroupBy.scala | 2 +- .../spark/streaming/JoinSourceRunner.scala | 4 +- .../spark/streaming/KafkaStreamBuilder.scala | 2 +- .../spark/streaming/StreamingStats.scala | 2 +- .../spark/streaming/TopicChecker.scala | 4 +- .../ai/chronon/spark/test/AnalyzerTest.scala | 2 +- .../spark/test/ChainingFetcherTest.scala | 3 +- .../ai/chronon/spark/test/CompareTest.scala | 6 +- .../spark/test/FeatureWithLabelJoinTest.scala | 2 +- .../chronon/spark/test/FetchStatsTest.scala | 2 +- .../ai/chronon/spark/test/FetcherTest.scala | 6 +- .../ai/chronon/spark/test/GroupByTest.scala | 28 +++--- .../spark/test/GroupByUploadTest.scala | 2 +- .../chronon/spark/test/InMemoryKvStore.scala | 4 +- .../chronon/spark/test/InMemoryStream.scala | 2 +- .../ai/chronon/spark/test/JoinTest.scala | 94 +++++++++---------- .../ai/chronon/spark/test/LabelJoinTest.scala | 2 +- .../spark/test/MetadataExporterTest.scala | 2 +- .../ai/chronon/spark/test/MutationsTest.scala | 2 +- .../chronon/spark/test/StagingQueryTest.scala | 18 ++-- .../chronon/spark/test/StatsComputeTest.scala | 2 +- .../spark/test/bootstrap/DerivationTest.scala | 2 +- .../test/bootstrap/LogBootstrapTest.scala | 2 +- .../test/bootstrap/TableBootstrapTest.scala | 2 +- 68 files changed, 153 insertions(+), 160 deletions(-) diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala index 627f5cdaa..22361fd4d 100644 --- a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala +++ 
b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala @@ -94,7 +94,7 @@ class HopsAggregator(minQueryTs: Long, inputSchema: Seq[(String, DataType)], resolution: Resolution) extends HopsAggregatorBase(aggregations, inputSchema, resolution) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val leftBoundaries: Array[Option[Long]] = { // Nikhil is pretty confident we won't call this when aggregations is empty diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala index 6af83a824..50b5fddac 100644 --- a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala +++ b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala @@ -32,7 +32,7 @@ class SawtoothOnlineAggregator(val batchEndTs: Long, inputSchema: Seq[(String, DataType)], resolution: Resolution, tailBufferMillis: Long) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) // logically, batch response is arranged like so // sum-90d => sum_ir_88d, [(sum_ir_1d, ts)] -> 1d is the hopSize for 90d diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala index a1ed5672c..3eb8ff564 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala @@ -26,7 +26,7 @@ import org.junit.Assert._ import scala.util.Random class ApproxPercentilesTest extends TestCase { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def testBasicImpl(nums: Int, slide: Int, k: Int, percentiles: Array[Double], errorPercent: Float): Unit = { val sorted = (0 to nums).map(_.toFloat) val elems = Random.shuffle(sorted.toList).toArray diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala index f41fa0a78..60bb5fc2c 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala @@ -31,7 +31,7 @@ import scala.collection.mutable import scala.collection.Seq class Timer { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) var ts: Long = System.currentTimeMillis() diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala index d1a831250..21f7b8a55 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala @@ -22,7 +22,7 @@ import junit.framework.TestCase import org.junit.Assert._ class VarianceTest extends TestCase { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def mean(elems: Seq[Double]): Double = elems.sum / elems.length def naive(elems: Seq[Double]): Double = { @@ -55,7 +55,7 @@ class VarianceTest extends TestCase { val 
naiveResult = naive(nums) val welfordResult = welford(nums) logger.info(s"naive $naiveResult - welford $welfordResult - sum of squares ${sumOfSquares(nums)}") - logger.info((naiveResult - welfordResult) / naiveResult) + logger.info(((naiveResult - welfordResult) / naiveResult).toString) assertTrue((naiveResult - welfordResult) / naiveResult < 0.0000001) } diff --git a/api/src/main/scala/ai/chronon/api/Extensions.scala b/api/src/main/scala/ai/chronon/api/Extensions.scala index 5e87af31e..b78382ab3 100644 --- a/api/src/main/scala/ai/chronon/api/Extensions.scala +++ b/api/src/main/scala/ai/chronon/api/Extensions.scala @@ -784,7 +784,7 @@ object Extensions { } implicit class JoinOps(val join: Join) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) // all keys as they should appear in left that are being used on right def leftKeyCols: Array[String] = { join.joinParts.toScala diff --git a/api/src/main/scala/ai/chronon/api/ParametricMacro.scala b/api/src/main/scala/ai/chronon/api/ParametricMacro.scala index 6645e2ebb..0dad87991 100644 --- a/api/src/main/scala/ai/chronon/api/ParametricMacro.scala +++ b/api/src/main/scala/ai/chronon/api/ParametricMacro.scala @@ -21,7 +21,7 @@ import scala.collection.mutable // takes a map of macro names and functions and applies the functions on macro arguments case class ParametricMacro(value: String, func: Map[String, String] => String) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private val pattern = s"""\\{\\{\\s*$value(\\([\\s0-9A-Za-z_.,=]*\\))*\\s*}}""".r def replace(str: String): String = { @@ -53,7 +53,7 @@ case class ParametricMacro(value: String, func: Map[String, String] => String) { } object ParametricMacro { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def main(args: Array[String]): Unit = { val mc = ParametricMacro("something", { x => "st:" + x.keys.mkString("/") + "|" + x.values.mkString("/") }) val str = "something nothing-{{ something( a_1=b,, 3.1, c=d) }}-something after-{{ thing:a1=b1 }}{{ something }}" diff --git a/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala b/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala index a13f7ab3d..c560fe5f7 100644 --- a/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala +++ b/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala @@ -29,7 +29,7 @@ import scala.reflect.ClassTag import scala.util.ScalaJavaConversions.ListOps object ThriftJsonCodec { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def serializer = new TSerializer(new TSimpleJSONProtocol.Factory()) diff --git a/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala b/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala index 1640d8f98..422afc5fe 100644 --- a/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala +++ b/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala @@ -24,7 +24,7 @@ import org.junit.Assert._ import org.junit.Test class DataTypeConversionTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) @Test def testDataTypeToThriftAndBack(): Unit = { // build some complex type diff --git a/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala 
b/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala index 936b484b1..e99c5d28b 100644 --- a/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala +++ b/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala @@ -65,7 +65,7 @@ object AsyncKVStoreWriter { */ class AsyncKVStoreWriter(onlineImpl: Api, featureGroupName: String) extends RichAsyncFunction[PutRequest, WriteResponse] { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) @transient private var kvStore: KVStore = _ diff --git a/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala b/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala index db626ed8d..1d4163d7e 100644 --- a/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala +++ b/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala @@ -20,7 +20,7 @@ import scala.jdk.CollectionConverters._ */ case class AvroCodecFn[T](groupByServingInfoParsed: GroupByServingInfoParsed) extends RichFlatMapFunction[Map[String, Any], PutRequest] { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) @transient protected var avroConversionErrorCounter: Counter = _ diff --git a/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala b/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala index 6c5a32dcf..8fd5800ce 100644 --- a/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala +++ b/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala @@ -28,7 +28,7 @@ import scala.jdk.CollectionConverters.{asScalaBufferConverter, mapAsScalaMapConv * @tparam T The type of the input data.
*/ class SparkExpressionEvalFn[T](encoder: Encoder[T], groupBy: GroupBy) extends RichFlatMapFunction[T, Map[String, Any]] { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private val query: Query = groupBy.streamingSource.get.getEvents.query diff --git a/online/src/main/scala/ai/chronon/online/Api.scala b/online/src/main/scala/ai/chronon/online/Api.scala index af58081d3..bc8dbe3d2 100644 --- a/online/src/main/scala/ai/chronon/online/Api.scala +++ b/online/src/main/scala/ai/chronon/online/Api.scala @@ -41,7 +41,7 @@ object KVStore { // the main system level api for key value storage // used for streaming writes, batch bulk uploads & fetching trait KVStore { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) implicit val executionContext: ExecutionContext = FlexibleExecutionContext.buildExecutionContext def create(dataset: String): Unit diff --git a/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala b/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala index 597681d06..77c1817c0 100644 --- a/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala +++ b/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala @@ -49,7 +49,7 @@ object TopicInfo { } case class DataStream(df: DataFrame, partitions: Int, topicInfo: TopicInfo) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) // apply a query to a given data stream def apply(query: api.Query, keys: Seq[String] = null, dataModel: DataModel = DataModel.Events): DataStream = { diff --git a/online/src/main/scala/ai/chronon/online/Fetcher.scala b/online/src/main/scala/ai/chronon/online/Fetcher.scala index 3022761c3..d1fea612b 100644 --- a/online/src/main/scala/ai/chronon/online/Fetcher.scala +++ b/online/src/main/scala/ai/chronon/online/Fetcher.scala @@ -81,7 +81,7 @@ class Fetcher(val kvStore: KVStore, debug: Boolean = false, val externalSourceRegistry: ExternalSourceRegistry = null) extends FetcherBase(kvStore, metaDataSet, timeoutMillis, debug) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def buildJoinCodec(joinConf: api.Join): JoinCodec = { val keyFields = new mutable.LinkedHashSet[StructField] diff --git a/online/src/main/scala/ai/chronon/online/FetcherBase.scala b/online/src/main/scala/ai/chronon/online/FetcherBase.scala index 478ddb240..5a7bfd2b8 100644 --- a/online/src/main/scala/ai/chronon/online/FetcherBase.scala +++ b/online/src/main/scala/ai/chronon/online/FetcherBase.scala @@ -43,7 +43,7 @@ class FetcherBase(kvStore: KVStore, timeoutMillis: Long = 10000, debug: Boolean = false) extends MetadataStore(kvStore, metaDataSet, timeoutMillis) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private case class GroupByRequestMeta( groupByServingInfoParsed: GroupByServingInfoParsed, @@ -63,7 +63,7 @@ class FetcherBase(kvStore: KVStore, overallLatency: Long, context: Metrics.Context, totalResponseValueBytes: Int): Map[String, AnyRef] = { - val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val latestBatchValue = batchResponsesTry.map(_.maxBy(_.millis)) val servingInfo = latestBatchValue.map(timedVal => updateServingInfo(timedVal.millis,
oldServingInfo)).getOrElse(oldServingInfo) diff --git a/online/src/main/scala/ai/chronon/online/MetadataStore.scala b/online/src/main/scala/ai/chronon/online/MetadataStore.scala index 6e016133a..2a3e2c4d7 100644 --- a/online/src/main/scala/ai/chronon/online/MetadataStore.scala +++ b/online/src/main/scala/ai/chronon/online/MetadataStore.scala @@ -37,7 +37,7 @@ import scala.util.{Failure, Success, Try} case class DataMetrics(series: Seq[(Long, SortedMap[String, Any])]) class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, timeoutMillis: Long) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private var partitionSpec = PartitionSpec(format = "yyyy-MM-dd", spanMillis = WindowUtils.Day.millis) private val CONF_BATCH_SIZE = 50 diff --git a/online/src/test/scala/ai/chronon/online/TileCodecTest.scala b/online/src/test/scala/ai/chronon/online/TileCodecTest.scala index b60e0053c..9a7b2a780 100644 --- a/online/src/test/scala/ai/chronon/online/TileCodecTest.scala +++ b/online/src/test/scala/ai/chronon/online/TileCodecTest.scala @@ -23,7 +23,7 @@ import org.junit.Test import scala.collection.JavaConverters._ class TileCodecTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private val histogram = Map[String, Int]("A" -> 3, "B" -> 2).asJava private val aggregationsAndExpected: Array[(Aggregation, Seq[Any])] = Array( diff --git a/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala b/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala index 2b6661027..74ec3a211 100644 --- a/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala +++ b/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala @@ -27,7 +27,7 @@ import org.junit.Test import scala.util.ScalaJavaConversions.JListOps class DataStreamBuilderTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = { System.setSecurityManager(null) val spark = SparkSession diff --git a/project/FolderCleaner.scala b/project/FolderCleaner.scala index 96c2b213b..49dcb6354 100644 --- a/project/FolderCleaner.scala +++ b/project/FolderCleaner.scala @@ -3,7 +3,7 @@ import java.io.File import scala.reflect.io.Directory object Folder { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def clean(files: File*): Unit = { logger.info(s"Removing folders ${files.map(_.getAbsolutePath)}") files.foreach { file => diff --git a/project/ThriftGen.scala b/project/ThriftGen.scala index 3e9a3f945..a412cc3bc 100644 --- a/project/ThriftGen.scala +++ b/project/ThriftGen.scala @@ -4,7 +4,7 @@ import sbt._ import sys.process._ object Thrift { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def gen(inputPath: String, outputPath: String, language: String, cleanupSuffixPath: String = "", extension: String = null): Seq[File] = { s"""echo "Generating files from thrift file: $outputPath \ninto folder $inputPath" """ !; s"rm -rf $outputPath/$cleanupSuffixPath" !; diff --git a/spark/src/main/scala/ai/chronon/spark/Analyzer.scala b/spark/src/main/scala/ai/chronon/spark/Analyzer.scala index 22d66e353..e6d058996 100644 --- a/spark/src/main/scala/ai/chronon/spark/Analyzer.scala +++ 
b/spark/src/main/scala/ai/chronon/spark/Analyzer.scala @@ -70,7 +70,7 @@ class Analyzer(tableUtils: TableUtils, sample: Double = 0.1, enableHitter: Boolean = false, silenceMode: Boolean = false) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) // include ts into heavy hitter analysis - useful to surface timestamps that have wrong units // include total approx row count - so it is easy to understand the percentage of skewed data def heavyHittersWithTsAndCount(df: DataFrame, diff --git a/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala b/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala index 4a4301b3f..80d4bae0b 100644 --- a/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala +++ b/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala @@ -69,7 +69,7 @@ case class BootstrapInfo( } object BootstrapInfo { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) // Build metadata for the join that contains schema information for join parts, external parts and bootstrap parts def from(joinConf: api.Join, diff --git a/spark/src/main/scala/ai/chronon/spark/Comparison.scala b/spark/src/main/scala/ai/chronon/spark/Comparison.scala index bbd67ecf1..5ade1f544 100644 --- a/spark/src/main/scala/ai/chronon/spark/Comparison.scala +++ b/spark/src/main/scala/ai/chronon/spark/Comparison.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.types.{DecimalType, DoubleType, FloatType, MapType} import java.util object Comparison { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) // used for comparison def sortedJson(m: Map[String, Any]): String = { diff --git a/spark/src/main/scala/ai/chronon/spark/Driver.scala b/spark/src/main/scala/ai/chronon/spark/Driver.scala index c813ed7ca..94b8a17d3 100644 --- a/spark/src/main/scala/ai/chronon/spark/Driver.scala +++ b/spark/src/main/scala/ai/chronon/spark/Driver.scala @@ -56,7 +56,7 @@ class DummyExtensions extends (SparkSessionExtensions => Unit) { // The mega chronon cli object Driver { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def parseConf[T <: TBase[_, _]: Manifest: ClassTag](confPath: String): T = ThriftJsonCodec.fromJsonFile[T](confPath, check = true) @@ -214,7 +214,7 @@ object Driver { } object JoinBackfill { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("join") with OfflineSubcommand @@ -262,7 +262,7 @@ object Driver { } object GroupByBackfill { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("group-by-backfill") with OfflineSubcommand @@ -526,7 +526,7 @@ object Driver { } object FetcherCli { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("fetch") with OnlineSubcommand { val keyJson: ScallopOption[String] = opt[String](required = false, descr = "json of the keys to fetch") @@ -646,7 +646,7 @@ object Driver { } object MetadataUploader { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("metadata-upload") with OnlineSubcommand { val confPath: 
ScallopOption[String] = opt[String](required = true, descr = "Path to the Chronon config file or directory") @@ -691,7 +691,7 @@ object Driver { } object GroupByStreaming { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def dataStream(session: SparkSession, host: String, topic: String): DataFrame = { TopicChecker.topicShouldExist(topic, host) session.streams.addListener(new StreamingQueryListener() { @@ -715,7 +715,7 @@ object Driver { } class Args extends Subcommand("group-by-streaming") with OnlineSubcommand { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val confPath: ScallopOption[String] = opt[String](required = true, descr = "path to groupBy conf") val DEFAULT_LAG_MILLIS = 2000 // 2seconds val kafkaBootstrap: ScallopOption[String] = diff --git a/spark/src/main/scala/ai/chronon/spark/Extensions.scala b/spark/src/main/scala/ai/chronon/spark/Extensions.scala index d237b183c..45ad2559d 100644 --- a/spark/src/main/scala/ai/chronon/spark/Extensions.scala +++ b/spark/src/main/scala/ai/chronon/spark/Extensions.scala @@ -84,7 +84,7 @@ object Extensions { } implicit class DataframeOps(df: DataFrame) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private implicit val tableUtils: TableUtils = TableUtils(df.sparkSession) // This is safe to call on dataframes that are un-shuffled from their disk sources - diff --git a/spark/src/main/scala/ai/chronon/spark/FastHashing.scala b/spark/src/main/scala/ai/chronon/spark/FastHashing.scala index d25a23633..82da3d077 100644 --- a/spark/src/main/scala/ai/chronon/spark/FastHashing.scala +++ b/spark/src/main/scala/ai/chronon/spark/FastHashing.scala @@ -42,7 +42,7 @@ case class KeyWithHash(data: Array[Any], hash: Array[Byte], hashInt: Int) extend } object FastHashing { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) // function to generate a fast-ish hasher // the approach tries to accumulate several tiny closures to compute the final hash def generateKeyBuilder(keys: Array[String], schema: StructType): Row => KeyWithHash = { diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala index 2869558e9..521b58e89 100644 --- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala +++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala @@ -44,7 +44,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation], skewFilter: Option[String] = None, finalize: Boolean = true) extends Serializable { - val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) protected[spark] val tsIndex: Int = inputDf.schema.fieldNames.indexOf(Constants.TimeColumn) protected val selectedSchema: Array[(String, api.DataType)] = SparkConversions.toChrononSchema(inputDf.schema) @@ -392,7 +392,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation], // TODO: truncate queryRange for caching object GroupBy { - val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) // Need to use a case class here to allow null matching case class SourceDataProfile(earliestRequired: String, earliestPresent: String, latestAllowed: String) diff --git a/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala 
b/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala index 32f1ce719..472b32a2a 100644 --- a/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala +++ b/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala @@ -35,7 +35,7 @@ import scala.util.ScalaJavaConversions.{ListOps, MapOps} import scala.util.Try class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable { - val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) implicit val sparkSession: SparkSession = groupBy.sparkSession implicit private val tableUtils: TableUtils = TableUtils(sparkSession) private def fromBase(rdd: RDD[(Array[Any], Array[Any])]): KvRdd = { @@ -105,7 +105,7 @@ class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable } object GroupByUpload { - val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) // TODO - remove this if spark streaming can't reach hive tables private def buildServingInfo(groupByConf: api.GroupBy, diff --git a/spark/src/main/scala/ai/chronon/spark/Join.scala b/spark/src/main/scala/ai/chronon/spark/Join.scala index 939d80c67..b3d41fb4f 100644 --- a/spark/src/main/scala/ai/chronon/spark/Join.scala +++ b/spark/src/main/scala/ai/chronon/spark/Join.scala @@ -66,7 +66,7 @@ class Join(joinConf: api.Join, mutationScan: Boolean = true, showDf: Boolean = false) extends JoinBase(joinConf, endPartition, tableUtils, skipFirstHole, mutationScan, showDf) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private val bootstrapTable = joinConf.metaData.bootstrapTable diff --git a/spark/src/main/scala/ai/chronon/spark/JoinBase.scala b/spark/src/main/scala/ai/chronon/spark/JoinBase.scala index 8cbb4d4db..a9ad6b11d 100644 --- a/spark/src/main/scala/ai/chronon/spark/JoinBase.scala +++ b/spark/src/main/scala/ai/chronon/spark/JoinBase.scala @@ -39,7 +39,7 @@ abstract class JoinBase(joinConf: api.Join, skipFirstHole: Boolean, mutationScan: Boolean = true, showDf: Boolean = false) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) assert(Option(joinConf.metaData.outputNamespace).nonEmpty, s"output namespace could not be empty or null") val metrics: Metrics.Context = Metrics.Context(Metrics.Environment.JoinOffline, joinConf) private val outputTable = joinConf.metaData.outputTable diff --git a/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala b/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala index 017c8d99a..83ff14bce 100644 --- a/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala +++ b/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala @@ -31,7 +31,7 @@ import scala.collection.Seq import scala.util.ScalaJavaConversions.MapOps object JoinUtils { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) /*** * Util methods for join computation diff --git a/spark/src/main/scala/ai/chronon/spark/KvRdd.scala b/spark/src/main/scala/ai/chronon/spark/KvRdd.scala index bbbe9d553..4dd9f4819 100644 --- a/spark/src/main/scala/ai/chronon/spark/KvRdd.scala +++ b/spark/src/main/scala/ai/chronon/spark/KvRdd.scala @@ -71,7 +71,7 @@ sealed trait BaseKvRdd { case class KvRdd(data: RDD[(Array[Any], Array[Any])], keySchema: StructType, valueSchema: StructType)(implicit sparkSession: SparkSession) extends BaseKvRdd { - private val logger = 
LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val withTime = false def toAvroDf(jsonPercent: Int = 1): DataFrame = { @@ -113,7 +113,7 @@ case class TimedKvRdd(data: RDD[(Array[Any], Array[Any], Long)], valueSchema: StructType, storeSchemasPrefix: Option[String] = None)(implicit sparkSession: SparkSession) extends BaseKvRdd { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val withTime = true // TODO make json percent configurable diff --git a/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala b/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala index 788ad67be..8306dcc84 100644 --- a/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala +++ b/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala @@ -32,7 +32,7 @@ import scala.collection.Seq import scala.util.ScalaJavaConversions.IterableOps class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) assert(Option(joinConf.metaData.outputNamespace).nonEmpty, s"output namespace could not be empty or null") assert( diff --git a/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala b/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala index 4d152ddb5..4a2785305 100644 --- a/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala +++ b/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.types.{StringType, TimestampType} import java.io.File object LocalDataLoader { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def writeTableFromFile(file: File, tableName: String, session: SparkSession): Unit = { logger.info(s"Checking table: ${tableName}") if (session.catalog.tableExists(tableName)) return diff --git a/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala b/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala index 39f8bc8f8..5cf9b4e9a 100644 --- a/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala @@ -51,7 +51,7 @@ class LogFlattenerJob(session: SparkSession, schemaTable: String, stepDays: Option[Int] = None) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) implicit val tableUtils: TableUtils = TableUtils(session) val joinTblProps: Map[String, String] = Option(joinConf.metaData.tableProperties) .map(_.toScala) diff --git a/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala b/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala index 02c00dd4d..3a8fa81aa 100644 --- a/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala +++ b/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala @@ -29,7 +29,7 @@ import java.nio.file.Paths import scala.collection.immutable.Map object MetadataExporter { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val GROUPBY_PATH_SUFFIX = "/group_bys" val JOIN_PATH_SUFFIX = "/joins" diff --git a/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala b/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala index 64b7c7372..64ce6e654 100644 --- a/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala 
+++ b/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala @@ -26,7 +26,7 @@ import scala.reflect.io.Path import scala.util.Properties object SparkSessionBuilder { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val DefaultWarehouseDir = new File("/tmp/chronon/spark-warehouse") diff --git a/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala b/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala index 0b201ef64..acdce1a77 100644 --- a/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala +++ b/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala @@ -26,7 +26,7 @@ import scala.collection.mutable import scala.util.ScalaJavaConversions._ class StagingQuery(stagingQueryConf: api.StagingQuery, endPartition: String, tableUtils: TableUtils) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) assert(Option(stagingQueryConf.metaData.outputNamespace).nonEmpty, s"output namespace could not be empty or null") private val outputTable = stagingQueryConf.metaData.outputTable private val tableProps = Option(stagingQueryConf.metaData.tableProperties) @@ -93,7 +93,7 @@ class StagingQuery(stagingQueryConf: api.StagingQuery, endPartition: String, tab } object StagingQuery { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def substitute(tu: TableUtils, query: String, start: String, end: String, latest: String): String = { val macros: Array[ParametricMacro] = Array( diff --git a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala index 2bfe7418d..565acbc1e 100644 --- a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala +++ b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala @@ -37,7 +37,7 @@ import scala.concurrent.{ExecutionContext, ExecutionContextExecutor} import scala.util.{Failure, Success, Try} case class TableUtils(sparkSession: SparkSession) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private val ARCHIVE_TIMESTAMP_FORMAT = "yyyyMMddHHmmss" private lazy val archiveTimestampFormatter = DateTimeFormatter diff --git a/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala index 78ec9aaae..343dee507 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.types.DataType import scala.collection.mutable.ListBuffer object CompareBaseJob { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def checkConsistency( leftFields: Map[String, DataType], diff --git a/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala index 7993ff9e4..baf4db314 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala @@ -39,7 +39,7 @@ class CompareJob( startDate: String, endDate: String ) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val tableProps: Map[String, String] = Option(joinConf.metaData.tableProperties) 
.map(_.toScala) .orNull @@ -107,7 +107,7 @@ class CompareJob( } object CompareJob { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) /** * Extract the discrepancy metrics (like missing records, data mismatch) from the hourly compare metrics, consolidate diff --git a/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala index 85159a53b..11d24f55f 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala @@ -29,7 +29,7 @@ import java.util import scala.util.ScalaJavaConversions.{JListOps, ListOps, MapOps} class ConsistencyJob(session: SparkSession, joinConf: Join, endDate: String) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val tblProperties: Map[String, String] = Option(joinConf.metaData.tableProperties) .map(_.toScala) diff --git a/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala index 175b096fd..9cdad4ae5 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.SparkSession * Follow pattern of OOC for computing offline and uploading online as well. */ class SummaryJob(session: SparkSession, joinConf: Join, endDate: String) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val tableUtils: TableUtils = TableUtils(session) private val loggingStatsTable = joinConf.metaData.loggingStatsTable diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala index f3fc5cf0a..cfa5fe48c 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala @@ -41,7 +41,7 @@ class GroupBy(inputStream: DataFrame, onlineImpl: Api, debug: Boolean = false) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private def buildStreamingQuery(inputTable: String): String = { val streamingSource = groupByConf.streamingSource.get diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala b/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala index 4fca7e51a..da35a08f7 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala @@ -64,7 +64,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map session: SparkSession, apiImpl: Api) extends Serializable { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val context: Metrics.Context = Metrics.Context(Metrics.Environment.GroupByStreaming, groupByConf) @@ -107,7 +107,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map private val microBatchIntervalMillis: Int = getProp("batch_interval_millis", "1000").toInt private case class PutRequestHelper(inputSchema: StructType) extends Serializable { - private val logger = 
LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private val keyIndices: Array[Int] = keyColumns.map(inputSchema.fieldIndex) private val valueIndices: Array[Int] = valueColumns.map(inputSchema.fieldIndex) private val tsIndex: Int = inputSchema.fieldIndex(eventTimeColumn) diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala b/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala index 35515a9f5..2dd7ec6f4 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.streaming.StreamingQueryListener.{ } object KafkaStreamBuilder extends StreamBuilder { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) override def from(topicInfo: TopicInfo)(implicit session: SparkSession, conf: Map[String, String]): DataStream = { val conf = topicInfo.params val bootstrap = conf.getOrElse("bootstrap", conf("host") + conf.get("port").map(":" + _).getOrElse("")) diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala b/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala index ea621d89a..dcbed3349 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala @@ -25,7 +25,7 @@ import java.time.format.DateTimeFormatter import java.time.{Instant, ZoneId, ZoneOffset} class StreamingStats(val publishDelaySeconds: Int) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private var latencyHistogram: KllFloatsSketch = new KllFloatsSketch() private var latencyMsTotal: Long = 0 private var writesTotal: Long = 0 diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala b/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala index 0b30293d4..e34d75263 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala @@ -35,7 +35,7 @@ import scala.reflect.ClassTag import scala.util.Try object TopicChecker { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def getPartitions(topic: String, bootstrap: String): Int = { val props = new Properties() @@ -86,7 +86,7 @@ object TopicChecker { } class Args(arguments: Seq[String]) extends ScallopConf(arguments) { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val conf: ScallopOption[String] = opt[String](descr = "Conf to pull topic and bootstrap server information") val bootstrap: ScallopOption[String] = opt[String](descr = "Kafka bootstrap server in host:port format") val topic: ScallopOption[String] = opt[String](descr = "kafka topic to check metadata for") diff --git a/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala b/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala index 98d3ac8af..cbe97537d 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala @@ -27,7 +27,7 @@ import org.junit.Assert.assertTrue import org.junit.Test class AnalyzerTest { - private val logger = 
LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("AnalyzerTest", local = true) private val tableUtils = TableUtils(spark) diff --git a/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala b/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala index 99ee21fa1..4ea7fe215 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala @@ -36,12 +36,11 @@ import java.lang import java.util.TimeZone import java.util.concurrent.Executors import scala.collection.Seq -import scala.Console.logger.info import scala.concurrent.ExecutionContext import scala.util.ScalaJavaConversions._ class ChainingFetcherTest extends TestCase { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val sessionName = "ChainingFetcherTest" val spark: SparkSession = SparkSessionBuilder.build(sessionName, local = true) private val tableUtils = TableUtils(spark) diff --git a/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala b/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala index ba02f7335..7cda1a1c4 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} import org.junit.Test class CompareTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("CompareTest", local = true) private val tableUtils = TableUtils(spark) @@ -64,7 +64,7 @@ class CompareTest { CompareBaseJob.compare(leftDf, rightDf, keys, tableUtils) val metricsDf = metricsKvRdd.toFlatDf metricsDf.show() - logger.info(result) + logger.info(result.toString) assert(result.series.length == 4, "Invalid result length") for (rowIndex <- 0 until leftData.length) { for ((colName, index) <- leftColumns.zipWithIndex) { @@ -101,7 +101,7 @@ class CompareTest { ) val metricsDf = metricsKvRdd.toFlatDf metricsDf.show() - logger.info(result) + logger.info(result.toString) assert(result.series.length == 4, "Invalid result length") for (rowIndex <- 0 until leftData.length) { for ((colName, index) <- leftColumns.zipWithIndex) { diff --git a/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala b/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala index c05aa74b9..10f1c8316 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala @@ -26,7 +26,7 @@ import org.junit.Assert.assertEquals import org.junit.Test class FeatureWithLabelJoinTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("FeatureWithLabelJoinTest", local = true) private val namespace = "final_join" diff --git a/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala b/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala index 3d4288d99..d7aa88702 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala @@ -47,7 +47,7 @@ import 
scala.concurrent.{Await, ExecutionContext} * Fetch stats. */ class FetchStatsTest extends TestCase { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("FetchStatsTest", local = true) val tableUtils = TableUtils(spark) diff --git a/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala b/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala index 201e6241f..5990aec75 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala @@ -39,7 +39,6 @@ import org.junit.Assert.{assertEquals, assertFalse, assertTrue} import java.lang import java.util.TimeZone import java.util.concurrent.Executors -import scala.Console.logger.info import scala.collection.Seq import scala.compat.java8.FutureConverters import scala.concurrent.duration.{Duration, SECONDS} @@ -48,7 +47,7 @@ import scala.io.Source import scala.util.ScalaJavaConversions._ class FetcherTest extends TestCase { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val sessionName = "FetcherTest" val spark: SparkSession = SparkSessionBuilder.build(sessionName, local = true) private val tableUtils = TableUtils(spark) @@ -568,14 +567,13 @@ class FetcherTest extends TestCase { val responseMap = responses.head.values.get logger.info("====== Empty request response map ======") - logger.info(responseMap) assertEquals(joinConf.joinParts.size() + joinConf.derivations.toScala.derivationsWithoutStar.size, responseMap.size) assertEquals(responseMap.keys.count(_.endsWith("_exception")), joinConf.joinParts.size()) } } object FetcherTestUtil { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) def joinResponses(spark: SparkSession, requests: Array[Request], mockApi: MockApi, diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala index 835dd5486..f8c9876b2 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala @@ -16,7 +16,6 @@ package ai.chronon.spark.test -import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.{CStream, Column, NaiveAggregator} import ai.chronon.aggregator.windowing.FiveMinuteResolution import ai.chronon.api.Extensions._ @@ -46,7 +45,6 @@ import org.junit.Test import scala.collection.mutable class GroupByTest { - private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest", local = true) implicit val tableUtils = TableUtils(spark) @@ -80,7 +78,7 @@ class GroupByTest { val diff = Comparison.sideBySide(actualDf, expectedDf, List("user", tableUtils.partitionColumn)) if (diff.count() > 0) { diff.show() - logger.info("diff result rows") + println("diff result rows") } assertEquals(0, diff.count()) } @@ -132,7 +130,7 @@ class GroupByTest { val diff = Comparison.sideBySide(actualDf, expectedDf, List("user", tableUtils.partitionColumn)) if (diff.count() > 0) { diff.show() - logger.info("diff result rows") + println("diff result rows") } assertEquals(0, diff.count()) } @@ -179,10 +177,10 @@ class GroupByTest { val diff = Comparison.sideBySide(computed, expected, List("user", "ts")) if (diff.count() > 0) { - logger.info(s"Actual count: ${computed.count()}") - 
logger.info(s"Expected count: ${expected.count()}") - logger.info(s"Diff count: ${diff.count()}") - logger.info(s"diff result rows last_k_test") + println(s"Actual count: ${computed.count()}") + println(s"Expected count: ${expected.count()}") + println(s"Diff count: ${diff.count()}") + println(s"diff result rows last_k_test") diff.show() diff.rdd.foreach { row => val gson = new Gson() @@ -197,7 +195,7 @@ class GroupByTest { val computedStr = gson.toJson(computed) val expectedStr = gson.toJson(expected) if (computedStr != expectedStr) { - logger.info(s""" + println(s""" |computed [$computedCount]: ${gson.toJson(computed)} |expected [$expectedCount]: ${gson.toJson(expected)} |""".stripMargin) @@ -265,7 +263,7 @@ class GroupByTest { val diff = Comparison.sideBySide(naiveDf, resultDf, List("user", Constants.TimeColumn)) if (diff.count() > 0) { diff.show() - logger.info("diff result rows") + println("diff result rows") } assertEquals(0, diff.count()) } @@ -546,16 +544,16 @@ class GroupByTest { | latestB.listing = COALESCE(C.listing, '--null--') AND latestB.ts = C.ts |""".stripMargin val expectedInputDf = spark.sql(expectedSQL) - logger.info("Expected input DF: ") + println("Expected input DF: ") expectedInputDf.show() - logger.info("Computed input DF: ") + println("Computed input DF: ") newGroupBy.inputDf.show() val diff = Comparison.sideBySide(newGroupBy.inputDf, expectedInputDf, List("listing", "user", "ds")) if (diff.count() > 0) { - logger.info(s"Actual count: ${newGroupBy.inputDf.count()}") - logger.info(s"Expected count: ${expectedInputDf.count()}") - logger.info(s"Diff count: ${diff.count()}") + println(s"Actual count: ${newGroupBy.inputDf.count()}") + println(s"Expected count: ${expectedInputDf.count()}") + println(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala index 111aafb8a..2ffdc69d4 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala @@ -34,7 +34,7 @@ import scala.concurrent.Await import scala.util.ScalaJavaConversions.{JMapOps, ListOps, MapOps} class GroupByUploadTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByUploadTest", local = true) private val namespace = "group_by_upload_test" diff --git a/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala b/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala index 455c5ab95..c266f56ea 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala @@ -30,7 +30,7 @@ import scala.concurrent.Future import scala.util.Try class InMemoryKvStore(tableUtils: () => TableUtils) extends KVStore with Serializable { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) //type aliases for readability type Key = String type Data = Array[Byte] @@ -139,7 +139,7 @@ class InMemoryKvStore(tableUtils: () => TableUtils) extends KVStore with Seriali } object InMemoryKvStore { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val stores: ConcurrentHashMap[String, InMemoryKvStore] = new 
ConcurrentHashMap[String, InMemoryKvStore] // We would like to create one instance of InMemoryKVStore per executors, but share SparkContext diff --git a/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala b/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala index 8c075619c..4de384117 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.{DataFrame, Dataset, Encoder, Encoders, Row, SparkSe import java.util.Base64 class InMemoryStream { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) private def encode(schema: org.apache.avro.Schema)(row: Row): Array[Byte] = { val gr: GenericRecord = new GenericData.Record(schema) diff --git a/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala b/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala index 6460d6fff..2f93e4b96 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala @@ -16,7 +16,6 @@ package ai.chronon.spark.test -import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.api import ai.chronon.api.{Accuracy, Builders, Constants, LongType, Operation, StringType, TimeUnit, Window} @@ -36,7 +35,6 @@ import scala.collection.JavaConverters._ import scala.util.ScalaJavaConversions.ListOps class JoinTest { - private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build( "JoinTest", @@ -85,7 +83,7 @@ class JoinTest { snapshotTable = dollarTable ) - //logger.info("Rupee Source start partition $month") + //println("Rupee Source start partition $month") val rupeeSource = Builders.Source.entities( query = Builders.Query( @@ -149,7 +147,7 @@ class JoinTest { dropStart, dropEnd ) - logger.info(tableUtils.partitions(s"$namespace.test_user_transaction_features")) + println(tableUtils.partitions(s"$namespace.test_user_transaction_features")) joinConf.joinParts.toScala .map(jp => joinConf.partOutputTable(jp)) @@ -163,7 +161,7 @@ class JoinTest { resetUDFs() val runner2 = new Join(joinConf, end, tableUtils) val computed = runner2.computeJoin(Some(3)) - logger.info(s"join start = $start") + println(s"join start = $start") val expectedQuery = s""" |WITH @@ -213,11 +211,11 @@ class JoinTest { val diff = Comparison.sideBySide(computed, expected, List("user_name", "user", "ts", "ds")) if (diff.count() > 0) { - logger.info(s"Actual count: ${computed.count()}") - logger.info(s"Expected count: ${expected.count()}") - logger.info(s"Diff count: ${diff.count()}") - logger.info(s"Queries count: ${queries.count()}") - logger.info(s"diff result rows") + println(s"Actual count: ${computed.count()}") + println(s"Expected count: ${expected.count()}") + println(s"Diff count: ${diff.count()}") + println(s"Queries count: ${queries.count()}") + println(s"diff result rows") diff.show() } assertEquals(0, diff.count()) @@ -231,7 +229,7 @@ class JoinTest { val endMinus2 = tableUtils.partitionSpec.minus(end, new Window(2, TimeUnit.DAYS)) tableUtils.dropPartitionRange(s"$namespace.test_user_transaction_features", endMinus1, endMinus1) - logger.info(tableUtils.partitions(s"$namespace.test_user_transaction_features")) + println(tableUtils.partitions(s"$namespace.test_user_transaction_features")) joinConf.joinParts.asScala .map(jp => joinConf.partOutputTable(jp)) @@ -245,11 +243,11 @@ class 
JoinTest { val diff2 = Comparison.sideBySide(computed2, expected2, List("user_name", "user", "ts", "ds")) if (diff2.count() > 0) { - logger.info(s"Actual count: ${computed2.count()}") - logger.info(s"Expected count: ${expected2.count()}") - logger.info(s"Diff count: ${diff2.count()}") - logger.info(s"Queries count: ${queries.count()}") - logger.info(s"diff result rows") + println(s"Actual count: ${computed2.count()}") + println(s"Expected count: ${expected2.count()}") + println(s"Diff count: ${diff2.count()}") + println(s"Queries count: ${queries.count()}") + println(s"diff result rows") diff2.show() } assertEquals(0, diff2.count()) @@ -344,18 +342,18 @@ class JoinTest { | AND countries.country = grouped_heights.country """.stripMargin) - logger.info("showing join result") + println("showing join result") computed.show() - logger.info("showing query result") + println("showing query result") expected.show() - logger.info( + println( s"Left side count: ${spark.sql(s"SELECT country, ds from $countryTable where ds >= '$start' and ds <= '$end'").count()}") - logger.info(s"Actual count: ${computed.count()}") - logger.info(s"Expected count: ${expected.count()}") + println(s"Actual count: ${computed.count()}") + println(s"Expected count: ${expected.count()}") val diff = Comparison.sideBySide(computed, expected, List("country", "ds")) if (diff.count() > 0) { - logger.info(s"Diff count: ${diff.count()}") - logger.info(s"diff result rows") + println(s"Diff count: ${diff.count()}") + println(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -364,14 +362,14 @@ class JoinTest { * should not trigger a backfill and exit the program properly */ - // use console to redirect logger.info message to Java IO + // use console to redirect println message to Java IO val stream = new java.io.ByteArrayOutputStream() Console.withOut(stream) { // rerun the same join job runner.computeJoin(Some(7)) } val stdOutMsg = stream.toString() - logger.info(s"std out message =\n $stdOutMsg") + println(s"std out message =\n $stdOutMsg") // make sure that the program exits with target print statements assertTrue(stdOutMsg.contains(s"There is no data to compute based on end partition of $end.")) } @@ -417,12 +415,12 @@ class JoinTest { val runner = new Join(joinConf, end, tableUtils) val computed = runner.computeJoin(Some(7)) - logger.info("showing join result") + println("showing join result") computed.show() val leftSideCount = spark.sql(s"SELECT country, ds from $countryTable where ds == '$end'").count() - logger.info(s"Left side expected count: $leftSideCount") - logger.info(s"Actual count: ${computed.count()}") + println(s"Left side expected count: $leftSideCount") + println(s"Actual count: ${computed.count()}") assertEquals(leftSideCount, computed.count()) // There should be only one partition in computed df which equals to end partition val allPartitions = computed.select("ds").rdd.map(row => row(0)).collect().toSet @@ -493,8 +491,8 @@ class JoinTest { val diff = Comparison.sideBySide(computed, expected, List("item", "ts", "ds")) if (diff.count() > 0) { - logger.info(s"Diff count: ${diff.count()}") - logger.info(s"diff result rows") + println(s"Diff count: ${diff.count()}") + println(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -568,8 +566,8 @@ class JoinTest { tableUtils.sql(s"SELECT item, ts, ds from $itemQueriesTable where ds >= '$start' and ds <= '$dayAndMonthBefore'") assertEquals(queriesBare.count(), computed.count()) if (diff.count() > 0) { - logger.info(s"Diff count: 
${diff.count()}") - logger.info(s"diff result rows") + println(s"Diff count: ${diff.count()}") + println(s"diff result rows") diff .replaceWithReadableTime(Seq("ts", "a_user_unit_test_item_views_ts_max", "b_user_unit_test_item_views_ts_max"), dropOriginal = true) @@ -589,7 +587,7 @@ class JoinTest { // Run job val itemQueriesTable = s"$namespace.item_queries" - logger.info("Item Queries DF: ") + println("Item Queries DF: ") val q = s""" |SELECT @@ -630,8 +628,8 @@ class JoinTest { tableUtils.sql(s"SELECT item, ts, ds from $itemQueriesTable where ds >= '$start' and ds <= '$dayAndMonthBefore'") assertEquals(queriesBare.count(), computed.count()) if (diff.count() > 0) { - logger.info(s"Diff count: ${diff.count()}") - logger.info(s"diff result rows") + println(s"Diff count: ${diff.count()}") + println(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -704,7 +702,7 @@ class JoinTest { None, viewsGroupByCumulative.inferredAccuracy ) - logger.info(renderedIncremental) + println(renderedIncremental) assert(renderedIncremental.contains(s"(ds >= '2021-01-01') AND (ds <= '2021-01-01')")) } @@ -754,7 +752,7 @@ class JoinTest { val runner = new Join(joinConf, end, tableUtils) val computed = runner.computeJoin(Some(7)) - logger.info(s"join start = $start") + println(s"join start = $start") val expected = tableUtils.sql(s""" |WITH | users AS (SELECT user, ds from $usersTable where ds >= '$start' and ds <= '$end'), @@ -772,18 +770,18 @@ class JoinTest { | AND users.ds = grouped_names.ds """.stripMargin) - logger.info("showing join result") + println("showing join result") computed.show() - logger.info("showing query result") + println("showing query result") expected.show() - logger.info( + println( s"Left side count: ${spark.sql(s"SELECT user, ds from $namesTable where ds >= '$start' and ds <= '$end'").count()}") - logger.info(s"Actual count: ${computed.count()}") - logger.info(s"Expected count: ${expected.count()}") + println(s"Actual count: ${computed.count()}") + println(s"Expected count: ${expected.count()}") val diff = Comparison.sideBySide(computed, expected, List("user", "ds")) if (diff.count() > 0) { - logger.info(s"Diff count: ${diff.count()}") - logger.info(s"diff result rows") + println(s"Diff count: ${diff.count()}") + println(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -807,7 +805,7 @@ class JoinTest { val leftChangeJoin = new Join(joinConf = leftChangeJoinConf, endPartition = dayAndMonthBefore, tableUtils) val leftChangeRecompute = JoinUtils.tablesToRecompute(leftChangeJoinConf, leftChangeJoinConf.metaData.outputTable, tableUtils) - logger.info(leftChangeRecompute) + println(leftChangeRecompute) assertEquals(leftChangeRecompute.size, 3) val partTable = s"${leftChangeJoinConf.metaData.outputTable}_user_unit_test_item_views" assertEquals(leftChangeRecompute, @@ -874,8 +872,8 @@ class JoinTest { tableUtils.sql(s"SELECT item, ts, ds from $itemQueriesTable where ds >= '$start' and ds <= '$dayAndMonthBefore'") assertEquals(queriesBare.count(), computed.count()) if (diff.count() > 0) { - logger.info(s"Diff count: ${diff.count()}") - logger.info(s"diff result rows") + println(s"Diff count: ${diff.count()}") + println(s"diff result rows") diff .replaceWithReadableTime( Seq("ts", "a_user_3_unit_test_item_views_ts_max", "b_user_3_unit_test_item_views_ts_max"), @@ -1008,7 +1006,7 @@ class JoinTest { ) val skipBloomComputed = new Join(joinConf, today, testTableUtils).computeJoin() val leftSideCount = testSpark.sql(s"SELECT item, ts, ds from $itemQueriesTable 
where ds >= '$start'").count() - logger.info("computed count: " + skipBloomComputed.count()) + println("computed count: " + skipBloomComputed.count()) assertEquals(leftSideCount, skipBloomComputed.count()) } diff --git a/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala b/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala index 70ac0d5f8..a08721134 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala @@ -24,7 +24,7 @@ import org.junit.Assert.assertEquals import org.junit.Test class LabelJoinTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("LabelJoinTest", local = true) diff --git a/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala b/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala index 5f5b49cf6..46f741cbf 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala @@ -31,7 +31,7 @@ import scala.io.Source import java.io.File class MetadataExporterTest extends TestCase { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val sessionName = "MetadataExporter" val spark: SparkSession = SparkSessionBuilder.build(sessionName, local = true) diff --git a/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala b/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala index 5bd56c710..59b1eb878 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala @@ -33,7 +33,7 @@ import org.junit.Test * Join is the events and the entity value at the exact timestamp of the ts. 
*/ class MutationsTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build( "MutationsTest", diff --git a/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala b/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala index 53a449306..b61530d74 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala @@ -27,7 +27,7 @@ import org.junit.Assert.assertEquals import org.junit.Test class StagingQueryTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("StagingQueryTest", local = true) implicit private val tableUtils: TableUtils = TableUtils(spark) @@ -70,9 +70,9 @@ class StagingQueryTest { val diff = Comparison.sideBySide(expected, computed, List("user", "ts", "ds")) if (diff.count() > 0) { logger.info(s"Actual count: ${expected.count()}") - logger.info(expected.show()) + expected.show() logger.info(s"Computed count: ${computed.count()}") - logger.info(computed.show()) + computed.show() logger.info(s"Diff count: ${diff.count()}") logger.info(s"diff result rows") diff.show() @@ -144,9 +144,9 @@ class StagingQueryTest { val diffV2 = Comparison.sideBySide(expectedUpdated, computedUpdated, List("user", "ts", "ds")) if (diffV2.count() > 0) { logger.info(s"Actual count: ${expectedUpdated.count()}") - logger.info(expectedUpdated.show()) + expectedUpdated.show() logger.info(s"Computed count: ${computedUpdated.count()}") - logger.info(computedUpdated.show()) + computedUpdated.show() logger.info(s"Diff count: ${diffV2.count()}") logger.info(s"diff result rows") diffV2.show() @@ -200,9 +200,9 @@ class StagingQueryTest { val diff = Comparison.sideBySide(expected, computed, List("user", "ts", "ds")) if (diff.count() > 0) { logger.info(s"Actual count: ${expected.count()}") - logger.info(expected.show()) + expected.show() logger.info(s"Computed count: ${computed.count()}") - logger.info(computed.show()) + computed.show() logger.info(s"Diff count: ${diff.count()}") logger.info(s"diff result rows") diff.show() @@ -252,9 +252,9 @@ class StagingQueryTest { val diff = Comparison.sideBySide(expected, computed, List("user", "ts", "ds")) if (diff.count() > 0) { logger.info(s"Actual count: ${expected.count()}") - logger.info(expected.show()) + expected.show() logger.info(s"Computed count: ${computed.count()}") - logger.info(computed.show()) + computed.show() logger.info(s"Diff count: ${diff.count()}") logger.info(s"diff result rows") diff.show() diff --git a/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala b/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala index d079260bc..f58a501e9 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala @@ -28,7 +28,7 @@ import ai.chronon.spark.stats.StatsCompute import org.apache.spark.sql.functions.lit class StatsComputeTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("StatsComputeTest", local = true) implicit val tableUtils = TableUtils(spark) val namespace: String = "stats_compute_test" diff --git 
a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala index 7e74cf0de..75cbf2a8b 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala @@ -36,7 +36,7 @@ import scala.concurrent.duration.Duration import scala.util.ScalaJavaConversions.JListOps class DerivationTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("DerivationTest", local = true) private val tableUtils = TableUtils(spark) diff --git a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala index 71337c0e1..00fce2256 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala @@ -34,7 +34,7 @@ import scala.concurrent.duration.Duration import scala.util.ScalaJavaConversions._ class LogBootstrapTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("BootstrapTest", local = true) val namespace = "test_log_bootstrap" diff --git a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala index f30f0b20d..1bbf6a887 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala @@ -29,7 +29,7 @@ import org.junit.Test import scala.util.ScalaJavaConversions.JListOps class TableBootstrapTest { - private val logger = LoggerFactory.getLogger(getClass) + @transient lazy val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("BootstrapTest", local = true) private val tableUtils = TableUtils(spark)
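
Note: the recurring change in the hunks above swaps eagerly-initialized `private val` SLF4J loggers for `@transient lazy val` loggers, so that Spark task serialization does not try to ship a non-serializable logger along with the enclosing class, and the logger is re-created on first use inside each executor JVM. Below is a minimal, self-contained sketch of that pattern; the `PartitionLogger` class and its method are illustrative only and are not part of this patch.

import org.slf4j.LoggerFactory

// Sketch of the logger pattern applied throughout this patch (hypothetical class).
class PartitionLogger extends Serializable {
  // @transient keeps the non-serializable SLF4J logger out of serialized closures;
  // lazy re-initializes it lazily on each executor after deserialization.
  @transient lazy val logger = LoggerFactory.getLogger(getClass)

  // Example use inside work that Spark would serialize and run on executors.
  def logPartition(rows: Iterator[String]): Unit =
    rows.foreach(row => logger.info(s"row: $row"))
}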