From c6a0e9c632e6197ddd15998b5ce8bdcbb73a9521 Mon Sep 17 00:00:00 2001
From: Paul Swartz
Date: Wed, 21 Aug 2024 11:47:10 -0400
Subject: [PATCH] doc: initial calculation/writeup of terminal schedule accuracy

---
 ...ght_rail_terminal_schedule_accuracy.livemd | 138 +++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 reports/light_rail_terminal_schedule_accuracy.livemd

diff --git a/reports/light_rail_terminal_schedule_accuracy.livemd b/reports/light_rail_terminal_schedule_accuracy.livemd
new file mode 100644
index 0000000..a587648
--- /dev/null
+++ b/reports/light_rail_terminal_schedule_accuracy.livemd
@@ -0,0 +1,138 @@
+
+
+# Light Rail Terminal Schedule Accuracy
+
+```elixir
+Mix.install([
+  {:explorer, "~> 0.9.1"},
+  {:kino, "~> 0.13.2"},
+  {:req, "~> 0.5.6"}
+])
+
+```
+
+## Grab All The Data
+
+```elixir
+require Explorer.DataFrame, as: DF
+alias Explorer.Series
+
+# One business week: five consecutive service dates starting 2024-08-12.
+start_date = ~D[2024-08-12]
+range = 0..4
+
+# One parquet file name per service date.
+files =
+  Enum.map(range, fn add ->
+    date = Date.add(start_date, add)
+    "#{Date.to_iso8601(date)}-subway-on-time-performance-v1.parquet"
+  end)
+
+# Resolve each name through Kino.FS, load it, and stack all rows into one frame.
+df =
+  files
+  |> Enum.map(&DF.from_parquet!(Kino.FS.file_path(&1)))
+  |> DF.concat_rows()
+
+Kino.DataTable.new(df)
+```
+
+```elixir
+require DF
+
+# Each service date (parsed as YYYYMMDD) converted to Unix seconds. The parsed
+# datetime is assumed to cast to integer microseconds, hence the division by
+# 1_000_000.
+service_date_epoch =
+  df["service_date"]
+  |> Series.cast(:string)
+  |> Series.strptime("%Y%m%d")
+  |> Series.cast(:integer)
+  |> Series.quotient(1_000_000)
+
+# Offset from UTC in seconds, hard-coded to -4 hours.
+# NOTE(review): presumably Eastern Daylight Time; this would be off by an hour
+# for service dates outside daylight saving time - confirm before reusing.
+dst_offset = -4 * 3600
+
+df =
+  df
+  |> DF.put(:service_date_epoch, service_date_epoch)
+  # Scheduled departure as Unix seconds: subtracting the negative offset moves
+  # the schedule forward by four hours so it can be compared with stop_timestamp.
+  |> DF.mutate(scheduled_timestamp: service_date_epoch + (scheduled_departure_time - ^dst_offset))
+  # diff is in seconds: negative is earlier than scheduled, positive is later.
+  |> DF.mutate(diff: stop_timestamp - scheduled_timestamp)
+  # Only after diff is computed, turn both second-based columns into datetimes.
+  |> DF.mutate(
+    stop_timestamp: cast(stop_timestamp * 1000, {:naive_datetime, :millisecond}),
+    scheduled_timestamp: cast(scheduled_timestamp * 1000, {:naive_datetime, :millisecond})
+  )
+  # Keep only Green Line rows at the terminal stations analysed below.
+  |> DF.filter(
+    trunk_route_id == "Green" and
+      parent_station in [
+        "place-lake",
+        "place-clmnl",
+        "place-river",
+        "place-hsmnl",
+        "place-unsqu",
+        "place-mdftf"
+      ]
+  )
+
+df
+|> DF.select([
+  "trip_id",
+  "parent_station",
+  "stop_sequence",
+  "move_timestamp",
+  "stop_timestamp",
+  "scheduled_timestamp",
+  "diff"
+])
+|> DF.sort_by(asc: trip_id, asc: scheduled_timestamp)
+|> Kino.DataTable.new()
+
+```
+
+## Overall Accuracy
+
+Values are in seconds. Negative values are departures earlier than the schedule; positive values are departures later than the schedule.
+
+```elixir
+# Summary statistics for the diff column. Defined once so the overall and the
+# per-terminal tables are guaranteed to use the same aggregations.
+summarise_diff = fn frame ->
+  DF.summarise(frame,
+    count: count(diff),
+    nil_count: nil_count(diff),
+    mean: mean(diff),
+    std: standard_deviation(diff),
+    p25: quantile(diff, 0.25),
+    p50: median(diff),
+    p75: quantile(diff, 0.75)
+  )
+end
+
+df
+|> summarise_diff.()
+|> Kino.DataTable.new()
+```
+
+## Accuracy by Terminal
+
+Values are in seconds. Negative values are departures earlier than the schedule; positive values are departures later than the schedule.
+
+```elixir
+df
+|> DF.group_by(:parent_station)
+|> summarise_diff.()
+|> Kino.DataTable.new()
+```
+
+## Summary
+
+* half of all trains leave more than 4.5 minutes earlier than the schedule
+* a quarter of trains leave later than the schedule
+* Union Square is the least accurate, with half of trains leaving more than 20 minutes earlier than the schedule, and 40% of departures not matching the schedule at all
+* Boston College is the most variable, with a standard deviation of 36 minutes