Skip to content

Commit

Permalink
Merge pull request jeromekelleher#458 from jeromekelleher/fix-plots
Browse files: browse the repository at this point in the history
Fixup some issues with resource and sample plotting
  • Loading branch information
jeromekelleher authored Dec 18, 2024
2 parents 33a72d6 + 22086e9 commit e9d1fdc
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 27 deletions.
40 changes: 14 additions & 26 deletions sc2ts/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,10 +904,7 @@ def samples_summary(self):
data.append({"date": date, **row})
df = pd.DataFrame(data)
df["inserted"] = df["total"] - df["rejected"] - df["exact_matches"]
if "total_hmm_cost" not in df:
# TMP! Remove this once we've got total_hmm_cost in the actual m
df["total_hmm_cost"] = df["mean_hmm_cost"] * df["total"]
return df.astype({"date": "datetime64[s]"})
return df

def sample_groups_summary(self):
data = []
Expand Down Expand Up @@ -1583,13 +1580,12 @@ def plot_deletion_overlaps(self, annotate_threshold=0.9):
def plot_samples_per_day(
self, start_date="2020-01-01", end_date="3000-01-01", scorpio_fraction=0.05
):
start_date = np.datetime64(start_date)
end_date = np.datetime64(end_date)
df = self.samples_summary()
df = df[(df.date >= start_date) & (df.date < end_date)]

dfa = df.groupby("date").sum().reset_index()
dfa = df.groupby("date").sum().reset_index().astype({"date": "datetime64[s]"})
dfa["mean_hmm_cost"] = dfa["total_hmm_cost"] / dfa["total"]

fig, (ax1, ax2, ax3, ax4) = self._wide_plot(4, height=12, sharex=True)
exact_col = "tab:red"
in_col = "tab:purple"
Expand Down Expand Up @@ -1630,7 +1626,9 @@ def plot_samples_per_day(

df_scorpio = df.pivot_table(
columns="scorpio", index="date", values="total", aggfunc="sum", fill_value=0
)
).reset_index()
# Need force conversion back to datetime here for some reason
df_scorpio = df_scorpio.astype({"date": "datetime64[s]"}).set_index("date")
# convert to fractions
df_scorpio = df_scorpio.divide(df_scorpio.sum(axis="columns"), axis="index")
# Remove columns that don't have more than the threshold
Expand Down Expand Up @@ -1668,18 +1666,13 @@ def plot_resources(self, start_date="2020-01-01", end_date="3000-01-01"):
ts = self.ts
fig, ax = self._wide_plot(3, height=8, sharex=True)

start_date = np.datetime64(start_date)
end_date = np.datetime64(end_date)
df = self.samples_summary()

dfs = self.samples_summary().set_index("date")
dfs = self.samples_summary()
dfa = dfs.groupby("date").sum()
dfa["mean_hmm_cost"] = dfa["total_hmm_cost"] / dfa["total"]
df = self.resources_summary().set_index("date")
# Should be able to do this with join, but I failed
df["samples_in_arg"] = dfa.loc[df.index]["inserted"]
df["samples_processed"] = dfa.loc[df.index]["total"]
df["mean_hmm_cost"] = dfa.loc[df.index]["mean_hmm_cost"]
df = dfa.join(self.resources_summary(), how="inner")
df = df.rename(
columns={"inserted": "smaples_in_arg", "total": "samples_processed"}
)
df = df[(df.index >= start_date) & (df.index < end_date)]

df["cpu_time"] = df.user_time + df.sys_time
Expand Down Expand Up @@ -1727,17 +1720,12 @@ def plot_resources(self, start_date="2020-01-01", end_date="3000-01-01"):
def resources_summary(self):
ts = self.ts
data = []
df_samples = self.samples_summary()
dates = df_samples["date"].unique()
assert len(dates) == ts.num_provenances
for j in range(ts.num_provenances):
p = ts.provenance(j)
for p in ts.provenances():
record = json.loads(p.record)
text_date = record["parameters"]["date"]
assert text_date == str(dates[j]).split(" ")[0]
resources = record["resources"]
data.append({"date": dates[j], **resources})
return pd.DataFrame(data)
data.append({"date": text_date, **resources})
return pd.DataFrame(data).set_index("date")

def node_type_summary(self):
ts = self.ts
Expand Down
2 changes: 1 addition & 1 deletion tests/test_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def test_draw_subtree(self, fx_ti_2020_02_13):
def test_resources_summary(self, fx_ti_2020_02_13):
df = fx_ti_2020_02_13.resources_summary()
assert df.shape[0] == 20
assert np.all(df.date.astype(str).str.startswith("2020"))
assert np.all(df.index.astype(str).str.startswith("2020"))

def test_samples_summary(self, fx_ti_2020_02_13):
df = fx_ti_2020_02_13.samples_summary()
Expand Down

0 comments on commit e9d1fdc

Please sign in to comment.