Skip to content

Commit

Permalink
Update metadata parsing for #112
Browse files Browse the repository at this point in the history
  • Loading branch information
jluethi committed Jul 21, 2022
1 parent e4652ee commit 7a39e2a
Showing 1 changed file with 147 additions and 92 deletions.
239 changes: 147 additions & 92 deletions fractal/tasks/metadata_parsing.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@
Institute for Biomedical Research and Pelkmans Lab from the University of
Zurich.
"""

from xml.etree import ElementTree
import pandas as pd
import warnings

import numpy as np
import pandas as pd
from defusedxml import ElementTree


def parse_yokogawa_metadata(mrf_path, mlf_path):
"""
Expand All @@ -28,27 +29,41 @@ def parse_yokogawa_metadata(mrf_path, mlf_path):
mrf_frame, mlf_frame, error_count = read_metadata_files(mrf_path, mlf_path)

per_site_parameters = [
'x_micrometer',
'y_micrometer',
'pixel_size_x',
'pixel_size_y',
'bit_depth'
]
grouping_params = ['well_id', 'field_id']
grouped_sites = mlf_frame.loc[:, grouping_params + per_site_parameters
].groupby(by = grouping_params)
"x_micrometer",
"y_micrometer",
"pixel_size_x",
"pixel_size_y",
"x_pixel",
"y_pixel",
"bit_depth",
]
grouping_params = ["well_id", "field_id"]
grouped_sites = mlf_frame.loc[
:, grouping_params + per_site_parameters
].groupby(by=grouping_params)
check_grouped_sites_consistency(grouped_sites, per_site_parameters)
site_metadata = grouped_sites.mean()

# Cast image pixel sizes & bit depth to int
site_metadata = site_metadata.astype(
{
"x_pixel": "int",
"y_pixel": "int",
"bit_depth": "int",
}
)

# Absolute Z positions are not saved by the Yokogawa,
# only relative positions to the autofocus
site_metadata['z_micrometer'] = 0
site_metadata["z_micrometer"] = 0

site_metadata = pd.concat([site_metadata, get_z_steps(mlf_frame)], axis=1)

if error_count > 0:
print(f"Succesfully parsed {len(site_metadata)} sites, could not " \
f"parse {error_count} sites due to errors (see warnings).")
print(
f"Succesfully parsed {len(site_metadata)} sites, could not "
f"parse {error_count} sites due to errors (see warnings)."
)
total_files = len(mlf_frame)
# TODO: Check whether the total_files correspond to the number of
# relevant input images in the input folder. Returning it for now
Expand All @@ -62,9 +77,10 @@ def parse_yokogawa_metadata(mrf_path, mlf_path):


def read_metadata_files(mrf_path, mlf_path):
# parsing of mrf & mlf files are based on the yokogawa_image_collection_task
# v0.5 in drogon, written by Dario Vischi. Now modified for Fractal use
# https://github.com/fmi-basel/job-system-workflows/blob/00bbf34448972d27f258a2c28245dd96180e8229/src/gliberal_workflows/tasks/yokogawa_image_collection_task/versions/version_0_5.py
# parsing of mrf & mlf files are based on the
# yokogawa_image_collection_task v0.5 in drogon, written by Dario Vischi.
# https://github.com/fmi-basel/job-system-workflows/blob/00bbf34448972d27f258a2c28245dd96180e8229/src/gliberal_workflows/tasks/yokogawa_image_collection_task/versions/version_0_5.py # noqa
# Now modified for Fractal use

mrf_frame = read_mrf_file(mrf_path)
# TODO: filter_position & filter_wheel_position are parsed, but not
Expand All @@ -78,25 +94,27 @@ def read_metadata_files(mrf_path, mlf_path):
return mrf_frame, mlf_frame, error_count



def read_mrf_file(mrf_path):

# Prepare mrf dataframe
mrf_columns = [
"channel_id", "horiz_pixel_dim", "vert_pixel_dim",
"camera_no", "bit_depth", "horiz_pixels", "vert_pixels",
"filter_wheel_position", "filter_position", "shading_corr_src"
"channel_id",
"horiz_pixel_dim",
"vert_pixel_dim",
"camera_no",
"bit_depth",
"horiz_pixels",
"vert_pixels",
"filter_wheel_position",
"filter_position",
"shading_corr_src",
]
mrf_frame = pd.DataFrame(
columns=mrf_columns
)
mrf_frame = pd.DataFrame(columns=mrf_columns)

mrf_xml = ElementTree.parse(mrf_path).getroot()
# Read mrf file
ns = {"bts": "http://www.yokogawa.co.jp/BTS/BTSSchema/1.0"}
for channel in mrf_xml.findall(
"bts:MeasurementChannel", namespaces=ns
):
for channel in mrf_xml.findall("bts:MeasurementChannel", namespaces=ns):
mrf_frame.loc[channel.get("{%s}Ch" % ns["bts"])] = [
channel.get("{%s}Ch" % ns["bts"]),
float(channel.get("{%s}HorizontalPixelDimension" % ns["bts"])),
Expand All @@ -107,7 +125,7 @@ def read_mrf_file(mrf_path):
int(channel.get("{%s}VerticalPixels" % ns["bts"])),
int(channel.get("{%s}FilterWheelPosition" % ns["bts"])),
int(channel.get("{%s}FilterPosition" % ns["bts"])),
channel.get("{%s}ShadingCorrectionSource" % ns["bts"])
channel.get("{%s}ShadingCorrectionSource" % ns["bts"]),
]

return mrf_frame
Expand All @@ -126,28 +144,40 @@ def blocks(fh, size=65536):
break
yield block

with open(mlf_path, "r", encoding="utf-8", errors='ignore') as fh:
line_counter = sum(block.count("\n") for block in blocks(fh))

mlf_entries = mlf_xml.findall("bts:MeasurementRecord", namespaces=ns)
nb_lines = len(mlf_entries)

# Prepare mlf dataframe
mlf_columns = [
"type", "well_id", "column", "row", "time_point", "field_id",
"z_index", "timeline_id", "action_id", "action",
"x_micrometer", "y_micrometer", "z_micrometer",
"pixel_size_x", "pixel_size_y", "bit_depth", "width", "height",
"channel_id", 'camera_no', "file_name"
"type",
"well_id",
"column",
"row",
"time_point",
"field_id",
"z_index",
"timeline_id",
"action_id",
"action",
"x_micrometer",
"y_micrometer",
"z_micrometer",
"x_pixel",
"y_pixel",
"pixel_size_x",
"pixel_size_y",
"bit_depth",
"width",
"height",
"channel_id",
"camera_no",
"file_name",
]
mlf_frame = pd.DataFrame(
columns=mlf_columns, index=range(0, nb_lines)
)
mlf_frame = pd.DataFrame(columns=mlf_columns, index=range(0, nb_lines))

mrf_channel_indices = {
row.channel_id: idx for idx, (_, row) in enumerate(
mrf_frame.iterrows()
)
row.channel_id: idx
for idx, (_, row) in enumerate(mrf_frame.iterrows())
}

error_count = 0
Expand Down Expand Up @@ -180,17 +210,15 @@ def blocks(fh, size=65536):

well_row_id = record.get("{%s}Row" % ns["bts"])
well_col_id = record.get("{%s}Column" % ns["bts"])
well_id = chr(64+int(well_row_id)) + str(well_col_id).zfill(2)
well_id = chr(64 + int(well_row_id)) + str(well_col_id).zfill(2)

x_pixel = np.nan
y_pixel = np.nan
bit_depth = np.nan
width = np.nan
height = np.nan
camera_no = np.nan
pixel_size_x = np.nan
pixel_size_y = np.nan
if rec_type == 'IMG':
if rec_type == "IMG":
mrf_idx = mrf_channel_indices[channel_id]
pixel_size_x = mrf_frame.iat[mrf_idx, 1]
pixel_size_y = mrf_frame.iat[mrf_idx, 2]
Expand All @@ -204,69 +232,93 @@ def blocks(fh, size=65536):
mlf_frame.iat[idx, 1] = well_id
mlf_frame.iat[idx, 2] = int(well_col_id)
mlf_frame.iat[idx, 3] = int(well_row_id)
mlf_frame.iat[idx, 4] = int(
record.get("{%s}TimePoint" % ns["bts"])
)
mlf_frame.iat[idx, 5] = \
int(record.get("{%s}FieldIndex" % ns["bts"]))
mlf_frame.iat[idx, 4] = int(record.get("{%s}TimePoint" % ns["bts"]))
mlf_frame.iat[idx, 5] = int(record.get("{%s}FieldIndex" % ns["bts"]))
mlf_frame.iat[idx, 6] = int(record.get("{%s}ZIndex" % ns["bts"]))
mlf_frame.iat[idx, 7] = \
int(record.get("{%s}TimelineIndex" % ns["bts"]))
mlf_frame.iat[idx, 8] = \
int(record.get("{%s}ActionIndex" % ns["bts"]))
mlf_frame.iat[idx, 7] = int(
record.get("{%s}TimelineIndex" % ns["bts"])
)
mlf_frame.iat[idx, 8] = int(record.get("{%s}ActionIndex" % ns["bts"]))
mlf_frame.iat[idx, 9] = record.get("{%s}Action" % ns["bts"])
mlf_frame.iat[idx, 10] = x_micrometer
mlf_frame.iat[idx, 11] = y_micrometer
mlf_frame.iat[idx, 12] = z_micrometer

mlf_frame.iat[idx, 13] = pixel_size_x
mlf_frame.iat[idx, 14] = pixel_size_y

mlf_frame.iat[idx, 15] = bit_depth
mlf_frame.iat[idx, 16] = width
mlf_frame.iat[idx, 17] = height
mlf_frame.iat[idx, 18] = channel_id
mlf_frame.iat[idx, 19] = camera_no
mlf_frame.iat[idx, 20] = record.text # file_name
mlf_frame.iat[idx, 13] = width
mlf_frame.iat[idx, 14] = height
mlf_frame.iat[idx, 15] = pixel_size_x
mlf_frame.iat[idx, 16] = pixel_size_y

mlf_frame.iat[idx, 17] = bit_depth
mlf_frame.iat[idx, 18] = width
mlf_frame.iat[idx, 19] = height
mlf_frame.iat[idx, 20] = channel_id
mlf_frame.iat[idx, 21] = camera_no
mlf_frame.iat[idx, 22] = record.text # file_name

mlf_frame = mlf_frame.dropna(thresh=(len(mlf_frame.columns)))
return mlf_frame, error_count


def calculate_steps(site_series: pd.Series):
# site_series is the z_micrometer series for a given site of a given channel
# This function calculates the step size in Z
# site_series is the z_micrometer series for a given site of a given
# channel. This function calculates the step size in Z
steps = site_series.diff()[1:]
assert steps.std().sum() == 0.0, "" \
"When parsing the Yokogawa mlf file, some sites " \
"had varying step size in Z. " \
if not steps.std().sum() == 0.0:
raise Exception(
"When parsing the Yokogawa mlf file, some sites "
"had varying step size in Z. "
"That is not supported for the OME-Zarr parsing"
)
return steps.mean()


def get_z_steps(mlf_frame):
# Process mlf_frame to extract Z information
# run checks on consistencies & return site-based z step dataframe
# Process mlf_frame to extract Z information (pixel size & steps).
# Run checks on consistencies & return site-based z step dataframe
# Group by well, field & channel
grouped_sites_z = mlf_frame.loc[:, [
'well_id',
'field_id',
'action_id',
'channel_id',
'z_micrometer']].groupby(by = [
'well_id',
'field_id',
'action_id',
'channel_id'])
grouped_sites_z = mlf_frame.loc[
:, ["well_id", "field_id", "action_id", "channel_id", "z_micrometer"]
].groupby(by=["well_id", "field_id", "action_id", "channel_id"])
# Group the whole site (combine channels), because Z steps need to be
# consistent between channels for OME-Zarr.
z_data = grouped_sites_z.apply(calculate_steps).groupby(['well_id', 'field_id'])
assert z_data.std().sum().sum() == 0.0, "" \
"When parsing the Yokogawa mlf file, channels had " \
"varying step size in Z. " \
z_data = grouped_sites_z.apply(calculate_steps).groupby(
["well_id", "field_id"]
)
if not z_data.std().sum().sum() == 0.0:
raise Exception(
"When parsing the Yokogawa mlf file, channels had "
"varying step size in Z. "
"That is not supported for the OME-Zarr parsing"
z_frame = z_data.mean()
z_frame.columns = ['pixel_size_z']
)

# Ensure that channels have the same number of z planes and
# reduce it to one value.
# Only check if there is more than one channel available
if any(
grouped_sites_z.count().groupby(["well_id", "field_id"]).count() > 1
):
if any(
grouped_sites_z.count()
.groupby(["well_id", "field_id"])
.std()
.sum()
!= 0
):
raise Exception(
"When parsing the Yokogawa mlf file, channels had "
"varying number of z planes."
"That is not supported for the OME-Zarr parsing"
)
z_steps = (
grouped_sites_z.count()
.groupby(["well_id", "field_id"])
.mean()
.astype(int)
)

# Combine the two dataframes
z_frame = pd.concat([z_data.mean(), z_steps], axis=1)
z_frame.columns = ["pixel_size_z", "z_pixel"]
return z_frame


Expand All @@ -275,7 +327,10 @@ def check_grouped_sites_consistency(grouped_sites, per_site_parameters):
# Same for pixel sizes
# Only relevant when a site has multiple entries
if grouped_sites.count().min().min() > 1:
assert grouped_sites.std().sum().sum() == 0.0, "" \
"When parsing the Yokogawa MeasurementData.mlf file, " \
f"some of the parameters {per_site_parameters} varied within " \
if not grouped_sites.std().sum().sum() == 0.0:
raise Exception(
""
"When parsing the Yokogawa MeasurementData.mlf file, "
f"some of the parameters {per_site_parameters} varied within "
"the site. That is not supported for the OME-Zarr parsing"
)

0 comments on commit 7a39e2a

Please sign in to comment.