Skip to content

Commit

Permalink
add support for binary request of utf8 fixed width strings
Browse files Browse the repository at this point in the history
  • Loading branch information
jreadey committed Nov 2, 2023
1 parent b8ccb83 commit 292aa57
Show file tree
Hide file tree
Showing 8 changed files with 265 additions and 89 deletions.
7 changes: 3 additions & 4 deletions hsds/chunk_sn.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,6 @@ async def PUT_Value(request):
else:
arr = jsonToArray(np_shape, dset_dtype, json_data)

log.debug(f"jsonToArray returned: {arr}")
if num_elements != np.prod(arr.shape):
msg = f"expected {num_elements} elements, but got {np.prod(arr.shape)}"
raise HTTPBadRequest(reason=msg)
Expand All @@ -520,13 +519,13 @@ async def PUT_Value(request):
arr_tmp[...] = arr
arr = arr_tmp
except ValueError:
log.warn(msg)
log.warn(f"ValueError: {msg}")
raise HTTPBadRequest(reason=msg)
except TypeError:
log.warn(msg)
log.warn(f"TypeError: {msg}")
raise HTTPBadRequest(reason=msg)
except IndexError:
log.warn(msg)
log.warn(f"IndexError: {msg}")
raise HTTPBadRequest(reason=msg)
log.debug(f"got json arr: {arr.shape}")
else:
Expand Down
9 changes: 6 additions & 3 deletions hsds/dset_sn.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,12 +801,15 @@ async def POST_Dataset(request):
shape_json["dims"] = dims
rank = 1
elif isinstance(shape, str):
# only valid string value is H5S_NULL
if shape != "H5S_NULL":
# only valid string value is H5S_NULL or H5S_SCALAR
if shape == "H5S_NULL":
shape_json["class"] = "H5S_NULL"
elif shape == "H5S_SCALAR":
shape_json["class"] = "H5S_SCALAR"
else:
msg = "POST Datset with invalid shape value"
log.warn(msg)
raise HTTPBadRequest(reason=msg)
shape_json["class"] = "H5S_NULL"
elif isinstance(shape, list):
if len(shape) == 0:
shape_json["class"] = "H5S_SCALAR"
Expand Down
38 changes: 21 additions & 17 deletions hsds/util/arrayUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def toTuple(rank, data):
else:
return tuple(toTuple(rank - 1, x) for x in data)
else:
if isinstance(data, str):
data = data.encode("utf8")
return data


Expand Down Expand Up @@ -93,6 +95,23 @@ def getNumElements(dims):
return num_elements


def isVlen(dt):
    """
    Return True if the type contains variable length elements

    dt: numpy dtype to inspect.  For a compound (structured) dtype each
    field is checked recursively; for any other dtype the dtype metadata
    is checked for a "vlen" key.
    """
    if dt.names:
        # compound type - vlen if any field (at any nesting depth) is vlen.
        # Test dt.names rather than len(dt) > 1 so that a compound type
        # with just one field still has that field examined.
        return any(isVlen(dt[name]) for name in dt.names)
    # non-compound type - look for the "vlen" metadata key
    return bool(dt.metadata and "vlen" in dt.metadata)


def jsonToArray(data_shape, data_dtype, data_json):
"""
Return numpy array from the given json array.
Expand Down Expand Up @@ -122,6 +141,8 @@ def fillVlenArray(rank, data, arr, index):
converted_data = toTuple(np_shape_rank, data_json)
data_json = converted_data
else:
if isinstance(data_json, str):
data_json = data_json.encode("utf8")
data_json = [data_json,] # listify

if not (None in data_json):
Expand Down Expand Up @@ -149,23 +170,6 @@ def fillVlenArray(rank, data, arr, index):
return arr


def isVlen(dt):
    """
    Return True if the type contains variable length elements

    dt: numpy dtype to inspect.  Compound (structured) dtypes are searched
    field by field, recursively; any other dtype is considered variable
    length when its metadata has a "vlen" key.
    """
    field_names = dt.names
    if field_names:
        # Use dt.names (not len(dt) > 1) so that single-field compound
        # types are searched as well.
        for field_name in field_names:
            if isVlen(dt[field_name]):
                return True
        return False
    # non-compound type - metadata is None when no metadata was attached
    metadata = dt.metadata
    return bool(metadata and "vlen" in metadata)


def getElementSize(e, dt):
"""
Get number of byte needed to given element as a bytestream
Expand Down
11 changes: 7 additions & 4 deletions hsds/util/hdf5dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,19 +344,20 @@ def getTypeItem(dt, metadata=None):
type_info["strPad"] = "H5T_STR_NULLPAD"
elif dt.base.kind == "U":
# Fixed length unicode type
print("fixed UTF, itemsize:", dt.itemsize)
ref_check = check_dtype(ref=dt.base)
if ref_check is not None:
raise TypeError("unexpected reference type")

# Fixed length UTF8 string type
# Fixed length string type with unicode support
type_info["class"] = "H5T_STRING"

# this can be problematic if the encoding of the string is not valid,
# or reqires too many bytes. Use unicode sting length * 4 to handle all
# or requires too many bytes. Use variable length strings to handle all
# UTF8 strings correctly
type_info["charSet"] = "H5T_CSET_UTF8"
# convert from UTF32 length to a fixed length
type_info["length"] = dt.itemsize // 4
type_info["length"] = dt.itemsize
type_info["strPad"] = "H5T_STR_NULLPAD"

elif dt.kind == "b":
Expand Down Expand Up @@ -627,7 +628,9 @@ def createBaseDataType(typeItem):
if typeItem["charSet"] == "H5T_CSET_ASCII":
type_code = "S"
elif typeItem["charSet"] == "H5T_CSET_UTF8":
type_code = "U"
# use the same type_code as ascii strings
# (otherwise, numpy will reserve bytes for UTF32 representation)
type_code = "S"
else:
raise TypeError("unexpected 'charSet' value")
# a fixed size string
Expand Down
4 changes: 3 additions & 1 deletion tests/integ/dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def testScalarDataset(self):
helper.validateId(root_uuid)

# create a dataset obj
data = {"type": "H5T_IEEE_F32LE"}
data = {"type": "H5T_IEEE_F32LE", "shape": "H5S_SCALAR"}
req = self.endpoint + "/datasets"
rsp = self.session.post(req, data=json.dumps(data), headers=headers)
self.assertEqual(rsp.status_code, 201)
Expand Down Expand Up @@ -207,6 +207,8 @@ def testScalarEmptyDimsDataset(self):
helper.validateId(root_uuid)

# create a dataset obj
# using an empty list for shape is equivalent to using
# "H5S_SCALAR"
data = {"type": "H5T_IEEE_F32LE", "shape": []}
req = self.endpoint + "/datasets"
rsp = self.session.post(req, data=json.dumps(data), headers=headers)
Expand Down
175 changes: 175 additions & 0 deletions tests/integ/value_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3407,6 +3407,181 @@ def testShapeReinitialization3D(self):
else:
self.assertEqual(n, 1)

def testPutFixedUTF8StringDataset(self):
    # Test PUT value (JSON request) for a scalar dataset with a
    # fixed length UTF-8 string type
    print("testPutFixedUTF8StringDataset", self.base_domain)
    headers = helper.getRequestHeaders(domain=self.base_domain)

    # Get root uuid (a single, status-checked request for the domain)
    req = helper.getEndpoint() + "/"
    rsp = self.session.get(req, headers=headers)
    self.assertEqual(rsp.status_code, 200)
    rspJson = json.loads(rsp.text)
    self.assertTrue("root" in rspJson)
    root_uuid = rspJson["root"]
    helper.validateId(root_uuid)

    text = "this is the chinese character for the number eight: \u516b"

    # size of datatype is in bytes, so measure the UTF-8 encoded form
    # rather than the number of characters
    byte_data = bytearray(text, "UTF-8")
    byte_length = len(byte_data)

    # type length is one byte longer than the encoded text
    # (presumably to leave room for the null terminator - strPad is
    # H5T_STR_NULLTERM)
    fixed_str_type = {
        "charSet": "H5T_CSET_UTF8",
        "class": "H5T_STRING",
        "length": byte_length + 1,
        "strPad": "H5T_STR_NULLTERM",
    }

    # create scalar dataset and verify the type is echoed back unchanged
    data = {"type": fixed_str_type, "shape": "H5S_SCALAR"}
    req = self.endpoint + "/datasets"
    rsp = self.session.post(req, data=json.dumps(data), headers=headers)
    self.assertEqual(rsp.status_code, 201)
    rspJson = json.loads(rsp.text)
    dset_uuid = rspJson["id"]
    self.assertTrue(helper.validateId(dset_uuid))
    self.assertTrue("type" in rspJson)
    type_json = rspJson["type"]
    self.assertTrue("class" in type_json)
    self.assertEqual(type_json["class"], "H5T_STRING")
    self.assertTrue("length" in type_json)
    self.assertEqual(type_json["length"], byte_length + 1)
    self.assertTrue("strPad" in type_json)
    self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM")
    self.assertTrue("charSet" in type_json)
    self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8")

    # link new dataset
    name = "fixed_utf8_str_dset"
    req = self.endpoint + "/groups/" + root_uuid + "/links/" + name
    payload = {"id": dset_uuid}
    rsp = self.session.put(req, data=json.dumps(payload), headers=headers)
    self.assertEqual(rsp.status_code, 201)

    # write fixed utf8 string to dset
    data = {"value": text}
    req = self.endpoint + "/datasets/" + dset_uuid + "/value"
    rsp = self.session.put(req, data=json.dumps(data), headers=headers)
    self.assertEqual(rsp.status_code, 200)

    # read value back from dset
    rsp = self.session.get(req, headers=headers)
    self.assertEqual(rsp.status_code, 200)
    rspJson = json.loads(rsp.text)
    self.assertTrue("value" in rspJson)
    self.assertEqual(rspJson["value"], text)

    # write different utf8 string of same overall byte length
    # (the three ascii characters "888" take the place of the single
    # non-ascii character; the assert below confirms the lengths match)
    text = "this is the chinese character for the number eight: 888"
    new_byte_length = len(bytearray(text, "UTF-8"))
    self.assertEqual(byte_length, new_byte_length)

    data = {"value": text}
    req = self.endpoint + "/datasets/" + dset_uuid + "/value"
    rsp = self.session.put(req, data=json.dumps(data), headers=headers)
    self.assertEqual(rsp.status_code, 200)

    # read value back from dset
    rsp = self.session.get(req, headers=headers)
    self.assertEqual(rsp.status_code, 200)
    rspJson = json.loads(rsp.text)
    self.assertTrue("value" in rspJson)
    self.assertEqual(rspJson["value"], text)

def testPutFixedUTF8StringDatasetBinary(self):
    # Test PUT value (binary request) for a scalar dataset with a
    # fixed length UTF-8 string type
    print("testPutFixedUTF8StringDatasetBinary", self.base_domain)
    headers = helper.getRequestHeaders(domain=self.base_domain)
    # headers for sending the request body as binary
    headers_bin_req = helper.getRequestHeaders(domain=self.base_domain)
    headers_bin_req["Content-Type"] = "application/octet-stream"
    # headers for asking for a binary response
    headers_bin_rsp = helper.getRequestHeaders(domain=self.base_domain)
    headers_bin_rsp["accept"] = "application/octet-stream"

    req = helper.getEndpoint() + "/"

    # Get root uuid
    rsp = self.session.get(req, headers=headers)
    self.assertEqual(rsp.status_code, 200)
    rspJson = json.loads(rsp.text)
    self.assertTrue("root" in rspJson)
    root_uuid = rspJson["root"]
    helper.validateId(root_uuid)

    text = "this is the chinese character for the number eight: \u516b"
    # size of datatype is in bytes, so measure the UTF-8 encoded form
    binary_text = bytearray(text, "UTF-8")
    byte_length = len(binary_text)

    # type length is exactly the encoded length, so the binary payload
    # has the same number of bytes as one element
    fixed_str_type = {
        "charSet": "H5T_CSET_UTF8",
        "class": "H5T_STRING",
        "length": byte_length,
        "strPad": "H5T_STR_NULLTERM",
    }

    # create dataset
    data = {"type": fixed_str_type, "shape": "H5S_SCALAR"}
    req = self.endpoint + "/datasets"
    rsp = self.session.post(req, data=json.dumps(data), headers=headers)
    self.assertEqual(rsp.status_code, 201)
    rspJson = json.loads(rsp.text)
    dset_uuid = rspJson["id"]
    self.assertTrue(helper.validateId(dset_uuid))

    # link new dataset
    name = "fixed_utf8_str_dset_binary"
    req = self.endpoint + "/groups/" + root_uuid + "/links/" + name
    payload = {"id": dset_uuid}
    rsp = self.session.put(req, data=json.dumps(payload), headers=headers)
    self.assertEqual(rsp.status_code, 201)

    # write fixed utf8 binary string to dset
    req = self.endpoint + "/datasets/" + dset_uuid + "/value"
    rsp = self.session.put(req, data=binary_text, headers=headers_bin_req)
    self.assertEqual(rsp.status_code, 200)

    # read value back from dset as json
    rsp = self.session.get(req, headers=headers)
    self.assertEqual(rsp.status_code, 200)
    rspJson = json.loads(rsp.text)
    self.assertTrue("hrefs" in rspJson)
    self.assertTrue("value" in rspJson)
    self.assertEqual(rspJson["value"], text)

    # read value back as binary
    # compare the raw bytes rather than rsp.text so the check does not
    # depend on how the client guesses the charset of an octet-stream
    rsp = self.session.get(req, headers=headers_bin_rsp)
    self.assertEqual(rsp.status_code, 200)
    self.assertEqual(rsp.content, bytes(binary_text))

    # write different utf8 binary string of same overall byte length
    text = "this is the chinese character for the number eight: 888"
    binary_text = bytearray(text, "UTF-8")
    new_byte_length = len(binary_text)
    self.assertEqual(byte_length, new_byte_length)

    # write the new value as binary
    req = self.endpoint + "/datasets/" + dset_uuid + "/value"
    rsp = self.session.put(req, data=binary_text, headers=headers_bin_req)
    self.assertEqual(rsp.status_code, 200)

    # read as binary
    rsp = self.session.get(req, headers=headers_bin_rsp)
    self.assertEqual(rsp.status_code, 200)
    self.assertEqual(rsp.content, bytes(binary_text))


if __name__ == "__main__":
# setup test files
Expand Down
Loading

0 comments on commit 292aa57

Please sign in to comment.