From 45f8644039b1b9c3643a97a7b48d30dbe2f779d7 Mon Sep 17 00:00:00 2001
From: John Readey
Date: Thu, 4 Jan 2024 10:51:35 +0800
Subject: [PATCH] Multiops (#295)

* added POST req for attributes

* added support for multiple attribute put

* updates based on review comments

* add putAttributes servicenode_lib func

* support for attribute encoding

* enable multi-attribute delete

* updates per review comments
---
 hsds/attr_dn.py               |  383 +++++++++---
 hsds/attr_sn.py               | 1043 ++++++++++++++++++++++++++-------
 hsds/datanode.py              |   24 +-
 hsds/datanode_lib.py          |    4 +-
 hsds/domain_crawl.py          |  326 +++++++++++
 hsds/domain_sn.py             |  312 +---------
 hsds/dset_sn.py               |   11 +-
 hsds/folder_crawl.py          |  131 +++++
 hsds/servicenode.py           |   70 ++-
 hsds/servicenode_lib.py       |  250 +++++++-
 hsds/util/arrayUtil.py        |   68 ++-
 hsds/util/attrUtil.py         |    5 -
 hsds/util/domainUtil.py       |   11 +
 hsds/util/dsetUtil.py         |   10 +-
 hsds/util/httpUtil.py         |   25 +-
 tests/integ/acl_test.py       |   49 --
 tests/integ/attr_test.py      |  763 +++++++++++++++++++++++-
 tests/integ/domain_test.py    |   17 +-
 tests/integ/value_test.py     |    5 +-
 tests/unit/array_util_test.py |  190 +++++-
 20 files changed, 2932 insertions(+), 765 deletions(-)
 create mode 100644 hsds/domain_crawl.py
 create mode 100644 hsds/folder_crawl.py

diff --git a/hsds/attr_dn.py b/hsds/attr_dn.py
index c7db4d79..d80ca322 100755
--- a/hsds/attr_dn.py
+++ b/hsds/attr_dn.py
@@ -20,6 +20,10 @@
 from aiohttp.web import json_response
 
 from .util.attrUtil import validateAttributeName
+from .util.hdf5dtype import getItemSize, createDataType
+from .util.dsetUtil import getShapeDims
+from .util.arrayUtil import arrayToBytes, jsonToArray, decodeData
+from .util.arrayUtil import bytesToArray, bytesArrayToList
 from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj
 from . import hsds_logger as log
 
@@ -27,7 +31,7 @@ def _index(items, marker, create_order=False):
     """Locate the leftmost value exactly equal to x"""
     if create_order:
-        # list is not ordered, juse search linearly
+        # list is not ordered, just search linearly
         for i in range(len(items)):
             if items[i] == marker:
                 return i
@@ -39,6 +43,79 @@
     return -1
 
 
+def _getAttribute(attr_name, obj_json, include_data=True, encoding=None):
+    """ copy relevant fields from src to target """
+
+    if not isinstance(obj_json, dict):
+        msg = f"expected dict but got: {type(obj_json)}"
+        log.error(msg)
+        raise HTTPInternalServerError()
+
+    if "attributes" not in obj_json:
+        msg = "expected to find attributes key in obj_json"
+        log.error(msg)
+        raise HTTPInternalServerError()
+
+    attributes = obj_json["attributes"]
+    if attr_name not in attributes:
+        # this should be checked before calling this function
+        msg = f"attribute {attr_name} not found"
+        log.error(msg)
+        raise HTTPInternalServerError()
+
+    src_attr = attributes[attr_name]
+    log.debug(f"_getAttribute - src_attr: {src_attr}")
+
+    for key in ("created", "type", "shape", "value"):
+        if key not in src_attr:
+            msg = f"Expected to find key: {key} in {src_attr}"
+            log.error(msg)
+            raise HTTPInternalServerError()
+
+    des_attr = {}
+    type_json = src_attr["type"]
+    shape_json = src_attr["shape"]
+    des_attr["created"] = src_attr["created"]
+    des_attr["type"] = type_json
+    des_attr["shape"] = shape_json
+    des_attr["name"] = attr_name
+
+    if encoding:
+        item_size = getItemSize(type_json)
+        if item_size == "H5T_VARIABLE":
+            msg = "encoded value requested but only json can be returned for "
+            msg += f"{attr_name} since it has variable length type"
+            log.warn(msg)
+            encoding = None
+        else:
+            log.debug("base64 
encoding requested") + + if include_data: + value_json = src_attr["value"] + if "encoding" in src_attr: + des_attr["encoding"] = src_attr["encoding"] + # just copy the encoded value + des_attr["value"] = value_json + elif encoding: + # return base64 encoded value + if value_json is None: + des_attr["value"] = None + else: + arr_dtype = createDataType(type_json) + np_shape = getShapeDims(shape_json) + try: + arr = jsonToArray(np_shape, arr_dtype, value_json) + except ValueError as e: + msg = f"Bad Request: input data doesn't match selection: {e}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + output_data = arrayToBytes(arr, encoding=encoding) + des_attr["value"] = output_data.decode("ascii") + des_attr["encoding"] = encoding + else: + des_attr["value"] = src_attr["value"] + return des_attr + + async def GET_Attributes(request): """ Return JSON for attribute collection """ @@ -50,14 +127,20 @@ async def GET_Attributes(request): if "bucket" in params: bucket = params["bucket"] else: - bucket = None + msg = "POST Attributes without bucket param" + log.warn(msg) + raise HTTPBadRequest(reason=msg) create_order = False - if "CreateOrder" in params and params["CreateOrder"]: + if params.get("CreateOrder"): create_order = True + encoding = None + if params.get("encoding"): + encoding = params["encoding"] + include_data = False - if "IncludeData" in params and params["IncludeData"]: + if params.get("IncludeData"): include_data = True limit = None @@ -121,14 +204,9 @@ async def GET_Attributes(request): attr_list = [] for i in range(start_index, end_index): attr_name = titles[i] - src_attr = attr_dict[attr_name] - des_attr = {} - des_attr["created"] = src_attr["created"] - des_attr["type"] = src_attr["type"] - des_attr["shape"] = src_attr["shape"] - des_attr["name"] = attr_name - if include_data: - des_attr["value"] = src_attr["value"] + kwargs = {"include_data": include_data, "encoding": encoding} + log.debug(f"_getAttribute kwargs: {kwargs}") + des_attr = _getAttribute(attr_name, obj_json, **kwargs) attr_list.append(des_attr) resp_json = {"attributes": attr_list} @@ -137,137 +215,230 @@ async def GET_Attributes(request): return resp -async def GET_Attribute(request): - """HTTP GET method to return JSON for /(obj)//attributes/ +async def POST_Attributes(request): + """ Return JSON for attribute collection """ log.request(request) app = request.app + + if not request.has_body: + msg = "POST_Attributes with no body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + body = await request.json() + if "attributes" not in body: + msg = f"POST_Attributes expected attributes in body but got: {body.keys()}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + titles = body["attributes"] # list of attribute names to fetch + params = request.rel_url.query obj_id = get_obj_id(request) - - attr_name = request.match_info.get('name') - validateAttributeName(attr_name) if "bucket" in params: bucket = params["bucket"] else: - bucket = None + msg = "POST Attributes without bucket param" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + include_data = False + log.debug(f"got params: {params}") + if "IncludeData" in params and params["IncludeData"]: + include_data = True + log.debug("include attr data") + if params.get("encoding"): + encoding = params["encoding"] + log.debug("POST_Attributes requested base64 encoding") + else: + encoding = None obj_json = await get_metadata_obj(app, obj_id, bucket=bucket) - msg = f"GET attribute obj_id: {obj_id} name: {attr_name} bucket: {bucket}" - log.info(msg) - 
log.debug(f"got obj_json: {obj_json}") + log.debug(f"Get attributes obj_id: {obj_id} got json") if "attributes" not in obj_json: - log.error(f"unexpected obj data for id: {obj_id}") + msg = f"unexpected data for obj id: {obj_id}" + msg.error(msg) raise HTTPInternalServerError() - attributes = obj_json["attributes"] - if attr_name not in attributes: - msg = f"Attribute '{attr_name}' not found for id: {obj_id}" + # return a list of attributes based on sorted dictionary keys + attr_dict = obj_json["attributes"] + attr_list = [] + kwargs = {"include_data": include_data} + if encoding: + kwargs["encoding"] = encoding + + for attr_name in titles: + if attr_name not in attr_dict: + continue + des_attr = _getAttribute(attr_name, obj_json, **kwargs) + attr_list.append(des_attr) + + resp_json = {"attributes": attr_list} + if not attr_list: + msg = f"POST attributes - requested {len(titles)} but none were found" log.warn(msg) raise HTTPNotFound() - - attr_json = attributes[attr_name] - - resp = json_response(attr_json) + if len(attr_list) != len(titles): + msg = f"POST attributes - requested {len(titles)} attributes but only " + msg += f"{len(attr_list)} were found" + log.warn(msg) + raise HTTPNotFound() + log.debug(f"POST attributes returning: {resp_json}") + resp = json_response(resp_json) log.response(request, resp=resp) return resp -async def PUT_Attribute(request): - """ Handler for PUT /(obj)//attributes/ +async def PUT_Attributes(request): + """ Handler for PUT /(obj)//attributes """ log.request(request) app = request.app params = request.rel_url.query + log.debug(f"got PUT_Attributes params: {params}") obj_id = get_obj_id(request) - attr_name = request.match_info.get('name') - log.info("PUT attribute {} in {}".format(attr_name, obj_id)) - validateAttributeName(attr_name) - if not request.has_body: log.error("PUT_Attribute with no body") raise HTTPBadRequest(message="body expected") body = await request.json() + log.debug(f"got body: {body}") if "bucket" in params: bucket = params["bucket"] elif "bucket" in body: bucket = params["bucket"] else: - bucket = None + msg = "PUT Attributes without bucket param" + log.warn(msg) + raise HTTPBadRequest(reason=msg) replace = False if "replace" in params and params["replace"]: replace = True log.info("replace attribute") - datatype = None - shape = None - value = None - if "type" not in body: - log.error("PUT attribute with no type in body") - raise HTTPInternalServerError() - - datatype = body["type"] - - if "shape" not in body: - log.error("PUT attribute with no shape in body") - raise HTTPInternalServerError() - shape = body["shape"] - - if "value" in body: - value = body["value"] + if "attributes" in body: + items = body["attributes"] + else: + # make it look like a dictionary anyway to make + # the processing more consistent + items = {} + if "name" not in body: + log.error("PUT attribute with no name in body") + raise HTTPInternalServerError() + attr_name = body["name"] + attribute = {} + if "type" in body: + attribute["type"] = body["type"] + if "shape" in body: + attribute["shape"] = body["shape"] + if "value" in body: + attribute["value"] = body["value"] + if "encoding" in body: + attribute["encoding"] = body["encoding"] + items[attr_name] = attribute + + # validate input + for attr_name in items: + validateAttributeName(attr_name) + attr_json = items[attr_name] + if "type" not in attr_json: + log.error("PUT attribute with no type in body") + raise HTTPInternalServerError() + if "shape" not in attr_json: + log.error("PUT attribute with no shape 
in body") + raise HTTPInternalServerError() + if "value" in attr_json and attr_json.get("encoding"): + # decode and store as JSON if possible + value = attr_json["value"] + arr_dtype = createDataType(attr_json["type"]) # np datatype + attr_shape = attr_json["shape"] + np_dims = getShapeDims(attr_shape) + log.debug(f"np_dims: {np_dims}") + try: + arr = bytesToArray(value, arr_dtype, np_dims, encoding="base64") + except ValueError as e: + msg = f"Bad Request: encoded input data doesn't match shape and type: {e}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"got arr: {arr}") + log.debug(f"arr.shape: {arr.shape}") + data = arr.tolist() + try: + json_data = bytesArrayToList(data) + log.debug(f"converted encoded data to {json_data}") + if attr_shape["class"] == "H5S_SCALAR": + attr_json["value"] = json_data[0] # just store the scalar + else: + attr_json["value"] = json_data + del attr_json["encoding"] # don't need to store as base64 + except ValueError as err: + msg = f"Cannot decode bytes to list: {err}, will store as base64" + log.warn(msg) + attr_json["value"] = value # use the base64 data + + log.debug(f"attribute {attr_name}: {attr_json}") + + log.info(f"PUT {len(items)} attributes to obj_id: {obj_id} bucket: {bucket}") obj_json = await get_metadata_obj(app, obj_id, bucket=bucket) - log.debug(f"PUT attribute obj_id: {obj_id} bucket: {bucket} got json") - if "attributes" not in obj_json: log.error(f"unexpected obj data for id: {obj_id}") raise HTTPInternalServerError() attributes = obj_json["attributes"] - if attr_name in attributes and not replace: - # Attribute already exists, return a 409 - msg = f"Attempt to overwrite attribute: {attr_name} " - msg += f"in obj_id: {obj_id}" - log.warn(msg) - raise HTTPConflict() - if replace and attr_name not in attributes: - # Replace requires attribute exists - msg = f"Attempt to update missing attribute: {attr_name} " - msg += f"in obj_id: {obj_id}" - log.warn() - raise HTTPNotFound() - - if replace: - orig_attr = attributes[attr_name] - create_time = orig_attr["created"] - else: - create_time = time.time() - - # ok - all set, create attribute obj - attr_json = {"type": datatype, - "shape": shape, - "value": value, - "created": create_time} - attributes[attr_name] = attr_json + # check for conflicts, also set timestamp + create_time = time.time() + new_attribute = False # set this if we have any new attributes + for attr_name in items: + attribute = items[attr_name] + if attr_name in attributes: + log.debug(f"attribute {attr_name} exists") + if replace: + # don't change the create timestamp + log.debug(f"attribute {attr_name} exists, but will be updated") + old_item = attributes[attr_name] + attribute["created"] = old_item["created"] + else: + # Attribute already exists, return a 409 + msg = f"Attempt to overwrite attribute: {attr_name} " + msg += f"in obj_id: {obj_id}" + log.warn(msg) + raise HTTPConflict() + else: + # set the timestamp + log.debug(f"new attribute {attr_name}") + attribute["created"] = create_time + new_attribute = True + + # ok - all set, create the attributes + for attr_name in items: + log.debug(f"adding attribute {attr_name}") + attr_json = items[attr_name] + attributes[attr_name] = attr_json # write back to S3, save to metadata cache await save_metadata_obj(app, obj_id, obj_json, bucket=bucket) - resp_json = {} + if new_attribute: + status = 201 + else: + status = 200 - resp = json_response(resp_json, status=201) + resp_json = {"status": status} + + resp = json_response(resp_json, status=status) 
log.response(request, resp=resp) return resp -async def DELETE_Attribute(request): - """HTTP DELETE method for /(obj)//attributes/ +async def DELETE_Attributes(request): + """HTTP DELETE method for /(obj)//attributes """ log.request(request) app = request.app @@ -277,15 +448,40 @@ async def DELETE_Attribute(request): if "bucket" in params: bucket = params["bucket"] else: - bucket = None + msg = "DELETE Attributes without bucket param" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if "encoding" in params: + encoding = params["encoding"] + if encoding != "base64": + msg = "only base64 encoding is supported" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + encoding = None - attr_name = request.match_info.get('name') - log.info(f"DELETE attribute {attr_name} in {obj_id} bucket: {bucket}") - validateAttributeName(attr_name) + if "separator" in params: + separator = params["separator"] + else: + separator = "/" + + if "attr_names" not in params: + msg = "expected attr_names for DELETE attributes" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + attr_names_param = params["attr_names"] + if encoding: + attr_names_param = decodeData(attr_names_param).decode("utf-8") + + attr_names = attr_names_param.split(separator) + + log.info(f"DELETE attribute {attr_names} in {obj_id} bucket: {bucket}") obj_json = await get_metadata_obj(app, obj_id, bucket=bucket) - log.debug(f"DELETE attribute obj_id: {obj_id} got json") + log.debug(f"DELETE attributes obj_id: {obj_id} got json") if "attributes" not in obj_json: msg = f"unexpected data for obj id: {obj_id}" msg.error(msg) @@ -294,12 +490,13 @@ async def DELETE_Attribute(request): # return a list of attributes based on sorted dictionary keys attributes = obj_json["attributes"] - if attr_name not in attributes: - msg = f"Attribute {attr_name} not found in objid: {obj_id}" - log.warn(msg) - raise HTTPNotFound() + for attr_name in attr_names: + if attr_name not in attributes: + msg = f"Attribute {attr_name} not found in objid: {obj_id}" + log.warn(msg) + raise HTTPNotFound() - del attributes[attr_name] + del attributes[attr_name] await save_metadata_obj(app, obj_id, obj_json, bucket=bucket) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index 885e72f8..579ae7ff 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -14,23 +14,26 @@ # import numpy as np -from aiohttp.web_exceptions import HTTPBadRequest, HTTPInternalServerError +from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound, HTTPInternalServerError from aiohttp.web import StreamResponse from json import JSONDecodeError -from .util.httpUtil import http_get, http_put, http_delete, getHref +from .util.httpUtil import getHref from .util.httpUtil import getAcceptType, jsonResponse -from .util.idUtil import isValidUuid, getDataNodeUrl +from .util.idUtil import isValidUuid, getCollectionForId, getRootObjId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot from .util.attrUtil import validateAttributeName, getRequestCollectionName from .util.hdf5dtype import validateTypeItem, getBaseTypeJson from .util.hdf5dtype import createDataType, getItemSize -from .util.arrayUtil import jsonToArray, getNumElements -from .util.arrayUtil import bytesArrayToList +from .util.arrayUtil import jsonToArray, getNumElements, bytesArrayToList +from .util.arrayUtil import bytesToArray, arrayToBytes, decodeData, encodeData from .util.dsetUtil import 
getShapeDims + from .servicenode_lib import getDomainJson, getObjectJson, validateAction +from .servicenode_lib import getAttributes, putAttributes, deleteAttributes +from .domain_crawl import DomainCrawler from . import hsds_logger as log from . import config @@ -54,15 +57,23 @@ async def GET_Attributes(request): log.warn(msg) raise HTTPBadRequest(reason=msg) - include_data = False + kwargs = {} + ignore_nan = False - if "IncludeData" in params and params["IncludeData"]: - include_data = True - if "ignore_nan" in params and params["ignore_nan"]: - ignore_nan = True - create_order = False + include_data = True + if "IncludeData" in params: + IncludeData = params["IncludeData"] + if not IncludeData or IncludeData == "0": + include_data = False + kwargs["include_data"] = False + log.debug(f"include_data: {include_data}") + + if "ignore_nan" in params and params["ignore_nan"]: + ignore_nan = True + kwargs["ignore_nan"] = True + if "CreateOrder" in params and params["CreateOrder"]: - create_order = True + kwargs["create_order"] = True limit = None if "Limit" in params: @@ -72,9 +83,11 @@ async def GET_Attributes(request): msg = "Bad Request: Expected int type for limit" log.warn(msg) raise HTTPBadRequest(reason=msg) + kwargs["limit"] = limit marker = None if "Marker" in params: marker = params["Marker"] + kwargs["marker"] = marker username, pswd = getUserPasswordFromRequest(request) if username is None and app["allow_noauth"]: @@ -88,29 +101,15 @@ async def GET_Attributes(request): log.warn(msg) raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) + log.debug(f"bucket: {bucket}") + kwargs["bucket"] = bucket # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, obj_id, username, "read") - req = getDataNodeUrl(app, obj_id) + attributes = await getAttributes(app, obj_id, **kwargs) - req += "/" + collection + "/" + obj_id + "/attributes" - params = {} - if limit is not None: - params["Limit"] = str(limit) - if marker is not None: - params["Marker"] = marker - if include_data: - params["IncludeData"] = 1 - if bucket: - params["bucket"] = bucket - if create_order: - params["CreateOrder"] = 1 - - log.debug(f"get attributes: {req}") - dn_json = await http_get(app, req, params=params) log.debug(f"got attributes json from dn for obj_id: {obj_id}") - attributes = dn_json["attributes"] # mixin hrefs for attribute in attributes: @@ -138,6 +137,7 @@ async def GET_Attribute(request): """HTTP method to return JSON for an attribute""" log.request(request) app = request.app + params = request.rel_url.query # returns datasets|groups|datatypes collection = getRequestCollectionName(request) @@ -168,30 +168,55 @@ async def GET_Attribute(request): # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, obj_id, username, "read") - params = request.rel_url.query + if "ignore_nan" in params and params["ignore_nan"]: ignore_nan = True else: ignore_nan = False - req = getDataNodeUrl(app, obj_id) - req += f"/{collection}/{obj_id}/attributes/{attr_name}" - log.debug(f"get Attribute: {req}") - params = {} - if bucket: - params["bucket"] = bucket - dn_json = await http_get(app, req, params=params) - log.debug(f"got attributes json from dn for obj_id: {obj_id}") + if "IncludeData" in params and not params["IncludeData"]: + include_data = False + else: + include_data = True + + if params.get("encoding"): + if params["encoding"] != "base64": + msg = "only base64 encoding is supported" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + 
encoding = "base64" + else: + encoding = None + + kwargs = {"bucket": bucket, "include_data": include_data, "attr_names": [attr_name, ]} + if ignore_nan: + kwargs["ignore_nan"] = ignore_nan + if encoding: + kwargs["encoding"] = encoding + + attributes = await getAttributes(app, obj_id, **kwargs) + if not attributes: + log.error("no attributes returned") # should have been raised by getAttributes + raise HTTPInternalServerError() + if len(attributes) > 1: + log.error(f"expected one attribute but got: {len(attributes)}") + raise HTTPInternalServerError() + + log.debug(f"got attributes: {attributes}") + attribute = attributes[0] resp_json = {} resp_json["name"] = attr_name - resp_json["type"] = dn_json["type"] - resp_json["shape"] = dn_json["shape"] - if "value" in dn_json: - resp_json["value"] = dn_json["value"] - resp_json["created"] = dn_json["created"] + resp_json["type"] = attribute["type"] + resp_json["shape"] = attribute["shape"] + if "value" in attribute: + resp_json["value"] = attribute["value"] + resp_json["created"] = attribute["created"] # attributes don't get modified, so use created timestamp as lastModified - resp_json["lastModified"] = dn_json["created"] + # TBD: but they can if replace is set! + resp_json["lastModified"] = attribute["created"] + if "encoding" in attribute: + resp_json["encoding"] = attribute["encoding"] hrefs = [] obj_uri = "/" + collection + "/" + obj_id @@ -205,59 +230,8 @@ async def GET_Attribute(request): return resp -async def PUT_Attribute(request): - """HTTP method to create a new attribute""" - log.request(request) - app = request.app - # returns datasets|groups|datatypes - collection = getRequestCollectionName(request) - - obj_id = request.match_info.get("id") - if not obj_id: - msg = "Missing object id" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if not isValidUuid(obj_id, obj_class=collection): - msg = f"Invalid object id: {obj_id}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - attr_name = request.match_info.get("name") - log.debug(f"Attribute name: [{attr_name}]") - validateAttributeName(attr_name) - - log.info(f"PUT Attribute id: {obj_id} name: {attr_name}") - username, pswd = getUserPasswordFromRequest(request) - # write actions need auth - await validateUserPassword(app, username, pswd) - - if not request.has_body: - msg = "PUT Attribute with no body" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - try: - body = await request.json() - except JSONDecodeError: - msg = "Unable to load JSON body" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - domain = getDomainFromRequest(request) - if not isValidDomain(domain): - msg = f"Invalid domain: {domain}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - bucket = getBucketForDomain(domain) - - # get domain JSON - domain_json = await getDomainJson(app, domain) - verifyRoot(domain_json) - - root_id = domain_json["root"] - - # TBD - verify that the obj_id belongs to the given domain - await validateAction(app, domain, obj_id, username, "create") - +async def _getTypeFromRequest(app, body, obj_id=None, bucket=None): + """ return a type json from the request body """ if "type" not in body: msg = "PUT attribute with no type in body" log.warn(msg) @@ -270,19 +244,19 @@ async def PUT_Attribute(request): log.debug(f"got ctypeid: {ctype_id}") ctype_json = await getObjectJson(app, ctype_id, bucket=bucket) log.debug(f"ctype {ctype_id}: {ctype_json}") + root_id = getRootObjId(obj_id) if ctype_json["root"] != root_id: msg = "Referenced committed datatype must belong in same domain" 
log.warn(msg) raise HTTPBadRequest(reason=msg) datatype = ctype_json["type"] - # add the ctype_id to type type + # add the ctype_id to the type datatype["id"] = ctype_id elif isinstance(datatype, str): try: # convert predefined type string (e.g. "H5T_STD_I32LE") to # corresponding json representation datatype = getBaseTypeJson(datatype) - log.debug(f"got datatype: {datatype}") except TypeError: msg = "PUT attribute with invalid predefined type" log.warn(msg) @@ -303,7 +277,11 @@ async def PUT_Attribute(request): log.warn(msg) raise HTTPBadRequest(reason=msg) - dims = None + return datatype + + +def _getShapeFromRequest(body): + """ get shape json from request body """ shape_json = {} if "shape" in body: shape_body = body["shape"] @@ -343,19 +321,19 @@ async def PUT_Attribute(request): # use H5S_SIMPLE as class if isinstance(shape_body, list) and len(shape_body) == 0: shape_json["class"] = "H5S_SCALAR" - dims = [ - 1, - ] else: shape_json["class"] = "H5S_SIMPLE" dims = getShapeDims(shape_body) shape_json["dims"] = dims else: shape_json["class"] = "H5S_SCALAR" - dims = [ - 1, - ] + return shape_json + + +def _getValueFromRequest(body, data_type, data_shape): + """ Get attribute value from request json """ + dims = getShapeDims(data_shape) if "value" in body: if dims is None: msg = "Bad Request: data can not be included with H5S_NULL space" @@ -363,49 +341,364 @@ async def PUT_Attribute(request): raise HTTPBadRequest(reason=msg) value = body["value"] # validate that the value agrees with type/shape - arr_dtype = createDataType(datatype) # np datatype + arr_dtype = createDataType(data_type) # np datatype if len(dims) == 0: - np_dims = [ - 1, - ] + np_dims = [1, ] else: np_dims = dims - log.debug(f"attribute dims: {np_dims}") - log.debug(f"attribute value: {value}") - try: - arr = jsonToArray(np_dims, arr_dtype, value) - except ValueError as e: - if value is None: - arr = np.array([]).astype(arr_dtype) + + if body.get("encoding"): + item_size = getItemSize(data_type) + if item_size == "H5T_VARIABLE": + msg = "base64 encoding is not support for variable length attributes" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + try: + data = decodeData(value) + except ValueError: + msg = "unable to decode data" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + expected_numbytes = arr_dtype.itemsize * np.prod(dims) + if len(data) != expected_numbytes: + msg = f"expected: {expected_numbytes} but got: {len(data)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # check to see if this works with our shape and type + try: + log.debug(f"data: {data}") + log.debug(f"type: {arr_dtype}") + log.debug(f"np_dims: {np_dims}") + arr = bytesToArray(data, arr_dtype, np_dims) + except ValueError as e: + msg = f"Bad Request: encoded input data doesn't match shape and type: {e}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + value_json = None + # now try converting to JSON + list_data = arr.tolist() + try: + value_json = bytesArrayToList(list_data) + except ValueError as err: + msg = f"Cannot decode bytes to list: {err}, will store as encoded bytes" + log.warn(msg) + if value_json: + log.debug("will store base64 input as json") + if data_shape["class"] == "H5S_SCALAR": + # just use the scalar value + value = value_json[0] + else: + value = value_json # return this else: + value = data # return bytes to signal that this needs to be encoded + else: + # verify that the input data matches the array shape and type + try: + jsonToArray(np_dims, arr_dtype, value) + except ValueError as e: msg = f"Bad 
Request: input data doesn't match selection: {e}" log.warn(msg) raise HTTPBadRequest(reason=msg) - log.debug(f"Got: {arr.size} array elements") else: value = None - # ready to add attribute now - req = getDataNodeUrl(app, obj_id) - req += f"/{collection}/{obj_id}/attributes/{attr_name}" - log.info("PUT Attribute: " + req) - - attr_json = {} - attr_json["type"] = datatype - attr_json["shape"] = shape_json - if value is not None: - attr_json["value"] = value - params = {} + return value + + +async def _getAttributeFromRequest(app, req_json, obj_id=None, bucket=None): + """ return attribute from given request json """ + attr_item = {} + attr_type = await _getTypeFromRequest(app, req_json, obj_id=obj_id, bucket=bucket) + attr_shape = _getShapeFromRequest(req_json) + attr_item = {"type": attr_type, "shape": attr_shape} + attr_value = _getValueFromRequest(req_json, attr_type, attr_shape) + if attr_value is not None: + if isinstance(attr_value, bytes): + attr_value = encodeData(attr_value) # store as base64 + attr_item["encoding"] = "base64" + else: + # just store the JSON dict or primitive value + attr_item["value"] = attr_value + else: + attr_item["value"] = None + + return attr_item + + +async def _getAttributesFromRequest(request, req_json, obj_id=None, bucket=None): + """ read the given JSON dictinary and return dict of attribute json """ + + app = request.app + attr_items = {} + kwargs = {"obj_id": obj_id} if bucket: - params["bucket"] = bucket + kwargs["bucket"] = bucket + if "attributes" in req_json: + attributes = req_json["attributes"] + if not isinstance(attributes, dict): + msg = f"expected list for attributes but got: {type(attributes)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # read each attr_item and canonicalize the shape, type, verify value + for attr_name in attributes: + attr_json = attributes[attr_name] + attr_item = await _getAttributeFromRequest(app, attr_json, **kwargs) + attr_items[attr_name] = attr_item + + elif "type" in req_json: + # single attribute create - fake an item list + attr_item = await _getAttributeFromRequest(app, req_json, **kwargs) + if "name" in req_json: + attr_name = req_json["name"] + else: + attr_name = request.match_info.get("name") + validateAttributeName(attr_name) + if not attr_name: + msg = "Missing attribute name" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + attr_items[attr_name] = attr_item + else: + log.debug(f"_getAttributes from request - no attribute defined in {req_json}") + + return attr_items + + +async def PUT_Attribute(request): + """HTTP method to create a new attribute""" + log.request(request) + app = request.app + params = request.rel_url.query + # returns datasets|groups|datatypes + collection = getRequestCollectionName(request) - put_rsp = await http_put(app, req, params=params, data=attr_json) - log.info(f"PUT Attribute resp: {put_rsp}") + req_obj_id = request.match_info.get("id") + if not req_obj_id: + msg = "Missing object id" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if not isValidUuid(req_obj_id, obj_class=collection): + msg = f"Invalid object id: {req_obj_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + attr_name = request.match_info.get("name") + if attr_name: + log.debug(f"Attribute name: [{attr_name}]") + validateAttributeName(attr_name) + + log.info(f"PUT Attributes id: {req_obj_id} name: {attr_name}") + username, pswd = getUserPasswordFromRequest(request) + # write actions need auth + await validateUserPassword(app, username, pswd) + + if not request.has_body: + msg = "PUT 
Attribute with no body"
+        log.warn(msg)
+        raise HTTPBadRequest(reason=msg)
+
+    try:
+        body = await request.json()
+    except JSONDecodeError:
+        msg = "Unable to load JSON body"
+        log.warn(msg)
+        raise HTTPBadRequest(reason=msg)
+
+    domain = getDomainFromRequest(request)
+    if not isValidDomain(domain):
+        msg = f"Invalid domain: {domain}"
+        log.warn(msg)
+        raise HTTPBadRequest(reason=msg)
+    bucket = getBucketForDomain(domain)
+
+    # get domain JSON
+    domain_json = await getDomainJson(app, domain)
+    verifyRoot(domain_json)
+
+    # TBD - verify that the obj_id belongs to the given domain
+    await validateAction(app, domain, req_obj_id, username, "create")
+
+    # get attribute from request body
+    kwargs = {"bucket": bucket, "obj_id": req_obj_id}
+    attr_body = await _getAttributeFromRequest(app, body, **kwargs)
+
+    # write attribute to DN
+    attr_json = {attr_name: attr_body}
+    log.debug(f"putting attr {attr_name} to DN: {attr_json}")
+
+    kwargs = {"bucket": bucket}
+    if "replace" in params and params["replace"]:
+        # allow attribute to be overwritten
+        log.debug("setting replace for PUT Attribute")
+        kwargs["replace"] = True
+    else:
+        log.debug("replace is not set for PUT Attribute")
+    status = await putAttributes(app, req_obj_id, attr_json, **kwargs)
+    log.info(f"PUT Attributes status: {status}")
 
     hrefs = []  # TBD
     req_rsp = {"hrefs": hrefs}
     # attribute creation successful
-    resp = await jsonResponse(request, req_rsp, status=201)
+    resp = await jsonResponse(request, req_rsp, status=status)
     log.response(request, resp=resp)
     return resp
+
+
+async def PUT_Attributes(request):
+    """HTTP method to create multiple new attributes"""
+    log.request(request)
+    params = request.rel_url.query
+    app = request.app
+    status = None
+
+    log.debug("PUT_Attributes")
+
+    username, pswd = getUserPasswordFromRequest(request)
+    # write actions need auth
+    await validateUserPassword(app, username, pswd)
+
+    if not request.has_body:
+        msg = "PUT Attributes with no body"
+        log.warn(msg)
+        raise HTTPBadRequest(reason=msg)
+
+    try:
+        body = await request.json()
+    except JSONDecodeError:
+        msg = "Unable to load JSON body"
+        log.warn(msg)
+        raise HTTPBadRequest(reason=msg)
+
+    domain = getDomainFromRequest(request)
+    if not isValidDomain(domain):
+        msg = f"Invalid domain: {domain}"
+        log.warn(msg)
+        raise HTTPBadRequest(reason=msg)
+    bucket = getBucketForDomain(domain)
+    log.debug(f"got bucket: {bucket}")
+
+    # get domain JSON
+    domain_json = await getDomainJson(app, domain)
+    verifyRoot(domain_json)
+
+    req_obj_id = request.match_info.get("id")
+    if not req_obj_id:
+        req_obj_id = domain_json["root"]
+    kwargs = {"obj_id": req_obj_id, "bucket": bucket}
+    attr_items = await _getAttributesFromRequest(request, body, **kwargs)
+
+    if attr_items:
+        log.debug(f"PUT Attributes: {len(attr_items)} attributes to add")
+    else:
+        log.debug("no attributes defined yet")
+
+    # next, sort out where these attributes are going to
+
+    obj_ids = {}
+    if "obj_ids" in body:
+        body_ids = body["obj_ids"]
+        if isinstance(body_ids, list):
+            # multicast the attributes - each attribute in attr_items
+            # will be written to each of the objects identified by obj_id
+            if not attr_items:
+                msg = "no attributes provided"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+            else:
+                for obj_id in body_ids:
+                    if not isValidUuid(obj_id):
+                        msg = f"Invalid object id: {obj_id}"
+                        log.warn(msg)
+                        raise HTTPBadRequest(reason=msg)
+                    obj_ids[obj_id] = attr_items
+
+                msg = f"{len(attr_items)} attributes will be multicast to "
+                msg += f"{len(obj_ids)} objects"
+                log.info(msg)
+        elif isinstance(body_ids, dict):
+            # each value in body_ids is a set of attributes to write to the object.
+            # unlike the above case, different attributes can be written to
+            # different objects
+            if attr_items:
+                msg = "attributes defined outside the obj_ids dict"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+            else:
+                for obj_id in body_ids:
+                    if not isValidUuid(obj_id):
+                        msg = f"Invalid object id: {obj_id}"
+                        log.warn(msg)
+                        raise HTTPBadRequest(reason=msg)
+                    id_json = body_ids[obj_id]
+
+                    kwargs = {"obj_id": obj_id, "bucket": bucket}
+                    obj_items = await _getAttributesFromRequest(request, id_json, **kwargs)
+                    if obj_items:
+                        obj_ids[obj_id] = obj_items
+
+                # write different attributes to different objects
+                msg = f"put attributes over {len(obj_ids)} objects"
+                log.info(msg)
+        else:
+            msg = f"unexpected type for obj_ids: {type(body_ids)}"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+    else:
+        # use the object id from the request
+        obj_id = request.match_info.get("id")
+        if not obj_id:
+            msg = "Missing object id"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+        obj_ids[obj_id] = attr_items  # make it look like a dict for consistency
+
+    log.debug(f"got {len(obj_ids)} obj_ids")
+
+    # TBD - verify that the obj_id belongs to the given domain
+    await validateAction(app, domain, req_obj_id, username, "create")
+
+    kwargs = {"bucket": bucket}
+    if params.get("replace"):
+        kwargs["replace"] = True
+
+    count = len(obj_ids)
+    if count == 0:
+        msg = "no obj_ids defined"
+        log.warn(f"PUT_Attributes: {msg}")
+        raise HTTPBadRequest(reason=msg)
+    elif count == 1:
+        # just send one PUT Attributes request to the dn
+        obj_id = list(obj_ids.keys())[0]
+        attr_json = obj_ids[obj_id]
+        log.debug(f"got attr_json: {attr_json}")
+
+        status = await putAttributes(app, obj_id, attr_json, **kwargs)
+
+    else:
+        # put multi obj
+
+        # mixin some additional kwargs
+        crawler_params = {"follow_links": False}
+        if bucket:
+            crawler_params["bucket"] = bucket
+
+        kwargs = {"action": "put_attr", "raise_error": True, "params": crawler_params}
+        crawler = DomainCrawler(app, obj_ids, **kwargs)
+
+        # will raise exception on not found, server busy, etc.
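+        # Sketch of the two multi-object request forms handled above
+        # (illustrative ids and values, not from the patch itself):
+        #
+        #   multicast - the same attributes are written to every listed object:
+        #     {"obj_ids": ["g-...", "d-..."],
+        #      "attributes": {"attr1": {"type": "H5T_STD_I32LE", "shape": [], "value": 42}}}
+        #
+        #   per-object - obj_ids maps each object id to its own attribute items:
+        #     {"obj_ids": {"g-...": {"attributes": {...}},
+        #                  "d-...": {"attributes": {...}}}}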
+ await crawler.crawl() + + status = crawler.get_status() + + log.info("DomainCrawler done for put_attrs action") + + hrefs = [] # TBD + req_rsp = {"hrefs": hrefs} + # attribute creation successful + log.debug(f"PUT_Attributes returning status: {status}") + resp = await jsonResponse(request, req_rsp, status=status) log.response(request, resp=resp) return resp @@ -447,15 +740,10 @@ async def DELETE_Attribute(request): # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, obj_id, username, "delete") - req = getDataNodeUrl(app, obj_id) - req += "/" + collection + "/" + obj_id + "/attributes/" + attr_name - log.info("PUT Attribute: " + req) - params = {} - if bucket: - params["bucket"] = bucket - rsp_json = await http_delete(app, req, params=params) + attr_names = [attr_name, ] + kwargs = {"attr_names": attr_names, "bucket": bucket} - log.info(f"PUT Attribute resp: {rsp_json}") + await deleteAttributes(app, obj_id, **kwargs) hrefs = [] # TBD req_rsp = {"hrefs": hrefs} @@ -509,15 +797,30 @@ async def GET_AttributeValue(request): ignore_nan = True else: ignore_nan = False + if "encoding" in params: + encoding = params["encoding"] + if encoding and encoding != "base64": + msg = f"invalid encoding value: {encoding}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + encoding = None - req = getDataNodeUrl(app, obj_id) - req += "/" + collection + "/" + obj_id + "/attributes/" + attr_name - log.debug("get Attribute: " + req) - params = {} - if bucket: - params["bucket"] = bucket - dn_json = await http_get(app, req, params=params) - log.debug("got attributes json from dn for obj_id: " + str(dn_json)) + attr_names = [attr_name, ] + kwargs = {"attr_names": attr_names, "bucket": bucket} + if ignore_nan: + kwargs["ignore_nan"] = True + + attributes = await getAttributes(app, obj_id, **kwargs) + + if not attributes: + msg = f"attribute {attr_name} not found" + log.warn(msg) + raise HTTPNotFound() + + dn_json = attributes[0] + + log.debug(f"got attributes json from dn for obj_id: {dn_json}") attr_shape = dn_json["shape"] log.debug(f"attribute shape: {attr_shape}") @@ -538,19 +841,35 @@ async def GET_AttributeValue(request): log.info(msg) response_type = "json" - if response_type == "binary": + log.debug(f"response_type: {response_type}") + + if response_type == "binary" or encoding: arr_dtype = createDataType(type_json) # np datatype np_shape = getShapeDims(shape_json) - try: - arr = jsonToArray(np_shape, arr_dtype, dn_json["value"]) - except ValueError as e: - if dn_json["value"] is None: - arr = np.array([]).astype(arr_dtype) - else: + if dn_json["value"] is None: + arr = np.zeros(np_shape, dtype=arr_dtype) + elif dn_json.get("encoding") == "base64": + # data is a base64 string we can directly convert to a + # np array + data = dn_json["value"] + if not isinstance(data, str): + msg = "expected string for base64 encoded attribute" + msg += f" but got: {type(data)}" + log.error(msg) + raise HTTPInternalServerError() + arr = bytesToArray(data, arr_dtype, np_shape, encoding="base64") + else: + try: + arr = jsonToArray(np_shape, arr_dtype, dn_json["value"]) + except ValueError as e: msg = f"Bad Request: input data doesn't match selection: {e}" log.warn(msg) raise HTTPBadRequest(reason=msg) - output_data = arr.tobytes() + output_data = arrayToBytes(arr) + else: + output_data = None # will return as json if possible + + if response_type == "binary": msg = f"GET AttributeValue - returning {len(output_data)} " msg += "bytes binary data" log.debug(msg) @@ -580,7 
+899,20 @@ async def GET_AttributeValue(request):
     else:
         resp_json = {}
         if "value" in dn_json:
-            resp_json["value"] = dn_json["value"]
+            json_value = dn_json["value"]
+            if dn_json.get("encoding") == "base64":
+                resp_json["value"] = json_value
+                resp_json["encoding"] = "base64"
+            elif output_data is not None:
+                # query param requesting base64 encoded value
+                # convert output_data bytes to base64 string
+                output_data = encodeData(output_data)
+                output_data = output_data.decode("ascii")  # convert to a string
+                resp_json["value"] = output_data
+                resp_json["encoding"] = "base64"
+            else:
+                # just return json data
+                resp_json["value"] = json_value
 
         hrefs = []
         obj_uri = "/" + collection + "/" + obj_id
@@ -638,14 +970,18 @@ async def PUT_AttributeValue(request):
     # TBD - verify that the obj_id belongs to the given domain
     await validateAction(app, domain, obj_id, username, "update")
 
-    req = getDataNodeUrl(app, obj_id)
-    req += "/" + collection + "/" + obj_id + "/attributes/" + attr_name
-    log.debug("get Attribute: " + req)
-    params = {}
-    if bucket:
-        params["bucket"] = bucket
-    dn_json = await http_get(app, req, params=params)
-    log.debug("got attributes json from dn for obj_id: " + str(obj_id))
+    attr_names = [attr_name, ]
+    kwargs = {"attr_names": attr_names, "bucket": bucket}
+
+    attributes = await getAttributes(app, obj_id, **kwargs)
+
+    if not attributes:
+        msg = f"attribute {attr_name} not found"
+        log.warn(msg)
+        raise HTTPNotFound()
+
+    dn_json = attributes[0]
+    log.debug(f"got dn_json: {dn_json}")
 
     attr_shape = dn_json["shape"]
 
@@ -689,6 +1025,7 @@ async def PUT_AttributeValue(request):
         msg += f"{request.content_length}"
         log.error(msg)
         raise HTTPInternalServerError()
+    log.debug(f"read {len(binary_data)} bytes of binary data")
 
     arr = None  # np array to hold request data
 
@@ -700,20 +1037,11 @@ async def PUT_AttributeValue(request):
             log.warn(msg)
             raise HTTPBadRequest(reason=msg)
         arr = np.fromstring(binary_data, dtype=np_dtype)
-        arr = arr.reshape(np_shape)  # conform to selection shape
-        # convert to JSON for transmission to DN
-        data = arr.tolist()
-
-        try:
-            value = bytesArrayToList(data)
-        except ValueError as err:
-            msg = f"Cannot decode bytes to list: {err}"
-            raise HTTPBadRequest(reason=msg)
         if attr_shape["class"] == "H5S_SCALAR":
-            # just send the value, not a list
-            value = value[0]
-
+            arr = arr.reshape([])
+        else:
+            arr = arr.reshape(np_shape)  # conform to selection shape
+        log.debug(f"got array {arr} from binary data")
     else:
         try:
             body = await request.json()
@@ -728,35 +1056,46 @@ async def PUT_AttributeValue(request):
             raise HTTPBadRequest(reason=msg)
         value = body["value"]
 
-        # validate that the value agrees with type/shape
-        try:
-            arr = jsonToArray(np_shape, np_dtype, value)
-        except ValueError as e:
-            if value is None:
-                arr = np.array([]).astype(np_dtype)
-            else:
-                msg = f"Bad Request: input data doesn't match selection: {e}"
-                log.warn(msg)
-                raise HTTPBadRequest(reason=msg)
+        if value is None:
+            # write empty array
+            arr = np.zeros(np_shape, dtype=np_dtype)
+        elif "encoding" in body and body["encoding"] == "base64":
+            arr = bytesToArray(value, np_dtype, np_shape, encoding="base64")
+        else:
+            # validate that the value agrees with type/shape
+            try:
+                arr = jsonToArray(np_shape, np_dtype, value)
+            except ValueError as e:
+                if value is None:
+                    arr = np.array([]).astype(np_dtype)
+                else:
+                    msg = f"Bad Request: input data doesn't match selection: {e}"
+                    log.warn(msg)
+                    raise HTTPBadRequest(reason=msg)
+            log.debug(f"Got: {arr.size} array elements")
+
+    # convert to base64 for transmission 
to DN + data = arrayToBytes(arr, encoding="base64") + # ready to add attribute now - attr_json = {} - attr_json["type"] = type_json - attr_json["shape"] = attr_shape - attr_json["value"] = value - - req = getDataNodeUrl(app, obj_id) - req += "/" + collection + "/" + obj_id + "/attributes/" + attr_name - log.info(f"PUT Attribute Value: {req}") - - dn_json["value"] = value - params = {} - params = {"replace": 1} # let the DN know we can overwrite the attribute - if bucket: - params["bucket"] = bucket - put_rsp = await http_put(app, req, params=params, data=attr_json) - log.info(f"PUT Attribute Value resp: {put_rsp}") + attr_body = {} + attr_body["type"] = type_json + attr_body["shape"] = attr_shape + attr_body["value"] = data.decode("ascii") + attr_body["encoding"] = "base64" + attr_json = {attr_name: attr_body} + + kwargs = {"bucket": bucket, "replace": True} + + status = await putAttributes(app, obj_id, attr_json, **kwargs) + + if status != 200: + msg = "putAttributesValue, expected DN status of 200" + msg += f" but got {status}" + log.warn(msg) + else: + log.info("PUT AttributesValue status: 200") hrefs = [] # TBD req_rsp = {"hrefs": hrefs} @@ -764,3 +1103,297 @@ async def PUT_AttributeValue(request): resp = await jsonResponse(request, req_rsp) log.response(request, resp=resp) return resp + + +async def POST_Attributes(request): + """HTTP method to get multiple attribute values""" + log.request(request) + app = request.app + log.info("POST_Attributes") + req_id = request.match_info.get("id") + + if not request.has_body: + msg = "POST Attributes with no body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + try: + body = await request.json() + except JSONDecodeError: + msg = "Unable to load JSON body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if "attr_names" in body: + attr_names = body["attr_names"] + if not isinstance(attr_names, list): + msg = f"expected list for attr_names but got: {type(attr_names)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + attr_names = None + + if "obj_ids" in body: + obj_ids = body["obj_ids"] + else: + obj_ids = None + + if attr_names is None and obj_ids is None: + msg = "expected body to contain one of attr_names, obj_ids keys" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # construct an item list from attr_names and obj_ids + items = {} + if obj_ids is None: + if not req_id: + msg = "no object id in request" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + items[req_id] = attr_names + elif isinstance(obj_ids, list): + if attr_names is None: + msg = "no attr_names - will return all attributes for each object" + log.debug(msg) + for obj_id in obj_ids: + items[obj_id] = None + elif isinstance(obj_ids, dict): + if attr_names is not None: + msg = "attr_names must not be provided if obj_ids is a dict" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + for obj_id in obj_ids: + names_for_id = obj_ids[obj_id] + if not isinstance(names_for_id, list): + msg = "expected list of attribute names" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + items[obj_id] = names_for_id + + log.debug(f"POST Attributes items: {items}") + + # do a check that everything is as it should with the item list + for obj_id in items: + if not isValidUuid(obj_id): + msg = f"Invalid object id: {obj_id}" + log.warn(msg) + + attr_names = items[obj_id] + + if attr_names is None: + log.debug(f"getting all attributes for {obj_id}") + elif isinstance(attr_names, list): + for attr_name in attr_names: + validateAttributeName(attr_name) # raises 
HTTPBadRequest if invalid + else: + msg = f"expected list for attr_names but got: {type(attr_names)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + username, pswd = getUserPasswordFromRequest(request) + if username is None and app["allow_noauth"]: + username = "default" + else: + await validateUserPassword(app, username, pswd) + + domain = getDomainFromRequest(request) + if not isValidDomain(domain): + msg = f"Invalid domain value: {domain}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + bucket = getBucketForDomain(domain) + + accept_type = getAcceptType(request) + if accept_type != "json": + msg = f"{accept_type} response requested for POST Attributes, " + msg += "but only json is supported" + log.warn(msg) + + # get domain JSON + domain_json = await getDomainJson(app, domain) + verifyRoot(domain_json) + + # TBD - verify that the obj_id belongs to the given domain + await validateAction(app, domain, obj_id, username, "read") + + params = request.rel_url.query + log.debug(f"got params: {params}") + include_data = True + if "IncludeData" in params: + IncludeData = params["IncludeData"] + if not IncludeData or IncludeData == "0": + include_data = False + + if params.get("ignore_nan"): + ignore_nan = True + else: + ignore_nan = False + + if params.get("encoding"): + encoding = params["encoding"] + if encoding != "base64": + msg = "only base64 encoding is supported" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + encoding = None + + resp_json = {} + + if len(items) == 0: + msg = "no obj ids specified for POST Attributes" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + elif len(items) == 1: + # just make a request the datanode + obj_id = list(items.keys())[0] + collection = getCollectionForId(obj_id) + attr_names = items[obj_id] + kwargs = {"attr_names": attr_names, "bucket": bucket} + if not include_data: + kwargs["include_data"] = False + if ignore_nan: + kwargs["ignore_nan"] = True + if encoding: + kwargs["encoding"] = encoding + + attributes = await getAttributes(app, obj_id, **kwargs) + + # mixin hrefs + for attribute in attributes: + attr_name = attribute["name"] + attr_href = f"/{collection}/{obj_id}/attributes/{attr_name}" + attribute["href"] = getHref(request, attr_href) + + resp_json["attributes"] = attributes + else: + # get multi obj + # don't follow links! + crawler_params = {"follow_links": False, "bucket": bucket} + # mixin params + if not include_data: + crawler_params["include_data"] = False + + if ignore_nan: + crawler_params["ignore_nan"] = True + + if encoding: + crawler_params["encoding"] = encoding + + kwargs = {"action": "get_attr", "raise_error": True, "params": crawler_params} + crawler = DomainCrawler(app, items, **kwargs) + # will raise exception on NotFound, etc. 
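+        # Sketch of the request bodies accepted by this handler (illustrative
+        # ids, not from the patch itself):
+        #
+        #   {"attr_names": ["a1", "a2"]}                     -> named attrs of the request object
+        #   {"obj_ids": ["g-...", "d-..."]}                  -> all attrs of each listed object
+        #   {"obj_ids": {"g-...": ["a1"], "d-...": ["a2"]}}  -> per-object name lists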
+ await crawler.crawl() + + msg = f"DomainCrawler returned: {len(crawler._obj_dict)} objects" + log.info(msg) + attributes = crawler._obj_dict + # mixin hrefs + for obj_id in attributes: + obj_attributes = attributes[obj_id] + msg = f"POST_Attributes, obj_id {obj_id} " + msg += f"returned {len(obj_attributes)}" + log.debug(msg) + + collection = getCollectionForId(obj_id) + for attribute in obj_attributes: + log.debug(f"attribute: {attribute}") + attr_name = attribute["name"] + attr_href = f"/{collection}/{obj_id}/attributes/{attr_name}" + attribute["href"] = getHref(request, attr_href) + log.debug(f"got {len(attributes)} attributes") + resp_json["attributes"] = attributes + + hrefs = [] + collection = getCollectionForId(req_id) + obj_uri = "/" + collection + "/" + req_id + href = getHref(request, obj_uri + "/attributes") + hrefs.append({"rel": "self", "href": href}) + hrefs.append({"rel": "home", "href": getHref(request, "/")}) + hrefs.append({"rel": "owner", "href": getHref(request, obj_uri)}) + resp_json["hrefs"] = hrefs + + resp = await jsonResponse(request, resp_json, ignore_nan=ignore_nan) + log.response(request, resp=resp) + return resp + + +async def DELETE_Attributes(request): + """HTTP method to delete multiple attribute values""" + log.request(request) + app = request.app + log.info("DELETE_Attributes") + obj_id = request.match_info.get("id") + if not isValidUuid(obj_id): + msg = f"Invalid object id: {obj_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + params = request.rel_url.query + log.debug(f"got params: {params}") + + if "attr_names" not in params: + msg = "expected attr_names query param" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + attr_names_query_string = params["attr_names"] + if not attr_names_query_string: + msg = "empty attr_names query param" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if "encoding" in params: + encoding = params["encoding"] + if encoding != "base64": + msg = "only base64 encoding is supported for attribute names" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + encoding = None + + if "separator" in params: + separator = params["separator"] + else: + separator = "/" + + if encoding: + # this can be used to deal with non-url encodable names + attr_names_query_string = decodeData(attr_names_query_string).decode("ascii") + + log.debug(f"got attr_names query string: {attr_names_query_string}") + + # Use the given separator character to construct a list from + # the query string + attr_names = attr_names_query_string.split(separator) + log.info(f"delete {len(attr_names)} attributes for {obj_id}") + log.debug(f"attr_names: {attr_names}") + + username, pswd = getUserPasswordFromRequest(request) + await validateUserPassword(app, username, pswd) + + domain = getDomainFromRequest(request) + if not isValidDomain(domain): + msg = f"Invalid domain value: {domain}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + bucket = getBucketForDomain(domain) + + # get domain JSON + domain_json = await getDomainJson(app, domain) + verifyRoot(domain_json) + + # TBD - verify that the obj_id belongs to the given domain + await validateAction(app, domain, obj_id, username, "delete") + + kwargs = {"attr_names": attr_names, "bucket": bucket, "separator": separator} + + await deleteAttributes(app, obj_id, **kwargs) + + resp_json = {} + hrefs = [] + resp_json["hrefs"] = hrefs + + resp = await jsonResponse(request, resp_json) + log.response(request, resp=resp) + return resp diff --git a/hsds/datanode.py b/hsds/datanode.py index 
33581d67..1efd9063 100644 --- a/hsds/datanode.py +++ b/hsds/datanode.py @@ -31,8 +31,8 @@ from .group_dn import GET_Group, POST_Group, DELETE_Group, PUT_Group from .group_dn import POST_Root from .link_dn import GET_Links, GET_Link, PUT_Link, DELETE_Link -from .attr_dn import GET_Attributes, GET_Attribute, PUT_Attribute -from .attr_dn import DELETE_Attribute +from .attr_dn import GET_Attributes, POST_Attributes +from .attr_dn import PUT_Attributes, DELETE_Attributes from .ctype_dn import GET_Datatype, POST_Datatype, DELETE_Datatype from .dset_dn import GET_Dataset, POST_Dataset, DELETE_Dataset from .dset_dn import PUT_DatasetShape @@ -63,26 +63,24 @@ async def init(): app.router.add_route("DELETE", "/groups/{id}/links/{title}", DELETE_Link) app.router.add_route("PUT", "/groups/{id}/links/{title}", PUT_Link) app.router.add_route("GET", "/groups/{id}/attributes", GET_Attributes) - app.router.add_route("GET", "/groups/{id}/attributes/{name}", GET_Attribute) - app.router.add_route("DELETE", "/groups/{id}/attributes/{name}", DELETE_Attribute) - app.router.add_route("PUT", "/groups/{id}/attributes/{name}", PUT_Attribute) + app.router.add_route("POST", "/groups/{id}/attributes", POST_Attributes) + app.router.add_route("DELETE", "/groups/{id}/attributes", DELETE_Attributes) + app.router.add_route("PUT", "/groups/{id}/attributes", PUT_Attributes) app.router.add_route("GET", "/datatypes/{id}", GET_Datatype) app.router.add_route("DELETE", "/datatypes/{id}", DELETE_Datatype) app.router.add_route("POST", "/datatypes", POST_Datatype) app.router.add_route("GET", "/datatypes/{id}/attributes", GET_Attributes) - app.router.add_route("GET", "/datatypes/{id}/attributes/{name}", GET_Attribute) - app.router.add_route( - "DELETE", "/datatypes/{id}/attributes/{name}", DELETE_Attribute - ) - app.router.add_route("PUT", "/datatypes/{id}/attributes/{name}", PUT_Attribute) + app.router.add_route("POST", "/datatypes/{id}/attributes", POST_Attributes) + app.router.add_route("DELETE", "/datatypes/{id}/attributes", DELETE_Attributes) + app.router.add_route("PUT", "/datatypes/{id}/attributes", PUT_Attributes) app.router.add_route("GET", "/datasets/{id}", GET_Dataset) app.router.add_route("DELETE", "/datasets/{id}", DELETE_Dataset) app.router.add_route("POST", "/datasets", POST_Dataset) app.router.add_route("PUT", "/datasets/{id}/shape", PUT_DatasetShape) app.router.add_route("GET", "/datasets/{id}/attributes", GET_Attributes) - app.router.add_route("GET", "/datasets/{id}/attributes/{name}", GET_Attribute) - app.router.add_route("DELETE", "/datasets/{id}/attributes/{name}", DELETE_Attribute) - app.router.add_route("PUT", "/datasets/{id}/attributes/{name}", PUT_Attribute) + app.router.add_route("POST", "/datasets/{id}/attributes", POST_Attributes) + app.router.add_route("DELETE", "/datasets/{id}/attributes", DELETE_Attributes) + app.router.add_route("PUT", "/datasets/{id}/attributes", PUT_Attributes) app.router.add_route("PUT", "/chunks/{id}", PUT_Chunk) app.router.add_route("GET", "/chunks/{id}", GET_Chunk) app.router.add_route("POST", "/chunks/{id}", POST_Chunk) diff --git a/hsds/datanode_lib.py b/hsds/datanode_lib.py index 108eff07..c782b0e4 100644 --- a/hsds/datanode_lib.py +++ b/hsds/datanode_lib.py @@ -328,9 +328,7 @@ async def get_metadata_obj(app, obj_id, bucket=None): if isValidDomain(obj_id): domain_bucket = getBucketForDomain(obj_id) if bucket and domain_bucket and bucket != domain_bucket: - msg = ( - f"get_metadata_obj for domain: {obj_id} but bucket param was: {bucket}" - ) + msg = f"get_metadata_obj for 
domain: {obj_id} but bucket param was: {bucket}"
         log.error(msg)
         raise HTTPInternalServerError()
     if not bucket:
diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py
new file mode 100644
index 00000000..84ee7a87
--- /dev/null
+++ b/hsds/domain_crawl.py
@@ -0,0 +1,326 @@
+##############################################################################
+# Copyright by The HDF Group.                                                #
+# All rights reserved.                                                       #
+#                                                                            #
+# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and     #
+# Utilities.  The full HSDS copyright notice, including                     #
+# terms governing use, modification, and redistribution, is contained in    #
+# the file COPYING, which can be found at the root of the source code       #
+# distribution tree.  If you do not have access to this file, you may       #
+# request a copy from help@hdfgroup.org.                                    #
+##############################################################################
+#
+# domain crawler
+#
+
+import asyncio
+
+from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPConflict, HTTPBadRequest
+from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone
+
+
+from .util.idUtil import getCollectionForId, getDataNodeUrl
+
+from .servicenode_lib import getObjectJson, getAttributes, putAttributes
+from . import hsds_logger as log
+
+
+class DomainCrawler:
+    def __init__(
+        self,
+        app,
+        objs,
+        action="get_obj",
+        params=None,
+        max_tasks=40,
+        max_objects_limit=0,
+        raise_error=False
+    ):
+        log.info(f"DomainCrawler.__init__ {len(objs)} objs")
+        log.debug(f"params: {params}")
+        self._app = app
+        self._action = action
+        self._max_objects_limit = max_objects_limit
+        self._params = params
+        self._max_tasks = max_tasks
+        self._q = asyncio.Queue()
+        self._obj_dict = {}
+        self.seen_ids = set()
+        self._raise_error = raise_error
+        if not objs:
+            log.error("no objs for crawler to crawl!")
+            raise ValueError("no objects to crawl")
+
+        for obj_id in objs:
+            log.debug(f"adding {obj_id} to the queue")
+            self._q.put_nowait(obj_id)
+        if isinstance(objs, dict):
+            self._objs = objs
+        else:
+            self._objs = None
+
+    async def get_attributes(self, obj_id, attr_names):
+        # get the given attributes for the obj_id
+        msg = f"get_attributes for {obj_id}"
+        if attr_names:
+            msg += f", {len(attr_names)} attributes"
+        log.debug(msg)
+
+        kwargs = {}
+        for key in ("include_data", "ignore_nan", "bucket"):
+            if key in self._params:
+                kwargs[key] = self._params[key]
+        if attr_names:
+            kwargs["attr_names"] = attr_names
+        log.debug(f"using kwargs: {kwargs}")
+
+        status = 200
+        # make sure to catch all expected exceptions, otherwise
+        # the task will never complete
+        try:
+            attributes = await getAttributes(self._app, obj_id, **kwargs)
+        except HTTPBadRequest:
+            status = 400
+        except HTTPNotFound:
+            status = 404
+        except HTTPGone:
+            status = 410
+        except HTTPServiceUnavailable:
+            status = 503
+        except HTTPInternalServerError:
+            status = 500
+        except Exception as e:
+            log.error(f"unexpected exception from getAttributes: {e}")
+            status = 500
+
+        if status == 200:
+            log.debug(f"got attributes: {attributes}")
+            self._obj_dict[obj_id] = attributes
+        else:
+            log.warn(f"DomainCrawler - got {status} status for obj_id {obj_id}")
+            self._obj_dict[obj_id] = {"status": status}
+
+    async def put_attributes(self, obj_id, attr_items):
+        # write the given attributes for the obj_id
+        log.debug(f"put_attributes for {obj_id}, {len(attr_items)} attributes")
+        req = getDataNodeUrl(self._app, obj_id)
+        collection = getCollectionForId(obj_id)
+        req += f"/{collection}/{obj_id}/attributes"
+        kwargs = {}
+        if "bucket" in
self._params:
+            kwargs["bucket"] = self._params["bucket"]
+        if "replace" in self._params:
+            kwargs["replace"] = self._params["replace"]
+        status = None
+        try:
+            status = await putAttributes(self._app, obj_id, attr_items, **kwargs)
+        except HTTPConflict:
+            log.warn("DomainCrawler - got HTTPConflict from http_put")
+            status = 409
+        except HTTPServiceUnavailable:
+            status = 503
+        except HTTPInternalServerError:
+            status = 500
+        except Exception as e:
+            log.error(f"unexpected exception {e}")
+            status = 500
+
+        log.debug(f"DomainCrawler fetch for {obj_id} - returning status: {status}")
+        self._obj_dict[obj_id] = {"status": status}
+
+    async def get_obj_json(self, obj_id):
+        """ get the given obj_json for the obj_id.
+            for each group found, search the links if include_links is set """
+        log.debug(f"get_obj_json: {obj_id}")
+        collection = getCollectionForId(obj_id)
+        kwargs = {}
+
+        for k in ("include_links", "include_attrs", "bucket"):
+            if k in self._params:
+                kwargs[k] = self._params[k]
+        if collection == "groups" and self._params.get("follow_links"):
+            follow_links = True
+            kwargs["include_links"] = True  # get them so we can follow them
+        else:
+            follow_links = False
+        if follow_links or self._params.get("include_attrs"):
+            kwargs["refresh"] = True  # don't want a cached version in this case
+
+        log.debug(f"follow_links: {follow_links}")
+        log.debug(f"getObjectJson kwargs: {kwargs}")
+        obj_json = None
+        status = 200
+        try:
+            obj_json = await getObjectJson(self._app, obj_id, **kwargs)
+        except HTTPNotFound:
+            status = 404
+        except HTTPServiceUnavailable:
+            status = 503
+        except HTTPInternalServerError:
+            status = 500
+        except Exception as e:
+            log.error(f"unexpected exception {e}")
+            status = 500
+        log.debug(f"getObjectJson status: {status}")
+
+        if obj_json is None:
+            msg = f"DomainCrawler - getObjectJson for {obj_id} "
+            if status >= 500:
+                msg += f"failed, status: {status}"
+                log.error(msg)
+            else:
+                msg += f"returned status: {status}"
+                log.warn(msg)
+            return
+
+        log.debug(f"DomainCrawler - got json for {obj_id}")
+        log.debug(f"obj_json: {obj_json}")
+
+        log.debug("store obj json")
+        self._obj_dict[obj_id] = obj_json  # store the obj_json
+
+        # for groups iterate through all the hard links and
+        # add to the lookup ids set
+
+        log.debug(f"got collection: {collection}")
+
+        if collection == "groups" and follow_links:
+            if "links" not in obj_json:
+                log.error("expected links key in obj_json")
+                return
+            links = obj_json["links"]
+            log.debug(f"DomainCrawler links: {links}")
+            for title in links:
+                log.debug(f"DomainCrawler - got link: {title}")
+                link_obj = links[title]
+                num_objects = len(self._obj_dict)
+                if self._params.get("max_objects_limit") is not None:
+                    max_objects_limit = self._params["max_objects_limit"]
+                    if num_objects >= max_objects_limit:
+                        msg = "DomainCrawler reached limit of "
+                        msg += f"{max_objects_limit}"
+                        log.info(msg)
+                        break
+                if link_obj["class"] != "H5L_TYPE_HARD":
+                    # only follow hard links
+                    continue
+                link_id = link_obj["id"]
+                if link_id not in self._obj_dict:
+                    # haven't seen this object yet, get obj json
+                    log.debug(f"DomainCrawler - adding link_id: {link_id}")
+                    self._obj_dict[link_id] = {}  # placeholder for obj id
+                    self._q.put_nowait(link_id)
+
+    def get_status(self):
+        """ return the highest status of any of the returned objects """
+        status = None
+        for obj_id in self._obj_dict:
+            item = self._obj_dict[obj_id]
+            log.debug(f"item: {item}")
+            if "status" in item:
+                item_status = item["status"]
+                if status is None or item_status > status:
+                    # return the more severe error
+                    
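# note: statuses compare numerically here, so a 5xx from any object outranks a 4xx or 2xx
+                    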
log.debug(f"setting status to {item_status}") + status = item_status + return status + + async def crawl(self): + workers = [asyncio.Task(self.work()) for _ in range(self._max_tasks)] + # When all work is done, exit. + msg = "DomainCrawler - await queue.join - " + msg += f"count: {len(self._obj_dict)}" + log.info(msg) + await self._q.join() + msg = "DomainCrawler - join complete - " + msg += f"count: {len(self._obj_dict)}" + log.info(msg) + + for w in workers: + w.cancel() + log.debug("DomainCrawler - workers canceled") + + status = self.get_status() + if status: + log.debug(f"DomainCrawler -- status: {status}") + log.debug(f"raise_error: {self._raise_error}") + if self._raise_error: + # throw the approriate exception if other than 200, 201 + if status == 200: + pass # ok + elif status == 201: + pass # also ok + elif status == 400: + log.warn("DomainCrawler - BadRequest") + raise HTTPBadRequest(reason="unkown") + elif status == 404: + log.warn("DomainCrawler - not found") + raise HTTPNotFound() + elif status == 409: + log.warn("DomainCrawler - conflict") + raise HTTPConflict() + elif status == 410: + log.warn("DomainCrawler - gone") + raise HTTPGone() + elif status == 500: + log.error("DomainCrawler - internal server error") + raise HTTPInternalServerError() + elif status == 503: + log.error("DomainCrawler - server busy") + raise HTTPServiceUnavailable() + else: + log.error(f"DomainCrawler - unexpected status: {status}") + raise HTTPInternalServerError() + + async def work(self): + while True: + obj_id = await self._q.get() + await self.fetch(obj_id) + self._q.task_done() + + async def fetch(self, obj_id): + log.debug(f"DomainCrawler fetch for id: {obj_id}") + log.debug(f"action: {self._action}") + if self._action == "get_obj": + log.debug("DomainCrawler - get obj") + # just get the obj json + await self.get_obj_json(obj_id) + elif self._action == "get_attr": + log.debug("DomainCrawler - get attributes") + # fetch the given attributes + if self._objs is None: + log.error("DomainCrawler - self._objs not set") + return + if obj_id not in self._objs: + log.error(f"couldn't find {obj_id} in self._objs") + return + attr_names = self._objs[obj_id] + if attr_names is None: + log.debug(f"fetch all attributes for {obj_id}") + else: + if not isinstance(attr_names, list): + log.error("expected list for attribute names") + return + if len(attr_names) == 0: + log.warn("expected at least one name in attr_names list") + return + + log.debug(f"DomainCrawler - got attribute names: {attr_names}") + await self.get_attributes(obj_id, attr_names) + elif self._action == "put_attr": + log.debug("DomainCrawler - put attributes") + # write attributes + if self._objs and obj_id not in self._objs: + log.error(f"couldn't find {obj_id} in self._objs") + return + attr_items = self._objs[obj_id] + log.debug(f"got {len(attr_items)} attr_items") + + await self.put_attributes(obj_id, attr_items) + else: + msg = f"DomainCrawler: unexpected action: {self._action}" + log.error(msg) + + msg = f"DomainCrawler - fetch complete obj_id: {obj_id}, " + msg += f"{len(self._obj_dict)} objects found" + log.debug(msg) + log.debug(f"obj_dict: {self._obj_dict}") diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index 8ca13f72..ad6baf70 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -34,211 +34,19 @@ from .util.authUtil import validateUserPassword, getAclKeys from .util.domainUtil import getParentDomain, getDomainFromRequest from .util.domainUtil import isValidDomain, getBucketForDomain -from .util.domainUtil import 
getPathForDomain +from .util.domainUtil import getPathForDomain, getLimits from .util.storUtil import getStorKeys, getCompressors from .util.boolparser import BooleanParser from .util.globparser import globmatch from .servicenode_lib import getDomainJson, getObjectJson, getObjectIdByPath -from .servicenode_lib import getRootInfo, checkBucketAccess, doFlush +from .servicenode_lib import getRootInfo, checkBucketAccess, doFlush, getDomainResponse from .basenode import getVersion +from .domain_crawl import DomainCrawler +from .folder_crawl import FolderCrawler from . import hsds_logger as log from . import config -class DomainCrawler: - def __init__( - self, - app, - root_id, - bucket=None, - include_attrs=True, - max_tasks=40, - max_objects_limit=0, - ): - log.info(f"DomainCrawler.__init__ root_id: {root_id}") - self._app = app - self._max_objects_limit = max_objects_limit - self._include_attrs = include_attrs - self._max_tasks = max_tasks - self._q = asyncio.Queue() - self._obj_dict = {} - self.seen_ids = set() - self._q.put_nowait(root_id) - self._bucket = bucket - - async def crawl(self): - workers = [asyncio.Task(self.work()) for _ in range(self._max_tasks)] - # When all work is done, exit. - msg = "DomainCrawler - await queue.join - " - msg += f"count: {len(self._obj_dict)}" - log.info(msg) - await self._q.join() - msg = "DomainCrawler - join complete - " - msg += f"count: {len(self._obj_dict)}" - log.info(msg) - - for w in workers: - w.cancel() - log.debug("DomainCrawler - workers canceled") - - async def work(self): - while True: - obj_id = await self._q.get() - await self.fetch(obj_id) - self._q.task_done() - - async def fetch(self, obj_id): - log.debug(f"DomainCrawler - fetch for obj_id: {obj_id}") - kwargs = { - "include_links": True, - "include_attrs": self._include_attrs, - "bucket": self._bucket, - } - obj_json = await getObjectJson(self._app, obj_id, **kwargs) - log.debug(f"DomainCrawler - got json for {obj_id}") - - # including links, so don't need link count - if "link_count" in obj_json: - del obj_json["link_count"] - self._obj_dict[obj_id] = obj_json - if self._include_attrs: - del obj_json["attributeCount"] - - # if this is a group, iterate through all the hard links and - # add to the lookup ids set - if getCollectionForId(obj_id) == "groups": - links = obj_json["links"] - log.debug(f"DomainCrawler links: {links}") - for title in links: - log.debug(f"DomainCrawler - got link: {title}") - link_obj = links[title] - num_objects = len(self._obj_dict) - if self._max_objects_limit > 0: - if num_objects >= self._max_objects_limit: - msg = "DomainCrawler reached limit of " - msg += f"{self._max_objects_limit}" - log.info(msg) - break - if link_obj["class"] != "H5L_TYPE_HARD": - continue - link_id = link_obj["id"] - if link_id not in self._obj_dict: - # haven't seen this object yet, get obj json - log.debug(f"DomainCrawler - adding link_id: {link_id}") - self._obj_dict[link_id] = {} # placeholder for obj id - self._q.put_nowait(link_id) - msg = f"DomainCrawler - fetch complete obj_id: {obj_id}, " - msg += f"{len(self._obj_dict)} objects found" - log.debug(msg) - - -class FolderCrawler: - def __init__( - self, - app, - domains, - bucket=None, - get_root=False, - verbose=False, - max_tasks_per_node=100, - ): - log.info(f"FolderCrawler.__init__ {len(domains)} domain names") - self._app = app - self._get_root = get_root - self._verbose = verbose - self._q = asyncio.Queue() - self._domain_dict = {} - self._group_dict = {} - for domain in domains: - self._q.put_nowait(domain) - 
self._bucket = bucket - max_tasks = max_tasks_per_node * getNodeCount(app) - if len(domains) > max_tasks: - self._max_tasks = max_tasks - else: - self._max_tasks = len(domains) - - async def crawl(self): - workers = [asyncio.Task(self.work()) for _ in range(self._max_tasks)] - # When all work is done, exit. - msg = f"FolderCrawler max_tasks {self._max_tasks} = await queue.join " - msg += f"- count: {len(self._domain_dict)}" - log.info(msg) - await self._q.join() - folder_count = len(self._domain_dict) - msg = f"FolderCrawler - join complete - count: {folder_count}" - log.info(msg) - - for w in workers: - w.cancel() - log.debug("FolderCrawler - workers canceled") - - async def work(self): - while True: - start = time.time() - domain = await self._q.get() - await self.fetch(domain) - self._q.task_done() - elapsed = time.time() - start - msg = f"FolderCrawler - task {domain} start: {start:.3f} " - msg += f"elapsed: {elapsed:.3f}" - log.debug(msg) - - async def fetch(self, domain): - msg = f"FolderCrawler - fetch for domain: {domain} bucket: " - msg += f"{self._bucket}" - log.debug(msg) - domain_key = self._bucket + domain - try: - kwargs = {"reload": True} - domain_json = await getDomainJson(self._app, domain_key, **kwargs) - msg = f"FolderCrawler - {domain} got domain_json: {domain_json}" - log.debug(msg) - if domain_json: - kwargs = {"verbose": self._verbose, "bucket": self._bucket} - domain_rsp = await get_domain_response(self._app, domain_json, **kwargs) - for k in ("limits", "version", "compressors"): - if k in domain_rsp: - # don't return given key for multi-domain responses - del domain_rsp[k] - msg = f"FolderCrawler - {domain} get domain_rsp: {domain_rsp}" - log.debug(msg) - # mixin domain name - self._domain_dict[domain] = domain_rsp - if self._get_root and "root" in domain_json: - root_id = domain_json["root"] - log.debug(f"fetching root json for {root_id}") - root_json = await getObjectJson( - self._app, - root_id, - include_links=False, - include_attrs=True, - bucket=self._bucket, - ) - log.debug(f"got root_json: {root_json}") - self._group_dict[root_id] = root_json - else: - log.warn(f"FolderCrawler - no domain found for {domain}") - except HTTPNotFound: - # One of the domains not found, but continue through the list - log.warn(f"fetch result - not found error for: {domain}") - except HTTPGone: - log.warn(f"fetch result - domain: {domain} has been deleted") - except HTTPInternalServerError: - log.error(f"fetch result - internal error fetching: {domain}") - except HTTPForbidden: - log.warn(f"fetch result - access not allowed for: {domain}") - except HTTPBadRequest: - log.error(f"fetch result - bad request for: {domain}") - except HTTPServiceUnavailable: - msg = f"fetch result - service unavailable for domain: {domain}" - log.warn(msg) - except Exception as e: - msg = f"fetch result - unexpected exception for domain {domain}: " - msg += f"exception of type {type(e)}, {e}" - log.error(msg) - - async def get_collections(app, root_id, bucket=None): """Return the object ids for given root.""" @@ -301,15 +109,17 @@ async def getDomainObjects(app, root_id, include_attrs=False, bucket=None): keyed by obj id """ - log.info(f"getDomainObjects for root: {root_id}") + log.info(f"getDomainObjects for root: {root_id}, include_attrs: {include_attrs}") max_objects_limit = int(config.get("domain_req_max_objects_limit", default=500)) - kwargs = { + crawler_params = { "include_attrs": include_attrs, "bucket": bucket, + "follow_links": True, "max_objects_limit": max_objects_limit, } - crawler = 
DomainCrawler(app, root_id, **kwargs) + + crawler = DomainCrawler(app, [root_id, ], action="get_obj", params=crawler_params) await crawler.crawl() if len(crawler._obj_dict) >= max_objects_limit: msg = "getDomainObjects - too many objects: " @@ -344,99 +154,6 @@ def getIdList(objs, marker=None, limit=None): return ret_ids -def getLimits(): - """return limits the client may need""" - limits = {} - limits["min_chunk_size"] = int(config.get("min_chunk_size")) - limits["max_chunk_size"] = int(config.get("max_chunk_size")) - limits["max_request_size"] = int(config.get("max_request_size")) - - return limits - - -async def get_domain_response(app, domain_json, bucket=None, verbose=False): - rsp_json = {} - if "root" in domain_json: - rsp_json["root"] = domain_json["root"] - rsp_json["class"] = "domain" - else: - rsp_json["class"] = "folder" - if "owner" in domain_json: - rsp_json["owner"] = domain_json["owner"] - if "created" in domain_json: - rsp_json["created"] = domain_json["created"] - - lastModified = 0 - if "lastModified" in domain_json: - lastModified = domain_json["lastModified"] - totalSize = len(json.dumps(domain_json)) - metadata_bytes = 0 - allocated_bytes = 0 - linked_bytes = 0 - num_chunks = 0 - num_linked_chunks = 0 - md5_sum = "" - - if verbose and "root" in domain_json: - root_id = domain_json["root"] - root_info = await getRootInfo(app, domain_json["root"], bucket=bucket) - if root_info: - log.info(f"got root_info for root: {root_id}") - allocated_bytes = root_info["allocated_bytes"] - totalSize += allocated_bytes - if "linked_bytes" in root_info: - linked_bytes += root_info["linked_bytes"] - totalSize += linked_bytes - if "num_linked_chunks" in root_info: - num_linked_chunks = root_info["num_linked_chunks"] - if "metadata_bytes" in root_info: - # this key was added for schema v2 - metadata_bytes = root_info["metadata_bytes"] - totalSize += metadata_bytes - if root_info["lastModified"] > lastModified: - lastModified = root_info["lastModified"] - if "md5_sum" in root_info: - md5_sum = root_info["md5_sum"] - - num_groups = root_info["num_groups"] - num_datatypes = root_info["num_datatypes"] - num_datasets = len(root_info["datasets"]) - num_chunks = root_info["num_chunks"] - rsp_json["scan_info"] = root_info # return verbose info here - - else: - # root info not available - just return 0 for these values - log.info(f"root_info not available for root: {root_id}") - allocated_bytes = 0 - totalSize = 0 - num_groups = 0 - num_datasets = 0 - num_datatypes = 0 - num_chunks = 0 - - num_objects = num_groups + num_datasets + num_datatypes + num_chunks - rsp_json["num_groups"] = num_groups - rsp_json["num_datasets"] = num_datasets - rsp_json["num_datatypes"] = num_datatypes - rsp_json["num_objects"] = num_objects - rsp_json["total_size"] = totalSize - rsp_json["allocated_bytes"] = allocated_bytes - rsp_json["num_objects"] = num_objects - rsp_json["metadata_bytes"] = metadata_bytes - rsp_json["linked_bytes"] = linked_bytes - rsp_json["num_chunks"] = num_chunks - rsp_json["num_linked_chunks"] = num_linked_chunks - rsp_json["md5_sum"] = md5_sum - - # pass back config parameters the client may care about - - rsp_json["limits"] = getLimits() - rsp_json["compressors"] = getCompressors() - rsp_json["version"] = getVersion() - rsp_json["lastModified"] = lastModified - return rsp_json - - async def get_domains(request): """This method is called by GET_Domains and GET_Domain""" app = request.app @@ -701,6 +418,7 @@ async def GET_Domain(request): log.request(request) app = request.app params = 
request.rel_url.query + log.debug(f"GET_Domain query params: {params}") parent_id = None include_links = False @@ -796,6 +514,7 @@ async def GET_Domain(request): # it's in the meta_cache). kwargs = {"refresh": True, "bucket": bucket, "include_attrs": include_attrs, "include_links": include_links} + log.debug(f"kwargs for getObjectJson: {kwargs}") obj_json = await getObjectJson(app, obj_id, **kwargs) @@ -840,14 +559,13 @@ async def GET_Domain(request): # return just the keys as per the REST API kwargs = {"verbose": verbose, "bucket": bucket} - rsp_json = await get_domain_response(app, domain_json, **kwargs) + rsp_json = await getDomainResponse(app, domain_json, **kwargs) # include domain objects if requested - if "getobjs" in params and params["getobjs"] and "root" in domain_json: + if params.get("getobjs") and "root" in domain_json: + + log.debug("getting all domain objects") root_id = domain_json["root"] - include_attrs = False - if "include_attrs" in params and params["include_attrs"]: - include_attrs = True kwargs = {"include_attrs": include_attrs, "bucket": bucket} domain_objs = await getDomainObjects(app, root_id, **kwargs) if domain_objs: diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 26e4bf5a..7df44873 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -716,6 +716,7 @@ async def POST_Dataset(request): log.warn(msg) raise HTTPBadRequest(reason=msg) + log.debug(f"got body: {body}") # get domain, check authorization domain = getDomainFromRequest(request) if not isValidDomain(domain): @@ -725,6 +726,7 @@ async def POST_Dataset(request): bucket = getBucketForDomain(domain) domain_json = await getDomainJson(app, domain, reload=True) + log.debug(f"got domain_json: {domain_json}") root_id = domain_json["root"] # throws exception if not allowed @@ -741,6 +743,7 @@ async def POST_Dataset(request): raise HTTPBadRequest(reason=msg) datatype = body["type"] + log.debug(f"got datatype: {datatype}") if isinstance(datatype, str) and datatype.startswith("t-"): # Committed type - fetch type json from DN ctype_id = datatype @@ -793,11 +796,10 @@ async def POST_Dataset(request): shape_json["class"] = "H5S_SCALAR" else: shape = body["shape"] + log.debug(f"got shape: {shape}") if isinstance(shape, int): shape_json["class"] = "H5S_SIMPLE" - dims = [ - shape, - ] + dims = [shape, ] shape_json["dims"] = dims rank = 1 elif isinstance(shape, str): @@ -1040,6 +1042,7 @@ async def POST_Dataset(request): link_title = None if "link" in body: link_body = body["link"] + log.debug(f"got link_body: {link_body}") if "id" in link_body: link_id = link_body["id"] if "name" in link_body: @@ -1051,7 +1054,7 @@ async def POST_Dataset(request): await validateAction(app, domain, link_id, username, "create") dset_id = createObjId("datasets", rootid=root_id) - log.info(f"new dataset id: {dset_id}") + log.info(f"new dataset id: {dset_id}") dataset_json = { "id": dset_id, diff --git a/hsds/folder_crawl.py b/hsds/folder_crawl.py new file mode 100644 index 00000000..48f37ce6 --- /dev/null +++ b/hsds/folder_crawl.py @@ -0,0 +1,131 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. 
If you do not have access to this file, you may       #
+# request a copy from help@hdfgroup.org.                                    #
+##############################################################################
+#
+# folder crawler for the hsds cluster
+#
+
+import time
+import asyncio
+from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound
+from aiohttp.web_exceptions import HTTPGone, HTTPInternalServerError
+from aiohttp.web_exceptions import HTTPServiceUnavailable
+
+from .util.idUtil import getNodeCount
+from .servicenode_lib import getObjectJson, getDomainResponse, getDomainJson
+from . import hsds_logger as log
+
+
+class FolderCrawler:
+    def __init__(
+        self,
+        app,
+        domains,
+        bucket=None,
+        get_root=False,
+        verbose=False,
+        max_tasks_per_node=100,
+    ):
+        log.info(f"FolderCrawler.__init__ {len(domains)} domain names")
+        self._app = app
+        self._get_root = get_root
+        self._verbose = verbose
+        self._q = asyncio.Queue()
+        self._domain_dict = {}
+        self._group_dict = {}
+        for domain in domains:
+            self._q.put_nowait(domain)
+        self._bucket = bucket
+        max_tasks = max_tasks_per_node * getNodeCount(app)
+        if len(domains) > max_tasks:
+            self._max_tasks = max_tasks
+        else:
+            self._max_tasks = len(domains)
+
+    async def crawl(self):
+        workers = [asyncio.Task(self.work()) for _ in range(self._max_tasks)]
+        # When all work is done, exit.
+        msg = f"FolderCrawler max_tasks {self._max_tasks} - await queue.join "
+        msg += f"- count: {len(self._domain_dict)}"
+        log.info(msg)
+        await self._q.join()
+        folder_count = len(self._domain_dict)
+        msg = f"FolderCrawler - join complete - count: {folder_count}"
+        log.info(msg)
+
+        for w in workers:
+            w.cancel()
+        log.debug("FolderCrawler - workers canceled")
+
+    async def work(self):
+        while True:
+            start = time.time()
+            domain = await self._q.get()
+            await self.fetch(domain)
+            self._q.task_done()
+            elapsed = time.time() - start
+            msg = f"FolderCrawler - task {domain} start: {start:.3f} "
+            msg += f"elapsed: {elapsed:.3f}"
+            log.debug(msg)
+
+    async def fetch(self, domain):
+        msg = f"FolderCrawler - fetch for domain: {domain} bucket: "
+        msg += f"{self._bucket}"
+        log.debug(msg)
+        domain_key = self._bucket + domain
+        try:
+            kwargs = {"reload": True}
+            domain_json = await getDomainJson(self._app, domain_key, **kwargs)
+            msg = f"FolderCrawler - {domain} got domain_json: {domain_json}"
+            log.debug(msg)
+            if domain_json:
+                kwargs = {"verbose": self._verbose, "bucket": self._bucket}
+                domain_rsp = await getDomainResponse(self._app, domain_json, **kwargs)
+                for k in ("limits", "version", "compressors"):
+                    if k in domain_rsp:
+                        # don't return given key for multi-domain responses
+                        del domain_rsp[k]
+                msg = f"FolderCrawler - {domain} got domain_rsp: {domain_rsp}"
+                log.debug(msg)
+                # mixin domain name
+                self._domain_dict[domain] = domain_rsp
+                if self._get_root and "root" in domain_json:
+                    root_id = domain_json["root"]
+                    log.debug(f"fetching root json for {root_id}")
+                    root_json = await getObjectJson(
+                        self._app,
+                        root_id,
+                        include_links=False,
+                        include_attrs=True,
+                        bucket=self._bucket,
+                    )
+                    log.debug(f"got root_json: {root_json}")
+                    self._group_dict[root_id] = root_json
+            else:
+                log.warn(f"FolderCrawler - no domain found for {domain}")
+        except HTTPNotFound:
+            # One of the domains not found, but continue through the list
+            log.warn(f"fetch result - not found error for: {domain}")
+        except HTTPGone:
+            log.warn(f"fetch result - domain: {domain} has been deleted")
+        except HTTPInternalServerError:
+            log.error(f"fetch result - internal error fetching: {domain}")
+        except
HTTPForbidden: + log.warn(f"fetch result - access not allowed for: {domain}") + except HTTPBadRequest: + log.error(f"fetch result - bad request for: {domain}") + except HTTPServiceUnavailable: + msg = f"fetch result - service unavailable for domain: {domain}" + log.warn(msg) + except Exception as e: + msg = f"fetch result - unexpected exception for domain {domain}: " + msg += f"exception of type {type(e)}, {e}" + log.error(msg) diff --git a/hsds/servicenode.py b/hsds/servicenode.py index 312fd54c..22b9822a 100755 --- a/hsds/servicenode.py +++ b/hsds/servicenode.py @@ -30,8 +30,8 @@ from .domain_sn import GET_ACL, GET_ACLs, PUT_ACL from .group_sn import GET_Group, POST_Group, DELETE_Group from .link_sn import GET_Links, GET_Link, PUT_Link, DELETE_Link -from .attr_sn import GET_Attributes, GET_Attribute, PUT_Attribute -from .attr_sn import DELETE_Attribute, GET_AttributeValue, PUT_AttributeValue +from .attr_sn import GET_Attributes, GET_Attribute, PUT_Attribute, PUT_Attributes, DELETE_Attribute +from .attr_sn import DELETE_Attributes, GET_AttributeValue, PUT_AttributeValue, POST_Attributes from .ctype_sn import GET_Datatype, POST_Datatype, DELETE_Datatype from .dset_sn import GET_Dataset, POST_Dataset, DELETE_Dataset from .dset_sn import GET_DatasetShape, PUT_DatasetShape, GET_DatasetType @@ -44,98 +44,134 @@ async def init(): # call app.router.add_get() here to add node-specific routes # + + # + # domain paths + # path = "/" app.router.add_route("GET", path, GET_Domain) app.router.add_route("DELETE", path, DELETE_Domain) app.router.add_route("PUT", path, PUT_Domain) + path = "/domains" app.router.add_route("GET", path, GET_Domains) + + # + # acls paths + # path = "/acls/{username}" app.router.add_route("GET", path, GET_ACL) app.router.add_route("PUT", path, PUT_ACL) + path = "/acls" app.router.add_route("GET", path, GET_ACLs) + + # + # groups paths + # path = "/groups/" app.router.add_route("GET", path, GET_Group) + path = "/groups" app.router.add_route("GET", path, GET_Groups) app.router.add_route("POST", path, POST_Group) + path = "/groups/{id}" app.router.add_route("GET", path, GET_Group) app.router.add_route("DELETE", path, DELETE_Group) + path = "/groups/{id}/links" app.router.add_route("GET", path, GET_Links) + path = "/groups/{id}/links/{title}" app.router.add_route("GET", path, GET_Link) app.router.add_route("DELETE", path, DELETE_Link) app.router.add_route("PUT", path, PUT_Link) + path = "/groups/{id}/attributes" app.router.add_route("GET", path, GET_Attributes) + app.router.add_route("POST", path, POST_Attributes) + app.router.add_route("PUT", path, PUT_Attributes) + app.router.add_route("DELETE", path, DELETE_Attributes) + path = "/groups/{id}/attributes/{name}" app.router.add_route("GET", path, GET_Attribute) app.router.add_route("DELETE", path, DELETE_Attribute) app.router.add_route("PUT", path, PUT_Attribute) + path = "/groups/{id}/attributes/{name}/value" app.router.add_route("GET", path, GET_AttributeValue) app.router.add_route("PUT", path, PUT_AttributeValue) - path = "/groups/{id}/acls/{username}" - app.router.add_route("GET", path, GET_ACL) - app.router.add_route("PUT", path, PUT_ACL) - path = "/groups/{id}/acls" - app.router.add_route("GET", path, GET_ACLs) + + # + # datatypes paths + # path = "/datatypes" app.router.add_route("GET", path, GET_Datatypes) app.router.add_route("POST", path, POST_Datatype) + path = "/datatypes/" app.router.add_route("GET", path, GET_Datatype) + path = "/datatypes/{id}" app.router.add_route("GET", path, GET_Datatype) 
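+    # note: the multi-op attribute routes (POST/PUT/DELETE on the attributes collection path) are registered below for each object type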
app.router.add_route("DELETE", path, DELETE_Datatype) + path = "/datatypes/{id}/attributes" app.router.add_route("GET", path, GET_Attributes) + app.router.add_route("POST", path, POST_Attributes) + app.router.add_route("PUT", path, PUT_Attributes) + app.router.add_route("DELETE", path, DELETE_Attributes) + path = "/datatypes/{id}/attributes/{name}" app.router.add_route("GET", path, GET_Attribute) app.router.add_route("DELETE", path, DELETE_Attribute) app.router.add_route("PUT", path, PUT_Attribute) + path = "/datatypes/{id}/attributes/{name}/value" app.router.add_route("GET", path, GET_AttributeValue) app.router.add_route("PUT", path, PUT_AttributeValue) - path = "/datatypes/{id}/acls/{username}" - app.router.add_route("GET", path, GET_ACL) - app.router.add_route("PUT", path, PUT_ACL) - path = "/datatypes/{id}/acls" - app.router.add_route("GET", path, GET_ACLs) + + # + # datasets paths + # path = "/datasets/{id}" app.router.add_route("GET", path, GET_Dataset) app.router.add_route("DELETE", path, DELETE_Dataset) + path = "/datasets/" app.router.add_route("GET", path, GET_Dataset) + path = "/datasets" app.router.add_route("GET", path, GET_Datasets) app.router.add_route("POST", path, POST_Dataset) + path = "/datasets/{id}/shape" app.router.add_route("GET", path, GET_DatasetShape) app.router.add_route("PUT", path, PUT_DatasetShape) + path = "/datasets/{id}/type" app.router.add_route("GET", path, GET_DatasetType) + path = "/datasets/{id}/attributes" app.router.add_route("GET", path, GET_Attributes) + app.router.add_route("POST", path, POST_Attributes) + app.router.add_route("PUT", path, PUT_Attributes) + app.router.add_route("DELETE", path, DELETE_Attributes) + path = "/datasets/{id}/attributes/{name}" app.router.add_route("GET", path, GET_Attribute) app.router.add_route("DELETE", path, DELETE_Attribute) app.router.add_route("PUT", path, PUT_Attribute) + path = "/datasets/{id}/attributes/{name}/value" app.router.add_route("GET", path, GET_AttributeValue) app.router.add_route("PUT", path, PUT_AttributeValue) + path = "/datasets/{id}/value" app.router.add_route("PUT", path, PUT_Value) app.router.add_route("GET", path, GET_Value) app.router.add_route("POST", path, POST_Value) - path = "/datasets/{id}/acls/{username}" - app.router.add_route("GET", path, GET_ACL) - app.router.add_route("PUT", path, PUT_ACL) - path = "/datasets/{id}/acls" - app.router.add_route("GET", path, GET_ACLs) # Add CORS to all routes cors_domain = config.get("cors_domain") diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 2a7b45c1..c5f0c561 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -14,18 +14,22 @@ # import asyncio +import json from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden from aiohttp.web_exceptions import HTTPNotFound, HTTPInternalServerError from aiohttp.client_exceptions import ClientOSError, ClientError from .util.authUtil import getAclKeys +from .util.arrayUtil import encodeData from .util.idUtil import getDataNodeUrl, getCollectionForId, isSchema2Id, getS3Key from .util.linkUtil import h5Join from .util.storUtil import getStorJSONObj, isStorObj from .util.authUtil import aclCheck -from .util.httpUtil import http_get, http_put -from .util.domainUtil import getBucketForDomain, verifyRoot +from .util.httpUtil import http_get, http_put, http_post, http_delete +from .util.domainUtil import getBucketForDomain, verifyRoot, getLimits +from .util.storUtil import getCompressors +from .basenode import getVersion from . 
import hsds_logger as log @@ -72,6 +76,88 @@ async def getDomainJson(app, domain, reload=False): return domain_json +async def getDomainResponse(app, domain_json, bucket=None, verbose=False): + """ construct JSON response for domain request """ + rsp_json = {} + if "root" in domain_json: + rsp_json["root"] = domain_json["root"] + rsp_json["class"] = "domain" + else: + rsp_json["class"] = "folder" + if "owner" in domain_json: + rsp_json["owner"] = domain_json["owner"] + if "created" in domain_json: + rsp_json["created"] = domain_json["created"] + + lastModified = 0 + if "lastModified" in domain_json: + lastModified = domain_json["lastModified"] + totalSize = len(json.dumps(domain_json)) + metadata_bytes = 0 + allocated_bytes = 0 + linked_bytes = 0 + num_chunks = 0 + num_linked_chunks = 0 + md5_sum = "" + + if verbose and "root" in domain_json: + root_id = domain_json["root"] + root_info = await getRootInfo(app, root_id, bucket=bucket) + if root_info: + allocated_bytes = root_info["allocated_bytes"] + totalSize += allocated_bytes + if "linked_bytes" in root_info: + linked_bytes += root_info["linked_bytes"] + totalSize += linked_bytes + if "num_linked_chunks" in root_info: + num_linked_chunks = root_info["num_linked_chunks"] + if "metadata_bytes" in root_info: + # this key was added for schema v2 + metadata_bytes = root_info["metadata_bytes"] + totalSize += metadata_bytes + if root_info["lastModified"] > lastModified: + lastModified = root_info["lastModified"] + if "md5_sum" in root_info: + md5_sum = root_info["md5_sum"] + + num_groups = root_info["num_groups"] + num_datatypes = root_info["num_datatypes"] + num_datasets = len(root_info["datasets"]) + num_chunks = root_info["num_chunks"] + rsp_json["scan_info"] = root_info # return verbose info here + + else: + # root info not available - just return 0 for these values + allocated_bytes = 0 + totalSize = 0 + num_groups = 0 + num_datasets = 0 + num_datatypes = 0 + num_chunks = 0 + + num_objects = num_groups + num_datasets + num_datatypes + num_chunks + rsp_json["num_groups"] = num_groups + rsp_json["num_datasets"] = num_datasets + rsp_json["num_datatypes"] = num_datatypes + rsp_json["num_objects"] = num_objects + rsp_json["total_size"] = totalSize + rsp_json["allocated_bytes"] = allocated_bytes + rsp_json["num_objects"] = num_objects + rsp_json["metadata_bytes"] = metadata_bytes + rsp_json["linked_bytes"] = linked_bytes + rsp_json["num_chunks"] = num_chunks + rsp_json["num_linked_chunks"] = num_linked_chunks + rsp_json["md5_sum"] = md5_sum + + # pass back config parameters the client may care about + + rsp_json["limits"] = getLimits() + rsp_json["compressors"] = getCompressors() + rsp_json["version"] = getVersion() + rsp_json["lastModified"] = lastModified + return rsp_json + + def checkBucketAccess(app, bucket, action="read"): """ if the given bucket is not the default bucket, check that non-default bucket access is enabled. 
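+    # usage sketch for getDomainResponse above (illustrative only, mirroring FolderCrawler.fetch):
+    #   domain_json = await getDomainJson(app, domain_key, reload=True)
+    #   domain_rsp = await getDomainResponse(app, domain_json, bucket=bucket, verbose=True)
+    # with verbose=True, scan_info totals are folded into the response when root info is available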
@@ -169,6 +255,10 @@ async def getObjectJson(
     meta_cache = app["meta_cache"]
     obj_json = None
+    msg = f"getObjectJson - obj_id: {obj_id} refresh: {refresh} "
+    msg += f"include_links: {include_links} include_attrs: {include_attrs}"
+    log.debug(msg)
+
     if include_links or include_attrs:
         # links and attributes are subject to change, so always refresh
         refresh = True
@@ -207,12 +297,22 @@ async def getObjectJson(
         log.debug(f"getObjectJson - fetching {obj_id} from {req}")
         # throws 404 if doesn't exist
         obj_json = await http_get(app, req, params=params)
-        meta_cache[obj_id] = obj_json
+
     if obj_json is None:
         msg = f"Object: {obj_id} not found, req: {req}, params: {params}"
         log.warn(msg)
         raise HTTPNotFound()
+    # store object in meta_cache (but don't include links or attributes,
+    # since they are volatile)
+    cache_obj = {}
+    for k in obj_json:
+        if k in ("links", "attributes"):
+            continue
+        cache_obj[k] = obj_json[k]
+    meta_cache[obj_id] = cache_obj
+    log.debug(f"stored {cache_obj} in meta_cache")
+
     return obj_json
@@ -313,11 +413,14 @@ async def getObjectIdByPath(app, obj_id, h5path, bucket=None, refresh=False, dom
             if link_json["h5path"][0] == '/':
                 msg = "External link by absolute path"
                 log.debug(msg)
+                kwargs = {}
+                kwargs["bucket"] = bucket
+                kwargs["refresh"] = refresh
+                kwargs["domain"] = domain
+                kwargs["follow_soft_links"] = follow_soft_links
+                kwargs["follow_external_links"] = follow_external_links
                 obj_id, domain, link_json = await getObjectIdByPath(
-                    app, ext_domain_json["root"], link_json["h5path"],
-                    bucket=bucket, refresh=refresh, domain=domain,
-                    follow_soft_links=follow_soft_links,
-                    follow_external_links=follow_external_links)
+                    app, ext_domain_json["root"], link_json["h5path"], **kwargs)
             else:
                 msg = "Cannot follow external link by relative path"
                 log.warn(msg)
@@ -539,3 +642,136 @@ async def doFlush(app, root_id, bucket=None):
     else:
         log.info("doFlush no fails, returning dn ids")
         return dn_ids
+
+
+async def getAttributes(app, obj_id,
+                        attr_names=None,
+                        include_data=True,
+                        ignore_nan=False,
+                        create_order=False,
+                        encoding=None,
+                        limit=0,
+                        marker=None,
+                        bucket=None
+                        ):
+    """ get the requested set of attributes from the given object """
+    if attr_names is None:
+        msg = "attr_names is None, do a GET for all attributes"
+        log.debug(msg)
+
+    collection = getCollectionForId(obj_id)
+    node_url = getDataNodeUrl(app, obj_id)
+    req = f"{node_url}/{collection}/{obj_id}/attributes"
+    log.debug(f"getAttributes: {req}")
+    params = {}
+    if include_data:
+        params["IncludeData"] = 1
+    if ignore_nan:
+        params["ignore_nan"] = 1
+    if bucket:
+        params["bucket"] = bucket
+    if create_order:
+        params["CreateOrder"] = 1
+    if encoding:
+        params["encoding"] = encoding
+
+    if attr_names:
+        # send names via a POST request
+        data = {"attributes": attr_names}
+        log.debug(f"using params: {params}")
+        dn_json = await http_post(app, req, data=data, params=params)
+        log.debug(f"attributes POST response for obj_id {obj_id} got: {dn_json}")
+    else:
+        # some additional query params for get attributes
+        if limit:
+            params["Limit"] = limit
+        if marker:
+            params["Marker"] = marker
+        log.debug(f"using params: {params}")
+        # do a get to fetch all the attributes
+        dn_json = await http_get(app, req, params=params)
+        log.debug(f"attribute GET response for obj_id {obj_id} got: {dn_json}")
+
+    log.debug(f"got attributes json from dn for obj_id: {obj_id}")
+    if "attributes" not in dn_json:
+        msg = f"expected attributes key from dn, but got: {dn_json}"
+        log.error(msg)
+        raise HTTPInternalServerError()
+
+    attributes =
dn_json["attributes"] + if not isinstance(attributes, list): + msg = f"was expecting list of attributes, but got: {type(attributes)}" + log.error(msg) + raise HTTPInternalServerError() + + if attr_names and len(attributes) < len(attr_names): + msg = f"POST attributes requested {len(attr_names)}, " + msg += f"but only {len(attributes)} were returned" + log.warn(msg) + + log.debug(f"getAttributes returning {len(attributes)} attributes") + return attributes + + +async def putAttributes(app, + obj_id, + attr_json=None, + replace=False, + bucket=None + ): + + """ write the given attributes to the appropriate DN """ + req = getDataNodeUrl(app, obj_id) + collection = getCollectionForId(obj_id) + req += f"/{collection}/{obj_id}/attributes" + log.info(f"putAttribute: {req}") + + params = {} + if replace: + # allow attribute to be overwritten + log.debug("setting replace for putAtttributes") + params["replace"] = 1 + else: + log.debug("replace is not set for putAttributes") + + if bucket: + params["bucket"] = bucket + + data = {"attributes": attr_json} + log.debug(f"put attributes params: {params}") + log.debug(f"put attributes: {attr_json}") + put_rsp = await http_put(app, req, data=data, params=params) + + if "status" in put_rsp: + status = put_rsp["status"] + else: + status = 201 + + log.info(f"putAttributes status: {status}") + + return status + + +async def deleteAttributes(app, obj_id, attr_names=None, separator="/", bucket=None): + """ delete the requested set of attributes from the given object """ + + if attr_names is None or len(attr_names) == 0: + msg = "provide a list of attribute names for deletion" + log.debug(msg) + raise HTTPBadRequest(reason=msg) + + collection = getCollectionForId(obj_id) + node_url = getDataNodeUrl(app, obj_id) + req = f"{node_url}/{collection}/{obj_id}/attributes" + log.debug(f"deleteAttributes: {req}") + # always use base64 to avoid any issues with url encoding + params = {"encoding": "base64", "separator": separator} + if bucket: + params["bucket"] = bucket + + # stringify the list of attr_names + attr_name_param = separator.join(attr_names) + attr_name_param = encodeData(attr_name_param).decode("ascii") + params["attr_names"] = attr_name_param + log.debug(f"using params: {params}") + await http_delete(app, req, params=params) diff --git a/hsds/util/arrayUtil.py b/hsds/util/arrayUtil.py index 10a28cfa..5cb40f58 100644 --- a/hsds/util/arrayUtil.py +++ b/hsds/util/arrayUtil.py @@ -425,29 +425,69 @@ def readElement(buffer, offset, arr, index, dt): return offset -def arrayToBytes(arr): +def encodeData(data, encoding="base64"): + """ Encode given data """ + if encoding != "base64": + raise ValueError("only base64 encoding is supported") + try: + if isinstance(data, str): + data = data.encode("utf8") + except UnicodeEncodeError: + raise ValueError("can not encode string value") + if not isinstance(data, bytes): + msg = "Expected str or bytes type to encodeData, " + msg += f"but got: {type(data)}" + raise TypeError(msg) + try: + encoded_data = base64.b64encode(data) + except Exception as e: + # TBD: what exceptions can be raised? 
+ raise ValueError(f"Unable to encode: {e}") + return encoded_data + + +def decodeData(data, encoding="base64"): + if encoding != "base64": + raise ValueError("only base64 decoding is supported") + try: + decoded_data = base64.b64decode(data) + except Exception as e: + # TBD: catch actual exception + raise ValueError(f"Unable to decode: {e}") + return decoded_data + + +def arrayToBytes(arr, encoding=None): """ Return byte representation of numpy array """ - if not isVlen(arr.dtype): - # can just return normal numpy bytestream - return arr.tobytes() + if isVlen(arr.dtype): + nSize = getByteArraySize(arr) + buffer = bytearray(nSize) + offset = 0 + nElements = math.prod(arr.shape) + arr1d = arr.reshape((nElements,)) + for e in arr1d: + # print("arrayToBytes:", e) + offset = copyElement(e, arr1d.dtype, buffer, offset) + data = bytes(buffer) + else: + # fixed length type + data = arr.tobytes() - nSize = getByteArraySize(arr) - buffer = bytearray(nSize) - offset = 0 - nElements = math.prod(arr.shape) - arr1d = arr.reshape((nElements,)) - for e in arr1d: - # print("arrayToBytes:", e) - offset = copyElement(e, arr1d.dtype, buffer, offset) - return bytes(buffer) + if encoding: + data = encodeData(data) + return data -def bytesToArray(data, dt, shape): +def bytesToArray(data, dt, shape, encoding=None): """ Create numpy array based on byte representation """ + if encoding: + # decode the data + # will raise ValueError if non-decodeable + data = decodeData(data) if not isVlen(dt): # regular numpy from string arr = np.frombuffer(data, dtype=dt) diff --git a/hsds/util/attrUtil.py b/hsds/util/attrUtil.py index bbecf4bc..68ef2cdc 100755 --- a/hsds/util/attrUtil.py +++ b/hsds/util/attrUtil.py @@ -51,8 +51,3 @@ def validateAttributeName(name): msg = f"attribute name must be a string, but got: {type(name)}" log.warn(msg) raise HTTPBadRequest(reason=msg) - if name.find("/") > -1: - msg = "attribute names cannot contain slashes" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - # TBD - add any other restrictions diff --git a/hsds/util/domainUtil.py b/hsds/util/domainUtil.py index 3659b9e5..3fa2e7f7 100644 --- a/hsds/util/domainUtil.py +++ b/hsds/util/domainUtil.py @@ -13,6 +13,7 @@ import os.path as op from aiohttp.web_exceptions import HTTPBadRequest +from .. 
import config
 #
 # Domain utilities
 #
@@ -287,3 +288,13 @@ def verifyRoot(domain_json):
         # can't use hsds logger, since it would create a circular dependency
         print("WARN> " + msg)
         raise HTTPBadRequest(reason=msg)
+
+
+def getLimits():
+    """return limits the client may need"""
+    limits = {}
+    limits["min_chunk_size"] = int(config.get("min_chunk_size"))
+    limits["max_chunk_size"] = int(config.get("max_chunk_size"))
+    limits["max_request_size"] = int(config.get("max_request_size"))
+
+    return limits
diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py
index 330f8aed..a0de6cd5 100644
--- a/hsds/util/dsetUtil.py
+++ b/hsds/util/dsetUtil.py
@@ -366,9 +366,7 @@ def getShapeDims(shape):
     """
     dims = None
     if isinstance(shape, int):
-        dims = [
-            shape,
-        ]
+        dims = [shape, ]
     elif isinstance(shape, list) or isinstance(shape, tuple):
         dims = shape  # can use as is
     elif isinstance(shape, str):
@@ -382,9 +380,7 @@ def getShapeDims(shape):
         if shape["class"] == "H5S_NULL":
             dims = None
         elif shape["class"] == "H5S_SCALAR":
-            dims = [
-                1,
-            ]
+            dims = [1,]
         elif shape["class"] == "H5S_SIMPLE":
             if "dims" not in shape:
                 raise ValueError("'dims' key expected for shape")
@@ -392,7 +388,7 @@
         else:
             raise ValueError("Unknown shape class: {}".format(shape["class"]))
     else:
-        raise ValueError("Unexpected shape class: {}".format(type(shape)))
+        raise ValueError(f"Unexpected shape class: {type(shape)}")
     return dims
diff --git a/hsds/util/httpUtil.py b/hsds/util/httpUtil.py
index cffe475d..1cc1e0dd 100644
--- a/hsds/util/httpUtil.py
+++ b/hsds/util/httpUtil.py
@@ -277,16 +277,16 @@ async def http_get(app, url, params=None, client=None):
         else:
             retval = await rsp.json()
     elif status_code == 400:
-        log.info(f"BadRequest to {url}")
+        log.warn(f"BadRequest to {url}")
         raise HTTPBadRequest()
     elif status_code == 403:
-        log.info(f"Forbiden to access {url}")
+        log.warn(f"Forbidden to access {url}")
         raise HTTPForbidden()
     elif status_code == 404:
-        log.info(f"Object: {url} not found")
+        log.warn(f"Object: {url} not found")
         raise HTTPNotFound()
     elif status_code == 410:
-        log.info(f"Object: {url} removed")
+        log.warn(f"Object: {url} removed")
         raise HTTPGone()
     elif status_code == 503:
         log.warn(f"503 error for http_get_Json {url}")
@@ -309,7 +309,10 @@ async def http_post(app, url, data=None, params=None, client=None):
     """
     Helper function  - async HTTP POST
     """
-    log.info(f"http_post('{url}', {len(data)} bytes)")
+    msg = f"http_post('{url}'"
+    if isinstance(data, bytes):
+        msg += f", {len(data)} bytes"
+    msg += ")"
+    log.info(msg)
     if client is None:
         client = get_http_client(app, url=url)
     url = get_http_std_url(url)
@@ -335,21 +338,21 @@ async def http_post(app, url, data=None, params=None, client=None):
         return None
     elif rsp.status == 400:
         msg = f"POST request HTTPBadRequest error for url: {url}"
-        log.info(msg)
+        log.warn(msg)
         raise HTTPBadRequest()
     elif rsp.status == 404:
         msg = f"POST request HTTPNotFound error for url: {url}"
-        log.info(msg)
+        log.warn(msg)
         raise HTTPNotFound()
     elif rsp.status == 410:
-        log.info(f"POST request HTTPGone error for url: {url}")
+        log.warn(f"POST request HTTPGone error for url: {url}")
         raise HTTPGone()
     elif rsp.status == 503:
         log.warn(f"503 error for http_get_Json {url}")
         raise HTTPServiceUnavailable()
     else:
         msg = f"POST request error for url: {url} status: {rsp.status}"
-        log.warn(msg)
+        log.error(msg)
         raise HTTPInternalServerError()
     if isBinaryResponse(rsp):
         # return binary data
@@ -378,7 +381,7 @@ async def http_put(app, url, data=None, params=None, client=None):
         client = get_http_client(app,
url=url) url = get_http_std_url(url) if isinstance(data, bytes): - log.debug("setting http_put for binary") + log.debug(f"setting http_put for binary, {len(data)} bytes") kwargs = {"data": data} else: log.debug("setting http_put for json") @@ -442,8 +445,6 @@ async def http_delete(app, url, data=None, params=None, client=None): kwargs["timeout"] = timeout if params: kwargs["params"] = params - if data: - params["json"] = data try: async with client.delete(url, **kwargs) as rsp: diff --git a/tests/integ/acl_test.py b/tests/integ/acl_test.py index a75c8c0b..2af5e24c 100644 --- a/tests/integ/acl_test.py +++ b/tests/integ/acl_test.py @@ -79,21 +79,6 @@ def testGetAcl(self): rsp = self.session.get(req, headers=headers) rspJson = json.loads(rsp.text) self.assertTrue("root" in rspJson) - root_uuid = rspJson["root"] - - # get the ACL for the Group - req = helper.getEndpoint() + "/groups/" + root_uuid + "/acls/" + username - rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 200) - self.assertEqual(rsp.headers["content-type"], "application/json; charset=utf-8") - rsp_json = json.loads(rsp.text) - self.assertTrue("acl" in rsp_json) - self.assertTrue("hrefs" in rsp_json) - acl = rsp_json["acl"] - self.assertEqual(len(acl.keys()), len(acl_keys) + 1) - for k in acl_keys: - self.assertTrue(k in acl) - self.assertEqual(acl[k], True) # try getting the ACL for a random user, should return 404 req = helper.getEndpoint() + "/acls/joebob" @@ -165,17 +150,6 @@ def testGetAcls(self): self.assertTrue("root" in rspJson) root_uuid = rspJson["root"] - # get the ACLs for the Group - req = helper.getEndpoint() + "/groups/" + root_uuid + "/acls" - rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 200) - self.assertEqual(rsp.headers["content-type"], "application/json; charset=utf-8") - rsp_json = json.loads(rsp.text) - self.assertTrue("acls" in rsp_json) - self.assertTrue("hrefs" in rsp_json) - acls = rsp_json["acls"] - self.assertEqual(len(acls), expected_acl_count) - # create a dataset payload = { "type": "H5T_STD_I32LE", @@ -189,18 +163,6 @@ def testGetAcls(self): dset_uuid = rspJson["id"] self.assertTrue(helper.validateId(dset_uuid)) - # now try getting the ACLs for the dataset - req = helper.getEndpoint() + "/datasets/" + dset_uuid + "/acls" - rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 200) - self.assertEqual(rsp.headers["content-type"], "application/json; charset=utf-8") - rsp_json = json.loads(rsp.text) - self.assertTrue("acls" in rsp_json) - self.assertTrue("hrefs" in rsp_json) - acls = rsp_json["acls"] - - self.assertEqual(len(acls), expected_acl_count) - # create a committed type payload = {"type": "H5T_IEEE_F64LE", "link": {"id": root_uuid, "name": "dtype"}} @@ -213,17 +175,6 @@ def testGetAcls(self): dtype_uuid = rspJson["id"] self.assertTrue(helper.validateId(dtype_uuid)) - # now try getting the ACLs for the datatype - req = helper.getEndpoint() + "/datatypes/" + dtype_uuid + "/acls" - rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 200) - self.assertEqual(rsp.headers["content-type"], "application/json; charset=utf-8") - rsp_json = json.loads(rsp.text) - self.assertTrue("acls" in rsp_json) - self.assertTrue("hrefs" in rsp_json) - acls = rsp_json["acls"] - self.assertEqual(len(acls), expected_acl_count) - # try fetching ACLs from a user who doesn't have readACL permissions req = helper.getEndpoint() + "/acls" user2name = config.get("user2_name") diff --git 
a/tests/integ/attr_test.py b/tests/integ/attr_test.py
index e6fabeda..6e203981 100644
--- a/tests/integ/attr_test.py
+++ b/tests/integ/attr_test.py
@@ -13,6 +13,7 @@
 import unittest
 import json
 import numpy as np
+import base64
 import helper
 import config
@@ -31,6 +32,14 @@ def tearDown(self):
         if self.session:
             self.session.close()
+    def getUUIDByPath(self, domain, h5path):
+        return helper.getUUIDByPath(domain, h5path, session=self.session)
+
+    def getRootUUID(self, domain, username=None, password=None):
+        return helper.getRootUUID(
+            domain, username=username, password=password, session=self.session
+        )
+
     # main
     def testListAttr(self):
@@ -102,7 +111,7 @@ def testListAttr(self):
         # get all the attributes
         req = self.endpoint + "/groups/" + root_uuid + "/attributes"
-        params = {}
+        params = {"IncludeData": 0}
         if creation_order:
             params["CreateOrder"] = 1
         rsp = self.session.get(req, params=params, headers=headers)
@@ -125,7 +134,6 @@ def testListAttr(self):
             self.assertTrue("shape" in attrJson)
             shapeJson = attrJson["shape"]
             self.assertEqual(shapeJson["class"], "H5S_SCALAR")
-            # self.assertTrue("value" not in attrJson)  # TBD - change api to include value?
             self.assertTrue("created" in attrJson)
             self.assertTrue("href" in attrJson)
             self.assertTrue("value" not in attrJson)
@@ -232,18 +240,16 @@ def testObjAttr(self):
             # do a GET for attribute "attr" (should return 404)
             attr_name = "attr"
-            attr_payload = {"type": "H5T_STD_I32LE", "value": 42}
             req = f"{self.endpoint}/{col_name}/{obj1_id}/attributes/{attr_name}"
             rsp = self.session.get(req, headers=headers)
             self.assertEqual(rsp.status_code, 404)  # not found
+            attr_payload = {"type": "H5T_STD_I32LE", "value": 42}
             # try adding the attribute as a different user
             user2_name = config.get("user2_name")
             if user2_name:
-                headers = helper.getRequestHeaders(
-                    domain=self.base_domain, username="test_user2"
-                )
+                headers = helper.getRequestHeaders(domain=self.base_domain, username="test_user2")
                 rsp = self.session.put(req, data=json.dumps(attr_payload), headers=headers)
                 self.assertEqual(rsp.status_code, 403)  # forbidden
             else:
@@ -287,6 +293,12 @@ def testObjAttr(self):
             rsp = self.session.put(req, data=json.dumps(attr_payload), headers=headers)
             self.assertEqual(rsp.status_code, 409)  # conflict
+            # set the replace param and we should get a 200
+            params = {"replace": 1}
+            data = json.dumps(attr_payload)
+            rsp = self.session.put(req, params=params, data=data, headers=headers)
+            self.assertEqual(rsp.status_code, 200)  # OK
+
             # delete the attribute
             rsp = self.session.delete(req, headers=headers)
             self.assertEqual(rsp.status_code, 200)  # OK
@@ -676,6 +688,65 @@ def testPutFixedUTF8StringBinary(self):
         self.assertTrue("charSet" in type_json)
         self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8")
+    def testPutNonUTF8String(self):
+        # Test PUT value for 1d attribute with string that is not UTF-8 encodable
+        print("testPutNonUTF8String", self.base_domain)
+
+        headers = helper.getRequestHeaders(domain=self.base_domain)
+        headers_bin_req = helper.getRequestHeaders(domain=self.base_domain)
+        headers_bin_req["Content-Type"] = "application/octet-stream"
+
+        req = self.endpoint + "/"
+
+        # Get root uuid
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        root_uuid = rspJson["root"]
+        helper.validateId(root_uuid)
+
+        # create attr
+        data = b'\xfe\xff'  # invalid UTF-8 sequence
+
+        num_bytes = len(data)
+        fixed_str_type = {
+            "charSet": "H5T_CSET_UTF8",
+            "class": "H5T_STRING",
+            "length": num_bytes,
+            "strPad":
"H5T_STR_NULLTERM", + } + + scalar_shape = {"class": "H5S_SCALAR"} + body = {"type": fixed_str_type, "shape": scalar_shape} + attr_name = "bad_utf_attr" + req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + rsp = self.session.put(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # write binary value for attribute + rsp = self.session.put(req + "/value", data=data, headers=headers_bin_req) + self.assertEqual(rsp.status_code, 200) + + # read attr + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("encoding" in rspJson) + self.assertEqual(rspJson["encoding"], "base64") + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], "/v8=") # base64 encoded value for b'\xfe\xff' + self.assertTrue("type" in rspJson) + type_json = rspJson["type"] + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_STRING") + self.assertTrue("length" in type_json) + self.assertEqual(type_json["length"], num_bytes) + self.assertTrue("strPad" in type_json) + self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM") + self.assertTrue("charSet" in type_json) + self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8") + def testPutVLenString(self): # Test PUT value for 1d attribute with variable length string types print("testPutVLenString", self.base_domain) @@ -1332,7 +1403,7 @@ def testGetAttributeBinaryValue(self): def testPutAttributeBinaryValue(self): # Test Put Attribute value with binary response - print("testGetAttributeBinaryValue", self.base_domain) + print("testPutAttributeBinaryValue", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) headers_bin_req = helper.getRequestHeaders(domain=self.base_domain) @@ -1347,7 +1418,7 @@ def testPutAttributeBinaryValue(self): root_uuid = rspJson["root"] helper.validateId(root_uuid) - # create attr without any data + # create attr with one-dimensional array value = [2, 3, 5, 7, 11, 13] extent = len(value) body = {"type": "H5T_STD_I32LE", "shape": extent} @@ -1395,6 +1466,76 @@ def testPutAttributeBinaryValue(self): self.assertFalse("shape" in rspJson) self.assertEqual(rspJson["value"], value) + def testPutAttributeWithEncoding(self): + # Test Put Attribute with base64 encoding of the value + print("testPutAttributeWithEncoding", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # one dimensional list of ints + value = [2, 3, 5, 7, 11, 13] + extent = len(value) + + # convert to numpy array + arr = np.array(value, np.int32) + arr_bytes = arr.tobytes() + encoded_bytes = base64.b64encode(arr_bytes) # base64 encode array data + encoded_value = encoded_bytes.decode("ascii") # convert bytes to string + body = {"type": "H5T_STD_I32LE", "shape": extent} + body["value"] = encoded_value + body["encoding"] = "base64" + attr_name = "encoded_int_arr" + req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + rsp = self.session.put(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # read attr back + req = f"{self.endpoint}/groups/{root_uuid}/attributes/{attr_name}" + + rsp = self.session.get(req, 
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertTrue("value" in rspJson)
+        self.assertTrue("type" in rspJson)
+        self.assertTrue("shape" in rspJson)
+        self.assertTrue("encoding" not in rspJson)
+        self.assertEqual(rspJson["value"], value)
+
+        # get the encoded value back
+        params = {"encoding": "base64"}
+        rsp = self.session.get(req, params=params, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertTrue("value" in rspJson)
+        self.assertTrue("type" in rspJson)
+        self.assertTrue("shape" in rspJson)
+        self.assertTrue("encoding" in rspJson)
+        self.assertEqual(rspJson["encoding"], "base64")
+        self.assertEqual(rspJson["value"], encoded_value)
+
+        # do a binary read
+        headers_bin_rsp = helper.getRequestHeaders(domain=self.base_domain)
+        headers_bin_rsp["accept"] = "application/octet-stream"
+        req = f"{self.endpoint}/groups/{root_uuid}/attributes/{attr_name}/value"
+
+        rsp = self.session.get(req, headers=headers_bin_rsp)
+        self.assertEqual(rsp.status_code, 200)
+        self.assertEqual(rsp.headers["Content-Type"], "application/octet-stream")
+        data = rsp.content
+        self.assertEqual(len(data), len(arr_bytes))
+        self.assertEqual(data, arr_bytes)
+
     def testNaNAttributeValue(self):
         # Test GET Attribute value with JSON response that contains NaN data
         print("testNaNAttributeValue", self.base_domain)
@@ -1410,9 +1551,7 @@ def testNaNAttributeValue(self):
         helper.validateId(root_uuid)
 
         # create attr
-        value = [
-            np.NaN,
-        ] * 6
+        value = [np.NaN, ] * 6
         data = {"type": "H5T_IEEE_F32LE", "shape": 6, "value": value}
         attr_name = "nan_arr_attr"
         req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name
@@ -1422,9 +1561,7 @@ def testNaNAttributeValue(self):
         # get all attributes, then by name, and then by value
         for req_suffix in ("", f"/{attr_name}", f"/{attr_name}/value"):
             for ignore_nan in (False, True):
-                req = (
-                    self.endpoint + "/groups/" + root_uuid + "/attributes" + req_suffix
-                )
+                req = f"{self.endpoint}/groups/{root_uuid}/attributes{req_suffix}"
                 params = {}
                 if not req_suffix:
                     # fetch data when getting all attribute
                     params["IncludeData"] = 1
                 if ignore_nan:
                     params["ignore_nan"] = 1
                 rsp = self.session.get(req, headers=headers, params=params)
-                if rsp.status_code == 500 and not ignore_nan:
-                    # nan value can generate json encoiding errors
-                    continue
                 self.assertEqual(rsp.status_code, 200)
                 rspJson = json.loads(rsp.text)
                 self.assertTrue("hrefs" in rspJson)
@@ -1454,6 +1588,601 @@ def testNaNAttributeValue(self):
                 else:
                     self.assertTrue(np.isnan(rspValue[i]))
 
+    def testNonURLEncodableAttributeName(self):
+        print("testNonURLEncodableAttributeName", self.base_domain)
+        headers = helper.getRequestHeaders(domain=self.base_domain)
+        req = self.endpoint + "/"
+
+        # Get root uuid
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        root_uuid = rspJson["root"]
+        helper.validateId(root_uuid)
+
+        # create a subgroup
+        req = self.endpoint + "/groups"
+        rsp = self.session.post(req, headers=headers)
+        self.assertEqual(rsp.status_code, 201)
+        rspJson = json.loads(rsp.text)
+        grp_id = rspJson["id"]
+        self.assertTrue(helper.validateId(grp_id))
+
+        # link as "grp1"
+        grp_name = "grp1"
+        req = self.endpoint + "/groups/" + root_uuid + "/links/" + grp_name
+        payload = {"id": grp_id}
+        rsp = self.session.put(req, data=json.dumps(payload), headers=headers)
+        self.assertEqual(rsp.status_code, 201) # created
+
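+        # "#" starts a URL fragment and "/" is a path separator, so this
+        # attribute name can't be addressed via a /attributes/{name} URI;
+        # only the body- and param-based multi-attribute operations can reach it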
+        attr_name = "#attr/1#" # add a slash for extra challenge points
+        req = self.endpoint + "/groups/" + grp_id + "/attributes" # request without name
+        bad_req = f"{req}/{attr_name}" # this request will fail because of the hash char
+
+        # create attr
+        value = [i * 2 for i in range(6)]
+        data = {"type": "H5T_IEEE_F32LE", "shape": 6, "value": value}
+        rsp = self.session.put(bad_req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 404) # regular put doesn't work
+
+        attributes = {attr_name: data}
+        body = {"attributes": attributes}
+
+        rsp = self.session.put(req, data=json.dumps(body), headers=headers)
+        self.assertEqual(rsp.status_code, 201) # this is ok
+
+        # get all attributes and verify the one we created is there
+        expected_type = {'class': 'H5T_FLOAT', 'base': 'H5T_IEEE_F32LE'}
+        expected_shape = {'class': 'H5S_SIMPLE', 'dims': [6]}
+        params = {"IncludeData": 1}
+        rsp = self.session.get(req, headers=headers, params=params)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        rsp_attributes = rspJson["attributes"]
+        self.assertEqual(len(rsp_attributes), 1)
+        rsp_attr = rsp_attributes[0]
+        self.assertTrue("name" in rsp_attr)
+        self.assertEqual(rsp_attr["name"], attr_name)
+        self.assertTrue("href" in rsp_attr)
+        self.assertTrue("created" in rsp_attr)
+        self.assertTrue("type" in rsp_attr)
+        self.assertEqual(rsp_attr["type"], expected_type)
+        self.assertTrue("shape" in rsp_attr)
+        self.assertEqual(rsp_attr["shape"], expected_shape)
+        self.assertTrue("value" in rsp_attr)
+        self.assertEqual(rsp_attr["value"], value)
+
+        # try doing a get on this specific attribute
+        rsp = self.session.get(bad_req, headers=headers)
+        self.assertEqual(rsp.status_code, 404) # can't do a get with the attribute name
+
+        # do a post request with the attribute name
+        attr_names = [attr_name, ]
+        data = {"attr_names": attr_names}
+        rsp = self.session.post(req, data=json.dumps(data), params=params, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("attributes" in rspJson)
+        rsp_attributes = rspJson["attributes"]
+        self.assertEqual(len(rsp_attributes), 1)
+        rsp_attr = rsp_attributes[0]
+
+        self.assertTrue("name" in rsp_attr)
+        self.assertEqual(rsp_attr["name"], attr_name)
+        self.assertTrue("href" in rsp_attr)
+        self.assertTrue("created" in rsp_attr)
+        self.assertTrue("type" in rsp_attr)
+        self.assertEqual(rsp_attr["type"], expected_type)
+        self.assertTrue("shape" in rsp_attr)
+        self.assertEqual(rsp_attr["shape"], expected_shape)
+        self.assertTrue("value" in rsp_attr)
+        self.assertEqual(rsp_attr["value"], value)
+
+        # try deleting the attribute by name
+        rsp = self.session.delete(bad_req, headers=headers)
+        self.assertEqual(rsp.status_code, 404) # not found
+
+        # send attribute name as an encoded query param
+        attr_names_param = base64.b64encode(attr_name.encode("utf8")).decode("ascii")
+        # specify a non-default separator since the attribute name contains a slash
+        params = {"attr_names": attr_names_param, "encoding": "base64", "separator": "!"}
+        rsp = self.session.delete(req, params=params, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+
+        # verify the attribute is gone
+        rsp = self.session.get(req, headers=headers, params=params)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        rsp_attributes = rspJson["attributes"]
+        self.assertEqual(len(rsp_attributes), 0)
+
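+    # POST on an attributes collection is the multi-get form: attribute names
+    # (and optionally obj ids) are passed in the request body instead of the URL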
+    def testPostAttributeSingle(self):
+        domain = helper.getTestDomain("tall.h5")
+        print("testPostAttributeSingle", domain)
+        headers = helper.getRequestHeaders(domain=domain)
+        headers["Origin"] = "https://www.hdfgroup.org" # test CORS
+        headers_bin_rsp = helper.getRequestHeaders(domain=domain)
+        headers_bin_rsp["accept"] = "application/octet-stream"
+
+        # verify domain exists
+        req = helper.getEndpoint() + "/"
+        rsp = self.session.get(req, headers=headers)
+        if rsp.status_code != 200:
+            msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?"
+            print(msg)
+            return # abort rest of test
+        domainJson = json.loads(rsp.text)
+        root_id = domainJson["root"]
+        helper.validateId(root_id)
+
+        attr_names = ["attr1", "attr2"]
+        expected_types = ["H5T_STD_I8LE", "H5T_STD_I32BE"]
+        expected_values = [[97, 98, 99, 100, 101, 102, 103, 104, 105, 0],
+                           [[0, 1], [2, 3]]]
+
+        data = {"attr_names": attr_names}
+        req = helper.getEndpoint() + "/groups/" + root_id + "/attributes"
+        params = {"IncludeData": 0}
+        rsp = self.session.post(req, data=json.dumps(data), params=params, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        attributes = rspJson["attributes"]
+        self.assertTrue(isinstance(attributes, list))
+
+        self.assertEqual(len(attributes), len(attr_names))
+        for i in range(len(attr_names)):
+            attrJson = attributes[i]
+            self.assertTrue("name" in attrJson)
+            self.assertEqual(attrJson["name"], attr_names[i])
+            self.assertTrue("type" in attrJson)
+            type_json = attrJson["type"]
+            self.assertEqual(type_json["class"], "H5T_INTEGER")
+            self.assertEqual(type_json["base"], expected_types[i])
+            self.assertTrue("shape" in attrJson)
+            shapeJson = attrJson["shape"]
+            self.assertEqual(shapeJson["class"], "H5S_SIMPLE")
+            self.assertTrue("created" in attrJson)
+            self.assertTrue("href" in attrJson)
+            self.assertTrue("value" not in attrJson)
+
+        # test with returning all attribute values
+        params = {"IncludeData": 1}
+        rsp = self.session.post(req, data=json.dumps(data), params=params, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        attributes = rspJson["attributes"]
+        self.assertTrue(isinstance(attributes, list))
+
+        self.assertEqual(len(attributes), len(attr_names))
+        for i in range(len(attr_names)):
+            attrJson = attributes[i]
+            self.assertTrue("name" in attrJson)
+            self.assertEqual(attrJson["name"], attr_names[i])
+            self.assertTrue("type" in attrJson)
+            type_json = attrJson["type"]
+            self.assertEqual(type_json["class"], "H5T_INTEGER")
+            self.assertEqual(type_json["base"], expected_types[i])
+            self.assertTrue("shape" in attrJson)
+            shapeJson = attrJson["shape"]
+            self.assertEqual(shapeJson["class"], "H5S_SIMPLE")
+            self.assertTrue("created" in attrJson)
+            self.assertTrue("href" in attrJson)
+            self.assertTrue("value" in attrJson)
+            self.assertEqual(attrJson["value"], expected_values[i])
+
+    def testPostAttributeMultiple(self):
+        """ Get attributes for multiple objs """
+        domain = helper.getTestDomain("tall.h5")
+        print("testPostAttributeMultiple", domain)
+        headers = helper.getRequestHeaders(domain=domain)
+        headers["Origin"] = "https://www.hdfgroup.org" # test CORS
+        headers_bin_rsp = helper.getRequestHeaders(domain=domain)
+        headers_bin_rsp["accept"] = "application/octet-stream"
+
+        # verify domain exists
+        req = helper.getEndpoint() + "/"
+        rsp = self.session.get(req, headers=headers)
+        if rsp.status_code != 200:
+            msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?"
+            print(msg)
+            return # abort rest of test
+        domainJson = json.loads(rsp.text)
+        root_id = domainJson["root"]
+        helper.validateId(root_id)
+        dset_id = self.getUUIDByPath(domain, "/g1/g1.1/dset1.1.1")
+        helper.validateId(dset_id)
+
+        attr_names = ["attr1", "attr2"]
+        obj_ids = [root_id, dset_id]
+        expected_types_lookup = {}
+        expected_types_lookup[root_id] = ["H5T_STD_I8LE", "H5T_STD_I32BE"]
+        expected_types_lookup[dset_id] = ["H5T_STD_I8LE", "H5T_STD_I8LE"]
+        expected_values_lookup = {}
+        expected_values_lookup[root_id] = [
+            [97, 98, 99, 100, 101, 102, 103, 104, 105, 0],
+            [[0, 1], [2, 3]]]
+        expected_values_lookup[dset_id] = [
+            [49, 115, 116, 32, 97, 116, 116, 114, 105, 98, 117, 116,
+             101, 32, 111, 102, 32, 100, 115, 101, 116, 49, 46, 49,
+             46, 49, 0],
+            [50, 110, 100, 32, 97, 116, 116, 114, 105, 98, 117, 116,
+             101, 32, 111, 102, 32, 100, 115, 101, 116, 49, 46, 49,
+             46, 49, 0]
+        ]
+
+        data = {"attr_names": attr_names, "obj_ids": obj_ids}
+        params = {"IncludeData": 0}
+        req = helper.getEndpoint() + "/groups/" + root_id + "/attributes"
+        rsp = self.session.post(req, data=json.dumps(data), params=params, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        attributes = rspJson["attributes"]
+        self.assertTrue(isinstance(attributes, dict))
+        self.assertEqual(len(attributes), 2)
+
+        self.assertTrue(root_id in attributes)
+        self.assertTrue(dset_id in attributes)
+
+        for obj_id in attributes.keys():
+            expected_types = expected_types_lookup[obj_id]
+            obj_attributes = attributes[obj_id]
+
+            self.assertEqual(len(obj_attributes), len(attr_names))
+            for i in range(len(attr_names)):
+                attrJson = obj_attributes[i]
+                self.assertTrue("name" in attrJson)
+                self.assertEqual(attrJson["name"], attr_names[i])
+                self.assertTrue("type" in attrJson)
+                type_json = attrJson["type"]
+                self.assertEqual(type_json["class"], "H5T_INTEGER")
+                self.assertEqual(type_json["base"], expected_types[i])
+                self.assertTrue("shape" in attrJson)
+                shapeJson = attrJson["shape"]
+                self.assertEqual(shapeJson["class"], "H5S_SIMPLE")
+                self.assertTrue("created" in attrJson)
+                self.assertTrue("href" in attrJson)
+                self.assertTrue("value" not in attrJson)
+
+        # test with returning attribute values
+        params = {"IncludeData": 1}
+        rsp = self.session.post(req, data=json.dumps(data), params=params, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        attributes = rspJson["attributes"]
+        self.assertTrue(isinstance(attributes, dict))
+        self.assertEqual(len(attributes), 2)
+
+        self.assertTrue(root_id in attributes)
+        self.assertTrue(dset_id in attributes)
+
+        for obj_id in attributes.keys():
+            expected_types = expected_types_lookup[obj_id]
+            expected_values = expected_values_lookup[obj_id]
+            obj_attributes = attributes[obj_id]
+
+            self.assertEqual(len(obj_attributes), len(attr_names))
+            for i in range(len(attr_names)):
+                attrJson = obj_attributes[i]
+                self.assertTrue("name" in attrJson)
+                self.assertEqual(attrJson["name"], attr_names[i])
+                self.assertTrue("type" in attrJson)
+                type_json = attrJson["type"]
+                self.assertEqual(type_json["class"], "H5T_INTEGER")
+                self.assertEqual(type_json["base"], expected_types[i])
+                self.assertTrue("shape" in attrJson)
+                shapeJson = attrJson["shape"]
+                self.assertEqual(shapeJson["class"], "H5S_SIMPLE")
+                self.assertTrue("created" in attrJson)
+                self.assertTrue("href" in attrJson)
+                self.assertTrue("value" in attrJson)
+                self.assertEqual(attrJson["value"], expected_values[i])
+
+        # test with unique attr names per obj id
+        items = {}
+        items[root_id] = [attr_names[0], attr_names[1]]
+        items[dset_id] = [attr_names[1], ]
+        data = {"obj_ids": items}
+        req = helper.getEndpoint() + "/groups/" + root_id + "/attributes"
+        rsp = self.session.post(req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        attributes = rspJson["attributes"]
+        self.assertTrue(isinstance(attributes, dict))
+        self.assertEqual(len(attributes), 2)
+
+        self.assertTrue(root_id in attributes)
+        self.assertTrue(dset_id in attributes)
+        root_attrs = attributes[root_id]
+        self.assertEqual(len(root_attrs), 2)
+        dset_attrs = attributes[dset_id]
+        self.assertEqual(len(dset_attrs), 1) # only asked for attr2
+        dset_attr = dset_attrs[0]
+        self.assertEqual(dset_attr["name"], "attr2")
+
+        # try asking for a non-existent attribute
+        items = {}
+        items[root_id] = [attr_names[0], attr_names[1]]
+        items[dset_id] = [attr_names[1], "foobar"]
+        data = {"obj_ids": items}
+        req = helper.getEndpoint() + "/groups/" + root_id + "/attributes"
+        rsp = self.session.post(req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 404)
+
+        # test with not providing any attribute names - should return all attributes
+        # for set of obj ids
+        data = {"obj_ids": obj_ids}
+        req = helper.getEndpoint() + "/groups/" + root_id + "/attributes"
+        rsp = self.session.post(req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        attributes = rspJson["attributes"]
+        self.assertEqual(len(attributes), 2)
+        self.assertTrue(root_id in attributes)
+        root_attrs = attributes[root_id]
+        self.assertEqual(len(root_attrs), 2)
+        dset_attrs = attributes[dset_id]
+        self.assertEqual(len(dset_attrs), 2)
+
+    def testPutAttributeMultiple(self):
+        print("testPutAttributeMultiple", self.base_domain)
+        headers = helper.getRequestHeaders(domain=self.base_domain)
+        req = self.endpoint + "/"
+
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        root_id = rspJson["root"]
+
+        # create a dataset
+        req = self.endpoint + "/datasets"
+        data = {"type": "H5T_IEEE_F32LE"}
+        rsp = self.session.post(req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 201)
+        rspJson = json.loads(rsp.text)
+        self.assertEqual(rspJson["attributeCount"], 0)
+        dset_id = rspJson["id"]
+        self.assertTrue(helper.validateId(dset_id))
+
+        # link new obj as '/dset1'
+        req = self.endpoint + "/groups/" + root_id + "/links/dset1"
+        payload = {"id": dset_id}
+        rsp = self.session.put(req, data=json.dumps(payload), headers=headers)
+        self.assertEqual(rsp.status_code, 201) # created
+
+        # get obj and verify it has no attributes
+        req = self.endpoint + "/datasets/" + dset_id
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertEqual(rspJson["attributeCount"], 0) # no attributes
+
+        # create some groups
+        grp_count = 3
+
+        grp_names = [f"group{i+1}" for i in range(grp_count)]
+        grp_ids = []
+
+        for grp_name in grp_names:
+            # create sub_groups
+            req = self.endpoint + "/groups"
+            rsp = self.session.post(req, headers=headers)
+            self.assertEqual(rsp.status_code, 201)
+            rspJson = json.loads(rsp.text)
+            self.assertEqual(rspJson["attributeCount"], 0)
+            grp_id = rspJson["id"]
+            self.assertTrue(helper.validateId(grp_id))
+            grp_ids.append(grp_id)
+
+            # link new obj as '/grp_name'
+            req = self.endpoint + "/groups/" + root_id + "/links/" + grp_name
+            payload = {"id": grp_id}
+            rsp = self.session.put(req, data=json.dumps(payload), headers=headers)
+            self.assertEqual(rsp.status_code, 201) # created
+
+            # get obj and verify it has no attributes
+            req = self.endpoint + "/groups/" + grp_id
+            rsp = self.session.get(req, headers=headers)
+            self.assertEqual(rsp.status_code, 200)
+            rspJson = json.loads(rsp.text)
+            self.assertEqual(rspJson["attributeCount"], 0) # no attributes
+
+        # setup some attributes to write
+        attr_count = 4
+        attributes = {}
+        extent = 10
+        for i in range(attr_count):
+            value = [i * 10 + j for j in range(extent)]
+            data = {"type": "H5T_STD_I32LE", "shape": extent, "value": value}
+            attr_name = f"attr{i+1:04d}"
+            attributes[attr_name] = data
+
+        # write attributes to the dataset
+        data = {"attributes": attributes}
+        req = self.endpoint + "/datasets/" + dset_id + "/attributes"
+        rsp = self.session.put(req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 201)
+
+        # do a get on the attributes
+        params = {"IncludeData": 1}
+        rsp = self.session.get(req, params=params, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("attributes" in rspJson)
+        ret_attrs = rspJson["attributes"]
+        self.assertEqual(len(ret_attrs), attr_count)
+        for i in range(attr_count):
+            attr = ret_attrs[i]
+            self.assertTrue("name" in attr)
+            self.assertEqual(attr["name"], f"attr{i+1:04d}")
+            self.assertTrue("value" in attr)
+            attr_value = attr["value"]
+            self.assertEqual(len(attr_value), extent)
+            self.assertEqual(attr_value, [i * 10 + j for j in range(extent)])
+
+        # try writing again, should get 409
+        rsp = self.session.put(req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 409)
+
+        # write attributes to the three group objects
+        data = {"obj_ids": grp_ids, "attributes": attributes}
+        req = self.endpoint + "/groups/" + root_id + "/attributes"
+        rsp = self.session.put(req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 201)
+
+        # do a write with different attributes to different groups
+        attributes = {}
+        base_ord = ord('A')
+        for i in range(grp_count):
+            obj_id = grp_ids[i]
+            obj_attrs = {}
+            for j in range(i + 1):
+                value = [i * 10000 + (j + 1) * 100 + k for k in range(extent)]
+                data = {"type": "H5T_STD_I32LE", "shape": extent, "value": value}
+                attr_name = f"attr_{chr(base_ord + j)}"
+                obj_attrs[attr_name] = data
+            attributes[obj_id] = {"attributes": obj_attrs}
+
+        # write attributes to the three group objects
+        # attr_A to obj_id[0], attr_A and attr_B to obj_id[1], etc
+        data = {"obj_ids": attributes} # no "attributes" key this time
+        req = self.endpoint + "/groups/" + root_id + "/attributes"
+        rsp = self.session.put(req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 201)
+
+        # do a get attributes on the three group objects to verify
+        for i in range(grp_count):
+            grp_id = grp_ids[i]
+            # do a get on the attributes
+            params = {"IncludeData": 1}
+            req = self.endpoint + "/groups/" + grp_id + "/attributes"
+            rsp = self.session.get(req, params=params, headers=headers)
+            self.assertEqual(rsp.status_code, 200)
+            rspJson = json.loads(rsp.text)
+            self.assertTrue("attributes" in rspJson)
+            ret_attrs = rspJson["attributes"]
+            # expect the 4 attributes we wrote in the first post
+            # plus (i+1) in the second post
+            self.assertEqual(len(ret_attrs), attr_count + i + 1)
+            for j in range(len(ret_attrs)):
+                attr = ret_attrs[j]
+                self.assertTrue("name" in attr)
+                if j < attr_count:
+                    # should see attr0001, attr0002, etc.
+                    expected_name = f"attr{j + 1:04d}"
+                    expected_value = [j * 10 + k for k in range(extent)]
+                else:
+                    # should see attr_A, attr_B, etc.
+                    expected_name = f"attr_{chr(base_ord + j - attr_count)}"
+                    min_val = i * 10000 + (j + 1 - attr_count) * 100
+                    expected_value = [min_val + k for k in range(extent)]
+
+                self.assertEqual(attr["name"], expected_name)
+                self.assertTrue("value" in attr)
+                attr_value = attr["value"]
+                self.assertEqual(len(attr_value), extent)
+                self.assertEqual(attr_value, expected_value)
+
+        # try writing again, should get 409
+        req = self.endpoint + "/groups/" + root_id + "/attributes"
+        rsp = self.session.put(req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 409)
+
+    def testDeleteAttributesMultiple(self):
+        print("testDeleteAttributesMultiple", self.base_domain)
+        headers = helper.getRequestHeaders(domain=self.base_domain)
+        req = self.endpoint + "/"
+
+        attr_count = 10
+
+        # Get root uuid
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        root_uuid = rspJson["root"]
+        helper.validateId(root_uuid)
+
+        # create a subgroup
+        req = self.endpoint + "/groups"
+        rsp = self.session.post(req, headers=headers)
+        self.assertEqual(rsp.status_code, 201)
+        rspJson = json.loads(rsp.text)
+        grp_id = rspJson["id"]
+        self.assertTrue(helper.validateId(grp_id))
+
+        # link as "grp1"
+        grp_name = "grp1"
+        req = self.endpoint + "/groups/" + root_uuid + "/links/" + grp_name
+        payload = {"id": grp_id}
+        rsp = self.session.put(req, data=json.dumps(payload), headers=headers)
+        self.assertEqual(rsp.status_code, 201) # created
+
+        attr_names = []
+        # Create attributes
+        for i in range(attr_count):
+            attr_name = f"attr{i:04d}"
+            attr_names.append(attr_name)
+            value = [i]
+            data = {"type": "H5T_IEEE_F32LE", "shape": 1, "value": value}
+            req = self.endpoint + "/groups/" + grp_id + "/attributes/" + attr_name
+            rsp = self.session.put(req, data=json.dumps(data), headers=headers)
+            self.assertEqual(rsp.status_code, 201) # Created
+
+        # Delete all by parameter
+        separator = '/'
+        params = {"attr_names": separator.join(attr_names)}
+        req = self.endpoint + "/groups/" + grp_id + "/attributes"
+        rsp = self.session.delete(req, params=params, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+
+        # Attempt to read deleted attributes
+        for i in range(attr_count):
+            req = self.endpoint + "/groups/" + grp_id + "/attributes/" + attr_names[i]
+            rsp = self.session.get(req, headers=headers)
+            self.assertEqual(rsp.status_code, 404)
+
+        # Create another batch of attributes
+        for i in range(attr_count):
+            value = [i]
+            data = {"type": "H5T_IEEE_F32LE", "shape": 1, "value": value}
+            req = self.endpoint + "/groups/" + grp_id + "/attributes/" + attr_names[i]
+            rsp = self.session.put(req, data=json.dumps(data), headers=headers)
+            self.assertEqual(rsp.status_code, 201) # Created
+
+        # Delete with custom separator
+        separator = ':'
+        params = {"attr_names": separator.join(attr_names)}
+        params["separator"] = separator
+        req = self.endpoint + "/groups/" + grp_id + "/attributes"
+        rsp = self.session.delete(req, params=params, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+
+        # Attempt to read
+        for i in range(attr_count):
+            req = self.endpoint + "/groups/" + grp_id + "/attributes/" + attr_names[i]
+            rsp = self.session.get(req, headers=headers)
+            self.assertEqual(rsp.status_code, 404)
+
 
 if __name__ == "__main__":
     # setup test files
diff --git a/tests/integ/domain_test.py b/tests/integ/domain_test.py
index 670a02c3..e339aa6e 100755
--- a/tests/integ/domain_test.py
+++ b/tests/integ/domain_test.py
@@ -68,9 +68,8 @@ def testGetDomain(self):
         req = helper.getEndpoint() + "/"
         rsp = self.session.get(req, headers=headers)
         if rsp.status_code != 200:
-            print(
-                "WARNING: Failed to get domain: {}. Is test data setup?".format(domain)
-            )
+            msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?"
+            print(msg)
             return # abort rest of test
         self.assertEqual(rsp.headers["content-type"], "application/json; charset=utf-8")
         rspJson = json.loads(rsp.text)
@@ -146,7 +145,7 @@ def testGetDomain(self):
         attr_count = 0
         for objid in domain_objs:
             obj_json = domain_objs[objid]
-            self.assertFalse("attributeCount" in obj_json)
+            self.assertTrue("attributeCount" in obj_json)
             self.assertTrue("attributes" in obj_json)
             attributes = obj_json["attributes"]
             for attr_name in attributes:
@@ -198,9 +197,8 @@ def testGetByPath(self):
         req = helper.getEndpoint() + "/"
         rsp = self.session.get(req, headers=headers)
         if rsp.status_code != 200:
-            print(
-                "WARNING: Failed to get domain: {}. Is test data setup?".format(domain)
-            )
+            msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?"
+            print(msg)
             return # abort rest of test
         domainJson = json.loads(rsp.text)
         self.assertTrue("root" in domainJson)
@@ -919,9 +917,8 @@ def testDomainCollections(self):
 
         rsp = self.session.get(req, headers=headers)
         if rsp.status_code != 200:
-            print(
-                "WARNING: Failed to get domain: {}. Is test data setup?".format(domain)
-            )
+            msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?"
+            print(msg)
             return # abort rest of test
 
         rspJson = json.loads(rsp.text)
diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py
index 3d3f53ab..0de87d53 100755
--- a/tests/integ/value_test.py
+++ b/tests/integ/value_test.py
@@ -1660,9 +1660,8 @@ def testGet(self):
         req = helper.getEndpoint() + "/"
         rsp = self.session.get(req, headers=headers)
         if rsp.status_code != 200:
-            print(
-                "WARNING: Failed to get domain: {}. Is test data setup?".format(domain)
-            )
+            msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?"
+            print(msg)
             return # abort rest of test
         domainJson = json.loads(rsp.text)
         root_uuid = domainJson["root"]
diff --git a/tests/unit/array_util_test.py b/tests/unit/array_util_test.py
index 1695e82d..774f21b1 100644
--- a/tests/unit/array_util_test.py
+++ b/tests/unit/array_util_test.py
@@ -81,9 +81,7 @@ def testToTuple(self):
         self.assertEqual([(1, 0.1), (2, 0.2), (3, 0.3), (4, 0.4)], out)
         out = toTuple(2, data3d) # treat input as 2d array of two-field compound types
         self.assertEqual([[(0, 0.0), (1, 0.1)], [(2, 0.2), (3, 0.3)]], out)
-        out = toTuple(
-            1, data3d
-        ) # treat input a 1d array of compound type of compound types
+        out = toTuple(1, data3d) # treat input as a 1d array of compound type of compound types
         self.assertEqual([((0, 0.0), (1, 0.1)), ((2, 0.2), (3, 0.3))], out)
 
     def testGetNumElements(self):
@@ -91,9 +89,7 @@ def testGetNumElements(self):
         nelements = getNumElements(shape)
         self.assertEqual(nelements, 4)
 
-        shape = [
-            10,
-        ]
+        shape = [10,]
         nelements = getNumElements(shape)
         self.assertEqual(nelements, 10)
 
@@ -103,9 +99,7 @@ def testGetNumElements(self):
 
     def testJsonToArray(self):
         dt = np.dtype("i4")
-        shape = [
-            4,
-        ]
+        shape = [4, ]
         data = [0, 2, 4, 6]
         out = jsonToArray(shape, dt, data)
 
@@ -142,6 +136,17 @@ def testJsonToArray(self):
         e0 = out[0].tolist()
         self.assertEqual(e0, (6, b"six"))
 
+        # test ascii chars >127
+        dt = np.dtype("S26")
+        data = "extended ascii char 241: " + chr(241)
+        out = jsonToArray(shape, dt, data)
+        self.assertEqual(out[0], b'extended ascii char 241: \xc3')
+
+        dt = np.dtype("S12")
+        data = "eight: \u516b"
+        out = jsonToArray(shape, dt, data)
+        self.assertEqual(out[0], b'eight: \xe5\x85\xab')
+
         # VLEN ascii
         dt = special_dtype(vlen=bytes)
         data = [b"one", b"two", b"three", b"four", b"five"]
@@ -290,6 +295,33 @@ def testToBytes(self):
         arr_copy = bytesToArray(buffer, dt, (3,))
         self.assertTrue(ndarray_compare(arr, arr_copy))
 
+        # fixed length UTF8 string
+        dt = np.dtype("S10")
+        arr = np.asarray(b'eight: \xe5\x85\xab', dtype=dt)
+        buffer = arrayToBytes(arr)
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, ())
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+
+        # invalid UTF string
+        dt = np.dtype("S2")
+        arr = np.asarray(b'\xff\xfe', dtype=dt)
+        buffer = arrayToBytes(arr)
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, ())
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+
+        # invalid UTF string with base64 encoding
+        dt = np.dtype("S2")
+        arr = np.asarray(b'\xff\xfe', dtype=dt)
+        buffer = b'//4=' # this is the base64 encoding of b'\xff\xfe'
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, (), encoding="base64")
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+
         # Compound non-vlen
         dt = np.dtype([("x", "f8"), ("y", "i4")])
         arr = np.zeros((4,), dtype=dt)
@@ -457,6 +489,146 @@ def testToBytes(self):
         arr_copy = bytesToArray(buffer, dt, (4,))
         self.assertTrue(ndarray_compare(arr, arr_copy))
 
+    def testArrToBytesBase64(self):
+        # Simple array
+        dt = np.dtype("<i4")
+        arr = np.asarray((1, 2, 3, 4), dtype=dt)
+        expected_num_bytes = arr.size * arr.itemsize
+        buffer = arrayToBytes(arr, encoding="base64")
+        # base64 encoding expands the data, so the buffer is larger than the raw bytes
+        self.assertTrue(len(buffer) > expected_num_bytes)
+
+        # convert buffer back to arr
+        arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64")
+        self.assertTrue(np.array_equal(arr, arr_copy))
+
+        # fixed length string
+        dt = np.dtype("S8")
+        arr = np.asarray(("abcdefgh", "ABCDEFGH", "12345678"), dtype=dt)
+        buffer = arrayToBytes(arr, encoding="base64")
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, (3,), encoding="base64")
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+
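+        # the remaining cases below repeat the same base64 encode/decode round
+        # trip for compound and vlen dtypes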
+        # Compound non-vlen
+        dt = np.dtype([("x", "f8"), ("y", "i4")])
+        arr = np.zeros((4,), dtype=dt)
+        arr[0] = (3.12, 42)
+        arr[3] = (1.28, 69)
+        buffer = arrayToBytes(arr, encoding="base64")
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64")
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+
+        # VLEN of int32's
+        dt = np.dtype("O", metadata={"vlen": np.dtype("int32")})
+        arr = np.zeros((4,), dtype=dt)
+        arr[0] = np.int32([1, ])
+        arr[1] = np.int32([1, 2])
+        arr[2] = 0 # test uninitialized value
+        arr[3] = np.int32([1, 2, 3])
+        buffer = arrayToBytes(arr, encoding="base64")
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64")
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+
+        # VLEN of strings
+        dt = np.dtype("O", metadata={"vlen": str})
+        arr = np.zeros((5,), dtype=dt)
+        arr[0] = "one: \u4e00"
+        arr[1] = "two: \u4e8c"
+        arr[2] = "three: \u4e09"
+        arr[3] = "four: \u56db"
+        arr[4] = 0
+        buffer = arrayToBytes(arr, encoding="base64")
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, (5,), encoding="base64")
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+        # VLEN of bytes
+        dt = np.dtype("O", metadata={"vlen": bytes})
+        arr = np.zeros((5,), dtype=dt)
+        arr[0] = b"Parting"
+        arr[1] = b"is such"
+        arr[2] = b"sweet"
+        arr[3] = b"sorrow"
+        arr[4] = 0
+
+        buffer = arrayToBytes(arr, encoding="base64")
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, (5,), encoding="base64")
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+
+        #
+        # Compound str vlen
+        #
+        dt_vstr = np.dtype("O", metadata={"vlen": str})
+        dt = np.dtype([("x", "i4"), ("tag", dt_vstr), ("code", "S4")])
+        arr = np.zeros((4,), dtype=dt)
+        arr[0] = (42, "Hello", "X1")
+        arr[3] = (84, "Bye", "XYZ")
+        count = getByteArraySize(arr)
+        buffer = arrayToBytes(arr, encoding="base64")
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64")
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+
+        #
+        # Compound int vlen
+        #
+        dt_vint = np.dtype("O", metadata={"vlen": "int32"})
+        dt = np.dtype([("x", "int32"), ("tag", dt_vint)])
+        arr = np.zeros((4,), dtype=dt)
+        arr[0] = (42, np.array((), dtype="int32"))
+        arr[3] = (84, np.array((1, 2, 3), dtype="int32"))
+        count = getByteArraySize(arr)
+        self.assertEqual(count, 44)
+        buffer = arrayToBytes(arr, encoding="base64")
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64")
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+
+        #
+        # VLEN utf string with array type
+        #
+        dt_arr_str = np.dtype("(2,)O", metadata={"vlen": str})
+        dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)])
+        arr = np.zeros((4,), dtype=dt)
+        dt_str = np.dtype("O", metadata={"vlen": str})
+        arr[0] = (42, np.asarray(["hi", "bye"], dtype=dt_str))
+        arr[3] = (84, np.asarray(["hi-hi", "bye-bye"], dtype=dt_str))
+        buffer = arrayToBytes(arr, encoding="base64")
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64")
+
+        self.assertEqual(arr.dtype, arr_copy.dtype)
+        self.assertEqual(arr.shape, arr_copy.shape)
+        for i in range(4):
+            e = arr[i]
+            e_copy = arr_copy[i]
+            self.assertTrue(np.array_equal(e, e_copy))
+        #
+        # VLEN ascii with array type
+        #
+        dt_arr_str = np.dtype("(2,)O", metadata={"vlen": bytes})
+        dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)])
+        arr = np.zeros((4,), dtype=dt)
+        dt_str = np.dtype("O", metadata={"vlen": bytes})
+        arr[0] = (42, np.asarray([b"hi", b"bye"], dtype=dt_str))
+        arr[3] = (84, np.asarray([b"hi-hi", b"bye-bye"], dtype=dt_str))
+        buffer = arrayToBytes(arr, encoding="base64")
+
+        # convert back to array
+        arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64")
+        self.assertTrue(ndarray_compare(arr, arr_copy))
+
     def testArrayCompareInt(self):
         # Simple array
         dt = np.dtype("