Skip to content

Commit

Permalink
mdp2solr and document updates
Browse files Browse the repository at this point in the history
  • Loading branch information
fils committed Jan 20, 2024
1 parent 5c1e543 commit 2bd5d5f
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 27 deletions.
7 changes: 6 additions & 1 deletion archinterfaces/ODIS-WIS2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,9 @@ ROA is most notably realized in the design of RESTful APIs, which are intended t
* requirements: https://wmo-im.github.io/wcmp2/standard/wcmp2-DRAFT.html#_overview_2
* https://github.com/wmo-im/wis2-gdc
* TODO: SHACL for Table 2. WCMP record core properties
* mqtt vs pygeoapi
* mqtt vs pygeoapi

Needs:
- boilerplate about ODIS and the vision of the interop with OIH and WIS2
- workflow image (need to chat with them about how much to show in that)
- https://docs.ogc.org/DRAFTS/20-004.html#clause-crawlable-catalog
1 change: 1 addition & 0 deletions book/thematics/depth/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ The guidance:
For external references to geospatial metadata:

- Best to import and add to your own JSON-LD, so you can validate before pushing to ODIS (don’t want an upstream error to corrupt an otherwise valid record)
- Min max depth URL examples: http://vocab.nerc.ac.uk/collection/P01/current/MAXWDIST/ and http://vocab.nerc.ac.uk/collection/P01/current/MINWDIST/


CODE BLOCK [x] - Marine Regions reference
Expand Down
10 changes: 8 additions & 2 deletions graphOps/extraction/mdp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ for s3 something like the following can be used assuming you have set the enviro
MINIO_ACCESS_KEY and MINIO_SECRET_KEY.

```Bash
python mdp.py --source "s3://nas.local:54321/public/graphs/test1/africaioc_release.nq" --output "./output/test.parquet"
python mdp.py --source "s3://nas.local:49153/public/graphs/test1/africaioc_release.nq" --output "./output/test.parquet"
```

Can now save to s3 as well:

```Bash
python mdp.py --source "s3://nas.local:54321/public/graphs/test1/africaioc_release.nq" --output "s3://nas.local:54321/public/graphs/products/africaioc.parquet"
python mdp.py --source "s3://nas.local:49153/public/graphs/test1/africaioc_release.nq" --output "s3://nas.local:54321/public/graphs/products/africaioc.parquet"
```

At present, I only support Parquet and CSV output based on the file extension.
Expand All @@ -41,3 +41,9 @@ At present, I only support Parquet and CSV output based on the file extension.
```Bash
python mdp2Solr.py --source ./output/cioos.parquet --outputdir ./output/solr
```

## MDP2Solr

```Bash
python mdp2solr.py --source "s3://nas.local:49153/public/assets/africaioc.parquet" --outputdir "./output/solr"
```
19 changes: 15 additions & 4 deletions graphOps/extraction/mdp/defs/readobject.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
import os
import boto3
from minio import Minio


def getBytes(source):
    """Fetch the object referenced by an s3:// URL and return its raw bytes.

    Credentials are read from the MINIO_ACCESS_KEY / MINIO_SECRET_KEY
    environment variables; the endpoint host:port comes from the URL itself.
    """
    endpoint, bucket_name, object_name = parse_s3_url(source)

    secret_key = os.getenv("MINIO_SECRET_KEY")
    access_key = os.getenv("MINIO_ACCESS_KEY")

    # Plain-HTTP MinIO client built from the parsed endpoint and env credentials.
    client = Minio(endpoint, access_key, secret_key, secure=False)
    response = client.get_object(bucket_name, object_name)
    return response.read()

def reads3url(source):
    """Read the object referenced by an s3:// URL and return its contents
    as a string (via read_object_to_string).

    Credentials are read from the MINIO_SECRET_KEY / MINIO_ACCESS_KEY
    environment variables; the endpoint host:port is parsed from the URL
    rather than hard-coded, so any MinIO endpoint works.
    """
    url, bucket, obj = parse_s3_url(source)

    sk = os.getenv("MINIO_SECRET_KEY")
    ak = os.getenv("MINIO_ACCESS_KEY")

    # Create client with access and secret key.
    # (Removed a stale, dead client construction hard-coded to
    # "nas.lan:49153" that was immediately overwritten by this line.)
    mc = Minio(url, ak, sk, secure=False)
    d = read_object_to_string(mc, bucket, obj)

    return d


def read_object_to_string(mc, bucket_name, object_name):
try:
data = mc.get_object(bucket_name, object_name)
Expand All @@ -23,7 +36,6 @@ def read_object_to_string(mc, bucket_name, object_name):
except Exception as e:
print(e)


def parse_s3_url(s3_url):
protocol, url = s3_url.split("://")
if protocol != 's3':
Expand All @@ -34,5 +46,4 @@ def parse_s3_url(s3_url):
bucket_name = split_url[1]
object_path = "/".join(split_url[2:])


return server_url, bucket_name, object_path
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import argparse
import os
import os, io
import sys
import numpy as np
import pandas as pd
Expand All @@ -8,6 +8,8 @@
import pyarrow.parquet as pq
import s3fs

from defs import readobject

# Master Data Product to Solr

def main():
Expand All @@ -25,23 +27,6 @@ def main():
print("Error: the --outputdir argument is required")
sys.exit(1)

# ---------------------------------------------------------------------
# make this part of an "if s3://" block?
session = boto3.Session(
aws_access_key_id='YOUR_ACCESS_KEY_ID',
aws_secret_access_key='YOUR_SECRET_ACCESS_KEY'
)

# s3 = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'https://your-custom-endpoint'})
s3 = s3fs.S3FileSystem(session=session)

file_path = 's3://your-bucket/your-parquet-file.parquet'

with s3.open(file_path, 'rb') as f:
df = pd.read_parquet(f)

# ---------------------------------------------------------------------

u = args.source
od = args.outputdir

Expand All @@ -55,10 +40,14 @@ def main():
os.makedirs(od)

# Load the master data product from ODIS
mf = pd.read_parquet(u)
# mf = pd.read_parquet(u)
b = readobject.getBytes(u)
table = pq.read_table(io.BytesIO(b))

for index, row in mf.iterrows():
# Convert to pandas dataframe
mf = table.to_pandas()

for index, row in mf.iterrows():
data = ObjDict()

# not in arrays
Expand Down
File renamed without changes.
3 changes: 3 additions & 0 deletions graphOps/extraction/mdp/notes_todo.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# TODO

* Add the columns "completeness" and "accreditation" to the products

0 comments on commit 2bd5d5f

Please sign in to comment.