Skip to content

Commit

Permalink
mdp2solr and document updates
Browse files Browse the repository at this point in the history
  • Loading branch information
fils committed Jan 20, 2024
1 parent 5c1e543 commit 2bd5d5f
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 27 deletions.
7 changes: 6 additions & 1 deletion archinterfaces/ODIS-WIS2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,9 @@ ROA is most notably realized in the design of RESTful APIs, which are intended t
* requirements: https://wmo-im.github.io/wcmp2/standard/wcmp2-DRAFT.html#_overview_2
* https://github.com/wmo-im/wis2-gdc
* TODO: SHACL for Table 2. WCMP record core properties
* mqtt vs pygeoapi
* mqtt vs pygeoapi

Needs:
- boilerplate about ODIS and the vision of the interop with OIH and WIS2
- workflow image (need to chat with them about how much to show in that)
- https://docs.ogc.org/DRAFTS/20-004.html#clause-crawlable-catalog
1 change: 1 addition & 0 deletions book/thematics/depth/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ The guidance:
For external references to geospatial metadata:

- Best to import and add to your own JSON-LD, so you can validate before pushing to ODIS (don’t want an upstream error to corrupt an otherwise valid record)
- Min max depth URL examples: http://vocab.nerc.ac.uk/collection/P01/current/MAXWDIST/ and http://vocab.nerc.ac.uk/collection/P01/current/MINWDIST/


CODE BLOCK [x] - Marine Regions reference
Expand Down
10 changes: 8 additions & 2 deletions graphOps/extraction/mdp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ for s3 something like the following can be used assuming you have set the enviro
MINIO_ACCESS_KEY and MINIO_SECRET_KEY.

```Bash
python mdp.py --source "s3://nas.local:54321/public/graphs/test1/africaioc_release.nq" --output "./output/test.parquet"
python mdp.py --source "s3://nas.local:49153/public/graphs/test1/africaioc_release.nq" --output "./output/test.parquet"
```

Can now save to s3 as well:

```Bash
python mdp.py --source "s3://nas.local:54321/public/graphs/test1/africaioc_release.nq" --output "s3://nas.local:54321/public/graphs/products/africaioc.parquet"
python mdp.py --source "s3://nas.local:49153/public/graphs/test1/africaioc_release.nq" --output "s3://nas.local:54321/public/graphs/products/africaioc.parquet"
```

At present, I only support Parquet and CSV output based on the file extension.
Expand All @@ -41,3 +41,9 @@ At present, I only support Parquet and CSV output based on the file extension.
```Bash
python mdp2Solr.py --source ./output/cioos.parquet --outputdir ./output/solr
```

## MDP2Solr

```Bash
python mdp2solr.py --source "s3://nas.local:49153/public/assets/africaioc.parquet" --outputdir "./output/solr"
```
19 changes: 15 additions & 4 deletions graphOps/extraction/mdp/defs/readobject.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
import os
import boto3
from minio import Minio


def getBytes(source):
    """Fetch the object referenced by an s3:// URL and return its raw bytes.

    Credentials are read from the MINIO_ACCESS_KEY / MINIO_SECRET_KEY
    environment variables; the endpoint host:port comes from the URL itself.
    """
    endpoint, bucket_name, object_name = parse_s3_url(source)

    secret_key = os.getenv("MINIO_SECRET_KEY")
    access_key = os.getenv("MINIO_ACCESS_KEY")

    # Plain-HTTP MinIO client built from the parsed endpoint and env credentials.
    client = Minio(endpoint, access_key, secret_key, secure=False)
    response = client.get_object(bucket_name, object_name)
    return response.read()

def reads3url(source):
    """Read the object referenced by an s3:// URL and return its contents
    as a string (via read_object_to_string).

    Credentials are read from the MINIO_SECRET_KEY / MINIO_ACCESS_KEY
    environment variables; the endpoint host:port is parsed from the URL
    rather than hard-coded, so any MinIO endpoint works.
    """
    url, bucket, obj = parse_s3_url(source)

    sk = os.getenv("MINIO_SECRET_KEY")
    ak = os.getenv("MINIO_ACCESS_KEY")

    # Create client with access and secret key.
    # (Removed a stale, dead client construction hard-coded to
    # "nas.lan:49153" that was immediately overwritten by this line.)
    mc = Minio(url, ak, sk, secure=False)
    d = read_object_to_string(mc, bucket, obj)

    return d


def read_object_to_string(mc, bucket_name, object_name):
try:
data = mc.get_object(bucket_name, object_name)
Expand All @@ -23,7 +36,6 @@ def read_object_to_string(mc, bucket_name, object_name):
except Exception as e:
print(e)


def parse_s3_url(s3_url):
protocol, url = s3_url.split("://")
if protocol != 's3':
Expand All @@ -34,5 +46,4 @@ def parse_s3_url(s3_url):
bucket_name = split_url[1]
object_path = "/".join(split_url[2:])


return server_url, bucket_name, object_path
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import argparse
import os
import os, io
import sys
import numpy as np
import pandas as pd
Expand All @@ -8,6 +8,8 @@
import pyarrow.parquet as pq
import s3fs

from defs import readobject

# Master Data Product to Solr

def main():
Expand All @@ -25,23 +27,6 @@ def main():
print("Error: the --outputdir argument is required")
sys.exit(1)

# ---------------------------------------------------------------------
# make this part of an "if s3://" block?
session = boto3.Session(
aws_access_key_id='YOUR_ACCESS_KEY_ID',
aws_secret_access_key='YOUR_SECRET_ACCESS_KEY'
)

# s3 = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'https://your-custom-endpoint'})
s3 = s3fs.S3FileSystem(session=session)

file_path = 's3://your-bucket/your-parquet-file.parquet'

with s3.open(file_path, 'rb') as f:
df = pd.read_parquet(f)

# ---------------------------------------------------------------------

u = args.source
od = args.outputdir

Expand All @@ -55,10 +40,14 @@ def main():
os.makedirs(od)

# Load the master data product from ODIS
mf = pd.read_parquet(u)
# mf = pd.read_parquet(u)
b = readobject.getBytes(u)
table = pq.read_table(io.BytesIO(b))

for index, row in mf.iterrows():
# Convert to pandas dataframe
mf = table.to_pandas()

for index, row in mf.iterrows():
data = ObjDict()

# not in arrays
Expand Down
File renamed without changes.
3 changes: 3 additions & 0 deletions graphOps/extraction/mdp/notes_todo.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# TODO

* Add the columns "completeness" and "accreditation" to the products

0 comments on commit 2bd5d5f

Please sign in to comment.