From 4549ebdb702777ab689d764758c6c006ff2aff8c Mon Sep 17 00:00:00 2001 From: christophercarlon Date: Wed, 10 Jul 2024 21:01:08 +0100 Subject: [PATCH] Updates 2024-07-10 - Updated data catalogue list and added new get creds module --- .../lambda_jobs/get_creds.py | 34 ++++++++ herding_cats_pipelines/lambda_jobs/main.py | 86 ++++++++----------- 2 files changed, 69 insertions(+), 51 deletions(-) create mode 100644 herding_cats_pipelines/lambda_jobs/get_creds.py diff --git a/herding_cats_pipelines/lambda_jobs/get_creds.py b/herding_cats_pipelines/lambda_jobs/get_creds.py new file mode 100644 index 0000000..8c7cb93 --- /dev/null +++ b/herding_cats_pipelines/lambda_jobs/get_creds.py @@ -0,0 +1,34 @@ +import boto3 +import json +from loguru import logger + +def get_param(parameter_name: str, region_name: str = "eu-west-2") -> str: + """ + Retrieve a parameter from AWS Systems Manager Parameter Store. + """ + ssm = boto3.client('ssm', region_name=region_name) + try: + response = ssm.get_parameter(Name=parameter_name, WithDecryption=True) + return response['Parameter']['Value'] + except Exception as e: + logger.error(f"Unable to retrieve parameter: {str(e)}") + raise e + +def get_secret(secret_name: str, region_name: str = "eu-west-2") -> json: + """ + Create an AWS Secrets Manager client. + + Returns a JSON with env vars. 
+ """ + session = boto3.session.Session() + client = session.client(service_name='secretsmanager', + region_name=region_name) + + try: + get_secret_value_response = client.get_secret_value(SecretId=secret_name) + except Exception as e: + logger.error(f"Unable to retrieve secret: {str(e)}") + raise e + else: + secret = get_secret_value_response['SecretString'] + return json.loads(secret) \ No newline at end of file diff --git a/herding_cats_pipelines/lambda_jobs/main.py b/herding_cats_pipelines/lambda_jobs/main.py index 57f680e..9f6a3cc 100644 --- a/herding_cats_pipelines/lambda_jobs/main.py +++ b/herding_cats_pipelines/lambda_jobs/main.py @@ -6,66 +6,50 @@ from loguru import logger - -def get_param(parameter_name: str, region_name: str = "eu-west-2") -> str: - """ - Retrieve a parameter from AWS Systems Manager Parameter Store. - """ - ssm = boto3.client('ssm', region_name=region_name) - try: - response = ssm.get_parameter(Name=parameter_name, WithDecryption=True) - return response['Parameter']['Value'] - except Exception as e: - logger.error(f"Unable to retrieve parameter {parameter_name}: {str(e)}") - raise e - -def get_secret(secret_name: str, region_name: str = "eu-west-2") -> json: - """ - Create an AWS Secrets Manager client. - - Returns a JSON with env vars. - """ - session = boto3.session.Session() - client = session.client(service_name='secretsmanager', - region_name=region_name) - - try: - get_secret_value_response = client.get_secret_value(SecretId=secret_name) - except Exception as e: - logger.error(f"Unable to retrieve secret {secret_name}: {str(e)}") - raise e - else: - secret = get_secret_value_response['SecretString'] - return json.loads(secret) +from get_creds import get_param, get_secret +from urllib.parse import urlparse def lambda_handler(event, context) -> json: """ - AWS Lambda function to fetch data catalogue from London Datastore and dump it to S3 + AWS Lambda function to fetch UK open data catalogues. 
+ + Loop through links and dump catalogue data to s3 bucket. """ + + catalogues_list = [ + "https://data.london.gov.uk/api/action/package_search", + "https://opendata.bristol.gov.uk/api/feed/dcat-ap/2.1.1.json" + ] + try: + # Fetch aws params and secrets secret_name = get_param("herding_cats_param") secret = get_secret(secret_name) bucket_name = secret["herding_cats_raw_data_bucket"] - url = "https://data.london.gov.uk/api/action/package_search" - response = requests.get(url) - response.raise_for_status() - data = response.json() - print("Data Successfully Fetched") - - # Dump data to S3 - s3 = boto3.client('s3') - bucket_name = bucket_name - file_name = 'london_datastore.json' + # Loop through the links + for link in catalogues_list: + response = requests.get(link, timeout=15) + response.raise_for_status() + data = response.json() + logger.success(f"Data Successfully Fetched for {link}") + + # Extract domain name from the link + domain = urlparse(link).netloc + + # Use domain as file name + file_name = f"{domain}.json" + + # Dump data to S3 + s3 = boto3.client('s3') + s3.put_object( + Bucket=bucket_name, + Key=file_name, + Body=json.dumps(data), + ContentType='application/json' + ) + logger.success(f"Data Was Successfully Dumped to {file_name}") - s3.put_object( - Bucket=bucket_name, - Key=file_name, - Body=json.dumps(data), - ContentType='application/json' - ) - logger.success("Data Was Successfully Dumped...") - return { 'statusCode': 200, 'body': json.dumps({'message': 'Data successfully fetched and dumped to S3'}) @@ -81,4 +65,4 @@ def lambda_handler(event, context) -> json: return { 'statusCode': 500, 'body': json.dumps({'error': f'S3 dump error: {str(e)}'}) - } \ No newline at end of file + }