diff --git a/server/cp/ai/__init__.py b/server/cp/ai/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/server/cp/ai/semaphore.py b/server/cp/ai/semaphore.py
new file mode 100644
index 00000000..df4675b2
--- /dev/null
+++ b/server/cp/ai/semaphore.py
@@ -0,0 +1,425 @@
+import os
+import logging
+import requests
+import xml.etree.ElementTree as ET
+from flask import current_app, abort
+from superdesk.text_checkers.ai.base import AIServiceBase
+import traceback
+import json
+
+
+logger = logging.getLogger(__name__)
+session = requests.Session()
+
+TIMEOUT = (5, 30)
+
+
+class Semaphore(AIServiceBase):
+    """Semaphore autotagging service
+
+    Environment variables SEMAPHORE_BASE_URL, SEMAPHORE_ANALYZE_URL, SEMAPHORE_SEARCH_URL, SEMAPHORE_GET_PARENT_URL and SEMAPHORE_API_KEY must be set.
+    """
+
+    name = "semaphore"
+    label = "Semaphore autotagging service"
+
+    def __init__(self, app=None):
+        # SEMAPHORE_BASE_URL OR TOKEN_ENDPOINT Goes Here
+        self.base_url = os.getenv("SEMAPHORE_BASE_URL")
+
+        # SEMAPHORE_ANALYZE_URL Goes Here
+        self.analyze_url = os.getenv("SEMAPHORE_ANALYZE_URL")
+
+        # SEMAPHORE_API_KEY Goes Here
+        self.api_key = os.getenv("SEMAPHORE_API_KEY")
+
+        # SEMAPHORE_SEARCH_URL Goes Here
+        self.search_url = os.getenv("SEMAPHORE_SEARCH_URL")
+
+        # SEMAPHORE_GET_PARENT_URL Goes Here
+        self.get_parent_url = os.getenv("SEMAPHORE_GET_PARENT_URL")
+
+    def convert_to_desired_format(input_data):
+        result = {
+            "result": {
+                "tags": {
+                    "subject": input_data["subject"],
+                    "organisation": input_data["organisation"],
+                    "person": input_data["person"],
+                    "event": input_data["event"],
+                    "place": input_data["place"],
+                    "object": [],  # Assuming no data for 'object'
+                },
+                "broader": {"subject": input_data["broader"]},
+            }
+        }
+
+        return result
+
+    def get_access_token(self):
+        """Get access token for Semaphore."""
+        url = self.base_url
+
+        payload = f"grant_type=apikey&key={self.api_key}"
+        headers = {"Content-Type": "application/x-www-form-urlencoded"}
+        response = session.post(url, headers=headers, data=payload, timeout=TIMEOUT)
+        response.raise_for_status()
+        return response.json().get("access_token")
+
+    def fetch_parent_info(self, qcode):
+        headers = {"Authorization": f"Bearer {self.get_access_token()}"}
+        try:
+            frank = "?relationshipType=has%20broader"
+
+            query = qcode
+            parent_url = self.get_parent_url + query + frank
+
+            response = session.get(parent_url, headers=headers)
+            response.raise_for_status()
+            root = ET.fromstring(response.text)
+            path = root.find(".//PATH[@TYPE='Narrower Term']")
+            parent_info = []
+            if path is not None:
+                for field in path.findall("FIELD"):
+                    if field.find("CLASS").get("NAME") == "Topic":
+                        parent_info.append(
+                            {
+                                "name": field.get("NAME"),
+                                "qcode": field.get("ID"),
+                                "parent": None,  # Set to None initially
+                            }
+                        )
+            return parent_info, parent_info[::-1]
+
+        except Exception as e:
+            logger.error(f"Error fetching parent info: {str(e)}")
+            return []
+
+    def analyze_2(self, html_content: str) -> dict:
+        try:
+            if not self.base_url or not self.api_key:
+                logger.warning(
+                    "Semaphore Search is not configured properly, can't analyze content"
+                )
+                return {}
+
+            query = html_content["searchString"]
+
+            new_url = self.search_url + query + ".json"
+
+            # Make a GET request to the Semaphore search endpoint
+            headers = {"Authorization": f"bearer {self.get_access_token()}"}
+
+            try:
+                response = session.get(new_url, headers=headers)
+
+                response.raise_for_status()
+            except Exception as e:
+                traceback.print_exc()
+                logger.error(f"An error occurred while making the request: {str(e)}")
request: {str(e)}") + + root = response.text + + # def transform_xml_response(xml_data): + def transform_xml_response(api_response): + result = { + "subject": [], + "organisation": [], + "person": [], + "event": [], + "place": [], + "broader": [], + } + + # Process each termHint item in the API response + for item in api_response["termHints"]: + scheme_url = "http://cv.cp.org/" + + if "Organization" in item["classes"]: + scheme_url = "http://cv.cp.org/Organizations/" + category = "organisation" + elif "People" in item["classes"]: + scheme_url = "http://cv.cp.org/People/" + category = "person" + elif "Event" in item["classes"]: + scheme_url = "http://cv.cp.org/Events/" + category = "event" + elif "Place" in item["classes"]: + scheme_url = "http://cv.cp.org/Places/" + category = "place" + else: + # For 'subject', a different scheme might be used + category = "subject" + scheme_url = "http://cv.iptc.org/newscodes/mediatopic/" + + entry = { + "name": item["name"], + "qcode": item["id"], + "source": "Semaphore", + "altids": {"source_name": "source_id"}, + "original_source": "original_source_value", + "scheme": scheme_url, + "parent": None, # Initial parent assignment + } + + # Assign to correct category based on class + if "Organization" in item["classes"]: + result["organisation"].append(entry) + elif "People" in item["classes"]: + result["person"].append(entry) + elif "Event" in item["classes"]: + result["event"].append(entry) + elif "Place" in item["classes"]: + result["place"].append(entry) + else: + # Fetch parent info for each subject item + parent_info, reversed_parent_info = self.fetch_parent_info( + item["id"] + ) + + # Assign the immediate parent to the subject item + if parent_info: + entry["parent"] = reversed_parent_info[0][ + "qcode" + ] # Immediate parent is the first in the list + entry["scheme"] = "http://cv.iptc.org/newscodes/mediatopic/" + + result["subject"].append(entry) + + # Process broader items using reversed_parent_info + for i in range(len(reversed_parent_info)): + broader_entry = { + "name": reversed_parent_info[i]["name"], + "qcode": reversed_parent_info[i]["qcode"], + "parent": reversed_parent_info[i + 1]["qcode"] + if i + 1 < len(reversed_parent_info) + else None, + "source": "Semaphore", + "altids": {"source_name": "source_id"}, + "original_source": "original_source_value", + "scheme": "http://cv.iptc.org/newscodes/mediatopic/", + } + result["broader"].append(broader_entry) + + return result + + def convert_to_desired_format(input_data): + result = { + "result": { + "tags": { + "subject": input_data["subject"], + "organisation": input_data["organisation"], + "person": input_data["person"], + "event": input_data["event"], + "place": input_data["place"], + "object": [], # Assuming no data for 'object' + }, + "broader": {"subject": input_data["broader"]}, + } + } + + return result + + root = json.loads(root) + json_response = transform_xml_response(root) + + json_response = convert_to_desired_format(json_response) + + return json_response + + except requests.exceptions.RequestException as e: + traceback.print_exc() + logger.error( + f"Semaphore Search request failed. 
+
+    def analyze(self, html_content: str, tags=None) -> dict:
+        try:
+            if not self.base_url or not self.api_key:
+                logger.warning(
+                    "Semaphore is not configured properly, can't analyze content"
+                )
+                return {}
+
+            try:
+                for key, value in html_content.items():
+                    if key == "searchString":
+                        print(
+                            "______________________________________---------------------------------------"
+                        )
+                        print("Running for Search")
+
+                        self.output = self.analyze_2(html_content)
+                        return self.output
+
+            except TypeError:
+                pass
+
+            # Convert HTML to XML
+            xml_payload = self.html_to_xml(html_content)
+
+            payload = {"XML_INPUT": xml_payload}
+
+            # Make a POST request using XML payload
+            headers = {"Authorization": f"bearer {self.get_access_token()}"}
+
+            logger.info(
+                "REQUEST url=%s headers=%s payload=%s",
+                self.analyze_url,
+                headers,
+                payload,
+            )
+
+            try:
+                response = session.post(self.analyze_url, headers=headers, data=payload)
+                response.raise_for_status()
+            except Exception as e:
+                traceback.print_exc()
+                logger.error(f"An error occurred while making the request: {str(e)}")
+
+            root = response.text
+
+            def transform_xml_response(xml_data):
+                # Parse the XML data
+                root = ET.fromstring(xml_data)
+
+                # Initialize a dictionary to hold the transformed data
+                response_dict = {
+                    "subject": [],
+                    "organisation": [],
+                    "person": [],
+                    "event": [],
+                    "place": [],
+                }
+
+                # Temporary storage for path labels and GUIDs
+                path_labels = {}
+                path_guids = {}
+
+                # Helper function to add data to the dictionary if it's not a duplicate and has a qcode
+                def add_to_dict(group, tag_data):
+                    if tag_data["qcode"] and tag_data not in response_dict[group]:
+                        response_dict[group].append(tag_data)
+
+                # Iterate through the XML elements and populate the dictionary
+                for element in root.iter():
+                    if element.tag == "META":
+                        meta_name = element.get("name")
+                        meta_value = element.get("value")
+                        meta_score = element.get("score")
+                        meta_id = element.get("id")
+
+                        # Process 'Media Topic_PATH_LABEL' and 'Media Topic_PATH_GUID'
+                        if meta_name == "Media Topic_PATH_LABEL":
+                            path_labels[meta_score] = meta_value.split("/")[1:]
+                        elif meta_name == "Media Topic_PATH_GUID":
+                            path_guids[meta_score] = meta_value.split("/")[1:]
+
+                        # Process other categories
+                        else:
+                            group = None
+                            if "Organization" in meta_name:
+                                group = "organisation"
+                                scheme_url = "http://cv.cp.org/Organizations/"
+                            elif "Person" in meta_name:
+                                group = "person"
+                                scheme_url = "http://cv.cp.org/People/"
+                            elif "Event" in meta_name:
+                                group = "event"
+                                scheme_url = "http://cv.cp.org/Events/"
+                            elif "Place" in meta_name:
+                                group = "place"
+                                scheme_url = "http://cv.cp.org/Places/"
+
+                            if group:
+                                tag_data = {
+                                    "name": meta_value,
+                                    "qcode": meta_id if meta_id else "",
+                                    "source": "Semaphore",
+                                    "altids": {"source_name": "source_id"},
+                                    "original_source": "original_source_value",
+                                    "scheme": scheme_url,
+                                }
+                                add_to_dict(group, tag_data)
+
+                # Match path labels with path GUIDs based on scores
+                for score, labels in path_labels.items():
+                    guids = path_guids.get(score, [])
+                    if len(labels) != len(guids):
+                        continue  # Skip if there's a mismatch in the number of labels and GUIDs
+
+                    parent_qcode = None  # Track the parent qcode
+                    for label, guid in zip(labels, guids):
+                        tag_data = {
+                            "name": label,
+                            "qcode": guid,
+                            "parent": parent_qcode,
+                            "source": "Semaphore",
+                            "altids": {"source_name": "source_id"},
+                            "original_source": "original_source_value",
+                            "scheme": "http://cv.iptc.org/newscodes/mediatopic/",
+                        }
add_to_dict("subject", tag_data) + parent_qcode = ( + guid # Update the parent qcode for the next iteration + ) + + return response_dict + + json_response = transform_xml_response(root) + + return json_response + + except requests.exceptions.RequestException as e: + traceback.print_exc() + logger.error( + f"Semaphore request failed. We are in analyze RequestError exception: {str(e)}" + ) + + except Exception as e: + traceback.print_exc() + logger.error(f"An error occurred. We are in analyze exception: {str(e)}") + + def html_to_xml(self, html_content: str) -> str: + def clean_html_content(input_str): + # Remove full HTML tags using regular expressions + your_string = input_str.replace("

", "") + your_string = your_string.replace("

", "") + your_string = your_string.replace("
", "") + your_string = your_string.replace(" ", "") + your_string = your_string.replace("&", "") + your_string = your_string.replace("<>", "") + + return your_string + + xml_template = """ + + + <?xml version="1.0" encoding="UTF-8"?> + <story> + <headline>{}</headline> + <headline_extended>{}</headline_extended> + <body_html>{}</body_html> + <slugline>{}</slugline> + </story> + + + + """ + + body_html = html_content["body_html"] + headline = html_content["headline"] + headline_extended = html_content["abstract"] + slugline = html_content["slugline"] + + # Embed the 'body_html' into the XML template + xml_output = xml_template.format( + headline, headline_extended, body_html, slugline + ) + xml_output = clean_html_content(xml_output) + + return xml_output + + +def init_app(app): + Semaphore(app) diff --git a/server/cp/output/__init__.py b/server/cp/output/__init__.py index 61d8ad06..febfc623 100644 --- a/server/cp/output/__init__.py +++ b/server/cp/output/__init__.py @@ -3,6 +3,8 @@ from .formatter.jimi import JimiFormatter # noqa from .formatter.newsmlg2 import CPNewsMLG2Formatter # noqa +from .formatter.cp_ninjs_formatter import CPNINJSFormatter # noqa +from .formatter.semaphore import SemaphoreFormatter # noqa from superdesk.publish.publish_service import PublishService, set_publish_service diff --git a/server/cp/output/formatter/cp_ninjs_formatter.py b/server/cp/output/formatter/cp_ninjs_formatter.py new file mode 100644 index 00000000..28fecfff --- /dev/null +++ b/server/cp/output/formatter/cp_ninjs_formatter.py @@ -0,0 +1,91 @@ +from superdesk.publish.formatters.ninjs_formatter import ( + NINJSFormatter, + filter_empty_vals, + get_locale_name, +) + + +def format_cv_item(item, language): + """Format item from controlled vocabulary for output.""" + if item.get("scheme") == "subject": + + return filter_empty_vals( + { + "code": item.get("qcode"), + "name": get_locale_name(item, language), + "scheme": "http://cv.iptc.org/newscodes/mediatopic/", + } + ) + else: + + return filter_empty_vals( + { + "code": item.get("qcode"), + "name": get_locale_name(item, language), + "scheme": item.get("scheme"), + } + ) + + +class CPNINJSFormatter(NINJSFormatter): + type = "cpninjs" + name = "CP NINJS" + + def _transform_to_ninjs(self, article, subscriber, recursive=True): + ninjs = super()._transform_to_ninjs(article, subscriber, recursive=recursive) + + if ( + article.get("subject") + or article.get("organisation") + or article.get("place") + or article.get("event") + or article.get("person") + ): + combined_subjects = ( + self._get_subject(article) + + self._get_organisation(article) + + self._get_place(article) + + self._get_event(article) + + self._get_person(article) + ) + ninjs["subject"] = combined_subjects + + return ninjs + + def _get_subject(self, article): + """Get subject list for article.""" + return [ + format_cv_item(item, article.get("language", "")) + for item in article.get("subject", []) + ] + + # Updated Code here to fetch Organisations from Article + def _get_organisation(self, article): + return [ + format_cv_item(item, article.get("language", "")) + for item in article.get("organisation", []) + ] + + # Updated Code here to fetch Places from Article + def _get_place(self, article): + """Get place list for article.""" + return [ + format_cv_item(item, article.get("language", "")) + for item in article.get("place", []) + ] + + # Updated Code here to fetch Events from Article + def _get_event(self, article): + """Get event list for article.""" + return [ + format_cv_item(item, 
article.get("language", "")) + for item in article.get("event", []) + ] + + # Updated Code here to fetch Person from Article + def _get_person(self, article): + """Get person list for article.""" + return [ + format_cv_item(item, article.get("language", "")) + for item in article.get("person", []) + ] diff --git a/server/cp/output/formatter/semaphore.py b/server/cp/output/formatter/semaphore.py new file mode 100644 index 00000000..b4a17bdf --- /dev/null +++ b/server/cp/output/formatter/semaphore.py @@ -0,0 +1,27 @@ +import logging +from superdesk.text_utils import get_text +from .cp_ninjs_formatter import CPNINJSFormatter +from cp.ai.semaphore import Semaphore # Import the Semaphore integration class + +logger = logging.getLogger(__name__) + + +class SemaphoreFormatter(CPNINJSFormatter): + def can_format(self, format_type, article): + return format_type.lower() == "semaphore" and article.get("type") == "text" + + def _transform_to_ninjs(self, article, subscriber, recursive=True): + semaphore = Semaphore() # Initialize the Semaphore integration + formatted_data = {} # Define how you want to format the data for Semaphore + + try: + # Example: format the data + formatted_data["uuid"] = article["guid"] + formatted_data["headline"] = get_text(article["headline"]) + # Add more formatting logic here + + except Exception as e: + logger.error(f"Error formatting data for Semaphore: {str(e)}") + formatted_data = {} # Return an empty dictionary in case of an error + + return formatted_data diff --git a/server/cp/output/transmitters/semaphore.py b/server/cp/output/transmitters/semaphore.py new file mode 100644 index 00000000..5f713ec0 --- /dev/null +++ b/server/cp/output/transmitters/semaphore.py @@ -0,0 +1,19 @@ +from flask import current_app, json + +from superdesk.publish import register_transmitter +from superdesk.publish.publish_service import PublishService +from superdesk.text_checkers.ai.semaphore import ( + Semaphore, +) # Import the Semaphore integration class + + +class SemaphoreTransmitter(PublishService): + def _transmit(self, queue_item, subscriber): + semaphore = Semaphore(current_app) # Initialize the Semaphore integration + item = json.loads(queue_item["formatted_item"]) + # Modify this part to transmit the item using the Semaphore integration + semaphore.transmit(item) + + +# Register the Semaphore transmitter +register_transmitter("semaphore", SemaphoreTransmitter(), []) diff --git a/server/requirements.in b/server/requirements.in index 3e8e4575..073a2556 100755 --- a/server/requirements.in +++ b/server/requirements.in @@ -4,6 +4,6 @@ python3-saml>=1.9,<1.10 python-xmp-toolkit>=2.0.1,<2.1 num2words==0.5.10 -git+https://github.com/superdesk/superdesk-core.git@develop#egg=superdesk-core -git+https://github.com/superdesk/superdesk-planning.git@develop#egg=superdesk-planning +git+https://github.com/superdesk/superdesk-core.git@v2.6.7#egg=superdesk-core +git+https://github.com/superdesk/superdesk-planning.git@v2.6.2#egg=superdesk-planning git+https://github.com/superdesk/superdesk-analytics.git@v2.6.0-rc1#egg=superdesk-analytics diff --git a/server/requirements.txt b/server/requirements.txt index 414d4a68..c2bb8706 100644 --- a/server/requirements.txt +++ b/server/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile requirements.in # -amqp==5.1.1 +amqp==5.2.0 # via kombu arrow==0.13.0 # via @@ -14,7 +14,7 @@ async-timeout==4.0.3 # via redis authlib==0.14.3 # via superdesk-core -babel==2.12.1 +babel==2.14.0 # via flask-babel bcrypt==3.1.7 # via superdesk-core @@ -26,32 +26,34 @@ blinker==1.4 # 
flask-mail # raven # superdesk-core -boto3==1.28.45 +boto3==1.34.21 # via superdesk-core -botocore==1.31.45 +botocore==1.34.21 # via # boto3 # s3transfer -cachetools==5.3.1 +cachetools==5.3.2 # via flask-oidc-ex celery[redis]==5.2.7 - # via superdesk-core + # via + # celery + # superdesk-core cerberus==1.3.5 # via # eve # superdesk-core -certifi==2023.7.22 +certifi==2023.11.17 # via # elastic-apm # elasticsearch # requests -cffi==1.15.1 +cffi==1.16.0 # via # bcrypt # cryptography chardet==3.0.4 # via superdesk-core -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 # via requests ciso8601==1.0.8 # via eve-elastic @@ -62,7 +64,6 @@ click==8.1.7 # click-plugins # click-repl # flask - # superdesk-core click-didyoumean==0.3.0 # via celery click-plugins==1.1.1 @@ -71,11 +72,11 @@ click-repl==0.3.0 # via celery croniter==0.3.37 # via superdesk-core -cryptography==41.0.3 +cryptography==41.0.7 # via # authlib # jwcrypto -deepdiff==6.5.0 +deepdiff==6.7.1 # via superdesk-planning defusedxml==0.7.1 # via python3-saml @@ -84,11 +85,15 @@ deprecated==1.2.14 docopt==0.6.2 # via num2words draftjs-exporter[lxml]==2.1.7 - # via superdesk-core + # via + # draftjs-exporter + # superdesk-core ecs-logging==2.1.0 # via elastic-apm -elastic-apm[flask]==6.18.0 - # via superdesk-core +elastic-apm[flask]==6.20.0 + # via + # elastic-apm + # superdesk-core elasticsearch==7.13.4 # via eve-elastic eve==1.1.2 @@ -97,7 +102,7 @@ eve-elastic==7.3.2 # via superdesk-core events==0.3 # via eve -feedparser==6.0.10 +feedparser==6.0.11 # via superdesk-core flask==1.1.2 # via @@ -130,7 +135,7 @@ httplib2==0.22.0 # via oauth2client icalendar==4.0.9 # via superdesk-planning -idna==3.4 +idna==3.6 # via requests isodate==0.6.1 # via python3-saml @@ -148,7 +153,7 @@ jmespath==1.0.1 # via # boto3 # botocore -jwcrypto==1.5.0 +jwcrypto==1.5.1 # via # flask-oidc-ex # python-jwt @@ -179,13 +184,13 @@ oauthlib==3.2.2 # via requests-oauthlib ordered-set==4.1.0 # via deepdiff -packaging==23.1 +packaging==23.2 # via gunicorn pillow==9.2.0 # via superdesk-core -prompt-toolkit==3.0.39 +prompt-toolkit==3.0.43 # via click-repl -pyasn1==0.5.0 +pyasn1==0.5.1 # via # ldap3 # oauth2client @@ -209,13 +214,13 @@ python-dateutil==2.7.5 # croniter # icalendar # superdesk-core -python-jwt==4.0.0 +python-jwt==4.1.0 # via flask-oidc-ex python-magic==0.4.27 # via superdesk-core python-twitter==3.5 # via superdesk-core -python-xmp-toolkit==2.0.1 +python-xmp-toolkit==2.0.2 # via -r requirements.in python3-saml==1.9.0 # via -r requirements.in @@ -231,7 +236,9 @@ pytz==2023.3.post1 pyyaml==6.0.1 # via superdesk-core raven[flask]==6.10.0 - # via superdesk-core + # via + # raven + # superdesk-core redis==4.5.5 # via # celery @@ -247,11 +254,11 @@ requests-oauthlib==1.3.1 # via python-twitter rsa==4.9 # via oauth2client -s3transfer==0.6.2 +s3transfer==0.10.0 # via boto3 sgmllib3k==1.0.0 # via feedparser -simplejson==3.19.1 +simplejson==3.19.2 # via eve six==1.16.0 # via @@ -262,11 +269,11 @@ six==1.16.0 # python-dateutil superdesk-analytics @ git+https://github.com/superdesk/superdesk-analytics.git@v2.6.0-rc1 # via -r requirements.in -superdesk-core @ git+https://github.com/superdesk/superdesk-core.git@develop +superdesk-core @ git+https://github.com/superdesk/superdesk-core.git@v2.6.7 # via -r requirements.in -superdesk-planning @ git+https://github.com/superdesk/superdesk-planning.git@develop +superdesk-planning @ git+https://github.com/superdesk/superdesk-planning.git@v2.6.2 # via -r requirements.in -typing-extensions==4.7.1 +typing-extensions==4.9.0 # via 
superdesk-core tzlocal==2.1 # via superdesk-core @@ -279,12 +286,12 @@ urllib3==1.25.11 # elasticsearch # requests # superdesk-core -vine==5.0.0 +vine==5.1.0 # via # amqp # celery # kombu -wcwidth==0.2.6 +wcwidth==0.2.13 # via prompt-toolkit websockets==10.3 # via superdesk-core @@ -292,7 +299,7 @@ werkzeug==1.0.1 # via # flask # superdesk-core -wrapt==1.15.0 +wrapt==1.14.1 # via # deprecated # elastic-apm diff --git a/server/settings.py b/server/settings.py index 3c18a0e9..96ce80be 100644 --- a/server/settings.py +++ b/server/settings.py @@ -42,6 +42,7 @@ "cp.ultrad", "cp.planning_exports", "cp.set_province_on_publish", + "cp.ai.semaphore", ] MACROS_MODULE = "cp.macros"