sustainable-processes · Joearrowsmith · Apr 13, 2023 · Apr 13, 2023 · Apr 13, 2023 · Apr 13, 2023
diff --git a/orderly/extract/defaults.py b/orderly/extract/defaults.py
@@ -110,7 +110,7 @@ def get_molecule_replacements() -> Dict[MOLECULE_IDENTIFIER, SMILES]:
     """
     Returns a dictionary mapping common representations of molecules (particularly catalysts) to a canonical representation.
     """
-    molecule_replacements = {}
+    molecule_replacements: Dict[str, str] = {}
 
     # Add a catalyst to the molecule_replacements dict (Done by Alexander)
     molecule_replacements[
@@ -214,7 +214,7 @@ def get_molecule_replacements() -> Dict[MOLECULE_IDENTIFIER, SMILES]:
     return molecule_replacements
 
 
-def get_molecule_str_force_nones() -> List[MOLECULE_IDENTIFIER]:
+def get_molecule_str_force_nones() -> List[INVALID_IDENTIFIER]:
     return [
         "solution",  # someone probably wrote 'water solution' and that was translated to 'water' and 'solution' I'd imagine
         "liquid",

diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py
@@ -32,21 +32,11 @@ class OrdExtractor:
     1) Extract all the relevant data (raw): reactants, products, catalysts, reagents, yields, temp, time
     2) Canonicalise all the molecules
     3) Write to a pickle file
-
-    Args:
-            ord_file_path (pathlib.Path):
-            trust_labelling (bool):
-            manual_replacements_dict (Dict[str, str]):
-            metals (METALS, optional) default=None
-            solvents_set (Set[SOLVENT]], optional) default=None
-            filename (str, optional) default=None:
-            contains_substring (str, optional) default=None:
-            inverse_contains_substring (bool) default=False:
     """
 
     ord_file_path: pathlib.Path
     trust_labelling: bool
-    manual_replacements_dict: Dict[MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]]
+    manual_replacements_dict: MANUAL_REPLACEMENTS_DICT
     metals: Optional[METALS] = None
     solvents_set: Optional[Set[SOLVENT]] = None
     filename: Optional[str] = None
@@ -161,12 +151,14 @@ def get_rxn_string_and_is_mapped(
         rxn_str = rxn_str_extended_smiles.split(" ")[
             0
         ]  # this is to get rid of the extended smiles info
-        return rxn_str, is_mapped
+        return RXN_STR(rxn_str), is_mapped
 
     @staticmethod
     def extract_info_from_rxn(
         rxn: ord_reaction_pb2.Reaction,
-    ) -> Optional[Tuple[REACTANTS, AGENTS, PRODUCTS, str, List[MOLECULE_IDENTIFIER]]]:
+    ) -> Optional[
+        Tuple[REACTANTS, AGENTS, PRODUCTS, RXN_STR, List[MOLECULE_IDENTIFIER]]
+    ]:
         """
         Input a reaction object, and return the reactants, agents, products, and the reaction smiles string
         """
@@ -185,9 +177,9 @@ def extract_info_from_rxn(
         del agent
         del product_from_rxn
 
-        non_smiles_names_list = []
+        non_smiles_names_list: List[MOLECULE_IDENTIFIER] = []
         # We need molecules wihtout maping info, so we can compare them to the products
-        reactants_from_rxn_without_mapping = []
+        reactants_from_rxn_without_mapping: CANON_REACTANTS = []
         for smi in reactants_from_rxn:
             canon_smi = orderly.extract.canonicalise.get_canonicalised_smiles(
                 smi, is_mapped
@@ -198,7 +190,7 @@ def extract_info_from_rxn(
             reactants_from_rxn_without_mapping.append(canon_smi)
         # assert len(reactants_from_rxn) == len(reactants_from_rxn_without_mapping)
 
-        products_from_rxn_without_mapping = []
+        products_from_rxn_without_mapping: CANON_PRODUCTS = []
         for smi in products_from_rxn:
             canon_smi = orderly.extract.canonicalise.get_canonicalised_smiles(
                 smi, is_mapped
@@ -209,7 +201,7 @@ def extract_info_from_rxn(
             products_from_rxn_without_mapping.append(canon_smi)
         # assert len(products_from_rxn) == len(products_from_rxn_without_mapping)
 
-        cleaned_agents = []
+        cleaned_agents: CANON_AGENTS = []
         for smi in agents:
             canon_smi = orderly.extract.canonicalise.get_canonicalised_smiles(
                 smi, is_mapped
@@ -318,7 +310,7 @@ def rxn_outcomes_extractor(
         Extract reaction information from ORD output object (ord_reaction_pb2.Reaction.outcomes)
         """
         # products & yield
-        yields = []
+        yields: YIELDS = []
         products = []
         non_smiles_names_list = []
 
@@ -339,7 +331,8 @@ def rxn_outcomes_extractor(
             for measurement in measurements:
                 if measurement.type == 3:  # YIELD
                     y = float(measurement.percentage.value)
-                    y = round(y, 2)
+                    y = YIELD(round(y, 2))
+                    continue
             # people sometimes report a product such as '[Na+].[Na+].[O-]B1OB2OB([O-])OB(O1)O2' and then only report one yield, this is a problem...
             # We'll resolve this by moving the longest smiles string to the front of the list, then appending the yield to the front of the list, and padding with None to ensure that the lists are the same length
 
@@ -350,7 +343,7 @@ def rxn_outcomes_extractor(
             y_list = [y] + [None] * (len(product_list) - 1)
 
             products += product_list
-            yields += y_list
+            yields += y_list  # type: ignore
 
         return products, yields, non_smiles_names_list
 
@@ -365,54 +358,52 @@ def temperature_extractor(
         temp_unit = rxn.conditions.temperature.setpoint.units
 
         if temp_unit == 1:  # celcius
-            return float(rxn.conditions.temperature.setpoint.value)
+            return TEMPERATURE_CELCIUS(float(rxn.conditions.temperature.setpoint.value))
         elif temp_unit == 2:  # fahrenheit
             f = rxn.conditions.temperature.setpoint.value
             c = (f - 32) * 5 / 9
-            return float(c)
+            return TEMPERATURE_CELCIUS(float(c))
         elif temp_unit == 3:  # kelvin
             k = rxn.conditions.temperature.setpoint.value
             c = k - 273.15
-            return float(c)
+            return TEMPERATURE_CELCIUS(float(c))
         elif temp_unit == 0:  # unspecified
             # instead of using the setpoint, use the control type
             # temperatures are in celcius
             temp_control_type = rxn.conditions.temperature.control.type
             if temp_control_type == 2:  # AMBIENT
-                return 25.0
+                return TEMPERATURE_CELCIUS(25.0)
             elif temp_control_type == 6:  # ICE_BATH
-                return 0.0
+                return TEMPERATURE_CELCIUS(0.0)
             elif temp_control_type == 9:  # DRY_ICE_BATH
-                return -78.5
+                return TEMPERATURE_CELCIUS(-78.5)
             elif temp_control_type == 11:  # LIQUID_NITROGEN
-                return -196.0
+                return TEMPERATURE_CELCIUS(-196.0)
         return None  # No temperature found
 
     @staticmethod
-    def rxn_time_extractor(rxn: ord_reaction_pb2.Reaction) -> Optional[float]:
+    def rxn_time_extractor(rxn: ord_reaction_pb2.Reaction) -> Optional[RXN_TIME]:
         if rxn.outcomes[0].reaction_time.units == 1:  # hour
-            return round(float(rxn.outcomes[0].reaction_time.value), 2)
+            return RXN_TIME(round(float(rxn.outcomes[0].reaction_time.value), 2))
         elif rxn.outcomes[0].reaction_time.units == 2:  # minutes
             m = rxn.outcomes[0].reaction_time.value
             h = m / 60
-            return round(float(h), 2)
+            return RXN_TIME(round(float(h), 2))
         elif rxn.outcomes[0].reaction_time.units == 3:  # seconds
             s = rxn.outcomes[0].reaction_time.value
             h = s / 3600
-            return round(float(h), 2)
+            return RXN_TIME(round(float(h), 2))
         elif rxn.outcomes[0].reaction_time.units == 4:  # day
             d = rxn.outcomes[0].reaction_time.value
             h = d * 24
-            return round(float(h), 2)
+            return RXN_TIME(round(float(h), 2))
         else:
             return None  # no time found
 
     @staticmethod
     def apply_replacements_dict(
         smiles_list: List[MOLECULE_IDENTIFIER],
-        manual_replacements_dict: Dict[
-            MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]
-        ],
+        manual_replacements_dict: MANUAL_REPLACEMENTS_DICT,
     ) -> List[SMILES]:
         smiles_list = [
             x
@@ -510,9 +501,7 @@ def merge_to_agents(
     @staticmethod
     def handle_reaction_object(
         rxn: ord_reaction_pb2.Reaction,
-        manual_replacements_dict: Dict[
-            MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]
-        ],
+        manual_replacements_dict: MANUAL_REPLACEMENTS_DICT,
         solvents_set: Set[SOLVENT],
         metals: METALS,
         trust_labelling: bool = False,

diff --git a/orderly/extract/main.py b/orderly/extract/main.py
@@ -107,8 +107,8 @@ def build_solvents_set_and_dict(
 
 def build_replacements(
     molecule_replacements: Optional[Dict[MOLECULE_IDENTIFIER, SMILES]] = None,
-    molecule_str_force_nones: Optional[List[MOLECULE_IDENTIFIER]] = None,
-) -> Dict[MOLECULE_IDENTIFIER, Optional[SMILES]]:
+    molecule_str_force_nones: Optional[List[INVALID_IDENTIFIER]] = None,
+) -> Dict[MOLECULE_IDENTIFIER | INVALID_IDENTIFIER, Optional[SMILES]]:
     """
     Builds dictionary mapping english name molecule identifiers to canonical smiles. Dict is based on manually curated list.
     """
@@ -128,7 +128,7 @@ def build_replacements(
             orderly.extract.defaults.get_molecule_str_force_nones()
         )
 
-    molecule_replacements_with_force_nones: Dict[MOLECULE_IDENTIFIER, Optional[SMILES]] = molecule_replacements.copy()  # type: ignore
+    molecule_replacements_with_force_nones: Dict[MOLECULE_IDENTIFIER | INVALID_IDENTIFIER, Optional[SMILES]] = molecule_replacements.copy()  # type: ignore
 
     for molecule_str in molecule_str_force_nones:
         molecule_replacements_with_force_nones[molecule_str] = None
@@ -141,7 +141,7 @@ def get_manual_replacements_dict(
     molecule_replacements: Optional[Dict[MOLECULE_IDENTIFIER, CANON_SMILES]] = None,
     molecule_str_force_nones: Optional[List[MOLECULE_IDENTIFIER]] = None,
     solvents_path: Optional[pathlib.Path] = None,
-) -> Dict[MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]]:
+) -> MANUAL_REPLACEMENTS_DICT:
     """
     Combines manually curated dictioary of molecule names to canonical smiles strings with the dictionary of solvent names to canonical smiles strings.
     """

diff --git a/orderly/types.py b/orderly/types.py
@@ -5,14 +5,18 @@
 )  # protobuf uses a different type for the repeat composite container for each OS so we need a generic type that is not using the true type
 
 MOLECULE_IDENTIFIER = str  # NewType('MOLECULE_IDENTIFIER', str)
+INVALID_IDENTIFIER = str  # NewType('INVALID_IDENTIFIER', str)
+
 SMILES = str  # NewType('SMILES', str)
 CANON_SMILES = (
     str  # NewType('CANON_SMILES', SMILES)  # This is for SMILES canonicalised by RDKit
 )
 
-MANUAL_REPLACEMENTS_DICT = Dict[MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]]
+MANUAL_REPLACEMENTS_DICT = Dict[
+    MOLECULE_IDENTIFIER | INVALID_IDENTIFIER, Optional[SMILES | CANON_SMILES]
+]
 
-RXN_STR = str  # NewType('RXN_STR', str)
+RXN_STR = NewType("RXN_STR", str)
 
 REAGENT = Union[CANON_SMILES, SMILES, MOLECULE_IDENTIFIER]
 CANON_REAGENT = CANON_SMILES
@@ -49,10 +53,10 @@
 AGENTS = List[AGENT]
 CANON_AGENTS = List[CANON_AGENT]
 
-YIELD = float  # NewType('YIELD', float)
+YIELD = NewType("YIELD", float)
 YIELDS = List[Optional[YIELD]]
 
-TEMPERATURE_CELCIUS = float  # NewType('TEMPERATURE_CELCIUS', float)
+TEMPERATURE_CELCIUS = NewType("TEMPERATURE_CELCIUS", float)
 TEMPERATURES_CELCIUS = List[Optional[TEMPERATURE_CELCIUS]]
 
-RXN_TIME = float  # NewType('RXN_TIME', float)  # hours
+RXN_TIME = NewType("RXN_TIME", float)  # hours
diff --git a/tests/test_extract.py b/tests/test_extract.py
@@ -4,6 +4,8 @@
 
 from ord_schema.proto import reaction_pb2 as ord_reaction_pb2
 
+from orderly.types import YIELD, MANUAL_REPLACEMENTS_DICT
+
 REPETITIONS = 3
 SLOW_REPETITIONS = 1
 
@@ -635,9 +637,9 @@ def test_match_yield_with_product(
     execution_number: int,
     rxn_str_products: List[str],
     labelled_products: List[str],
-    input_yields: Optional[List[Optional[float]]],
+    input_yields: Optional[List[Optional[YIELD]]],
     expected_products: List[str],
-    expected_yields: Optional[List[Optional[float]]],
+    expected_yields: Optional[List[Optional[YIELD]]],
 ) -> None:
     import orderly.extract.extractor
 
@@ -931,7 +933,7 @@ def test_handle_reaction_object(
     execution_number: int,
     file_name: str,
     rxn_idx: int,
-    manual_replacements_dict: Optional[Dict[str, Optional[str]]],
+    manual_replacements_dict: MANUAL_REPLACEMENTS_DICT,
     trust_labelling: bool,
     expected_reactants: List[str],
     expected_agents: List[str],