From d0c8d5469cc4c87493241d94bb7d4d6b90153396 Mon Sep 17 00:00:00 2001 From: joearrowsmith Date: Thu, 13 Apr 2023 16:47:56 +0100 Subject: [PATCH 1/5] working on hard typing --- orderly/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orderly/types.py b/orderly/types.py index a788efe2..8039af8f 100644 --- a/orderly/types.py +++ b/orderly/types.py @@ -55,4 +55,4 @@ TEMPERATURE_CELCIUS = float # NewType('TEMPERATURE_CELCIUS', float) TEMPERATURES_CELCIUS = List[Optional[TEMPERATURE_CELCIUS]] -RXN_TIME = float # NewType('RXN_TIME', float) # hours +RXN_TIME = NewType('RXN_TIME', float) # hours From 4e1345f46771fdeed86a5ac38f2882cccb1bf816 Mon Sep 17 00:00:00 2001 From: joearrowsmith Date: Thu, 13 Apr 2023 17:10:50 +0100 Subject: [PATCH 2/5] done yield --- orderly/extract/extractor.py | 31 ++++++++++++++++--------------- orderly/types.py | 4 ++-- tests/test_extract.py | 6 ++++-- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index 86ab5ce5..c32c96ea 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -318,7 +318,7 @@ def rxn_outcomes_extractor( Extract reaction information from ORD output object (ord_reaction_pb2.Reaction.outcomes) """ # products & yield - yields = [] + yields: YIELDS = [] products = [] non_smiles_names_list = [] @@ -339,7 +339,8 @@ def rxn_outcomes_extractor( for measurement in measurements: if measurement.type == 3: # YIELD y = float(measurement.percentage.value) - y = round(y, 2) + y = YIELD(round(y, 2)) + continue # people sometimes report a product such as '[Na+].[Na+].[O-]B1OB2OB([O-])OB(O1)O2' and then only report one yield, this is a problem... # We'll resolve this by moving the longest smiles string to the front of the list, then appending the yield to the front of the list, and padding with None to ensure that the lists are the same length @@ -350,7 +351,7 @@ def rxn_outcomes_extractor( y_list = [y] + [None] * (len(product_list) - 1) products += product_list - yields += y_list + yields += y_list # type: ignore return products, yields, non_smiles_names_list @@ -365,45 +366,45 @@ def temperature_extractor( temp_unit = rxn.conditions.temperature.setpoint.units if temp_unit == 1: # celcius - return float(rxn.conditions.temperature.setpoint.value) + return TEMPERATURE_CELCIUS(float(rxn.conditions.temperature.setpoint.value)) elif temp_unit == 2: # fahrenheit f = rxn.conditions.temperature.setpoint.value c = (f - 32) * 5 / 9 - return float(c) + return TEMPERATURE_CELCIUS(float(c)) elif temp_unit == 3: # kelvin k = rxn.conditions.temperature.setpoint.value c = k - 273.15 - return float(c) + return TEMPERATURE_CELCIUS(float(c)) elif temp_unit == 0: # unspecified # instead of using the setpoint, use the control type # temperatures are in celcius temp_control_type = rxn.conditions.temperature.control.type if temp_control_type == 2: # AMBIENT - return 25.0 + return TEMPERATURE_CELCIUS(25.0) elif temp_control_type == 6: # ICE_BATH - return 0.0 + return TEMPERATURE_CELCIUS(0.0) elif temp_control_type == 9: # DRY_ICE_BATH - return -78.5 + return TEMPERATURE_CELCIUS(-78.5) elif temp_control_type == 11: # LIQUID_NITROGEN - return -196.0 + return TEMPERATURE_CELCIUS(-196.0) return None # No temperature found @staticmethod - def rxn_time_extractor(rxn: ord_reaction_pb2.Reaction) -> Optional[float]: + def rxn_time_extractor(rxn: ord_reaction_pb2.Reaction) -> Optional[RXN_TIME]: if rxn.outcomes[0].reaction_time.units == 1: # hour - return round(float(rxn.outcomes[0].reaction_time.value), 2) + return RXN_TIME(round(float(rxn.outcomes[0].reaction_time.value), 2)) elif rxn.outcomes[0].reaction_time.units == 2: # minutes m = rxn.outcomes[0].reaction_time.value h = m / 60 - return round(float(h), 2) + return RXN_TIME(round(float(h), 2)) elif rxn.outcomes[0].reaction_time.units == 3: # seconds s = rxn.outcomes[0].reaction_time.value h = s / 3600 - return round(float(h), 2) + return RXN_TIME(round(float(h), 2)) elif rxn.outcomes[0].reaction_time.units == 4: # day d = rxn.outcomes[0].reaction_time.value h = d * 24 - return round(float(h), 2) + return RXN_TIME(round(float(h), 2)) else: return None # no time found diff --git a/orderly/types.py b/orderly/types.py index 8039af8f..3bd8ef66 100644 --- a/orderly/types.py +++ b/orderly/types.py @@ -49,10 +49,10 @@ AGENTS = List[AGENT] CANON_AGENTS = List[CANON_AGENT] -YIELD = float # NewType('YIELD', float) +YIELD = NewType('YIELD', float) YIELDS = List[Optional[YIELD]] -TEMPERATURE_CELCIUS = float # NewType('TEMPERATURE_CELCIUS', float) +TEMPERATURE_CELCIUS = NewType('TEMPERATURE_CELCIUS', float) TEMPERATURES_CELCIUS = List[Optional[TEMPERATURE_CELCIUS]] RXN_TIME = NewType('RXN_TIME', float) # hours diff --git a/tests/test_extract.py b/tests/test_extract.py index 8e95fc75..5141f893 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -4,6 +4,8 @@ from ord_schema.proto import reaction_pb2 as ord_reaction_pb2 +from orderly.types import YIELD + REPETITIONS = 3 SLOW_REPETITIONS = 1 @@ -635,9 +637,9 @@ def test_match_yield_with_product( execution_number: int, rxn_str_products: List[str], labelled_products: List[str], - input_yields: Optional[List[Optional[float]]], + input_yields: Optional[List[Optional[YIELD]]], expected_products: List[str], - expected_yields: Optional[List[Optional[float]]], + expected_yields: Optional[List[Optional[YIELD]]], ) -> None: import orderly.extract.extractor From 75a80d7a7f97dbe1da8c9ff6da13390914d13554 Mon Sep 17 00:00:00 2001 From: joearrowsmith Date: Thu, 13 Apr 2023 17:16:15 +0100 Subject: [PATCH 3/5] Rxn str working --- orderly/extract/extractor.py | 4 ++-- orderly/types.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index c32c96ea..98e0883a 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -161,12 +161,12 @@ def get_rxn_string_and_is_mapped( rxn_str = rxn_str_extended_smiles.split(" ")[ 0 ] # this is to get rid of the extended smiles info - return rxn_str, is_mapped + return RXN_STR(rxn_str), is_mapped @staticmethod def extract_info_from_rxn( rxn: ord_reaction_pb2.Reaction, - ) -> Optional[Tuple[REACTANTS, AGENTS, PRODUCTS, str, List[MOLECULE_IDENTIFIER]]]: + ) -> Optional[Tuple[REACTANTS, AGENTS, PRODUCTS, RXN_STR, List[MOLECULE_IDENTIFIER]]]: """ Input a reaction object, and return the reactants, agents, products, and the reaction smiles string """ diff --git a/orderly/types.py b/orderly/types.py index 3bd8ef66..93c93e24 100644 --- a/orderly/types.py +++ b/orderly/types.py @@ -4,7 +4,7 @@ "REPEATEDCOMPOSITECONTAINER", bound=Iterable[Any] ) # protobuf uses a different type for the repeat composite container for each OS so we need a generic type that is not using the true type -MOLECULE_IDENTIFIER = str # NewType('MOLECULE_IDENTIFIER', str) +MOLECULE_IDENTIFIER = str # NewType('MOLECULE_IDENTIFIER', str) SMILES = str # NewType('SMILES', str) CANON_SMILES = ( str # NewType('CANON_SMILES', SMILES) # This is for SMILES canonicalised by RDKit @@ -12,7 +12,7 @@ MANUAL_REPLACEMENTS_DICT = Dict[MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]] -RXN_STR = str # NewType('RXN_STR', str) +RXN_STR = NewType('RXN_STR', str) REAGENT = Union[CANON_SMILES, SMILES, MOLECULE_IDENTIFIER] CANON_REAGENT = CANON_SMILES From 834b767cf37f09bf7632eb5c1f912fb98777d492 Mon Sep 17 00:00:00 2001 From: joearrowsmith Date: Thu, 13 Apr 2023 17:20:06 +0100 Subject: [PATCH 4/5] adding manual replacements dict --- orderly/extract/extractor.py | 20 +++----------------- orderly/extract/main.py | 2 +- tests/test_extract.py | 4 ++-- 3 files changed, 6 insertions(+), 20 deletions(-) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index 98e0883a..5feaf9bd 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -32,21 +32,11 @@ class OrdExtractor: 1) Extract all the relevant data (raw): reactants, products, catalysts, reagents, yields, temp, time 2) Canonicalise all the molecules 3) Write to a pickle file - - Args: - ord_file_path (pathlib.Path): - trust_labelling (bool): - manual_replacements_dict (Dict[str, str]): - metals (METALS, optional) default=None - solvents_set (Set[SOLVENT]], optional) default=None - filename (str, optional) default=None: - contains_substring (str, optional) default=None: - inverse_contains_substring (bool) default=False: """ ord_file_path: pathlib.Path trust_labelling: bool - manual_replacements_dict: Dict[MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]] + manual_replacements_dict: MANUAL_REPLACEMENTS_DICT metals: Optional[METALS] = None solvents_set: Optional[Set[SOLVENT]] = None filename: Optional[str] = None @@ -411,9 +401,7 @@ def rxn_time_extractor(rxn: ord_reaction_pb2.Reaction) -> Optional[RXN_TIME]: @staticmethod def apply_replacements_dict( smiles_list: List[MOLECULE_IDENTIFIER], - manual_replacements_dict: Dict[ - MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES] - ], + manual_replacements_dict: MANUAL_REPLACEMENTS_DICT, ) -> List[SMILES]: smiles_list = [ x @@ -511,9 +499,7 @@ def merge_to_agents( @staticmethod def handle_reaction_object( rxn: ord_reaction_pb2.Reaction, - manual_replacements_dict: Dict[ - MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES] - ], + manual_replacements_dict: MANUAL_REPLACEMENTS_DICT, solvents_set: Set[SOLVENT], metals: METALS, trust_labelling: bool = False, diff --git a/orderly/extract/main.py b/orderly/extract/main.py index 2abaa404..6b4ca3e1 100644 --- a/orderly/extract/main.py +++ b/orderly/extract/main.py @@ -141,7 +141,7 @@ def get_manual_replacements_dict( molecule_replacements: Optional[Dict[MOLECULE_IDENTIFIER, CANON_SMILES]] = None, molecule_str_force_nones: Optional[List[MOLECULE_IDENTIFIER]] = None, solvents_path: Optional[pathlib.Path] = None, -) -> Dict[MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]]: +) -> MANUAL_REPLACEMENTS_DICT: """ Combines manually curated dictioary of molecule names to canonical smiles strings with the dictionary of solvent names to canonical smiles strings. """ diff --git a/tests/test_extract.py b/tests/test_extract.py index 5141f893..0c097294 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -4,7 +4,7 @@ from ord_schema.proto import reaction_pb2 as ord_reaction_pb2 -from orderly.types import YIELD +from orderly.types import YIELD, MANUAL_REPLACEMENTS_DICT REPETITIONS = 3 SLOW_REPETITIONS = 1 @@ -933,7 +933,7 @@ def test_handle_reaction_object( execution_number: int, file_name: str, rxn_idx: int, - manual_replacements_dict: Optional[Dict[str, Optional[str]]], + manual_replacements_dict: MANUAL_REPLACEMENTS_DICT, trust_labelling: bool, expected_reactants: List[str], expected_agents: List[str], From c8196f2308a84cc656f84254f28642796c473c75 Mon Sep 17 00:00:00 2001 From: joearrowsmith Date: Thu, 13 Apr 2023 18:07:35 +0100 Subject: [PATCH 5/5] waving the white flag --- orderly/extract/defaults.py | 4 ++-- orderly/extract/extractor.py | 12 +++++++----- orderly/extract/main.py | 6 +++--- orderly/types.py | 16 ++++++++++------ 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/orderly/extract/defaults.py b/orderly/extract/defaults.py index b988ac37..ef82d83d 100644 --- a/orderly/extract/defaults.py +++ b/orderly/extract/defaults.py @@ -110,7 +110,7 @@ def get_molecule_replacements() -> Dict[MOLECULE_IDENTIFIER, SMILES]: """ Returns a dictionary mapping common representations of molecules (particularly catalysts) to a canonical representation. """ - molecule_replacements = {} + molecule_replacements: Dict[str, str] = {} # Add a catalyst to the molecule_replacements dict (Done by Alexander) molecule_replacements[ @@ -214,7 +214,7 @@ def get_molecule_replacements() -> Dict[MOLECULE_IDENTIFIER, SMILES]: return molecule_replacements -def get_molecule_str_force_nones() -> List[MOLECULE_IDENTIFIER]: +def get_molecule_str_force_nones() -> List[INVALID_IDENTIFIER]: return [ "solution", # someone probably wrote 'water solution' and that was translated to 'water' and 'solution' I'd imagine "liquid", diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index 5feaf9bd..d7864f3a 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -156,7 +156,9 @@ def get_rxn_string_and_is_mapped( @staticmethod def extract_info_from_rxn( rxn: ord_reaction_pb2.Reaction, - ) -> Optional[Tuple[REACTANTS, AGENTS, PRODUCTS, RXN_STR, List[MOLECULE_IDENTIFIER]]]: + ) -> Optional[ + Tuple[REACTANTS, AGENTS, PRODUCTS, RXN_STR, List[MOLECULE_IDENTIFIER]] + ]: """ Input a reaction object, and return the reactants, agents, products, and the reaction smiles string """ @@ -175,9 +177,9 @@ def extract_info_from_rxn( del agent del product_from_rxn - non_smiles_names_list = [] + non_smiles_names_list: List[MOLECULE_IDENTIFIER] = [] # We need molecules wihtout maping info, so we can compare them to the products - reactants_from_rxn_without_mapping = [] + reactants_from_rxn_without_mapping: CANON_REACTANTS = [] for smi in reactants_from_rxn: canon_smi = orderly.extract.canonicalise.get_canonicalised_smiles( smi, is_mapped @@ -188,7 +190,7 @@ def extract_info_from_rxn( reactants_from_rxn_without_mapping.append(canon_smi) # assert len(reactants_from_rxn) == len(reactants_from_rxn_without_mapping) - products_from_rxn_without_mapping = [] + products_from_rxn_without_mapping: CANON_PRODUCTS = [] for smi in products_from_rxn: canon_smi = orderly.extract.canonicalise.get_canonicalised_smiles( smi, is_mapped @@ -199,7 +201,7 @@ def extract_info_from_rxn( products_from_rxn_without_mapping.append(canon_smi) # assert len(products_from_rxn) == len(products_from_rxn_without_mapping) - cleaned_agents = [] + cleaned_agents: CANON_AGENTS = [] for smi in agents: canon_smi = orderly.extract.canonicalise.get_canonicalised_smiles( smi, is_mapped diff --git a/orderly/extract/main.py b/orderly/extract/main.py index 6b4ca3e1..fb987281 100644 --- a/orderly/extract/main.py +++ b/orderly/extract/main.py @@ -107,8 +107,8 @@ def build_solvents_set_and_dict( def build_replacements( molecule_replacements: Optional[Dict[MOLECULE_IDENTIFIER, SMILES]] = None, - molecule_str_force_nones: Optional[List[MOLECULE_IDENTIFIER]] = None, -) -> Dict[MOLECULE_IDENTIFIER, Optional[SMILES]]: + molecule_str_force_nones: Optional[List[INVALID_IDENTIFIER]] = None, +) -> Dict[MOLECULE_IDENTIFIER | INVALID_IDENTIFIER, Optional[SMILES]]: """ Builds dictionary mapping english name molecule identifiers to canonical smiles. Dict is based on manually curated list. """ @@ -128,7 +128,7 @@ def build_replacements( orderly.extract.defaults.get_molecule_str_force_nones() ) - molecule_replacements_with_force_nones: Dict[MOLECULE_IDENTIFIER, Optional[SMILES]] = molecule_replacements.copy() # type: ignore + molecule_replacements_with_force_nones: Dict[MOLECULE_IDENTIFIER | INVALID_IDENTIFIER, Optional[SMILES]] = molecule_replacements.copy() # type: ignore for molecule_str in molecule_str_force_nones: molecule_replacements_with_force_nones[molecule_str] = None diff --git a/orderly/types.py b/orderly/types.py index 93c93e24..9bb48569 100644 --- a/orderly/types.py +++ b/orderly/types.py @@ -4,15 +4,19 @@ "REPEATEDCOMPOSITECONTAINER", bound=Iterable[Any] ) # protobuf uses a different type for the repeat composite container for each OS so we need a generic type that is not using the true type -MOLECULE_IDENTIFIER = str # NewType('MOLECULE_IDENTIFIER', str) +MOLECULE_IDENTIFIER = str # NewType('MOLECULE_IDENTIFIER', str) +INVALID_IDENTIFIER = str # NewType('INVALID_IDENTIFIER', str) + SMILES = str # NewType('SMILES', str) CANON_SMILES = ( str # NewType('CANON_SMILES', SMILES) # This is for SMILES canonicalised by RDKit ) -MANUAL_REPLACEMENTS_DICT = Dict[MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]] +MANUAL_REPLACEMENTS_DICT = Dict[ + MOLECULE_IDENTIFIER | INVALID_IDENTIFIER, Optional[SMILES | CANON_SMILES] +] -RXN_STR = NewType('RXN_STR', str) +RXN_STR = NewType("RXN_STR", str) REAGENT = Union[CANON_SMILES, SMILES, MOLECULE_IDENTIFIER] CANON_REAGENT = CANON_SMILES @@ -49,10 +53,10 @@ AGENTS = List[AGENT] CANON_AGENTS = List[CANON_AGENT] -YIELD = NewType('YIELD', float) +YIELD = NewType("YIELD", float) YIELDS = List[Optional[YIELD]] -TEMPERATURE_CELCIUS = NewType('TEMPERATURE_CELCIUS', float) +TEMPERATURE_CELCIUS = NewType("TEMPERATURE_CELCIUS", float) TEMPERATURES_CELCIUS = List[Optional[TEMPERATURE_CELCIUS]] -RXN_TIME = NewType('RXN_TIME', float) # hours +RXN_TIME = NewType("RXN_TIME", float) # hours