Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hard typing #38

Merged
merged 5 commits into from
Apr 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions orderly/extract/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def get_molecule_replacements() -> Dict[MOLECULE_IDENTIFIER, SMILES]:
"""
Returns a dictionary mapping common representations of molecules (particularly catalysts) to a canonical representation.
"""
molecule_replacements = {}
molecule_replacements: Dict[str, str] = {}

# Add a catalyst to the molecule_replacements dict (Done by Alexander)
molecule_replacements[
Expand Down Expand Up @@ -214,7 +214,7 @@ def get_molecule_replacements() -> Dict[MOLECULE_IDENTIFIER, SMILES]:
return molecule_replacements


def get_molecule_str_force_nones() -> List[MOLECULE_IDENTIFIER]:
def get_molecule_str_force_nones() -> List[INVALID_IDENTIFIER]:
return [
"solution", # someone probably wrote 'water solution' and that was translated to 'water' and 'solution' I'd imagine
"liquid",
Expand Down
65 changes: 27 additions & 38 deletions orderly/extract/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,11 @@ class OrdExtractor:
1) Extract all the relevant data (raw): reactants, products, catalysts, reagents, yields, temp, time
2) Canonicalise all the molecules
3) Write to a pickle file

Args:
ord_file_path (pathlib.Path):
trust_labelling (bool):
manual_replacements_dict (Dict[str, str]):
metals (METALS, optional) default=None
solvents_set (Set[SOLVENT]], optional) default=None
filename (str, optional) default=None:
contains_substring (str, optional) default=None:
inverse_contains_substring (bool) default=False:
"""

ord_file_path: pathlib.Path
trust_labelling: bool
manual_replacements_dict: Dict[MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]]
manual_replacements_dict: MANUAL_REPLACEMENTS_DICT
metals: Optional[METALS] = None
solvents_set: Optional[Set[SOLVENT]] = None
filename: Optional[str] = None
Expand Down Expand Up @@ -161,12 +151,14 @@ def get_rxn_string_and_is_mapped(
rxn_str = rxn_str_extended_smiles.split(" ")[
0
] # this is to get rid of the extended smiles info
return rxn_str, is_mapped
return RXN_STR(rxn_str), is_mapped

@staticmethod
def extract_info_from_rxn(
rxn: ord_reaction_pb2.Reaction,
) -> Optional[Tuple[REACTANTS, AGENTS, PRODUCTS, str, List[MOLECULE_IDENTIFIER]]]:
) -> Optional[
Tuple[REACTANTS, AGENTS, PRODUCTS, RXN_STR, List[MOLECULE_IDENTIFIER]]
]:
"""
Input a reaction object, and return the reactants, agents, products, and the reaction smiles string
"""
Expand All @@ -185,9 +177,9 @@ def extract_info_from_rxn(
del agent
del product_from_rxn

non_smiles_names_list = []
non_smiles_names_list: List[MOLECULE_IDENTIFIER] = []
# We need molecules without mapping info, so we can compare them to the products
reactants_from_rxn_without_mapping = []
reactants_from_rxn_without_mapping: CANON_REACTANTS = []
for smi in reactants_from_rxn:
canon_smi = orderly.extract.canonicalise.get_canonicalised_smiles(
smi, is_mapped
Expand All @@ -198,7 +190,7 @@ def extract_info_from_rxn(
reactants_from_rxn_without_mapping.append(canon_smi)
# assert len(reactants_from_rxn) == len(reactants_from_rxn_without_mapping)

products_from_rxn_without_mapping = []
products_from_rxn_without_mapping: CANON_PRODUCTS = []
for smi in products_from_rxn:
canon_smi = orderly.extract.canonicalise.get_canonicalised_smiles(
smi, is_mapped
Expand All @@ -209,7 +201,7 @@ def extract_info_from_rxn(
products_from_rxn_without_mapping.append(canon_smi)
# assert len(products_from_rxn) == len(products_from_rxn_without_mapping)

cleaned_agents = []
cleaned_agents: CANON_AGENTS = []
for smi in agents:
canon_smi = orderly.extract.canonicalise.get_canonicalised_smiles(
smi, is_mapped
Expand Down Expand Up @@ -318,7 +310,7 @@ def rxn_outcomes_extractor(
Extract reaction information from ORD output object (ord_reaction_pb2.Reaction.outcomes)
"""
# products & yield
yields = []
yields: YIELDS = []
products = []
non_smiles_names_list = []

Expand All @@ -339,7 +331,8 @@ def rxn_outcomes_extractor(
for measurement in measurements:
if measurement.type == 3: # YIELD
y = float(measurement.percentage.value)
y = round(y, 2)
y = YIELD(round(y, 2))
continue
# people sometimes report a product such as '[Na+].[Na+].[O-]B1OB2OB([O-])OB(O1)O2' and then only report one yield, this is a problem...
# We'll resolve this by moving the longest smiles string to the front of the list, then appending the yield to the front of the list, and padding with None to ensure that the lists are the same length

Expand All @@ -350,7 +343,7 @@ def rxn_outcomes_extractor(
y_list = [y] + [None] * (len(product_list) - 1)

products += product_list
yields += y_list
yields += y_list # type: ignore

return products, yields, non_smiles_names_list

Expand All @@ -365,54 +358,52 @@ def temperature_extractor(
temp_unit = rxn.conditions.temperature.setpoint.units

if temp_unit == 1: # celcius
return float(rxn.conditions.temperature.setpoint.value)
return TEMPERATURE_CELCIUS(float(rxn.conditions.temperature.setpoint.value))
elif temp_unit == 2: # fahrenheit
f = rxn.conditions.temperature.setpoint.value
c = (f - 32) * 5 / 9
return float(c)
return TEMPERATURE_CELCIUS(float(c))
elif temp_unit == 3: # kelvin
k = rxn.conditions.temperature.setpoint.value
c = k - 273.15
return float(c)
return TEMPERATURE_CELCIUS(float(c))
elif temp_unit == 0: # unspecified
# instead of using the setpoint, use the control type
# temperatures are in celcius
temp_control_type = rxn.conditions.temperature.control.type
if temp_control_type == 2: # AMBIENT
return 25.0
return TEMPERATURE_CELCIUS(25.0)
elif temp_control_type == 6: # ICE_BATH
return 0.0
return TEMPERATURE_CELCIUS(0.0)
elif temp_control_type == 9: # DRY_ICE_BATH
return -78.5
return TEMPERATURE_CELCIUS(-78.5)
elif temp_control_type == 11: # LIQUID_NITROGEN
return -196.0
return TEMPERATURE_CELCIUS(-196.0)
return None # No temperature found

@staticmethod
def rxn_time_extractor(rxn: ord_reaction_pb2.Reaction) -> Optional[float]:
def rxn_time_extractor(rxn: ord_reaction_pb2.Reaction) -> Optional[RXN_TIME]:
if rxn.outcomes[0].reaction_time.units == 1: # hour
return round(float(rxn.outcomes[0].reaction_time.value), 2)
return RXN_TIME(round(float(rxn.outcomes[0].reaction_time.value), 2))
elif rxn.outcomes[0].reaction_time.units == 2: # minutes
m = rxn.outcomes[0].reaction_time.value
h = m / 60
return round(float(h), 2)
return RXN_TIME(round(float(h), 2))
elif rxn.outcomes[0].reaction_time.units == 3: # seconds
s = rxn.outcomes[0].reaction_time.value
h = s / 3600
return round(float(h), 2)
return RXN_TIME(round(float(h), 2))
elif rxn.outcomes[0].reaction_time.units == 4: # day
d = rxn.outcomes[0].reaction_time.value
h = d * 24
return round(float(h), 2)
return RXN_TIME(round(float(h), 2))
else:
return None # no time found

@staticmethod
def apply_replacements_dict(
smiles_list: List[MOLECULE_IDENTIFIER],
manual_replacements_dict: Dict[
MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]
],
manual_replacements_dict: MANUAL_REPLACEMENTS_DICT,
) -> List[SMILES]:
smiles_list = [
x
Expand Down Expand Up @@ -510,9 +501,7 @@ def merge_to_agents(
@staticmethod
def handle_reaction_object(
rxn: ord_reaction_pb2.Reaction,
manual_replacements_dict: Dict[
MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]
],
manual_replacements_dict: MANUAL_REPLACEMENTS_DICT,
solvents_set: Set[SOLVENT],
metals: METALS,
trust_labelling: bool = False,
Expand Down
8 changes: 4 additions & 4 deletions orderly/extract/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ def build_solvents_set_and_dict(

def build_replacements(
molecule_replacements: Optional[Dict[MOLECULE_IDENTIFIER, SMILES]] = None,
molecule_str_force_nones: Optional[List[MOLECULE_IDENTIFIER]] = None,
) -> Dict[MOLECULE_IDENTIFIER, Optional[SMILES]]:
molecule_str_force_nones: Optional[List[INVALID_IDENTIFIER]] = None,
) -> Dict[MOLECULE_IDENTIFIER | INVALID_IDENTIFIER, Optional[SMILES]]:
"""
Builds dictionary mapping english name molecule identifiers to canonical smiles. Dict is based on manually curated list.
"""
Expand All @@ -128,7 +128,7 @@ def build_replacements(
orderly.extract.defaults.get_molecule_str_force_nones()
)

molecule_replacements_with_force_nones: Dict[MOLECULE_IDENTIFIER, Optional[SMILES]] = molecule_replacements.copy() # type: ignore
molecule_replacements_with_force_nones: Dict[MOLECULE_IDENTIFIER | INVALID_IDENTIFIER, Optional[SMILES]] = molecule_replacements.copy() # type: ignore

for molecule_str in molecule_str_force_nones:
molecule_replacements_with_force_nones[molecule_str] = None
Expand All @@ -141,7 +141,7 @@ def get_manual_replacements_dict(
molecule_replacements: Optional[Dict[MOLECULE_IDENTIFIER, CANON_SMILES]] = None,
molecule_str_force_nones: Optional[List[MOLECULE_IDENTIFIER]] = None,
solvents_path: Optional[pathlib.Path] = None,
) -> Dict[MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]]:
) -> MANUAL_REPLACEMENTS_DICT:
"""
Combines manually curated dictionary of molecule names to canonical smiles strings with the dictionary of solvent names to canonical smiles strings.
"""
Expand Down
14 changes: 9 additions & 5 deletions orderly/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,18 @@
) # protobuf uses a different type for the repeat composite container for each OS so we need a generic type that is not using the true type

MOLECULE_IDENTIFIER = str # NewType('MOLECULE_IDENTIFIER', str)
INVALID_IDENTIFIER = str # NewType('INVALID_IDENTIFIER', str)

SMILES = str # NewType('SMILES', str)
CANON_SMILES = (
str # NewType('CANON_SMILES', SMILES) # This is for SMILES canonicalised by RDKit
)

MANUAL_REPLACEMENTS_DICT = Dict[MOLECULE_IDENTIFIER, Optional[SMILES | CANON_SMILES]]
MANUAL_REPLACEMENTS_DICT = Dict[
MOLECULE_IDENTIFIER | INVALID_IDENTIFIER, Optional[SMILES | CANON_SMILES]
]

RXN_STR = str # NewType('RXN_STR', str)
RXN_STR = NewType("RXN_STR", str)

REAGENT = Union[CANON_SMILES, SMILES, MOLECULE_IDENTIFIER]
CANON_REAGENT = CANON_SMILES
Expand Down Expand Up @@ -49,10 +53,10 @@
AGENTS = List[AGENT]
CANON_AGENTS = List[CANON_AGENT]

YIELD = float # NewType('YIELD', float)
YIELD = NewType("YIELD", float)
YIELDS = List[Optional[YIELD]]

TEMPERATURE_CELCIUS = float # NewType('TEMPERATURE_CELCIUS', float)
TEMPERATURE_CELCIUS = NewType("TEMPERATURE_CELCIUS", float)
TEMPERATURES_CELCIUS = List[Optional[TEMPERATURE_CELCIUS]]

RXN_TIME = float # NewType('RXN_TIME', float) # hours
RXN_TIME = NewType("RXN_TIME", float) # hours
8 changes: 5 additions & 3 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from ord_schema.proto import reaction_pb2 as ord_reaction_pb2

from orderly.types import YIELD, MANUAL_REPLACEMENTS_DICT

REPETITIONS = 3
SLOW_REPETITIONS = 1

Expand Down Expand Up @@ -635,9 +637,9 @@ def test_match_yield_with_product(
execution_number: int,
rxn_str_products: List[str],
labelled_products: List[str],
input_yields: Optional[List[Optional[float]]],
input_yields: Optional[List[Optional[YIELD]]],
expected_products: List[str],
expected_yields: Optional[List[Optional[float]]],
expected_yields: Optional[List[Optional[YIELD]]],
) -> None:
import orderly.extract.extractor

Expand Down Expand Up @@ -931,7 +933,7 @@ def test_handle_reaction_object(
execution_number: int,
file_name: str,
rxn_idx: int,
manual_replacements_dict: Optional[Dict[str, Optional[str]]],
manual_replacements_dict: MANUAL_REPLACEMENTS_DICT,
trust_labelling: bool,
expected_reactants: List[str],
expected_agents: List[str],
Expand Down