-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
342 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
import pandas as pd | ||
from rdkit import Chem | ||
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors | ||
from math import log10 | ||
|
||
|
||
class CNS_MPO_csv_to_df:
    """Score every compound of a semicolon-separated CSV file with CNS MPO.

    The file must provide the columns ``Id``, ``Smiles`` and ``pKa``.  The
    whole calculation runs in the constructor; afterwards the result table
    is reachable through indexing (``obj[1:3]``, ``obj["MW"]``), ``repr``
    and ``str``.
    """

    # Column order of the table exposed to the user.
    _RESULT_COLUMNS = [
        "Id", "MW", "LogP", "LogD", "pKa", "TPSA", "HBD", "CNS_MPO",
    ]

    def __init__(self, csv_file):
        """Store the CSV location and immediately compute the score table.

        Args:
            csv_file: Path or file-like object accepted by ``pd.read_csv``.
        """
        self.csv_file = csv_file
        self._df = None
        self.calculate()

    def read_csv(self):
        """Load the input file and return it with ``pKa`` as floats.

        pKa values may be written with a decimal comma (``3,581``) or a
        decimal point (``3.581``).
        """
        df = pd.read_csv(self.csv_file, sep=";")
        # Go through str first: when every value already uses a decimal
        # point pandas parses the column as numeric and ``.str`` would fail.
        df["pKa"] = (
            df["pKa"]
            .astype(str)
            .str.replace(",", ".", regex=False)
            .astype(float)
        )
        return df

    def clogD(self, logP, pKa, pH=7.4):
        """Return ``logP - log10(1 + 10 ** (pH - pKa))``.

        Returns NaN when ``logP`` or ``pKa`` is missing.
        """
        # NOTE(review): this is the correction usually written for an acidic
        # centre; confirm it is the intended form for basic compounds.
        if pd.isna(logP) or pd.isna(pKa):
            return float("nan")
        return logP - log10(1 + 10 ** (pH - pKa))

    def csv_file_preparation(self):
        """Compute MW, LogP, HBD, TPSA and LogD for every ``Smiles`` entry.

        Rows whose SMILES is missing or cannot be parsed by RDKit get NaN
        descriptors instead of aborting the whole run.

        Returns:
            DataFrame with the columns MW, LogP, HBD, TPSA, pKa and LogD.
        """
        missing = float("nan")
        descriptors = {"MW": [], "LogP": [], "HBD": [], "TPSA": []}

        for smiles in self._df["Smiles"]:
            # An empty CSV cell arrives as NaN (a float), which RDKit rejects.
            molecule = (
                Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
            )
            if molecule is None:
                for values in descriptors.values():
                    values.append(missing)
                continue
            descriptors["MW"].append(Descriptors.MolWt(molecule))
            descriptors["LogP"].append(Crippen.MolLogP(molecule))
            descriptors["HBD"].append(rdMolDescriptors.CalcNumHBD(molecule))
            descriptors["TPSA"].append(Descriptors.TPSA(molecule))

        df_descriptors = pd.DataFrame(descriptors)
        # Assign by position so a non-default index on the input cannot
        # misalign pKa values with their descriptors.
        df_descriptors["pKa"] = self._df["pKa"].tolist()
        # A plain loop also works for an empty table, where a row-wise
        # DataFrame.apply would hand back a DataFrame instead of a column.
        df_descriptors["LogD"] = [
            self.clogD(logp, pka)
            for logp, pka in zip(df_descriptors["LogP"], df_descriptors["pKa"])
        ]
        return df_descriptors

    def mw_score_func(self, mw):
        """Score molecular weight: 1 up to 360, 0 above 500, linear between.

        The ramp is anchored at (360, 1) and (500, 0) so the score is
        continuous at both thresholds.  Missing values score 0.
        """
        if pd.isna(mw):
            return 0
        if mw <= 360:
            return 1
        if mw <= 500:
            return (500 - mw) / 140
        return 0

    def logp_score_func(self, logp):
        """Score LogP: 1 up to 3, 0 above 5, linear between; missing -> 0."""
        if pd.isna(logp):
            return 0
        if logp <= 3:
            return 1
        if logp <= 5:
            return -0.5 * logp + 2.5
        return 0

    def logd_score_func(self, logd):
        """Score LogD: 1 up to 2, 0 above 4, linear between; missing -> 0."""
        if pd.isna(logd):
            return 0
        if logd <= 2:
            return 1
        if logd <= 4:
            return -0.5 * logd + 2
        return 0

    def pka_score_func(self, pka):
        """Score pKa: 1 up to 8, 0 above 10, linear between; missing -> 0."""
        if pd.isna(pka):
            return 0
        if pka <= 8:
            return 1
        if pka <= 10:
            return -0.5 * pka + 5
        return 0

    def tpsa_score_func(self, tpsa):
        """Score TPSA: 1 within 40-90, ramping to 0 at 20 and at 120.

        Values below 20, above 120 or missing score 0.
        """
        if pd.isna(tpsa):
            return 0
        if 40 <= tpsa <= 90:
            return 1
        if 90 < tpsa <= 120:
            # Exact slope of 1/30: a rounded coefficient overshoots 1 just
            # above 90 and never reaches 0 at 120.
            return (120 - tpsa) / 30
        if 20 <= tpsa < 40:
            return 0.05 * tpsa - 1
        return 0

    def hbd_score_func(self, hbd):
        """Score the H-bond donor count in steps of 0.25 (0 -> 1, 4+ -> 0)."""
        # NOTE(review): fixed 0.25 steps per donor; confirm against the
        # intended CNS MPO definition.
        if pd.isna(hbd):
            return 0
        return {0: 1, 1: 0.75, 2: 0.5, 3: 0.25}.get(hbd, 0)

    def calcCNS_MPO(self):
        """Build the result table: descriptors, their sum of scores, and Id.

        A row with NaN descriptors (unparsable SMILES) scores 0 on every
        descriptor-based term, so its CNS_MPO reflects the pKa term only.
        """
        df_descriptors = self.csv_file_preparation()
        scorers = {
            "MW": self.mw_score_func,
            "LogP": self.logp_score_func,
            "LogD": self.logd_score_func,
            "pKa": self.pka_score_func,
            "TPSA": self.tpsa_score_func,
            "HBD": self.hbd_score_func,
        }

        total = None
        for name, scorer in scorers.items():
            score_column = f"{name}_score"
            df_descriptors[score_column] = df_descriptors[name].apply(scorer)
            partial = df_descriptors[score_column]
            total = partial if total is None else total + partial

        df_descriptors["CNS_MPO"] = total
        df_descriptors["Id"] = self._df["Id"].tolist()

        return df_descriptors[self._RESULT_COLUMNS]

    def calculate(self):
        """Read the CSV, then replace the raw table with the score table."""
        self._df = self.read_csv()
        self._df = self.calcCNS_MPO()

    def __getitem__(self, key):
        """Index the result table.

        An ``int`` or ``slice`` selects rows by position, a ``str`` selects
        a column.

        Raises:
            KeyError: If ``key`` is of any other type.
        """
        if isinstance(key, (int, slice)):
            return self._df.iloc[key]
        if isinstance(key, str):
            return self._df[key]
        raise KeyError(f"Unsupported key type: {type(key)}")

    def __repr__(self):
        return repr(self._df)

    def __str__(self):
        return str(self._df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
import pandas as pd | ||
from rdkit import Chem | ||
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors | ||
from math import log10 | ||
|
||
|
||
class CNS_MPO_single_molecule:
    """Score compounds given directly as SMILES strings with CNS MPO.

    ``smiles_list`` and ``pKa_list`` are parallel sequences.  The whole
    calculation runs in the constructor; afterwards the result table is
    reachable through indexing (``obj[0]``, ``obj["MW"]``), ``repr`` and
    ``str``.
    """

    # Column order of the table exposed to the user.
    _RESULT_COLUMNS = ["MW", "LogP", "HBD", "TPSA", "pKa", "LogD", "CNS_MPO"]

    def __init__(self, smiles_list, pKa_list):
        """Validate the inputs and immediately compute the score table.

        Args:
            smiles_list: Sequence of SMILES strings.
            pKa_list: Sequence of pKa values, one per SMILES.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        if len(smiles_list) != len(pKa_list):
            raise ValueError(
                "Length of smiles_list must be equal to length of pKa_list"
            )
        self.smiles_list = smiles_list
        self.pKa_list = pKa_list
        self._df = None
        self.calculate()

    def clogD(self, logP, pKa, pH=7.4):
        """Return ``logP - log10(1 + 10 ** (pH - pKa))``.

        Returns NaN when ``logP`` or ``pKa`` is missing.
        """
        # NOTE(review): this is the correction usually written for an acidic
        # centre; confirm it is the intended form for basic compounds.
        if pd.isna(logP) or pd.isna(pKa):
            return float("nan")
        return logP - log10(1 + 10 ** (pH - pKa))

    def csv_file_preparation(self):
        """Compute MW, LogP, HBD, TPSA and LogD for every SMILES.

        Entries that are not strings or cannot be parsed by RDKit get NaN
        descriptors.  NaN (not None) is used so the columns stay numeric
        even when every entry is invalid.

        Returns:
            DataFrame with the columns MW, LogP, HBD, TPSA, pKa and LogD.
        """
        missing = float("nan")
        descriptors = {"MW": [], "LogP": [], "HBD": [], "TPSA": []}

        for smiles in self.smiles_list:
            molecule = (
                Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
            )
            if molecule is None:
                for values in descriptors.values():
                    values.append(missing)
                continue
            descriptors["MW"].append(Descriptors.MolWt(molecule))
            descriptors["LogP"].append(Crippen.MolLogP(molecule))
            descriptors["HBD"].append(rdMolDescriptors.CalcNumHBD(molecule))
            descriptors["TPSA"].append(Descriptors.TPSA(molecule))

        df_descriptors = pd.DataFrame(descriptors)
        df_descriptors["pKa"] = list(self.pKa_list)
        # A plain loop also works for empty input, where a row-wise
        # DataFrame.apply would hand back a DataFrame instead of a column.
        df_descriptors["LogD"] = [
            self.clogD(logp, pka)
            for logp, pka in zip(df_descriptors["LogP"], df_descriptors["pKa"])
        ]
        return df_descriptors

    def mw_score_func(self, mw):
        """Score molecular weight: 1 up to 360, 0 above 500, linear between.

        The ramp is anchored at (360, 1) and (500, 0) so the score is
        continuous at both thresholds.  Missing values (None/NaN) score 0.
        """
        if pd.isna(mw):
            return 0
        if mw <= 360:
            return 1
        if mw <= 500:
            return (500 - mw) / 140
        return 0

    def logp_score_func(self, logp):
        """Score LogP: 1 up to 3, 0 above 5, linear between; missing -> 0."""
        if pd.isna(logp):
            return 0
        if logp <= 3:
            return 1
        if logp <= 5:
            return -0.5 * logp + 2.5
        return 0

    def logd_score_func(self, logd):
        """Score LogD: 1 up to 2, 0 above 4, linear between; missing -> 0."""
        if pd.isna(logd):
            return 0
        if logd <= 2:
            return 1
        if logd <= 4:
            return -0.5 * logd + 2
        return 0

    def pka_score_func(self, pka):
        """Score pKa: 1 up to 8, 0 above 10, linear between; missing -> 0."""
        if pd.isna(pka):
            return 0
        if pka <= 8:
            return 1
        if pka <= 10:
            return -0.5 * pka + 5
        return 0

    def tpsa_score_func(self, tpsa):
        """Score TPSA: 1 within 40-90, ramping to 0 at 20 and at 120.

        Values below 20, above 120 or missing score 0.
        """
        if pd.isna(tpsa):
            return 0
        if 40 <= tpsa <= 90:
            return 1
        if 90 < tpsa <= 120:
            # Exact slope of 1/30: a rounded coefficient overshoots 1 just
            # above 90 and never reaches 0 at 120.
            return (120 - tpsa) / 30
        if 20 <= tpsa < 40:
            return 0.05 * tpsa - 1
        return 0

    def hbd_score_func(self, hbd):
        """Score the H-bond donor count in steps of 0.25 (0 -> 1, 4+ -> 0)."""
        # NOTE(review): fixed 0.25 steps per donor; confirm against the
        # intended CNS MPO definition.
        if pd.isna(hbd):
            return 0
        return {0: 1, 1: 0.75, 2: 0.5, 3: 0.25}.get(hbd, 0)

    def calcCNS_MPO(self):
        """Build the result table: descriptors plus the sum of six scores.

        A row with NaN descriptors (invalid SMILES) scores 0 on every
        descriptor-based term, so its CNS_MPO reflects the pKa term only.
        """
        df_descriptors = self.csv_file_preparation()
        scorers = {
            "MW": self.mw_score_func,
            "LogP": self.logp_score_func,
            "LogD": self.logd_score_func,
            "pKa": self.pka_score_func,
            "TPSA": self.tpsa_score_func,
            "HBD": self.hbd_score_func,
        }

        total = None
        for name, scorer in scorers.items():
            score_column = f"{name}_score"
            df_descriptors[score_column] = df_descriptors[name].apply(scorer)
            partial = df_descriptors[score_column]
            total = partial if total is None else total + partial

        df_descriptors["CNS_MPO"] = total

        # Expose only the descriptor and final-score columns.
        return df_descriptors[self._RESULT_COLUMNS]

    def calculate(self):
        """Compute and store the score table."""
        self._df = self.calcCNS_MPO()

    def __getitem__(self, key):
        """Index the result table.

        An ``int`` or ``slice`` selects rows by position, a ``str`` selects
        a column.

        Raises:
            KeyError: If ``key`` is of any other type.
        """
        if isinstance(key, (int, slice)):
            return self._df.iloc[key]
        if isinstance(key, str):
            return self._df[key]
        raise KeyError(f"Unsupported key type: {type(key)}")

    def __repr__(self):
        return repr(self._df)

    def __str__(self):
        return str(self._df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
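The CSV file must use a semicolon (";") as the column separator, not a comma (","), so that each value ends up in its own column. | ||
It must contain the columns Id, Smiles and pKa; pKa values are written with a decimal comma, as in the example file (e.g. 3,581). |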
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
Id;Smiles;pKa | ||
telmisartan;CCCC1=NC2=C(N1CC3=CC=C(C4=CC=CC=C4C(O)=O)C=C3)C=C(C5=NC6=CC=CC=C6N5C)C=C2C;3,581 | ||
ibuprofen;CC(CC1=CC=C(C(C(O)=O)C)C=C1)C;4,366 | ||
sulfasalazine;OC(C1=CC(/N=N/C2=CC=C(S(=O)(NC3=NC=CC=C3)=O)C=C2)=CC=C1O)=O;2,715 | ||
risperidone;O=C1C(CCN2CCC(CC2)C3=NOC4=C3C=CC(F)=C4)=C(N=C5N1CCCC5)C;8,765 | ||
paroxetine;C1CNCC(C1C2=CC=C(C=C2)F)COC3=CC4=C(C=C3)OCO4;9,365 | ||
methylphenidate;O=C(OC)C(C1CCCCN1)C2=CC=CC=C2;8,375 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# --- Scoring compounds passed directly as SMILES strings ---
example_smiles = [
    "C(C=1CCN(C5)CCC(C5)c(n4)c(c3o4)ccc(c3)F)(=O)N(C2)C(CCC2)=NC1C",
    "C1CNCC(C1C2=CC=C(C=C2)F)COC3=CC4=C(C=C3)OCO4",
]
example_pka = [8.765, 9.365]
single_scores = CNS_MPO_single_molecule(
    smiles_list=example_smiles,
    pKa_list=example_pka,
)
print(single_scores)


# --- Scoring compounds listed in a CSV file ---
filepath = 'path_to_CSV_files'
csv_scores = CNS_MPO_csv_to_df(csv_file=filepath)

# Row slicing returns a regular pandas object, so rows and columns can be
# combined freely to get exactly the output that is needed.
selected_rows = csv_scores[1:3]
print(selected_rows[['Id', 'MW']])  # 1: chosen rows, several columns
print(selected_rows['Id'])  # 2: chosen rows, one column
print(selected_rows)  # 3: chosen rows, all columns
print(csv_scores)  # 4: the whole result table
|