Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
Adam-maz authored Aug 26, 2024
1 parent 8fa523c commit 48e0712
Show file tree
Hide file tree
Showing 5 changed files with 342 additions and 0 deletions.
148 changes: 148 additions & 0 deletions cns_mpo_csv_to_df.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors
from math import log10


class CNS_MPO_csv_to_df:
    """Compute CNS MPO scores for every molecule listed in a CSV file.

    The file is read with ``;`` as the column separator and must provide the
    columns ``Id``, ``Smiles`` and ``pKa``.  For each row the RDKit
    descriptors MW, LogP, HBD and TPSA are calculated, LogD is derived from
    LogP and pKa, every property is mapped to a desirability score between 0
    and 1, and the six scores are summed into ``CNS_MPO``.

    The result is available through indexing (``obj[1:3]``, ``obj["MW"]``,
    ``obj[["Id", "MW"]]``) and through ``print(obj)``.

    Rows whose SMILES cannot be parsed keep NaN descriptors and receive a
    score of 0 for each missing property instead of aborting the whole run.
    """

    def __init__(self, csv_file):
        """Store the CSV location and run the whole calculation immediately.

        Args:
            csv_file: Path or file-like object accepted by ``pandas.read_csv``.
        """
        self.csv_file = csv_file
        self._df = None
        self.calculate()

    @staticmethod
    def _is_missing(value):
        """Return True when *value* is None or NaN."""
        return value is None or bool(pd.isna(value))

    @staticmethod
    def _falling_score(value, full, zero):
        """Score 1 up to *full*, 0 above *zero*, linear in between.

        Missing values score 0.
        """
        if value is None or pd.isna(value):
            return 0
        if value <= full:
            return 1
        if value > zero:
            return 0
        return (zero - value) / (zero - full)

    def read_csv(self):
        """Load the semicolon-separated file and return it with a float pKa.

        pKa values may use a decimal comma ("3,581") or a decimal point.  The
        column is converted to text first because pandas already parses
        dot-decimal columns as numbers, on which the ``.str`` accessor fails.

        Returns:
            pandas.DataFrame with the file content and a float ``pKa`` column.

        Raises:
            ValueError: if a pKa entry is not a number.
        """
        df = pd.read_csv(self.csv_file, sep=";")
        df["pKa"] = (
            df["pKa"].astype(str).str.replace(",", ".", regex=False).astype(float)
        )
        return df

    def clogD(self, logP, pKa, pH=7.4):
        """Estimate LogD at *pH* from LogP and pKa.

        Uses ``logP - log10(1 + 10 ** (pH - pKa))``, i.e. the correction grows
        as pH rises above pKa (the form for a monoprotic acid).
        NOTE(review): basic compounds would need ``10 ** (pKa - pH)`` instead;
        confirm which kind of pKa the input is expected to hold.

        Returns:
            The estimated LogD, or NaN when LogP or pKa is missing.
        """
        if self._is_missing(logP) or self._is_missing(pKa):
            return float("nan")
        return logP - log10(1 + 10 ** (pH - pKa))

    def csv_file_preparation(self):
        """Build the descriptor table (MW, LogP, HBD, TPSA, pKa, LogD).

        Returns:
            pandas.DataFrame indexed like the loaded CSV, one row per molecule.
        """
        dictionary = {"MW": [], "LogP": [], "HBD": [], "TPSA": []}
        missing = float("nan")

        for cpd in self._df["Smiles"]:
            # Empty cells arrive as NaN (a float); RDKit only accepts strings.
            molecule = Chem.MolFromSmiles(cpd) if isinstance(cpd, str) else None
            if molecule is None:
                # Unparsable SMILES: keep the row so indices stay aligned.
                for values in dictionary.values():
                    values.append(missing)
                continue

            dictionary["MW"].append(Descriptors.MolWt(molecule))
            dictionary["LogP"].append(Crippen.MolLogP(molecule))
            dictionary["HBD"].append(rdMolDescriptors.CalcNumHBD(molecule))
            dictionary["TPSA"].append(Descriptors.TPSA(molecule))

        df_descriptors = pd.DataFrame(dictionary, index=self._df.index)
        df_descriptors["pKa"] = self._df["pKa"]
        # A plain loop (instead of DataFrame.apply) also works for empty input.
        df_descriptors["LogD"] = [
            self.clogD(logp, pka)
            for logp, pka in zip(df_descriptors["LogP"], df_descriptors["pKa"])
        ]
        return df_descriptors

    def mw_score_func(self, mw):
        """Score molecular weight: 1 up to 360, falling linearly to 0 at 500."""
        return self._falling_score(mw, 360, 500)

    def logp_score_func(self, logp):
        """Score LogP: 1 up to 3, falling linearly to 0 at 5."""
        return self._falling_score(logp, 3, 5)

    def logd_score_func(self, logd):
        """Score LogD: 1 up to 2, falling linearly to 0 at 4."""
        return self._falling_score(logd, 2, 4)

    def pka_score_func(self, pka):
        """Score pKa: 1 up to 8, falling linearly to 0 at 10."""
        return self._falling_score(pka, 8, 10)

    def tpsa_score_func(self, tpsa):
        """Score TPSA: 1 within 40-90, ramps on 20-40 and 90-120, else 0."""
        if self._is_missing(tpsa):
            return 0
        if 40 <= tpsa <= 90:
            return 1
        if 90 < tpsa <= 120:
            # Exact slope 1/30 so the ramp runs from 1 at 90 to 0 at 120.
            return (120 - tpsa) / 30
        if 20 <= tpsa < 40:
            return (tpsa - 20) / 20
        return 0

    def hbd_score_func(self, hbd):
        """Score the H-bond donor count: 0->1, 1->0.75, 2->0.5, 3->0.25, else 0."""
        if self._is_missing(hbd):
            return 0
        return {0: 1, 1: 0.75, 2: 0.5, 3: 0.25}.get(hbd, 0)

    def calcCNS_MPO(self):
        """Score every descriptor and sum the six scores into ``CNS_MPO``.

        Returns:
            pandas.DataFrame with the columns Id, MW, LogP, LogD, pKa, TPSA,
            HBD and CNS_MPO.
        """
        df_descriptors = self.csv_file_preparation()
        scorers = {
            "MW": self.mw_score_func,
            "LogP": self.logp_score_func,
            "LogD": self.logd_score_func,
            "pKa": self.pka_score_func,
            "TPSA": self.tpsa_score_func,
            "HBD": self.hbd_score_func,
        }
        for column, scorer in scorers.items():
            df_descriptors[f"{column}_score"] = df_descriptors[column].apply(scorer)

        score_columns = [f"{column}_score" for column in scorers]
        df_descriptors["CNS_MPO"] = df_descriptors[score_columns].sum(axis=1)
        df_descriptors["Id"] = self._df["Id"]

        return df_descriptors[
            ["Id", "MW", "LogP", "LogD", "pKa", "TPSA", "HBD", "CNS_MPO"]
        ]

    def calculate(self):
        """Read the CSV, then replace the stored frame with the result table."""
        self._df = self.read_csv()
        self._df = self.calcCNS_MPO()

    def __getitem__(self, key):
        """Return rows by position (int or slice) or columns by name(s).

        Raises:
            KeyError: if *key* is not an int, slice, str or list.
        """
        if isinstance(key, (int, slice)):
            return self._df.iloc[key]
        if isinstance(key, (str, list)):
            return self._df[key]
        raise KeyError(f"Unsupported key type: {type(key)}")

    def __repr__(self):
        return repr(self._df)

    def __str__(self):
        return str(self._df)
166 changes: 166 additions & 0 deletions cns_mpo_single_molecule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors
from math import log10


class CNS_MPO_single_molecule:
    """Compute CNS MPO scores for molecules given as SMILES plus pKa values.

    For each SMILES the RDKit descriptors MW, LogP, HBD and TPSA are
    calculated, LogD is derived from LogP and the matching pKa, every
    property is mapped to a desirability score between 0 and 1, and the six
    scores are summed into ``CNS_MPO``.

    The result is available through indexing (``obj[0]``, ``obj[0:2]``,
    ``obj["MW"]``, ``obj[["MW", "CNS_MPO"]]``) and through ``print(obj)``.

    A SMILES that cannot be parsed yields NaN descriptors, and each missing
    property contributes a score of 0.
    """

    def __init__(self, smiles_list, pKa_list):
        """Validate the inputs and run the whole calculation immediately.

        Args:
            smiles_list: Sequence of SMILES strings.
            pKa_list: Sequence of pKa values, one per SMILES, in the same order.

        Raises:
            ValueError: if the two sequences differ in length.
        """
        if len(smiles_list) != len(pKa_list):
            raise ValueError(
                "Length of smiles_list must be equal to length of pKa_list"
            )
        self.smiles_list = smiles_list
        self.pKa_list = pKa_list
        self._df = None
        self.calculate()

    @staticmethod
    def _is_missing(value):
        """Return True when *value* is None or NaN."""
        return value is None or bool(pd.isna(value))

    @staticmethod
    def _falling_score(value, full, zero):
        """Score 1 up to *full*, 0 above *zero*, linear in between.

        Missing values score 0.
        """
        if value is None or pd.isna(value):
            return 0
        if value <= full:
            return 1
        if value > zero:
            return 0
        return (zero - value) / (zero - full)

    def clogD(self, logP, pKa, pH=7.4):
        """Estimate LogD at *pH* from LogP and pKa.

        Uses ``logP - log10(1 + 10 ** (pH - pKa))``, i.e. the correction grows
        as pH rises above pKa (the form for a monoprotic acid).
        NOTE(review): basic compounds would need ``10 ** (pKa - pH)`` instead;
        confirm which kind of pKa the input is expected to hold.

        Returns:
            The estimated LogD, or NaN when LogP or pKa is missing.
        """
        if self._is_missing(logP) or self._is_missing(pKa):
            return float("nan")
        return logP - log10(1 + 10 ** (pH - pKa))

    def csv_file_preparation(self):
        """Build the descriptor table (MW, LogP, HBD, TPSA, pKa, LogD).

        Returns:
            pandas.DataFrame with one row per input SMILES, in input order.
        """
        dictionary = {"MW": [], "LogP": [], "HBD": [], "TPSA": []}
        # NaN rather than None keeps the columns numeric even when every
        # SMILES is invalid, so the LogD arithmetic cannot hit a TypeError.
        missing = float("nan")

        for cpd in self.smiles_list:
            molecule = Chem.MolFromSmiles(cpd) if isinstance(cpd, str) else None
            if molecule is None:
                # Handle invalid SMILES
                for values in dictionary.values():
                    values.append(missing)
                continue

            dictionary["MW"].append(Descriptors.MolWt(molecule))
            dictionary["LogP"].append(Crippen.MolLogP(molecule))
            dictionary["HBD"].append(rdMolDescriptors.CalcNumHBD(molecule))
            dictionary["TPSA"].append(Descriptors.TPSA(molecule))

        df_descriptors = pd.DataFrame(dictionary)
        # list(...) forces positional assignment even if a pandas Series with
        # its own index was passed in.
        df_descriptors["pKa"] = list(self.pKa_list)
        # A plain loop (instead of DataFrame.apply) also works for empty input.
        df_descriptors["LogD"] = [
            self.clogD(logp, pka)
            for logp, pka in zip(df_descriptors["LogP"], df_descriptors["pKa"])
        ]
        return df_descriptors

    def mw_score_func(self, mw):
        """Score molecular weight: 1 up to 360, falling linearly to 0 at 500."""
        return self._falling_score(mw, 360, 500)

    def logp_score_func(self, logp):
        """Score LogP: 1 up to 3, falling linearly to 0 at 5."""
        return self._falling_score(logp, 3, 5)

    def logd_score_func(self, logd):
        """Score LogD: 1 up to 2, falling linearly to 0 at 4."""
        return self._falling_score(logd, 2, 4)

    def pka_score_func(self, pka):
        """Score pKa: 1 up to 8, falling linearly to 0 at 10."""
        return self._falling_score(pka, 8, 10)

    def tpsa_score_func(self, tpsa):
        """Score TPSA: 1 within 40-90, ramps on 20-40 and 90-120, else 0."""
        if self._is_missing(tpsa):
            return 0
        if 40 <= tpsa <= 90:
            return 1
        if 90 < tpsa <= 120:
            # Exact slope 1/30 so the ramp runs from 1 at 90 to 0 at 120.
            return (120 - tpsa) / 30
        if 20 <= tpsa < 40:
            return (tpsa - 20) / 20
        return 0

    def hbd_score_func(self, hbd):
        """Score the H-bond donor count: 0->1, 1->0.75, 2->0.5, 3->0.25, else 0."""
        if self._is_missing(hbd):
            return 0
        return {0: 1, 1: 0.75, 2: 0.5, 3: 0.25}.get(hbd, 0)

    def calcCNS_MPO(self):
        """Score every descriptor and sum the six scores into ``CNS_MPO``.

        Returns:
            pandas.DataFrame with the columns MW, LogP, HBD, TPSA, pKa, LogD
            and CNS_MPO.
        """
        df_descriptors = self.csv_file_preparation()
        scorers = {
            "MW": self.mw_score_func,
            "LogP": self.logp_score_func,
            "LogD": self.logd_score_func,
            "pKa": self.pka_score_func,
            "TPSA": self.tpsa_score_func,
            "HBD": self.hbd_score_func,
        }
        for column, scorer in scorers.items():
            df_descriptors[f"{column}_score"] = df_descriptors[column].apply(scorer)

        score_columns = [f"{column}_score" for column in scorers]
        df_descriptors["CNS_MPO"] = df_descriptors[score_columns].sum(axis=1)

        # Return only the specified columns
        return df_descriptors[
            ["MW", "LogP", "HBD", "TPSA", "pKa", "LogD", "CNS_MPO"]
        ]

    def calculate(self):
        """Run the calculation and store the result table."""
        self._df = self.calcCNS_MPO()

    def __getitem__(self, key):
        """Return rows by position (int or slice) or columns by name(s).

        Raises:
            KeyError: if *key* is not an int, slice, str or list.
        """
        if isinstance(key, (int, slice)):
            return self._df.iloc[key]
        if isinstance(key, (str, list)):
            return self._df[key]
        raise KeyError(f"Unsupported key type: {type(key)}")

    def __repr__(self):
        return repr(self._df)

    def __str__(self):
        return str(self._df)
2 changes: 2 additions & 0 deletions csv_file_instruction.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
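The input CSV file must use a semicolon (";") as the column separator, not a comma (","), so that every value ends up in its own column.
The file must contain the columns Id, Smiles and pKa; pKa values are written with a decimal comma, as in the example file (e.g. 3,581).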
7 changes: 7 additions & 0 deletions csv_file_with_example_molecules.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Id;Smiles;pKa
telmisartan;CCCC1=NC2=C(N1CC3=CC=C(C4=CC=CC=C4C(O)=O)C=C3)C=C(C5=NC6=CC=CC=C6N5C)C=C2C;3,581
ibuprofen;CC(CC1=CC=C(C(C(O)=O)C)C=C1)C;4,366
sulfasalazine;OC(C1=CC(/N=N/C2=CC=C(S(=O)(NC3=NC=CC=C3)=O)C=C2)=CC=C1O)=O;2,715
risperidone;O=C1C(CCN2CCC(CC2)C3=NOC4=C3C=CC(F)=C4)=C(N=C5N1CCCC5)C;8,765
paroxetine;C1CNCC(C1C2=CC=C(C=C2)F)COC3=CC4=C(C=C3)OCO4;9,365
methylphenidate;O=C(OC)C(C1CCCCN1)C2=CC=CC=C2;8,375
19 changes: 19 additions & 0 deletions example_of_use.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Usage examples for CNS_MPO_single_molecule and CNS_MPO_csv_to_df."""


def main():
    """Print the CNS MPO tables produced by both calculators."""
    # Imported here so that merely importing this example module does not
    # require the calculator modules (and RDKit) to be importable.
    from cns_mpo_csv_to_df import CNS_MPO_csv_to_df
    from cns_mpo_single_molecule import CNS_MPO_single_molecule

    # Example of CNS_MPO_single_molecule() use
    single = CNS_MPO_single_molecule(
        smiles_list=[
            "C(C=1CCN(C5)CCC(C5)c(n4)c(c3o4)ccc(c3)F)(=O)N(C2)C(CCC2)=NC1C",
            "C1CNCC(C1C2=CC=C(C=C2)F)COC3=CC4=C(C=C3)OCO4",
        ],
        pKa_list=[8.765, 9.365],
    )
    print(single)

    # Example of CNS_MPO_csv_to_df() use.
    # Points at the example file shipped with the repository; replace it with
    # the path to your own semicolon-separated CSV file.
    filepath = "csv_file_with_example_molecules.csv"
    table = CNS_MPO_csv_to_df(csv_file=filepath)
    # 1: rows can be selected by slicing and columns by name afterwards
    print(table[1:3][["Id", "MW"]])
    # 2: a single column of the selected rows
    print(table[1:3]["Id"])
    # 3: all columns of the selected rows
    print(table[1:3])
    # 4: the whole result table
    print(table)


if __name__ == "__main__":
    main()

0 comments on commit 48e0712

Please sign in to comment.