Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lwreg.retrieve() now returns a dictionary #40

Merged
merged 3 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 40 additions & 65 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,83 +59,58 @@ ERROR:root:Compound already registered

### Python
```
In [1]: import lwreg
>>> import lwreg

In [3]: lwreg.initdb()
>>> from lwreg import utils

>>> lwreg.set_default_config(utils.defaultConfig()) # you generally will want to provide more information about the database

>>> lwreg.initdb()
This will destroy any existing information in the registration database.
are you sure? [yes/no]: yes
Out[3]: True
True

In [4]: lwreg.register(smiles='CCO')
Out[4]: 1
>>> lwreg.register(smiles='CCO')
1

In [5]: lwreg.register(smiles='CCOC')
Out[5]: 2
>>> lwreg.register(smiles='CCOC')
2

In [6]: from rdkit import Chem
>>> from rdkit import Chem

In [7]: m = Chem.MolFromSmiles('CCOCC')
>>> m = Chem.MolFromSmiles('CCOCC')

In [8]: lwreg.register(mol=m)
Out[8]: 3
>>> lwreg.register(mol=m)
3

In [9]: lwreg.register(mol=m)
>>> lwreg.register(mol=m)
---------------------------------------------------------------------------
IntegrityError Traceback (most recent call last)
Input In [9], in <cell line: 1>()
Input In [10], in <cell line: 1>()
----> 1 lwreg.register(mol=m)

File ~/Code/lightweight-registration/lwreg/utils.py:249, in register(config, mol, molfile, molblock, smiles, escape, fail_on_duplicate, no_verbose)
247 cn = _connect(config)
248 curs = cn.cursor()
--> 249 mrn = _register_mol(tpl, escape, cn, curs, config, fail_on_duplicate)
250 if not no_verbose:
251 print(mrn)

File ~/Code/lightweight-registration/lwreg/utils.py:184, in _register_mol(tpl, escape, cn, curs, config, failOnDuplicate)
181 mhash, layers = hash_mol(sMol, escape=escape, config=config)
183 # will fail if the fullhash is already there
--> 184 curs.execute(
185 _replace_placeholders(
186 'insert into hashes values (?,?,?,?,?,?,?,?,?)'), (
187 mrn,
188 mhash,
189 layers[RegistrationHash.HashLayer.FORMULA],
190 layers[RegistrationHash.HashLayer.CANONICAL_SMILES],
191 layers[RegistrationHash.HashLayer.NO_STEREO_SMILES],
192 layers[RegistrationHash.HashLayer.TAUTOMER_HASH],
193 layers[RegistrationHash.HashLayer.NO_STEREO_TAUTOMER_HASH],
194 layers[RegistrationHash.HashLayer.ESCAPE],
195 layers[RegistrationHash.HashLayer.SGROUP_DATA],
196 ))
198 cn.commit()
199 except _violations:

... DETAILS REMOVED ...

IntegrityError: UNIQUE constraint failed: hashes.fullhash

In [10]: lwreg.query(smiles='CCOC')
Out[10]: [2]
>>> lwreg.query(smiles='CCOC')
[2]

In [11]: lwreg.query(smiles='CCOCC')
Out[11]: [3]
>>> lwreg.query(smiles='CCOCC')
[3]

In [12]: lwreg.query(smiles='CCOCO')
Out[12]: []
>>> lwreg.query(smiles='CCOCO')
[]

In [13]: lwreg.retrieve(id=2)
Out[13]:
((2,
'\n RDKit 2D\n\n 0 0 0 0 0 0 0 0 0 0999 V3000\nM V30 BEGIN CTAB\nM V30 COUNTS 4 3 0 0 0\nM V30 BEGIN ATOM\nM V30 1 C 0.000000 0.000000 0.000000 0\nM V30 2 C 1.299038 0.750000 0.000000 0\nM V30 3 O 2.598076 -0.000000 0.000000 0\nM V30 4 C 3.897114 0.750000 0.000000 0\nM V30 END ATOM\nM V30 BEGIN BOND\nM V30 1 1 1 2\nM V30 2 1 2 3\nM V30 3 1 3 4\nM V30 END BOND\nM V30 END CTAB\nM END\n',
'mol'),)
>>> lwreg.retrieve(id=2)
{2: ('\n RDKit 2D\n\n 0 0 0 0 0 0 0 0 0 0999 V3000\nM V30 BEGIN CTAB\nM V30 COUNTS 4 3 0 0 0\nM V30 BEGIN ATOM\nM V30 1 C 0.000000 0.000000 0.000000 0\nM V30 2 C 1.299038 0.750000 0.000000 0\nM V30 3 O 2.598076 -0.000000 0.000000 0\nM V30 4 C 3.897114 0.750000 0.000000 0\nM V30 END ATOM\nM V30 BEGIN BOND\nM V30 1 1 1 2\nM V30 2 1 2 3\nM V30 3 1 3 4\nM V30 END BOND\nM V30 END CTAB\nM END\n',
'mol')}

In [14]: lwreg.retrieve(ids=[2,3])
Out[14]:
((2,
'\n RDKit 2D\n\n 0 0 0 0 0 0 0 0 0 0999 V3000\nM V30 BEGIN CTAB\nM V30 COUNTS 4 3 0 0 0\nM V30 BEGIN ATOM\nM V30 1 C 0.000000 0.000000 0.000000 0\nM V30 2 C 1.299038 0.750000 0.000000 0\nM V30 3 O 2.598076 -0.000000 0.000000 0\nM V30 4 C 3.897114 0.750000 0.000000 0\nM V30 END ATOM\nM V30 BEGIN BOND\nM V30 1 1 1 2\nM V30 2 1 2 3\nM V30 3 1 3 4\nM V30 END BOND\nM V30 END CTAB\nM END\n',
>>> lwreg.retrieve(ids=[2,3])
{2: ('\n RDKit 2D\n\n 0 0 0 0 0 0 0 0 0 0999 V3000\nM V30 BEGIN CTAB\nM V30 COUNTS 4 3 0 0 0\nM V30 BEGIN ATOM\nM V30 1 C 0.000000 0.000000 0.000000 0\nM V30 2 C 1.299038 0.750000 0.000000 0\nM V30 3 O 2.598076 -0.000000 0.000000 0\nM V30 4 C 3.897114 0.750000 0.000000 0\nM V30 END ATOM\nM V30 BEGIN BOND\nM V30 1 1 1 2\nM V30 2 1 2 3\nM V30 3 1 3 4\nM V30 END BOND\nM V30 END CTAB\nM END\n',
'mol'),
(3,
'\n RDKit 2D\n\n 0 0 0 0 0 0 0 0 0 0999 V3000\nM V30 BEGIN CTAB\nM V30 COUNTS 5 4 0 0 0\nM V30 BEGIN ATOM\nM V30 1 C 0.000000 0.000000 0.000000 0\nM V30 2 C 1.299038 0.750000 0.000000 0\nM V30 3 O 2.598076 -0.000000 0.000000 0\nM V30 4 C 3.897114 0.750000 0.000000 0\nM V30 5 C 5.196152 -0.000000 0.000000 0\nM V30 END ATOM\nM V30 BEGIN BOND\nM V30 1 1 1 2\nM V30 2 1 2 3\nM V30 3 1 3 4\nM V30 4 1 4 5\nM V30 END BOND\nM V30 END CTAB\nM END\n',
'mol'))
3: ('\n RDKit 2D\n\n 0 0 0 0 0 0 0 0 0 0999 V3000\nM V30 BEGIN CTAB\nM V30 COUNTS 5 4 0 0 0\nM V30 BEGIN ATOM\nM V30 1 C 0.000000 0.000000 0.000000 0\nM V30 2 C 1.299038 0.750000 0.000000 0\nM V30 3 O 2.598076 -0.000000 0.000000 0\nM V30 4 C 3.897114 0.750000 0.000000 0\nM V30 5 C 5.196152 -0.000000 0.000000 0\nM V30 END ATOM\nM V30 BEGIN BOND\nM V30 1 1 1 2\nM V30 2 1 2 3\nM V30 3 1 3 4\nM V30 4 1 4 5\nM V30 END BOND\nM V30 END CTAB\nM END\n',
'mol')}


```
Expand All @@ -148,10 +123,10 @@ Start with a couple of examples showing what the 'fragment' and 'charge' built-i

```
>>> config['standardization'] = 'fragment'
>>> Chem.MolToSmiles(lwreg.utils.standardize_mol(Chem.MolFromSmiles('CC[O-].[Na+]'),config=config))
>>> Chem.MolToSmiles(lwreg.standardize_mol(Chem.MolFromSmiles('CC[O-].[Na+]'),config=config))
'CC[O-]'
>>> config['standardization'] = 'charge'
>>> Chem.MolToSmiles(lwreg.utils.standardize_mol(Chem.MolFromSmiles('CC[O-].[Na+]'),config=config))
>>> Chem.MolToSmiles(lwreg.standardize_mol(Chem.MolFromSmiles('CC[O-].[Na+]'),config=config))
'CCO'
```

Expand All @@ -163,23 +138,23 @@ Now define a custom filter which rejects (by returning None) molecules which hav
... return mol
...
>>> config['standardization'] = reject_charged_molecules
>>> Chem.MolToSmiles(lwreg.utils.standardize_mol(Chem.MolFromSmiles('CC[O-].[Na+]'),config=config))
>>> Chem.MolToSmiles(lwreg.standardize_mol(Chem.MolFromSmiles('CC[O-].[Na+]'),config=config))
'CC[O-].[Na+]'
```

Here's an example which fails:

```
>>> lwreg.utils.standardize_mol(Chem.MolFromSmiles('CC[O-]'),config=config) is None
>>> lwreg.standardize_mol(Chem.MolFromSmiles('CC[O-]'),config=config) is None
True
```

We can chain standardization/filtering operations together by providing a list. The individual operations are run in order. Here's an example where we attempt to neutralise the molecule by finding the charge parent and then apply our reject_charged_molecules filter:
```
>>> config['standardization'] = ['charge',reject_charged_molecules]
>>> lwreg.utils.standardize_mol(Chem.MolFromSmiles('CC[O-]'),config=config) is None
>>> lwreg.standardize_mol(Chem.MolFromSmiles('CC[O-]'),config=config) is None
False
>>> lwreg.utils.standardize_mol(Chem.MolFromSmiles('CC[N+](C)(C)C'),config=config) is None
>>> lwreg.standardize_mol(Chem.MolFromSmiles('CC[N+](C)(C)C'),config=config) is None
True
```
That last one failed because the quaternary nitrogen can't be neutralized.
Expand All @@ -197,7 +172,7 @@ Note that once a database is created in `registerConformers` mode, it probably s

- `register()` and `bulk_register()` require molecules to have associated conformers. Both return `(molregno, conf_id)` tuples instead of just `molregno`s
- `query()`: if called with the `ids` argument, this will return all of the conformers for the supplied molregnos as `(molregno, conf_id)` tuples. If called with a molecule, the conformer of the molecule will be hashed and looked up in the `conformers` table, and a list of `(molregno, conf_id)` tuples is returned.
- `retrieve()`: if called with `(molregno, conf_id)` tuple(s), this will return `(molregno, conf_id, molblock)` tuples where the `molblock`s contain the coordinates of the registered conformers.
- `retrieve()`: if called with `(molregno, conf_id)` tuple(s), this will return a dictionary with `(molregno, conf_id)` tuples as keys and `(molblock, 'mol')` tuples as values, where the `molblock`s contain the coordinates of the registered conformers.

### Hashing conformers for registration

Expand All @@ -215,7 +190,7 @@ create table registration_metadata (key text, value text);
create table hashes (molregno integer primary key, fullhash text unique,
formula text, canonical_smiles text, no_stereo_smiles text,
tautomer_hash text, no_stereo_tautomer_hash text, "escape" text, sgroup_data text, rdkitVersion text);
create table orig_data (molregno integer primary key, data text, datatype text);
create table orig_data (molregno integer primary key, data text, datatype text, timestamp DATETIME DEFAULT CURRENT_TIMESTAMP);
create table molblocks (molregno integer primary key, molblock text, standardization text);
```

Expand Down
57 changes: 42 additions & 15 deletions lwreg/test_lwreg.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,28 +246,27 @@ def testRetrieve(self):
self.assertEqual(len(res), 0)
res = utils.retrieve(ids=[1], config=self._config)
self.assertEqual(len(res), 1)
tpl = res[0]
self.assertEqual(len(tpl), 3)
self.assertEqual(tpl[0], 1)
mb = Chem.MolFromMolBlock(tpl[1])
tpl = res[1]
self.assertEqual(len(tpl), 2)
mb = Chem.MolFromMolBlock(tpl[0])
self.assertEqual(
utils.query(smiles=Chem.MolToSmiles(mb), config=self._config), [1])
self.assertEqual(tpl[2], 'mol')
self.assertEqual(tpl[1], 'mol')
res = utils.retrieve(ids=[100], config=self._config)
self.assertEqual(len(res), 0)

res = utils.retrieve(ids=[1, 2], config=self._config, as_hashes=True)
for row in res:
self.assertTrue('molregno' in row)
self.assertTrue('fullhash' in row)
self.assertTrue('canonical_smiles' in row)
for k, v in res.items():
self.assertFalse('molregno' in v)
self.assertTrue('fullhash' in v)
self.assertTrue('canonical_smiles' in v)

res = utils.retrieve(ids=[1, 5],
config=self._config,
as_submitted=True)
self.assertEqual(len(res), 2)
self.assertEqual(res[0][2], 'smiles')
self.assertEqual(res[1][2], 'pkl')
self.assertEqual(res[1][1], 'smiles')
self.assertEqual(res[5][1], 'pkl')

def testStandardizationOptions(self):
lconfig = self._config.copy()
Expand Down Expand Up @@ -758,10 +757,38 @@ def testConformerRetrieve(self):
config=self._config)

res = utils.retrieve(ids=(regids[0], regids[2]), config=self._config)
self.assertEqual(res[0][0:2], (regids[0][0], regids[0][1]))
self.assertTrue('M END' in res[0][2])
self.assertEqual(res[1][0:2], (regids[2][0], regids[2][1]))
self.assertTrue('M END' in res[1][2])
self.assertEqual(len(res), 2)
self.assertTrue(regids[0] in res)
self.assertTrue(regids[2] in res)
self.assertIn('M END', res[regids[0]][0])
self.assertEqual(res[regids[0]][1], 'mol')
self.assertIn('M END', res[regids[2]][0])
self.assertEqual(res[regids[2]][1], 'mol')

# query with just molregnos... then we get back the same thing as if we
# were not in conformer mode.
mrn0 = regids[0][0]
mrn2 = regids[2][0]

res = utils.retrieve(id=mrn0, config=self._config)
self.assertIn(mrn0, res)
self.assertIn('M END', res[mrn0][0])

res = utils.retrieve(ids=(mrn0, mrn2), config=self._config)
self.assertEqual(len(res), 2)
self.assertTrue(mrn0 in res)
self.assertTrue(mrn2 in res)
self.assertIn('M END', res[mrn0][0])
self.assertEqual(res[mrn0][1], 'mol')
self.assertIn('M END', res[mrn2][0])
self.assertEqual(res[mrn2][1], 'mol')

res = utils.retrieve(ids=(mrn0, mrn2),
config=self._config,
as_hashes=True)
self.assertIn(mrn0, res)
self.assertIn(mrn2, res)
self.assertIn('fullhash', res[mrn0])

def testConformerQueryById(self):
utils._initdb(config=self._config, confirm=True)
Expand Down
35 changes: 26 additions & 9 deletions lwreg/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,11 +997,14 @@ def retrieve(config=None,
as_hashes=False,
no_verbose=True):
""" returns the molecule data for one or more registry ids (molregnos)
The return value is a tuple of (molregno, data, format) 3-tuples

The return value is a dictionary of (data, format) 2-tuples with molregnos as keys

only one of id or ids should be provided

If registerConformers is set the conformers can be retrieved by providing
the tuples of (molregno, conf_id) and the return value will be a dictionary
of (data, 'mol') 2-tuples with (molregno, conf_id) tuples as keys

Keyword arguments:
config -- configuration dict
ids -- an iterable of registry ids (molregnos)
Expand All @@ -1024,16 +1027,20 @@ def retrieve(config=None,
try:
ids = [(int(id[0]), int(id[1]))]
getConfs = True
except ValueError:
except TypeError:
ids = [int(id)]
getConfs = False
else:
ids = [int(id)]
getConfs = False
elif ids is not None:
if registerConformers:
ids = [(int(x), int(y)) for x, y in ids]
getConfs = True
try:
ids = [(int(x), int(y)) for x, y in ids]
getConfs = True
except TypeError:
ids = [int(x) for x in ids]
getConfs = False
else:
if isinstance(ids, str):
ids = [int(x) for x in ids.split(',')]
Expand Down Expand Up @@ -1064,20 +1071,30 @@ def retrieve(config=None,
print(entry)
else:
print('not found')
resDict = {}
if as_hashes:
tres = []
colns = [x[0] for x in curs.description]
for row in res:
rowd = {}
for i, coln in enumerate(colns):
if coln == 'rdkitversion':
continue
if coln == 'molregno':
mrn = row[i]
continue
if row[i] is None:
continue
rowd[coln] = row[i]
tres.append(rowd)
res = tres
return tuple(res)
resDict[mrn] = rowd
else:
if not getConfs:
for mrn, data, fmt in res:
resDict[mrn] = (data, fmt)
else:
for mrn, confId, molb in res:
resDict[(mrn, confId)] = (molb, 'mol')

return resDict


def _registerMetadata(curs, config):
Expand Down