diff --git a/src/main/java/com/actelion/research/chem/Molecule.java b/src/main/java/com/actelion/research/chem/Molecule.java index 12e51d67..876f5ba3 100644 --- a/src/main/java/com/actelion/research/chem/Molecule.java +++ b/src/main/java/com/actelion/research/chem/Molecule.java @@ -1675,24 +1675,11 @@ public void swapBonds(int bond1, int bond2) { * @param atom */ public void deleteAtom(int atom) { - for (int bnd=0; bnd 0) { - position = parseAtomInsideBrackets(smiles, position, endIndex, true, true); - option--; - } - - return parseAtomInsideBrackets(smiles, position, endIndex, true, true); - } - - /** - * @param smiles - * @param position + * @param position points to second character of atom description, e.g. of the atom label * @param endIndex * @param allowSmarts - * @return + * @return position of first character after closing ']' (or delimiting ',' if enumerating SMARTS) * @throws Exception */ - protected int parseAtomInsideBrackets(byte[] smiles, int position, int endIndex, boolean allowSmarts, boolean allowOptions) throws Exception { + protected int parseAtomInsideBrackets(byte[] smiles, int position, int endIndex, boolean allowSmarts, boolean allowAtomOptions) throws Exception { if (smiles[position-1] == '$') { // recursive SMARTS recursiveSmartsList = new ArrayList<>(); position += parseRecursiveGroup(smiles, position-1, recursiveSmartsList) - 1; if (smiles[position++] != ']') { - if (!allowOptions) + if (!allowAtomOptions) throw new Exception("SmilesParser: A positive recursive SMARTS followed by another one or by atom query features is not supported. Position:" + (position - 1)); - position = advanceJustAfterClosingBracket(smiles, position); + if ((mMode & SmilesParser.MODE_ENUMERATE_SMARTS) == 0) + position = advanceJustAfterClosingBracket(smiles, position); } return position; @@ -319,7 +290,9 @@ else if (!mayBeAromatic) continue; } - if (smiles[position] == 'D') { // non-H-neighbours + if (smiles[position] == 'D' // number of explicit neighbours (incl. explicit H) + || smiles[position] == 'd') { // (RDKit extension) number of non-H-neighbours + // we translate both to the number of non-H neighbours (for 'D' we assume no explicit H to be present) position++; position += range.parse(position, 1, 1); long flags = 0; @@ -347,7 +320,7 @@ else if ((atomQueryFeatures & Molecule.cAtomQFNeighbours) != 0) continue; } - if (smiles[position] == 'z' && mAllowCactvs) { // electro-negative neighbour count (CACTVS extension) + if (smiles[position] == 'z' && mAllowCactvs) { // electro-negative neighbour count (CACTVS,RDKit extension) position++; position += range.parse(position, 1, 4); long flags = 0; @@ -490,11 +463,33 @@ else if (!range.isRange()) continue; } + if (smiles[position] == '^') { // RDKit hybridisation is translated into number of pi-electrons + position++; + + int hybridization = smiles[position++] - '0'; + + if (hybridization < 1 || hybridization > 3) + throw new Exception("SmilesParser: Unsupported hybridization. Position:"+position); + + long piElectrons = (hybridization == 1) ? Molecule.cAtomQFNot2PiElectrons + : (hybridization == 2) ? Molecule.cAtomQFNot1PiElectron : Molecule.cAtomQFNot0PiElectrons; + + if (!isNot) + piElectrons = Molecule.cAtomQFPiElectrons & ~piElectrons; + + atomQueryFeatures |= piElectrons; + + continue; + } + if (smiles[position] == '$') { // recursive SMARTS if (!isNot) throw new Exception("SmilesParser: non-negated recursive SMARTS relating to preceding atom are not supported yet. Position:"+position); - position += parseRecursiveGroup(smiles, position, getExcludeGroupList()); + if (excludeGroupList == null) + excludeGroupList = new ArrayList<>(); + + position += parseRecursiveGroup(smiles, position, excludeGroupList); continue; } @@ -504,19 +499,28 @@ else if (!range.isRange()) continue; } - if (allowSmarts && (smiles[position] == ',' && isRepeatedAllowedORFeature(smiles, position, skipCount))) { // we allow OR-logic for some query options if they have the same type + if (allowSmarts && smiles[position] == ',' && isRepeatedAllowedORFeature(smiles, position, skipCount)) { // we allow OR-logic for some query options if they have the same type smartsFeatureFound = true; position += skipCount[0] + 1; continue; } + if (allowSmarts && smiles[position] == ',' && (mMode & SmilesParser.MODE_ENUMERATE_SMARTS) != 0) { + smartsFeatureFound = true; + position += 1; + break; + } + + if (smiles[position] == ',') + throw new Exception("SmilesParser: alternative atom definitions not supported. (Tip: enumerate SMARTS): '"+(char)smiles[position]+"', position:"+position); + throw new Exception("SmilesParser: unexpected character inside brackets: '"+(char)smiles[position]+"', position:"+position); } return position; } - protected boolean parseAtomLabelInBrackets(byte[] smiles, int position, int endIndex, AtomLabelInfo info) throws Exception { + private boolean parseAtomLabelInBrackets(byte[] smiles, int position, int endIndex, AtomLabelInfo info) throws Exception { info.mayBeAromatic = true; info.mayBeAliphatic = true; if (smiles[position] == '#') { @@ -538,12 +542,30 @@ protected boolean parseAtomLabelInBrackets(byte[] smiles, int position, int endI if (smiles[position] >= 'A' && smiles[position] <= 'Z') { info.labelLength = (smiles[position+1] >= 'a' && smiles[position+1] <= 'z') ? 2 : 1; info.atomicNo = Molecule.getAtomicNoFromLabel(new String(smiles, position, info.labelLength, StandardCharsets.UTF_8)); + if (info.labelLength == 2 && info.atomicNo == 0) { + info.labelLength = 1; + info.atomicNo = Molecule.getAtomicNoFromLabel(new String(smiles, position, info.labelLength, StandardCharsets.UTF_8)); + } info.mayBeAromatic = false; + if (info.atomicNo == 0) + throw new Exception("SmilesParser: Unknown atom label. position:"+(position-1)); return true; } - if (smiles[position] >= 'a' && smiles[position] <= 'z') { - info.labelLength = (smiles[position+1] >= 'a' && smiles[position+1] <= 'z') ? 2 : 1; + if ((smiles[position] == 'A' && smiles[position+1] == 's') + || (smiles[position] == 'S' && smiles[position+1] == 'e')) { + info.labelLength = 2; + info.atomicNo = Molecule.getAtomicNoFromLabel(new String(smiles, position, info.labelLength, StandardCharsets.UTF_8)); + info.mayBeAliphatic = false; + return true; + } + + if (smiles[position] == 'c' + || smiles[position] == 'n' + || smiles[position] == 'o' + || smiles[position] == 'p' + || smiles[position] == 's') { + info.labelLength = 1; info.atomicNo = Molecule.getAtomicNoFromLabel(new String(smiles, position, info.labelLength, StandardCharsets.UTF_8)); info.mayBeAliphatic = false; return true; @@ -646,7 +668,10 @@ else if (smiles[endIndex] == ')') throw new Exception("SmilesParser: Missing closing ')' for recursive SMARTS. '('-position:"+(dollarIndex+1)); StereoMolecule group = new StereoMolecule(16, 16); - new SmilesParser(mMode).parse(group, smiles, dollarIndex+2, endIndex-1); + group.setFragment(true); + SmilesParser parser = new SmilesParser(mMode); + parser.setEnumerationPositionList(mParentParser.getEnumerationPositionList()); + parser.parse(group, smiles, dollarIndex+2, endIndex-1); groupList.add(group); if (smiles[dollarIndex-1] == '!') @@ -705,8 +730,6 @@ public boolean atomQueryFeaturesFound() { } public ArrayList getExcludeGroupList() { - if (excludeGroupList == null) - excludeGroupList = new ArrayList<>(); return excludeGroupList; } diff --git a/src/main/java/com/actelion/research/chem/SmilesParser.java b/src/main/java/com/actelion/research/chem/SmilesParser.java index 6a972716..5e17ce4d 100644 --- a/src/main/java/com/actelion/research/chem/SmilesParser.java +++ b/src/main/java/com/actelion/research/chem/SmilesParser.java @@ -55,7 +55,7 @@ public class SmilesParser { public static final int MODE_NO_CACTUS_SYNTAX = 16; // if not set, then some CACTVS SMARTS extensions will be recognized and translated as close as possible public static final int MODE_SINGLE_DOT_SEPARATOR = 32; // CONSIDER single dots '.' (rather than '..') as moelcule separator when parsing reactions public static final int MODE_CREATE_SMARTS_WARNING = 64; - public static final int MODE_FIRST_SMARTS_OPTION_ONLY = 128; // If multiple ',' separated atom options exist, then always use the first one + public static final int MODE_ENUMERATE_SMARTS = 128; private static final int INITIAL_CONNECTIONS = 16; private static final int MAX_CONNECTIONS = 100; // largest allowed one in SMILES is 99 @@ -68,14 +68,12 @@ public class SmilesParser { private StereoMolecule mMol; private boolean[] mIsAromaticBond; - private int mSmartsMode,mAromaticAtoms,mAromaticBonds,mCoordinateMode; - private final int mMode; + private int mMode,mSmartsMode,mAromaticAtoms,mAromaticBonds,mCoordinateMode; private long mRandomSeed; private final boolean mCreateSmartsWarnings,mMakeHydrogenExplicit,mSingleDotSeparator; private StringBuilder mSmartsWarningBuffer; private boolean mSmartsFeatureFound; - private TreeMap mOptionMap; - private OptionCounter mControllingOptionCounter; + private ArrayList mEnumerationPositionList; /** * Creates a new SmilesParser that doesn't allow SMARTS features to be present in @@ -258,21 +256,41 @@ else if (part == 1) return rxn; } - public StereoMolecule[] enumerateSmarts(String smarts) { + protected ArrayList getEnumerationPositionList() { + return mEnumerationPositionList; + } + + protected void setEnumerationPositionList(ArrayList l) { + mEnumerationPositionList = l; + } + + public String[] enumerateSmarts(String smarts) throws Exception { + mEnumerationPositionList = new ArrayList<>(); mSmartsMode = SMARTS_MODE_IS_SMARTS; - mOptionMap = new TreeMap<>(); - ArrayList enumeration = new ArrayList<>(); - while (mControllingOptionCounter == null || mControllingOptionCounter.increase()) { - StereoMolecule mol = new StereoMolecule(); - try { - parse(mol, smarts); - } - catch (Exception e) { - return null; - } - enumeration.add(mol); + mMode |= MODE_ENUMERATE_SMARTS; + + ArrayList smartsList = new ArrayList<>(); + smartsList.add(smarts); + + try { + parse(new StereoMolecule(), smarts); + } + catch (Exception e) { + System.out.println(e.getMessage()); + } + + EnumerationPosition[] options = mEnumerationPositionList.toArray(new EnumerationPosition[0]); + Arrays.sort(options); + + for (EnumerationPosition option : options) { + ArrayList enumeration = new ArrayList<>(); + for (String s : smartsList) + option.enumerate(this, s.getBytes(StandardCharsets.UTF_8), enumeration); + + smartsList = enumeration; } - return enumeration.toArray(new StereoMolecule[0]); + + return smartsList.toArray(new String[0]); } /** @@ -360,23 +378,15 @@ public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex, if (!squareBracketOpen) { position = atomParser.parseAtomOutsideBrackets(smiles, position, endIndex, allowSmarts); } - else if ((mMode & MODE_FIRST_SMARTS_OPTION_ONLY) != 0) { - position = atomParser.parseAtomInsideBrackets(smiles, position, endIndex, 0); - } - else if (mOptionMap != null) { - OptionCounter counter = mOptionMap.get(position); - if (counter == null) { - position = atomParser.parseAtomInsideBrackets(smiles, position, endIndex, 0); - if (smiles[position] != ']') { // we have multiple options and create a counter - counter = new OptionCounter(mControllingOptionCounter); - mOptionMap.put(position, counter); - mControllingOptionCounter = counter; + else if ((mMode & MODE_ENUMERATE_SMARTS) != 0) { + EnumerationPosition ep = new EnumerationPosition(position-1); + position = atomParser.parseAtomInsideBrackets(smiles, position, endIndex, true, true); + if (smiles[position-1] != ']') { // we have multiple options and create an option list + while (smiles[position-1] != ']') { + position = atomParser.parseAtomInsideBrackets(smiles, position+1, endIndex, true, true); + ep.increase(); } - } - else { - position = atomParser.parseAtomInsideBrackets(smiles, position, endIndex, counter.mIndex); - if (counter.mCount == 0 && smiles[position] != ']') // now we know the max count - counter.mCount = counter.mIndex+1; + mEnumerationPositionList.add(ep); } } else { @@ -386,8 +396,23 @@ else if (mOptionMap != null) { squareBracketOpen = false; if (atomParser.getRecursiveGroup() != null) { + fromAtom = baseAtom[bracketLevel]; + baseAtom[bracketLevel] = mol.getAllAtoms(); mol.addMolecule(atomParser.getRecursiveGroup()); + + if (fromAtom != -1 && bondType != Molecule.cBondTypeDeleted) { + int bond = mMol.addBond(fromAtom, fromAtom, bondType); + if (bondQueryFeatures != 0) { + mSmartsFeatureFound = true; + mMol.setBondQueryFeature(bond, bondQueryFeatures, true); + } + } + + // Reset bond type and query features to default. + bondType = Molecule.cBondTypeSingle; + bondQueryFeatures = 0; + continue; } @@ -407,8 +432,8 @@ else if (mOptionMap != null) { mAromaticAtoms++; fromAtom = baseAtom[bracketLevel]; - if (baseAtom[bracketLevel] != -1 && bondType != Molecule.cBondTypeDeleted) { - int bond = mMol.addBond(baseAtom[bracketLevel], atom, bondType); + if (fromAtom != -1 && bondType != Molecule.cBondTypeDeleted) { + int bond = mMol.addBond(fromAtom, atom, bondType); if (bondQueryFeatures != 0) { mSmartsFeatureFound = true; mMol.setBondQueryFeature(bond, bondQueryFeatures, true); @@ -1353,26 +1378,45 @@ private boolean assignKnownEZBondParities() { return paritiesFound; } - private static class OptionCounter { - int mIndex,mCount; - OptionCounter mPrevious; + private class EnumerationPosition implements Comparable { + int mPosition,mCount; - public OptionCounter(OptionCounter previous) { - mPrevious = previous; + /** + * @param position position of first option in original smarts + */ + public EnumerationPosition(int position) { + mPosition = position; + mCount = 1; } - public boolean increase() { - if (mPrevious.increase()) - return true; + public void increase() { + mCount++; + } - mIndex++; - if (mIndex < mCount) - return true; + public void enumerate(SmilesParser parser, byte[] smarts, ArrayList enumeration) throws Exception { + ArrayList optionList = new ArrayList<>(); + + int start = mPosition; + SmilesAtomParser atomParser = new SmilesAtomParser(parser, mMode | mSmartsMode); + int end = atomParser.parseAtomInsideBrackets(smarts, start+1, smarts.length, true, true)-1; + if (smarts[end] != ']') { // we have multiple options and create an option list + optionList.add(new String(smarts, start, end-start)); + while (smarts[end] != ']') { + start = end+1; + end = atomParser.parseAtomInsideBrackets(smarts, start+1, smarts.length, true, true)-1; + optionList.add(new String(smarts, start, end-start)); + } + } - mIndex = 0; - return false; + for (String option : optionList) + enumeration.add(new String(smarts, 0, mPosition) + option + new String(smarts, end, smarts.length-end)); } + + @Override + public int compareTo(EnumerationPosition o) { + return Integer.compare(o.mPosition, mPosition); } + } private static class ParityNeighbour { int mAtom,mPosition; diff --git a/src/main/java/com/actelion/research/chem/phesa/pharmacophore/pp/IPharmacophorePoint.java b/src/main/java/com/actelion/research/chem/phesa/pharmacophore/pp/IPharmacophorePoint.java index 12e6de46..f76aff03 100644 --- a/src/main/java/com/actelion/research/chem/phesa/pharmacophore/pp/IPharmacophorePoint.java +++ b/src/main/java/com/actelion/research/chem/phesa/pharmacophore/pp/IPharmacophorePoint.java @@ -9,7 +9,6 @@ public interface IPharmacophorePoint { - public enum Functionality {ACCEPTOR(PharmacophoreCalculator.ACCEPTOR_ID), DONOR(PharmacophoreCalculator.DONOR_ID), NEG_CHARGE(PharmacophoreCalculator.CHARGE_NEG_ID), POS_CHARGE(PharmacophoreCalculator.CHARGE_POS_ID), AROM_RING(PharmacophoreCalculator.AROM_RING_ID),EXIT_VECTOR(PharmacophoreCalculator.EXIT_VECTOR_ID); @@ -20,7 +19,6 @@ public enum Functionality {ACCEPTOR(PharmacophoreCalculator.ACCEPTOR_ID), DONOR( public int getIndex() { return this.index; } - } @@ -79,8 +77,4 @@ default public double getVectorSimilarity(IPharmacophorePoint pp2,Coordinates di default public double getVectorSimilarity(IPharmacophorePoint pp2) { return getVectorSimilarity(pp2, pp2.getDirectionality()); } - - - - }